diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,182103 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 2.0, + "eval_steps": 500, + "global_step": 18206, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "entropy": 1.894879937171936, + "epoch": 0.00010985691137293675, + "grad_norm": 1.5214658975601196, + "learning_rate": 0.0, + "loss": 1.4908, + "mean_token_accuracy": 0.6401820083459219, + "num_tokens": 168636.0, + "step": 1 + }, + { + "entropy": 1.9256814221541088, + "epoch": 0.0002197138227458735, + "grad_norm": 1.8711611032485962, + "learning_rate": 3.656307129798904e-08, + "loss": 1.4614, + "mean_token_accuracy": 0.6461619188388189, + "num_tokens": 297186.0, + "step": 2 + }, + { + "entropy": 1.936289479335149, + "epoch": 0.00032957073411881023, + "grad_norm": 1.2696285247802734, + "learning_rate": 7.312614259597807e-08, + "loss": 1.5012, + "mean_token_accuracy": 0.6218364934126536, + "num_tokens": 471129.0, + "step": 3 + }, + { + "entropy": 1.9193981885910034, + "epoch": 0.000439427645491747, + "grad_norm": 1.9576971530914307, + "learning_rate": 1.096892138939671e-07, + "loss": 1.4109, + "mean_token_accuracy": 0.6370265334844589, + "num_tokens": 632787.0, + "step": 4 + }, + { + "entropy": 1.8874422013759613, + "epoch": 0.0005492845568646837, + "grad_norm": 1.4753385782241821, + "learning_rate": 1.4625228519195615e-07, + "loss": 1.53, + "mean_token_accuracy": 0.6317170361677805, + "num_tokens": 814767.0, + "step": 5 + }, + { + "entropy": 1.9161963363488514, + "epoch": 0.0006591414682376205, + "grad_norm": 1.3654813766479492, + "learning_rate": 1.8281535648994517e-07, + "loss": 1.513, + "mean_token_accuracy": 0.6349450548489889, + "num_tokens": 955276.0, + "step": 6 + }, + { + "entropy": 1.9518279830614726, + "epoch": 0.0007689983796105573, + "grad_norm": 2.4183151721954346, + "learning_rate": 2.193784277879342e-07, + "loss": 1.4296, + "mean_token_accuracy": 0.6418899148702621, + "num_tokens": 1123639.0, + "step": 7 + }, + { + "entropy": 1.9186626176039379, + "epoch": 0.000878855290983494, + "grad_norm": 2.4768245220184326, + "learning_rate": 2.5594149908592327e-07, + "loss": 1.4314, + "mean_token_accuracy": 0.6455866148074468, + "num_tokens": 1256300.0, + "step": 8 + }, + { + "entropy": 1.8859932323296864, + "epoch": 0.0009887122023564308, + "grad_norm": 2.537692070007324, + "learning_rate": 2.925045703839123e-07, + "loss": 1.3804, + "mean_token_accuracy": 0.6570307711760203, + "num_tokens": 1391902.0, + "step": 9 + }, + { + "entropy": 1.9118959108988445, + "epoch": 0.0010985691137293675, + "grad_norm": 2.2627501487731934, + "learning_rate": 3.2906764168190127e-07, + "loss": 1.4766, + "mean_token_accuracy": 0.6348355263471603, + "num_tokens": 1513301.0, + "step": 10 + }, + { + "entropy": 1.927810400724411, + "epoch": 0.0012084260251023042, + "grad_norm": 2.3921263217926025, + "learning_rate": 3.6563071297989034e-07, + "loss": 1.4505, + "mean_token_accuracy": 0.6397508382797241, + "num_tokens": 1646836.0, + "step": 11 + }, + { + "entropy": 1.9784689545631409, + "epoch": 0.001318282936475241, + "grad_norm": 1.7080570459365845, + "learning_rate": 4.021937842778794e-07, + "loss": 1.4932, + "mean_token_accuracy": 0.6328056206305822, + "num_tokens": 1790819.0, + "step": 12 + }, + { + "entropy": 1.832368512948354, + "epoch": 0.0014281398478481777, + "grad_norm": 1.9487069845199585, + "learning_rate": 4.387568555758684e-07, + "loss": 1.4971, + "mean_token_accuracy": 0.6369460622469584, + "num_tokens": 1944997.0, + "step": 13 + }, + { + "entropy": 1.9581014811992645, + "epoch": 0.0015379967592211146, + "grad_norm": 1.672973871231079, + "learning_rate": 4.7531992687385747e-07, + "loss": 1.572, + "mean_token_accuracy": 0.6146093358596166, + "num_tokens": 2091666.0, + "step": 14 + }, + { + "entropy": 2.0294719139734902, + "epoch": 0.0016478536705940513, + "grad_norm": 2.086653709411621, + "learning_rate": 5.118829981718465e-07, + "loss": 1.4884, + "mean_token_accuracy": 0.6278541535139084, + "num_tokens": 2216390.0, + "step": 15 + }, + { + "entropy": 1.934642493724823, + "epoch": 0.001757710581966988, + "grad_norm": 1.553402066230774, + "learning_rate": 5.484460694698355e-07, + "loss": 1.4823, + "mean_token_accuracy": 0.634530633687973, + "num_tokens": 2388588.0, + "step": 16 + }, + { + "entropy": 1.977916826804479, + "epoch": 0.0018675674933399248, + "grad_norm": 1.5559946298599243, + "learning_rate": 5.850091407678246e-07, + "loss": 1.4955, + "mean_token_accuracy": 0.6247076193491617, + "num_tokens": 2557798.0, + "step": 17 + }, + { + "entropy": 1.8902287880579631, + "epoch": 0.0019774244047128615, + "grad_norm": 1.4230289459228516, + "learning_rate": 6.215722120658136e-07, + "loss": 1.4787, + "mean_token_accuracy": 0.6302339931329092, + "num_tokens": 2738112.0, + "step": 18 + }, + { + "entropy": 1.8939658204714458, + "epoch": 0.0020872813160857985, + "grad_norm": 1.492349624633789, + "learning_rate": 6.581352833638025e-07, + "loss": 1.4823, + "mean_token_accuracy": 0.6441057572762171, + "num_tokens": 2898646.0, + "step": 19 + }, + { + "entropy": 1.8878755668799083, + "epoch": 0.002197138227458735, + "grad_norm": 1.266729712486267, + "learning_rate": 6.946983546617917e-07, + "loss": 1.4777, + "mean_token_accuracy": 0.6475276350975037, + "num_tokens": 3110857.0, + "step": 20 + }, + { + "entropy": 1.8882379333178203, + "epoch": 0.002306995138831672, + "grad_norm": 1.5952550172805786, + "learning_rate": 7.312614259597807e-07, + "loss": 1.4565, + "mean_token_accuracy": 0.6390559126933416, + "num_tokens": 3307447.0, + "step": 21 + }, + { + "entropy": 1.9016104241212208, + "epoch": 0.0024168520502046084, + "grad_norm": 1.1307969093322754, + "learning_rate": 7.678244972577697e-07, + "loss": 1.4924, + "mean_token_accuracy": 0.6293008426825205, + "num_tokens": 3485179.0, + "step": 22 + }, + { + "entropy": 1.9216719369093578, + "epoch": 0.0025267089615775454, + "grad_norm": 1.1407924890518188, + "learning_rate": 8.043875685557588e-07, + "loss": 1.6257, + "mean_token_accuracy": 0.6146015028158823, + "num_tokens": 3717262.0, + "step": 23 + }, + { + "entropy": 1.8951739370822906, + "epoch": 0.002636565872950482, + "grad_norm": 1.7030082941055298, + "learning_rate": 8.409506398537478e-07, + "loss": 1.5923, + "mean_token_accuracy": 0.6380182355642319, + "num_tokens": 3838144.0, + "step": 24 + }, + { + "entropy": 1.9046282172203064, + "epoch": 0.002746422784323419, + "grad_norm": 1.6221411228179932, + "learning_rate": 8.775137111517368e-07, + "loss": 1.3792, + "mean_token_accuracy": 0.6614825973908106, + "num_tokens": 3988846.0, + "step": 25 + }, + { + "entropy": 1.9025114277998607, + "epoch": 0.0028562796956963553, + "grad_norm": 1.7412817478179932, + "learning_rate": 9.140767824497258e-07, + "loss": 1.5043, + "mean_token_accuracy": 0.6499176571766535, + "num_tokens": 4123835.0, + "step": 26 + }, + { + "entropy": 1.9772209624449413, + "epoch": 0.0029661366070692923, + "grad_norm": 1.6628296375274658, + "learning_rate": 9.506398537477149e-07, + "loss": 1.5479, + "mean_token_accuracy": 0.6233152449131012, + "num_tokens": 4264365.0, + "step": 27 + }, + { + "entropy": 1.9088290234406788, + "epoch": 0.003075993518442229, + "grad_norm": 1.2123667001724243, + "learning_rate": 9.87202925045704e-07, + "loss": 1.5121, + "mean_token_accuracy": 0.6258416324853897, + "num_tokens": 4480352.0, + "step": 28 + }, + { + "entropy": 1.9050040543079376, + "epoch": 0.0031858504298151657, + "grad_norm": 2.060734510421753, + "learning_rate": 1.023765996343693e-06, + "loss": 1.3907, + "mean_token_accuracy": 0.6540684401988983, + "num_tokens": 4613774.0, + "step": 29 + }, + { + "entropy": 1.9359141091505687, + "epoch": 0.0032957073411881027, + "grad_norm": 0.9076595306396484, + "learning_rate": 1.060329067641682e-06, + "loss": 1.6216, + "mean_token_accuracy": 0.6132212653756142, + "num_tokens": 4864127.0, + "step": 30 + }, + { + "entropy": 1.9393312633037567, + "epoch": 0.003405564252561039, + "grad_norm": 1.4876841306686401, + "learning_rate": 1.096892138939671e-06, + "loss": 1.5932, + "mean_token_accuracy": 0.6290038675069809, + "num_tokens": 5048366.0, + "step": 31 + }, + { + "entropy": 1.8906433979670207, + "epoch": 0.003515421163933976, + "grad_norm": 1.0335384607315063, + "learning_rate": 1.13345521023766e-06, + "loss": 1.6327, + "mean_token_accuracy": 0.6198825240135193, + "num_tokens": 5249785.0, + "step": 32 + }, + { + "entropy": 1.891570011774699, + "epoch": 0.0036252780753069126, + "grad_norm": 1.4123249053955078, + "learning_rate": 1.1700182815356492e-06, + "loss": 1.3654, + "mean_token_accuracy": 0.6579603354136149, + "num_tokens": 5414378.0, + "step": 33 + }, + { + "entropy": 1.9350098570187886, + "epoch": 0.0037351349866798496, + "grad_norm": 1.7732880115509033, + "learning_rate": 1.206581352833638e-06, + "loss": 1.4123, + "mean_token_accuracy": 0.6357505569855372, + "num_tokens": 5576716.0, + "step": 34 + }, + { + "entropy": 1.9273627698421478, + "epoch": 0.003844991898052786, + "grad_norm": 2.0864319801330566, + "learning_rate": 1.2431444241316272e-06, + "loss": 1.4579, + "mean_token_accuracy": 0.6357160607973734, + "num_tokens": 5702401.0, + "step": 35 + }, + { + "entropy": 1.8696773449579875, + "epoch": 0.003954848809425723, + "grad_norm": 1.2789888381958008, + "learning_rate": 1.2797074954296162e-06, + "loss": 1.5019, + "mean_token_accuracy": 0.6273581286271414, + "num_tokens": 5888777.0, + "step": 36 + }, + { + "entropy": 1.8879920840263367, + "epoch": 0.0040647057207986595, + "grad_norm": 1.2594259977340698, + "learning_rate": 1.316270566727605e-06, + "loss": 1.4994, + "mean_token_accuracy": 0.6347703188657761, + "num_tokens": 6086963.0, + "step": 37 + }, + { + "entropy": 1.8589064280192058, + "epoch": 0.004174562632171597, + "grad_norm": 1.607129693031311, + "learning_rate": 1.3528336380255944e-06, + "loss": 1.4691, + "mean_token_accuracy": 0.6347041179736456, + "num_tokens": 6240554.0, + "step": 38 + }, + { + "entropy": 1.917704850435257, + "epoch": 0.004284419543544533, + "grad_norm": 2.4967267513275146, + "learning_rate": 1.3893967093235833e-06, + "loss": 1.4689, + "mean_token_accuracy": 0.634268601735433, + "num_tokens": 6369609.0, + "step": 39 + }, + { + "entropy": 1.921486258506775, + "epoch": 0.00439427645491747, + "grad_norm": 1.696545124053955, + "learning_rate": 1.4259597806215722e-06, + "loss": 1.4784, + "mean_token_accuracy": 0.6417160034179688, + "num_tokens": 6521087.0, + "step": 40 + }, + { + "entropy": 1.9418790340423584, + "epoch": 0.0045041333662904064, + "grad_norm": 1.5666840076446533, + "learning_rate": 1.4625228519195614e-06, + "loss": 1.4172, + "mean_token_accuracy": 0.6557289958000183, + "num_tokens": 6688333.0, + "step": 41 + }, + { + "entropy": 1.8845481673876445, + "epoch": 0.004613990277663344, + "grad_norm": 1.5776560306549072, + "learning_rate": 1.4990859232175503e-06, + "loss": 1.4631, + "mean_token_accuracy": 0.6422620664040247, + "num_tokens": 6832991.0, + "step": 42 + }, + { + "entropy": 1.9316304922103882, + "epoch": 0.00472384718903628, + "grad_norm": 2.2713351249694824, + "learning_rate": 1.5356489945155394e-06, + "loss": 1.5357, + "mean_token_accuracy": 0.6292731215556463, + "num_tokens": 6962510.0, + "step": 43 + }, + { + "entropy": 1.8830671906471252, + "epoch": 0.004833704100409217, + "grad_norm": 1.8595447540283203, + "learning_rate": 1.5722120658135283e-06, + "loss": 1.3832, + "mean_token_accuracy": 0.6504772454500198, + "num_tokens": 7113892.0, + "step": 44 + }, + { + "entropy": 1.9839610954125722, + "epoch": 0.004943561011782153, + "grad_norm": 1.653223991394043, + "learning_rate": 1.6087751371115177e-06, + "loss": 1.6831, + "mean_token_accuracy": 0.6217963248491287, + "num_tokens": 7242184.0, + "step": 45 + }, + { + "entropy": 1.8768392503261566, + "epoch": 0.005053417923155091, + "grad_norm": 2.0047905445098877, + "learning_rate": 1.6453382084095066e-06, + "loss": 1.3338, + "mean_token_accuracy": 0.6631583720445633, + "num_tokens": 7439452.0, + "step": 46 + }, + { + "entropy": 1.9426932732264202, + "epoch": 0.005163274834528027, + "grad_norm": 1.812389850616455, + "learning_rate": 1.6819012797074955e-06, + "loss": 1.5984, + "mean_token_accuracy": 0.6237726360559464, + "num_tokens": 7581382.0, + "step": 47 + }, + { + "entropy": 1.9155326286951702, + "epoch": 0.005273131745900964, + "grad_norm": 1.7920594215393066, + "learning_rate": 1.7184643510054846e-06, + "loss": 1.502, + "mean_token_accuracy": 0.6386721283197403, + "num_tokens": 7734683.0, + "step": 48 + }, + { + "entropy": 1.9195171296596527, + "epoch": 0.005382988657273901, + "grad_norm": 2.0478262901306152, + "learning_rate": 1.7550274223034736e-06, + "loss": 1.4524, + "mean_token_accuracy": 0.6335723251104355, + "num_tokens": 7913144.0, + "step": 49 + }, + { + "entropy": 1.9190079867839813, + "epoch": 0.005492845568646838, + "grad_norm": 1.5729821920394897, + "learning_rate": 1.7915904936014627e-06, + "loss": 1.5228, + "mean_token_accuracy": 0.6325205812851588, + "num_tokens": 8053143.0, + "step": 50 + }, + { + "entropy": 1.9134818116823833, + "epoch": 0.005602702480019774, + "grad_norm": 2.2003588676452637, + "learning_rate": 1.8281535648994516e-06, + "loss": 1.4542, + "mean_token_accuracy": 0.6385191728671392, + "num_tokens": 8223896.0, + "step": 51 + }, + { + "entropy": 1.8575187226136525, + "epoch": 0.005712559391392711, + "grad_norm": 2.713563919067383, + "learning_rate": 1.864716636197441e-06, + "loss": 1.4948, + "mean_token_accuracy": 0.6294996092716852, + "num_tokens": 8382141.0, + "step": 52 + }, + { + "entropy": 1.9104352692763011, + "epoch": 0.005822416302765648, + "grad_norm": 1.6903510093688965, + "learning_rate": 1.9012797074954299e-06, + "loss": 1.4369, + "mean_token_accuracy": 0.6408693542083105, + "num_tokens": 8544191.0, + "step": 53 + }, + { + "entropy": 1.9804502725601196, + "epoch": 0.0059322732141385845, + "grad_norm": 2.0042126178741455, + "learning_rate": 1.9378427787934186e-06, + "loss": 1.5279, + "mean_token_accuracy": 0.6251623729864756, + "num_tokens": 8703483.0, + "step": 54 + }, + { + "entropy": 1.9068463842074077, + "epoch": 0.006042130125511521, + "grad_norm": 1.6808873414993286, + "learning_rate": 1.974405850091408e-06, + "loss": 1.4428, + "mean_token_accuracy": 0.6341716150442759, + "num_tokens": 8874273.0, + "step": 55 + }, + { + "entropy": 1.8545368413130443, + "epoch": 0.006151987036884458, + "grad_norm": 1.35502290725708, + "learning_rate": 2.010968921389397e-06, + "loss": 1.4585, + "mean_token_accuracy": 0.6444613436857859, + "num_tokens": 9032964.0, + "step": 56 + }, + { + "entropy": 1.9203276832898457, + "epoch": 0.006261843948257395, + "grad_norm": 2.1629741191864014, + "learning_rate": 2.047531992687386e-06, + "loss": 1.4645, + "mean_token_accuracy": 0.6436514407396317, + "num_tokens": 9161160.0, + "step": 57 + }, + { + "entropy": 1.9330822229385376, + "epoch": 0.0063717008596303314, + "grad_norm": 2.02375864982605, + "learning_rate": 2.084095063985375e-06, + "loss": 1.4975, + "mean_token_accuracy": 0.6350375364224116, + "num_tokens": 9332649.0, + "step": 58 + }, + { + "entropy": 1.8069697419802349, + "epoch": 0.006481557771003268, + "grad_norm": 1.4925216436386108, + "learning_rate": 2.120658135283364e-06, + "loss": 1.4355, + "mean_token_accuracy": 0.6453156520922979, + "num_tokens": 9484300.0, + "step": 59 + }, + { + "entropy": 1.9819251795609791, + "epoch": 0.006591414682376205, + "grad_norm": 1.2172040939331055, + "learning_rate": 2.157221206581353e-06, + "loss": 1.5083, + "mean_token_accuracy": 0.6365701605876287, + "num_tokens": 9689189.0, + "step": 60 + }, + { + "entropy": 1.8836710751056671, + "epoch": 0.006701271593749142, + "grad_norm": 1.4082310199737549, + "learning_rate": 2.193784277879342e-06, + "loss": 1.4055, + "mean_token_accuracy": 0.641944482922554, + "num_tokens": 9871400.0, + "step": 61 + }, + { + "entropy": 1.862162669499715, + "epoch": 0.006811128505122078, + "grad_norm": 1.952700138092041, + "learning_rate": 2.230347349177331e-06, + "loss": 1.4579, + "mean_token_accuracy": 0.6387267460425695, + "num_tokens": 10030629.0, + "step": 62 + }, + { + "entropy": 1.86505792538325, + "epoch": 0.006920985416495015, + "grad_norm": 2.528541326522827, + "learning_rate": 2.26691042047532e-06, + "loss": 1.4184, + "mean_token_accuracy": 0.6428513675928116, + "num_tokens": 10174909.0, + "step": 63 + }, + { + "entropy": 1.8634338974952698, + "epoch": 0.007030842327867952, + "grad_norm": 1.5209519863128662, + "learning_rate": 2.3034734917733095e-06, + "loss": 1.6417, + "mean_token_accuracy": 0.6137880484263102, + "num_tokens": 10389155.0, + "step": 64 + }, + { + "entropy": 2.011722683906555, + "epoch": 0.007140699239240889, + "grad_norm": 2.0947635173797607, + "learning_rate": 2.3400365630712984e-06, + "loss": 1.6333, + "mean_token_accuracy": 0.6242827375729879, + "num_tokens": 10510783.0, + "step": 65 + }, + { + "entropy": 1.9394526779651642, + "epoch": 0.007250556150613825, + "grad_norm": 2.9054830074310303, + "learning_rate": 2.3765996343692873e-06, + "loss": 1.4163, + "mean_token_accuracy": 0.6395674695571264, + "num_tokens": 10644860.0, + "step": 66 + }, + { + "entropy": 1.8742198546727498, + "epoch": 0.007360413061986763, + "grad_norm": 1.4175904989242554, + "learning_rate": 2.413162705667276e-06, + "loss": 1.4076, + "mean_token_accuracy": 0.6420022894938787, + "num_tokens": 10827216.0, + "step": 67 + }, + { + "entropy": 1.8331912557284038, + "epoch": 0.007470269973359699, + "grad_norm": 1.3983286619186401, + "learning_rate": 2.449725776965265e-06, + "loss": 1.5168, + "mean_token_accuracy": 0.623612051208814, + "num_tokens": 11040709.0, + "step": 68 + }, + { + "entropy": 2.0345064600308738, + "epoch": 0.007580126884732636, + "grad_norm": 1.5471314191818237, + "learning_rate": 2.4862888482632545e-06, + "loss": 1.486, + "mean_token_accuracy": 0.6237680067618688, + "num_tokens": 11237129.0, + "step": 69 + }, + { + "entropy": 1.9124818940957387, + "epoch": 0.007689983796105572, + "grad_norm": 2.0542972087860107, + "learning_rate": 2.5228519195612434e-06, + "loss": 1.5191, + "mean_token_accuracy": 0.6362631718317667, + "num_tokens": 11393268.0, + "step": 70 + }, + { + "entropy": 1.9250177542368572, + "epoch": 0.0077998407074785095, + "grad_norm": 1.6303218603134155, + "learning_rate": 2.5594149908592323e-06, + "loss": 1.566, + "mean_token_accuracy": 0.6235620379447937, + "num_tokens": 11577989.0, + "step": 71 + }, + { + "entropy": 1.9424512485663097, + "epoch": 0.007909697618851446, + "grad_norm": 1.447934627532959, + "learning_rate": 2.5959780621572212e-06, + "loss": 1.4839, + "mean_token_accuracy": 0.630613719423612, + "num_tokens": 11740054.0, + "step": 72 + }, + { + "entropy": 1.8930113911628723, + "epoch": 0.008019554530224383, + "grad_norm": 1.1407986879348755, + "learning_rate": 2.63254113345521e-06, + "loss": 1.4887, + "mean_token_accuracy": 0.6346247841914495, + "num_tokens": 12000084.0, + "step": 73 + }, + { + "entropy": 1.9313855667908986, + "epoch": 0.008129411441597319, + "grad_norm": 1.1938432455062866, + "learning_rate": 2.6691042047531995e-06, + "loss": 1.5619, + "mean_token_accuracy": 0.6224364936351776, + "num_tokens": 12206632.0, + "step": 74 + }, + { + "entropy": 1.9749772349993389, + "epoch": 0.008239268352970256, + "grad_norm": 1.916087031364441, + "learning_rate": 2.705667276051189e-06, + "loss": 1.5133, + "mean_token_accuracy": 0.63983054459095, + "num_tokens": 12304314.0, + "step": 75 + }, + { + "entropy": 1.9599759479363759, + "epoch": 0.008349125264343194, + "grad_norm": 1.359966516494751, + "learning_rate": 2.7422303473491773e-06, + "loss": 1.5562, + "mean_token_accuracy": 0.6150669455528259, + "num_tokens": 12501896.0, + "step": 76 + }, + { + "entropy": 1.8981466790040333, + "epoch": 0.00845898217571613, + "grad_norm": 1.2441012859344482, + "learning_rate": 2.7787934186471667e-06, + "loss": 1.5665, + "mean_token_accuracy": 0.6264956047137579, + "num_tokens": 12691020.0, + "step": 77 + }, + { + "entropy": 1.9287964701652527, + "epoch": 0.008568839087089067, + "grad_norm": 1.3505768775939941, + "learning_rate": 2.8153564899451556e-06, + "loss": 1.6804, + "mean_token_accuracy": 0.6293187389771143, + "num_tokens": 12892070.0, + "step": 78 + }, + { + "entropy": 1.8864564498265584, + "epoch": 0.008678695998462002, + "grad_norm": 1.230501413345337, + "learning_rate": 2.8519195612431445e-06, + "loss": 1.5448, + "mean_token_accuracy": 0.6269682745138804, + "num_tokens": 13066612.0, + "step": 79 + }, + { + "entropy": 1.9039170742034912, + "epoch": 0.00878855290983494, + "grad_norm": 1.3592942953109741, + "learning_rate": 2.8884826325411334e-06, + "loss": 1.4797, + "mean_token_accuracy": 0.6321501731872559, + "num_tokens": 13232664.0, + "step": 80 + }, + { + "entropy": 1.9469401339689891, + "epoch": 0.008898409821207877, + "grad_norm": 2.1602654457092285, + "learning_rate": 2.9250457038391228e-06, + "loss": 1.4569, + "mean_token_accuracy": 0.6350138882795969, + "num_tokens": 13378630.0, + "step": 81 + }, + { + "entropy": 1.8785783151785533, + "epoch": 0.009008266732580813, + "grad_norm": 1.3273346424102783, + "learning_rate": 2.961608775137112e-06, + "loss": 1.3908, + "mean_token_accuracy": 0.6404936015605927, + "num_tokens": 13525919.0, + "step": 82 + }, + { + "entropy": 1.9613807598749797, + "epoch": 0.00911812364395375, + "grad_norm": 1.595691204071045, + "learning_rate": 2.9981718464351006e-06, + "loss": 1.4964, + "mean_token_accuracy": 0.6223418762286504, + "num_tokens": 13685579.0, + "step": 83 + }, + { + "entropy": 1.9553207556406658, + "epoch": 0.009227980555326688, + "grad_norm": 1.1953165531158447, + "learning_rate": 3.03473491773309e-06, + "loss": 1.5688, + "mean_token_accuracy": 0.6186368266741434, + "num_tokens": 13901601.0, + "step": 84 + }, + { + "entropy": 1.8914847175280254, + "epoch": 0.009337837466699623, + "grad_norm": 2.109435558319092, + "learning_rate": 3.071297989031079e-06, + "loss": 1.4397, + "mean_token_accuracy": 0.639526754617691, + "num_tokens": 14060730.0, + "step": 85 + }, + { + "entropy": 1.9378711183865864, + "epoch": 0.00944769437807256, + "grad_norm": 1.584813117980957, + "learning_rate": 3.1078610603290678e-06, + "loss": 1.6438, + "mean_token_accuracy": 0.6247407471140226, + "num_tokens": 14220171.0, + "step": 86 + }, + { + "entropy": 1.8821549117565155, + "epoch": 0.009557551289445498, + "grad_norm": 1.3728727102279663, + "learning_rate": 3.1444241316270567e-06, + "loss": 1.4598, + "mean_token_accuracy": 0.6492441594600677, + "num_tokens": 14426508.0, + "step": 87 + }, + { + "entropy": 1.945362647374471, + "epoch": 0.009667408200818434, + "grad_norm": 1.353712797164917, + "learning_rate": 3.180987202925046e-06, + "loss": 1.4469, + "mean_token_accuracy": 0.6372426698605219, + "num_tokens": 14586179.0, + "step": 88 + }, + { + "entropy": 2.001281092564265, + "epoch": 0.009777265112191371, + "grad_norm": 1.7841237783432007, + "learning_rate": 3.2175502742230354e-06, + "loss": 1.6689, + "mean_token_accuracy": 0.6123541295528412, + "num_tokens": 14753587.0, + "step": 89 + }, + { + "entropy": 1.8964291512966156, + "epoch": 0.009887122023564307, + "grad_norm": 1.8038142919540405, + "learning_rate": 3.254113345521024e-06, + "loss": 1.5155, + "mean_token_accuracy": 0.6321656405925751, + "num_tokens": 14897850.0, + "step": 90 + }, + { + "entropy": 1.9467082222302754, + "epoch": 0.009996978934937244, + "grad_norm": 1.504103422164917, + "learning_rate": 3.290676416819013e-06, + "loss": 1.472, + "mean_token_accuracy": 0.638060932358106, + "num_tokens": 15085294.0, + "step": 91 + }, + { + "entropy": 1.895034392674764, + "epoch": 0.010106835846310181, + "grad_norm": 1.6101082563400269, + "learning_rate": 3.327239488117002e-06, + "loss": 1.4261, + "mean_token_accuracy": 0.6464575280745825, + "num_tokens": 15240577.0, + "step": 92 + }, + { + "entropy": 1.9196984469890594, + "epoch": 0.010216692757683117, + "grad_norm": 2.471315383911133, + "learning_rate": 3.363802559414991e-06, + "loss": 1.4078, + "mean_token_accuracy": 0.6479363540808359, + "num_tokens": 15390254.0, + "step": 93 + }, + { + "entropy": 1.8948115607102711, + "epoch": 0.010326549669056054, + "grad_norm": 1.4012452363967896, + "learning_rate": 3.40036563071298e-06, + "loss": 1.5964, + "mean_token_accuracy": 0.6217137028773626, + "num_tokens": 15591903.0, + "step": 94 + }, + { + "entropy": 1.9357371429602306, + "epoch": 0.010436406580428992, + "grad_norm": 1.724313735961914, + "learning_rate": 3.4369287020109693e-06, + "loss": 1.4818, + "mean_token_accuracy": 0.6213598748048147, + "num_tokens": 15757784.0, + "step": 95 + }, + { + "entropy": 1.9804232120513916, + "epoch": 0.010546263491801927, + "grad_norm": 1.3886477947235107, + "learning_rate": 3.4734917733089586e-06, + "loss": 1.5319, + "mean_token_accuracy": 0.625564381480217, + "num_tokens": 15919103.0, + "step": 96 + }, + { + "entropy": 1.8893166085084279, + "epoch": 0.010656120403174865, + "grad_norm": 2.0127227306365967, + "learning_rate": 3.510054844606947e-06, + "loss": 1.4025, + "mean_token_accuracy": 0.652299756805102, + "num_tokens": 16065462.0, + "step": 97 + }, + { + "entropy": 2.0099782148996987, + "epoch": 0.010765977314547802, + "grad_norm": 1.174246072769165, + "learning_rate": 3.5466179159049365e-06, + "loss": 1.5865, + "mean_token_accuracy": 0.6206269313891729, + "num_tokens": 16230912.0, + "step": 98 + }, + { + "entropy": 1.9607905944188435, + "epoch": 0.010875834225920738, + "grad_norm": 1.2836897373199463, + "learning_rate": 3.5831809872029254e-06, + "loss": 1.5893, + "mean_token_accuracy": 0.6317617396513621, + "num_tokens": 16401514.0, + "step": 99 + }, + { + "entropy": 1.9421872198581696, + "epoch": 0.010985691137293675, + "grad_norm": 1.39162015914917, + "learning_rate": 3.6197440585009143e-06, + "loss": 1.5033, + "mean_token_accuracy": 0.6282460689544678, + "num_tokens": 16572187.0, + "step": 100 + }, + { + "entropy": 1.9639080266157787, + "epoch": 0.011095548048666611, + "grad_norm": 1.5374300479888916, + "learning_rate": 3.6563071297989032e-06, + "loss": 1.5919, + "mean_token_accuracy": 0.6466963390509287, + "num_tokens": 16722551.0, + "step": 101 + }, + { + "entropy": 1.951703170935313, + "epoch": 0.011205404960039548, + "grad_norm": 1.8960427045822144, + "learning_rate": 3.6928702010968926e-06, + "loss": 1.5075, + "mean_token_accuracy": 0.6368091404438019, + "num_tokens": 16898011.0, + "step": 102 + }, + { + "entropy": 1.9429233868916829, + "epoch": 0.011315261871412486, + "grad_norm": 1.956182599067688, + "learning_rate": 3.729433272394882e-06, + "loss": 1.5054, + "mean_token_accuracy": 0.6246836185455322, + "num_tokens": 17058898.0, + "step": 103 + }, + { + "entropy": 1.873151530822118, + "epoch": 0.011425118782785421, + "grad_norm": 2.225306510925293, + "learning_rate": 3.7659963436928704e-06, + "loss": 1.4609, + "mean_token_accuracy": 0.6369840502738953, + "num_tokens": 17181314.0, + "step": 104 + }, + { + "entropy": 2.0154978732268014, + "epoch": 0.011534975694158359, + "grad_norm": 1.3400015830993652, + "learning_rate": 3.8025594149908597e-06, + "loss": 1.5389, + "mean_token_accuracy": 0.6156003673871359, + "num_tokens": 17384388.0, + "step": 105 + }, + { + "entropy": 1.8415813446044922, + "epoch": 0.011644832605531296, + "grad_norm": 1.7141013145446777, + "learning_rate": 3.839122486288849e-06, + "loss": 1.4315, + "mean_token_accuracy": 0.642757977048556, + "num_tokens": 17534920.0, + "step": 106 + }, + { + "entropy": 1.8614780008792877, + "epoch": 0.011754689516904232, + "grad_norm": 0.8813568949699402, + "learning_rate": 3.875685557586837e-06, + "loss": 1.5428, + "mean_token_accuracy": 0.6348659793535868, + "num_tokens": 17788973.0, + "step": 107 + }, + { + "entropy": 1.9813534617424011, + "epoch": 0.011864546428277169, + "grad_norm": 1.8785918951034546, + "learning_rate": 3.912248628884827e-06, + "loss": 1.5586, + "mean_token_accuracy": 0.6213805874188741, + "num_tokens": 17924037.0, + "step": 108 + }, + { + "entropy": 1.981442888577779, + "epoch": 0.011974403339650106, + "grad_norm": 1.5279945135116577, + "learning_rate": 3.948811700182816e-06, + "loss": 1.5167, + "mean_token_accuracy": 0.6179635375738144, + "num_tokens": 18063221.0, + "step": 109 + }, + { + "entropy": 1.9768773019313812, + "epoch": 0.012084260251023042, + "grad_norm": 0.9247986078262329, + "learning_rate": 3.985374771480805e-06, + "loss": 1.5735, + "mean_token_accuracy": 0.6179714898268381, + "num_tokens": 18322095.0, + "step": 110 + }, + { + "entropy": 1.8607787589232128, + "epoch": 0.01219411716239598, + "grad_norm": 1.2437028884887695, + "learning_rate": 4.021937842778794e-06, + "loss": 1.4921, + "mean_token_accuracy": 0.6286398619413376, + "num_tokens": 18535525.0, + "step": 111 + }, + { + "entropy": 1.913169115781784, + "epoch": 0.012303974073768917, + "grad_norm": 1.9801437854766846, + "learning_rate": 4.058500914076783e-06, + "loss": 1.4637, + "mean_token_accuracy": 0.6328051636616389, + "num_tokens": 18696140.0, + "step": 112 + }, + { + "entropy": 2.0254646937052407, + "epoch": 0.012413830985141852, + "grad_norm": 1.2046679258346558, + "learning_rate": 4.095063985374772e-06, + "loss": 1.5517, + "mean_token_accuracy": 0.6129643271366755, + "num_tokens": 18877101.0, + "step": 113 + }, + { + "entropy": 1.8931627968947093, + "epoch": 0.01252368789651479, + "grad_norm": 1.4202874898910522, + "learning_rate": 4.1316270566727604e-06, + "loss": 1.5459, + "mean_token_accuracy": 0.6342976987361908, + "num_tokens": 19060103.0, + "step": 114 + }, + { + "entropy": 1.9346506992975872, + "epoch": 0.012633544807887725, + "grad_norm": 1.8455705642700195, + "learning_rate": 4.16819012797075e-06, + "loss": 1.4788, + "mean_token_accuracy": 0.635664368669192, + "num_tokens": 19193959.0, + "step": 115 + }, + { + "entropy": 1.891279657681783, + "epoch": 0.012743401719260663, + "grad_norm": 1.2223505973815918, + "learning_rate": 4.204753199268739e-06, + "loss": 1.581, + "mean_token_accuracy": 0.6405478020509084, + "num_tokens": 19369545.0, + "step": 116 + }, + { + "entropy": 1.856790542602539, + "epoch": 0.0128532586306336, + "grad_norm": 2.1477673053741455, + "learning_rate": 4.241316270566728e-06, + "loss": 1.5261, + "mean_token_accuracy": 0.6329584916432699, + "num_tokens": 19520434.0, + "step": 117 + }, + { + "entropy": 1.9323392510414124, + "epoch": 0.012963115542006536, + "grad_norm": 1.760062336921692, + "learning_rate": 4.277879341864717e-06, + "loss": 1.5352, + "mean_token_accuracy": 0.6246002415815989, + "num_tokens": 19706187.0, + "step": 118 + }, + { + "entropy": 1.792329490184784, + "epoch": 0.013072972453379473, + "grad_norm": 1.7655450105667114, + "learning_rate": 4.314442413162706e-06, + "loss": 1.5414, + "mean_token_accuracy": 0.6422147999207178, + "num_tokens": 19895609.0, + "step": 119 + }, + { + "entropy": 1.9673140943050385, + "epoch": 0.01318282936475241, + "grad_norm": 1.106261968612671, + "learning_rate": 4.351005484460696e-06, + "loss": 1.5714, + "mean_token_accuracy": 0.6150963505109152, + "num_tokens": 20129061.0, + "step": 120 + }, + { + "entropy": 1.9717350602149963, + "epoch": 0.013292686276125346, + "grad_norm": 1.8436874151229858, + "learning_rate": 4.387568555758684e-06, + "loss": 1.553, + "mean_token_accuracy": 0.617359588543574, + "num_tokens": 20323094.0, + "step": 121 + }, + { + "entropy": 1.9026523629824321, + "epoch": 0.013402543187498284, + "grad_norm": 1.9257417917251587, + "learning_rate": 4.4241316270566735e-06, + "loss": 1.6199, + "mean_token_accuracy": 0.6336856335401535, + "num_tokens": 20478148.0, + "step": 122 + }, + { + "entropy": 1.9043993254502614, + "epoch": 0.013512400098871221, + "grad_norm": 1.8216344118118286, + "learning_rate": 4.460694698354662e-06, + "loss": 1.4728, + "mean_token_accuracy": 0.6363042940696081, + "num_tokens": 20659428.0, + "step": 123 + }, + { + "entropy": 1.923029104868571, + "epoch": 0.013622257010244157, + "grad_norm": 1.6235153675079346, + "learning_rate": 4.497257769652651e-06, + "loss": 1.4689, + "mean_token_accuracy": 0.6350155224402746, + "num_tokens": 20848469.0, + "step": 124 + }, + { + "entropy": 1.9097663859526317, + "epoch": 0.013732113921617094, + "grad_norm": 2.066504955291748, + "learning_rate": 4.53382084095064e-06, + "loss": 1.6551, + "mean_token_accuracy": 0.6251655717690786, + "num_tokens": 20990401.0, + "step": 125 + }, + { + "entropy": 1.8578063448270161, + "epoch": 0.01384197083299003, + "grad_norm": 2.0339179039001465, + "learning_rate": 4.570383912248629e-06, + "loss": 1.4402, + "mean_token_accuracy": 0.6496027906735738, + "num_tokens": 21126262.0, + "step": 126 + }, + { + "entropy": 1.8716703255971272, + "epoch": 0.013951827744362967, + "grad_norm": 1.5795789957046509, + "learning_rate": 4.606946983546619e-06, + "loss": 1.4847, + "mean_token_accuracy": 0.6409290333588918, + "num_tokens": 21277528.0, + "step": 127 + }, + { + "entropy": 1.960438460111618, + "epoch": 0.014061684655735904, + "grad_norm": 1.5360902547836304, + "learning_rate": 4.643510054844607e-06, + "loss": 1.5797, + "mean_token_accuracy": 0.6345019638538361, + "num_tokens": 21444158.0, + "step": 128 + }, + { + "entropy": 1.940459320942561, + "epoch": 0.01417154156710884, + "grad_norm": 1.4799344539642334, + "learning_rate": 4.680073126142597e-06, + "loss": 1.6964, + "mean_token_accuracy": 0.6091892321904501, + "num_tokens": 21610345.0, + "step": 129 + }, + { + "entropy": 1.9162492255369823, + "epoch": 0.014281398478481777, + "grad_norm": 1.6671146154403687, + "learning_rate": 4.716636197440586e-06, + "loss": 1.6158, + "mean_token_accuracy": 0.616663247346878, + "num_tokens": 21774213.0, + "step": 130 + }, + { + "entropy": 1.9880765875180562, + "epoch": 0.014391255389854715, + "grad_norm": 2.272390604019165, + "learning_rate": 4.753199268738575e-06, + "loss": 1.646, + "mean_token_accuracy": 0.6181524445613226, + "num_tokens": 21905255.0, + "step": 131 + }, + { + "entropy": 1.8855270047982533, + "epoch": 0.01450111230122765, + "grad_norm": 1.2006255388259888, + "learning_rate": 4.7897623400365635e-06, + "loss": 1.6963, + "mean_token_accuracy": 0.6048007061084112, + "num_tokens": 22098818.0, + "step": 132 + }, + { + "entropy": 1.9694058795770009, + "epoch": 0.014610969212600588, + "grad_norm": 1.6814290285110474, + "learning_rate": 4.826325411334552e-06, + "loss": 1.5216, + "mean_token_accuracy": 0.6235235830148061, + "num_tokens": 22262213.0, + "step": 133 + }, + { + "entropy": 1.8912192583084106, + "epoch": 0.014720826123973525, + "grad_norm": 2.5680627822875977, + "learning_rate": 4.862888482632542e-06, + "loss": 1.4485, + "mean_token_accuracy": 0.639568880200386, + "num_tokens": 22415716.0, + "step": 134 + }, + { + "entropy": 1.8896643221378326, + "epoch": 0.014830683035346461, + "grad_norm": 2.191824436187744, + "learning_rate": 4.89945155393053e-06, + "loss": 1.6072, + "mean_token_accuracy": 0.6254559010267258, + "num_tokens": 22554208.0, + "step": 135 + }, + { + "entropy": 1.838125040133794, + "epoch": 0.014940539946719398, + "grad_norm": 2.2072277069091797, + "learning_rate": 4.93601462522852e-06, + "loss": 1.4288, + "mean_token_accuracy": 0.6442883511384329, + "num_tokens": 22686348.0, + "step": 136 + }, + { + "entropy": 1.9419346153736115, + "epoch": 0.015050396858092334, + "grad_norm": 1.4282070398330688, + "learning_rate": 4.972577696526509e-06, + "loss": 1.588, + "mean_token_accuracy": 0.6240701427062353, + "num_tokens": 22914244.0, + "step": 137 + }, + { + "entropy": 1.9297908544540405, + "epoch": 0.015160253769465271, + "grad_norm": 1.7204630374908447, + "learning_rate": 5.009140767824498e-06, + "loss": 1.53, + "mean_token_accuracy": 0.6246543924013773, + "num_tokens": 23064559.0, + "step": 138 + }, + { + "entropy": 1.89528426527977, + "epoch": 0.015270110680838209, + "grad_norm": 1.2303673028945923, + "learning_rate": 5.045703839122487e-06, + "loss": 1.559, + "mean_token_accuracy": 0.6299286683400472, + "num_tokens": 23247723.0, + "step": 139 + }, + { + "entropy": 1.964120090007782, + "epoch": 0.015379967592211144, + "grad_norm": 1.3174797296524048, + "learning_rate": 5.082266910420476e-06, + "loss": 1.578, + "mean_token_accuracy": 0.6308384935061137, + "num_tokens": 23430389.0, + "step": 140 + }, + { + "entropy": 1.9435697793960571, + "epoch": 0.015489824503584082, + "grad_norm": 1.4430499076843262, + "learning_rate": 5.118829981718465e-06, + "loss": 1.4911, + "mean_token_accuracy": 0.6396051446596781, + "num_tokens": 23586104.0, + "step": 141 + }, + { + "entropy": 1.9481945832570393, + "epoch": 0.015599681414957019, + "grad_norm": 1.983757734298706, + "learning_rate": 5.155393053016454e-06, + "loss": 1.5476, + "mean_token_accuracy": 0.6408113439877828, + "num_tokens": 23735997.0, + "step": 142 + }, + { + "entropy": 1.9852807819843292, + "epoch": 0.015709538326329955, + "grad_norm": 1.51121985912323, + "learning_rate": 5.1919561243144424e-06, + "loss": 1.7149, + "mean_token_accuracy": 0.6295592884222666, + "num_tokens": 23906403.0, + "step": 143 + }, + { + "entropy": 1.9033388594786327, + "epoch": 0.015819395237702892, + "grad_norm": 1.7496778964996338, + "learning_rate": 5.228519195612431e-06, + "loss": 1.5541, + "mean_token_accuracy": 0.6278304755687714, + "num_tokens": 24082125.0, + "step": 144 + }, + { + "entropy": 1.929619828859965, + "epoch": 0.01592925214907583, + "grad_norm": 1.250235676765442, + "learning_rate": 5.26508226691042e-06, + "loss": 1.5135, + "mean_token_accuracy": 0.6288062930107117, + "num_tokens": 24305628.0, + "step": 145 + }, + { + "entropy": 2.0030274192492166, + "epoch": 0.016039109060448767, + "grad_norm": 1.5068280696868896, + "learning_rate": 5.30164533820841e-06, + "loss": 1.4626, + "mean_token_accuracy": 0.6244446535905203, + "num_tokens": 24472766.0, + "step": 146 + }, + { + "entropy": 1.9357780913511913, + "epoch": 0.0161489659718217, + "grad_norm": 1.569165825843811, + "learning_rate": 5.338208409506399e-06, + "loss": 1.4038, + "mean_token_accuracy": 0.6452462822198868, + "num_tokens": 24617139.0, + "step": 147 + }, + { + "entropy": 1.9441987375418346, + "epoch": 0.016258822883194638, + "grad_norm": 1.4736301898956299, + "learning_rate": 5.374771480804388e-06, + "loss": 1.5742, + "mean_token_accuracy": 0.6242514302333196, + "num_tokens": 24825673.0, + "step": 148 + }, + { + "entropy": 1.9694486260414124, + "epoch": 0.016368679794567575, + "grad_norm": 1.5289890766143799, + "learning_rate": 5.411334552102378e-06, + "loss": 1.4772, + "mean_token_accuracy": 0.6373162666956583, + "num_tokens": 24991928.0, + "step": 149 + }, + { + "entropy": 1.886027862628301, + "epoch": 0.016478536705940513, + "grad_norm": 1.8170188665390015, + "learning_rate": 5.447897623400366e-06, + "loss": 1.5112, + "mean_token_accuracy": 0.6386896967887878, + "num_tokens": 25126815.0, + "step": 150 + }, + { + "entropy": 2.017523467540741, + "epoch": 0.01658839361731345, + "grad_norm": 1.759878158569336, + "learning_rate": 5.484460694698355e-06, + "loss": 1.7175, + "mean_token_accuracy": 0.6125520120064417, + "num_tokens": 25317846.0, + "step": 151 + }, + { + "entropy": 1.9565064509709675, + "epoch": 0.016698250528686388, + "grad_norm": 1.3674354553222656, + "learning_rate": 5.5210237659963435e-06, + "loss": 1.5125, + "mean_token_accuracy": 0.6129873792330424, + "num_tokens": 25517208.0, + "step": 152 + }, + { + "entropy": 1.8165283501148224, + "epoch": 0.01680810744005932, + "grad_norm": 1.8085861206054688, + "learning_rate": 5.557586837294333e-06, + "loss": 1.3189, + "mean_token_accuracy": 0.6418418337901434, + "num_tokens": 25710724.0, + "step": 153 + }, + { + "entropy": 1.9543904463450115, + "epoch": 0.01691796435143226, + "grad_norm": 2.4002819061279297, + "learning_rate": 5.594149908592322e-06, + "loss": 1.5342, + "mean_token_accuracy": 0.625735859076182, + "num_tokens": 25863294.0, + "step": 154 + }, + { + "entropy": 1.9318216741085052, + "epoch": 0.017027821262805196, + "grad_norm": 1.7552255392074585, + "learning_rate": 5.630712979890311e-06, + "loss": 1.4478, + "mean_token_accuracy": 0.6387412895758947, + "num_tokens": 25995035.0, + "step": 155 + }, + { + "entropy": 1.843320260445277, + "epoch": 0.017137678174178134, + "grad_norm": 1.9175851345062256, + "learning_rate": 5.667276051188301e-06, + "loss": 1.4263, + "mean_token_accuracy": 0.6444782565037409, + "num_tokens": 26181344.0, + "step": 156 + }, + { + "entropy": 1.9701976378758748, + "epoch": 0.01724753508555107, + "grad_norm": 1.541599988937378, + "learning_rate": 5.703839122486289e-06, + "loss": 1.5272, + "mean_token_accuracy": 0.642794132232666, + "num_tokens": 26359706.0, + "step": 157 + }, + { + "entropy": 1.9773990114529927, + "epoch": 0.017357391996924005, + "grad_norm": 2.1367106437683105, + "learning_rate": 5.740402193784278e-06, + "loss": 1.5112, + "mean_token_accuracy": 0.636732429265976, + "num_tokens": 26479942.0, + "step": 158 + }, + { + "entropy": 1.9307551781336467, + "epoch": 0.017467248908296942, + "grad_norm": 2.5110087394714355, + "learning_rate": 5.776965265082267e-06, + "loss": 1.4965, + "mean_token_accuracy": 0.628907784819603, + "num_tokens": 26595408.0, + "step": 159 + }, + { + "entropy": 1.8777441680431366, + "epoch": 0.01757710581966988, + "grad_norm": 1.30020272731781, + "learning_rate": 5.813528336380257e-06, + "loss": 1.5253, + "mean_token_accuracy": 0.6290498375892639, + "num_tokens": 26817903.0, + "step": 160 + }, + { + "entropy": 1.9326928953329723, + "epoch": 0.017686962731042817, + "grad_norm": 1.6048803329467773, + "learning_rate": 5.8500914076782455e-06, + "loss": 1.5385, + "mean_token_accuracy": 0.632832944393158, + "num_tokens": 26960043.0, + "step": 161 + }, + { + "entropy": 2.001479814449946, + "epoch": 0.017796819642415754, + "grad_norm": 1.7055351734161377, + "learning_rate": 5.886654478976234e-06, + "loss": 1.4679, + "mean_token_accuracy": 0.6328243414560953, + "num_tokens": 27105562.0, + "step": 162 + }, + { + "entropy": 1.9939979513486226, + "epoch": 0.017906676553788692, + "grad_norm": 1.595533847808838, + "learning_rate": 5.923217550274224e-06, + "loss": 1.5478, + "mean_token_accuracy": 0.6230746358633041, + "num_tokens": 27254524.0, + "step": 163 + }, + { + "entropy": 1.9124678770701091, + "epoch": 0.018016533465161626, + "grad_norm": 1.401327133178711, + "learning_rate": 5.959780621572212e-06, + "loss": 1.6268, + "mean_token_accuracy": 0.6118254562218984, + "num_tokens": 27451743.0, + "step": 164 + }, + { + "entropy": 1.8883071442445118, + "epoch": 0.018126390376534563, + "grad_norm": 1.3325417041778564, + "learning_rate": 5.996343692870201e-06, + "loss": 1.5537, + "mean_token_accuracy": 0.6336751828591028, + "num_tokens": 27609520.0, + "step": 165 + }, + { + "entropy": 1.8839130004247029, + "epoch": 0.0182362472879075, + "grad_norm": 1.713529348373413, + "learning_rate": 6.03290676416819e-06, + "loss": 1.529, + "mean_token_accuracy": 0.6286770900090536, + "num_tokens": 27781402.0, + "step": 166 + }, + { + "entropy": 2.000154842933019, + "epoch": 0.018346104199280438, + "grad_norm": 1.4660075902938843, + "learning_rate": 6.06946983546618e-06, + "loss": 1.6884, + "mean_token_accuracy": 0.6122722874085108, + "num_tokens": 27947872.0, + "step": 167 + }, + { + "entropy": 1.9248465200265248, + "epoch": 0.018455961110653375, + "grad_norm": 1.3339463472366333, + "learning_rate": 6.106032906764169e-06, + "loss": 1.4863, + "mean_token_accuracy": 0.6232618043820063, + "num_tokens": 28170975.0, + "step": 168 + }, + { + "entropy": 1.895066907008489, + "epoch": 0.01856581802202631, + "grad_norm": 0.8725059032440186, + "learning_rate": 6.142595978062158e-06, + "loss": 1.5651, + "mean_token_accuracy": 0.6305927385886511, + "num_tokens": 28384244.0, + "step": 169 + }, + { + "entropy": 1.908642550309499, + "epoch": 0.018675674933399247, + "grad_norm": 1.8071403503417969, + "learning_rate": 6.1791590493601475e-06, + "loss": 1.4144, + "mean_token_accuracy": 0.6478618135054907, + "num_tokens": 28540304.0, + "step": 170 + }, + { + "entropy": 1.8365615010261536, + "epoch": 0.018785531844772184, + "grad_norm": 1.5986486673355103, + "learning_rate": 6.2157221206581355e-06, + "loss": 1.4964, + "mean_token_accuracy": 0.6445286770661672, + "num_tokens": 28693212.0, + "step": 171 + }, + { + "entropy": 1.8890428642431896, + "epoch": 0.01889538875614512, + "grad_norm": 1.403101921081543, + "learning_rate": 6.2522851919561244e-06, + "loss": 1.4468, + "mean_token_accuracy": 0.6386788686116537, + "num_tokens": 28850662.0, + "step": 172 + }, + { + "entropy": 1.9007167915503185, + "epoch": 0.01900524566751806, + "grad_norm": 1.734114170074463, + "learning_rate": 6.288848263254113e-06, + "loss": 1.4575, + "mean_token_accuracy": 0.6430951108535131, + "num_tokens": 29008183.0, + "step": 173 + }, + { + "entropy": 1.9034869869550068, + "epoch": 0.019115102578890996, + "grad_norm": 1.3347978591918945, + "learning_rate": 6.325411334552103e-06, + "loss": 1.4581, + "mean_token_accuracy": 0.636214479804039, + "num_tokens": 29178708.0, + "step": 174 + }, + { + "entropy": 1.9231548706690471, + "epoch": 0.01922495949026393, + "grad_norm": 0.9329301714897156, + "learning_rate": 6.361974405850092e-06, + "loss": 1.433, + "mean_token_accuracy": 0.6341615468263626, + "num_tokens": 29404955.0, + "step": 175 + }, + { + "entropy": 1.9202434321244557, + "epoch": 0.019334816401636867, + "grad_norm": 1.496565341949463, + "learning_rate": 6.398537477148081e-06, + "loss": 1.4469, + "mean_token_accuracy": 0.6378689457972845, + "num_tokens": 29591016.0, + "step": 176 + }, + { + "entropy": 1.9373224675655365, + "epoch": 0.019444673313009805, + "grad_norm": 1.8180593252182007, + "learning_rate": 6.435100548446071e-06, + "loss": 1.4546, + "mean_token_accuracy": 0.6288415739933649, + "num_tokens": 29730607.0, + "step": 177 + }, + { + "entropy": 1.8911834458510082, + "epoch": 0.019554530224382742, + "grad_norm": 1.377733588218689, + "learning_rate": 6.471663619744059e-06, + "loss": 1.4094, + "mean_token_accuracy": 0.6404121816158295, + "num_tokens": 29945024.0, + "step": 178 + }, + { + "entropy": 1.9313226739565532, + "epoch": 0.01966438713575568, + "grad_norm": 1.3180339336395264, + "learning_rate": 6.508226691042048e-06, + "loss": 1.4791, + "mean_token_accuracy": 0.6364747583866119, + "num_tokens": 30130011.0, + "step": 179 + }, + { + "entropy": 1.8546662032604218, + "epoch": 0.019774244047128613, + "grad_norm": 0.9060352444648743, + "learning_rate": 6.544789762340037e-06, + "loss": 1.5613, + "mean_token_accuracy": 0.6342605948448181, + "num_tokens": 30414308.0, + "step": 180 + }, + { + "entropy": 1.9606173137823741, + "epoch": 0.01988410095850155, + "grad_norm": 1.4632275104522705, + "learning_rate": 6.581352833638026e-06, + "loss": 1.5579, + "mean_token_accuracy": 0.618272011478742, + "num_tokens": 30618377.0, + "step": 181 + }, + { + "entropy": 1.8522493441899617, + "epoch": 0.019993957869874488, + "grad_norm": 1.4936124086380005, + "learning_rate": 6.617915904936015e-06, + "loss": 1.4107, + "mean_token_accuracy": 0.6411967029174169, + "num_tokens": 30836318.0, + "step": 182 + }, + { + "entropy": 2.0339955588181815, + "epoch": 0.020103814781247425, + "grad_norm": 1.1835054159164429, + "learning_rate": 6.654478976234004e-06, + "loss": 1.5639, + "mean_token_accuracy": 0.6246629556020101, + "num_tokens": 31012714.0, + "step": 183 + }, + { + "entropy": 1.972231497367223, + "epoch": 0.020213671692620363, + "grad_norm": 2.2008109092712402, + "learning_rate": 6.691042047531994e-06, + "loss": 1.4934, + "mean_token_accuracy": 0.6375860174496969, + "num_tokens": 31125259.0, + "step": 184 + }, + { + "entropy": 1.9959478378295898, + "epoch": 0.0203235286039933, + "grad_norm": 1.3271251916885376, + "learning_rate": 6.727605118829982e-06, + "loss": 1.5039, + "mean_token_accuracy": 0.6275109102328619, + "num_tokens": 31303032.0, + "step": 185 + }, + { + "entropy": 1.9124859770139058, + "epoch": 0.020433385515366234, + "grad_norm": 1.273916244506836, + "learning_rate": 6.764168190127971e-06, + "loss": 1.5072, + "mean_token_accuracy": 0.6474013924598694, + "num_tokens": 31472688.0, + "step": 186 + }, + { + "entropy": 1.94366854429245, + "epoch": 0.02054324242673917, + "grad_norm": 1.5350209474563599, + "learning_rate": 6.80073126142596e-06, + "loss": 1.5753, + "mean_token_accuracy": 0.6296733965476354, + "num_tokens": 31603992.0, + "step": 187 + }, + { + "entropy": 1.830802450577418, + "epoch": 0.02065309933811211, + "grad_norm": 1.5503315925598145, + "learning_rate": 6.83729433272395e-06, + "loss": 1.5068, + "mean_token_accuracy": 0.6352782646814982, + "num_tokens": 31754236.0, + "step": 188 + }, + { + "entropy": 1.961773047844569, + "epoch": 0.020762956249485046, + "grad_norm": 1.3327734470367432, + "learning_rate": 6.873857404021939e-06, + "loss": 1.6572, + "mean_token_accuracy": 0.6188698361317316, + "num_tokens": 31951147.0, + "step": 189 + }, + { + "entropy": 1.9484667479991913, + "epoch": 0.020872813160857984, + "grad_norm": 1.9495567083358765, + "learning_rate": 6.9104204753199275e-06, + "loss": 1.4939, + "mean_token_accuracy": 0.6419304311275482, + "num_tokens": 32069372.0, + "step": 190 + }, + { + "entropy": 1.952804942925771, + "epoch": 0.020982670072230918, + "grad_norm": 2.0952329635620117, + "learning_rate": 6.946983546617917e-06, + "loss": 1.466, + "mean_token_accuracy": 0.6325685183207194, + "num_tokens": 32247672.0, + "step": 191 + }, + { + "entropy": 1.8967472811539967, + "epoch": 0.021092526983603855, + "grad_norm": 1.9613308906555176, + "learning_rate": 6.983546617915905e-06, + "loss": 1.4358, + "mean_token_accuracy": 0.6374901284774145, + "num_tokens": 32400081.0, + "step": 192 + }, + { + "entropy": 1.8581411341826122, + "epoch": 0.021202383894976792, + "grad_norm": 1.4401475191116333, + "learning_rate": 7.020109689213894e-06, + "loss": 1.4406, + "mean_token_accuracy": 0.6475908011198044, + "num_tokens": 32612337.0, + "step": 193 + }, + { + "entropy": 1.9438115656375885, + "epoch": 0.02131224080634973, + "grad_norm": 1.6511321067810059, + "learning_rate": 7.056672760511883e-06, + "loss": 1.569, + "mean_token_accuracy": 0.6184766987959543, + "num_tokens": 32757705.0, + "step": 194 + }, + { + "entropy": 1.8474779923756917, + "epoch": 0.021422097717722667, + "grad_norm": 1.7577476501464844, + "learning_rate": 7.093235831809873e-06, + "loss": 1.4082, + "mean_token_accuracy": 0.6572687774896622, + "num_tokens": 32909533.0, + "step": 195 + }, + { + "entropy": 1.9480952123800914, + "epoch": 0.021531954629095604, + "grad_norm": 1.2074416875839233, + "learning_rate": 7.129798903107862e-06, + "loss": 1.4497, + "mean_token_accuracy": 0.6373476584752401, + "num_tokens": 33091590.0, + "step": 196 + }, + { + "entropy": 1.8216406504313152, + "epoch": 0.02164181154046854, + "grad_norm": 1.491611361503601, + "learning_rate": 7.166361974405851e-06, + "loss": 1.419, + "mean_token_accuracy": 0.6391265342632929, + "num_tokens": 33261986.0, + "step": 197 + }, + { + "entropy": 1.9061803619066875, + "epoch": 0.021751668451841476, + "grad_norm": 1.422453761100769, + "learning_rate": 7.2029250457038405e-06, + "loss": 1.5071, + "mean_token_accuracy": 0.6227595210075378, + "num_tokens": 33436559.0, + "step": 198 + }, + { + "entropy": 1.9401950438817341, + "epoch": 0.021861525363214413, + "grad_norm": 1.4699177742004395, + "learning_rate": 7.239488117001829e-06, + "loss": 1.5314, + "mean_token_accuracy": 0.6238148510456085, + "num_tokens": 33610708.0, + "step": 199 + }, + { + "entropy": 1.9451914032300313, + "epoch": 0.02197138227458735, + "grad_norm": 1.5435770750045776, + "learning_rate": 7.2760511882998175e-06, + "loss": 1.7341, + "mean_token_accuracy": 0.6216254606842995, + "num_tokens": 33789223.0, + "step": 200 + }, + { + "entropy": 1.9354910055796306, + "epoch": 0.022081239185960288, + "grad_norm": 1.6288329362869263, + "learning_rate": 7.3126142595978065e-06, + "loss": 1.4906, + "mean_token_accuracy": 0.6289151956637701, + "num_tokens": 33926328.0, + "step": 201 + }, + { + "entropy": 1.920799712340037, + "epoch": 0.022191096097333222, + "grad_norm": 2.159276247024536, + "learning_rate": 7.349177330895796e-06, + "loss": 1.4639, + "mean_token_accuracy": 0.6422193894783655, + "num_tokens": 34086922.0, + "step": 202 + }, + { + "entropy": 1.8773599565029144, + "epoch": 0.02230095300870616, + "grad_norm": 1.5784872770309448, + "learning_rate": 7.385740402193785e-06, + "loss": 1.4203, + "mean_token_accuracy": 0.6452515174945196, + "num_tokens": 34234623.0, + "step": 203 + }, + { + "entropy": 1.9058987200260162, + "epoch": 0.022410809920079097, + "grad_norm": 1.9286620616912842, + "learning_rate": 7.422303473491774e-06, + "loss": 1.4685, + "mean_token_accuracy": 0.6433494488398234, + "num_tokens": 34403187.0, + "step": 204 + }, + { + "entropy": 1.9196257293224335, + "epoch": 0.022520666831452034, + "grad_norm": 1.517842411994934, + "learning_rate": 7.458866544789764e-06, + "loss": 1.4634, + "mean_token_accuracy": 0.6412399162848791, + "num_tokens": 34547042.0, + "step": 205 + }, + { + "entropy": 1.9245548446973164, + "epoch": 0.02263052374282497, + "grad_norm": 1.4361844062805176, + "learning_rate": 7.495429616087752e-06, + "loss": 1.5076, + "mean_token_accuracy": 0.6369704306125641, + "num_tokens": 34719845.0, + "step": 206 + }, + { + "entropy": 1.9506233930587769, + "epoch": 0.02274038065419791, + "grad_norm": 2.3668041229248047, + "learning_rate": 7.531992687385741e-06, + "loss": 1.4556, + "mean_token_accuracy": 0.6397630920012792, + "num_tokens": 34826986.0, + "step": 207 + }, + { + "entropy": 1.915660818417867, + "epoch": 0.022850237565570843, + "grad_norm": 1.7417556047439575, + "learning_rate": 7.56855575868373e-06, + "loss": 1.4855, + "mean_token_accuracy": 0.628084714214007, + "num_tokens": 34995629.0, + "step": 208 + }, + { + "entropy": 1.9720592200756073, + "epoch": 0.02296009447694378, + "grad_norm": 1.7598878145217896, + "learning_rate": 7.6051188299817195e-06, + "loss": 1.6025, + "mean_token_accuracy": 0.6329106787840525, + "num_tokens": 35109490.0, + "step": 209 + }, + { + "entropy": 1.9277808268864949, + "epoch": 0.023069951388316717, + "grad_norm": 1.1871633529663086, + "learning_rate": 7.641681901279708e-06, + "loss": 1.5092, + "mean_token_accuracy": 0.6277731756369272, + "num_tokens": 35273576.0, + "step": 210 + }, + { + "entropy": 1.9870645701885223, + "epoch": 0.023179808299689655, + "grad_norm": 1.901222825050354, + "learning_rate": 7.678244972577698e-06, + "loss": 1.6778, + "mean_token_accuracy": 0.6138087809085846, + "num_tokens": 35420120.0, + "step": 211 + }, + { + "entropy": 1.8992568055788677, + "epoch": 0.023289665211062592, + "grad_norm": 1.1016038656234741, + "learning_rate": 7.714808043875686e-06, + "loss": 1.4592, + "mean_token_accuracy": 0.6364717036485672, + "num_tokens": 35602325.0, + "step": 212 + }, + { + "entropy": 1.888885885477066, + "epoch": 0.02339952212243553, + "grad_norm": 2.0824167728424072, + "learning_rate": 7.751371115173674e-06, + "loss": 1.5275, + "mean_token_accuracy": 0.6334254344304403, + "num_tokens": 35733356.0, + "step": 213 + }, + { + "entropy": 1.982912798722585, + "epoch": 0.023509379033808463, + "grad_norm": 1.8428221940994263, + "learning_rate": 7.787934186471664e-06, + "loss": 1.6603, + "mean_token_accuracy": 0.6229775846004486, + "num_tokens": 35880300.0, + "step": 214 + }, + { + "entropy": 1.9338585535685222, + "epoch": 0.0236192359451814, + "grad_norm": 1.234574794769287, + "learning_rate": 7.824497257769654e-06, + "loss": 1.5378, + "mean_token_accuracy": 0.6240969995657603, + "num_tokens": 36069653.0, + "step": 215 + }, + { + "entropy": 1.9078516761461894, + "epoch": 0.023729092856554338, + "grad_norm": 1.3974380493164062, + "learning_rate": 7.861060329067642e-06, + "loss": 1.4083, + "mean_token_accuracy": 0.6485424588123957, + "num_tokens": 36230390.0, + "step": 216 + }, + { + "entropy": 1.9500950674215953, + "epoch": 0.023838949767927275, + "grad_norm": 1.337957501411438, + "learning_rate": 7.897623400365632e-06, + "loss": 1.6635, + "mean_token_accuracy": 0.6214189380407333, + "num_tokens": 36418461.0, + "step": 217 + }, + { + "entropy": 1.8814655443032582, + "epoch": 0.023948806679300213, + "grad_norm": 0.930263876914978, + "learning_rate": 7.934186471663621e-06, + "loss": 1.4714, + "mean_token_accuracy": 0.6292876054843267, + "num_tokens": 36666565.0, + "step": 218 + }, + { + "entropy": 1.9561232924461365, + "epoch": 0.024058663590673147, + "grad_norm": 1.3758008480072021, + "learning_rate": 7.97074954296161e-06, + "loss": 1.4744, + "mean_token_accuracy": 0.6297362099091212, + "num_tokens": 36831778.0, + "step": 219 + }, + { + "entropy": 1.9421695868174236, + "epoch": 0.024168520502046084, + "grad_norm": 1.9885149002075195, + "learning_rate": 8.007312614259598e-06, + "loss": 1.4071, + "mean_token_accuracy": 0.6410440603892008, + "num_tokens": 36984298.0, + "step": 220 + }, + { + "entropy": 1.9113211333751678, + "epoch": 0.02427837741341902, + "grad_norm": 1.6628094911575317, + "learning_rate": 8.043875685557587e-06, + "loss": 1.3944, + "mean_token_accuracy": 0.6423366914192835, + "num_tokens": 37123033.0, + "step": 221 + }, + { + "entropy": 1.8789688448111217, + "epoch": 0.02438823432479196, + "grad_norm": 1.8397406339645386, + "learning_rate": 8.080438756855577e-06, + "loss": 1.3697, + "mean_token_accuracy": 0.6559399515390396, + "num_tokens": 37276948.0, + "step": 222 + }, + { + "entropy": 1.9408772091070812, + "epoch": 0.024498091236164896, + "grad_norm": 1.2659918069839478, + "learning_rate": 8.117001828153565e-06, + "loss": 1.6394, + "mean_token_accuracy": 0.6148115148146948, + "num_tokens": 37493584.0, + "step": 223 + }, + { + "entropy": 1.9383413990338643, + "epoch": 0.024607948147537834, + "grad_norm": 2.0002858638763428, + "learning_rate": 8.153564899451555e-06, + "loss": 1.4873, + "mean_token_accuracy": 0.6363619416952133, + "num_tokens": 37625713.0, + "step": 224 + }, + { + "entropy": 1.8870685597260792, + "epoch": 0.024717805058910768, + "grad_norm": 1.661469578742981, + "learning_rate": 8.190127970749545e-06, + "loss": 1.5554, + "mean_token_accuracy": 0.6477372944355011, + "num_tokens": 37796925.0, + "step": 225 + }, + { + "entropy": 1.9707025090853374, + "epoch": 0.024827661970283705, + "grad_norm": 1.3756967782974243, + "learning_rate": 8.226691042047533e-06, + "loss": 1.6058, + "mean_token_accuracy": 0.6214409867922465, + "num_tokens": 37958284.0, + "step": 226 + }, + { + "entropy": 1.9486571947733562, + "epoch": 0.024937518881656642, + "grad_norm": 1.891913652420044, + "learning_rate": 8.263254113345521e-06, + "loss": 1.548, + "mean_token_accuracy": 0.6239824940760931, + "num_tokens": 38121912.0, + "step": 227 + }, + { + "entropy": 1.9066686630249023, + "epoch": 0.02504737579302958, + "grad_norm": 1.301985263824463, + "learning_rate": 8.29981718464351e-06, + "loss": 1.4455, + "mean_token_accuracy": 0.6375877112150192, + "num_tokens": 38293286.0, + "step": 228 + }, + { + "entropy": 1.9013386964797974, + "epoch": 0.025157232704402517, + "grad_norm": 1.0296814441680908, + "learning_rate": 8.3363802559415e-06, + "loss": 1.5593, + "mean_token_accuracy": 0.6204620003700256, + "num_tokens": 38493428.0, + "step": 229 + }, + { + "entropy": 1.889691025018692, + "epoch": 0.02526708961577545, + "grad_norm": 1.2949299812316895, + "learning_rate": 8.372943327239488e-06, + "loss": 1.4609, + "mean_token_accuracy": 0.6394095073143641, + "num_tokens": 38689047.0, + "step": 230 + }, + { + "entropy": 1.9693353275458019, + "epoch": 0.02537694652714839, + "grad_norm": 1.41304349899292, + "learning_rate": 8.409506398537478e-06, + "loss": 1.6868, + "mean_token_accuracy": 0.6182350367307663, + "num_tokens": 38874286.0, + "step": 231 + }, + { + "entropy": 1.9255563914775848, + "epoch": 0.025486803438521326, + "grad_norm": 2.2932870388031006, + "learning_rate": 8.446069469835468e-06, + "loss": 1.4642, + "mean_token_accuracy": 0.6350040584802628, + "num_tokens": 39041995.0, + "step": 232 + }, + { + "entropy": 1.8631162444750469, + "epoch": 0.025596660349894263, + "grad_norm": 1.4570585489273071, + "learning_rate": 8.482632541133456e-06, + "loss": 1.4474, + "mean_token_accuracy": 0.6374172319968542, + "num_tokens": 39247147.0, + "step": 233 + }, + { + "entropy": 1.9238406717777252, + "epoch": 0.0257065172612672, + "grad_norm": 0.969900369644165, + "learning_rate": 8.519195612431444e-06, + "loss": 1.5364, + "mean_token_accuracy": 0.6210780193408331, + "num_tokens": 39442503.0, + "step": 234 + }, + { + "entropy": 1.830154150724411, + "epoch": 0.025816374172640138, + "grad_norm": 1.632808804512024, + "learning_rate": 8.555758683729434e-06, + "loss": 1.3361, + "mean_token_accuracy": 0.6548681904872259, + "num_tokens": 39608656.0, + "step": 235 + }, + { + "entropy": 1.8890958329041798, + "epoch": 0.025926231084013072, + "grad_norm": 1.6013661623001099, + "learning_rate": 8.592321755027424e-06, + "loss": 1.4588, + "mean_token_accuracy": 0.6372386415799459, + "num_tokens": 39745398.0, + "step": 236 + }, + { + "entropy": 1.9264869689941406, + "epoch": 0.02603608799538601, + "grad_norm": 1.3071633577346802, + "learning_rate": 8.628884826325412e-06, + "loss": 1.5205, + "mean_token_accuracy": 0.6315742234388987, + "num_tokens": 39932055.0, + "step": 237 + }, + { + "entropy": 1.8503678143024445, + "epoch": 0.026145944906758947, + "grad_norm": 1.1339292526245117, + "learning_rate": 8.665447897623402e-06, + "loss": 1.3504, + "mean_token_accuracy": 0.647294615705808, + "num_tokens": 40125022.0, + "step": 238 + }, + { + "entropy": 1.9106312990188599, + "epoch": 0.026255801818131884, + "grad_norm": 1.3184049129486084, + "learning_rate": 8.702010968921391e-06, + "loss": 1.5338, + "mean_token_accuracy": 0.6321128904819489, + "num_tokens": 40314336.0, + "step": 239 + }, + { + "entropy": 1.8953208327293396, + "epoch": 0.02636565872950482, + "grad_norm": 1.3068339824676514, + "learning_rate": 8.73857404021938e-06, + "loss": 1.5125, + "mean_token_accuracy": 0.6311918099721273, + "num_tokens": 40500962.0, + "step": 240 + }, + { + "entropy": 1.8495961129665375, + "epoch": 0.026475515640877755, + "grad_norm": 1.4714800119400024, + "learning_rate": 8.775137111517367e-06, + "loss": 1.4861, + "mean_token_accuracy": 0.6417611440022787, + "num_tokens": 40684363.0, + "step": 241 + }, + { + "entropy": 1.9483478566010792, + "epoch": 0.026585372552250693, + "grad_norm": 1.7499563694000244, + "learning_rate": 8.811700182815357e-06, + "loss": 1.513, + "mean_token_accuracy": 0.6161421338717142, + "num_tokens": 40854917.0, + "step": 242 + }, + { + "entropy": 1.9359776973724365, + "epoch": 0.02669522946362363, + "grad_norm": 1.2601664066314697, + "learning_rate": 8.848263254113347e-06, + "loss": 1.5529, + "mean_token_accuracy": 0.614978551864624, + "num_tokens": 41041982.0, + "step": 243 + }, + { + "entropy": 1.9827852447827656, + "epoch": 0.026805086374996567, + "grad_norm": 1.3667547702789307, + "learning_rate": 8.884826325411335e-06, + "loss": 1.5346, + "mean_token_accuracy": 0.6175629794597626, + "num_tokens": 41252018.0, + "step": 244 + }, + { + "entropy": 1.9970279932022095, + "epoch": 0.026914943286369505, + "grad_norm": 4.996973037719727, + "learning_rate": 8.921389396709325e-06, + "loss": 1.4708, + "mean_token_accuracy": 0.6463018904129664, + "num_tokens": 41388067.0, + "step": 245 + }, + { + "entropy": 1.95156333843867, + "epoch": 0.027024800197742442, + "grad_norm": 1.5964291095733643, + "learning_rate": 8.957952468007315e-06, + "loss": 1.633, + "mean_token_accuracy": 0.611007904012998, + "num_tokens": 41521865.0, + "step": 246 + }, + { + "entropy": 1.8909566402435303, + "epoch": 0.027134657109115376, + "grad_norm": 1.72969388961792, + "learning_rate": 8.994515539305303e-06, + "loss": 1.487, + "mean_token_accuracy": 0.6293840358654658, + "num_tokens": 41653324.0, + "step": 247 + }, + { + "entropy": 1.8896038234233856, + "epoch": 0.027244514020488313, + "grad_norm": 1.3360822200775146, + "learning_rate": 9.03107861060329e-06, + "loss": 1.474, + "mean_token_accuracy": 0.6422435492277145, + "num_tokens": 41849809.0, + "step": 248 + }, + { + "entropy": 1.9821850061416626, + "epoch": 0.02735437093186125, + "grad_norm": 1.6421992778778076, + "learning_rate": 9.06764168190128e-06, + "loss": 1.451, + "mean_token_accuracy": 0.6302864154179891, + "num_tokens": 41996056.0, + "step": 249 + }, + { + "entropy": 1.8893661499023438, + "epoch": 0.027464227843234188, + "grad_norm": 1.9778544902801514, + "learning_rate": 9.10420475319927e-06, + "loss": 1.4556, + "mean_token_accuracy": 0.6386625617742538, + "num_tokens": 42136282.0, + "step": 250 + }, + { + "entropy": 1.964593380689621, + "epoch": 0.027574084754607125, + "grad_norm": 1.21237313747406, + "learning_rate": 9.140767824497258e-06, + "loss": 1.5264, + "mean_token_accuracy": 0.6220163901646932, + "num_tokens": 42287103.0, + "step": 251 + }, + { + "entropy": 1.8926392396291096, + "epoch": 0.02768394166598006, + "grad_norm": 1.2302354574203491, + "learning_rate": 9.177330895795248e-06, + "loss": 1.4971, + "mean_token_accuracy": 0.6313800662755966, + "num_tokens": 42437027.0, + "step": 252 + }, + { + "entropy": 1.8762815594673157, + "epoch": 0.027793798577352997, + "grad_norm": 1.0389925241470337, + "learning_rate": 9.213893967093238e-06, + "loss": 1.5259, + "mean_token_accuracy": 0.6269082774718603, + "num_tokens": 42628984.0, + "step": 253 + }, + { + "entropy": 1.861397624015808, + "epoch": 0.027903655488725934, + "grad_norm": 1.1108849048614502, + "learning_rate": 9.250457038391226e-06, + "loss": 1.6276, + "mean_token_accuracy": 0.6273967996239662, + "num_tokens": 42804896.0, + "step": 254 + }, + { + "entropy": 1.9415236016114552, + "epoch": 0.02801351240009887, + "grad_norm": 1.8820470571517944, + "learning_rate": 9.287020109689214e-06, + "loss": 1.579, + "mean_token_accuracy": 0.6264342914024988, + "num_tokens": 42935360.0, + "step": 255 + }, + { + "entropy": 1.9186277190844219, + "epoch": 0.02812336931147181, + "grad_norm": 1.1692508459091187, + "learning_rate": 9.323583180987204e-06, + "loss": 1.4381, + "mean_token_accuracy": 0.6264500568310419, + "num_tokens": 43136428.0, + "step": 256 + }, + { + "entropy": 1.9373709559440613, + "epoch": 0.028233226222844746, + "grad_norm": 1.8746132850646973, + "learning_rate": 9.360146252285193e-06, + "loss": 1.4601, + "mean_token_accuracy": 0.6457020888725916, + "num_tokens": 43287978.0, + "step": 257 + }, + { + "entropy": 1.9027895232041676, + "epoch": 0.02834308313421768, + "grad_norm": 1.2509558200836182, + "learning_rate": 9.396709323583182e-06, + "loss": 1.4337, + "mean_token_accuracy": 0.6443447520335516, + "num_tokens": 43436841.0, + "step": 258 + }, + { + "entropy": 1.863025466601054, + "epoch": 0.028452940045590618, + "grad_norm": 1.5175418853759766, + "learning_rate": 9.433272394881171e-06, + "loss": 1.3925, + "mean_token_accuracy": 0.65195099512736, + "num_tokens": 43575151.0, + "step": 259 + }, + { + "entropy": 1.9013051688671112, + "epoch": 0.028562796956963555, + "grad_norm": 1.6341294050216675, + "learning_rate": 9.469835466179161e-06, + "loss": 1.5872, + "mean_token_accuracy": 0.6424743135770162, + "num_tokens": 43716089.0, + "step": 260 + }, + { + "entropy": 1.9514261881510417, + "epoch": 0.028672653868336492, + "grad_norm": 1.6734215021133423, + "learning_rate": 9.50639853747715e-06, + "loss": 1.5115, + "mean_token_accuracy": 0.6237670431534449, + "num_tokens": 43853512.0, + "step": 261 + }, + { + "entropy": 1.9595048030217488, + "epoch": 0.02878251077970943, + "grad_norm": 2.273057460784912, + "learning_rate": 9.542961608775137e-06, + "loss": 1.45, + "mean_token_accuracy": 0.639571433266004, + "num_tokens": 43964136.0, + "step": 262 + }, + { + "entropy": 1.86832395195961, + "epoch": 0.028892367691082364, + "grad_norm": 1.6332321166992188, + "learning_rate": 9.579524680073127e-06, + "loss": 1.481, + "mean_token_accuracy": 0.6361501961946487, + "num_tokens": 44106187.0, + "step": 263 + }, + { + "entropy": 1.8855204284191132, + "epoch": 0.0290022246024553, + "grad_norm": 1.0685125589370728, + "learning_rate": 9.616087751371117e-06, + "loss": 1.4531, + "mean_token_accuracy": 0.6394118815660477, + "num_tokens": 44263324.0, + "step": 264 + }, + { + "entropy": 1.9358879923820496, + "epoch": 0.02911208151382824, + "grad_norm": 1.6074949502944946, + "learning_rate": 9.652650822669105e-06, + "loss": 1.4889, + "mean_token_accuracy": 0.643065462509791, + "num_tokens": 44383088.0, + "step": 265 + }, + { + "entropy": 1.893241822719574, + "epoch": 0.029221938425201176, + "grad_norm": 1.4432519674301147, + "learning_rate": 9.689213893967095e-06, + "loss": 1.3719, + "mean_token_accuracy": 0.6543690661589304, + "num_tokens": 44514573.0, + "step": 266 + }, + { + "entropy": 1.9079320927460988, + "epoch": 0.029331795336574113, + "grad_norm": 1.385563611984253, + "learning_rate": 9.725776965265084e-06, + "loss": 1.4954, + "mean_token_accuracy": 0.6275987525780996, + "num_tokens": 44664023.0, + "step": 267 + }, + { + "entropy": 1.9555991490681965, + "epoch": 0.02944165224794705, + "grad_norm": 1.320134162902832, + "learning_rate": 9.762340036563072e-06, + "loss": 1.528, + "mean_token_accuracy": 0.6275275399287542, + "num_tokens": 44806322.0, + "step": 268 + }, + { + "entropy": 1.9195038080215454, + "epoch": 0.029551509159319984, + "grad_norm": 1.5830590724945068, + "learning_rate": 9.79890310786106e-06, + "loss": 1.4392, + "mean_token_accuracy": 0.6394519209861755, + "num_tokens": 44943371.0, + "step": 269 + }, + { + "entropy": 1.9248672624429066, + "epoch": 0.029661366070692922, + "grad_norm": 1.2923870086669922, + "learning_rate": 9.83546617915905e-06, + "loss": 1.5068, + "mean_token_accuracy": 0.6360269586245219, + "num_tokens": 45101406.0, + "step": 270 + }, + { + "entropy": 1.9413983821868896, + "epoch": 0.02977122298206586, + "grad_norm": 1.126284122467041, + "learning_rate": 9.87202925045704e-06, + "loss": 1.5993, + "mean_token_accuracy": 0.6119322826464971, + "num_tokens": 45359982.0, + "step": 271 + }, + { + "entropy": 1.9256538450717926, + "epoch": 0.029881079893438797, + "grad_norm": 1.5768324136734009, + "learning_rate": 9.908592321755028e-06, + "loss": 1.5289, + "mean_token_accuracy": 0.6280013422171274, + "num_tokens": 45489978.0, + "step": 272 + }, + { + "entropy": 1.9650197923183441, + "epoch": 0.029990936804811734, + "grad_norm": 2.4262402057647705, + "learning_rate": 9.945155393053018e-06, + "loss": 1.4462, + "mean_token_accuracy": 0.6426471074422201, + "num_tokens": 45591818.0, + "step": 273 + }, + { + "entropy": 1.966247429450353, + "epoch": 0.030100793716184668, + "grad_norm": 1.6343317031860352, + "learning_rate": 9.981718464351006e-06, + "loss": 1.4539, + "mean_token_accuracy": 0.6315440734227499, + "num_tokens": 45786109.0, + "step": 274 + }, + { + "entropy": 1.9131847222646077, + "epoch": 0.030210650627557605, + "grad_norm": 0.7987267971038818, + "learning_rate": 1.0018281535648996e-05, + "loss": 1.5537, + "mean_token_accuracy": 0.6154775619506836, + "num_tokens": 46015605.0, + "step": 275 + }, + { + "entropy": 1.9687570333480835, + "epoch": 0.030320507538930543, + "grad_norm": 1.7003246545791626, + "learning_rate": 1.0054844606946985e-05, + "loss": 1.5739, + "mean_token_accuracy": 0.6201535413662592, + "num_tokens": 46170159.0, + "step": 276 + }, + { + "entropy": 1.9644801914691925, + "epoch": 0.03043036445030348, + "grad_norm": 0.9804157614707947, + "learning_rate": 1.0091407678244974e-05, + "loss": 1.5158, + "mean_token_accuracy": 0.6364769091208776, + "num_tokens": 46387669.0, + "step": 277 + }, + { + "entropy": 1.898623416821162, + "epoch": 0.030540221361676417, + "grad_norm": 1.5173487663269043, + "learning_rate": 1.0127970749542962e-05, + "loss": 1.4926, + "mean_token_accuracy": 0.6275862356026968, + "num_tokens": 46563977.0, + "step": 278 + }, + { + "entropy": 1.995850036541621, + "epoch": 0.030650078273049355, + "grad_norm": 1.4851152896881104, + "learning_rate": 1.0164533820840951e-05, + "loss": 1.4706, + "mean_token_accuracy": 0.6297584424416224, + "num_tokens": 46718556.0, + "step": 279 + }, + { + "entropy": 1.8533688286940257, + "epoch": 0.03075993518442229, + "grad_norm": 1.2799378633499146, + "learning_rate": 1.020109689213894e-05, + "loss": 1.4704, + "mean_token_accuracy": 0.6369777669509252, + "num_tokens": 46878881.0, + "step": 280 + }, + { + "entropy": 1.9496891895929973, + "epoch": 0.030869792095795226, + "grad_norm": 7.689694404602051, + "learning_rate": 1.023765996343693e-05, + "loss": 1.443, + "mean_token_accuracy": 0.6240266213814417, + "num_tokens": 47091397.0, + "step": 281 + }, + { + "entropy": 1.9444605509440105, + "epoch": 0.030979649007168163, + "grad_norm": 2.042428970336914, + "learning_rate": 1.0274223034734917e-05, + "loss": 1.4686, + "mean_token_accuracy": 0.6330529451370239, + "num_tokens": 47221651.0, + "step": 282 + }, + { + "entropy": 1.889829029639562, + "epoch": 0.0310895059185411, + "grad_norm": 1.3333542346954346, + "learning_rate": 1.0310786106032909e-05, + "loss": 1.4242, + "mean_token_accuracy": 0.6345295310020447, + "num_tokens": 47396569.0, + "step": 283 + }, + { + "entropy": 1.844144841035207, + "epoch": 0.031199362829914038, + "grad_norm": 1.2614295482635498, + "learning_rate": 1.0347349177330897e-05, + "loss": 1.4626, + "mean_token_accuracy": 0.6344168136517206, + "num_tokens": 47542762.0, + "step": 284 + }, + { + "entropy": 1.8404381672541301, + "epoch": 0.031309219741286975, + "grad_norm": 2.0773274898529053, + "learning_rate": 1.0383912248628885e-05, + "loss": 1.3985, + "mean_token_accuracy": 0.6463861962159475, + "num_tokens": 47669565.0, + "step": 285 + }, + { + "entropy": 1.8757590055465698, + "epoch": 0.03141907665265991, + "grad_norm": 0.996104896068573, + "learning_rate": 1.0420475319926875e-05, + "loss": 1.4716, + "mean_token_accuracy": 0.6297437200943629, + "num_tokens": 47852702.0, + "step": 286 + }, + { + "entropy": 1.957614282766978, + "epoch": 0.03152893356403285, + "grad_norm": 1.0234733819961548, + "learning_rate": 1.0457038391224863e-05, + "loss": 1.5283, + "mean_token_accuracy": 0.628383403023084, + "num_tokens": 48039754.0, + "step": 287 + }, + { + "entropy": 1.8798251052697499, + "epoch": 0.031638790475405784, + "grad_norm": 1.2098981142044067, + "learning_rate": 1.0493601462522852e-05, + "loss": 1.5128, + "mean_token_accuracy": 0.6379542450110117, + "num_tokens": 48191936.0, + "step": 288 + }, + { + "entropy": 1.8851182560125987, + "epoch": 0.03174864738677872, + "grad_norm": 1.206680178642273, + "learning_rate": 1.053016453382084e-05, + "loss": 1.5542, + "mean_token_accuracy": 0.6433884302775065, + "num_tokens": 48385870.0, + "step": 289 + }, + { + "entropy": 1.8928188979625702, + "epoch": 0.03185850429815166, + "grad_norm": 1.523961067199707, + "learning_rate": 1.0566727605118832e-05, + "loss": 1.432, + "mean_token_accuracy": 0.6425420343875885, + "num_tokens": 48558121.0, + "step": 290 + }, + { + "entropy": 1.945473462343216, + "epoch": 0.03196836120952459, + "grad_norm": 0.9344412088394165, + "learning_rate": 1.060329067641682e-05, + "loss": 1.5703, + "mean_token_accuracy": 0.6286270767450333, + "num_tokens": 48722343.0, + "step": 291 + }, + { + "entropy": 1.881322979927063, + "epoch": 0.032078218120897534, + "grad_norm": 0.9196475148200989, + "learning_rate": 1.0639853747714808e-05, + "loss": 1.5234, + "mean_token_accuracy": 0.6369537711143494, + "num_tokens": 48887704.0, + "step": 292 + }, + { + "entropy": 1.933806041876475, + "epoch": 0.03218807503227047, + "grad_norm": 1.59644615650177, + "learning_rate": 1.0676416819012798e-05, + "loss": 1.4071, + "mean_token_accuracy": 0.6504911333322525, + "num_tokens": 49020557.0, + "step": 293 + }, + { + "entropy": 1.9321398834387462, + "epoch": 0.0322979319436434, + "grad_norm": 0.9138100147247314, + "learning_rate": 1.0712979890310786e-05, + "loss": 1.5022, + "mean_token_accuracy": 0.6257292628288269, + "num_tokens": 49207179.0, + "step": 294 + }, + { + "entropy": 1.968072275320689, + "epoch": 0.03240778885501634, + "grad_norm": 1.0833994150161743, + "learning_rate": 1.0749542961608776e-05, + "loss": 1.5052, + "mean_token_accuracy": 0.6170014639695486, + "num_tokens": 49368907.0, + "step": 295 + }, + { + "entropy": 1.897968828678131, + "epoch": 0.032517645766389276, + "grad_norm": 1.578821063041687, + "learning_rate": 1.0786106032906764e-05, + "loss": 1.4847, + "mean_token_accuracy": 0.6308430184920629, + "num_tokens": 49501311.0, + "step": 296 + }, + { + "entropy": 1.8983619312445323, + "epoch": 0.03262750267776222, + "grad_norm": 0.7992421388626099, + "learning_rate": 1.0822669104204755e-05, + "loss": 1.5683, + "mean_token_accuracy": 0.622817466656367, + "num_tokens": 49710655.0, + "step": 297 + }, + { + "entropy": 1.904332121213277, + "epoch": 0.03273735958913515, + "grad_norm": 0.8225474953651428, + "learning_rate": 1.0859232175502743e-05, + "loss": 1.6713, + "mean_token_accuracy": 0.6234669287999471, + "num_tokens": 49927190.0, + "step": 298 + }, + { + "entropy": 1.9980522493521373, + "epoch": 0.032847216500508085, + "grad_norm": 1.2448490858078003, + "learning_rate": 1.0895795246800731e-05, + "loss": 1.72, + "mean_token_accuracy": 0.6147239456574122, + "num_tokens": 50130606.0, + "step": 299 + }, + { + "entropy": 1.8948036630948384, + "epoch": 0.032957073411881026, + "grad_norm": 0.9344723224639893, + "learning_rate": 1.0932358318098721e-05, + "loss": 1.4923, + "mean_token_accuracy": 0.636970043182373, + "num_tokens": 50313035.0, + "step": 300 + }, + { + "entropy": 1.9244210918744404, + "epoch": 0.03306693032325396, + "grad_norm": 1.0805492401123047, + "learning_rate": 1.096892138939671e-05, + "loss": 1.5029, + "mean_token_accuracy": 0.6375883320967356, + "num_tokens": 50508718.0, + "step": 301 + }, + { + "entropy": 1.9689223965009053, + "epoch": 0.0331767872346269, + "grad_norm": 1.3119393587112427, + "learning_rate": 1.1005484460694699e-05, + "loss": 1.6139, + "mean_token_accuracy": 0.6174919605255127, + "num_tokens": 50686071.0, + "step": 302 + }, + { + "entropy": 1.9293088515599568, + "epoch": 0.033286644145999834, + "grad_norm": 1.0145171880722046, + "learning_rate": 1.1042047531992687e-05, + "loss": 1.4831, + "mean_token_accuracy": 0.634314775466919, + "num_tokens": 50862535.0, + "step": 303 + }, + { + "entropy": 1.9416919847329457, + "epoch": 0.033396501057372775, + "grad_norm": 1.5745701789855957, + "learning_rate": 1.1078610603290679e-05, + "loss": 1.5724, + "mean_token_accuracy": 0.63288913667202, + "num_tokens": 51040714.0, + "step": 304 + }, + { + "entropy": 1.914773811896642, + "epoch": 0.03350635796874571, + "grad_norm": 1.8054081201553345, + "learning_rate": 1.1115173674588667e-05, + "loss": 1.4908, + "mean_token_accuracy": 0.6410078605016073, + "num_tokens": 51196791.0, + "step": 305 + }, + { + "entropy": 1.8429247538248699, + "epoch": 0.03361621488011864, + "grad_norm": 0.9021672606468201, + "learning_rate": 1.1151736745886655e-05, + "loss": 1.584, + "mean_token_accuracy": 0.6183707366387049, + "num_tokens": 51438626.0, + "step": 306 + }, + { + "entropy": 1.926130364338557, + "epoch": 0.033726071791491584, + "grad_norm": 1.2748171091079712, + "learning_rate": 1.1188299817184644e-05, + "loss": 1.4385, + "mean_token_accuracy": 0.6522895991802216, + "num_tokens": 51584083.0, + "step": 307 + }, + { + "entropy": 1.893998791774114, + "epoch": 0.03383592870286452, + "grad_norm": 0.9571841955184937, + "learning_rate": 1.1224862888482633e-05, + "loss": 1.4666, + "mean_token_accuracy": 0.638142466545105, + "num_tokens": 51765032.0, + "step": 308 + }, + { + "entropy": 2.0008548498153687, + "epoch": 0.03394578561423746, + "grad_norm": 1.4947288036346436, + "learning_rate": 1.1261425959780622e-05, + "loss": 1.4839, + "mean_token_accuracy": 0.6292637437582016, + "num_tokens": 51912342.0, + "step": 309 + }, + { + "entropy": 1.8920707205931346, + "epoch": 0.03405564252561039, + "grad_norm": 1.2760716676712036, + "learning_rate": 1.129798903107861e-05, + "loss": 1.4213, + "mean_token_accuracy": 0.6458606521288554, + "num_tokens": 52051740.0, + "step": 310 + }, + { + "entropy": 1.9140145977338154, + "epoch": 0.034165499436983326, + "grad_norm": 0.9365392923355103, + "learning_rate": 1.1334552102376602e-05, + "loss": 1.5388, + "mean_token_accuracy": 0.6314593305190405, + "num_tokens": 52231050.0, + "step": 311 + }, + { + "entropy": 1.9039499859015148, + "epoch": 0.03427535634835627, + "grad_norm": 1.0918009281158447, + "learning_rate": 1.137111517367459e-05, + "loss": 1.6301, + "mean_token_accuracy": 0.6101748992999395, + "num_tokens": 52405268.0, + "step": 312 + }, + { + "entropy": 1.9176567395528157, + "epoch": 0.0343852132597292, + "grad_norm": 1.2120338678359985, + "learning_rate": 1.1407678244972578e-05, + "loss": 1.5766, + "mean_token_accuracy": 0.6275134632984797, + "num_tokens": 52615722.0, + "step": 313 + }, + { + "entropy": 1.8687150677045186, + "epoch": 0.03449507017110214, + "grad_norm": 1.3236573934555054, + "learning_rate": 1.1444241316270568e-05, + "loss": 1.5724, + "mean_token_accuracy": 0.6229855716228485, + "num_tokens": 52788308.0, + "step": 314 + }, + { + "entropy": 1.9559665719668071, + "epoch": 0.034604927082475076, + "grad_norm": 1.5506702661514282, + "learning_rate": 1.1480804387568556e-05, + "loss": 1.5085, + "mean_token_accuracy": 0.6220680375893911, + "num_tokens": 52927707.0, + "step": 315 + }, + { + "entropy": 1.88349653283755, + "epoch": 0.03471478399384801, + "grad_norm": 1.3259515762329102, + "learning_rate": 1.1517367458866546e-05, + "loss": 1.5174, + "mean_token_accuracy": 0.6426070580879847, + "num_tokens": 53062021.0, + "step": 316 + }, + { + "entropy": 1.9310015539328258, + "epoch": 0.03482464090522095, + "grad_norm": 1.5750491619110107, + "learning_rate": 1.1553930530164534e-05, + "loss": 1.5892, + "mean_token_accuracy": 0.61604871849219, + "num_tokens": 53181467.0, + "step": 317 + }, + { + "entropy": 1.8769896825154622, + "epoch": 0.034934497816593885, + "grad_norm": 0.743971049785614, + "learning_rate": 1.1590493601462525e-05, + "loss": 1.5217, + "mean_token_accuracy": 0.6268236736456553, + "num_tokens": 53407027.0, + "step": 318 + }, + { + "entropy": 1.9732247789700825, + "epoch": 0.035044354727966825, + "grad_norm": 1.4356918334960938, + "learning_rate": 1.1627056672760513e-05, + "loss": 1.4453, + "mean_token_accuracy": 0.626713772614797, + "num_tokens": 53556426.0, + "step": 319 + }, + { + "entropy": 1.9230054517587025, + "epoch": 0.03515421163933976, + "grad_norm": 1.635143756866455, + "learning_rate": 1.1663619744058501e-05, + "loss": 1.4392, + "mean_token_accuracy": 0.6414381017287573, + "num_tokens": 53663337.0, + "step": 320 + }, + { + "entropy": 1.9362250169118245, + "epoch": 0.03526406855071269, + "grad_norm": 1.7073150873184204, + "learning_rate": 1.1700182815356491e-05, + "loss": 1.617, + "mean_token_accuracy": 0.6255223502715429, + "num_tokens": 53799717.0, + "step": 321 + }, + { + "entropy": 1.8816980421543121, + "epoch": 0.035373925462085634, + "grad_norm": 1.4336020946502686, + "learning_rate": 1.1736745886654479e-05, + "loss": 1.4752, + "mean_token_accuracy": 0.6383567899465561, + "num_tokens": 53946152.0, + "step": 322 + }, + { + "entropy": 1.8929463227589924, + "epoch": 0.03548378237345857, + "grad_norm": 2.153032064437866, + "learning_rate": 1.1773308957952469e-05, + "loss": 1.5695, + "mean_token_accuracy": 0.6398225873708725, + "num_tokens": 54116796.0, + "step": 323 + }, + { + "entropy": 1.8566848436991374, + "epoch": 0.03559363928483151, + "grad_norm": 0.8859526515007019, + "learning_rate": 1.1809872029250457e-05, + "loss": 1.5324, + "mean_token_accuracy": 0.6276658624410629, + "num_tokens": 54293425.0, + "step": 324 + }, + { + "entropy": 1.8859939277172089, + "epoch": 0.03570349619620444, + "grad_norm": 1.7319366931915283, + "learning_rate": 1.1846435100548448e-05, + "loss": 1.4423, + "mean_token_accuracy": 0.6443535685539246, + "num_tokens": 54418340.0, + "step": 325 + }, + { + "entropy": 1.8691116273403168, + "epoch": 0.035813353107577384, + "grad_norm": 0.8846819400787354, + "learning_rate": 1.1882998171846436e-05, + "loss": 1.5374, + "mean_token_accuracy": 0.6237656573454539, + "num_tokens": 54593530.0, + "step": 326 + }, + { + "entropy": 1.9191945890585582, + "epoch": 0.03592321001895032, + "grad_norm": 1.6270257234573364, + "learning_rate": 1.1919561243144425e-05, + "loss": 1.3593, + "mean_token_accuracy": 0.6514436999956766, + "num_tokens": 54754982.0, + "step": 327 + }, + { + "entropy": 1.9296835362911224, + "epoch": 0.03603306693032325, + "grad_norm": 1.3112103939056396, + "learning_rate": 1.1956124314442414e-05, + "loss": 1.548, + "mean_token_accuracy": 0.6345210323731104, + "num_tokens": 54911330.0, + "step": 328 + }, + { + "entropy": 1.9412719508012135, + "epoch": 0.03614292384169619, + "grad_norm": 1.8282943964004517, + "learning_rate": 1.1992687385740402e-05, + "loss": 1.5502, + "mean_token_accuracy": 0.6237274209658304, + "num_tokens": 55070562.0, + "step": 329 + }, + { + "entropy": 1.9365448355674744, + "epoch": 0.036252780753069126, + "grad_norm": 1.3936512470245361, + "learning_rate": 1.2029250457038392e-05, + "loss": 1.4556, + "mean_token_accuracy": 0.635651042064031, + "num_tokens": 55234754.0, + "step": 330 + }, + { + "entropy": 1.90728959441185, + "epoch": 0.03636263766444207, + "grad_norm": 1.0101035833358765, + "learning_rate": 1.206581352833638e-05, + "loss": 1.4667, + "mean_token_accuracy": 0.6326676507790884, + "num_tokens": 55403593.0, + "step": 331 + }, + { + "entropy": 1.9017587800820668, + "epoch": 0.036472494575815, + "grad_norm": 0.9737944006919861, + "learning_rate": 1.2102376599634372e-05, + "loss": 1.4969, + "mean_token_accuracy": 0.6238344510396322, + "num_tokens": 55615207.0, + "step": 332 + }, + { + "entropy": 1.9239464402198792, + "epoch": 0.036582351487187935, + "grad_norm": 1.1247916221618652, + "learning_rate": 1.213893967093236e-05, + "loss": 1.5176, + "mean_token_accuracy": 0.6404502739508947, + "num_tokens": 55772900.0, + "step": 333 + }, + { + "entropy": 1.97471684217453, + "epoch": 0.036692208398560876, + "grad_norm": 1.3233532905578613, + "learning_rate": 1.2175502742230348e-05, + "loss": 1.463, + "mean_token_accuracy": 0.6393190721670786, + "num_tokens": 55917794.0, + "step": 334 + }, + { + "entropy": 1.8958436946074169, + "epoch": 0.03680206530993381, + "grad_norm": 1.441660761833191, + "learning_rate": 1.2212065813528338e-05, + "loss": 1.413, + "mean_token_accuracy": 0.6413801709810892, + "num_tokens": 56056541.0, + "step": 335 + }, + { + "entropy": 1.9842320779959361, + "epoch": 0.03691192222130675, + "grad_norm": 1.0363242626190186, + "learning_rate": 1.2248628884826326e-05, + "loss": 1.4963, + "mean_token_accuracy": 0.6350095023711523, + "num_tokens": 56192437.0, + "step": 336 + }, + { + "entropy": 1.9338072041670482, + "epoch": 0.037021779132679684, + "grad_norm": 1.5733563899993896, + "learning_rate": 1.2285191956124315e-05, + "loss": 1.554, + "mean_token_accuracy": 0.6295024752616882, + "num_tokens": 56358968.0, + "step": 337 + }, + { + "entropy": 1.897556722164154, + "epoch": 0.03713163604405262, + "grad_norm": 0.9414510130882263, + "learning_rate": 1.2321755027422303e-05, + "loss": 1.4555, + "mean_token_accuracy": 0.6441336075464884, + "num_tokens": 56547658.0, + "step": 338 + }, + { + "entropy": 1.9216489593187969, + "epoch": 0.03724149295542556, + "grad_norm": 0.890693187713623, + "learning_rate": 1.2358318098720295e-05, + "loss": 1.5634, + "mean_token_accuracy": 0.6117081145445505, + "num_tokens": 56755945.0, + "step": 339 + }, + { + "entropy": 1.878822495539983, + "epoch": 0.03735134986679849, + "grad_norm": 1.6289485692977905, + "learning_rate": 1.2394881170018283e-05, + "loss": 1.346, + "mean_token_accuracy": 0.6585533966620764, + "num_tokens": 56889301.0, + "step": 340 + }, + { + "entropy": 1.9101523856321971, + "epoch": 0.037461206778171434, + "grad_norm": 1.244612455368042, + "learning_rate": 1.2431444241316271e-05, + "loss": 1.4946, + "mean_token_accuracy": 0.625670313835144, + "num_tokens": 57043658.0, + "step": 341 + }, + { + "entropy": 1.9623491565386455, + "epoch": 0.03757106368954437, + "grad_norm": 1.3181416988372803, + "learning_rate": 1.246800731261426e-05, + "loss": 1.5015, + "mean_token_accuracy": 0.6292295505603155, + "num_tokens": 57171786.0, + "step": 342 + }, + { + "entropy": 1.894644061724345, + "epoch": 0.03768092060091731, + "grad_norm": 1.3845021724700928, + "learning_rate": 1.2504570383912249e-05, + "loss": 1.5347, + "mean_token_accuracy": 0.6457602828741074, + "num_tokens": 57329673.0, + "step": 343 + }, + { + "entropy": 1.9226838648319244, + "epoch": 0.03779077751229024, + "grad_norm": 1.0904666185379028, + "learning_rate": 1.2541133455210239e-05, + "loss": 1.4927, + "mean_token_accuracy": 0.6328802009423574, + "num_tokens": 57504063.0, + "step": 344 + }, + { + "entropy": 1.8712241252263386, + "epoch": 0.037900634423663176, + "grad_norm": 0.674035906791687, + "learning_rate": 1.2577696526508227e-05, + "loss": 1.5697, + "mean_token_accuracy": 0.6206586708625158, + "num_tokens": 57748561.0, + "step": 345 + }, + { + "entropy": 1.8309160272280376, + "epoch": 0.03801049133503612, + "grad_norm": 0.8603072166442871, + "learning_rate": 1.2614259597806218e-05, + "loss": 1.5903, + "mean_token_accuracy": 0.6370584319035212, + "num_tokens": 57941124.0, + "step": 346 + }, + { + "entropy": 1.867393175760905, + "epoch": 0.03812034824640905, + "grad_norm": 1.344534158706665, + "learning_rate": 1.2650822669104206e-05, + "loss": 1.425, + "mean_token_accuracy": 0.6433531989653906, + "num_tokens": 58091957.0, + "step": 347 + }, + { + "entropy": 1.8822752038637798, + "epoch": 0.03823020515778199, + "grad_norm": 1.8091611862182617, + "learning_rate": 1.2687385740402194e-05, + "loss": 1.3816, + "mean_token_accuracy": 0.6489702761173248, + "num_tokens": 58240675.0, + "step": 348 + }, + { + "entropy": 1.9269113938013713, + "epoch": 0.038340062069154926, + "grad_norm": 1.1259180307388306, + "learning_rate": 1.2723948811700184e-05, + "loss": 1.5647, + "mean_token_accuracy": 0.6245926817258199, + "num_tokens": 58431982.0, + "step": 349 + }, + { + "entropy": 1.861362983783086, + "epoch": 0.03844991898052786, + "grad_norm": 1.033488392829895, + "learning_rate": 1.2760511882998172e-05, + "loss": 1.468, + "mean_token_accuracy": 0.6400075107812881, + "num_tokens": 58609741.0, + "step": 350 + }, + { + "entropy": 1.956882357597351, + "epoch": 0.0385597758919008, + "grad_norm": 1.2942620515823364, + "learning_rate": 1.2797074954296162e-05, + "loss": 1.4972, + "mean_token_accuracy": 0.6373551438252131, + "num_tokens": 58725263.0, + "step": 351 + }, + { + "entropy": 1.9582510491212208, + "epoch": 0.038669632803273735, + "grad_norm": 1.3242194652557373, + "learning_rate": 1.283363802559415e-05, + "loss": 1.4572, + "mean_token_accuracy": 0.6369108855724335, + "num_tokens": 58890257.0, + "step": 352 + }, + { + "entropy": 1.8531585335731506, + "epoch": 0.038779489714646675, + "grad_norm": 1.8601542711257935, + "learning_rate": 1.2870201096892141e-05, + "loss": 1.3595, + "mean_token_accuracy": 0.6562222242355347, + "num_tokens": 59013387.0, + "step": 353 + }, + { + "entropy": 1.938563883304596, + "epoch": 0.03888934662601961, + "grad_norm": 1.148009181022644, + "learning_rate": 1.290676416819013e-05, + "loss": 1.4834, + "mean_token_accuracy": 0.6284010857343674, + "num_tokens": 59194237.0, + "step": 354 + }, + { + "entropy": 1.8674622178077698, + "epoch": 0.03899920353739254, + "grad_norm": 0.8510828018188477, + "learning_rate": 1.2943327239488118e-05, + "loss": 1.4612, + "mean_token_accuracy": 0.6370283762613932, + "num_tokens": 59393642.0, + "step": 355 + }, + { + "entropy": 1.8750587205092113, + "epoch": 0.039109060448765484, + "grad_norm": 1.027628779411316, + "learning_rate": 1.2979890310786107e-05, + "loss": 1.4421, + "mean_token_accuracy": 0.635211726029714, + "num_tokens": 59545332.0, + "step": 356 + }, + { + "entropy": 1.9101734459400177, + "epoch": 0.03921891736013842, + "grad_norm": 1.4470945596694946, + "learning_rate": 1.3016453382084095e-05, + "loss": 1.3906, + "mean_token_accuracy": 0.6514510164658228, + "num_tokens": 59690347.0, + "step": 357 + }, + { + "entropy": 1.8950016995271046, + "epoch": 0.03932877427151136, + "grad_norm": 1.020830512046814, + "learning_rate": 1.3053016453382085e-05, + "loss": 1.4614, + "mean_token_accuracy": 0.6369933982690176, + "num_tokens": 59897237.0, + "step": 358 + }, + { + "entropy": 1.9750393331050873, + "epoch": 0.03943863118288429, + "grad_norm": 1.3367540836334229, + "learning_rate": 1.3089579524680073e-05, + "loss": 1.5399, + "mean_token_accuracy": 0.620476762453715, + "num_tokens": 60056227.0, + "step": 359 + }, + { + "entropy": 1.833806296189626, + "epoch": 0.03954848809425723, + "grad_norm": 1.280727505683899, + "learning_rate": 1.3126142595978065e-05, + "loss": 1.4652, + "mean_token_accuracy": 0.6454865237077078, + "num_tokens": 60203405.0, + "step": 360 + }, + { + "entropy": 1.9267512361208599, + "epoch": 0.03965834500563017, + "grad_norm": 1.0081630945205688, + "learning_rate": 1.3162705667276053e-05, + "loss": 1.5328, + "mean_token_accuracy": 0.6300081759691238, + "num_tokens": 60402318.0, + "step": 361 + }, + { + "entropy": 1.8845655421415966, + "epoch": 0.0397682019170031, + "grad_norm": 0.7514256834983826, + "learning_rate": 1.3199268738574041e-05, + "loss": 1.4632, + "mean_token_accuracy": 0.6361817816893259, + "num_tokens": 60603115.0, + "step": 362 + }, + { + "entropy": 1.8751682738463085, + "epoch": 0.03987805882837604, + "grad_norm": 0.9772806167602539, + "learning_rate": 1.323583180987203e-05, + "loss": 1.5428, + "mean_token_accuracy": 0.6149017065763474, + "num_tokens": 60849896.0, + "step": 363 + }, + { + "entropy": 1.9592778881390889, + "epoch": 0.039987915739748976, + "grad_norm": 1.4042431116104126, + "learning_rate": 1.3272394881170019e-05, + "loss": 1.5267, + "mean_token_accuracy": 0.638154923915863, + "num_tokens": 61010485.0, + "step": 364 + }, + { + "entropy": 1.8496886094411213, + "epoch": 0.04009777265112192, + "grad_norm": 0.871859610080719, + "learning_rate": 1.3308957952468008e-05, + "loss": 1.5193, + "mean_token_accuracy": 0.6397547672192255, + "num_tokens": 61197551.0, + "step": 365 + }, + { + "entropy": 1.9339230159918468, + "epoch": 0.04020762956249485, + "grad_norm": 1.5222102403640747, + "learning_rate": 1.3345521023765997e-05, + "loss": 1.5256, + "mean_token_accuracy": 0.6346002717812856, + "num_tokens": 61334172.0, + "step": 366 + }, + { + "entropy": 1.84268722931544, + "epoch": 0.040317486473867785, + "grad_norm": 0.8281468749046326, + "learning_rate": 1.3382084095063988e-05, + "loss": 1.53, + "mean_token_accuracy": 0.6305044641097387, + "num_tokens": 61550034.0, + "step": 367 + }, + { + "entropy": 1.8627445697784424, + "epoch": 0.040427343385240726, + "grad_norm": 0.6707728505134583, + "learning_rate": 1.3418647166361976e-05, + "loss": 1.4511, + "mean_token_accuracy": 0.6334594835837682, + "num_tokens": 61766707.0, + "step": 368 + }, + { + "entropy": 1.927617460489273, + "epoch": 0.04053720029661366, + "grad_norm": 1.031119465827942, + "learning_rate": 1.3455210237659964e-05, + "loss": 1.4601, + "mean_token_accuracy": 0.6361829191446304, + "num_tokens": 61899435.0, + "step": 369 + }, + { + "entropy": 1.8108851512273152, + "epoch": 0.0406470572079866, + "grad_norm": 1.329870343208313, + "learning_rate": 1.3491773308957954e-05, + "loss": 1.392, + "mean_token_accuracy": 0.6440057257811228, + "num_tokens": 62068166.0, + "step": 370 + }, + { + "entropy": 1.83323472738266, + "epoch": 0.040756914119359534, + "grad_norm": 0.8489660620689392, + "learning_rate": 1.3528336380255942e-05, + "loss": 1.3747, + "mean_token_accuracy": 0.651435524225235, + "num_tokens": 62237791.0, + "step": 371 + }, + { + "entropy": 1.8929267923037212, + "epoch": 0.04086677103073247, + "grad_norm": 1.568328857421875, + "learning_rate": 1.3564899451553932e-05, + "loss": 1.3254, + "mean_token_accuracy": 0.6630758593479792, + "num_tokens": 62354504.0, + "step": 372 + }, + { + "entropy": 1.904485156138738, + "epoch": 0.04097662794210541, + "grad_norm": 1.0895646810531616, + "learning_rate": 1.360146252285192e-05, + "loss": 1.4935, + "mean_token_accuracy": 0.6399994641542435, + "num_tokens": 62560184.0, + "step": 373 + }, + { + "entropy": 1.9404686590035756, + "epoch": 0.04108648485347834, + "grad_norm": 0.7966954112052917, + "learning_rate": 1.3638025594149911e-05, + "loss": 1.6445, + "mean_token_accuracy": 0.6181567882498106, + "num_tokens": 62762821.0, + "step": 374 + }, + { + "entropy": 1.930189887682597, + "epoch": 0.041196341764851284, + "grad_norm": 1.237733006477356, + "learning_rate": 1.36745886654479e-05, + "loss": 1.4306, + "mean_token_accuracy": 0.6403100987275442, + "num_tokens": 62911553.0, + "step": 375 + }, + { + "entropy": 1.932339499394099, + "epoch": 0.04130619867622422, + "grad_norm": 1.387355923652649, + "learning_rate": 1.3711151736745887e-05, + "loss": 1.4726, + "mean_token_accuracy": 0.6367992361386617, + "num_tokens": 63048004.0, + "step": 376 + }, + { + "entropy": 1.8782165547211964, + "epoch": 0.04141605558759715, + "grad_norm": 1.2075997591018677, + "learning_rate": 1.3747714808043877e-05, + "loss": 1.4862, + "mean_token_accuracy": 0.6394857068856558, + "num_tokens": 63211370.0, + "step": 377 + }, + { + "entropy": 1.9089668989181519, + "epoch": 0.04152591249897009, + "grad_norm": 1.1602435111999512, + "learning_rate": 1.3784277879341865e-05, + "loss": 1.4771, + "mean_token_accuracy": 0.6510659754276276, + "num_tokens": 63393016.0, + "step": 378 + }, + { + "entropy": 1.993513544400533, + "epoch": 0.041635769410343026, + "grad_norm": 1.2444241046905518, + "learning_rate": 1.3820840950639855e-05, + "loss": 1.4608, + "mean_token_accuracy": 0.6339425295591354, + "num_tokens": 63528722.0, + "step": 379 + }, + { + "entropy": 1.794573426246643, + "epoch": 0.04174562632171597, + "grad_norm": 0.9674469232559204, + "learning_rate": 1.3857404021937843e-05, + "loss": 1.5345, + "mean_token_accuracy": 0.6317428996165594, + "num_tokens": 63712882.0, + "step": 380 + }, + { + "entropy": 1.9814475774765015, + "epoch": 0.0418554832330889, + "grad_norm": 1.197340488433838, + "learning_rate": 1.3893967093235835e-05, + "loss": 1.4852, + "mean_token_accuracy": 0.6282167633374532, + "num_tokens": 63842716.0, + "step": 381 + }, + { + "entropy": 1.9127925833066304, + "epoch": 0.041965340144461835, + "grad_norm": 1.0025110244750977, + "learning_rate": 1.3930530164533823e-05, + "loss": 1.446, + "mean_token_accuracy": 0.6374160995086035, + "num_tokens": 64050221.0, + "step": 382 + }, + { + "entropy": 1.8482964634895325, + "epoch": 0.042075197055834776, + "grad_norm": 1.02582585811615, + "learning_rate": 1.396709323583181e-05, + "loss": 1.4667, + "mean_token_accuracy": 0.6344637920459112, + "num_tokens": 64211219.0, + "step": 383 + }, + { + "entropy": 1.9587088723977406, + "epoch": 0.04218505396720771, + "grad_norm": 1.2455130815505981, + "learning_rate": 1.40036563071298e-05, + "loss": 1.5323, + "mean_token_accuracy": 0.6333187768856684, + "num_tokens": 64323175.0, + "step": 384 + }, + { + "entropy": 1.9571288426717122, + "epoch": 0.04229491087858065, + "grad_norm": 1.6702572107315063, + "learning_rate": 1.4040219378427789e-05, + "loss": 1.5039, + "mean_token_accuracy": 0.6258624742428461, + "num_tokens": 64442759.0, + "step": 385 + }, + { + "entropy": 1.9019165933132172, + "epoch": 0.042404767789953585, + "grad_norm": 0.7855273485183716, + "learning_rate": 1.4076782449725778e-05, + "loss": 1.6998, + "mean_token_accuracy": 0.6055170843998591, + "num_tokens": 64665676.0, + "step": 386 + }, + { + "entropy": 1.9035062193870544, + "epoch": 0.042514624701326525, + "grad_norm": 1.8162872791290283, + "learning_rate": 1.4113345521023766e-05, + "loss": 1.3691, + "mean_token_accuracy": 0.6513401865959167, + "num_tokens": 64782379.0, + "step": 387 + }, + { + "entropy": 1.903337796529134, + "epoch": 0.04262448161269946, + "grad_norm": 1.0532211065292358, + "learning_rate": 1.4149908592321758e-05, + "loss": 1.3714, + "mean_token_accuracy": 0.6637005259593328, + "num_tokens": 64934687.0, + "step": 388 + }, + { + "entropy": 1.9479155739148457, + "epoch": 0.04273433852407239, + "grad_norm": 1.551796555519104, + "learning_rate": 1.4186471663619746e-05, + "loss": 1.4819, + "mean_token_accuracy": 0.6278212567170461, + "num_tokens": 65082079.0, + "step": 389 + }, + { + "entropy": 1.8967718482017517, + "epoch": 0.042844195435445334, + "grad_norm": 1.1235419511795044, + "learning_rate": 1.4223034734917734e-05, + "loss": 1.4021, + "mean_token_accuracy": 0.6365053604046503, + "num_tokens": 65289665.0, + "step": 390 + }, + { + "entropy": 1.9098777274290721, + "epoch": 0.04295405234681827, + "grad_norm": 0.99347984790802, + "learning_rate": 1.4259597806215724e-05, + "loss": 1.5961, + "mean_token_accuracy": 0.6231525763869286, + "num_tokens": 65423010.0, + "step": 391 + }, + { + "entropy": 1.9435608784357707, + "epoch": 0.04306390925819121, + "grad_norm": 0.733677864074707, + "learning_rate": 1.4296160877513712e-05, + "loss": 1.5477, + "mean_token_accuracy": 0.6210927665233612, + "num_tokens": 65632732.0, + "step": 392 + }, + { + "entropy": 1.939713458220164, + "epoch": 0.04317376616956414, + "grad_norm": 1.312638282775879, + "learning_rate": 1.4332723948811702e-05, + "loss": 1.4456, + "mean_token_accuracy": 0.6392714977264404, + "num_tokens": 65774896.0, + "step": 393 + }, + { + "entropy": 1.8830204804738362, + "epoch": 0.04328362308093708, + "grad_norm": 0.9776220917701721, + "learning_rate": 1.436928702010969e-05, + "loss": 1.4522, + "mean_token_accuracy": 0.6250886768102646, + "num_tokens": 65965831.0, + "step": 394 + }, + { + "entropy": 1.8725888232390087, + "epoch": 0.04339347999231002, + "grad_norm": 0.9504810571670532, + "learning_rate": 1.4405850091407681e-05, + "loss": 1.507, + "mean_token_accuracy": 0.6213281452655792, + "num_tokens": 66131698.0, + "step": 395 + }, + { + "entropy": 1.946168194214503, + "epoch": 0.04350333690368295, + "grad_norm": 1.1905755996704102, + "learning_rate": 1.444241316270567e-05, + "loss": 1.4674, + "mean_token_accuracy": 0.6266982605059942, + "num_tokens": 66250281.0, + "step": 396 + }, + { + "entropy": 1.8628549575805664, + "epoch": 0.04361319381505589, + "grad_norm": 1.0694218873977661, + "learning_rate": 1.4478976234003657e-05, + "loss": 1.3903, + "mean_token_accuracy": 0.6529761354128519, + "num_tokens": 66380753.0, + "step": 397 + }, + { + "entropy": 1.8151433169841766, + "epoch": 0.043723050726428826, + "grad_norm": 1.0954636335372925, + "learning_rate": 1.4515539305301647e-05, + "loss": 1.3429, + "mean_token_accuracy": 0.6568314780791601, + "num_tokens": 66511047.0, + "step": 398 + }, + { + "entropy": 1.8963292141755421, + "epoch": 0.04383290763780176, + "grad_norm": 0.7910407781600952, + "learning_rate": 1.4552102376599635e-05, + "loss": 1.5738, + "mean_token_accuracy": 0.6402320464452108, + "num_tokens": 66691862.0, + "step": 399 + }, + { + "entropy": 1.9180162648359935, + "epoch": 0.0439427645491747, + "grad_norm": 0.9959750175476074, + "learning_rate": 1.4588665447897625e-05, + "loss": 1.7048, + "mean_token_accuracy": 0.6067099720239639, + "num_tokens": 66884961.0, + "step": 400 + }, + { + "entropy": 1.8748231430848439, + "epoch": 0.044052621460547635, + "grad_norm": 0.9748513102531433, + "learning_rate": 1.4625228519195613e-05, + "loss": 1.5165, + "mean_token_accuracy": 0.6368064184983572, + "num_tokens": 67041958.0, + "step": 401 + }, + { + "entropy": 1.85904856522878, + "epoch": 0.044162478371920576, + "grad_norm": 1.2120349407196045, + "learning_rate": 1.4661791590493604e-05, + "loss": 1.3507, + "mean_token_accuracy": 0.6649549951155981, + "num_tokens": 67176830.0, + "step": 402 + }, + { + "entropy": 1.8290843864281972, + "epoch": 0.04427233528329351, + "grad_norm": 0.9248878359794617, + "learning_rate": 1.4698354661791592e-05, + "loss": 1.4839, + "mean_token_accuracy": 0.6410348663727442, + "num_tokens": 67335145.0, + "step": 403 + }, + { + "entropy": 1.86801873644193, + "epoch": 0.044382192194666444, + "grad_norm": 1.033895492553711, + "learning_rate": 1.473491773308958e-05, + "loss": 1.6367, + "mean_token_accuracy": 0.6232090393702189, + "num_tokens": 67511434.0, + "step": 404 + }, + { + "entropy": 1.9208786884943645, + "epoch": 0.044492049106039384, + "grad_norm": 0.9780264496803284, + "learning_rate": 1.477148080438757e-05, + "loss": 1.4249, + "mean_token_accuracy": 0.659845232963562, + "num_tokens": 67644219.0, + "step": 405 + }, + { + "entropy": 1.8616258203983307, + "epoch": 0.04460190601741232, + "grad_norm": 1.050032377243042, + "learning_rate": 1.4808043875685558e-05, + "loss": 1.522, + "mean_token_accuracy": 0.6423654605944952, + "num_tokens": 67802624.0, + "step": 406 + }, + { + "entropy": 1.9324693580468495, + "epoch": 0.04471176292878526, + "grad_norm": 0.8673065900802612, + "learning_rate": 1.4844606946983548e-05, + "loss": 1.4363, + "mean_token_accuracy": 0.6219440003236135, + "num_tokens": 67979554.0, + "step": 407 + }, + { + "entropy": 1.8592036068439484, + "epoch": 0.04482161984015819, + "grad_norm": 0.809765100479126, + "learning_rate": 1.4881170018281536e-05, + "loss": 1.5808, + "mean_token_accuracy": 0.63762233654658, + "num_tokens": 68165091.0, + "step": 408 + }, + { + "entropy": 1.9416759411493938, + "epoch": 0.044931476751531134, + "grad_norm": 0.6962368488311768, + "learning_rate": 1.4917733089579528e-05, + "loss": 1.4768, + "mean_token_accuracy": 0.6251722325881323, + "num_tokens": 68410038.0, + "step": 409 + }, + { + "entropy": 1.950746734937032, + "epoch": 0.04504133366290407, + "grad_norm": 1.4196857213974, + "learning_rate": 1.4954296160877516e-05, + "loss": 1.402, + "mean_token_accuracy": 0.6466809262832006, + "num_tokens": 68514877.0, + "step": 410 + }, + { + "entropy": 1.963972936073939, + "epoch": 0.045151190574277, + "grad_norm": 1.451259970664978, + "learning_rate": 1.4990859232175504e-05, + "loss": 1.4126, + "mean_token_accuracy": 0.6355726569890976, + "num_tokens": 68660564.0, + "step": 411 + }, + { + "entropy": 1.8489231765270233, + "epoch": 0.04526104748564994, + "grad_norm": 1.0938141345977783, + "learning_rate": 1.5027422303473494e-05, + "loss": 1.387, + "mean_token_accuracy": 0.6448115805784861, + "num_tokens": 68811370.0, + "step": 412 + }, + { + "entropy": 1.8644197285175323, + "epoch": 0.045370904397022876, + "grad_norm": 1.0362029075622559, + "learning_rate": 1.5063985374771482e-05, + "loss": 1.5106, + "mean_token_accuracy": 0.6414720167716345, + "num_tokens": 68959627.0, + "step": 413 + }, + { + "entropy": 1.9586124916871388, + "epoch": 0.04548076130839582, + "grad_norm": 0.8306599259376526, + "learning_rate": 1.5100548446069471e-05, + "loss": 1.6478, + "mean_token_accuracy": 0.6033438295125961, + "num_tokens": 69141689.0, + "step": 414 + }, + { + "entropy": 1.9255466957887013, + "epoch": 0.04559061821976875, + "grad_norm": 0.7374395728111267, + "learning_rate": 1.513711151736746e-05, + "loss": 1.5721, + "mean_token_accuracy": 0.622237409154574, + "num_tokens": 69346313.0, + "step": 415 + }, + { + "entropy": 1.9461825489997864, + "epoch": 0.045700475131141685, + "grad_norm": 1.0214322805404663, + "learning_rate": 1.5173674588665451e-05, + "loss": 1.3917, + "mean_token_accuracy": 0.6484548399845759, + "num_tokens": 69484013.0, + "step": 416 + }, + { + "entropy": 1.9209075768788655, + "epoch": 0.045810332042514626, + "grad_norm": 1.165231466293335, + "learning_rate": 1.5210237659963439e-05, + "loss": 1.4843, + "mean_token_accuracy": 0.637021337946256, + "num_tokens": 69634261.0, + "step": 417 + }, + { + "entropy": 1.8663530945777893, + "epoch": 0.04592018895388756, + "grad_norm": 0.8267627358436584, + "learning_rate": 1.5246800731261427e-05, + "loss": 1.5228, + "mean_token_accuracy": 0.6384973078966141, + "num_tokens": 69810623.0, + "step": 418 + }, + { + "entropy": 1.8647344807783763, + "epoch": 0.0460300458652605, + "grad_norm": 1.1516979932785034, + "learning_rate": 1.5283363802559417e-05, + "loss": 1.3857, + "mean_token_accuracy": 0.649631142616272, + "num_tokens": 69971092.0, + "step": 419 + }, + { + "entropy": 1.8947654863198597, + "epoch": 0.046139902776633435, + "grad_norm": 1.20870041847229, + "learning_rate": 1.5319926873857403e-05, + "loss": 1.4169, + "mean_token_accuracy": 0.6381567666927973, + "num_tokens": 70117842.0, + "step": 420 + }, + { + "entropy": 1.95854847629865, + "epoch": 0.04624975968800637, + "grad_norm": 1.056316614151001, + "learning_rate": 1.5356489945155396e-05, + "loss": 1.4916, + "mean_token_accuracy": 0.6343552867571512, + "num_tokens": 70258941.0, + "step": 421 + }, + { + "entropy": 1.8973442415396373, + "epoch": 0.04635961659937931, + "grad_norm": 0.7159221172332764, + "learning_rate": 1.5393053016453383e-05, + "loss": 1.5651, + "mean_token_accuracy": 0.621953676144282, + "num_tokens": 70458247.0, + "step": 422 + }, + { + "entropy": 1.9163278142611186, + "epoch": 0.04646947351075224, + "grad_norm": 0.6779471039772034, + "learning_rate": 1.5429616087751372e-05, + "loss": 1.4651, + "mean_token_accuracy": 0.6381291598081589, + "num_tokens": 70633332.0, + "step": 423 + }, + { + "entropy": 1.853121320406596, + "epoch": 0.046579330422125184, + "grad_norm": 0.7182997465133667, + "learning_rate": 1.5466179159049362e-05, + "loss": 1.4749, + "mean_token_accuracy": 0.6441103170315424, + "num_tokens": 70828981.0, + "step": 424 + }, + { + "entropy": 1.9391433397928874, + "epoch": 0.04668918733349812, + "grad_norm": 0.8690926432609558, + "learning_rate": 1.550274223034735e-05, + "loss": 1.4578, + "mean_token_accuracy": 0.6315609067678452, + "num_tokens": 70989991.0, + "step": 425 + }, + { + "entropy": 1.8937304317951202, + "epoch": 0.04679904424487106, + "grad_norm": 1.377400517463684, + "learning_rate": 1.553930530164534e-05, + "loss": 1.4113, + "mean_token_accuracy": 0.645716001590093, + "num_tokens": 71123738.0, + "step": 426 + }, + { + "entropy": 1.92280246814092, + "epoch": 0.04690890115624399, + "grad_norm": 1.2862893342971802, + "learning_rate": 1.5575868372943328e-05, + "loss": 1.4553, + "mean_token_accuracy": 0.6406665394703547, + "num_tokens": 71236228.0, + "step": 427 + }, + { + "entropy": 1.8118858635425568, + "epoch": 0.04701875806761693, + "grad_norm": 0.903378963470459, + "learning_rate": 1.5612431444241318e-05, + "loss": 1.5105, + "mean_token_accuracy": 0.6435635139544805, + "num_tokens": 71383417.0, + "step": 428 + }, + { + "entropy": 2.0178940494855246, + "epoch": 0.04712861497898987, + "grad_norm": 1.603965163230896, + "learning_rate": 1.5648994515539308e-05, + "loss": 1.3997, + "mean_token_accuracy": 0.6480491409699122, + "num_tokens": 71521903.0, + "step": 429 + }, + { + "entropy": 1.865706165631612, + "epoch": 0.0472384718903628, + "grad_norm": 1.1885672807693481, + "learning_rate": 1.5685557586837297e-05, + "loss": 1.5575, + "mean_token_accuracy": 0.6412462542454401, + "num_tokens": 71704942.0, + "step": 430 + }, + { + "entropy": 1.816953221956889, + "epoch": 0.04734832880173574, + "grad_norm": 0.7502696514129639, + "learning_rate": 1.5722120658135284e-05, + "loss": 1.4532, + "mean_token_accuracy": 0.6353533814350764, + "num_tokens": 71906527.0, + "step": 431 + }, + { + "entropy": 1.8986171980698903, + "epoch": 0.047458185713108676, + "grad_norm": 1.043899655342102, + "learning_rate": 1.5758683729433274e-05, + "loss": 1.407, + "mean_token_accuracy": 0.6350090801715851, + "num_tokens": 72112800.0, + "step": 432 + }, + { + "entropy": 1.9298064609368641, + "epoch": 0.04756804262448161, + "grad_norm": 0.8479198217391968, + "learning_rate": 1.5795246800731263e-05, + "loss": 1.5159, + "mean_token_accuracy": 0.6414108375708262, + "num_tokens": 72320482.0, + "step": 433 + }, + { + "entropy": 1.9490727484226227, + "epoch": 0.04767789953585455, + "grad_norm": 0.848849356174469, + "learning_rate": 1.583180987202925e-05, + "loss": 1.5658, + "mean_token_accuracy": 0.6233013023932775, + "num_tokens": 72463724.0, + "step": 434 + }, + { + "entropy": 1.8321526845296223, + "epoch": 0.047787756447227485, + "grad_norm": 0.7417627573013306, + "learning_rate": 1.5868372943327243e-05, + "loss": 1.3661, + "mean_token_accuracy": 0.6409125824769338, + "num_tokens": 72635818.0, + "step": 435 + }, + { + "entropy": 1.8534736235936482, + "epoch": 0.047897613358600426, + "grad_norm": 0.836335301399231, + "learning_rate": 1.590493601462523e-05, + "loss": 1.6775, + "mean_token_accuracy": 0.630969633658727, + "num_tokens": 72801261.0, + "step": 436 + }, + { + "entropy": 1.969543606042862, + "epoch": 0.04800747026997336, + "grad_norm": 0.9996944665908813, + "learning_rate": 1.594149908592322e-05, + "loss": 1.4598, + "mean_token_accuracy": 0.634533574183782, + "num_tokens": 72940719.0, + "step": 437 + }, + { + "entropy": 1.893303821484248, + "epoch": 0.048117327181346294, + "grad_norm": 1.1567001342773438, + "learning_rate": 1.597806215722121e-05, + "loss": 1.3812, + "mean_token_accuracy": 0.6442390580972036, + "num_tokens": 73072820.0, + "step": 438 + }, + { + "entropy": 1.9349376459916432, + "epoch": 0.048227184092719234, + "grad_norm": 0.9590914845466614, + "learning_rate": 1.6014625228519195e-05, + "loss": 1.4763, + "mean_token_accuracy": 0.6355199714501699, + "num_tokens": 73241924.0, + "step": 439 + }, + { + "entropy": 1.8869278033574421, + "epoch": 0.04833704100409217, + "grad_norm": 0.7127754092216492, + "learning_rate": 1.6051188299817185e-05, + "loss": 1.4739, + "mean_token_accuracy": 0.6266596366961797, + "num_tokens": 73421388.0, + "step": 440 + }, + { + "entropy": 1.8205039203166962, + "epoch": 0.04844689791546511, + "grad_norm": 1.2497098445892334, + "learning_rate": 1.6087751371115175e-05, + "loss": 1.4651, + "mean_token_accuracy": 0.6429401089747747, + "num_tokens": 73574441.0, + "step": 441 + }, + { + "entropy": 1.880224694808324, + "epoch": 0.04855675482683804, + "grad_norm": 0.8544715046882629, + "learning_rate": 1.6124314442413164e-05, + "loss": 1.4796, + "mean_token_accuracy": 0.6351951907078425, + "num_tokens": 73732021.0, + "step": 442 + }, + { + "entropy": 1.845553586880366, + "epoch": 0.04866661173821098, + "grad_norm": 0.8492904305458069, + "learning_rate": 1.6160877513711154e-05, + "loss": 1.4662, + "mean_token_accuracy": 0.6401056249936422, + "num_tokens": 73898879.0, + "step": 443 + }, + { + "entropy": 1.8184046844641368, + "epoch": 0.04877646864958392, + "grad_norm": 0.8159205913543701, + "learning_rate": 1.6197440585009144e-05, + "loss": 1.3999, + "mean_token_accuracy": 0.6590509961048762, + "num_tokens": 74048185.0, + "step": 444 + }, + { + "entropy": 1.909311443567276, + "epoch": 0.04888632556095685, + "grad_norm": 0.8159104585647583, + "learning_rate": 1.623400365630713e-05, + "loss": 1.4732, + "mean_token_accuracy": 0.639503538608551, + "num_tokens": 74205846.0, + "step": 445 + }, + { + "entropy": 1.8864035904407501, + "epoch": 0.04899618247232979, + "grad_norm": 1.0417039394378662, + "learning_rate": 1.627056672760512e-05, + "loss": 1.35, + "mean_token_accuracy": 0.6542015026013056, + "num_tokens": 74350478.0, + "step": 446 + }, + { + "entropy": 1.9047284424304962, + "epoch": 0.049106039383702726, + "grad_norm": 0.7739196419715881, + "learning_rate": 1.630712979890311e-05, + "loss": 1.4764, + "mean_token_accuracy": 0.6416764905055364, + "num_tokens": 74523233.0, + "step": 447 + }, + { + "entropy": 1.8290265500545502, + "epoch": 0.04921589629507567, + "grad_norm": 0.8136515021324158, + "learning_rate": 1.6343692870201096e-05, + "loss": 1.4462, + "mean_token_accuracy": 0.6408629318078359, + "num_tokens": 74662652.0, + "step": 448 + }, + { + "entropy": 1.836196482181549, + "epoch": 0.0493257532064486, + "grad_norm": 0.8380835056304932, + "learning_rate": 1.638025594149909e-05, + "loss": 1.4125, + "mean_token_accuracy": 0.6456181158622106, + "num_tokens": 74836953.0, + "step": 449 + }, + { + "entropy": 1.8020283778508503, + "epoch": 0.049435610117821535, + "grad_norm": 1.37300705909729, + "learning_rate": 1.6416819012797076e-05, + "loss": 1.3302, + "mean_token_accuracy": 0.6666079958279928, + "num_tokens": 74981072.0, + "step": 450 + }, + { + "entropy": 1.9055909911791484, + "epoch": 0.049545467029194476, + "grad_norm": 0.9503870010375977, + "learning_rate": 1.6453382084095066e-05, + "loss": 1.4723, + "mean_token_accuracy": 0.6394474705060323, + "num_tokens": 75121906.0, + "step": 451 + }, + { + "entropy": 1.842297613620758, + "epoch": 0.04965532394056741, + "grad_norm": 0.7884616851806641, + "learning_rate": 1.6489945155393055e-05, + "loss": 1.4174, + "mean_token_accuracy": 0.6418876697619756, + "num_tokens": 75293694.0, + "step": 452 + }, + { + "entropy": 1.921643594900767, + "epoch": 0.04976518085194035, + "grad_norm": 1.0184119939804077, + "learning_rate": 1.6526508226691042e-05, + "loss": 1.5131, + "mean_token_accuracy": 0.6456418732802073, + "num_tokens": 75431517.0, + "step": 453 + }, + { + "entropy": 1.916659524043401, + "epoch": 0.049875037763313285, + "grad_norm": 1.5741225481033325, + "learning_rate": 1.656307129798903e-05, + "loss": 1.348, + "mean_token_accuracy": 0.6527373790740967, + "num_tokens": 75541765.0, + "step": 454 + }, + { + "entropy": 1.8336934447288513, + "epoch": 0.04998489467468622, + "grad_norm": 0.9903491735458374, + "learning_rate": 1.659963436928702e-05, + "loss": 1.399, + "mean_token_accuracy": 0.653240958849589, + "num_tokens": 75694830.0, + "step": 455 + }, + { + "entropy": 1.8270506660143535, + "epoch": 0.05009475158605916, + "grad_norm": 0.7361817955970764, + "learning_rate": 1.663619744058501e-05, + "loss": 1.5485, + "mean_token_accuracy": 0.637207085887591, + "num_tokens": 75927979.0, + "step": 456 + }, + { + "entropy": 1.9009975989659627, + "epoch": 0.05020460849743209, + "grad_norm": 1.2144572734832764, + "learning_rate": 1.6672760511883e-05, + "loss": 1.3843, + "mean_token_accuracy": 0.6520481109619141, + "num_tokens": 76028451.0, + "step": 457 + }, + { + "entropy": 1.933722198009491, + "epoch": 0.050314465408805034, + "grad_norm": 0.9374269843101501, + "learning_rate": 1.6709323583180987e-05, + "loss": 1.399, + "mean_token_accuracy": 0.6511914978424708, + "num_tokens": 76162186.0, + "step": 458 + }, + { + "entropy": 1.8920509020487468, + "epoch": 0.05042432232017797, + "grad_norm": 0.7262760400772095, + "learning_rate": 1.6745886654478977e-05, + "loss": 1.5039, + "mean_token_accuracy": 0.6288343866666158, + "num_tokens": 76330676.0, + "step": 459 + }, + { + "entropy": 1.8482555548350017, + "epoch": 0.0505341792315509, + "grad_norm": 0.8332237601280212, + "learning_rate": 1.6782449725776967e-05, + "loss": 1.5237, + "mean_token_accuracy": 0.6457237054904302, + "num_tokens": 76468568.0, + "step": 460 + }, + { + "entropy": 1.820657879114151, + "epoch": 0.05064403614292384, + "grad_norm": 1.824617624282837, + "learning_rate": 1.6819012797074956e-05, + "loss": 1.0706, + "mean_token_accuracy": 0.6815401464700699, + "num_tokens": 76622873.0, + "step": 461 + }, + { + "entropy": 1.8888212939103444, + "epoch": 0.05075389305429678, + "grad_norm": 0.8382301926612854, + "learning_rate": 1.6855575868372943e-05, + "loss": 1.5501, + "mean_token_accuracy": 0.6339965413014094, + "num_tokens": 76788721.0, + "step": 462 + }, + { + "entropy": 1.782186617453893, + "epoch": 0.05086374996566972, + "grad_norm": 0.8659656643867493, + "learning_rate": 1.6892138939670936e-05, + "loss": 1.382, + "mean_token_accuracy": 0.6629950056473414, + "num_tokens": 76929800.0, + "step": 463 + }, + { + "entropy": 1.84979913632075, + "epoch": 0.05097360687704265, + "grad_norm": 1.0096579790115356, + "learning_rate": 1.6928702010968922e-05, + "loss": 1.4331, + "mean_token_accuracy": 0.6603454450766245, + "num_tokens": 77088974.0, + "step": 464 + }, + { + "entropy": 1.8461360732714336, + "epoch": 0.051083463788415585, + "grad_norm": 0.851254940032959, + "learning_rate": 1.6965265082266912e-05, + "loss": 1.5439, + "mean_token_accuracy": 0.6388277113437653, + "num_tokens": 77269063.0, + "step": 465 + }, + { + "entropy": 1.8718996942043304, + "epoch": 0.051193320699788526, + "grad_norm": 1.469465732574463, + "learning_rate": 1.7001828153564902e-05, + "loss": 1.2971, + "mean_token_accuracy": 0.6635722517967224, + "num_tokens": 77403714.0, + "step": 466 + }, + { + "entropy": 1.785783976316452, + "epoch": 0.05130317761116146, + "grad_norm": 0.9720367193222046, + "learning_rate": 1.7038391224862888e-05, + "loss": 1.3768, + "mean_token_accuracy": 0.660425583521525, + "num_tokens": 77551768.0, + "step": 467 + }, + { + "entropy": 1.783895234266917, + "epoch": 0.0514130345225344, + "grad_norm": 0.8119345903396606, + "learning_rate": 1.7074954296160878e-05, + "loss": 1.3155, + "mean_token_accuracy": 0.6677038272221884, + "num_tokens": 77707970.0, + "step": 468 + }, + { + "entropy": 1.8844469288984935, + "epoch": 0.051522891433907335, + "grad_norm": 1.0332210063934326, + "learning_rate": 1.7111517367458868e-05, + "loss": 1.3061, + "mean_token_accuracy": 0.6705901821454366, + "num_tokens": 77838254.0, + "step": 469 + }, + { + "entropy": 1.899887502193451, + "epoch": 0.051632748345280276, + "grad_norm": 0.8115286231040955, + "learning_rate": 1.7148080438756858e-05, + "loss": 1.6136, + "mean_token_accuracy": 0.6468661973873774, + "num_tokens": 78030097.0, + "step": 470 + }, + { + "entropy": 1.8445066312948863, + "epoch": 0.05174260525665321, + "grad_norm": 0.67425936460495, + "learning_rate": 1.7184643510054847e-05, + "loss": 1.4869, + "mean_token_accuracy": 0.6447852005561193, + "num_tokens": 78218757.0, + "step": 471 + }, + { + "entropy": 1.8734458883603413, + "epoch": 0.051852462168026144, + "grad_norm": 0.7984296679496765, + "learning_rate": 1.7221206581352834e-05, + "loss": 1.4225, + "mean_token_accuracy": 0.6457456847031912, + "num_tokens": 78377793.0, + "step": 472 + }, + { + "entropy": 1.9007401863733928, + "epoch": 0.051962319079399084, + "grad_norm": 0.760857343673706, + "learning_rate": 1.7257769652650823e-05, + "loss": 1.5456, + "mean_token_accuracy": 0.6302074193954468, + "num_tokens": 78573934.0, + "step": 473 + }, + { + "entropy": 1.849319765965144, + "epoch": 0.05207217599077202, + "grad_norm": 1.178850531578064, + "learning_rate": 1.7294332723948813e-05, + "loss": 1.3321, + "mean_token_accuracy": 0.6654231746991476, + "num_tokens": 78684333.0, + "step": 474 + }, + { + "entropy": 1.8049305478731792, + "epoch": 0.05218203290214496, + "grad_norm": 0.7811275720596313, + "learning_rate": 1.7330895795246803e-05, + "loss": 1.3971, + "mean_token_accuracy": 0.6538741886615753, + "num_tokens": 78851016.0, + "step": 475 + }, + { + "entropy": 1.8060388763745625, + "epoch": 0.05229188981351789, + "grad_norm": 1.0945056676864624, + "learning_rate": 1.736745886654479e-05, + "loss": 1.339, + "mean_token_accuracy": 0.6550626158714294, + "num_tokens": 79003715.0, + "step": 476 + }, + { + "entropy": 1.8515853186448414, + "epoch": 0.05240174672489083, + "grad_norm": 0.6653461456298828, + "learning_rate": 1.7404021937842783e-05, + "loss": 1.5661, + "mean_token_accuracy": 0.6174762199322382, + "num_tokens": 79283121.0, + "step": 477 + }, + { + "entropy": 1.851629654566447, + "epoch": 0.05251160363626377, + "grad_norm": 0.7771194577217102, + "learning_rate": 1.744058500914077e-05, + "loss": 1.5499, + "mean_token_accuracy": 0.6309017390012741, + "num_tokens": 79463326.0, + "step": 478 + }, + { + "entropy": 1.8349732557932537, + "epoch": 0.0526214605476367, + "grad_norm": 0.9575709700584412, + "learning_rate": 1.747714808043876e-05, + "loss": 1.4673, + "mean_token_accuracy": 0.6315694997708002, + "num_tokens": 79601389.0, + "step": 479 + }, + { + "entropy": 1.9489451746145885, + "epoch": 0.05273131745900964, + "grad_norm": 0.7346012592315674, + "learning_rate": 1.751371115173675e-05, + "loss": 1.4289, + "mean_token_accuracy": 0.6379700899124146, + "num_tokens": 79742483.0, + "step": 480 + }, + { + "entropy": 1.8353569904963176, + "epoch": 0.052841174370382576, + "grad_norm": 0.7082385420799255, + "learning_rate": 1.7550274223034735e-05, + "loss": 1.4335, + "mean_token_accuracy": 0.6611069192488989, + "num_tokens": 79996396.0, + "step": 481 + }, + { + "entropy": 1.8154561916987102, + "epoch": 0.05295103128175551, + "grad_norm": 0.6445807218551636, + "learning_rate": 1.7586837294332725e-05, + "loss": 1.4676, + "mean_token_accuracy": 0.6439694265524546, + "num_tokens": 80231599.0, + "step": 482 + }, + { + "entropy": 1.8863433003425598, + "epoch": 0.05306088819312845, + "grad_norm": 0.8372637629508972, + "learning_rate": 1.7623400365630714e-05, + "loss": 1.6164, + "mean_token_accuracy": 0.6192357142766317, + "num_tokens": 80385089.0, + "step": 483 + }, + { + "entropy": 1.8703928589820862, + "epoch": 0.053170745104501385, + "grad_norm": 0.7205429673194885, + "learning_rate": 1.7659963436928704e-05, + "loss": 1.4353, + "mean_token_accuracy": 0.6483421623706818, + "num_tokens": 80590298.0, + "step": 484 + }, + { + "entropy": 1.869334836800893, + "epoch": 0.053280602015874326, + "grad_norm": 0.6076232194900513, + "learning_rate": 1.7696526508226694e-05, + "loss": 1.4977, + "mean_token_accuracy": 0.6276814242204031, + "num_tokens": 80811725.0, + "step": 485 + }, + { + "entropy": 1.7474851707617443, + "epoch": 0.05339045892724726, + "grad_norm": 0.8083134889602661, + "learning_rate": 1.773308957952468e-05, + "loss": 1.2768, + "mean_token_accuracy": 0.6715359588464102, + "num_tokens": 80953065.0, + "step": 486 + }, + { + "entropy": 1.8078358471393585, + "epoch": 0.053500315838620194, + "grad_norm": 0.9833588600158691, + "learning_rate": 1.776965265082267e-05, + "loss": 1.3843, + "mean_token_accuracy": 0.6488884389400482, + "num_tokens": 81123124.0, + "step": 487 + }, + { + "entropy": 1.8762567341327667, + "epoch": 0.053610172749993135, + "grad_norm": 0.7375379800796509, + "learning_rate": 1.780621572212066e-05, + "loss": 1.443, + "mean_token_accuracy": 0.6359260429938635, + "num_tokens": 81279364.0, + "step": 488 + }, + { + "entropy": 1.852634310722351, + "epoch": 0.05372002966136607, + "grad_norm": 0.9888647794723511, + "learning_rate": 1.784277879341865e-05, + "loss": 1.412, + "mean_token_accuracy": 0.6469916899998983, + "num_tokens": 81430039.0, + "step": 489 + }, + { + "entropy": 1.89104425907135, + "epoch": 0.05382988657273901, + "grad_norm": 0.8109338879585266, + "learning_rate": 1.7879341864716636e-05, + "loss": 1.4514, + "mean_token_accuracy": 0.6364847421646118, + "num_tokens": 81619758.0, + "step": 490 + }, + { + "entropy": 1.9149113893508911, + "epoch": 0.05393974348411194, + "grad_norm": 0.7840366363525391, + "learning_rate": 1.791590493601463e-05, + "loss": 1.4288, + "mean_token_accuracy": 0.6536184300978979, + "num_tokens": 81796494.0, + "step": 491 + }, + { + "entropy": 1.8676457504431407, + "epoch": 0.054049600395484884, + "grad_norm": 0.8361501097679138, + "learning_rate": 1.7952468007312615e-05, + "loss": 1.3977, + "mean_token_accuracy": 0.663863534728686, + "num_tokens": 81932706.0, + "step": 492 + }, + { + "entropy": 1.8649681508541107, + "epoch": 0.05415945730685782, + "grad_norm": 0.9290244579315186, + "learning_rate": 1.7989031078610605e-05, + "loss": 1.5228, + "mean_token_accuracy": 0.6413531800111135, + "num_tokens": 82154289.0, + "step": 493 + }, + { + "entropy": 1.8859212299187977, + "epoch": 0.05426931421823075, + "grad_norm": 0.792782723903656, + "learning_rate": 1.8025594149908595e-05, + "loss": 1.6165, + "mean_token_accuracy": 0.6251773834228516, + "num_tokens": 82343958.0, + "step": 494 + }, + { + "entropy": 1.867518424987793, + "epoch": 0.05437917112960369, + "grad_norm": 0.6810131669044495, + "learning_rate": 1.806215722120658e-05, + "loss": 1.4721, + "mean_token_accuracy": 0.6292106856902441, + "num_tokens": 82553546.0, + "step": 495 + }, + { + "entropy": 1.7545313934485118, + "epoch": 0.05448902804097663, + "grad_norm": 0.6590803861618042, + "learning_rate": 1.809872029250457e-05, + "loss": 1.3972, + "mean_token_accuracy": 0.6627217878897985, + "num_tokens": 82736724.0, + "step": 496 + }, + { + "entropy": 1.8064947426319122, + "epoch": 0.05459888495234957, + "grad_norm": 0.7147844433784485, + "learning_rate": 1.813528336380256e-05, + "loss": 1.4111, + "mean_token_accuracy": 0.6673514246940613, + "num_tokens": 82928814.0, + "step": 497 + }, + { + "entropy": 1.8595764935016632, + "epoch": 0.0547087418637225, + "grad_norm": 0.7674292325973511, + "learning_rate": 1.817184643510055e-05, + "loss": 1.5436, + "mean_token_accuracy": 0.6367508967717489, + "num_tokens": 83169052.0, + "step": 498 + }, + { + "entropy": 1.7980642318725586, + "epoch": 0.054818598775095435, + "grad_norm": 0.7615039348602295, + "learning_rate": 1.820840950639854e-05, + "loss": 1.4503, + "mean_token_accuracy": 0.6509919663270315, + "num_tokens": 83362711.0, + "step": 499 + }, + { + "entropy": 1.848442365725835, + "epoch": 0.054928455686468376, + "grad_norm": 0.6286888718605042, + "learning_rate": 1.8244972577696527e-05, + "loss": 1.3508, + "mean_token_accuracy": 0.6550141274929047, + "num_tokens": 83540687.0, + "step": 500 + }, + { + "entropy": 1.8486445744832356, + "epoch": 0.05503831259784131, + "grad_norm": 1.779735803604126, + "learning_rate": 1.8281535648994517e-05, + "loss": 1.1664, + "mean_token_accuracy": 0.6659951458374659, + "num_tokens": 83716601.0, + "step": 501 + }, + { + "entropy": 1.7482089002927144, + "epoch": 0.05514816950921425, + "grad_norm": 0.715691089630127, + "learning_rate": 1.8318098720292506e-05, + "loss": 1.4013, + "mean_token_accuracy": 0.6483576248089472, + "num_tokens": 83897803.0, + "step": 502 + }, + { + "entropy": 1.8891556064287822, + "epoch": 0.055258026420587185, + "grad_norm": 0.7861650586128235, + "learning_rate": 1.8354661791590496e-05, + "loss": 1.4081, + "mean_token_accuracy": 0.6608035564422607, + "num_tokens": 84052066.0, + "step": 503 + }, + { + "entropy": 1.8510177036126454, + "epoch": 0.05536788333196012, + "grad_norm": 1.1780167818069458, + "learning_rate": 1.8391224862888482e-05, + "loss": 1.4559, + "mean_token_accuracy": 0.6438465466101965, + "num_tokens": 84214794.0, + "step": 504 + }, + { + "entropy": 1.8411860366662343, + "epoch": 0.05547774024333306, + "grad_norm": 0.6785144805908203, + "learning_rate": 1.8427787934186476e-05, + "loss": 1.6069, + "mean_token_accuracy": 0.6364410271247228, + "num_tokens": 84426411.0, + "step": 505 + }, + { + "entropy": 1.7741001347700756, + "epoch": 0.055587597154705994, + "grad_norm": 0.7365214824676514, + "learning_rate": 1.8464351005484462e-05, + "loss": 1.3837, + "mean_token_accuracy": 0.6612890263398489, + "num_tokens": 84564688.0, + "step": 506 + }, + { + "entropy": 1.858109325170517, + "epoch": 0.055697454066078934, + "grad_norm": 0.6560879945755005, + "learning_rate": 1.8500914076782452e-05, + "loss": 1.5934, + "mean_token_accuracy": 0.6287341316541036, + "num_tokens": 84764716.0, + "step": 507 + }, + { + "entropy": 1.8872665762901306, + "epoch": 0.05580731097745187, + "grad_norm": 0.8644893169403076, + "learning_rate": 1.853747714808044e-05, + "loss": 1.3268, + "mean_token_accuracy": 0.6630988270044327, + "num_tokens": 84906107.0, + "step": 508 + }, + { + "entropy": 1.8340339064598083, + "epoch": 0.05591716788882481, + "grad_norm": 0.7128955125808716, + "learning_rate": 1.8574040219378428e-05, + "loss": 1.3998, + "mean_token_accuracy": 0.658979594707489, + "num_tokens": 85057303.0, + "step": 509 + }, + { + "entropy": 1.877872258424759, + "epoch": 0.05602702480019774, + "grad_norm": 1.0351197719573975, + "learning_rate": 1.8610603290676418e-05, + "loss": 1.4379, + "mean_token_accuracy": 0.6406905551751455, + "num_tokens": 85237921.0, + "step": 510 + }, + { + "entropy": 1.832102398077647, + "epoch": 0.05613688171157068, + "grad_norm": 0.9562404155731201, + "learning_rate": 1.8647166361974407e-05, + "loss": 1.4014, + "mean_token_accuracy": 0.6476325045029322, + "num_tokens": 85363257.0, + "step": 511 + }, + { + "entropy": 1.8245374759038289, + "epoch": 0.05624673862294362, + "grad_norm": 0.7608838081359863, + "learning_rate": 1.8683729433272397e-05, + "loss": 1.4446, + "mean_token_accuracy": 0.6506583044926325, + "num_tokens": 85492436.0, + "step": 512 + }, + { + "entropy": 1.8646320700645447, + "epoch": 0.05635659553431655, + "grad_norm": 0.967135488986969, + "learning_rate": 1.8720292504570387e-05, + "loss": 1.3519, + "mean_token_accuracy": 0.6563466837008795, + "num_tokens": 85653597.0, + "step": 513 + }, + { + "entropy": 1.8935543298721313, + "epoch": 0.05646645244568949, + "grad_norm": 0.8624943494796753, + "learning_rate": 1.8756855575868373e-05, + "loss": 1.4485, + "mean_token_accuracy": 0.6568788141012192, + "num_tokens": 85765957.0, + "step": 514 + }, + { + "entropy": 1.8232440849145253, + "epoch": 0.056576309357062426, + "grad_norm": 0.7825310230255127, + "learning_rate": 1.8793418647166363e-05, + "loss": 1.4999, + "mean_token_accuracy": 0.6494305729866028, + "num_tokens": 85933528.0, + "step": 515 + }, + { + "entropy": 1.8484807411829631, + "epoch": 0.05668616626843536, + "grad_norm": 0.6889421939849854, + "learning_rate": 1.8829981718464353e-05, + "loss": 1.5048, + "mean_token_accuracy": 0.6314730395873388, + "num_tokens": 86120045.0, + "step": 516 + }, + { + "entropy": 1.8484809299310048, + "epoch": 0.0567960231798083, + "grad_norm": 0.9059920310974121, + "learning_rate": 1.8866544789762343e-05, + "loss": 1.4745, + "mean_token_accuracy": 0.6385734875996908, + "num_tokens": 86286777.0, + "step": 517 + }, + { + "entropy": 1.853780855735143, + "epoch": 0.056905880091181235, + "grad_norm": 0.8004304766654968, + "learning_rate": 1.890310786106033e-05, + "loss": 1.547, + "mean_token_accuracy": 0.640062207976977, + "num_tokens": 86451355.0, + "step": 518 + }, + { + "entropy": 1.807072252035141, + "epoch": 0.057015737002554176, + "grad_norm": 0.7398921847343445, + "learning_rate": 1.8939670932358322e-05, + "loss": 1.3051, + "mean_token_accuracy": 0.6686479697624842, + "num_tokens": 86619527.0, + "step": 519 + }, + { + "entropy": 1.8454334139823914, + "epoch": 0.05712559391392711, + "grad_norm": 0.68968665599823, + "learning_rate": 1.897623400365631e-05, + "loss": 1.3527, + "mean_token_accuracy": 0.6641424546639124, + "num_tokens": 86762429.0, + "step": 520 + }, + { + "entropy": 1.90091206630071, + "epoch": 0.057235450825300044, + "grad_norm": 0.9172680974006653, + "learning_rate": 1.90127970749543e-05, + "loss": 1.3641, + "mean_token_accuracy": 0.6482406457265218, + "num_tokens": 86864597.0, + "step": 521 + }, + { + "entropy": 1.8825439810752869, + "epoch": 0.057345307736672985, + "grad_norm": 0.9436008334159851, + "learning_rate": 1.9049360146252288e-05, + "loss": 1.4111, + "mean_token_accuracy": 0.6459259490172068, + "num_tokens": 86980728.0, + "step": 522 + }, + { + "entropy": 1.847435434659322, + "epoch": 0.05745516464804592, + "grad_norm": 0.5968527793884277, + "learning_rate": 1.9085923217550274e-05, + "loss": 1.4007, + "mean_token_accuracy": 0.6396257479985555, + "num_tokens": 87210371.0, + "step": 523 + }, + { + "entropy": 1.8333716690540314, + "epoch": 0.05756502155941886, + "grad_norm": 0.7715458273887634, + "learning_rate": 1.9122486288848264e-05, + "loss": 1.5396, + "mean_token_accuracy": 0.6410951962073644, + "num_tokens": 87404365.0, + "step": 524 + }, + { + "entropy": 1.7693612972895305, + "epoch": 0.05767487847079179, + "grad_norm": 1.143162727355957, + "learning_rate": 1.9159049360146254e-05, + "loss": 1.3754, + "mean_token_accuracy": 0.6539959609508514, + "num_tokens": 87539027.0, + "step": 525 + }, + { + "entropy": 1.8589285711447399, + "epoch": 0.05778473538216473, + "grad_norm": 0.8187215924263, + "learning_rate": 1.9195612431444244e-05, + "loss": 1.4923, + "mean_token_accuracy": 0.6357589811086655, + "num_tokens": 87746160.0, + "step": 526 + }, + { + "entropy": 1.8425296048323314, + "epoch": 0.05789459229353767, + "grad_norm": 0.7849891185760498, + "learning_rate": 1.9232175502742234e-05, + "loss": 1.3778, + "mean_token_accuracy": 0.6585688690344492, + "num_tokens": 87885330.0, + "step": 527 + }, + { + "entropy": 1.813408613204956, + "epoch": 0.0580044492049106, + "grad_norm": 0.9070361256599426, + "learning_rate": 1.926873857404022e-05, + "loss": 1.4793, + "mean_token_accuracy": 0.6466079652309418, + "num_tokens": 88023429.0, + "step": 528 + }, + { + "entropy": 1.8070883452892303, + "epoch": 0.05811430611628354, + "grad_norm": 0.8531019687652588, + "learning_rate": 1.930530164533821e-05, + "loss": 1.5821, + "mean_token_accuracy": 0.6390858789285024, + "num_tokens": 88217336.0, + "step": 529 + }, + { + "entropy": 1.762501190106074, + "epoch": 0.05822416302765648, + "grad_norm": 0.6754366755485535, + "learning_rate": 1.93418647166362e-05, + "loss": 1.3344, + "mean_token_accuracy": 0.6736961950858434, + "num_tokens": 88391452.0, + "step": 530 + }, + { + "entropy": 1.8850489755471547, + "epoch": 0.05833401993902942, + "grad_norm": 0.8538105487823486, + "learning_rate": 1.937842778793419e-05, + "loss": 1.3894, + "mean_token_accuracy": 0.6557394365469614, + "num_tokens": 88522769.0, + "step": 531 + }, + { + "entropy": 1.8417104880015056, + "epoch": 0.05844387685040235, + "grad_norm": 0.665955662727356, + "learning_rate": 1.9414990859232176e-05, + "loss": 1.4531, + "mean_token_accuracy": 0.6464942395687103, + "num_tokens": 88695328.0, + "step": 532 + }, + { + "entropy": 1.8750036259492238, + "epoch": 0.058553733761775285, + "grad_norm": 0.8706235289573669, + "learning_rate": 1.945155393053017e-05, + "loss": 1.4869, + "mean_token_accuracy": 0.6461243083079656, + "num_tokens": 88884132.0, + "step": 533 + }, + { + "entropy": 1.9077441891034443, + "epoch": 0.058663590673148226, + "grad_norm": 0.7450928092002869, + "learning_rate": 1.9488117001828155e-05, + "loss": 1.3803, + "mean_token_accuracy": 0.6466763665278753, + "num_tokens": 89011757.0, + "step": 534 + }, + { + "entropy": 1.8615180750687916, + "epoch": 0.05877344758452116, + "grad_norm": 0.6712978482246399, + "learning_rate": 1.9524680073126145e-05, + "loss": 1.4863, + "mean_token_accuracy": 0.63641490538915, + "num_tokens": 89201664.0, + "step": 535 + }, + { + "entropy": 1.8326091468334198, + "epoch": 0.0588833044958941, + "grad_norm": 0.731995701789856, + "learning_rate": 1.9561243144424135e-05, + "loss": 1.4143, + "mean_token_accuracy": 0.6499495257933935, + "num_tokens": 89342221.0, + "step": 536 + }, + { + "entropy": 1.768342783053716, + "epoch": 0.058993161407267035, + "grad_norm": 0.7949745655059814, + "learning_rate": 1.959780621572212e-05, + "loss": 1.473, + "mean_token_accuracy": 0.6656158169110616, + "num_tokens": 89483457.0, + "step": 537 + }, + { + "entropy": 1.8990335762500763, + "epoch": 0.05910301831863997, + "grad_norm": 0.7969281077384949, + "learning_rate": 1.963436928702011e-05, + "loss": 1.3961, + "mean_token_accuracy": 0.6416173179944357, + "num_tokens": 89600539.0, + "step": 538 + }, + { + "entropy": 1.9141974449157715, + "epoch": 0.05921287523001291, + "grad_norm": 0.8687071800231934, + "learning_rate": 1.96709323583181e-05, + "loss": 1.4503, + "mean_token_accuracy": 0.6416681607564291, + "num_tokens": 89730626.0, + "step": 539 + }, + { + "entropy": 1.924128810564677, + "epoch": 0.059322732141385844, + "grad_norm": 0.8359556198120117, + "learning_rate": 1.970749542961609e-05, + "loss": 1.3636, + "mean_token_accuracy": 0.6541641503572464, + "num_tokens": 89842957.0, + "step": 540 + }, + { + "entropy": 1.90417875846227, + "epoch": 0.059432589052758784, + "grad_norm": 0.7051600217819214, + "learning_rate": 1.974405850091408e-05, + "loss": 1.4902, + "mean_token_accuracy": 0.6339495678742727, + "num_tokens": 90053658.0, + "step": 541 + }, + { + "entropy": 1.81476491689682, + "epoch": 0.05954244596413172, + "grad_norm": 1.2421592473983765, + "learning_rate": 1.9780621572212066e-05, + "loss": 1.3046, + "mean_token_accuracy": 0.6642891814311346, + "num_tokens": 90217444.0, + "step": 542 + }, + { + "entropy": 1.8745914101600647, + "epoch": 0.05965230287550465, + "grad_norm": 0.7224368453025818, + "learning_rate": 1.9817184643510056e-05, + "loss": 1.4492, + "mean_token_accuracy": 0.6379654556512833, + "num_tokens": 90424582.0, + "step": 543 + }, + { + "entropy": 1.8410755693912506, + "epoch": 0.05976215978687759, + "grad_norm": 0.8019373416900635, + "learning_rate": 1.9853747714808046e-05, + "loss": 1.3688, + "mean_token_accuracy": 0.6595296114683151, + "num_tokens": 90581155.0, + "step": 544 + }, + { + "entropy": 1.8492278754711151, + "epoch": 0.05987201669825053, + "grad_norm": 0.7192192673683167, + "learning_rate": 1.9890310786106036e-05, + "loss": 1.4485, + "mean_token_accuracy": 0.6582233111063639, + "num_tokens": 90770719.0, + "step": 545 + }, + { + "entropy": 1.7911238272984822, + "epoch": 0.05998187360962347, + "grad_norm": 0.7712220549583435, + "learning_rate": 1.9926873857404022e-05, + "loss": 1.4184, + "mean_token_accuracy": 0.6521774580081304, + "num_tokens": 90954476.0, + "step": 546 + }, + { + "entropy": 1.7789829870065053, + "epoch": 0.0600917305209964, + "grad_norm": 0.7799301147460938, + "learning_rate": 1.9963436928702012e-05, + "loss": 1.3837, + "mean_token_accuracy": 0.6543701936801275, + "num_tokens": 91118773.0, + "step": 547 + }, + { + "entropy": 1.8910561104615529, + "epoch": 0.060201587432369336, + "grad_norm": 0.7084527611732483, + "learning_rate": 2e-05, + "loss": 1.5324, + "mean_token_accuracy": 0.6384007583061854, + "num_tokens": 91291657.0, + "step": 548 + }, + { + "entropy": 1.8717210789521534, + "epoch": 0.060311444343742276, + "grad_norm": 0.8148479461669922, + "learning_rate": 1.999999985757703e-05, + "loss": 1.375, + "mean_token_accuracy": 0.6619729151328405, + "num_tokens": 91417362.0, + "step": 549 + }, + { + "entropy": 1.8685090740521748, + "epoch": 0.06042130125511521, + "grad_norm": 0.9293744564056396, + "learning_rate": 1.9999999430308118e-05, + "loss": 1.4431, + "mean_token_accuracy": 0.644400030374527, + "num_tokens": 91664996.0, + "step": 550 + }, + { + "entropy": 1.8850101232528687, + "epoch": 0.06053115816648815, + "grad_norm": 0.6854516267776489, + "learning_rate": 1.999999871819328e-05, + "loss": 1.4789, + "mean_token_accuracy": 0.6571693470080694, + "num_tokens": 91816730.0, + "step": 551 + }, + { + "entropy": 1.8250452876091003, + "epoch": 0.060641015077861085, + "grad_norm": 0.8001742959022522, + "learning_rate": 1.9999997721232536e-05, + "loss": 1.3613, + "mean_token_accuracy": 0.6481845825910568, + "num_tokens": 91975856.0, + "step": 552 + }, + { + "entropy": 1.798028330008189, + "epoch": 0.060750871989234026, + "grad_norm": 0.7020228505134583, + "learning_rate": 1.999999643942592e-05, + "loss": 1.484, + "mean_token_accuracy": 0.6372916350762049, + "num_tokens": 92166416.0, + "step": 553 + }, + { + "entropy": 1.7991608679294586, + "epoch": 0.06086072890060696, + "grad_norm": 0.6366422772407532, + "learning_rate": 1.9999994872773474e-05, + "loss": 1.4652, + "mean_token_accuracy": 0.6406016697486242, + "num_tokens": 92426296.0, + "step": 554 + }, + { + "entropy": 1.8256397744019826, + "epoch": 0.060970585811979894, + "grad_norm": 0.7002333998680115, + "learning_rate": 1.9999993021275244e-05, + "loss": 1.2857, + "mean_token_accuracy": 0.659576748808225, + "num_tokens": 92610146.0, + "step": 555 + }, + { + "entropy": 1.8464308480421703, + "epoch": 0.061080442723352835, + "grad_norm": 0.7711760997772217, + "learning_rate": 1.999999088493129e-05, + "loss": 1.3101, + "mean_token_accuracy": 0.6737810522317886, + "num_tokens": 92745387.0, + "step": 556 + }, + { + "entropy": 1.8655918041865032, + "epoch": 0.06119029963472577, + "grad_norm": 0.7186883091926575, + "learning_rate": 1.999998846374168e-05, + "loss": 1.461, + "mean_token_accuracy": 0.6509124487638474, + "num_tokens": 92891313.0, + "step": 557 + }, + { + "entropy": 1.8603834410508473, + "epoch": 0.06130015654609871, + "grad_norm": 0.7382420301437378, + "learning_rate": 1.9999985757706496e-05, + "loss": 1.4309, + "mean_token_accuracy": 0.6502951284249624, + "num_tokens": 93118663.0, + "step": 558 + }, + { + "entropy": 1.8049174745877583, + "epoch": 0.06141001345747164, + "grad_norm": 0.7095914483070374, + "learning_rate": 1.9999982766825814e-05, + "loss": 1.4092, + "mean_token_accuracy": 0.6614819119373957, + "num_tokens": 93288993.0, + "step": 559 + }, + { + "entropy": 1.8342632949352264, + "epoch": 0.06151987036884458, + "grad_norm": 0.8286252021789551, + "learning_rate": 1.9999979491099732e-05, + "loss": 1.3481, + "mean_token_accuracy": 0.6633094002803167, + "num_tokens": 93426205.0, + "step": 560 + }, + { + "entropy": 1.8448581198851268, + "epoch": 0.06162972728021752, + "grad_norm": 1.0569968223571777, + "learning_rate": 1.9999975930528356e-05, + "loss": 1.4604, + "mean_token_accuracy": 0.6365474959214529, + "num_tokens": 93634167.0, + "step": 561 + }, + { + "entropy": 1.789972831805547, + "epoch": 0.06173958419159045, + "grad_norm": 0.9230442643165588, + "learning_rate": 1.9999972085111797e-05, + "loss": 1.4326, + "mean_token_accuracy": 0.6513793369134268, + "num_tokens": 93781489.0, + "step": 562 + }, + { + "entropy": 1.8464637994766235, + "epoch": 0.06184944110296339, + "grad_norm": 0.8019546866416931, + "learning_rate": 1.9999967954850177e-05, + "loss": 1.419, + "mean_token_accuracy": 0.6493262598911921, + "num_tokens": 93996272.0, + "step": 563 + }, + { + "entropy": 1.8182610770066578, + "epoch": 0.06195929801433633, + "grad_norm": 0.8727892637252808, + "learning_rate": 1.9999963539743628e-05, + "loss": 1.4969, + "mean_token_accuracy": 0.6455451051394144, + "num_tokens": 94171319.0, + "step": 564 + }, + { + "entropy": 1.8235016266504924, + "epoch": 0.06206915492570926, + "grad_norm": 0.8368512392044067, + "learning_rate": 1.9999958839792286e-05, + "loss": 1.4576, + "mean_token_accuracy": 0.6452821691830953, + "num_tokens": 94335499.0, + "step": 565 + }, + { + "entropy": 1.78616197903951, + "epoch": 0.0621790118370822, + "grad_norm": 0.7931045889854431, + "learning_rate": 1.9999953854996303e-05, + "loss": 1.3282, + "mean_token_accuracy": 0.6569087654352188, + "num_tokens": 94479337.0, + "step": 566 + }, + { + "entropy": 1.7730096379915874, + "epoch": 0.062288868748455135, + "grad_norm": 0.8699260950088501, + "learning_rate": 1.9999948585355836e-05, + "loss": 1.3678, + "mean_token_accuracy": 0.6560121526320776, + "num_tokens": 94648746.0, + "step": 567 + }, + { + "entropy": 1.8760800262292225, + "epoch": 0.062398725659828076, + "grad_norm": 1.1178594827651978, + "learning_rate": 1.9999943030871053e-05, + "loss": 1.3277, + "mean_token_accuracy": 0.661268358429273, + "num_tokens": 94747983.0, + "step": 568 + }, + { + "entropy": 1.86210831006368, + "epoch": 0.06250858257120101, + "grad_norm": 0.7388879656791687, + "learning_rate": 1.9999937191542128e-05, + "loss": 1.4365, + "mean_token_accuracy": 0.650276447335879, + "num_tokens": 94919214.0, + "step": 569 + }, + { + "entropy": 1.8915256162484486, + "epoch": 0.06261843948257395, + "grad_norm": 0.7970117926597595, + "learning_rate": 1.9999931067369246e-05, + "loss": 1.4995, + "mean_token_accuracy": 0.6465731561183929, + "num_tokens": 95084859.0, + "step": 570 + }, + { + "entropy": 1.7767747739950817, + "epoch": 0.06272829639394688, + "grad_norm": 0.9821522235870361, + "learning_rate": 1.99999246583526e-05, + "loss": 1.506, + "mean_token_accuracy": 0.6511118859052658, + "num_tokens": 95242471.0, + "step": 571 + }, + { + "entropy": 1.7983069618542988, + "epoch": 0.06283815330531982, + "grad_norm": 0.7964573502540588, + "learning_rate": 1.9999917964492393e-05, + "loss": 1.418, + "mean_token_accuracy": 0.6608897646268209, + "num_tokens": 95411496.0, + "step": 572 + }, + { + "entropy": 1.7514270345369976, + "epoch": 0.06294801021669276, + "grad_norm": 0.7527838349342346, + "learning_rate": 1.9999910985788842e-05, + "loss": 1.4319, + "mean_token_accuracy": 0.6370367358128229, + "num_tokens": 95612011.0, + "step": 573 + }, + { + "entropy": 1.8501805861790974, + "epoch": 0.0630578671280657, + "grad_norm": 0.7433388233184814, + "learning_rate": 1.999990372224216e-05, + "loss": 1.4124, + "mean_token_accuracy": 0.6372717867294947, + "num_tokens": 95775330.0, + "step": 574 + }, + { + "entropy": 1.8343484302361805, + "epoch": 0.06316772403943863, + "grad_norm": 0.8306664824485779, + "learning_rate": 1.9999896173852585e-05, + "loss": 1.4024, + "mean_token_accuracy": 0.6379790206750234, + "num_tokens": 95954358.0, + "step": 575 + }, + { + "entropy": 1.8649726808071136, + "epoch": 0.06327758095081157, + "grad_norm": 0.7519362568855286, + "learning_rate": 1.999988834062035e-05, + "loss": 1.4086, + "mean_token_accuracy": 0.6535822004079819, + "num_tokens": 96118913.0, + "step": 576 + }, + { + "entropy": 1.866872598727544, + "epoch": 0.06338743786218451, + "grad_norm": 1.0160154104232788, + "learning_rate": 1.9999880222545703e-05, + "loss": 1.4077, + "mean_token_accuracy": 0.6465723812580109, + "num_tokens": 96233662.0, + "step": 577 + }, + { + "entropy": 1.8999827206134796, + "epoch": 0.06349729477355744, + "grad_norm": 0.7083912491798401, + "learning_rate": 1.99998718196289e-05, + "loss": 1.5182, + "mean_token_accuracy": 0.6312810579935709, + "num_tokens": 96372780.0, + "step": 578 + }, + { + "entropy": 1.8947786291440327, + "epoch": 0.06360715168493038, + "grad_norm": 0.771692156791687, + "learning_rate": 1.9999863131870213e-05, + "loss": 1.4229, + "mean_token_accuracy": 0.6529962420463562, + "num_tokens": 96532545.0, + "step": 579 + }, + { + "entropy": 1.849695046742757, + "epoch": 0.06371700859630332, + "grad_norm": 0.7248260378837585, + "learning_rate": 1.9999854159269915e-05, + "loss": 1.4054, + "mean_token_accuracy": 0.6576380530993143, + "num_tokens": 96708045.0, + "step": 580 + }, + { + "entropy": 1.757933537165324, + "epoch": 0.06382686550767624, + "grad_norm": 0.7588098645210266, + "learning_rate": 1.9999844901828286e-05, + "loss": 1.4921, + "mean_token_accuracy": 0.6364376048247019, + "num_tokens": 96907139.0, + "step": 581 + }, + { + "entropy": 1.796962042649587, + "epoch": 0.06393672241904919, + "grad_norm": 0.7149285674095154, + "learning_rate": 1.9999835359545622e-05, + "loss": 1.4685, + "mean_token_accuracy": 0.649954711397489, + "num_tokens": 97080342.0, + "step": 582 + }, + { + "entropy": 1.8116531372070312, + "epoch": 0.06404657933042213, + "grad_norm": 0.803112268447876, + "learning_rate": 1.999982553242222e-05, + "loss": 1.5352, + "mean_token_accuracy": 0.6461968272924423, + "num_tokens": 97248479.0, + "step": 583 + }, + { + "entropy": 1.8498141765594482, + "epoch": 0.06415643624179507, + "grad_norm": 1.0572980642318726, + "learning_rate": 1.99998154204584e-05, + "loss": 1.3259, + "mean_token_accuracy": 0.6570860395828882, + "num_tokens": 97389667.0, + "step": 584 + }, + { + "entropy": 1.8106311957041423, + "epoch": 0.064266293153168, + "grad_norm": 0.7023609280586243, + "learning_rate": 1.9999805023654474e-05, + "loss": 1.381, + "mean_token_accuracy": 0.6562709013621012, + "num_tokens": 97544195.0, + "step": 585 + }, + { + "entropy": 1.8761279384295146, + "epoch": 0.06437615006454094, + "grad_norm": 0.6949073076248169, + "learning_rate": 1.9999794342010777e-05, + "loss": 1.4237, + "mean_token_accuracy": 0.6387066642443339, + "num_tokens": 97697668.0, + "step": 586 + }, + { + "entropy": 1.7704632878303528, + "epoch": 0.06448600697591388, + "grad_norm": 0.6187701225280762, + "learning_rate": 1.9999783375527647e-05, + "loss": 1.4075, + "mean_token_accuracy": 0.6577561795711517, + "num_tokens": 97903522.0, + "step": 587 + }, + { + "entropy": 1.8219424188137054, + "epoch": 0.0645958638872868, + "grad_norm": 0.8520556688308716, + "learning_rate": 1.9999772124205423e-05, + "loss": 1.3681, + "mean_token_accuracy": 0.6523155321677526, + "num_tokens": 98057082.0, + "step": 588 + }, + { + "entropy": 1.8426192998886108, + "epoch": 0.06470572079865974, + "grad_norm": 0.8771872520446777, + "learning_rate": 1.999976058804447e-05, + "loss": 1.5718, + "mean_token_accuracy": 0.6308561414480209, + "num_tokens": 98242770.0, + "step": 589 + }, + { + "entropy": 1.8485606014728546, + "epoch": 0.06481557771003268, + "grad_norm": 0.7140512466430664, + "learning_rate": 1.9999748767045148e-05, + "loss": 1.4753, + "mean_token_accuracy": 0.6466710418462753, + "num_tokens": 98406161.0, + "step": 590 + }, + { + "entropy": 1.8343448042869568, + "epoch": 0.06492543462140563, + "grad_norm": 0.7732890248298645, + "learning_rate": 1.9999736661207833e-05, + "loss": 1.3634, + "mean_token_accuracy": 0.6507249772548676, + "num_tokens": 98576357.0, + "step": 591 + }, + { + "entropy": 1.8259783387184143, + "epoch": 0.06503529153277855, + "grad_norm": 0.8179787993431091, + "learning_rate": 1.999972427053291e-05, + "loss": 1.4097, + "mean_token_accuracy": 0.6523296386003494, + "num_tokens": 98705683.0, + "step": 592 + }, + { + "entropy": 1.8165560364723206, + "epoch": 0.0651451484441515, + "grad_norm": 0.945043683052063, + "learning_rate": 1.999971159502077e-05, + "loss": 1.3726, + "mean_token_accuracy": 0.6657868524392446, + "num_tokens": 98842481.0, + "step": 593 + }, + { + "entropy": 1.8081829249858856, + "epoch": 0.06525500535552443, + "grad_norm": 0.7620939612388611, + "learning_rate": 1.9999698634671808e-05, + "loss": 1.5107, + "mean_token_accuracy": 0.6559838702281316, + "num_tokens": 99017663.0, + "step": 594 + }, + { + "entropy": 1.808142175277074, + "epoch": 0.06536486226689736, + "grad_norm": 0.682893693447113, + "learning_rate": 1.9999685389486444e-05, + "loss": 1.4069, + "mean_token_accuracy": 0.6539217978715897, + "num_tokens": 99199192.0, + "step": 595 + }, + { + "entropy": 1.8011847337086995, + "epoch": 0.0654747191782703, + "grad_norm": 0.7496669888496399, + "learning_rate": 1.9999671859465092e-05, + "loss": 1.4311, + "mean_token_accuracy": 0.6457031667232513, + "num_tokens": 99373748.0, + "step": 596 + }, + { + "entropy": 1.7991874118645985, + "epoch": 0.06558457608964324, + "grad_norm": 0.624677836894989, + "learning_rate": 1.999965804460818e-05, + "loss": 1.4802, + "mean_token_accuracy": 0.6430507103602091, + "num_tokens": 99603545.0, + "step": 597 + }, + { + "entropy": 1.8343899448712666, + "epoch": 0.06569443300101617, + "grad_norm": 0.9112517237663269, + "learning_rate": 1.999964394491615e-05, + "loss": 1.5462, + "mean_token_accuracy": 0.6385933210452398, + "num_tokens": 99744710.0, + "step": 598 + }, + { + "entropy": 1.7479709486166637, + "epoch": 0.06580428991238911, + "grad_norm": 0.8736753463745117, + "learning_rate": 1.999962956038944e-05, + "loss": 1.3665, + "mean_token_accuracy": 0.6576890349388123, + "num_tokens": 99941601.0, + "step": 599 + }, + { + "entropy": 1.7895345389842987, + "epoch": 0.06591414682376205, + "grad_norm": 1.08870267868042, + "learning_rate": 1.999961489102851e-05, + "loss": 1.3301, + "mean_token_accuracy": 0.6591797322034836, + "num_tokens": 100054777.0, + "step": 600 + }, + { + "entropy": 1.755403737227122, + "epoch": 0.06602400373513499, + "grad_norm": 0.7063686847686768, + "learning_rate": 1.9999599936833827e-05, + "loss": 1.3455, + "mean_token_accuracy": 0.6583442091941833, + "num_tokens": 100205128.0, + "step": 601 + }, + { + "entropy": 1.8366802831490834, + "epoch": 0.06613386064650792, + "grad_norm": 0.7118000388145447, + "learning_rate": 1.9999584697805858e-05, + "loss": 1.4197, + "mean_token_accuracy": 0.648671492934227, + "num_tokens": 100395366.0, + "step": 602 + }, + { + "entropy": 1.802819162607193, + "epoch": 0.06624371755788086, + "grad_norm": 0.8801571726799011, + "learning_rate": 1.999956917394509e-05, + "loss": 1.5157, + "mean_token_accuracy": 0.6466168984770775, + "num_tokens": 100539414.0, + "step": 603 + }, + { + "entropy": 1.8523845076560974, + "epoch": 0.0663535744692538, + "grad_norm": 0.715670645236969, + "learning_rate": 1.9999553365252014e-05, + "loss": 1.4567, + "mean_token_accuracy": 0.6485249350468317, + "num_tokens": 100703282.0, + "step": 604 + }, + { + "entropy": 1.8390779892603557, + "epoch": 0.06646343138062673, + "grad_norm": 0.8692449331283569, + "learning_rate": 1.9999537271727128e-05, + "loss": 1.3827, + "mean_token_accuracy": 0.6466074089209238, + "num_tokens": 100889824.0, + "step": 605 + }, + { + "entropy": 1.7545512715975444, + "epoch": 0.06657328829199967, + "grad_norm": 0.7428275942802429, + "learning_rate": 1.9999520893370944e-05, + "loss": 1.3813, + "mean_token_accuracy": 0.6578773707151413, + "num_tokens": 101029032.0, + "step": 606 + }, + { + "entropy": 1.7871941129366558, + "epoch": 0.06668314520337261, + "grad_norm": 0.6753717064857483, + "learning_rate": 1.9999504230183976e-05, + "loss": 1.4339, + "mean_token_accuracy": 0.6428654193878174, + "num_tokens": 101216884.0, + "step": 607 + }, + { + "entropy": 1.811436951160431, + "epoch": 0.06679300211474555, + "grad_norm": 0.7294607162475586, + "learning_rate": 1.9999487282166758e-05, + "loss": 1.5457, + "mean_token_accuracy": 0.628396287560463, + "num_tokens": 101407501.0, + "step": 608 + }, + { + "entropy": 1.901543140411377, + "epoch": 0.06690285902611848, + "grad_norm": 0.8634178638458252, + "learning_rate": 1.9999470049319823e-05, + "loss": 1.4976, + "mean_token_accuracy": 0.6367218047380447, + "num_tokens": 101567263.0, + "step": 609 + }, + { + "entropy": 1.7725516855716705, + "epoch": 0.06701271593749142, + "grad_norm": 0.6490088701248169, + "learning_rate": 1.999945253164371e-05, + "loss": 1.4208, + "mean_token_accuracy": 0.6521205753087997, + "num_tokens": 101736088.0, + "step": 610 + }, + { + "entropy": 1.8182121813297272, + "epoch": 0.06712257284886436, + "grad_norm": 0.7487345933914185, + "learning_rate": 1.999943472913899e-05, + "loss": 1.5168, + "mean_token_accuracy": 0.6501521567503611, + "num_tokens": 101882039.0, + "step": 611 + }, + { + "entropy": 1.7695377667744954, + "epoch": 0.06723242976023729, + "grad_norm": 0.6472978591918945, + "learning_rate": 1.9999416641806206e-05, + "loss": 1.4747, + "mean_token_accuracy": 0.6211903840303421, + "num_tokens": 102114402.0, + "step": 612 + }, + { + "entropy": 1.827112078666687, + "epoch": 0.06734228667161023, + "grad_norm": 1.119243860244751, + "learning_rate": 1.9999398269645947e-05, + "loss": 1.4288, + "mean_token_accuracy": 0.6412825981775919, + "num_tokens": 102270509.0, + "step": 613 + }, + { + "entropy": 1.726913293202718, + "epoch": 0.06745214358298317, + "grad_norm": 0.7255959510803223, + "learning_rate": 1.9999379612658785e-05, + "loss": 1.3827, + "mean_token_accuracy": 0.6515529453754425, + "num_tokens": 102435141.0, + "step": 614 + }, + { + "entropy": 1.8226731022198994, + "epoch": 0.0675620004943561, + "grad_norm": 0.8500895500183105, + "learning_rate": 1.9999360670845314e-05, + "loss": 1.5447, + "mean_token_accuracy": 0.6447988549868265, + "num_tokens": 102575866.0, + "step": 615 + }, + { + "entropy": 1.8276772399743397, + "epoch": 0.06767185740572904, + "grad_norm": 1.0635449886322021, + "learning_rate": 1.9999341444206133e-05, + "loss": 1.321, + "mean_token_accuracy": 0.6598222802082697, + "num_tokens": 102710803.0, + "step": 616 + }, + { + "entropy": 1.7802319824695587, + "epoch": 0.06778171431710198, + "grad_norm": 0.7771666646003723, + "learning_rate": 1.999932193274185e-05, + "loss": 1.3659, + "mean_token_accuracy": 0.6599055776993433, + "num_tokens": 102828396.0, + "step": 617 + }, + { + "entropy": 1.7876626749833424, + "epoch": 0.06789157122847492, + "grad_norm": 4.962174892425537, + "learning_rate": 1.9999302136453083e-05, + "loss": 1.4445, + "mean_token_accuracy": 0.6381362775961558, + "num_tokens": 103047628.0, + "step": 618 + }, + { + "entropy": 1.8322969575723012, + "epoch": 0.06800142813984784, + "grad_norm": 0.9945211410522461, + "learning_rate": 1.999928205534046e-05, + "loss": 1.4898, + "mean_token_accuracy": 0.6348124096790949, + "num_tokens": 103266857.0, + "step": 619 + }, + { + "entropy": 1.7776095767815907, + "epoch": 0.06811128505122079, + "grad_norm": 0.9018236994743347, + "learning_rate": 1.9999261689404615e-05, + "loss": 1.3107, + "mean_token_accuracy": 0.6572676748037338, + "num_tokens": 103398868.0, + "step": 620 + }, + { + "entropy": 1.8384911715984344, + "epoch": 0.06822114196259373, + "grad_norm": 0.7996423244476318, + "learning_rate": 1.9999241038646192e-05, + "loss": 1.5596, + "mean_token_accuracy": 0.6426488012075424, + "num_tokens": 103562795.0, + "step": 621 + }, + { + "entropy": 1.790726323922475, + "epoch": 0.06833099887396665, + "grad_norm": 0.7970197796821594, + "learning_rate": 1.9999220103065845e-05, + "loss": 1.4247, + "mean_token_accuracy": 0.6529985020558039, + "num_tokens": 103716669.0, + "step": 622 + }, + { + "entropy": 1.8390385210514069, + "epoch": 0.0684408557853396, + "grad_norm": 0.7664891481399536, + "learning_rate": 1.9999198882664236e-05, + "loss": 1.4072, + "mean_token_accuracy": 0.6501214305559794, + "num_tokens": 103860168.0, + "step": 623 + }, + { + "entropy": 1.8309910794099171, + "epoch": 0.06855071269671253, + "grad_norm": 0.7682109475135803, + "learning_rate": 1.9999177377442042e-05, + "loss": 1.4699, + "mean_token_accuracy": 0.6459651440382004, + "num_tokens": 104010898.0, + "step": 624 + }, + { + "entropy": 1.8477097650369008, + "epoch": 0.06866056960808548, + "grad_norm": 0.9405747056007385, + "learning_rate": 1.9999155587399934e-05, + "loss": 1.3493, + "mean_token_accuracy": 0.6499254653851191, + "num_tokens": 104123289.0, + "step": 625 + }, + { + "entropy": 1.8265024423599243, + "epoch": 0.0687704265194584, + "grad_norm": 0.8012916445732117, + "learning_rate": 1.999913351253861e-05, + "loss": 1.3671, + "mean_token_accuracy": 0.6617961873610815, + "num_tokens": 104287167.0, + "step": 626 + }, + { + "entropy": 1.8607315520445507, + "epoch": 0.06888028343083134, + "grad_norm": 0.7399889230728149, + "learning_rate": 1.999911115285876e-05, + "loss": 1.5387, + "mean_token_accuracy": 0.6346966524918874, + "num_tokens": 104482574.0, + "step": 627 + }, + { + "entropy": 1.8020449976126354, + "epoch": 0.06899014034220428, + "grad_norm": 0.6279281377792358, + "learning_rate": 1.9999088508361104e-05, + "loss": 1.4417, + "mean_token_accuracy": 0.6576824982961019, + "num_tokens": 104674006.0, + "step": 628 + }, + { + "entropy": 1.7799376746018727, + "epoch": 0.06909999725357721, + "grad_norm": 0.705096960067749, + "learning_rate": 1.999906557904635e-05, + "loss": 1.5049, + "mean_token_accuracy": 0.6500988801320394, + "num_tokens": 104865258.0, + "step": 629 + }, + { + "entropy": 1.8464046518007915, + "epoch": 0.06920985416495015, + "grad_norm": 0.7601498365402222, + "learning_rate": 1.9999042364915222e-05, + "loss": 1.372, + "mean_token_accuracy": 0.6506158063809077, + "num_tokens": 104993966.0, + "step": 630 + }, + { + "entropy": 1.8217159807682037, + "epoch": 0.06931971107632309, + "grad_norm": 0.6956254243850708, + "learning_rate": 1.9999018865968462e-05, + "loss": 1.363, + "mean_token_accuracy": 0.656999429066976, + "num_tokens": 105120668.0, + "step": 631 + }, + { + "entropy": 1.880683700243632, + "epoch": 0.06942956798769602, + "grad_norm": 0.792460560798645, + "learning_rate": 1.999899508220681e-05, + "loss": 1.4723, + "mean_token_accuracy": 0.6231094797452291, + "num_tokens": 105278220.0, + "step": 632 + }, + { + "entropy": 1.9088138242562611, + "epoch": 0.06953942489906896, + "grad_norm": 0.7916814088821411, + "learning_rate": 1.9998971013631017e-05, + "loss": 1.5356, + "mean_token_accuracy": 0.6298377265532812, + "num_tokens": 105413384.0, + "step": 633 + }, + { + "entropy": 1.7535264392693837, + "epoch": 0.0696492818104419, + "grad_norm": 1.0666766166687012, + "learning_rate": 1.9998946660241845e-05, + "loss": 1.3734, + "mean_token_accuracy": 0.6519220570723215, + "num_tokens": 105636692.0, + "step": 634 + }, + { + "entropy": 1.762801597515742, + "epoch": 0.06975913872181484, + "grad_norm": 0.6462137699127197, + "learning_rate": 1.9998922022040068e-05, + "loss": 1.3319, + "mean_token_accuracy": 0.6682475755612055, + "num_tokens": 105776185.0, + "step": 635 + }, + { + "entropy": 1.802270730336507, + "epoch": 0.06986899563318777, + "grad_norm": 0.6272945404052734, + "learning_rate": 1.9998897099026464e-05, + "loss": 1.5092, + "mean_token_accuracy": 0.6357052127520243, + "num_tokens": 105943190.0, + "step": 636 + }, + { + "entropy": 1.7483725647131603, + "epoch": 0.06997885254456071, + "grad_norm": 0.6791301369667053, + "learning_rate": 1.9998871891201822e-05, + "loss": 1.3783, + "mean_token_accuracy": 0.6612305889527003, + "num_tokens": 106072514.0, + "step": 637 + }, + { + "entropy": 1.7557042439778645, + "epoch": 0.07008870945593365, + "grad_norm": 0.6274598836898804, + "learning_rate": 1.9998846398566937e-05, + "loss": 1.4067, + "mean_token_accuracy": 0.636664499839147, + "num_tokens": 106315956.0, + "step": 638 + }, + { + "entropy": 1.851299395163854, + "epoch": 0.07019856636730658, + "grad_norm": 0.8829707503318787, + "learning_rate": 1.9998820621122623e-05, + "loss": 1.5645, + "mean_token_accuracy": 0.6381447166204453, + "num_tokens": 106515312.0, + "step": 639 + }, + { + "entropy": 1.7740463018417358, + "epoch": 0.07030842327867952, + "grad_norm": 0.6600044369697571, + "learning_rate": 1.999879455886969e-05, + "loss": 1.4202, + "mean_token_accuracy": 0.6525014142195383, + "num_tokens": 106734565.0, + "step": 640 + }, + { + "entropy": 1.808898796637853, + "epoch": 0.07041828019005246, + "grad_norm": 0.7501150965690613, + "learning_rate": 1.9998768211808962e-05, + "loss": 1.5897, + "mean_token_accuracy": 0.6105268895626068, + "num_tokens": 106986985.0, + "step": 641 + }, + { + "entropy": 1.7719741264979045, + "epoch": 0.07052813710142539, + "grad_norm": 0.9000745415687561, + "learning_rate": 1.9998741579941278e-05, + "loss": 1.4318, + "mean_token_accuracy": 0.6551828881104788, + "num_tokens": 107135180.0, + "step": 642 + }, + { + "entropy": 1.8894204199314117, + "epoch": 0.07063799401279833, + "grad_norm": 0.8245925307273865, + "learning_rate": 1.9998714663267476e-05, + "loss": 1.461, + "mean_token_accuracy": 0.6333466370900472, + "num_tokens": 107343264.0, + "step": 643 + }, + { + "entropy": 1.8361808558305104, + "epoch": 0.07074785092417127, + "grad_norm": 0.8316481709480286, + "learning_rate": 1.999868746178841e-05, + "loss": 1.3824, + "mean_token_accuracy": 0.657560924688975, + "num_tokens": 107472183.0, + "step": 644 + }, + { + "entropy": 1.7973891297976177, + "epoch": 0.07085770783554421, + "grad_norm": 0.6126974821090698, + "learning_rate": 1.999865997550494e-05, + "loss": 1.5521, + "mean_token_accuracy": 0.6135254551966985, + "num_tokens": 107694331.0, + "step": 645 + }, + { + "entropy": 1.8026204109191895, + "epoch": 0.07096756474691714, + "grad_norm": 0.6637122631072998, + "learning_rate": 1.9998632204417937e-05, + "loss": 1.3808, + "mean_token_accuracy": 0.6509968092044195, + "num_tokens": 107878484.0, + "step": 646 + }, + { + "entropy": 1.7910412450631459, + "epoch": 0.07107742165829008, + "grad_norm": 0.6697704195976257, + "learning_rate": 1.9998604148528284e-05, + "loss": 1.658, + "mean_token_accuracy": 0.6217399090528488, + "num_tokens": 108061157.0, + "step": 647 + }, + { + "entropy": 1.7230544984340668, + "epoch": 0.07118727856966302, + "grad_norm": 0.6764496564865112, + "learning_rate": 1.999857580783686e-05, + "loss": 1.3284, + "mean_token_accuracy": 0.6676372985045115, + "num_tokens": 108202427.0, + "step": 648 + }, + { + "entropy": 1.7782044510046642, + "epoch": 0.07129713548103594, + "grad_norm": 0.7073798179626465, + "learning_rate": 1.9998547182344564e-05, + "loss": 1.4346, + "mean_token_accuracy": 0.6538479775190353, + "num_tokens": 108361157.0, + "step": 649 + }, + { + "entropy": 1.7819972435633342, + "epoch": 0.07140699239240889, + "grad_norm": 0.6922680735588074, + "learning_rate": 1.999851827205231e-05, + "loss": 1.3826, + "mean_token_accuracy": 0.6540144383907318, + "num_tokens": 108528441.0, + "step": 650 + }, + { + "entropy": 1.7943304777145386, + "epoch": 0.07151684930378183, + "grad_norm": 0.6376049518585205, + "learning_rate": 1.9998489076961005e-05, + "loss": 1.4412, + "mean_token_accuracy": 0.636512354016304, + "num_tokens": 108738071.0, + "step": 651 + }, + { + "entropy": 1.85460830728213, + "epoch": 0.07162670621515477, + "grad_norm": 0.8236610889434814, + "learning_rate": 1.999845959707158e-05, + "loss": 1.4868, + "mean_token_accuracy": 0.6557626873254776, + "num_tokens": 108885666.0, + "step": 652 + }, + { + "entropy": 1.7542377909024556, + "epoch": 0.0717365631265277, + "grad_norm": 0.7607982754707336, + "learning_rate": 1.9998429832384953e-05, + "loss": 1.3098, + "mean_token_accuracy": 0.6667990038792292, + "num_tokens": 109009005.0, + "step": 653 + }, + { + "entropy": 1.7943703830242157, + "epoch": 0.07184642003790064, + "grad_norm": 0.6870585680007935, + "learning_rate": 1.9998399782902083e-05, + "loss": 1.4376, + "mean_token_accuracy": 0.6404406229654948, + "num_tokens": 109180672.0, + "step": 654 + }, + { + "entropy": 1.81678972641627, + "epoch": 0.07195627694927358, + "grad_norm": 0.9632508158683777, + "learning_rate": 1.9998369448623916e-05, + "loss": 1.2715, + "mean_token_accuracy": 0.6628138174613317, + "num_tokens": 109287099.0, + "step": 655 + }, + { + "entropy": 1.7701709270477295, + "epoch": 0.0720661338606465, + "grad_norm": 0.941100001335144, + "learning_rate": 1.999833882955141e-05, + "loss": 1.367, + "mean_token_accuracy": 0.648131713271141, + "num_tokens": 109474542.0, + "step": 656 + }, + { + "entropy": 1.821062793334325, + "epoch": 0.07217599077201944, + "grad_norm": 0.6249318718910217, + "learning_rate": 1.9998307925685534e-05, + "loss": 1.3544, + "mean_token_accuracy": 0.6468546291192373, + "num_tokens": 109606382.0, + "step": 657 + }, + { + "entropy": 1.8237617115179698, + "epoch": 0.07228584768339238, + "grad_norm": 0.6935443878173828, + "learning_rate": 1.9998276737027266e-05, + "loss": 1.4338, + "mean_token_accuracy": 0.6507659604152044, + "num_tokens": 109770865.0, + "step": 658 + }, + { + "entropy": 1.8277263343334198, + "epoch": 0.07239570459476531, + "grad_norm": 0.6764071583747864, + "learning_rate": 1.9998245263577596e-05, + "loss": 1.5816, + "mean_token_accuracy": 0.6434828341007233, + "num_tokens": 109976506.0, + "step": 659 + }, + { + "entropy": 1.8320674896240234, + "epoch": 0.07250556150613825, + "grad_norm": 0.7720675468444824, + "learning_rate": 1.999821350533752e-05, + "loss": 1.64, + "mean_token_accuracy": 0.6294133017460505, + "num_tokens": 110132121.0, + "step": 660 + }, + { + "entropy": 1.7681880195935566, + "epoch": 0.0726154184175112, + "grad_norm": 0.6684180498123169, + "learning_rate": 1.9998181462308037e-05, + "loss": 1.4463, + "mean_token_accuracy": 0.6456720034281412, + "num_tokens": 110322861.0, + "step": 661 + }, + { + "entropy": 1.8428928156693776, + "epoch": 0.07272527532888413, + "grad_norm": 0.7397868633270264, + "learning_rate": 1.9998149134490165e-05, + "loss": 1.3386, + "mean_token_accuracy": 0.6623791555563608, + "num_tokens": 110483291.0, + "step": 662 + }, + { + "entropy": 1.7374838987986247, + "epoch": 0.07283513224025706, + "grad_norm": 0.6700147390365601, + "learning_rate": 1.999811652188493e-05, + "loss": 1.5732, + "mean_token_accuracy": 0.6497671554485956, + "num_tokens": 110678771.0, + "step": 663 + }, + { + "entropy": 1.8194133241971333, + "epoch": 0.07294498915163, + "grad_norm": 0.6404582858085632, + "learning_rate": 1.999808362449336e-05, + "loss": 1.623, + "mean_token_accuracy": 0.6257813026507696, + "num_tokens": 110835416.0, + "step": 664 + }, + { + "entropy": 1.7887710829575856, + "epoch": 0.07305484606300294, + "grad_norm": 1.0074845552444458, + "learning_rate": 1.9998050442316503e-05, + "loss": 1.4084, + "mean_token_accuracy": 0.651343877116839, + "num_tokens": 111000969.0, + "step": 665 + }, + { + "entropy": 1.764528065919876, + "epoch": 0.07316470297437587, + "grad_norm": 0.9937914609909058, + "learning_rate": 1.9998016975355397e-05, + "loss": 1.2696, + "mean_token_accuracy": 0.6718964874744415, + "num_tokens": 111115363.0, + "step": 666 + }, + { + "entropy": 1.8262263238430023, + "epoch": 0.07327455988574881, + "grad_norm": 0.8130075335502625, + "learning_rate": 1.9997983223611112e-05, + "loss": 1.4068, + "mean_token_accuracy": 0.6590779721736908, + "num_tokens": 111280328.0, + "step": 667 + }, + { + "entropy": 1.830139935016632, + "epoch": 0.07338441679712175, + "grad_norm": 0.7358323931694031, + "learning_rate": 1.999794918708471e-05, + "loss": 1.4667, + "mean_token_accuracy": 0.6425051589806875, + "num_tokens": 111466818.0, + "step": 668 + }, + { + "entropy": 1.8468297918637593, + "epoch": 0.07349427370849469, + "grad_norm": 0.8599507808685303, + "learning_rate": 1.9997914865777273e-05, + "loss": 1.3408, + "mean_token_accuracy": 0.6542004893223444, + "num_tokens": 111602406.0, + "step": 669 + }, + { + "entropy": 1.7656611204147339, + "epoch": 0.07360413061986762, + "grad_norm": 0.7185338735580444, + "learning_rate": 1.9997880259689886e-05, + "loss": 1.3936, + "mean_token_accuracy": 0.6685215532779694, + "num_tokens": 111751172.0, + "step": 670 + }, + { + "entropy": 1.7852209508419037, + "epoch": 0.07371398753124056, + "grad_norm": 0.7217685580253601, + "learning_rate": 1.999784536882364e-05, + "loss": 1.5029, + "mean_token_accuracy": 0.6421723713477453, + "num_tokens": 111929666.0, + "step": 671 + }, + { + "entropy": 1.8410307069619496, + "epoch": 0.0738238444426135, + "grad_norm": 0.6868441104888916, + "learning_rate": 1.9997810193179647e-05, + "loss": 1.4308, + "mean_token_accuracy": 0.6462933719158173, + "num_tokens": 112079312.0, + "step": 672 + }, + { + "entropy": 1.7862571875254314, + "epoch": 0.07393370135398643, + "grad_norm": 0.7277688980102539, + "learning_rate": 1.9997774732759013e-05, + "loss": 1.4031, + "mean_token_accuracy": 0.647498811284701, + "num_tokens": 112248478.0, + "step": 673 + }, + { + "entropy": 1.8328973750273387, + "epoch": 0.07404355826535937, + "grad_norm": 0.8798831105232239, + "learning_rate": 1.9997738987562866e-05, + "loss": 1.5971, + "mean_token_accuracy": 0.6418457180261612, + "num_tokens": 112434647.0, + "step": 674 + }, + { + "entropy": 1.821668843428294, + "epoch": 0.07415341517673231, + "grad_norm": 0.749891459941864, + "learning_rate": 1.999770295759233e-05, + "loss": 1.3465, + "mean_token_accuracy": 0.654508168498675, + "num_tokens": 112564952.0, + "step": 675 + }, + { + "entropy": 1.7971191604932149, + "epoch": 0.07426327208810524, + "grad_norm": 0.8182111978530884, + "learning_rate": 1.9997666642848554e-05, + "loss": 1.5722, + "mean_token_accuracy": 0.6354757895072302, + "num_tokens": 112725586.0, + "step": 676 + }, + { + "entropy": 1.8406287133693695, + "epoch": 0.07437312899947818, + "grad_norm": 0.8844257593154907, + "learning_rate": 1.999763004333268e-05, + "loss": 1.2809, + "mean_token_accuracy": 0.6689592599868774, + "num_tokens": 112853047.0, + "step": 677 + }, + { + "entropy": 1.8012821773688, + "epoch": 0.07448298591085112, + "grad_norm": 0.5953492522239685, + "learning_rate": 1.9997593159045873e-05, + "loss": 1.5063, + "mean_token_accuracy": 0.6295547236998876, + "num_tokens": 113080741.0, + "step": 678 + }, + { + "entropy": 1.8217666645844777, + "epoch": 0.07459284282222406, + "grad_norm": 220.8173370361328, + "learning_rate": 1.9997555989989293e-05, + "loss": 1.573, + "mean_token_accuracy": 0.6443419431646665, + "num_tokens": 113273229.0, + "step": 679 + }, + { + "entropy": 1.783298095067342, + "epoch": 0.07470269973359699, + "grad_norm": 0.668280303478241, + "learning_rate": 1.9997518536164123e-05, + "loss": 1.3078, + "mean_token_accuracy": 0.6795926441748937, + "num_tokens": 113430145.0, + "step": 680 + }, + { + "entropy": 1.7551721433798473, + "epoch": 0.07481255664496993, + "grad_norm": 0.7069210410118103, + "learning_rate": 1.9997480797571547e-05, + "loss": 1.3379, + "mean_token_accuracy": 0.6589397639036179, + "num_tokens": 113588815.0, + "step": 681 + }, + { + "entropy": 1.8191516598065693, + "epoch": 0.07492241355634287, + "grad_norm": 0.610933244228363, + "learning_rate": 1.9997442774212753e-05, + "loss": 1.4519, + "mean_token_accuracy": 0.6368223875761032, + "num_tokens": 113777021.0, + "step": 682 + }, + { + "entropy": 1.8061704138914745, + "epoch": 0.0750322704677158, + "grad_norm": 0.7445184588432312, + "learning_rate": 1.9997404466088953e-05, + "loss": 1.4369, + "mean_token_accuracy": 0.6480630288521448, + "num_tokens": 113933174.0, + "step": 683 + }, + { + "entropy": 1.7527543703715007, + "epoch": 0.07514212737908874, + "grad_norm": 0.6358804702758789, + "learning_rate": 1.9997365873201356e-05, + "loss": 1.4386, + "mean_token_accuracy": 0.6501162797212601, + "num_tokens": 114121410.0, + "step": 684 + }, + { + "entropy": 1.799436867237091, + "epoch": 0.07525198429046168, + "grad_norm": 0.8749188780784607, + "learning_rate": 1.9997326995551183e-05, + "loss": 1.5263, + "mean_token_accuracy": 0.6480574657519659, + "num_tokens": 114282873.0, + "step": 685 + }, + { + "entropy": 1.7453529338041942, + "epoch": 0.07536184120183462, + "grad_norm": 0.7320595979690552, + "learning_rate": 1.9997287833139666e-05, + "loss": 1.5296, + "mean_token_accuracy": 0.657763327161471, + "num_tokens": 114508923.0, + "step": 686 + }, + { + "entropy": 1.7952920198440552, + "epoch": 0.07547169811320754, + "grad_norm": 0.6980917453765869, + "learning_rate": 1.9997248385968042e-05, + "loss": 1.4428, + "mean_token_accuracy": 0.6694839894771576, + "num_tokens": 114681044.0, + "step": 687 + }, + { + "entropy": 1.7734041313330333, + "epoch": 0.07558155502458049, + "grad_norm": 0.6694082617759705, + "learning_rate": 1.999720865403756e-05, + "loss": 1.4143, + "mean_token_accuracy": 0.6455632597208023, + "num_tokens": 114862846.0, + "step": 688 + }, + { + "entropy": 1.8399201730887096, + "epoch": 0.07569141193595343, + "grad_norm": 0.7080643773078918, + "learning_rate": 1.999716863734948e-05, + "loss": 1.4556, + "mean_token_accuracy": 0.6443889985481898, + "num_tokens": 115039990.0, + "step": 689 + }, + { + "entropy": 1.7809200982252757, + "epoch": 0.07580126884732635, + "grad_norm": 0.6346096396446228, + "learning_rate": 1.9997128335905066e-05, + "loss": 1.4591, + "mean_token_accuracy": 0.6494338313738505, + "num_tokens": 115206329.0, + "step": 690 + }, + { + "entropy": 1.8355284929275513, + "epoch": 0.0759111257586993, + "grad_norm": 0.7674968242645264, + "learning_rate": 1.9997087749705595e-05, + "loss": 1.4145, + "mean_token_accuracy": 0.6622882982095083, + "num_tokens": 115352467.0, + "step": 691 + }, + { + "entropy": 1.7761068443457286, + "epoch": 0.07602098267007223, + "grad_norm": 0.6778233647346497, + "learning_rate": 1.999704687875235e-05, + "loss": 1.4126, + "mean_token_accuracy": 0.658729096253713, + "num_tokens": 115559309.0, + "step": 692 + }, + { + "entropy": 1.772262881199519, + "epoch": 0.07613083958144516, + "grad_norm": 0.7021632194519043, + "learning_rate": 1.9997005723046628e-05, + "loss": 1.4743, + "mean_token_accuracy": 0.6380442132552465, + "num_tokens": 115747604.0, + "step": 693 + }, + { + "entropy": 1.7603289783000946, + "epoch": 0.0762406964928181, + "grad_norm": 0.7222464680671692, + "learning_rate": 1.9996964282589724e-05, + "loss": 1.4163, + "mean_token_accuracy": 0.6446563949187597, + "num_tokens": 115907827.0, + "step": 694 + }, + { + "entropy": 1.841255287329356, + "epoch": 0.07635055340419104, + "grad_norm": 0.6588021516799927, + "learning_rate": 1.999692255738296e-05, + "loss": 1.4226, + "mean_token_accuracy": 0.6450923730929693, + "num_tokens": 116087244.0, + "step": 695 + }, + { + "entropy": 1.7898233930269878, + "epoch": 0.07646041031556398, + "grad_norm": 0.5840282440185547, + "learning_rate": 1.999688054742765e-05, + "loss": 1.4013, + "mean_token_accuracy": 0.6547726740439733, + "num_tokens": 116257019.0, + "step": 696 + }, + { + "entropy": 1.7955358823140461, + "epoch": 0.07657026722693691, + "grad_norm": 0.729434609413147, + "learning_rate": 1.9996838252725123e-05, + "loss": 1.3782, + "mean_token_accuracy": 0.6612616926431656, + "num_tokens": 116414138.0, + "step": 697 + }, + { + "entropy": 1.8139538665612538, + "epoch": 0.07668012413830985, + "grad_norm": 0.6815258264541626, + "learning_rate": 1.999679567327672e-05, + "loss": 1.4255, + "mean_token_accuracy": 0.6536463449398676, + "num_tokens": 116605641.0, + "step": 698 + }, + { + "entropy": 1.8517298797766368, + "epoch": 0.07678998104968279, + "grad_norm": 0.7695915102958679, + "learning_rate": 1.9996752809083788e-05, + "loss": 1.4368, + "mean_token_accuracy": 0.6444319734970728, + "num_tokens": 116753233.0, + "step": 699 + }, + { + "entropy": 1.746784100929896, + "epoch": 0.07689983796105572, + "grad_norm": 0.6605033278465271, + "learning_rate": 1.9996709660147683e-05, + "loss": 1.4952, + "mean_token_accuracy": 0.6499835948149363, + "num_tokens": 116916082.0, + "step": 700 + }, + { + "entropy": 1.7918027838071187, + "epoch": 0.07700969487242866, + "grad_norm": 0.7656397819519043, + "learning_rate": 1.999666622646977e-05, + "loss": 1.6114, + "mean_token_accuracy": 0.6382357329130173, + "num_tokens": 117116194.0, + "step": 701 + }, + { + "entropy": 1.7874523599942524, + "epoch": 0.0771195517838016, + "grad_norm": 0.6298776268959045, + "learning_rate": 1.999662250805143e-05, + "loss": 1.3409, + "mean_token_accuracy": 0.6549411416053772, + "num_tokens": 117255116.0, + "step": 702 + }, + { + "entropy": 1.8126642107963562, + "epoch": 0.07722940869517453, + "grad_norm": 0.9679374694824219, + "learning_rate": 1.9996578504894037e-05, + "loss": 1.3772, + "mean_token_accuracy": 0.6570501724878947, + "num_tokens": 117399980.0, + "step": 703 + }, + { + "entropy": 1.8066406548023224, + "epoch": 0.07733926560654747, + "grad_norm": 0.6479452252388, + "learning_rate": 1.999653421699899e-05, + "loss": 1.6135, + "mean_token_accuracy": 0.6204476977388064, + "num_tokens": 117653352.0, + "step": 704 + }, + { + "entropy": 1.746168166399002, + "epoch": 0.07744912251792041, + "grad_norm": 0.8578298091888428, + "learning_rate": 1.9996489644367688e-05, + "loss": 1.4505, + "mean_token_accuracy": 0.6426874895890554, + "num_tokens": 117816334.0, + "step": 705 + }, + { + "entropy": 1.729866623878479, + "epoch": 0.07755897942929335, + "grad_norm": 0.766589343547821, + "learning_rate": 1.999644478700154e-05, + "loss": 1.3927, + "mean_token_accuracy": 0.6578174283107122, + "num_tokens": 118010653.0, + "step": 706 + }, + { + "entropy": 1.7975072264671326, + "epoch": 0.07766883634066628, + "grad_norm": 0.9121028780937195, + "learning_rate": 1.9996399644901976e-05, + "loss": 1.5192, + "mean_token_accuracy": 0.6352412700653076, + "num_tokens": 118195093.0, + "step": 707 + }, + { + "entropy": 1.8069658875465393, + "epoch": 0.07777869325203922, + "grad_norm": 0.6980465650558472, + "learning_rate": 1.999635421807041e-05, + "loss": 1.4175, + "mean_token_accuracy": 0.6498169104258219, + "num_tokens": 118338858.0, + "step": 708 + }, + { + "entropy": 1.8504317104816437, + "epoch": 0.07788855016341216, + "grad_norm": 0.9060118794441223, + "learning_rate": 1.999630850650829e-05, + "loss": 1.4035, + "mean_token_accuracy": 0.6471539338429769, + "num_tokens": 118444002.0, + "step": 709 + }, + { + "entropy": 1.829773207505544, + "epoch": 0.07799840707478509, + "grad_norm": 0.7004634141921997, + "learning_rate": 1.9996262510217058e-05, + "loss": 1.3789, + "mean_token_accuracy": 0.6589366098244985, + "num_tokens": 118603170.0, + "step": 710 + }, + { + "entropy": 1.7559443612893422, + "epoch": 0.07810826398615803, + "grad_norm": 0.6373770236968994, + "learning_rate": 1.9996216229198175e-05, + "loss": 1.2677, + "mean_token_accuracy": 0.680022269487381, + "num_tokens": 118757923.0, + "step": 711 + }, + { + "entropy": 1.805822531382243, + "epoch": 0.07821812089753097, + "grad_norm": 0.7866727709770203, + "learning_rate": 1.9996169663453096e-05, + "loss": 1.5077, + "mean_token_accuracy": 0.6585622876882553, + "num_tokens": 119017661.0, + "step": 712 + }, + { + "entropy": 1.7577200531959534, + "epoch": 0.07832797780890391, + "grad_norm": 0.7266113758087158, + "learning_rate": 1.9996122812983307e-05, + "loss": 1.49, + "mean_token_accuracy": 0.6384626974662145, + "num_tokens": 119187084.0, + "step": 713 + }, + { + "entropy": 1.8151898682117462, + "epoch": 0.07843783472027684, + "grad_norm": 0.8451136350631714, + "learning_rate": 1.9996075677790284e-05, + "loss": 1.3513, + "mean_token_accuracy": 0.6595414280891418, + "num_tokens": 119316009.0, + "step": 714 + }, + { + "entropy": 1.7623566885789235, + "epoch": 0.07854769163164978, + "grad_norm": 0.8786435127258301, + "learning_rate": 1.9996028257875518e-05, + "loss": 1.2819, + "mean_token_accuracy": 0.667223796248436, + "num_tokens": 119478080.0, + "step": 715 + }, + { + "entropy": 1.793608546257019, + "epoch": 0.07865754854302272, + "grad_norm": 0.7934389114379883, + "learning_rate": 1.999598055324051e-05, + "loss": 1.4843, + "mean_token_accuracy": 0.6354232827822367, + "num_tokens": 119646813.0, + "step": 716 + }, + { + "entropy": 1.7802114486694336, + "epoch": 0.07876740545439564, + "grad_norm": 0.701699435710907, + "learning_rate": 1.9995932563886774e-05, + "loss": 1.5294, + "mean_token_accuracy": 0.6498485853274664, + "num_tokens": 119826582.0, + "step": 717 + }, + { + "entropy": 1.7280071278413136, + "epoch": 0.07887726236576859, + "grad_norm": 0.7847176194190979, + "learning_rate": 1.9995884289815822e-05, + "loss": 1.2134, + "mean_token_accuracy": 0.6839973479509354, + "num_tokens": 119938664.0, + "step": 718 + }, + { + "entropy": 1.8675900995731354, + "epoch": 0.07898711927714153, + "grad_norm": 0.9124090671539307, + "learning_rate": 1.9995835731029188e-05, + "loss": 1.3989, + "mean_token_accuracy": 0.6580939839283625, + "num_tokens": 120071040.0, + "step": 719 + }, + { + "entropy": 1.7469736437002819, + "epoch": 0.07909697618851445, + "grad_norm": 0.6930084228515625, + "learning_rate": 1.999578688752841e-05, + "loss": 1.4251, + "mean_token_accuracy": 0.6595380107561747, + "num_tokens": 120280438.0, + "step": 720 + }, + { + "entropy": 1.8206437130769093, + "epoch": 0.0792068330998874, + "grad_norm": 0.7369340062141418, + "learning_rate": 1.9995737759315025e-05, + "loss": 1.5097, + "mean_token_accuracy": 0.6350439786911011, + "num_tokens": 120485289.0, + "step": 721 + }, + { + "entropy": 1.7503860990206401, + "epoch": 0.07931669001126034, + "grad_norm": 0.7530749440193176, + "learning_rate": 1.99956883463906e-05, + "loss": 1.4151, + "mean_token_accuracy": 0.6404246886571249, + "num_tokens": 120689052.0, + "step": 722 + }, + { + "entropy": 1.7276439766089122, + "epoch": 0.07942654692263328, + "grad_norm": 0.8877029418945312, + "learning_rate": 1.9995638648756686e-05, + "loss": 1.2943, + "mean_token_accuracy": 0.6667290230592092, + "num_tokens": 120798819.0, + "step": 723 + }, + { + "entropy": 1.7703753213087718, + "epoch": 0.0795364038340062, + "grad_norm": 0.7141546607017517, + "learning_rate": 1.9995588666414866e-05, + "loss": 1.4063, + "mean_token_accuracy": 0.6415145248174667, + "num_tokens": 120975866.0, + "step": 724 + }, + { + "entropy": 1.8610213895638783, + "epoch": 0.07964626074537914, + "grad_norm": 0.8251237869262695, + "learning_rate": 1.9995538399366716e-05, + "loss": 1.5718, + "mean_token_accuracy": 0.6517770936091741, + "num_tokens": 121107698.0, + "step": 725 + }, + { + "entropy": 1.7903367479642232, + "epoch": 0.07975611765675208, + "grad_norm": 0.7166335582733154, + "learning_rate": 1.9995487847613832e-05, + "loss": 1.5287, + "mean_token_accuracy": 0.633270596464475, + "num_tokens": 121295774.0, + "step": 726 + }, + { + "entropy": 1.7588022152582805, + "epoch": 0.07986597456812501, + "grad_norm": 0.723822832107544, + "learning_rate": 1.9995437011157805e-05, + "loss": 1.7178, + "mean_token_accuracy": 0.6282697518666586, + "num_tokens": 121554849.0, + "step": 727 + }, + { + "entropy": 1.8302726646264393, + "epoch": 0.07997583147949795, + "grad_norm": 0.7193813920021057, + "learning_rate": 1.9995385890000256e-05, + "loss": 1.4198, + "mean_token_accuracy": 0.6537833462158839, + "num_tokens": 121708057.0, + "step": 728 + }, + { + "entropy": 1.8833401600519817, + "epoch": 0.0800856883908709, + "grad_norm": 0.896878182888031, + "learning_rate": 1.9995334484142797e-05, + "loss": 1.4591, + "mean_token_accuracy": 0.6470515926678976, + "num_tokens": 121871319.0, + "step": 729 + }, + { + "entropy": 1.8520794709523518, + "epoch": 0.08019554530224383, + "grad_norm": 0.6501368284225464, + "learning_rate": 1.999528279358705e-05, + "loss": 1.5772, + "mean_token_accuracy": 0.6148606240749359, + "num_tokens": 122104472.0, + "step": 730 + }, + { + "entropy": 1.8304372231165569, + "epoch": 0.08030540221361676, + "grad_norm": 0.7300158143043518, + "learning_rate": 1.9995230818334665e-05, + "loss": 1.5162, + "mean_token_accuracy": 0.6417268216609955, + "num_tokens": 122318282.0, + "step": 731 + }, + { + "entropy": 1.8412455519040425, + "epoch": 0.0804152591249897, + "grad_norm": 0.7814407348632812, + "learning_rate": 1.9995178558387268e-05, + "loss": 1.5187, + "mean_token_accuracy": 0.6483007321755091, + "num_tokens": 122441137.0, + "step": 732 + }, + { + "entropy": 1.8073117434978485, + "epoch": 0.08052511603636264, + "grad_norm": 0.761740505695343, + "learning_rate": 1.9995126013746527e-05, + "loss": 1.436, + "mean_token_accuracy": 0.6581438233455023, + "num_tokens": 122574227.0, + "step": 733 + }, + { + "entropy": 1.7240748008092244, + "epoch": 0.08063497294773557, + "grad_norm": 0.6440936326980591, + "learning_rate": 1.9995073184414103e-05, + "loss": 1.31, + "mean_token_accuracy": 0.6681271195411682, + "num_tokens": 122704629.0, + "step": 734 + }, + { + "entropy": 1.8669129113356273, + "epoch": 0.08074482985910851, + "grad_norm": 0.8220887184143066, + "learning_rate": 1.9995020070391666e-05, + "loss": 1.5414, + "mean_token_accuracy": 0.645127202073733, + "num_tokens": 122870109.0, + "step": 735 + }, + { + "entropy": 1.767956554889679, + "epoch": 0.08085468677048145, + "grad_norm": 0.8655692338943481, + "learning_rate": 1.9994966671680892e-05, + "loss": 1.3607, + "mean_token_accuracy": 0.6563311517238617, + "num_tokens": 122991223.0, + "step": 736 + }, + { + "entropy": 1.8697227636973064, + "epoch": 0.08096454368185438, + "grad_norm": 0.8299003839492798, + "learning_rate": 1.999491298828348e-05, + "loss": 1.4838, + "mean_token_accuracy": 0.6484878609577814, + "num_tokens": 123119953.0, + "step": 737 + }, + { + "entropy": 1.8050644993782043, + "epoch": 0.08107440059322732, + "grad_norm": 0.9768658876419067, + "learning_rate": 1.9994859020201124e-05, + "loss": 1.301, + "mean_token_accuracy": 0.6755692362785339, + "num_tokens": 123261135.0, + "step": 738 + }, + { + "entropy": 1.7627781132857006, + "epoch": 0.08118425750460026, + "grad_norm": 0.846538245677948, + "learning_rate": 1.9994804767435535e-05, + "loss": 1.368, + "mean_token_accuracy": 0.6451980670293173, + "num_tokens": 123420991.0, + "step": 739 + }, + { + "entropy": 1.8099198838075001, + "epoch": 0.0812941144159732, + "grad_norm": 0.6865768432617188, + "learning_rate": 1.9994750229988426e-05, + "loss": 1.3777, + "mean_token_accuracy": 0.6596326579650243, + "num_tokens": 123562426.0, + "step": 740 + }, + { + "entropy": 1.7973152299722035, + "epoch": 0.08140397132734613, + "grad_norm": 0.6900340914726257, + "learning_rate": 1.9994695407861526e-05, + "loss": 1.4664, + "mean_token_accuracy": 0.6281344542900721, + "num_tokens": 123751162.0, + "step": 741 + }, + { + "entropy": 1.796221762895584, + "epoch": 0.08151382823871907, + "grad_norm": 0.661390483379364, + "learning_rate": 1.999464030105657e-05, + "loss": 1.6038, + "mean_token_accuracy": 0.6439404537280401, + "num_tokens": 123962173.0, + "step": 742 + }, + { + "entropy": 1.8211529751618702, + "epoch": 0.08162368515009201, + "grad_norm": 0.838100254535675, + "learning_rate": 1.99945849095753e-05, + "loss": 1.5889, + "mean_token_accuracy": 0.6407252550125122, + "num_tokens": 124092071.0, + "step": 743 + }, + { + "entropy": 1.7539305289586384, + "epoch": 0.08173354206146494, + "grad_norm": 0.7083438038825989, + "learning_rate": 1.999452923341947e-05, + "loss": 1.4526, + "mean_token_accuracy": 0.661969467997551, + "num_tokens": 124272313.0, + "step": 744 + }, + { + "entropy": 1.8283264338970184, + "epoch": 0.08184339897283788, + "grad_norm": 0.9990186095237732, + "learning_rate": 1.9994473272590848e-05, + "loss": 1.5733, + "mean_token_accuracy": 0.6551229556401571, + "num_tokens": 124408723.0, + "step": 745 + }, + { + "entropy": 1.7998952567577362, + "epoch": 0.08195325588421082, + "grad_norm": 0.9326064586639404, + "learning_rate": 1.9994417027091193e-05, + "loss": 1.3575, + "mean_token_accuracy": 0.6747980813185374, + "num_tokens": 124546937.0, + "step": 746 + }, + { + "entropy": 1.8038958807786305, + "epoch": 0.08206311279558375, + "grad_norm": 0.6932543516159058, + "learning_rate": 1.9994360496922297e-05, + "loss": 1.4123, + "mean_token_accuracy": 0.6614984820286433, + "num_tokens": 124690925.0, + "step": 747 + }, + { + "entropy": 1.8390637238820393, + "epoch": 0.08217296970695669, + "grad_norm": 0.9938632845878601, + "learning_rate": 1.9994303682085946e-05, + "loss": 1.4262, + "mean_token_accuracy": 0.6745589772860209, + "num_tokens": 124849459.0, + "step": 748 + }, + { + "entropy": 1.7794020473957062, + "epoch": 0.08228282661832963, + "grad_norm": 0.7585030198097229, + "learning_rate": 1.999424658258393e-05, + "loss": 1.5437, + "mean_token_accuracy": 0.63681960105896, + "num_tokens": 125045326.0, + "step": 749 + }, + { + "entropy": 1.752739042043686, + "epoch": 0.08239268352970257, + "grad_norm": 0.7272341251373291, + "learning_rate": 1.9994189198418067e-05, + "loss": 1.5744, + "mean_token_accuracy": 0.6451994031667709, + "num_tokens": 125215785.0, + "step": 750 + }, + { + "entropy": 1.7343992094198863, + "epoch": 0.0825025404410755, + "grad_norm": 0.8098207712173462, + "learning_rate": 1.9994131529590166e-05, + "loss": 1.5917, + "mean_token_accuracy": 0.6374679381648699, + "num_tokens": 125409547.0, + "step": 751 + }, + { + "entropy": 1.8226308226585388, + "epoch": 0.08261239735244844, + "grad_norm": 0.7430676221847534, + "learning_rate": 1.9994073576102058e-05, + "loss": 1.399, + "mean_token_accuracy": 0.6658477435509363, + "num_tokens": 125530731.0, + "step": 752 + }, + { + "entropy": 1.7828458150227864, + "epoch": 0.08272225426382138, + "grad_norm": 0.7506123185157776, + "learning_rate": 1.999401533795557e-05, + "loss": 1.3961, + "mean_token_accuracy": 0.6559909929831823, + "num_tokens": 125660742.0, + "step": 753 + }, + { + "entropy": 1.7625751396020253, + "epoch": 0.0828321111751943, + "grad_norm": 1.280418038368225, + "learning_rate": 1.9993956815152553e-05, + "loss": 1.1615, + "mean_token_accuracy": 0.6699869732062022, + "num_tokens": 125830165.0, + "step": 754 + }, + { + "entropy": 1.8572514255841572, + "epoch": 0.08294196808656724, + "grad_norm": 0.9156613945960999, + "learning_rate": 1.9993898007694857e-05, + "loss": 1.6035, + "mean_token_accuracy": 0.634151021639506, + "num_tokens": 125965207.0, + "step": 755 + }, + { + "entropy": 1.7810141642888386, + "epoch": 0.08305182499794019, + "grad_norm": 0.714108407497406, + "learning_rate": 1.999383891558434e-05, + "loss": 1.3414, + "mean_token_accuracy": 0.6630875319242477, + "num_tokens": 126126227.0, + "step": 756 + }, + { + "entropy": 1.755696713924408, + "epoch": 0.08316168190931313, + "grad_norm": 0.6537689566612244, + "learning_rate": 1.9993779538822873e-05, + "loss": 1.46, + "mean_token_accuracy": 0.6578283309936523, + "num_tokens": 126285094.0, + "step": 757 + }, + { + "entropy": 1.8099895517031352, + "epoch": 0.08327153882068605, + "grad_norm": 0.8549863696098328, + "learning_rate": 1.9993719877412333e-05, + "loss": 1.3394, + "mean_token_accuracy": 0.6538981397946676, + "num_tokens": 126461471.0, + "step": 758 + }, + { + "entropy": 1.8493448694547017, + "epoch": 0.083381395732059, + "grad_norm": 0.85292649269104, + "learning_rate": 1.9993659931354616e-05, + "loss": 1.4327, + "mean_token_accuracy": 0.6453457971413931, + "num_tokens": 126644623.0, + "step": 759 + }, + { + "entropy": 1.8416785299777985, + "epoch": 0.08349125264343193, + "grad_norm": 0.7345470190048218, + "learning_rate": 1.9993599700651612e-05, + "loss": 1.4578, + "mean_token_accuracy": 0.6381366650263468, + "num_tokens": 126832969.0, + "step": 760 + }, + { + "entropy": 1.7897752424081166, + "epoch": 0.08360110955480486, + "grad_norm": 0.6783207654953003, + "learning_rate": 1.9993539185305236e-05, + "loss": 1.3596, + "mean_token_accuracy": 0.6550974746545156, + "num_tokens": 127023280.0, + "step": 761 + }, + { + "entropy": 1.7872902353604634, + "epoch": 0.0837109664661778, + "grad_norm": 0.7059661746025085, + "learning_rate": 1.9993478385317392e-05, + "loss": 1.3619, + "mean_token_accuracy": 0.6592389543851217, + "num_tokens": 127161642.0, + "step": 762 + }, + { + "entropy": 1.8159588476022084, + "epoch": 0.08382082337755074, + "grad_norm": 0.6670881509780884, + "learning_rate": 1.999341730069001e-05, + "loss": 1.3591, + "mean_token_accuracy": 0.6564019024372101, + "num_tokens": 127301579.0, + "step": 763 + }, + { + "entropy": 1.7422301471233368, + "epoch": 0.08393068028892367, + "grad_norm": 0.6835038661956787, + "learning_rate": 1.9993355931425026e-05, + "loss": 1.3785, + "mean_token_accuracy": 0.6575357466936111, + "num_tokens": 127444174.0, + "step": 764 + }, + { + "entropy": 1.7988096475601196, + "epoch": 0.08404053720029661, + "grad_norm": 0.6521595120429993, + "learning_rate": 1.9993294277524376e-05, + "loss": 1.4665, + "mean_token_accuracy": 0.6468783915042877, + "num_tokens": 127635492.0, + "step": 765 + }, + { + "entropy": 1.808186541001002, + "epoch": 0.08415039411166955, + "grad_norm": 0.8931750655174255, + "learning_rate": 1.9993232338990017e-05, + "loss": 1.5414, + "mean_token_accuracy": 0.642045333981514, + "num_tokens": 127793125.0, + "step": 766 + }, + { + "entropy": 1.8431545893351238, + "epoch": 0.08426025102304249, + "grad_norm": 0.7338786125183105, + "learning_rate": 1.9993170115823907e-05, + "loss": 1.5627, + "mean_token_accuracy": 0.6344971805810928, + "num_tokens": 127993475.0, + "step": 767 + }, + { + "entropy": 1.822217325369517, + "epoch": 0.08437010793441542, + "grad_norm": 0.7734959721565247, + "learning_rate": 1.9993107608028014e-05, + "loss": 1.4285, + "mean_token_accuracy": 0.6521108448505402, + "num_tokens": 128139318.0, + "step": 768 + }, + { + "entropy": 1.8207875788211823, + "epoch": 0.08447996484578836, + "grad_norm": 0.6687442064285278, + "learning_rate": 1.9993044815604315e-05, + "loss": 1.4348, + "mean_token_accuracy": 0.6397057324647903, + "num_tokens": 128349536.0, + "step": 769 + }, + { + "entropy": 1.776291400194168, + "epoch": 0.0845898217571613, + "grad_norm": 0.7247187495231628, + "learning_rate": 1.9992981738554804e-05, + "loss": 1.4005, + "mean_token_accuracy": 0.6597668379545212, + "num_tokens": 128529318.0, + "step": 770 + }, + { + "entropy": 1.8065292338530223, + "epoch": 0.08469967866853423, + "grad_norm": 0.7673947215080261, + "learning_rate": 1.999291837688147e-05, + "loss": 1.2979, + "mean_token_accuracy": 0.6667843461036682, + "num_tokens": 128647932.0, + "step": 771 + }, + { + "entropy": 1.8455777664979298, + "epoch": 0.08480953557990717, + "grad_norm": 0.8621540069580078, + "learning_rate": 1.9992854730586328e-05, + "loss": 1.3875, + "mean_token_accuracy": 0.6518898904323578, + "num_tokens": 128806671.0, + "step": 772 + }, + { + "entropy": 1.7859807113806407, + "epoch": 0.08491939249128011, + "grad_norm": 0.806907057762146, + "learning_rate": 1.999279079967138e-05, + "loss": 1.4682, + "mean_token_accuracy": 0.6510076969861984, + "num_tokens": 128956643.0, + "step": 773 + }, + { + "entropy": 1.8100234270095825, + "epoch": 0.08502924940265305, + "grad_norm": 0.7432371973991394, + "learning_rate": 1.9992726584138654e-05, + "loss": 1.4843, + "mean_token_accuracy": 0.637720063328743, + "num_tokens": 129165449.0, + "step": 774 + }, + { + "entropy": 1.810623029867808, + "epoch": 0.08513910631402598, + "grad_norm": 0.7347936034202576, + "learning_rate": 1.999266208399019e-05, + "loss": 1.4643, + "mean_token_accuracy": 0.6474610765775045, + "num_tokens": 129345429.0, + "step": 775 + }, + { + "entropy": 1.8380460838476818, + "epoch": 0.08524896322539892, + "grad_norm": 0.778282642364502, + "learning_rate": 1.999259729922802e-05, + "loss": 1.364, + "mean_token_accuracy": 0.6652749627828598, + "num_tokens": 129470095.0, + "step": 776 + }, + { + "entropy": 1.733398546775182, + "epoch": 0.08535882013677186, + "grad_norm": 1.2146008014678955, + "learning_rate": 1.9992532229854198e-05, + "loss": 1.2652, + "mean_token_accuracy": 0.6567486921946207, + "num_tokens": 129738611.0, + "step": 777 + }, + { + "entropy": 1.7519733210404713, + "epoch": 0.08546867704814479, + "grad_norm": 0.7072291374206543, + "learning_rate": 1.9992466875870783e-05, + "loss": 1.3025, + "mean_token_accuracy": 0.6597904910643896, + "num_tokens": 129854306.0, + "step": 778 + }, + { + "entropy": 1.8180581033229828, + "epoch": 0.08557853395951773, + "grad_norm": 0.9036336541175842, + "learning_rate": 1.9992401237279842e-05, + "loss": 1.2928, + "mean_token_accuracy": 0.6717847138643265, + "num_tokens": 129988460.0, + "step": 779 + }, + { + "entropy": 1.8475947678089142, + "epoch": 0.08568839087089067, + "grad_norm": 0.690317690372467, + "learning_rate": 1.9992335314083456e-05, + "loss": 1.3999, + "mean_token_accuracy": 0.6502603391806284, + "num_tokens": 130130018.0, + "step": 780 + }, + { + "entropy": 1.7420289814472198, + "epoch": 0.0857982477822636, + "grad_norm": 0.5954359173774719, + "learning_rate": 1.999226910628371e-05, + "loss": 1.5268, + "mean_token_accuracy": 0.6370938271284103, + "num_tokens": 130339750.0, + "step": 781 + }, + { + "entropy": 1.833437740802765, + "epoch": 0.08590810469363654, + "grad_norm": 0.972395122051239, + "learning_rate": 1.9992202613882697e-05, + "loss": 1.4811, + "mean_token_accuracy": 0.6551183809836706, + "num_tokens": 130484058.0, + "step": 782 + }, + { + "entropy": 1.7667591671148937, + "epoch": 0.08601796160500948, + "grad_norm": 0.7758358716964722, + "learning_rate": 1.999213583688252e-05, + "loss": 1.3965, + "mean_token_accuracy": 0.6554379711548487, + "num_tokens": 130646142.0, + "step": 783 + }, + { + "entropy": 1.7984866201877594, + "epoch": 0.08612781851638242, + "grad_norm": 0.8162248134613037, + "learning_rate": 1.9992068775285306e-05, + "loss": 1.5106, + "mean_token_accuracy": 0.6294661909341812, + "num_tokens": 130834885.0, + "step": 784 + }, + { + "entropy": 1.7651668687661488, + "epoch": 0.08623767542775534, + "grad_norm": 0.9227822422981262, + "learning_rate": 1.9992001429093156e-05, + "loss": 1.2935, + "mean_token_accuracy": 0.6639310071865717, + "num_tokens": 130975509.0, + "step": 785 + }, + { + "entropy": 1.7787472208340962, + "epoch": 0.08634753233912829, + "grad_norm": 0.6106439828872681, + "learning_rate": 1.9991933798308222e-05, + "loss": 1.397, + "mean_token_accuracy": 0.6517676264047623, + "num_tokens": 131148150.0, + "step": 786 + }, + { + "entropy": 1.7501886288324993, + "epoch": 0.08645738925050123, + "grad_norm": 0.6313499808311462, + "learning_rate": 1.9991865882932628e-05, + "loss": 1.4274, + "mean_token_accuracy": 0.6402166783809662, + "num_tokens": 131368022.0, + "step": 787 + }, + { + "entropy": 1.8370747168858845, + "epoch": 0.08656724616187415, + "grad_norm": 0.7224745154380798, + "learning_rate": 1.9991797682968533e-05, + "loss": 1.337, + "mean_token_accuracy": 0.6554816514253616, + "num_tokens": 131494243.0, + "step": 788 + }, + { + "entropy": 1.7572102049986522, + "epoch": 0.0866771030732471, + "grad_norm": 0.6437149047851562, + "learning_rate": 1.9991729198418094e-05, + "loss": 1.3779, + "mean_token_accuracy": 0.6482534607251486, + "num_tokens": 131657195.0, + "step": 789 + }, + { + "entropy": 1.8201068341732025, + "epoch": 0.08678695998462004, + "grad_norm": 0.5996161699295044, + "learning_rate": 1.9991660429283475e-05, + "loss": 1.4695, + "mean_token_accuracy": 0.6353013664484024, + "num_tokens": 131837418.0, + "step": 790 + }, + { + "entropy": 1.737343708674113, + "epoch": 0.08689681689599298, + "grad_norm": 0.8244271278381348, + "learning_rate": 1.999159137556686e-05, + "loss": 1.3923, + "mean_token_accuracy": 0.6581053187449774, + "num_tokens": 131986253.0, + "step": 791 + }, + { + "entropy": 1.831222931543986, + "epoch": 0.0870066738073659, + "grad_norm": 0.6725685000419617, + "learning_rate": 1.9991522037270426e-05, + "loss": 1.5433, + "mean_token_accuracy": 0.6341465910275778, + "num_tokens": 132156185.0, + "step": 792 + }, + { + "entropy": 1.7623351514339447, + "epoch": 0.08711653071873884, + "grad_norm": 0.6132712364196777, + "learning_rate": 1.9991452414396374e-05, + "loss": 1.5282, + "mean_token_accuracy": 0.6300236731767654, + "num_tokens": 132375195.0, + "step": 793 + }, + { + "entropy": 1.8128896256287892, + "epoch": 0.08722638763011178, + "grad_norm": 0.8201103210449219, + "learning_rate": 1.99913825069469e-05, + "loss": 1.4092, + "mean_token_accuracy": 0.6543792635202408, + "num_tokens": 132572019.0, + "step": 794 + }, + { + "entropy": 1.8170464436213176, + "epoch": 0.08733624454148471, + "grad_norm": 0.7330581545829773, + "learning_rate": 1.9991312314924223e-05, + "loss": 1.5836, + "mean_token_accuracy": 0.6373669604460398, + "num_tokens": 132757713.0, + "step": 795 + }, + { + "entropy": 1.7433435519536336, + "epoch": 0.08744610145285765, + "grad_norm": 0.7479463219642639, + "learning_rate": 1.9991241838330563e-05, + "loss": 1.2838, + "mean_token_accuracy": 0.6742591361204783, + "num_tokens": 132907608.0, + "step": 796 + }, + { + "entropy": 1.7819043000539143, + "epoch": 0.0875559583642306, + "grad_norm": 0.8788211941719055, + "learning_rate": 1.999117107716815e-05, + "loss": 1.2552, + "mean_token_accuracy": 0.6783933192491531, + "num_tokens": 133021304.0, + "step": 797 + }, + { + "entropy": 1.803941269715627, + "epoch": 0.08766581527560352, + "grad_norm": 0.7351856827735901, + "learning_rate": 1.9991100031439226e-05, + "loss": 1.4558, + "mean_token_accuracy": 0.6422074437141418, + "num_tokens": 133170851.0, + "step": 798 + }, + { + "entropy": 1.7430227200190227, + "epoch": 0.08777567218697646, + "grad_norm": 0.7564266324043274, + "learning_rate": 1.999102870114604e-05, + "loss": 1.5169, + "mean_token_accuracy": 0.6481150388717651, + "num_tokens": 133331023.0, + "step": 799 + }, + { + "entropy": 1.8221300840377808, + "epoch": 0.0878855290983494, + "grad_norm": 0.6594426035881042, + "learning_rate": 1.9990957086290842e-05, + "loss": 1.5665, + "mean_token_accuracy": 0.6504103392362595, + "num_tokens": 133496097.0, + "step": 800 + }, + { + "entropy": 1.781892587741216, + "epoch": 0.08799538600972234, + "grad_norm": 0.7563036680221558, + "learning_rate": 1.9990885186875903e-05, + "loss": 1.5135, + "mean_token_accuracy": 0.6499631603558859, + "num_tokens": 133688537.0, + "step": 801 + }, + { + "entropy": 1.7549792230129242, + "epoch": 0.08810524292109527, + "grad_norm": 0.7245374321937561, + "learning_rate": 1.9990813002903504e-05, + "loss": 1.4476, + "mean_token_accuracy": 0.6485745906829834, + "num_tokens": 133853936.0, + "step": 802 + }, + { + "entropy": 1.762273798386256, + "epoch": 0.08821509983246821, + "grad_norm": 0.9437369108200073, + "learning_rate": 1.999074053437592e-05, + "loss": 1.3344, + "mean_token_accuracy": 0.6619789600372314, + "num_tokens": 134003645.0, + "step": 803 + }, + { + "entropy": 1.8330159882704418, + "epoch": 0.08832495674384115, + "grad_norm": 0.8058866858482361, + "learning_rate": 1.9990667781295453e-05, + "loss": 1.3736, + "mean_token_accuracy": 0.6473863820234934, + "num_tokens": 134140327.0, + "step": 804 + }, + { + "entropy": 1.7547661860783894, + "epoch": 0.08843481365521408, + "grad_norm": 0.7638370394706726, + "learning_rate": 1.9990594743664402e-05, + "loss": 1.4696, + "mean_token_accuracy": 0.652462974190712, + "num_tokens": 134275518.0, + "step": 805 + }, + { + "entropy": 1.7573677500089009, + "epoch": 0.08854467056658702, + "grad_norm": 0.7340158224105835, + "learning_rate": 1.9990521421485077e-05, + "loss": 1.2733, + "mean_token_accuracy": 0.6660685688257217, + "num_tokens": 134395935.0, + "step": 806 + }, + { + "entropy": 1.8871439099311829, + "epoch": 0.08865452747795996, + "grad_norm": 0.8130138516426086, + "learning_rate": 1.99904478147598e-05, + "loss": 1.3587, + "mean_token_accuracy": 0.6492532193660736, + "num_tokens": 134495765.0, + "step": 807 + }, + { + "entropy": 1.8198732137680054, + "epoch": 0.08876438438933289, + "grad_norm": 0.8263359665870667, + "learning_rate": 1.99903739234909e-05, + "loss": 1.4117, + "mean_token_accuracy": 0.6728939761718115, + "num_tokens": 134650045.0, + "step": 808 + }, + { + "entropy": 1.7290976345539093, + "epoch": 0.08887424130070583, + "grad_norm": 1.5325140953063965, + "learning_rate": 1.999029974768072e-05, + "loss": 1.2137, + "mean_token_accuracy": 0.675426850716273, + "num_tokens": 134879431.0, + "step": 809 + }, + { + "entropy": 1.7537180085976918, + "epoch": 0.08898409821207877, + "grad_norm": 0.7787648439407349, + "learning_rate": 1.99902252873316e-05, + "loss": 1.4029, + "mean_token_accuracy": 0.6435067852338155, + "num_tokens": 135037088.0, + "step": 810 + }, + { + "entropy": 1.793819099664688, + "epoch": 0.08909395512345171, + "grad_norm": 0.6838997006416321, + "learning_rate": 1.9990150542445904e-05, + "loss": 1.512, + "mean_token_accuracy": 0.650256002942721, + "num_tokens": 135257457.0, + "step": 811 + }, + { + "entropy": 1.8345171809196472, + "epoch": 0.08920381203482464, + "grad_norm": 0.6854283809661865, + "learning_rate": 1.999007551302599e-05, + "loss": 1.4529, + "mean_token_accuracy": 0.6385099937518438, + "num_tokens": 135443159.0, + "step": 812 + }, + { + "entropy": 1.7310991485913594, + "epoch": 0.08931366894619758, + "grad_norm": 0.7480428814888, + "learning_rate": 1.9990000199074244e-05, + "loss": 1.385, + "mean_token_accuracy": 0.6632754951715469, + "num_tokens": 135602809.0, + "step": 813 + }, + { + "entropy": 1.7727793554464977, + "epoch": 0.08942352585757052, + "grad_norm": 0.9646157026290894, + "learning_rate": 1.9989924600593037e-05, + "loss": 1.3619, + "mean_token_accuracy": 0.6555025527874628, + "num_tokens": 135716606.0, + "step": 814 + }, + { + "entropy": 1.7907099723815918, + "epoch": 0.08953338276894345, + "grad_norm": 0.7035730481147766, + "learning_rate": 1.998984871758477e-05, + "loss": 1.4499, + "mean_token_accuracy": 0.6439538051684698, + "num_tokens": 135878432.0, + "step": 815 + }, + { + "entropy": 1.8181606729825337, + "epoch": 0.08964323968031639, + "grad_norm": 0.6870342493057251, + "learning_rate": 1.998977255005184e-05, + "loss": 1.4373, + "mean_token_accuracy": 0.6356525272130966, + "num_tokens": 136081813.0, + "step": 816 + }, + { + "entropy": 1.7305361131827037, + "epoch": 0.08975309659168933, + "grad_norm": 0.5997280478477478, + "learning_rate": 1.9989696097996662e-05, + "loss": 1.4796, + "mean_token_accuracy": 0.6471946984529495, + "num_tokens": 136295261.0, + "step": 817 + }, + { + "entropy": 1.7904970049858093, + "epoch": 0.08986295350306227, + "grad_norm": 0.8890630006790161, + "learning_rate": 1.998961936142165e-05, + "loss": 1.403, + "mean_token_accuracy": 0.6517884830633799, + "num_tokens": 136466043.0, + "step": 818 + }, + { + "entropy": 1.770477294921875, + "epoch": 0.0899728104144352, + "grad_norm": 0.6830163598060608, + "learning_rate": 1.998954234032924e-05, + "loss": 1.5319, + "mean_token_accuracy": 0.6378083527088165, + "num_tokens": 136665178.0, + "step": 819 + }, + { + "entropy": 1.8271574676036835, + "epoch": 0.09008266732580814, + "grad_norm": 0.755791187286377, + "learning_rate": 1.9989465034721866e-05, + "loss": 1.533, + "mean_token_accuracy": 0.6352124363183975, + "num_tokens": 136800420.0, + "step": 820 + }, + { + "entropy": 1.7916331390539806, + "epoch": 0.09019252423718108, + "grad_norm": 0.7258456945419312, + "learning_rate": 1.998938744460197e-05, + "loss": 1.4128, + "mean_token_accuracy": 0.654478500286738, + "num_tokens": 136971123.0, + "step": 821 + }, + { + "entropy": 1.773153026898702, + "epoch": 0.090302381148554, + "grad_norm": 0.5961330533027649, + "learning_rate": 1.9989309569972014e-05, + "loss": 1.4252, + "mean_token_accuracy": 0.6488613287607828, + "num_tokens": 137200207.0, + "step": 822 + }, + { + "entropy": 1.7114134629567463, + "epoch": 0.09041223805992694, + "grad_norm": 0.6579198241233826, + "learning_rate": 1.9989231410834462e-05, + "loss": 1.4714, + "mean_token_accuracy": 0.6568086395661036, + "num_tokens": 137407339.0, + "step": 823 + }, + { + "entropy": 1.7578770518302917, + "epoch": 0.09052209497129989, + "grad_norm": 0.7298670411109924, + "learning_rate": 1.9989152967191788e-05, + "loss": 1.4054, + "mean_token_accuracy": 0.6561851551135381, + "num_tokens": 137533281.0, + "step": 824 + }, + { + "entropy": 1.7987795968850453, + "epoch": 0.09063195188267281, + "grad_norm": 0.7497153878211975, + "learning_rate": 1.9989074239046467e-05, + "loss": 1.5478, + "mean_token_accuracy": 0.6374392211437225, + "num_tokens": 137734737.0, + "step": 825 + }, + { + "entropy": 1.8171394268671672, + "epoch": 0.09074180879404575, + "grad_norm": 0.7937276363372803, + "learning_rate": 1.9988995226401e-05, + "loss": 1.4708, + "mean_token_accuracy": 0.6504966169595718, + "num_tokens": 137901984.0, + "step": 826 + }, + { + "entropy": 1.7756889462471008, + "epoch": 0.0908516657054187, + "grad_norm": 0.7066166996955872, + "learning_rate": 1.9988915929257887e-05, + "loss": 1.3252, + "mean_token_accuracy": 0.6584804703791937, + "num_tokens": 138067546.0, + "step": 827 + }, + { + "entropy": 1.6937337219715118, + "epoch": 0.09096152261679163, + "grad_norm": 0.8448572754859924, + "learning_rate": 1.9988836347619634e-05, + "loss": 1.2457, + "mean_token_accuracy": 0.6707619080940882, + "num_tokens": 138239095.0, + "step": 828 + }, + { + "entropy": 1.777471164862315, + "epoch": 0.09107137952816456, + "grad_norm": 0.7635485529899597, + "learning_rate": 1.998875648148876e-05, + "loss": 1.4174, + "mean_token_accuracy": 0.6492311904827753, + "num_tokens": 138382813.0, + "step": 829 + }, + { + "entropy": 1.7190299828847249, + "epoch": 0.0911812364395375, + "grad_norm": 0.6621118783950806, + "learning_rate": 1.9988676330867798e-05, + "loss": 1.4274, + "mean_token_accuracy": 0.6445133884747823, + "num_tokens": 138589846.0, + "step": 830 + }, + { + "entropy": 1.7541986008485158, + "epoch": 0.09129109335091044, + "grad_norm": 0.7409399151802063, + "learning_rate": 1.9988595895759276e-05, + "loss": 1.3882, + "mean_token_accuracy": 0.6580299437046051, + "num_tokens": 138717947.0, + "step": 831 + }, + { + "entropy": 1.8276172975699108, + "epoch": 0.09140095026228337, + "grad_norm": 0.815585196018219, + "learning_rate": 1.9988515176165748e-05, + "loss": 1.6175, + "mean_token_accuracy": 0.6387762427330017, + "num_tokens": 138900529.0, + "step": 832 + }, + { + "entropy": 1.7899891436100006, + "epoch": 0.09151080717365631, + "grad_norm": 0.6510075926780701, + "learning_rate": 1.998843417208976e-05, + "loss": 1.4375, + "mean_token_accuracy": 0.6496662348508835, + "num_tokens": 139045331.0, + "step": 833 + }, + { + "entropy": 1.7600055237611134, + "epoch": 0.09162066408502925, + "grad_norm": 0.7253756523132324, + "learning_rate": 1.9988352883533883e-05, + "loss": 1.5197, + "mean_token_accuracy": 0.6373974829912186, + "num_tokens": 139188419.0, + "step": 834 + }, + { + "entropy": 1.7518123586972554, + "epoch": 0.09173052099640219, + "grad_norm": 0.735378086566925, + "learning_rate": 1.9988271310500686e-05, + "loss": 1.3998, + "mean_token_accuracy": 0.6474562138319016, + "num_tokens": 139335708.0, + "step": 835 + }, + { + "entropy": 1.795258621374766, + "epoch": 0.09184037790777512, + "grad_norm": 0.659813404083252, + "learning_rate": 1.9988189452992755e-05, + "loss": 1.4814, + "mean_token_accuracy": 0.6482225656509399, + "num_tokens": 139570367.0, + "step": 836 + }, + { + "entropy": 1.8820240795612335, + "epoch": 0.09195023481914806, + "grad_norm": 0.6837143301963806, + "learning_rate": 1.9988107311012675e-05, + "loss": 1.4863, + "mean_token_accuracy": 0.6410119732220968, + "num_tokens": 139719920.0, + "step": 837 + }, + { + "entropy": 1.7456907431284587, + "epoch": 0.092060091730521, + "grad_norm": 0.6721776127815247, + "learning_rate": 1.9988024884563054e-05, + "loss": 1.339, + "mean_token_accuracy": 0.6614281088113785, + "num_tokens": 139892505.0, + "step": 838 + }, + { + "entropy": 1.7585683763027191, + "epoch": 0.09216994864189393, + "grad_norm": 0.713774561882019, + "learning_rate": 1.9987942173646488e-05, + "loss": 1.3296, + "mean_token_accuracy": 0.6708425531784693, + "num_tokens": 140041887.0, + "step": 839 + }, + { + "entropy": 1.753789484500885, + "epoch": 0.09227980555326687, + "grad_norm": 0.9725476503372192, + "learning_rate": 1.998785917826561e-05, + "loss": 1.5496, + "mean_token_accuracy": 0.6513581027587255, + "num_tokens": 140250586.0, + "step": 840 + }, + { + "entropy": 1.8161666095256805, + "epoch": 0.09238966246463981, + "grad_norm": 0.657247006893158, + "learning_rate": 1.9987775898423036e-05, + "loss": 1.3999, + "mean_token_accuracy": 0.6466280668973923, + "num_tokens": 140445865.0, + "step": 841 + }, + { + "entropy": 1.843330095211665, + "epoch": 0.09249951937601274, + "grad_norm": 0.7465802431106567, + "learning_rate": 1.9987692334121402e-05, + "loss": 1.4241, + "mean_token_accuracy": 0.6497205942869186, + "num_tokens": 140638169.0, + "step": 842 + }, + { + "entropy": 1.737547109524409, + "epoch": 0.09260937628738568, + "grad_norm": 0.6497205495834351, + "learning_rate": 1.998760848536336e-05, + "loss": 1.4405, + "mean_token_accuracy": 0.6456060359875361, + "num_tokens": 140833553.0, + "step": 843 + }, + { + "entropy": 1.6892894407113392, + "epoch": 0.09271923319875862, + "grad_norm": 0.6569632887840271, + "learning_rate": 1.9987524352151556e-05, + "loss": 1.438, + "mean_token_accuracy": 0.6561574091513952, + "num_tokens": 141038298.0, + "step": 844 + }, + { + "entropy": 1.7608444193998973, + "epoch": 0.09282909011013156, + "grad_norm": 0.7499677538871765, + "learning_rate": 1.9987439934488656e-05, + "loss": 1.3569, + "mean_token_accuracy": 0.6573406010866165, + "num_tokens": 141167798.0, + "step": 845 + }, + { + "entropy": 1.7646370430787404, + "epoch": 0.09293894702150449, + "grad_norm": 0.8782757520675659, + "learning_rate": 1.9987355232377334e-05, + "loss": 1.4156, + "mean_token_accuracy": 0.655216450492541, + "num_tokens": 141305587.0, + "step": 846 + }, + { + "entropy": 1.6969274083773296, + "epoch": 0.09304880393287743, + "grad_norm": 0.7114330530166626, + "learning_rate": 1.9987270245820266e-05, + "loss": 1.4441, + "mean_token_accuracy": 0.6574098318815231, + "num_tokens": 141494915.0, + "step": 847 + }, + { + "entropy": 1.7382513582706451, + "epoch": 0.09315866084425037, + "grad_norm": 0.633026659488678, + "learning_rate": 1.998718497482015e-05, + "loss": 1.4415, + "mean_token_accuracy": 0.6520839134852091, + "num_tokens": 141665803.0, + "step": 848 + }, + { + "entropy": 1.7377788325150807, + "epoch": 0.0932685177556233, + "grad_norm": 0.8016728162765503, + "learning_rate": 1.9987099419379674e-05, + "loss": 1.2266, + "mean_token_accuracy": 0.6830306202173233, + "num_tokens": 141771140.0, + "step": 849 + }, + { + "entropy": 1.7722203036149342, + "epoch": 0.09337837466699624, + "grad_norm": 0.816390335559845, + "learning_rate": 1.998701357950155e-05, + "loss": 1.4453, + "mean_token_accuracy": 0.6510142137606939, + "num_tokens": 141921548.0, + "step": 850 + }, + { + "entropy": 1.797320306301117, + "epoch": 0.09348823157836918, + "grad_norm": 0.7288161516189575, + "learning_rate": 1.9986927455188503e-05, + "loss": 1.4359, + "mean_token_accuracy": 0.647617906332016, + "num_tokens": 142051031.0, + "step": 851 + }, + { + "entropy": 1.7711346745491028, + "epoch": 0.09359808848974212, + "grad_norm": 0.6476997137069702, + "learning_rate": 1.9986841046443245e-05, + "loss": 1.4482, + "mean_token_accuracy": 0.6498673806587855, + "num_tokens": 142227046.0, + "step": 852 + }, + { + "entropy": 1.7939196527004242, + "epoch": 0.09370794540111504, + "grad_norm": 0.6596160531044006, + "learning_rate": 1.9986754353268522e-05, + "loss": 1.4829, + "mean_token_accuracy": 0.6461114784081777, + "num_tokens": 142400594.0, + "step": 853 + }, + { + "entropy": 1.723371555407842, + "epoch": 0.09381780231248799, + "grad_norm": 0.8292514681816101, + "learning_rate": 1.9986667375667067e-05, + "loss": 1.3327, + "mean_token_accuracy": 0.6621809701124827, + "num_tokens": 142541930.0, + "step": 854 + }, + { + "entropy": 1.7200664281845093, + "epoch": 0.09392765922386093, + "grad_norm": 0.6569286584854126, + "learning_rate": 1.9986580113641645e-05, + "loss": 1.4073, + "mean_token_accuracy": 0.6560418456792831, + "num_tokens": 142707884.0, + "step": 855 + }, + { + "entropy": 1.763913631439209, + "epoch": 0.09403751613523385, + "grad_norm": 0.7746670842170715, + "learning_rate": 1.998649256719501e-05, + "loss": 1.257, + "mean_token_accuracy": 0.680213044087092, + "num_tokens": 142845415.0, + "step": 856 + }, + { + "entropy": 1.7523279786109924, + "epoch": 0.0941473730466068, + "grad_norm": 0.8351315259933472, + "learning_rate": 1.9986404736329935e-05, + "loss": 1.3057, + "mean_token_accuracy": 0.6638447294632593, + "num_tokens": 142991596.0, + "step": 857 + }, + { + "entropy": 1.7808737655480702, + "epoch": 0.09425722995797974, + "grad_norm": 0.7735828161239624, + "learning_rate": 1.9986316621049198e-05, + "loss": 1.3841, + "mean_token_accuracy": 0.6511393884817759, + "num_tokens": 143141446.0, + "step": 858 + }, + { + "entropy": 1.8006083170572917, + "epoch": 0.09436708686935266, + "grad_norm": 0.6989002227783203, + "learning_rate": 1.9986228221355593e-05, + "loss": 1.4593, + "mean_token_accuracy": 0.6463498423496882, + "num_tokens": 143304040.0, + "step": 859 + }, + { + "entropy": 1.7336869637171428, + "epoch": 0.0944769437807256, + "grad_norm": 0.6207464337348938, + "learning_rate": 1.998613953725191e-05, + "loss": 1.4264, + "mean_token_accuracy": 0.6408118307590485, + "num_tokens": 143521739.0, + "step": 860 + }, + { + "entropy": 1.7804212868213654, + "epoch": 0.09458680069209854, + "grad_norm": 0.7322898507118225, + "learning_rate": 1.998605056874096e-05, + "loss": 1.2565, + "mean_token_accuracy": 0.6800819089015325, + "num_tokens": 143669420.0, + "step": 861 + }, + { + "entropy": 1.8095572888851166, + "epoch": 0.09469665760347148, + "grad_norm": 0.6634199619293213, + "learning_rate": 1.998596131582556e-05, + "loss": 1.3476, + "mean_token_accuracy": 0.6624392867088318, + "num_tokens": 143890464.0, + "step": 862 + }, + { + "entropy": 1.7374099691708882, + "epoch": 0.09480651451484441, + "grad_norm": 0.5917066931724548, + "learning_rate": 1.9985871778508536e-05, + "loss": 1.3316, + "mean_token_accuracy": 0.6642079999049505, + "num_tokens": 144071612.0, + "step": 863 + }, + { + "entropy": 1.7235216995080311, + "epoch": 0.09491637142621735, + "grad_norm": 0.6594040989875793, + "learning_rate": 1.9985781956792712e-05, + "loss": 1.5624, + "mean_token_accuracy": 0.6464419017235438, + "num_tokens": 144275211.0, + "step": 864 + }, + { + "entropy": 1.7869758109251659, + "epoch": 0.0950262283375903, + "grad_norm": 0.6814702153205872, + "learning_rate": 1.9985691850680945e-05, + "loss": 1.3704, + "mean_token_accuracy": 0.6551233877738317, + "num_tokens": 144461229.0, + "step": 865 + }, + { + "entropy": 1.7437097529570262, + "epoch": 0.09513608524896322, + "grad_norm": 0.6789867281913757, + "learning_rate": 1.998560146017608e-05, + "loss": 1.4037, + "mean_token_accuracy": 0.6533855001131693, + "num_tokens": 144626703.0, + "step": 866 + }, + { + "entropy": 1.797066976626714, + "epoch": 0.09524594216033616, + "grad_norm": 0.7511526346206665, + "learning_rate": 1.9985510785280973e-05, + "loss": 1.4707, + "mean_token_accuracy": 0.6475979636112849, + "num_tokens": 144816688.0, + "step": 867 + }, + { + "entropy": 1.8556463519732158, + "epoch": 0.0953557990717091, + "grad_norm": 0.729511559009552, + "learning_rate": 1.99854198259985e-05, + "loss": 1.4624, + "mean_token_accuracy": 0.6474516242742538, + "num_tokens": 144974185.0, + "step": 868 + }, + { + "entropy": 1.7502427001794179, + "epoch": 0.09546565598308203, + "grad_norm": 0.7686792016029358, + "learning_rate": 1.9985328582331543e-05, + "loss": 1.3928, + "mean_token_accuracy": 0.6506832788387934, + "num_tokens": 145131985.0, + "step": 869 + }, + { + "entropy": 1.8473133345444996, + "epoch": 0.09557551289445497, + "grad_norm": 0.7290443181991577, + "learning_rate": 1.998523705428298e-05, + "loss": 1.4376, + "mean_token_accuracy": 0.6440041263898214, + "num_tokens": 145300425.0, + "step": 870 + }, + { + "entropy": 1.7909137805302937, + "epoch": 0.09568536980582791, + "grad_norm": 1.0265684127807617, + "learning_rate": 1.9985145241855715e-05, + "loss": 1.3176, + "mean_token_accuracy": 0.6611884931723276, + "num_tokens": 145434375.0, + "step": 871 + }, + { + "entropy": 1.8016241292158763, + "epoch": 0.09579522671720085, + "grad_norm": 0.8837740421295166, + "learning_rate": 1.998505314505265e-05, + "loss": 1.4417, + "mean_token_accuracy": 0.6421179672082266, + "num_tokens": 145595112.0, + "step": 872 + }, + { + "entropy": 1.7559907833735149, + "epoch": 0.09590508362857378, + "grad_norm": 0.7442582249641418, + "learning_rate": 1.9984960763876707e-05, + "loss": 1.4568, + "mean_token_accuracy": 0.6378819495439529, + "num_tokens": 145809619.0, + "step": 873 + }, + { + "entropy": 1.7758004764715831, + "epoch": 0.09601494053994672, + "grad_norm": 0.7401770353317261, + "learning_rate": 1.99848680983308e-05, + "loss": 1.3542, + "mean_token_accuracy": 0.6558797508478165, + "num_tokens": 145962185.0, + "step": 874 + }, + { + "entropy": 1.701437811056773, + "epoch": 0.09612479745131966, + "grad_norm": 0.6981661319732666, + "learning_rate": 1.998477514841787e-05, + "loss": 1.272, + "mean_token_accuracy": 0.6712963829437891, + "num_tokens": 146122219.0, + "step": 875 + }, + { + "entropy": 1.8346630732218425, + "epoch": 0.09623465436269259, + "grad_norm": 1.0015085935592651, + "learning_rate": 1.998468191414085e-05, + "loss": 1.4049, + "mean_token_accuracy": 0.6612652838230133, + "num_tokens": 146286262.0, + "step": 876 + }, + { + "entropy": 1.8047532737255096, + "epoch": 0.09634451127406553, + "grad_norm": 0.6116564869880676, + "learning_rate": 1.99845883955027e-05, + "loss": 1.4939, + "mean_token_accuracy": 0.6393746683994929, + "num_tokens": 146482748.0, + "step": 877 + }, + { + "entropy": 1.8028401136398315, + "epoch": 0.09645436818543847, + "grad_norm": 0.8170140981674194, + "learning_rate": 1.9984494592506375e-05, + "loss": 1.4695, + "mean_token_accuracy": 0.6653387248516083, + "num_tokens": 146623953.0, + "step": 878 + }, + { + "entropy": 1.8251157999038696, + "epoch": 0.09656422509681141, + "grad_norm": 0.7140014171600342, + "learning_rate": 1.9984400505154845e-05, + "loss": 1.3936, + "mean_token_accuracy": 0.6512231677770615, + "num_tokens": 146771851.0, + "step": 879 + }, + { + "entropy": 1.7021491626898448, + "epoch": 0.09667408200818434, + "grad_norm": 0.7085208296775818, + "learning_rate": 1.9984306133451085e-05, + "loss": 1.5546, + "mean_token_accuracy": 0.6158442795276642, + "num_tokens": 147018276.0, + "step": 880 + }, + { + "entropy": 1.7345736026763916, + "epoch": 0.09678393891955728, + "grad_norm": 0.6248441934585571, + "learning_rate": 1.9984211477398087e-05, + "loss": 1.3856, + "mean_token_accuracy": 0.6412098606427511, + "num_tokens": 147164089.0, + "step": 881 + }, + { + "entropy": 1.7824423809846242, + "epoch": 0.09689379583093022, + "grad_norm": 0.7416592240333557, + "learning_rate": 1.9984116536998842e-05, + "loss": 1.3108, + "mean_token_accuracy": 0.6656797031561533, + "num_tokens": 147277397.0, + "step": 882 + }, + { + "entropy": 1.7290455897649128, + "epoch": 0.09700365274230315, + "grad_norm": 0.5920156836509705, + "learning_rate": 1.998402131225636e-05, + "loss": 1.464, + "mean_token_accuracy": 0.646095464626948, + "num_tokens": 147526287.0, + "step": 883 + }, + { + "entropy": 1.8121102452278137, + "epoch": 0.09711350965367609, + "grad_norm": 0.6940451264381409, + "learning_rate": 1.998392580317365e-05, + "loss": 1.5421, + "mean_token_accuracy": 0.6315521448850632, + "num_tokens": 147680370.0, + "step": 884 + }, + { + "entropy": 1.7699245909849803, + "epoch": 0.09722336656504903, + "grad_norm": 0.6525464653968811, + "learning_rate": 1.9983830009753736e-05, + "loss": 1.3875, + "mean_token_accuracy": 0.6589068621397018, + "num_tokens": 147840870.0, + "step": 885 + }, + { + "entropy": 1.7928034762541454, + "epoch": 0.09733322347642195, + "grad_norm": 0.6272910833358765, + "learning_rate": 1.9983733931999652e-05, + "loss": 1.4885, + "mean_token_accuracy": 0.6306471476952235, + "num_tokens": 148027455.0, + "step": 886 + }, + { + "entropy": 1.7910848359266918, + "epoch": 0.0974430803877949, + "grad_norm": 0.7335218191146851, + "learning_rate": 1.9983637569914434e-05, + "loss": 1.609, + "mean_token_accuracy": 0.619565524160862, + "num_tokens": 148195323.0, + "step": 887 + }, + { + "entropy": 1.8034427464008331, + "epoch": 0.09755293729916784, + "grad_norm": 1.0481171607971191, + "learning_rate": 1.9983540923501136e-05, + "loss": 1.3526, + "mean_token_accuracy": 0.6704971541961035, + "num_tokens": 148342254.0, + "step": 888 + }, + { + "entropy": 1.799752136071523, + "epoch": 0.09766279421054078, + "grad_norm": 0.8031909465789795, + "learning_rate": 1.9983443992762818e-05, + "loss": 1.4918, + "mean_token_accuracy": 0.660582959651947, + "num_tokens": 148456069.0, + "step": 889 + }, + { + "entropy": 1.7524477740128834, + "epoch": 0.0977726511219137, + "grad_norm": 0.66947340965271, + "learning_rate": 1.9983346777702546e-05, + "loss": 1.422, + "mean_token_accuracy": 0.6511275370915731, + "num_tokens": 148639070.0, + "step": 890 + }, + { + "entropy": 1.760518193244934, + "epoch": 0.09788250803328664, + "grad_norm": 0.71653151512146, + "learning_rate": 1.9983249278323394e-05, + "loss": 1.4249, + "mean_token_accuracy": 0.64531609416008, + "num_tokens": 148808597.0, + "step": 891 + }, + { + "entropy": 1.7460968097050984, + "epoch": 0.09799236494465959, + "grad_norm": 0.7005469799041748, + "learning_rate": 1.9983151494628452e-05, + "loss": 1.4135, + "mean_token_accuracy": 0.6504943122466406, + "num_tokens": 148986736.0, + "step": 892 + }, + { + "entropy": 1.6886422137419383, + "epoch": 0.09810222185603251, + "grad_norm": 0.7256191968917847, + "learning_rate": 1.9983053426620812e-05, + "loss": 1.282, + "mean_token_accuracy": 0.6670690476894379, + "num_tokens": 149123138.0, + "step": 893 + }, + { + "entropy": 1.764316588640213, + "epoch": 0.09821207876740545, + "grad_norm": 0.7242082357406616, + "learning_rate": 1.998295507430358e-05, + "loss": 1.4177, + "mean_token_accuracy": 0.6532685707012812, + "num_tokens": 149304620.0, + "step": 894 + }, + { + "entropy": 1.709983338912328, + "epoch": 0.0983219356787784, + "grad_norm": 0.760214626789093, + "learning_rate": 1.998285643767987e-05, + "loss": 1.3549, + "mean_token_accuracy": 0.6476463874181112, + "num_tokens": 149458167.0, + "step": 895 + }, + { + "entropy": 1.7316644787788391, + "epoch": 0.09843179259015133, + "grad_norm": 0.7626999020576477, + "learning_rate": 1.99827575167528e-05, + "loss": 1.3188, + "mean_token_accuracy": 0.6596734821796417, + "num_tokens": 149637723.0, + "step": 896 + }, + { + "entropy": 1.737456738948822, + "epoch": 0.09854164950152426, + "grad_norm": 0.7550251483917236, + "learning_rate": 1.9982658311525497e-05, + "loss": 1.488, + "mean_token_accuracy": 0.6350032538175583, + "num_tokens": 149820282.0, + "step": 897 + }, + { + "entropy": 1.7865496476491292, + "epoch": 0.0986515064128972, + "grad_norm": 0.7146956324577332, + "learning_rate": 1.9982558822001107e-05, + "loss": 1.3048, + "mean_token_accuracy": 0.6644372095664343, + "num_tokens": 149984370.0, + "step": 898 + }, + { + "entropy": 1.7460536360740662, + "epoch": 0.09876136332427014, + "grad_norm": 0.7782373428344727, + "learning_rate": 1.9982459048182787e-05, + "loss": 1.2861, + "mean_token_accuracy": 0.6806688259045283, + "num_tokens": 150100533.0, + "step": 899 + }, + { + "entropy": 1.7238269646962483, + "epoch": 0.09887122023564307, + "grad_norm": 0.9449548125267029, + "learning_rate": 1.9982358990073677e-05, + "loss": 1.3353, + "mean_token_accuracy": 0.6628151287635168, + "num_tokens": 150251335.0, + "step": 900 + }, + { + "entropy": 1.808404137690862, + "epoch": 0.09898107714701601, + "grad_norm": 0.7466446161270142, + "learning_rate": 1.9982258647676955e-05, + "loss": 1.5167, + "mean_token_accuracy": 0.6374181012312571, + "num_tokens": 150410855.0, + "step": 901 + }, + { + "entropy": 1.7830796142419179, + "epoch": 0.09909093405838895, + "grad_norm": 0.6440333127975464, + "learning_rate": 1.9982158020995797e-05, + "loss": 1.611, + "mean_token_accuracy": 0.6330165167649587, + "num_tokens": 150600761.0, + "step": 902 + }, + { + "entropy": 1.792793979247411, + "epoch": 0.09920079096976188, + "grad_norm": 0.6835723519325256, + "learning_rate": 1.998205711003338e-05, + "loss": 1.6041, + "mean_token_accuracy": 0.6333948771158854, + "num_tokens": 150845548.0, + "step": 903 + }, + { + "entropy": 1.7442950308322906, + "epoch": 0.09931064788113482, + "grad_norm": 0.7617843151092529, + "learning_rate": 1.9981955914792906e-05, + "loss": 1.3282, + "mean_token_accuracy": 0.6637561370929083, + "num_tokens": 150972304.0, + "step": 904 + }, + { + "entropy": 1.7536637882391612, + "epoch": 0.09942050479250776, + "grad_norm": 0.7179109454154968, + "learning_rate": 1.9981854435277577e-05, + "loss": 1.5067, + "mean_token_accuracy": 0.6481045782566071, + "num_tokens": 151143396.0, + "step": 905 + }, + { + "entropy": 1.7627435723940532, + "epoch": 0.0995303617038807, + "grad_norm": 0.7069958448410034, + "learning_rate": 1.9981752671490598e-05, + "loss": 1.3833, + "mean_token_accuracy": 0.6542354027430216, + "num_tokens": 151325547.0, + "step": 906 + }, + { + "entropy": 1.7693799436092377, + "epoch": 0.09964021861525363, + "grad_norm": 0.6431506872177124, + "learning_rate": 1.9981650623435194e-05, + "loss": 1.3678, + "mean_token_accuracy": 0.6576612442731857, + "num_tokens": 151536185.0, + "step": 907 + }, + { + "entropy": 1.7635951141516368, + "epoch": 0.09975007552662657, + "grad_norm": 0.7126159071922302, + "learning_rate": 1.9981548291114595e-05, + "loss": 1.3731, + "mean_token_accuracy": 0.6596753050883611, + "num_tokens": 151683858.0, + "step": 908 + }, + { + "entropy": 1.7573548754056294, + "epoch": 0.09985993243799951, + "grad_norm": 0.8124845027923584, + "learning_rate": 1.9981445674532046e-05, + "loss": 1.2352, + "mean_token_accuracy": 0.6722429444392523, + "num_tokens": 151803918.0, + "step": 909 + }, + { + "entropy": 1.7850454151630402, + "epoch": 0.09996978934937244, + "grad_norm": 0.7685918211936951, + "learning_rate": 1.9981342773690783e-05, + "loss": 1.512, + "mean_token_accuracy": 0.6380599588155746, + "num_tokens": 151971626.0, + "step": 910 + }, + { + "entropy": 1.805126855770747, + "epoch": 0.10007964626074538, + "grad_norm": 0.6546812057495117, + "learning_rate": 1.9981239588594072e-05, + "loss": 1.5592, + "mean_token_accuracy": 0.6306341290473938, + "num_tokens": 152201475.0, + "step": 911 + }, + { + "entropy": 1.7827772498130798, + "epoch": 0.10018950317211832, + "grad_norm": 0.6877257823944092, + "learning_rate": 1.998113611924517e-05, + "loss": 1.4371, + "mean_token_accuracy": 0.6537010818719864, + "num_tokens": 152351189.0, + "step": 912 + }, + { + "entropy": 1.7649867534637451, + "epoch": 0.10029936008349125, + "grad_norm": 0.7598903179168701, + "learning_rate": 1.998103236564736e-05, + "loss": 1.372, + "mean_token_accuracy": 0.6497568885485331, + "num_tokens": 152524073.0, + "step": 913 + }, + { + "entropy": 1.7680945495764415, + "epoch": 0.10040921699486419, + "grad_norm": 0.7872158885002136, + "learning_rate": 1.9980928327803923e-05, + "loss": 1.5074, + "mean_token_accuracy": 0.6456826428572336, + "num_tokens": 152670080.0, + "step": 914 + }, + { + "entropy": 1.7560344437758129, + "epoch": 0.10051907390623713, + "grad_norm": 0.5883249640464783, + "learning_rate": 1.998082400571815e-05, + "loss": 1.3791, + "mean_token_accuracy": 0.64297587176164, + "num_tokens": 152845888.0, + "step": 915 + }, + { + "entropy": 1.7675274113814037, + "epoch": 0.10062893081761007, + "grad_norm": 0.7053165435791016, + "learning_rate": 1.9980719399393343e-05, + "loss": 1.4325, + "mean_token_accuracy": 0.6427458872397741, + "num_tokens": 152978290.0, + "step": 916 + }, + { + "entropy": 1.7399719854195912, + "epoch": 0.100738787728983, + "grad_norm": 0.7673996686935425, + "learning_rate": 1.9980614508832815e-05, + "loss": 1.425, + "mean_token_accuracy": 0.6594684372345606, + "num_tokens": 153149847.0, + "step": 917 + }, + { + "entropy": 1.762896368900935, + "epoch": 0.10084864464035594, + "grad_norm": 0.8135155439376831, + "learning_rate": 1.9980509334039885e-05, + "loss": 1.5537, + "mean_token_accuracy": 0.6251424799362818, + "num_tokens": 153411080.0, + "step": 918 + }, + { + "entropy": 1.817118614912033, + "epoch": 0.10095850155172888, + "grad_norm": 1.1829191446304321, + "learning_rate": 1.998040387501788e-05, + "loss": 1.4132, + "mean_token_accuracy": 0.6469329843918482, + "num_tokens": 153647902.0, + "step": 919 + }, + { + "entropy": 1.723706712325414, + "epoch": 0.1010683584631018, + "grad_norm": 0.6135080456733704, + "learning_rate": 1.998029813177014e-05, + "loss": 1.4235, + "mean_token_accuracy": 0.6411093920469284, + "num_tokens": 153887618.0, + "step": 920 + }, + { + "entropy": 1.7523813048998516, + "epoch": 0.10117821537447474, + "grad_norm": 0.6740756034851074, + "learning_rate": 1.998019210430001e-05, + "loss": 1.4545, + "mean_token_accuracy": 0.6468822509050369, + "num_tokens": 154040907.0, + "step": 921 + }, + { + "entropy": 1.7299329936504364, + "epoch": 0.10128807228584769, + "grad_norm": 0.670302152633667, + "learning_rate": 1.998008579261085e-05, + "loss": 1.4126, + "mean_token_accuracy": 0.6671505371729533, + "num_tokens": 154192950.0, + "step": 922 + }, + { + "entropy": 1.763699213663737, + "epoch": 0.10139792919722063, + "grad_norm": 0.803800106048584, + "learning_rate": 1.9979979196706015e-05, + "loss": 1.478, + "mean_token_accuracy": 0.6557512432336807, + "num_tokens": 154343446.0, + "step": 923 + }, + { + "entropy": 1.7053393324216206, + "epoch": 0.10150778610859355, + "grad_norm": 0.9171890020370483, + "learning_rate": 1.997987231658889e-05, + "loss": 1.4155, + "mean_token_accuracy": 0.6599433819452921, + "num_tokens": 154542170.0, + "step": 924 + }, + { + "entropy": 1.8018841644128163, + "epoch": 0.1016176430199665, + "grad_norm": 0.7261690497398376, + "learning_rate": 1.997976515226285e-05, + "loss": 1.3835, + "mean_token_accuracy": 0.6623082359631857, + "num_tokens": 154683041.0, + "step": 925 + }, + { + "entropy": 1.8124282856782277, + "epoch": 0.10172749993133944, + "grad_norm": 0.8906520009040833, + "learning_rate": 1.9979657703731293e-05, + "loss": 1.424, + "mean_token_accuracy": 0.6509944200515747, + "num_tokens": 154823935.0, + "step": 926 + }, + { + "entropy": 1.680219441652298, + "epoch": 0.10183735684271236, + "grad_norm": 0.7079525589942932, + "learning_rate": 1.9979549970997613e-05, + "loss": 1.35, + "mean_token_accuracy": 0.6591299722592036, + "num_tokens": 154979227.0, + "step": 927 + }, + { + "entropy": 1.7787267863750458, + "epoch": 0.1019472137540853, + "grad_norm": 0.8107025623321533, + "learning_rate": 1.9979441954065222e-05, + "loss": 1.3122, + "mean_token_accuracy": 0.6689551870028178, + "num_tokens": 155096447.0, + "step": 928 + }, + { + "entropy": 1.70075523853302, + "epoch": 0.10205707066545824, + "grad_norm": 0.642701268196106, + "learning_rate": 1.997933365293754e-05, + "loss": 1.4148, + "mean_token_accuracy": 0.6453291227420171, + "num_tokens": 155272639.0, + "step": 929 + }, + { + "entropy": 1.778489778439204, + "epoch": 0.10216692757683117, + "grad_norm": 0.7005394101142883, + "learning_rate": 1.9979225067617995e-05, + "loss": 1.3191, + "mean_token_accuracy": 0.662499725818634, + "num_tokens": 155389573.0, + "step": 930 + }, + { + "entropy": 1.7405783832073212, + "epoch": 0.10227678448820411, + "grad_norm": 0.7797371745109558, + "learning_rate": 1.9979116198110022e-05, + "loss": 1.4646, + "mean_token_accuracy": 0.6487743357817332, + "num_tokens": 155582478.0, + "step": 931 + }, + { + "entropy": 1.8176141182581584, + "epoch": 0.10238664139957705, + "grad_norm": 0.6615299582481384, + "learning_rate": 1.9979007044417068e-05, + "loss": 1.5006, + "mean_token_accuracy": 0.6347943594058355, + "num_tokens": 155734907.0, + "step": 932 + }, + { + "entropy": 1.801711489756902, + "epoch": 0.10249649831095, + "grad_norm": 0.9806280136108398, + "learning_rate": 1.9978897606542585e-05, + "loss": 1.532, + "mean_token_accuracy": 0.636087437470754, + "num_tokens": 155891281.0, + "step": 933 + }, + { + "entropy": 1.719766726096471, + "epoch": 0.10260635522232292, + "grad_norm": 0.6877168416976929, + "learning_rate": 1.9978787884490042e-05, + "loss": 1.4065, + "mean_token_accuracy": 0.6426805555820465, + "num_tokens": 156096557.0, + "step": 934 + }, + { + "entropy": 1.808685193459193, + "epoch": 0.10271621213369586, + "grad_norm": 0.8238107562065125, + "learning_rate": 1.997867787826291e-05, + "loss": 1.2805, + "mean_token_accuracy": 0.6664846589167913, + "num_tokens": 156197634.0, + "step": 935 + }, + { + "entropy": 1.6944943865140278, + "epoch": 0.1028260690450688, + "grad_norm": 0.5987890958786011, + "learning_rate": 1.9978567587864662e-05, + "loss": 1.2886, + "mean_token_accuracy": 0.6632740994294485, + "num_tokens": 156353948.0, + "step": 936 + }, + { + "entropy": 1.7655479113260906, + "epoch": 0.10293592595644173, + "grad_norm": 0.6797613501548767, + "learning_rate": 1.99784570132988e-05, + "loss": 1.4642, + "mean_token_accuracy": 0.6423184126615524, + "num_tokens": 156535094.0, + "step": 937 + }, + { + "entropy": 1.7759600281715393, + "epoch": 0.10304578286781467, + "grad_norm": 0.7226538062095642, + "learning_rate": 1.9978346154568816e-05, + "loss": 1.4255, + "mean_token_accuracy": 0.6578917105992635, + "num_tokens": 156662107.0, + "step": 938 + }, + { + "entropy": 1.7849325835704803, + "epoch": 0.10315563977918761, + "grad_norm": 0.7838888764381409, + "learning_rate": 1.9978235011678227e-05, + "loss": 1.457, + "mean_token_accuracy": 0.6387642721335093, + "num_tokens": 156816532.0, + "step": 939 + }, + { + "entropy": 1.8332215547561646, + "epoch": 0.10326549669056055, + "grad_norm": 1.3963543176651, + "learning_rate": 1.9978123584630543e-05, + "loss": 1.5022, + "mean_token_accuracy": 0.6504113674163818, + "num_tokens": 157001513.0, + "step": 940 + }, + { + "entropy": 1.7760482331116993, + "epoch": 0.10337535360193348, + "grad_norm": 0.7974473237991333, + "learning_rate": 1.9978011873429293e-05, + "loss": 1.5326, + "mean_token_accuracy": 0.6438284814357758, + "num_tokens": 157140942.0, + "step": 941 + }, + { + "entropy": 1.739799976348877, + "epoch": 0.10348521051330642, + "grad_norm": 0.6732959151268005, + "learning_rate": 1.9977899878078014e-05, + "loss": 1.3565, + "mean_token_accuracy": 0.6677434494098028, + "num_tokens": 157322095.0, + "step": 942 + }, + { + "entropy": 1.7245545585950215, + "epoch": 0.10359506742467936, + "grad_norm": 0.6832246780395508, + "learning_rate": 1.997778759858025e-05, + "loss": 1.4486, + "mean_token_accuracy": 0.6488750129938126, + "num_tokens": 157489321.0, + "step": 943 + }, + { + "entropy": 1.7096454600493114, + "epoch": 0.10370492433605229, + "grad_norm": 0.669666588306427, + "learning_rate": 1.9977675034939552e-05, + "loss": 1.398, + "mean_token_accuracy": 0.6563703020413717, + "num_tokens": 157648478.0, + "step": 944 + }, + { + "entropy": 1.7183943092823029, + "epoch": 0.10381478124742523, + "grad_norm": 0.6543197631835938, + "learning_rate": 1.9977562187159485e-05, + "loss": 1.5301, + "mean_token_accuracy": 0.6386249661445618, + "num_tokens": 157843823.0, + "step": 945 + }, + { + "entropy": 1.8044803241888683, + "epoch": 0.10392463815879817, + "grad_norm": 0.8148881793022156, + "learning_rate": 1.997744905524362e-05, + "loss": 1.3916, + "mean_token_accuracy": 0.6454024910926819, + "num_tokens": 157990593.0, + "step": 946 + }, + { + "entropy": 1.6993337571620941, + "epoch": 0.1040344950701711, + "grad_norm": 0.6236430406570435, + "learning_rate": 1.997733563919554e-05, + "loss": 1.3219, + "mean_token_accuracy": 0.6601608147223791, + "num_tokens": 158132629.0, + "step": 947 + }, + { + "entropy": 1.8001770675182343, + "epoch": 0.10414435198154404, + "grad_norm": 0.884178638458252, + "learning_rate": 1.9977221939018828e-05, + "loss": 1.2416, + "mean_token_accuracy": 0.6784195502599081, + "num_tokens": 158246943.0, + "step": 948 + }, + { + "entropy": 1.6941777964433034, + "epoch": 0.10425420889291698, + "grad_norm": 0.6711726784706116, + "learning_rate": 1.997710795471709e-05, + "loss": 1.3108, + "mean_token_accuracy": 0.6635520309209824, + "num_tokens": 158415493.0, + "step": 949 + }, + { + "entropy": 1.7117989857991536, + "epoch": 0.10436406580428992, + "grad_norm": 0.6808717846870422, + "learning_rate": 1.997699368629393e-05, + "loss": 1.2102, + "mean_token_accuracy": 0.6949316064516703, + "num_tokens": 158540142.0, + "step": 950 + }, + { + "entropy": 1.6830700536568959, + "epoch": 0.10447392271566285, + "grad_norm": 0.5724490880966187, + "learning_rate": 1.9976879133752968e-05, + "loss": 1.3332, + "mean_token_accuracy": 0.6712360580762228, + "num_tokens": 158724951.0, + "step": 951 + }, + { + "entropy": 1.7850287755330403, + "epoch": 0.10458377962703579, + "grad_norm": 0.6788071393966675, + "learning_rate": 1.9976764297097822e-05, + "loss": 1.5117, + "mean_token_accuracy": 0.6374179919560751, + "num_tokens": 158927317.0, + "step": 952 + }, + { + "entropy": 1.7926728427410126, + "epoch": 0.10469363653840873, + "grad_norm": 0.6966988444328308, + "learning_rate": 1.997664917633213e-05, + "loss": 1.4233, + "mean_token_accuracy": 0.6491448630889257, + "num_tokens": 159079596.0, + "step": 953 + }, + { + "entropy": 1.8218385577201843, + "epoch": 0.10480349344978165, + "grad_norm": 0.6010218262672424, + "learning_rate": 1.997653377145954e-05, + "loss": 1.5112, + "mean_token_accuracy": 0.6297382464011511, + "num_tokens": 159274755.0, + "step": 954 + }, + { + "entropy": 1.7728229264418285, + "epoch": 0.1049133503611546, + "grad_norm": 0.7513667345046997, + "learning_rate": 1.9976418082483702e-05, + "loss": 1.4339, + "mean_token_accuracy": 0.6446366707483927, + "num_tokens": 159482155.0, + "step": 955 + }, + { + "entropy": 1.7381539046764374, + "epoch": 0.10502320727252754, + "grad_norm": 0.7744698524475098, + "learning_rate": 1.9976302109408274e-05, + "loss": 1.2582, + "mean_token_accuracy": 0.6699808637301127, + "num_tokens": 159597958.0, + "step": 956 + }, + { + "entropy": 1.7696404854456584, + "epoch": 0.10513306418390048, + "grad_norm": 0.7949146032333374, + "learning_rate": 1.997618585223693e-05, + "loss": 1.5213, + "mean_token_accuracy": 0.644510825475057, + "num_tokens": 159798577.0, + "step": 957 + }, + { + "entropy": 1.7655962506930034, + "epoch": 0.1052429210952734, + "grad_norm": 0.8974981904029846, + "learning_rate": 1.9976069310973346e-05, + "loss": 1.5157, + "mean_token_accuracy": 0.6608059157927831, + "num_tokens": 159983233.0, + "step": 958 + }, + { + "entropy": 1.8399652739365895, + "epoch": 0.10535277800664634, + "grad_norm": 0.7629010081291199, + "learning_rate": 1.9975952485621216e-05, + "loss": 1.5352, + "mean_token_accuracy": 0.6394019822279612, + "num_tokens": 160163249.0, + "step": 959 + }, + { + "entropy": 1.791842778523763, + "epoch": 0.10546263491801929, + "grad_norm": 0.6020023822784424, + "learning_rate": 1.9975835376184234e-05, + "loss": 1.5612, + "mean_token_accuracy": 0.636442189415296, + "num_tokens": 160345525.0, + "step": 960 + }, + { + "entropy": 1.7129618525505066, + "epoch": 0.10557249182939221, + "grad_norm": 0.6331304311752319, + "learning_rate": 1.9975717982666106e-05, + "loss": 1.3834, + "mean_token_accuracy": 0.6419539799292883, + "num_tokens": 160529851.0, + "step": 961 + }, + { + "entropy": 1.719476044178009, + "epoch": 0.10568234874076515, + "grad_norm": 0.6035734415054321, + "learning_rate": 1.997560030507055e-05, + "loss": 1.3994, + "mean_token_accuracy": 0.6515898952881495, + "num_tokens": 160716656.0, + "step": 962 + }, + { + "entropy": 1.7441074351469676, + "epoch": 0.1057922056521381, + "grad_norm": 0.7362382411956787, + "learning_rate": 1.9975482343401288e-05, + "loss": 1.4038, + "mean_token_accuracy": 0.6557846516370773, + "num_tokens": 160877936.0, + "step": 963 + }, + { + "entropy": 1.7481865584850311, + "epoch": 0.10590206256351102, + "grad_norm": 0.7199084162712097, + "learning_rate": 1.9975364097662052e-05, + "loss": 1.4177, + "mean_token_accuracy": 0.6450687150160471, + "num_tokens": 161095705.0, + "step": 964 + }, + { + "entropy": 1.8243679702281952, + "epoch": 0.10601191947488396, + "grad_norm": 0.7219046950340271, + "learning_rate": 1.9975245567856588e-05, + "loss": 1.3518, + "mean_token_accuracy": 0.6582407156626383, + "num_tokens": 161210783.0, + "step": 965 + }, + { + "entropy": 1.7808333535989125, + "epoch": 0.1061217763862569, + "grad_norm": 2.4213051795959473, + "learning_rate": 1.9975126753988647e-05, + "loss": 1.1241, + "mean_token_accuracy": 0.6723516434431076, + "num_tokens": 161360356.0, + "step": 966 + }, + { + "entropy": 1.7639015515645344, + "epoch": 0.10623163329762984, + "grad_norm": 0.7218215465545654, + "learning_rate": 1.997500765606199e-05, + "loss": 1.3598, + "mean_token_accuracy": 0.6689661492904028, + "num_tokens": 161492497.0, + "step": 967 + }, + { + "entropy": 1.7146398623784382, + "epoch": 0.10634149020900277, + "grad_norm": 0.6931177377700806, + "learning_rate": 1.997488827408038e-05, + "loss": 1.253, + "mean_token_accuracy": 0.6601031124591827, + "num_tokens": 161610533.0, + "step": 968 + }, + { + "entropy": 1.8045812646547954, + "epoch": 0.10645134712037571, + "grad_norm": 0.7326215505599976, + "learning_rate": 1.997476860804761e-05, + "loss": 1.417, + "mean_token_accuracy": 0.6494764337937037, + "num_tokens": 161748491.0, + "step": 969 + }, + { + "entropy": 1.8294326663017273, + "epoch": 0.10656120403174865, + "grad_norm": 1.0038849115371704, + "learning_rate": 1.9974648657967446e-05, + "loss": 1.6506, + "mean_token_accuracy": 0.6360284189383189, + "num_tokens": 161920941.0, + "step": 970 + }, + { + "entropy": 1.7476938267548878, + "epoch": 0.10667106094312158, + "grad_norm": 0.6628261804580688, + "learning_rate": 1.9974528423843703e-05, + "loss": 1.3734, + "mean_token_accuracy": 0.6495839109023412, + "num_tokens": 162065785.0, + "step": 971 + }, + { + "entropy": 1.724196970462799, + "epoch": 0.10678091785449452, + "grad_norm": 0.7884043455123901, + "learning_rate": 1.9974407905680176e-05, + "loss": 1.2882, + "mean_token_accuracy": 0.6736412594715754, + "num_tokens": 162210428.0, + "step": 972 + }, + { + "entropy": 1.7328240772088368, + "epoch": 0.10689077476586746, + "grad_norm": 0.7030888795852661, + "learning_rate": 1.9974287103480684e-05, + "loss": 1.4312, + "mean_token_accuracy": 0.6504674156506857, + "num_tokens": 162358621.0, + "step": 973 + }, + { + "entropy": 1.7429528137048085, + "epoch": 0.10700063167724039, + "grad_norm": 0.782061755657196, + "learning_rate": 1.997416601724905e-05, + "loss": 1.2338, + "mean_token_accuracy": 0.6769704719384512, + "num_tokens": 162479846.0, + "step": 974 + }, + { + "entropy": 1.7454247772693634, + "epoch": 0.10711048858861333, + "grad_norm": 0.6557988524436951, + "learning_rate": 1.9974044646989104e-05, + "loss": 1.4533, + "mean_token_accuracy": 0.6591964761416117, + "num_tokens": 162650080.0, + "step": 975 + }, + { + "entropy": 1.7760846018791199, + "epoch": 0.10722034549998627, + "grad_norm": 0.6828281879425049, + "learning_rate": 1.997392299270469e-05, + "loss": 1.407, + "mean_token_accuracy": 0.651309072971344, + "num_tokens": 162793424.0, + "step": 976 + }, + { + "entropy": 1.6812183062235515, + "epoch": 0.10733020241135921, + "grad_norm": 0.6630516052246094, + "learning_rate": 1.997380105439966e-05, + "loss": 1.3284, + "mean_token_accuracy": 0.6731794824202856, + "num_tokens": 162959826.0, + "step": 977 + }, + { + "entropy": 1.6819258431593578, + "epoch": 0.10744005932273214, + "grad_norm": 0.7564113736152649, + "learning_rate": 1.9973678832077864e-05, + "loss": 1.3177, + "mean_token_accuracy": 0.6723869691292444, + "num_tokens": 163115903.0, + "step": 978 + }, + { + "entropy": 1.7295123835404713, + "epoch": 0.10754991623410508, + "grad_norm": 0.7460121512413025, + "learning_rate": 1.997355632574318e-05, + "loss": 1.3707, + "mean_token_accuracy": 0.671826089421908, + "num_tokens": 163258000.0, + "step": 979 + }, + { + "entropy": 1.739086627960205, + "epoch": 0.10765977314547802, + "grad_norm": 0.7044259905815125, + "learning_rate": 1.997343353539948e-05, + "loss": 1.243, + "mean_token_accuracy": 0.670919140179952, + "num_tokens": 163390903.0, + "step": 980 + }, + { + "entropy": 1.7413827578226726, + "epoch": 0.10776963005685095, + "grad_norm": 0.6770658493041992, + "learning_rate": 1.9973310461050656e-05, + "loss": 1.5073, + "mean_token_accuracy": 0.6495876014232635, + "num_tokens": 163623202.0, + "step": 981 + }, + { + "entropy": 1.7152122557163239, + "epoch": 0.10787948696822389, + "grad_norm": 0.6223786473274231, + "learning_rate": 1.99731871027006e-05, + "loss": 1.3162, + "mean_token_accuracy": 0.6673212250073751, + "num_tokens": 163747993.0, + "step": 982 + }, + { + "entropy": 1.76864688595136, + "epoch": 0.10798934387959683, + "grad_norm": 0.6262992024421692, + "learning_rate": 1.9973063460353207e-05, + "loss": 1.4522, + "mean_token_accuracy": 0.645324652393659, + "num_tokens": 163992793.0, + "step": 983 + }, + { + "entropy": 1.76500407854716, + "epoch": 0.10809920079096977, + "grad_norm": 0.5834012627601624, + "learning_rate": 1.997293953401241e-05, + "loss": 1.4136, + "mean_token_accuracy": 0.6545668741067251, + "num_tokens": 164176810.0, + "step": 984 + }, + { + "entropy": 1.7618548075358074, + "epoch": 0.1082090577023427, + "grad_norm": 0.6489772200584412, + "learning_rate": 1.997281532368211e-05, + "loss": 1.3492, + "mean_token_accuracy": 0.65315709511439, + "num_tokens": 164342656.0, + "step": 985 + }, + { + "entropy": 1.7967805167039235, + "epoch": 0.10831891461371564, + "grad_norm": 0.8636541962623596, + "learning_rate": 1.9972690829366254e-05, + "loss": 1.4586, + "mean_token_accuracy": 0.6364806840817133, + "num_tokens": 164534044.0, + "step": 986 + }, + { + "entropy": 1.79604172706604, + "epoch": 0.10842877152508858, + "grad_norm": 0.6198572516441345, + "learning_rate": 1.9972566051068775e-05, + "loss": 1.4286, + "mean_token_accuracy": 0.6450680047273636, + "num_tokens": 164692093.0, + "step": 987 + }, + { + "entropy": 1.74147434035937, + "epoch": 0.1085386284364615, + "grad_norm": 0.9842249155044556, + "learning_rate": 1.9972440988793623e-05, + "loss": 1.4988, + "mean_token_accuracy": 0.6356190939744314, + "num_tokens": 164968178.0, + "step": 988 + }, + { + "entropy": 1.699428141117096, + "epoch": 0.10864848534783444, + "grad_norm": 0.6791302561759949, + "learning_rate": 1.997231564254476e-05, + "loss": 1.3922, + "mean_token_accuracy": 0.6636618773142496, + "num_tokens": 165128695.0, + "step": 989 + }, + { + "entropy": 1.832966188589732, + "epoch": 0.10875834225920739, + "grad_norm": 0.7480493187904358, + "learning_rate": 1.9972190012326146e-05, + "loss": 1.505, + "mean_token_accuracy": 0.6330814162890116, + "num_tokens": 165312932.0, + "step": 990 + }, + { + "entropy": 1.7748298744360607, + "epoch": 0.10886819917058031, + "grad_norm": 0.8279789090156555, + "learning_rate": 1.9972064098141763e-05, + "loss": 1.5419, + "mean_token_accuracy": 0.6653905312220255, + "num_tokens": 165499258.0, + "step": 991 + }, + { + "entropy": 1.750197817881902, + "epoch": 0.10897805608195325, + "grad_norm": 0.8178195953369141, + "learning_rate": 1.9971937899995595e-05, + "loss": 1.2939, + "mean_token_accuracy": 0.6684920241435369, + "num_tokens": 165641236.0, + "step": 992 + }, + { + "entropy": 1.6943954626719158, + "epoch": 0.1090879129933262, + "grad_norm": 0.6711166501045227, + "learning_rate": 1.9971811417891634e-05, + "loss": 1.2783, + "mean_token_accuracy": 0.6719668805599213, + "num_tokens": 165795977.0, + "step": 993 + }, + { + "entropy": 1.7269932230313618, + "epoch": 0.10919776990469914, + "grad_norm": 0.6310712099075317, + "learning_rate": 1.9971684651833886e-05, + "loss": 1.3686, + "mean_token_accuracy": 0.6621618568897247, + "num_tokens": 165955938.0, + "step": 994 + }, + { + "entropy": 1.7690664927164714, + "epoch": 0.10930762681607206, + "grad_norm": 0.7837558388710022, + "learning_rate": 1.9971557601826358e-05, + "loss": 1.3947, + "mean_token_accuracy": 0.6452465703090032, + "num_tokens": 166087991.0, + "step": 995 + }, + { + "entropy": 1.7119774222373962, + "epoch": 0.109417483727445, + "grad_norm": 0.5863208770751953, + "learning_rate": 1.9971430267873077e-05, + "loss": 1.535, + "mean_token_accuracy": 0.6391033828258514, + "num_tokens": 166281825.0, + "step": 996 + }, + { + "entropy": 1.748874545097351, + "epoch": 0.10952734063881794, + "grad_norm": 0.5901543498039246, + "learning_rate": 1.997130264997807e-05, + "loss": 1.3562, + "mean_token_accuracy": 0.6514041026433309, + "num_tokens": 166481277.0, + "step": 997 + }, + { + "entropy": 1.7872538765271504, + "epoch": 0.10963719755019087, + "grad_norm": 0.7273993492126465, + "learning_rate": 1.9971174748145376e-05, + "loss": 1.3969, + "mean_token_accuracy": 0.6459978073835373, + "num_tokens": 166638635.0, + "step": 998 + }, + { + "entropy": 1.7589583992958069, + "epoch": 0.10974705446156381, + "grad_norm": 0.7236528992652893, + "learning_rate": 1.997104656237905e-05, + "loss": 1.443, + "mean_token_accuracy": 0.6443432023127874, + "num_tokens": 166842013.0, + "step": 999 + }, + { + "entropy": 1.716710348924001, + "epoch": 0.10985691137293675, + "grad_norm": 0.6968439817428589, + "learning_rate": 1.9970918092683133e-05, + "loss": 1.3732, + "mean_token_accuracy": 0.6586572974920273, + "num_tokens": 166983909.0, + "step": 1000 + }, + { + "entropy": 1.7281900147596996, + "epoch": 0.1099667682843097, + "grad_norm": 0.7102954983711243, + "learning_rate": 1.9970789339061707e-05, + "loss": 1.4898, + "mean_token_accuracy": 0.6502237866322199, + "num_tokens": 167207507.0, + "step": 1001 + }, + { + "entropy": 1.7563473085562389, + "epoch": 0.11007662519568262, + "grad_norm": 0.687754213809967, + "learning_rate": 1.997066030151884e-05, + "loss": 1.4765, + "mean_token_accuracy": 0.6534829437732697, + "num_tokens": 167361838.0, + "step": 1002 + }, + { + "entropy": 1.7929190198580425, + "epoch": 0.11018648210705556, + "grad_norm": 0.671809732913971, + "learning_rate": 1.9970530980058614e-05, + "loss": 1.518, + "mean_token_accuracy": 0.6443095902601877, + "num_tokens": 167562959.0, + "step": 1003 + }, + { + "entropy": 1.7678324580192566, + "epoch": 0.1102963390184285, + "grad_norm": 0.602029025554657, + "learning_rate": 1.9970401374685126e-05, + "loss": 1.4024, + "mean_token_accuracy": 0.6431177506844202, + "num_tokens": 167755692.0, + "step": 1004 + }, + { + "entropy": 1.7461299399534862, + "epoch": 0.11040619592980143, + "grad_norm": 0.564751386642456, + "learning_rate": 1.9970271485402478e-05, + "loss": 1.458, + "mean_token_accuracy": 0.6464813202619553, + "num_tokens": 167945870.0, + "step": 1005 + }, + { + "entropy": 1.7465039889017742, + "epoch": 0.11051605284117437, + "grad_norm": 0.6950850486755371, + "learning_rate": 1.9970141312214778e-05, + "loss": 1.3697, + "mean_token_accuracy": 0.654125397404035, + "num_tokens": 168123031.0, + "step": 1006 + }, + { + "entropy": 1.7230841716130574, + "epoch": 0.11062590975254731, + "grad_norm": 0.5795581340789795, + "learning_rate": 1.9970010855126148e-05, + "loss": 1.3064, + "mean_token_accuracy": 0.6747977683941523, + "num_tokens": 168344827.0, + "step": 1007 + }, + { + "entropy": 1.768018126487732, + "epoch": 0.11073576666392024, + "grad_norm": 0.727882444858551, + "learning_rate": 1.9969880114140717e-05, + "loss": 1.3756, + "mean_token_accuracy": 0.6502227435509363, + "num_tokens": 168493595.0, + "step": 1008 + }, + { + "entropy": 1.7701697448889415, + "epoch": 0.11084562357529318, + "grad_norm": 0.7508238554000854, + "learning_rate": 1.9969749089262623e-05, + "loss": 1.2765, + "mean_token_accuracy": 0.6652316004037857, + "num_tokens": 168603351.0, + "step": 1009 + }, + { + "entropy": 1.7724807858467102, + "epoch": 0.11095548048666612, + "grad_norm": 0.6007605791091919, + "learning_rate": 1.9969617780496008e-05, + "loss": 1.3051, + "mean_token_accuracy": 0.666801263888677, + "num_tokens": 168780284.0, + "step": 1010 + }, + { + "entropy": 1.7377927700678508, + "epoch": 0.11106533739803906, + "grad_norm": 0.7759512662887573, + "learning_rate": 1.9969486187845037e-05, + "loss": 1.4752, + "mean_token_accuracy": 0.6457094500462214, + "num_tokens": 168976335.0, + "step": 1011 + }, + { + "entropy": 1.7481233775615692, + "epoch": 0.11117519430941199, + "grad_norm": 0.6538453698158264, + "learning_rate": 1.9969354311313868e-05, + "loss": 1.4398, + "mean_token_accuracy": 0.646173338095347, + "num_tokens": 169130001.0, + "step": 1012 + }, + { + "entropy": 1.7797774871190388, + "epoch": 0.11128505122078493, + "grad_norm": 0.8638194799423218, + "learning_rate": 1.9969222150906677e-05, + "loss": 1.4131, + "mean_token_accuracy": 0.6468067765235901, + "num_tokens": 169258271.0, + "step": 1013 + }, + { + "entropy": 1.7161829272905986, + "epoch": 0.11139490813215787, + "grad_norm": 0.8480156064033508, + "learning_rate": 1.9969089706627646e-05, + "loss": 1.351, + "mean_token_accuracy": 0.668408066034317, + "num_tokens": 169448988.0, + "step": 1014 + }, + { + "entropy": 1.721673975388209, + "epoch": 0.1115047650435308, + "grad_norm": 0.6855505108833313, + "learning_rate": 1.996895697848097e-05, + "loss": 1.4182, + "mean_token_accuracy": 0.6560923904180527, + "num_tokens": 169617296.0, + "step": 1015 + }, + { + "entropy": 1.754950036605199, + "epoch": 0.11161462195490374, + "grad_norm": 0.5711190104484558, + "learning_rate": 1.9968823966470844e-05, + "loss": 1.5817, + "mean_token_accuracy": 0.6348544011513392, + "num_tokens": 169828139.0, + "step": 1016 + }, + { + "entropy": 1.8216430644194286, + "epoch": 0.11172447886627668, + "grad_norm": 0.7866774201393127, + "learning_rate": 1.9968690670601483e-05, + "loss": 1.4799, + "mean_token_accuracy": 0.6281097233295441, + "num_tokens": 169993288.0, + "step": 1017 + }, + { + "entropy": 1.7923205494880676, + "epoch": 0.11183433577764962, + "grad_norm": 0.7133738398551941, + "learning_rate": 1.99685570908771e-05, + "loss": 1.5161, + "mean_token_accuracy": 0.6534708514809608, + "num_tokens": 170185324.0, + "step": 1018 + }, + { + "entropy": 1.7083572149276733, + "epoch": 0.11194419268902255, + "grad_norm": 0.6993825435638428, + "learning_rate": 1.9968423227301928e-05, + "loss": 1.3804, + "mean_token_accuracy": 0.6609247972567877, + "num_tokens": 170347764.0, + "step": 1019 + }, + { + "entropy": 1.7944194575150807, + "epoch": 0.11205404960039549, + "grad_norm": 0.6917293071746826, + "learning_rate": 1.9968289079880204e-05, + "loss": 1.4405, + "mean_token_accuracy": 0.6581203043460846, + "num_tokens": 170498990.0, + "step": 1020 + }, + { + "entropy": 1.7612830102443695, + "epoch": 0.11216390651176843, + "grad_norm": 0.6517383456230164, + "learning_rate": 1.9968154648616174e-05, + "loss": 1.4844, + "mean_token_accuracy": 0.6349473794301351, + "num_tokens": 170659958.0, + "step": 1021 + }, + { + "entropy": 1.7332187394301097, + "epoch": 0.11227376342314135, + "grad_norm": 0.7480264902114868, + "learning_rate": 1.996801993351408e-05, + "loss": 1.3936, + "mean_token_accuracy": 0.6647941619157791, + "num_tokens": 170819653.0, + "step": 1022 + }, + { + "entropy": 1.7455625931421916, + "epoch": 0.1123836203345143, + "grad_norm": 0.6116794943809509, + "learning_rate": 1.996788493457821e-05, + "loss": 1.4525, + "mean_token_accuracy": 0.6416516304016113, + "num_tokens": 170994301.0, + "step": 1023 + }, + { + "entropy": 1.731279730796814, + "epoch": 0.11249347724588724, + "grad_norm": 0.6867764592170715, + "learning_rate": 1.9967749651812815e-05, + "loss": 1.321, + "mean_token_accuracy": 0.6761416296164194, + "num_tokens": 171174374.0, + "step": 1024 + }, + { + "entropy": 1.7446598211924236, + "epoch": 0.11260333415726016, + "grad_norm": 0.7553854584693909, + "learning_rate": 1.9967614085222187e-05, + "loss": 1.5306, + "mean_token_accuracy": 0.6366226524114609, + "num_tokens": 171329470.0, + "step": 1025 + }, + { + "entropy": 1.7547922631104786, + "epoch": 0.1127131910686331, + "grad_norm": 0.7220216393470764, + "learning_rate": 1.996747823481061e-05, + "loss": 1.3455, + "mean_token_accuracy": 0.6626160144805908, + "num_tokens": 171493059.0, + "step": 1026 + }, + { + "entropy": 1.690296709537506, + "epoch": 0.11282304798000604, + "grad_norm": 0.6687548160552979, + "learning_rate": 1.9967342100582394e-05, + "loss": 1.3581, + "mean_token_accuracy": 0.6656670471032461, + "num_tokens": 171646415.0, + "step": 1027 + }, + { + "entropy": 1.7647127310434978, + "epoch": 0.11293290489137899, + "grad_norm": 0.637204110622406, + "learning_rate": 1.9967205682541834e-05, + "loss": 1.4116, + "mean_token_accuracy": 0.6447415898243586, + "num_tokens": 171828506.0, + "step": 1028 + }, + { + "entropy": 1.7863658169905345, + "epoch": 0.11304276180275191, + "grad_norm": 0.6792446374893188, + "learning_rate": 1.9967068980693262e-05, + "loss": 1.3788, + "mean_token_accuracy": 0.6593380073706309, + "num_tokens": 171976616.0, + "step": 1029 + }, + { + "entropy": 1.7621726393699646, + "epoch": 0.11315261871412485, + "grad_norm": 0.7082951068878174, + "learning_rate": 1.9966931995040992e-05, + "loss": 1.4682, + "mean_token_accuracy": 0.6501838515202204, + "num_tokens": 172158583.0, + "step": 1030 + }, + { + "entropy": 1.8039735853672028, + "epoch": 0.1132624756254978, + "grad_norm": 0.6356441974639893, + "learning_rate": 1.9966794725589368e-05, + "loss": 1.4936, + "mean_token_accuracy": 0.6384439915418625, + "num_tokens": 172320260.0, + "step": 1031 + }, + { + "entropy": 1.731901486714681, + "epoch": 0.11337233253687072, + "grad_norm": 0.7175825238227844, + "learning_rate": 1.9966657172342733e-05, + "loss": 1.2825, + "mean_token_accuracy": 0.6734850654999415, + "num_tokens": 172454571.0, + "step": 1032 + }, + { + "entropy": 1.7200752000013988, + "epoch": 0.11348218944824366, + "grad_norm": 0.9528830647468567, + "learning_rate": 1.9966519335305434e-05, + "loss": 1.3735, + "mean_token_accuracy": 0.653776541352272, + "num_tokens": 172605672.0, + "step": 1033 + }, + { + "entropy": 1.7589248915513356, + "epoch": 0.1135920463596166, + "grad_norm": 0.7243776917457581, + "learning_rate": 1.996638121448184e-05, + "loss": 1.4101, + "mean_token_accuracy": 0.6624555786450704, + "num_tokens": 172817225.0, + "step": 1034 + }, + { + "entropy": 1.7110880215962727, + "epoch": 0.11370190327098953, + "grad_norm": 0.7603755593299866, + "learning_rate": 1.9966242809876323e-05, + "loss": 1.3095, + "mean_token_accuracy": 0.6720947672923406, + "num_tokens": 172991993.0, + "step": 1035 + }, + { + "entropy": 1.70220681031545, + "epoch": 0.11381176018236247, + "grad_norm": 0.6258305311203003, + "learning_rate": 1.9966104121493262e-05, + "loss": 1.4045, + "mean_token_accuracy": 0.6527692576249441, + "num_tokens": 173175089.0, + "step": 1036 + }, + { + "entropy": 1.7945917348066966, + "epoch": 0.11392161709373541, + "grad_norm": 0.7107659578323364, + "learning_rate": 1.9965965149337044e-05, + "loss": 1.4265, + "mean_token_accuracy": 0.6489067127307256, + "num_tokens": 173367208.0, + "step": 1037 + }, + { + "entropy": 1.7529179851214092, + "epoch": 0.11403147400510835, + "grad_norm": 0.6977424621582031, + "learning_rate": 1.9965825893412066e-05, + "loss": 1.4642, + "mean_token_accuracy": 0.6490010867516199, + "num_tokens": 173529488.0, + "step": 1038 + }, + { + "entropy": 1.7823486824830372, + "epoch": 0.11414133091648128, + "grad_norm": 0.7429659366607666, + "learning_rate": 1.9965686353722744e-05, + "loss": 1.4706, + "mean_token_accuracy": 0.6589999397595724, + "num_tokens": 173743382.0, + "step": 1039 + }, + { + "entropy": 1.7522972325483959, + "epoch": 0.11425118782785422, + "grad_norm": 0.6352205276489258, + "learning_rate": 1.9965546530273484e-05, + "loss": 1.4292, + "mean_token_accuracy": 0.6400622725486755, + "num_tokens": 173912436.0, + "step": 1040 + }, + { + "entropy": 1.7382917702198029, + "epoch": 0.11436104473922716, + "grad_norm": 0.7589994072914124, + "learning_rate": 1.9965406423068722e-05, + "loss": 1.3622, + "mean_token_accuracy": 0.6633311361074448, + "num_tokens": 174093358.0, + "step": 1041 + }, + { + "entropy": 1.7625388304392497, + "epoch": 0.11447090165060009, + "grad_norm": 0.6129192113876343, + "learning_rate": 1.9965266032112883e-05, + "loss": 1.4215, + "mean_token_accuracy": 0.6418607930342356, + "num_tokens": 174317864.0, + "step": 1042 + }, + { + "entropy": 1.6891125738620758, + "epoch": 0.11458075856197303, + "grad_norm": 0.7192829251289368, + "learning_rate": 1.9965125357410415e-05, + "loss": 1.437, + "mean_token_accuracy": 0.6587806989749273, + "num_tokens": 174456645.0, + "step": 1043 + }, + { + "entropy": 1.7558738390604656, + "epoch": 0.11469061547334597, + "grad_norm": 0.8326495289802551, + "learning_rate": 1.9964984398965768e-05, + "loss": 1.3175, + "mean_token_accuracy": 0.6619067142407099, + "num_tokens": 174574320.0, + "step": 1044 + }, + { + "entropy": 1.7502717077732086, + "epoch": 0.11480047238471891, + "grad_norm": 0.6890818476676941, + "learning_rate": 1.9964843156783406e-05, + "loss": 1.456, + "mean_token_accuracy": 0.6418903172016144, + "num_tokens": 174761804.0, + "step": 1045 + }, + { + "entropy": 1.7364663283030193, + "epoch": 0.11491032929609184, + "grad_norm": 0.6975388526916504, + "learning_rate": 1.99647016308678e-05, + "loss": 1.4514, + "mean_token_accuracy": 0.6475592801968256, + "num_tokens": 174937492.0, + "step": 1046 + }, + { + "entropy": 1.7429968516031902, + "epoch": 0.11502018620746478, + "grad_norm": 0.8240295052528381, + "learning_rate": 1.9964559821223423e-05, + "loss": 1.3563, + "mean_token_accuracy": 0.6688429315884908, + "num_tokens": 175073825.0, + "step": 1047 + }, + { + "entropy": 1.765154093503952, + "epoch": 0.11513004311883772, + "grad_norm": 0.742708146572113, + "learning_rate": 1.9964417727854766e-05, + "loss": 1.6561, + "mean_token_accuracy": 0.629709780216217, + "num_tokens": 175240948.0, + "step": 1048 + }, + { + "entropy": 1.7909338076909382, + "epoch": 0.11523990003021065, + "grad_norm": 10.230766296386719, + "learning_rate": 1.9964275350766328e-05, + "loss": 1.2824, + "mean_token_accuracy": 0.6872695883115133, + "num_tokens": 175389639.0, + "step": 1049 + }, + { + "entropy": 1.7785474856694539, + "epoch": 0.11534975694158359, + "grad_norm": 0.7796043157577515, + "learning_rate": 1.996413268996262e-05, + "loss": 1.3464, + "mean_token_accuracy": 0.6501271277666092, + "num_tokens": 175507065.0, + "step": 1050 + }, + { + "entropy": 1.70500651995341, + "epoch": 0.11545961385295653, + "grad_norm": 0.7728234529495239, + "learning_rate": 1.9963989745448148e-05, + "loss": 1.314, + "mean_token_accuracy": 0.6640622218449911, + "num_tokens": 175645190.0, + "step": 1051 + }, + { + "entropy": 1.778702090183894, + "epoch": 0.11556947076432945, + "grad_norm": 0.7463306784629822, + "learning_rate": 1.9963846517227438e-05, + "loss": 1.4866, + "mean_token_accuracy": 0.6476029555002848, + "num_tokens": 175787142.0, + "step": 1052 + }, + { + "entropy": 1.7820600767930348, + "epoch": 0.1156793276757024, + "grad_norm": 0.8117401003837585, + "learning_rate": 1.9963703005305026e-05, + "loss": 1.6304, + "mean_token_accuracy": 0.6415733198324839, + "num_tokens": 175978725.0, + "step": 1053 + }, + { + "entropy": 1.767964760462443, + "epoch": 0.11578918458707534, + "grad_norm": 0.7086589336395264, + "learning_rate": 1.9963559209685453e-05, + "loss": 1.488, + "mean_token_accuracy": 0.6404663970073065, + "num_tokens": 176151940.0, + "step": 1054 + }, + { + "entropy": 1.7119649251302083, + "epoch": 0.11589904149844828, + "grad_norm": 0.6427537798881531, + "learning_rate": 1.9963415130373272e-05, + "loss": 1.4038, + "mean_token_accuracy": 0.6565761119127274, + "num_tokens": 176346366.0, + "step": 1055 + }, + { + "entropy": 1.6838423609733582, + "epoch": 0.1160088984098212, + "grad_norm": 0.6571147441864014, + "learning_rate": 1.996327076737304e-05, + "loss": 1.4475, + "mean_token_accuracy": 0.657142753402392, + "num_tokens": 176531787.0, + "step": 1056 + }, + { + "entropy": 1.7079651753107707, + "epoch": 0.11611875532119414, + "grad_norm": 0.7071012258529663, + "learning_rate": 1.9963126120689327e-05, + "loss": 1.245, + "mean_token_accuracy": 0.6794669379790624, + "num_tokens": 176644581.0, + "step": 1057 + }, + { + "entropy": 1.7078370153903961, + "epoch": 0.11622861223256709, + "grad_norm": 0.6504570841789246, + "learning_rate": 1.996298119032671e-05, + "loss": 1.4807, + "mean_token_accuracy": 0.6452956398328146, + "num_tokens": 176800118.0, + "step": 1058 + }, + { + "entropy": 1.7567949692408245, + "epoch": 0.11633846914394001, + "grad_norm": 0.6000586152076721, + "learning_rate": 1.996283597628978e-05, + "loss": 1.5075, + "mean_token_accuracy": 0.6261270443598429, + "num_tokens": 176997873.0, + "step": 1059 + }, + { + "entropy": 1.737035463253657, + "epoch": 0.11644832605531295, + "grad_norm": 0.785273551940918, + "learning_rate": 1.996269047858313e-05, + "loss": 1.4997, + "mean_token_accuracy": 0.6675494114557902, + "num_tokens": 177170996.0, + "step": 1060 + }, + { + "entropy": 1.778021514415741, + "epoch": 0.1165581829666859, + "grad_norm": 0.6912639141082764, + "learning_rate": 1.996254469721136e-05, + "loss": 1.3567, + "mean_token_accuracy": 0.6619671285152435, + "num_tokens": 177310138.0, + "step": 1061 + }, + { + "entropy": 1.7289757231871288, + "epoch": 0.11666803987805884, + "grad_norm": 0.6175736784934998, + "learning_rate": 1.9962398632179095e-05, + "loss": 1.4535, + "mean_token_accuracy": 0.6459249506394068, + "num_tokens": 177492607.0, + "step": 1062 + }, + { + "entropy": 1.7960573335488637, + "epoch": 0.11677789678943176, + "grad_norm": 0.64888596534729, + "learning_rate": 1.996225228349095e-05, + "loss": 1.3823, + "mean_token_accuracy": 0.6632821957270304, + "num_tokens": 177682018.0, + "step": 1063 + }, + { + "entropy": 1.7856767276922862, + "epoch": 0.1168877537008047, + "grad_norm": 0.8114152550697327, + "learning_rate": 1.9962105651151554e-05, + "loss": 1.507, + "mean_token_accuracy": 0.6501527229944865, + "num_tokens": 177849461.0, + "step": 1064 + }, + { + "entropy": 1.7049864828586578, + "epoch": 0.11699761061217764, + "grad_norm": 0.6832014918327332, + "learning_rate": 1.9961958735165558e-05, + "loss": 1.4545, + "mean_token_accuracy": 0.6496352106332779, + "num_tokens": 178013404.0, + "step": 1065 + }, + { + "entropy": 1.766481727361679, + "epoch": 0.11710746752355057, + "grad_norm": 0.925279438495636, + "learning_rate": 1.9961811535537607e-05, + "loss": 1.5581, + "mean_token_accuracy": 0.6499437689781189, + "num_tokens": 178180175.0, + "step": 1066 + }, + { + "entropy": 1.8251918852329254, + "epoch": 0.11721732443492351, + "grad_norm": 0.7407745718955994, + "learning_rate": 1.9961664052272355e-05, + "loss": 1.4513, + "mean_token_accuracy": 0.6350584477186203, + "num_tokens": 178338753.0, + "step": 1067 + }, + { + "entropy": 1.7265767951806386, + "epoch": 0.11732718134629645, + "grad_norm": 0.7054161429405212, + "learning_rate": 1.996151628537448e-05, + "loss": 1.3095, + "mean_token_accuracy": 0.6645342856645584, + "num_tokens": 178489201.0, + "step": 1068 + }, + { + "entropy": 1.687457690636317, + "epoch": 0.11743703825766938, + "grad_norm": 0.6299598813056946, + "learning_rate": 1.9961368234848647e-05, + "loss": 1.4949, + "mean_token_accuracy": 0.6444557011127472, + "num_tokens": 178682246.0, + "step": 1069 + }, + { + "entropy": 1.736327697833379, + "epoch": 0.11754689516904232, + "grad_norm": 0.7318028211593628, + "learning_rate": 1.9961219900699545e-05, + "loss": 1.5979, + "mean_token_accuracy": 0.6583315829435984, + "num_tokens": 178856231.0, + "step": 1070 + }, + { + "entropy": 1.7476358910401661, + "epoch": 0.11765675208041526, + "grad_norm": 0.5873830318450928, + "learning_rate": 1.996107128293188e-05, + "loss": 1.5281, + "mean_token_accuracy": 0.6360836823781332, + "num_tokens": 179036805.0, + "step": 1071 + }, + { + "entropy": 1.6975926160812378, + "epoch": 0.1177666089917882, + "grad_norm": 0.7855924367904663, + "learning_rate": 1.9960922381550342e-05, + "loss": 1.3234, + "mean_token_accuracy": 0.6804987043142319, + "num_tokens": 179179515.0, + "step": 1072 + }, + { + "entropy": 1.746078997850418, + "epoch": 0.11787646590316113, + "grad_norm": 0.7362106442451477, + "learning_rate": 1.9960773196559647e-05, + "loss": 1.3612, + "mean_token_accuracy": 0.6657725870609283, + "num_tokens": 179309228.0, + "step": 1073 + }, + { + "entropy": 1.6905361711978912, + "epoch": 0.11798632281453407, + "grad_norm": 0.5651462078094482, + "learning_rate": 1.9960623727964522e-05, + "loss": 1.379, + "mean_token_accuracy": 0.6537969360748926, + "num_tokens": 179544581.0, + "step": 1074 + }, + { + "entropy": 1.7257480025291443, + "epoch": 0.11809617972590701, + "grad_norm": 0.9071959257125854, + "learning_rate": 1.9960473975769693e-05, + "loss": 1.3199, + "mean_token_accuracy": 0.6615277131398519, + "num_tokens": 179650333.0, + "step": 1075 + }, + { + "entropy": 1.7202151914437611, + "epoch": 0.11820603663727994, + "grad_norm": 0.5977594256401062, + "learning_rate": 1.9960323939979894e-05, + "loss": 1.4065, + "mean_token_accuracy": 0.6557470411062241, + "num_tokens": 179832640.0, + "step": 1076 + }, + { + "entropy": 1.7765184839566548, + "epoch": 0.11831589354865288, + "grad_norm": 0.7578040361404419, + "learning_rate": 1.9960173620599887e-05, + "loss": 1.3649, + "mean_token_accuracy": 0.6576367566982905, + "num_tokens": 179984410.0, + "step": 1077 + }, + { + "entropy": 1.8273439009984334, + "epoch": 0.11842575046002582, + "grad_norm": 0.8141303062438965, + "learning_rate": 1.996002301763442e-05, + "loss": 1.444, + "mean_token_accuracy": 0.6420343518257141, + "num_tokens": 180146615.0, + "step": 1078 + }, + { + "entropy": 1.7694277266661327, + "epoch": 0.11853560737139875, + "grad_norm": 0.6545711755752563, + "learning_rate": 1.9959872131088264e-05, + "loss": 1.417, + "mean_token_accuracy": 0.6598865836858749, + "num_tokens": 180292276.0, + "step": 1079 + }, + { + "entropy": 1.7825441459814708, + "epoch": 0.11864546428277169, + "grad_norm": 0.5675626397132874, + "learning_rate": 1.995972096096619e-05, + "loss": 1.46, + "mean_token_accuracy": 0.6376805355151495, + "num_tokens": 180499401.0, + "step": 1080 + }, + { + "entropy": 1.738411416610082, + "epoch": 0.11875532119414463, + "grad_norm": 0.7769458889961243, + "learning_rate": 1.9959569507272985e-05, + "loss": 1.5524, + "mean_token_accuracy": 0.6482310444116592, + "num_tokens": 180664053.0, + "step": 1081 + }, + { + "entropy": 1.7660705149173737, + "epoch": 0.11886517810551757, + "grad_norm": 0.6864280700683594, + "learning_rate": 1.9959417770013445e-05, + "loss": 1.4833, + "mean_token_accuracy": 0.6475979934136072, + "num_tokens": 180832870.0, + "step": 1082 + }, + { + "entropy": 1.7743451297283173, + "epoch": 0.1189750350168905, + "grad_norm": 0.7256821990013123, + "learning_rate": 1.995926574919237e-05, + "loss": 1.3349, + "mean_token_accuracy": 0.6730091770490011, + "num_tokens": 180944514.0, + "step": 1083 + }, + { + "entropy": 1.791404128074646, + "epoch": 0.11908489192826344, + "grad_norm": 0.8112614154815674, + "learning_rate": 1.9959113444814567e-05, + "loss": 1.3597, + "mean_token_accuracy": 0.6570585270722707, + "num_tokens": 181104580.0, + "step": 1084 + }, + { + "entropy": 1.6880301733811696, + "epoch": 0.11919474883963638, + "grad_norm": 0.6577028632164001, + "learning_rate": 1.9958960856884862e-05, + "loss": 1.4622, + "mean_token_accuracy": 0.6448287467161814, + "num_tokens": 181300738.0, + "step": 1085 + }, + { + "entropy": 1.722762902577718, + "epoch": 0.1193046057510093, + "grad_norm": 0.7643953561782837, + "learning_rate": 1.9958807985408083e-05, + "loss": 1.4948, + "mean_token_accuracy": 0.647366444269816, + "num_tokens": 181454434.0, + "step": 1086 + }, + { + "entropy": 1.7054348190625508, + "epoch": 0.11941446266238225, + "grad_norm": 0.7724200487136841, + "learning_rate": 1.995865483038907e-05, + "loss": 1.4808, + "mean_token_accuracy": 0.662505899866422, + "num_tokens": 181651668.0, + "step": 1087 + }, + { + "entropy": 1.7870188256104786, + "epoch": 0.11952431957375519, + "grad_norm": 0.8176293969154358, + "learning_rate": 1.995850139183267e-05, + "loss": 1.3453, + "mean_token_accuracy": 0.6639283945163091, + "num_tokens": 181788460.0, + "step": 1088 + }, + { + "entropy": 1.7554938594500225, + "epoch": 0.11963417648512813, + "grad_norm": 0.7022227644920349, + "learning_rate": 1.995834766974373e-05, + "loss": 1.3982, + "mean_token_accuracy": 0.6482542703549067, + "num_tokens": 182038105.0, + "step": 1089 + }, + { + "entropy": 1.7522353132565816, + "epoch": 0.11974403339650105, + "grad_norm": 0.9576560854911804, + "learning_rate": 1.995819366412713e-05, + "loss": 1.4416, + "mean_token_accuracy": 0.6475793421268463, + "num_tokens": 182216542.0, + "step": 1090 + }, + { + "entropy": 1.7578060527642567, + "epoch": 0.119853890307874, + "grad_norm": 0.6407710313796997, + "learning_rate": 1.9958039374987738e-05, + "loss": 1.5203, + "mean_token_accuracy": 0.6500546783208847, + "num_tokens": 182385751.0, + "step": 1091 + }, + { + "entropy": 1.777414192756017, + "epoch": 0.11996374721924694, + "grad_norm": 0.8721911311149597, + "learning_rate": 1.995788480233043e-05, + "loss": 1.5271, + "mean_token_accuracy": 0.635983943939209, + "num_tokens": 182566662.0, + "step": 1092 + }, + { + "entropy": 1.7177048722902934, + "epoch": 0.12007360413061986, + "grad_norm": 0.6299294829368591, + "learning_rate": 1.9957729946160108e-05, + "loss": 1.2945, + "mean_token_accuracy": 0.6612618813912073, + "num_tokens": 182701870.0, + "step": 1093 + }, + { + "entropy": 1.6905933519204457, + "epoch": 0.1201834610419928, + "grad_norm": 0.7173838019371033, + "learning_rate": 1.995757480648167e-05, + "loss": 1.4643, + "mean_token_accuracy": 0.6514955560366312, + "num_tokens": 182853909.0, + "step": 1094 + }, + { + "entropy": 1.8044182658195496, + "epoch": 0.12029331795336574, + "grad_norm": 0.7805381417274475, + "learning_rate": 1.995741938330003e-05, + "loss": 1.4079, + "mean_token_accuracy": 0.6487281521161398, + "num_tokens": 182996153.0, + "step": 1095 + }, + { + "entropy": 1.7813920577367146, + "epoch": 0.12040317486473867, + "grad_norm": 0.6752137541770935, + "learning_rate": 1.9957263676620094e-05, + "loss": 1.5411, + "mean_token_accuracy": 0.6410238941510519, + "num_tokens": 183196179.0, + "step": 1096 + }, + { + "entropy": 1.7193073829015095, + "epoch": 0.12051303177611161, + "grad_norm": 0.7395818829536438, + "learning_rate": 1.9957107686446805e-05, + "loss": 1.2063, + "mean_token_accuracy": 0.678206260005633, + "num_tokens": 183324746.0, + "step": 1097 + }, + { + "entropy": 1.7467433512210846, + "epoch": 0.12062288868748455, + "grad_norm": 0.7600111365318298, + "learning_rate": 1.995695141278509e-05, + "loss": 1.402, + "mean_token_accuracy": 0.6539099762837092, + "num_tokens": 183455165.0, + "step": 1098 + }, + { + "entropy": 1.766270915667216, + "epoch": 0.1207327455988575, + "grad_norm": 0.6678837537765503, + "learning_rate": 1.9956794855639902e-05, + "loss": 1.5132, + "mean_token_accuracy": 0.6394678006569544, + "num_tokens": 183653879.0, + "step": 1099 + }, + { + "entropy": 1.70885169506073, + "epoch": 0.12084260251023042, + "grad_norm": 0.6890634894371033, + "learning_rate": 1.9956638015016192e-05, + "loss": 1.4759, + "mean_token_accuracy": 0.6423639605442683, + "num_tokens": 183865460.0, + "step": 1100 + }, + { + "entropy": 1.7804962793986003, + "epoch": 0.12095245942160336, + "grad_norm": 0.6882741451263428, + "learning_rate": 1.9956480890918923e-05, + "loss": 1.3699, + "mean_token_accuracy": 0.6530927171309789, + "num_tokens": 184012429.0, + "step": 1101 + }, + { + "entropy": 1.7800053457419078, + "epoch": 0.1210623163329763, + "grad_norm": 0.7321879863739014, + "learning_rate": 1.9956323483353073e-05, + "loss": 1.3303, + "mean_token_accuracy": 0.6687973191340765, + "num_tokens": 184132575.0, + "step": 1102 + }, + { + "entropy": 1.7981793681780498, + "epoch": 0.12117217324434923, + "grad_norm": 0.7735952138900757, + "learning_rate": 1.995616579232362e-05, + "loss": 1.5259, + "mean_token_accuracy": 0.6438850810130438, + "num_tokens": 184294442.0, + "step": 1103 + }, + { + "entropy": 1.6824164589246113, + "epoch": 0.12128203015572217, + "grad_norm": 0.8133576512336731, + "learning_rate": 1.995600781783555e-05, + "loss": 1.3995, + "mean_token_accuracy": 0.6544992427031199, + "num_tokens": 184454040.0, + "step": 1104 + }, + { + "entropy": 1.8094957967599232, + "epoch": 0.12139188706709511, + "grad_norm": 0.7161358594894409, + "learning_rate": 1.9955849559893878e-05, + "loss": 1.3367, + "mean_token_accuracy": 0.6558258583148321, + "num_tokens": 184576608.0, + "step": 1105 + }, + { + "entropy": 1.6985692779223125, + "epoch": 0.12150174397846805, + "grad_norm": 0.7241420745849609, + "learning_rate": 1.9955691018503592e-05, + "loss": 1.5111, + "mean_token_accuracy": 0.6411018818616867, + "num_tokens": 184743688.0, + "step": 1106 + }, + { + "entropy": 1.8171222706635792, + "epoch": 0.12161160088984098, + "grad_norm": 0.7848587036132812, + "learning_rate": 1.995553219366973e-05, + "loss": 1.3863, + "mean_token_accuracy": 0.6589597115914027, + "num_tokens": 184908891.0, + "step": 1107 + }, + { + "entropy": 1.7540997366110485, + "epoch": 0.12172145780121392, + "grad_norm": 0.709570050239563, + "learning_rate": 1.9955373085397304e-05, + "loss": 1.3756, + "mean_token_accuracy": 0.6544551948706309, + "num_tokens": 185037502.0, + "step": 1108 + }, + { + "entropy": 1.7399804890155792, + "epoch": 0.12183131471258686, + "grad_norm": 2.1236889362335205, + "learning_rate": 1.9955213693691358e-05, + "loss": 1.2212, + "mean_token_accuracy": 0.6755783557891846, + "num_tokens": 185228287.0, + "step": 1109 + }, + { + "entropy": 1.7062017718950908, + "epoch": 0.12194117162395979, + "grad_norm": 0.6356903910636902, + "learning_rate": 1.9955054018556936e-05, + "loss": 1.4158, + "mean_token_accuracy": 0.647825613617897, + "num_tokens": 185421671.0, + "step": 1110 + }, + { + "entropy": 1.8161241014798482, + "epoch": 0.12205102853533273, + "grad_norm": 0.9130175709724426, + "learning_rate": 1.9954894059999082e-05, + "loss": 1.2078, + "mean_token_accuracy": 0.671512246131897, + "num_tokens": 185538648.0, + "step": 1111 + }, + { + "entropy": 1.6999003887176514, + "epoch": 0.12216088544670567, + "grad_norm": 0.6050575375556946, + "learning_rate": 1.9954733818022873e-05, + "loss": 1.3409, + "mean_token_accuracy": 0.6672930121421814, + "num_tokens": 185718319.0, + "step": 1112 + }, + { + "entropy": 1.7451957364877064, + "epoch": 0.1222707423580786, + "grad_norm": 0.7081188559532166, + "learning_rate": 1.995457329263337e-05, + "loss": 1.4324, + "mean_token_accuracy": 0.6614968578020731, + "num_tokens": 185890214.0, + "step": 1113 + }, + { + "entropy": 1.7832291225592296, + "epoch": 0.12238059926945154, + "grad_norm": 0.6266985535621643, + "learning_rate": 1.9954412483835658e-05, + "loss": 1.4527, + "mean_token_accuracy": 0.6494892338911692, + "num_tokens": 186067267.0, + "step": 1114 + }, + { + "entropy": 1.7837847769260406, + "epoch": 0.12249045618082448, + "grad_norm": 0.8108149170875549, + "learning_rate": 1.995425139163483e-05, + "loss": 1.394, + "mean_token_accuracy": 0.6565323745210966, + "num_tokens": 186187770.0, + "step": 1115 + }, + { + "entropy": 1.7373790740966797, + "epoch": 0.12260031309219742, + "grad_norm": 0.6153585910797119, + "learning_rate": 1.9954090016035975e-05, + "loss": 1.4611, + "mean_token_accuracy": 0.6373415837685267, + "num_tokens": 186378770.0, + "step": 1116 + }, + { + "entropy": 1.8052891790866852, + "epoch": 0.12271017000357035, + "grad_norm": 0.7305307388305664, + "learning_rate": 1.9953928357044207e-05, + "loss": 1.584, + "mean_token_accuracy": 0.6416770915190378, + "num_tokens": 186558752.0, + "step": 1117 + }, + { + "entropy": 1.8447977602481842, + "epoch": 0.12282002691494329, + "grad_norm": 0.6544065475463867, + "learning_rate": 1.9953766414664643e-05, + "loss": 1.5598, + "mean_token_accuracy": 0.6420470277468363, + "num_tokens": 186735295.0, + "step": 1118 + }, + { + "entropy": 1.7785147627194722, + "epoch": 0.12292988382631623, + "grad_norm": 0.9531629681587219, + "learning_rate": 1.9953604188902407e-05, + "loss": 1.4761, + "mean_token_accuracy": 0.6378699193398157, + "num_tokens": 186921485.0, + "step": 1119 + }, + { + "entropy": 1.732460230588913, + "epoch": 0.12303974073768915, + "grad_norm": 0.6433352828025818, + "learning_rate": 1.995344167976263e-05, + "loss": 1.4005, + "mean_token_accuracy": 0.6507207999626795, + "num_tokens": 187083038.0, + "step": 1120 + }, + { + "entropy": 1.7945673267046611, + "epoch": 0.1231495976490621, + "grad_norm": 0.7739344239234924, + "learning_rate": 1.995327888725046e-05, + "loss": 1.3051, + "mean_token_accuracy": 0.6669691900412241, + "num_tokens": 187232544.0, + "step": 1121 + }, + { + "entropy": 1.735634684562683, + "epoch": 0.12325945456043504, + "grad_norm": 0.6914976239204407, + "learning_rate": 1.995311581137105e-05, + "loss": 1.3125, + "mean_token_accuracy": 0.6667511413494746, + "num_tokens": 187355432.0, + "step": 1122 + }, + { + "entropy": 1.7393087248007457, + "epoch": 0.12336931147180798, + "grad_norm": 0.6433930993080139, + "learning_rate": 1.9952952452129557e-05, + "loss": 1.3744, + "mean_token_accuracy": 0.6738745719194412, + "num_tokens": 187522816.0, + "step": 1123 + }, + { + "entropy": 1.7415795028209686, + "epoch": 0.1234791683831809, + "grad_norm": 0.632103443145752, + "learning_rate": 1.995278880953115e-05, + "loss": 1.4548, + "mean_token_accuracy": 0.6446659713983536, + "num_tokens": 187709230.0, + "step": 1124 + }, + { + "entropy": 1.7378952999909718, + "epoch": 0.12358902529455384, + "grad_norm": 0.6405538320541382, + "learning_rate": 1.9952624883581015e-05, + "loss": 1.5702, + "mean_token_accuracy": 0.6246414035558701, + "num_tokens": 187965508.0, + "step": 1125 + }, + { + "entropy": 1.7145747939745586, + "epoch": 0.12369888220592679, + "grad_norm": 0.7018551826477051, + "learning_rate": 1.9952460674284335e-05, + "loss": 1.2834, + "mean_token_accuracy": 0.6730683247248331, + "num_tokens": 188080441.0, + "step": 1126 + }, + { + "entropy": 1.6676550805568695, + "epoch": 0.12380873911729971, + "grad_norm": 0.5789968967437744, + "learning_rate": 1.995229618164631e-05, + "loss": 1.36, + "mean_token_accuracy": 0.6580460617939631, + "num_tokens": 188292733.0, + "step": 1127 + }, + { + "entropy": 1.7675324380397797, + "epoch": 0.12391859602867265, + "grad_norm": 1.9231761693954468, + "learning_rate": 1.9952131405672145e-05, + "loss": 1.3133, + "mean_token_accuracy": 0.6596612135569254, + "num_tokens": 188507601.0, + "step": 1128 + }, + { + "entropy": 1.7851240833600361, + "epoch": 0.1240284529400456, + "grad_norm": 0.6579344272613525, + "learning_rate": 1.9951966346367054e-05, + "loss": 1.3917, + "mean_token_accuracy": 0.6518159955739975, + "num_tokens": 188647743.0, + "step": 1129 + }, + { + "entropy": 1.8078663349151611, + "epoch": 0.12413830985141852, + "grad_norm": 0.9145793318748474, + "learning_rate": 1.9951801003736263e-05, + "loss": 1.198, + "mean_token_accuracy": 0.6740387082099915, + "num_tokens": 188748896.0, + "step": 1130 + }, + { + "entropy": 1.7432547012964885, + "epoch": 0.12424816676279146, + "grad_norm": 0.6863495707511902, + "learning_rate": 1.9951635377785002e-05, + "loss": 1.4592, + "mean_token_accuracy": 0.6456627547740936, + "num_tokens": 188953780.0, + "step": 1131 + }, + { + "entropy": 1.7238787710666656, + "epoch": 0.1243580236741644, + "grad_norm": 0.8133369088172913, + "learning_rate": 1.9951469468518516e-05, + "loss": 1.3483, + "mean_token_accuracy": 0.6647701015075048, + "num_tokens": 189125417.0, + "step": 1132 + }, + { + "entropy": 1.6680325170358021, + "epoch": 0.12446788058553734, + "grad_norm": 0.6783432364463806, + "learning_rate": 1.9951303275942055e-05, + "loss": 1.2737, + "mean_token_accuracy": 0.6636984546979269, + "num_tokens": 189260690.0, + "step": 1133 + }, + { + "entropy": 1.722198059161504, + "epoch": 0.12457773749691027, + "grad_norm": 0.784504234790802, + "learning_rate": 1.995113680006088e-05, + "loss": 1.3104, + "mean_token_accuracy": 0.6713638504346212, + "num_tokens": 189388782.0, + "step": 1134 + }, + { + "entropy": 1.7054710189501445, + "epoch": 0.12468759440828321, + "grad_norm": 0.7793455123901367, + "learning_rate": 1.995097004088026e-05, + "loss": 1.3697, + "mean_token_accuracy": 0.66965984304746, + "num_tokens": 189574036.0, + "step": 1135 + }, + { + "entropy": 1.7583240966002147, + "epoch": 0.12479745131965615, + "grad_norm": 0.7912724018096924, + "learning_rate": 1.9950802998405468e-05, + "loss": 1.4305, + "mean_token_accuracy": 0.6560654044151306, + "num_tokens": 189702401.0, + "step": 1136 + }, + { + "entropy": 1.8047844966252644, + "epoch": 0.12490730823102908, + "grad_norm": 0.9431250691413879, + "learning_rate": 1.9950635672641797e-05, + "loss": 1.4202, + "mean_token_accuracy": 0.6618384718894958, + "num_tokens": 189861738.0, + "step": 1137 + }, + { + "entropy": 1.764178196589152, + "epoch": 0.12501716514240202, + "grad_norm": 0.7213315367698669, + "learning_rate": 1.995046806359454e-05, + "loss": 1.4782, + "mean_token_accuracy": 0.6546398401260376, + "num_tokens": 190010964.0, + "step": 1138 + }, + { + "entropy": 1.7561284104983013, + "epoch": 0.12512702205377496, + "grad_norm": 0.7379279136657715, + "learning_rate": 1.9950300171269e-05, + "loss": 1.3491, + "mean_token_accuracy": 0.6550362805525461, + "num_tokens": 190186627.0, + "step": 1139 + }, + { + "entropy": 1.6882170836130779, + "epoch": 0.1252368789651479, + "grad_norm": 0.6673735976219177, + "learning_rate": 1.9950131995670494e-05, + "loss": 1.414, + "mean_token_accuracy": 0.6644150763750076, + "num_tokens": 190355893.0, + "step": 1140 + }, + { + "entropy": 1.696480651696523, + "epoch": 0.12534673587652084, + "grad_norm": 2.3444578647613525, + "learning_rate": 1.994996353680434e-05, + "loss": 0.9837, + "mean_token_accuracy": 0.6970398674408594, + "num_tokens": 190506476.0, + "step": 1141 + }, + { + "entropy": 1.7339637279510498, + "epoch": 0.12545659278789376, + "grad_norm": 0.6325587630271912, + "learning_rate": 1.994979479467588e-05, + "loss": 1.4113, + "mean_token_accuracy": 0.6466666658719381, + "num_tokens": 190693618.0, + "step": 1142 + }, + { + "entropy": 1.7287168403466542, + "epoch": 0.1255664496992667, + "grad_norm": 0.7597485184669495, + "learning_rate": 1.9949625769290442e-05, + "loss": 1.3352, + "mean_token_accuracy": 0.6628235826889673, + "num_tokens": 190805076.0, + "step": 1143 + }, + { + "entropy": 1.70640030503273, + "epoch": 0.12567630661063964, + "grad_norm": 0.6051517128944397, + "learning_rate": 1.9949456460653382e-05, + "loss": 1.3608, + "mean_token_accuracy": 0.656550352772077, + "num_tokens": 190980015.0, + "step": 1144 + }, + { + "entropy": 1.7363159358501434, + "epoch": 0.12578616352201258, + "grad_norm": 0.7932606339454651, + "learning_rate": 1.9949286868770063e-05, + "loss": 1.4355, + "mean_token_accuracy": 0.6533750792344412, + "num_tokens": 191172729.0, + "step": 1145 + }, + { + "entropy": 1.7515104512373607, + "epoch": 0.12589602043338552, + "grad_norm": 0.5937777161598206, + "learning_rate": 1.9949116993645842e-05, + "loss": 1.3832, + "mean_token_accuracy": 0.6598383535941442, + "num_tokens": 191371147.0, + "step": 1146 + }, + { + "entropy": 1.8300546904404957, + "epoch": 0.12600587734475846, + "grad_norm": 0.7814804315567017, + "learning_rate": 1.9948946835286102e-05, + "loss": 1.4727, + "mean_token_accuracy": 0.630996306737264, + "num_tokens": 191552878.0, + "step": 1147 + }, + { + "entropy": 1.696203629175822, + "epoch": 0.1261157342561314, + "grad_norm": 0.6297745704650879, + "learning_rate": 1.9948776393696227e-05, + "loss": 1.3124, + "mean_token_accuracy": 0.667043482263883, + "num_tokens": 191733626.0, + "step": 1148 + }, + { + "entropy": 1.710612674554189, + "epoch": 0.12622559116750431, + "grad_norm": 0.803044855594635, + "learning_rate": 1.9948605668881608e-05, + "loss": 1.4977, + "mean_token_accuracy": 0.6440107375383377, + "num_tokens": 191888183.0, + "step": 1149 + }, + { + "entropy": 1.7293962438901265, + "epoch": 0.12633544807887725, + "grad_norm": 0.7975369095802307, + "learning_rate": 1.9948434660847658e-05, + "loss": 1.2799, + "mean_token_accuracy": 0.6679138342539469, + "num_tokens": 192005132.0, + "step": 1150 + }, + { + "entropy": 1.7680123845736186, + "epoch": 0.1264453049902502, + "grad_norm": 0.7152736186981201, + "learning_rate": 1.994826336959978e-05, + "loss": 1.4389, + "mean_token_accuracy": 0.6575280626614889, + "num_tokens": 192155335.0, + "step": 1151 + }, + { + "entropy": 1.7631124357382457, + "epoch": 0.12655516190162314, + "grad_norm": 0.7395944595336914, + "learning_rate": 1.99480917951434e-05, + "loss": 1.2721, + "mean_token_accuracy": 0.6726651241381963, + "num_tokens": 192270766.0, + "step": 1152 + }, + { + "entropy": 1.727259635925293, + "epoch": 0.12666501881299608, + "grad_norm": 0.6742376089096069, + "learning_rate": 1.9947919937483944e-05, + "loss": 1.43, + "mean_token_accuracy": 0.6421197056770325, + "num_tokens": 192433846.0, + "step": 1153 + }, + { + "entropy": 1.81740140914917, + "epoch": 0.12677487572436902, + "grad_norm": 0.8998958468437195, + "learning_rate": 1.9947747796626854e-05, + "loss": 1.3927, + "mean_token_accuracy": 0.653491660952568, + "num_tokens": 192557670.0, + "step": 1154 + }, + { + "entropy": 1.7740411162376404, + "epoch": 0.12688473263574196, + "grad_norm": 0.7846980690956116, + "learning_rate": 1.9947575372577583e-05, + "loss": 1.3235, + "mean_token_accuracy": 0.6758840531110764, + "num_tokens": 192690479.0, + "step": 1155 + }, + { + "entropy": 1.765354762474696, + "epoch": 0.12699458954711487, + "grad_norm": 0.7348425388336182, + "learning_rate": 1.994740266534158e-05, + "loss": 1.3549, + "mean_token_accuracy": 0.661831850806872, + "num_tokens": 192830194.0, + "step": 1156 + }, + { + "entropy": 1.7395083606243134, + "epoch": 0.1271044464584878, + "grad_norm": 0.6277830600738525, + "learning_rate": 1.9947229674924316e-05, + "loss": 1.5185, + "mean_token_accuracy": 0.6336728284756342, + "num_tokens": 193034894.0, + "step": 1157 + }, + { + "entropy": 1.7037833829720814, + "epoch": 0.12721430336986075, + "grad_norm": 0.6423079967498779, + "learning_rate": 1.9947056401331265e-05, + "loss": 1.4111, + "mean_token_accuracy": 0.6632378300031027, + "num_tokens": 193280199.0, + "step": 1158 + }, + { + "entropy": 1.7469445566336315, + "epoch": 0.1273241602812337, + "grad_norm": 0.7091412544250488, + "learning_rate": 1.9946882844567906e-05, + "loss": 1.3312, + "mean_token_accuracy": 0.6669044842322668, + "num_tokens": 193443768.0, + "step": 1159 + }, + { + "entropy": 1.7258818745613098, + "epoch": 0.12743401719260664, + "grad_norm": 0.6195393800735474, + "learning_rate": 1.994670900463974e-05, + "loss": 1.3969, + "mean_token_accuracy": 0.6519241978724798, + "num_tokens": 193645217.0, + "step": 1160 + }, + { + "entropy": 1.7719605664412181, + "epoch": 0.12754387410397958, + "grad_norm": 0.6821731925010681, + "learning_rate": 1.9946534881552266e-05, + "loss": 1.3772, + "mean_token_accuracy": 0.6459381332000097, + "num_tokens": 193758088.0, + "step": 1161 + }, + { + "entropy": 1.7489538192749023, + "epoch": 0.1276537310153525, + "grad_norm": 0.6906759738922119, + "learning_rate": 1.9946360475310993e-05, + "loss": 1.4159, + "mean_token_accuracy": 0.662546748916308, + "num_tokens": 193951389.0, + "step": 1162 + }, + { + "entropy": 1.7279592752456665, + "epoch": 0.12776358792672543, + "grad_norm": 0.6182255148887634, + "learning_rate": 1.9946185785921442e-05, + "loss": 1.4738, + "mean_token_accuracy": 0.6493389358123144, + "num_tokens": 194166434.0, + "step": 1163 + }, + { + "entropy": 1.6567539076010387, + "epoch": 0.12787344483809837, + "grad_norm": 0.659775972366333, + "learning_rate": 1.9946010813389143e-05, + "loss": 1.1995, + "mean_token_accuracy": 0.689688558379809, + "num_tokens": 194303583.0, + "step": 1164 + }, + { + "entropy": 1.807248870531718, + "epoch": 0.1279833017494713, + "grad_norm": 0.6941498517990112, + "learning_rate": 1.9945835557719632e-05, + "loss": 1.4976, + "mean_token_accuracy": 0.6428570051987966, + "num_tokens": 194473880.0, + "step": 1165 + }, + { + "entropy": 1.7196752826372783, + "epoch": 0.12809315866084425, + "grad_norm": 0.6665788888931274, + "learning_rate": 1.9945660018918456e-05, + "loss": 1.467, + "mean_token_accuracy": 0.6387737194697062, + "num_tokens": 194655840.0, + "step": 1166 + }, + { + "entropy": 1.70614160100619, + "epoch": 0.1282030155722172, + "grad_norm": 0.7460064888000488, + "learning_rate": 1.9945484196991173e-05, + "loss": 1.4486, + "mean_token_accuracy": 0.6422029336293539, + "num_tokens": 194826668.0, + "step": 1167 + }, + { + "entropy": 1.713955005009969, + "epoch": 0.12831287248359013, + "grad_norm": 0.7533404231071472, + "learning_rate": 1.9945308091943348e-05, + "loss": 1.5628, + "mean_token_accuracy": 0.656499852736791, + "num_tokens": 195019203.0, + "step": 1168 + }, + { + "entropy": 1.7552619874477386, + "epoch": 0.12842272939496305, + "grad_norm": 0.9428662061691284, + "learning_rate": 1.994513170378055e-05, + "loss": 1.4442, + "mean_token_accuracy": 0.6546032627423605, + "num_tokens": 195199499.0, + "step": 1169 + }, + { + "entropy": 1.7821119626363118, + "epoch": 0.128532586306336, + "grad_norm": 0.7639563679695129, + "learning_rate": 1.9944955032508365e-05, + "loss": 1.4387, + "mean_token_accuracy": 0.6538108189900717, + "num_tokens": 195356654.0, + "step": 1170 + }, + { + "entropy": 1.7284887731075287, + "epoch": 0.12864244321770893, + "grad_norm": 2.3883559703826904, + "learning_rate": 1.994477807813238e-05, + "loss": 0.9387, + "mean_token_accuracy": 0.6993002146482468, + "num_tokens": 195492267.0, + "step": 1171 + }, + { + "entropy": 1.7171109517415364, + "epoch": 0.12875230012908187, + "grad_norm": 0.6380852460861206, + "learning_rate": 1.9944600840658207e-05, + "loss": 1.2273, + "mean_token_accuracy": 0.6765789190928141, + "num_tokens": 195647384.0, + "step": 1172 + }, + { + "entropy": 1.7350752850373585, + "epoch": 0.1288621570404548, + "grad_norm": 0.6649433374404907, + "learning_rate": 1.9944423320091445e-05, + "loss": 1.4263, + "mean_token_accuracy": 0.6540283511082331, + "num_tokens": 195798088.0, + "step": 1173 + }, + { + "entropy": 1.6744823853174846, + "epoch": 0.12897201395182775, + "grad_norm": 0.6647149920463562, + "learning_rate": 1.9944245516437714e-05, + "loss": 1.2886, + "mean_token_accuracy": 0.6681593358516693, + "num_tokens": 195964314.0, + "step": 1174 + }, + { + "entropy": 1.7713862359523773, + "epoch": 0.1290818708632007, + "grad_norm": 0.8901455998420715, + "learning_rate": 1.9944067429702644e-05, + "loss": 1.2277, + "mean_token_accuracy": 0.6852303644021353, + "num_tokens": 196078305.0, + "step": 1175 + }, + { + "entropy": 1.793348143498103, + "epoch": 0.1291917277745736, + "grad_norm": 0.8101892471313477, + "learning_rate": 1.994388905989187e-05, + "loss": 1.3155, + "mean_token_accuracy": 0.6616990566253662, + "num_tokens": 196179334.0, + "step": 1176 + }, + { + "entropy": 1.6949261128902435, + "epoch": 0.12930158468594655, + "grad_norm": 0.5549296736717224, + "learning_rate": 1.9943710407011038e-05, + "loss": 1.3279, + "mean_token_accuracy": 0.6614242494106293, + "num_tokens": 196345485.0, + "step": 1177 + }, + { + "entropy": 1.7543793419996898, + "epoch": 0.1294114415973195, + "grad_norm": 0.8616530299186707, + "learning_rate": 1.9943531471065798e-05, + "loss": 1.5046, + "mean_token_accuracy": 0.6579241951306661, + "num_tokens": 196497507.0, + "step": 1178 + }, + { + "entropy": 1.741701563199361, + "epoch": 0.12952129850869243, + "grad_norm": 0.7313582897186279, + "learning_rate": 1.9943352252061818e-05, + "loss": 1.418, + "mean_token_accuracy": 0.6414368947347006, + "num_tokens": 196674374.0, + "step": 1179 + }, + { + "entropy": 1.7809857726097107, + "epoch": 0.12963115542006537, + "grad_norm": 0.7206242680549622, + "learning_rate": 1.9943172750004773e-05, + "loss": 1.5279, + "mean_token_accuracy": 0.6377990394830704, + "num_tokens": 196852648.0, + "step": 1180 + }, + { + "entropy": 1.7742149730523427, + "epoch": 0.1297410123314383, + "grad_norm": 0.712735116481781, + "learning_rate": 1.994299296490034e-05, + "loss": 1.4867, + "mean_token_accuracy": 0.6416104336579641, + "num_tokens": 197005775.0, + "step": 1181 + }, + { + "entropy": 1.801190088192622, + "epoch": 0.12985086924281125, + "grad_norm": 0.7176236510276794, + "learning_rate": 1.9942812896754206e-05, + "loss": 1.535, + "mean_token_accuracy": 0.6445636649926504, + "num_tokens": 197172268.0, + "step": 1182 + }, + { + "entropy": 1.7565678854783375, + "epoch": 0.12996072615418416, + "grad_norm": 0.8221584558486938, + "learning_rate": 1.9942632545572073e-05, + "loss": 1.442, + "mean_token_accuracy": 0.6304828822612762, + "num_tokens": 197381956.0, + "step": 1183 + }, + { + "entropy": 1.7492178777853649, + "epoch": 0.1300705830655571, + "grad_norm": 0.569448709487915, + "learning_rate": 1.9942451911359655e-05, + "loss": 1.5449, + "mean_token_accuracy": 0.6221815447012583, + "num_tokens": 197666755.0, + "step": 1184 + }, + { + "entropy": 1.7311831414699554, + "epoch": 0.13018043997693005, + "grad_norm": 0.652637779712677, + "learning_rate": 1.994227099412266e-05, + "loss": 1.5645, + "mean_token_accuracy": 0.637279137969017, + "num_tokens": 197885096.0, + "step": 1185 + }, + { + "entropy": 1.7549742658933003, + "epoch": 0.130290296888303, + "grad_norm": 0.6172470450401306, + "learning_rate": 1.994208979386682e-05, + "loss": 1.471, + "mean_token_accuracy": 0.6358410517374674, + "num_tokens": 198047424.0, + "step": 1186 + }, + { + "entropy": 1.69119127591451, + "epoch": 0.13040015379967593, + "grad_norm": 0.572778582572937, + "learning_rate": 1.9941908310597862e-05, + "loss": 1.4412, + "mean_token_accuracy": 0.6451299836238226, + "num_tokens": 198258790.0, + "step": 1187 + }, + { + "entropy": 1.7269008060296376, + "epoch": 0.13051001071104887, + "grad_norm": 0.6882736682891846, + "learning_rate": 1.994172654432154e-05, + "loss": 1.4044, + "mean_token_accuracy": 0.6471477945645651, + "num_tokens": 198405031.0, + "step": 1188 + }, + { + "entropy": 1.8405869603157043, + "epoch": 0.1306198676224218, + "grad_norm": 0.7171852588653564, + "learning_rate": 1.99415444950436e-05, + "loss": 1.4245, + "mean_token_accuracy": 0.6366102347771326, + "num_tokens": 198536441.0, + "step": 1189 + }, + { + "entropy": 1.708700180053711, + "epoch": 0.13072972453379472, + "grad_norm": 0.6238212585449219, + "learning_rate": 1.994136216276981e-05, + "loss": 1.3952, + "mean_token_accuracy": 0.6541391809781393, + "num_tokens": 198771632.0, + "step": 1190 + }, + { + "entropy": 1.686782290538152, + "epoch": 0.13083958144516766, + "grad_norm": 0.6770201921463013, + "learning_rate": 1.994117954750593e-05, + "loss": 1.4392, + "mean_token_accuracy": 0.6496217797199885, + "num_tokens": 198940548.0, + "step": 1191 + }, + { + "entropy": 1.732615441083908, + "epoch": 0.1309494383565406, + "grad_norm": 0.635962724685669, + "learning_rate": 1.994099664925775e-05, + "loss": 1.4054, + "mean_token_accuracy": 0.6503652880589167, + "num_tokens": 199146118.0, + "step": 1192 + }, + { + "entropy": 1.6821688016255696, + "epoch": 0.13105929526791354, + "grad_norm": 0.6260414719581604, + "learning_rate": 1.9940813468031056e-05, + "loss": 1.5164, + "mean_token_accuracy": 0.639798546830813, + "num_tokens": 199337245.0, + "step": 1193 + }, + { + "entropy": 1.751666744550069, + "epoch": 0.13116915217928649, + "grad_norm": 0.6123976707458496, + "learning_rate": 1.9940630003831644e-05, + "loss": 1.6292, + "mean_token_accuracy": 0.6407297352949778, + "num_tokens": 199596254.0, + "step": 1194 + }, + { + "entropy": 1.74623238046964, + "epoch": 0.13127900909065943, + "grad_norm": 0.7188962697982788, + "learning_rate": 1.9940446256665317e-05, + "loss": 1.4197, + "mean_token_accuracy": 0.6419958025217056, + "num_tokens": 199772672.0, + "step": 1195 + }, + { + "entropy": 1.7535964945952098, + "epoch": 0.13138886600203234, + "grad_norm": 0.6890870928764343, + "learning_rate": 1.99402622265379e-05, + "loss": 1.3874, + "mean_token_accuracy": 0.6490504443645477, + "num_tokens": 199920027.0, + "step": 1196 + }, + { + "entropy": 1.7611814439296722, + "epoch": 0.13149872291340528, + "grad_norm": 0.8501882553100586, + "learning_rate": 1.994007791345521e-05, + "loss": 1.4224, + "mean_token_accuracy": 0.6603156328201294, + "num_tokens": 200084942.0, + "step": 1197 + }, + { + "entropy": 1.7293813327948253, + "epoch": 0.13160857982477822, + "grad_norm": 0.6405790448188782, + "learning_rate": 1.9939893317423086e-05, + "loss": 1.5273, + "mean_token_accuracy": 0.6357795844475428, + "num_tokens": 200288548.0, + "step": 1198 + }, + { + "entropy": 1.68119282523791, + "epoch": 0.13171843673615116, + "grad_norm": 0.6990883350372314, + "learning_rate": 1.9939708438447357e-05, + "loss": 1.1898, + "mean_token_accuracy": 0.685491551955541, + "num_tokens": 200429774.0, + "step": 1199 + }, + { + "entropy": 1.764894962310791, + "epoch": 0.1318282936475241, + "grad_norm": 0.629033088684082, + "learning_rate": 1.9939523276533893e-05, + "loss": 1.3889, + "mean_token_accuracy": 0.6700173169374466, + "num_tokens": 200570651.0, + "step": 1200 + }, + { + "entropy": 1.7846981485684712, + "epoch": 0.13193815055889704, + "grad_norm": 0.646364688873291, + "learning_rate": 1.9939337831688544e-05, + "loss": 1.5205, + "mean_token_accuracy": 0.6368842373291651, + "num_tokens": 200748458.0, + "step": 1201 + }, + { + "entropy": 1.7097224394480388, + "epoch": 0.13204800747026998, + "grad_norm": 0.641553521156311, + "learning_rate": 1.993915210391718e-05, + "loss": 1.5153, + "mean_token_accuracy": 0.650927260518074, + "num_tokens": 200931234.0, + "step": 1202 + }, + { + "entropy": 1.7454093396663666, + "epoch": 0.1321578643816429, + "grad_norm": 0.8706952333450317, + "learning_rate": 1.9938966093225683e-05, + "loss": 1.3199, + "mean_token_accuracy": 0.6697538246711096, + "num_tokens": 201066141.0, + "step": 1203 + }, + { + "entropy": 1.7411855657895405, + "epoch": 0.13226772129301584, + "grad_norm": 0.6181418895721436, + "learning_rate": 1.993877979961993e-05, + "loss": 1.3747, + "mean_token_accuracy": 0.6520515978336334, + "num_tokens": 201257542.0, + "step": 1204 + }, + { + "entropy": 1.7680325210094452, + "epoch": 0.13237757820438878, + "grad_norm": 0.7731313109397888, + "learning_rate": 1.993859322310583e-05, + "loss": 1.5037, + "mean_token_accuracy": 0.6352165639400482, + "num_tokens": 201434347.0, + "step": 1205 + }, + { + "entropy": 1.710806429386139, + "epoch": 0.13248743511576172, + "grad_norm": 0.7883396148681641, + "learning_rate": 1.993840636368928e-05, + "loss": 1.4203, + "mean_token_accuracy": 0.6624042391777039, + "num_tokens": 201612352.0, + "step": 1206 + }, + { + "entropy": 1.7501719494660695, + "epoch": 0.13259729202713466, + "grad_norm": 0.6519463062286377, + "learning_rate": 1.9938219221376198e-05, + "loss": 1.343, + "mean_token_accuracy": 0.656904548406601, + "num_tokens": 201805098.0, + "step": 1207 + }, + { + "entropy": 1.6922560433546703, + "epoch": 0.1327071489385076, + "grad_norm": 0.6619210243225098, + "learning_rate": 1.9938031796172504e-05, + "loss": 1.4414, + "mean_token_accuracy": 0.6696604192256927, + "num_tokens": 201974466.0, + "step": 1208 + }, + { + "entropy": 1.7348144352436066, + "epoch": 0.13281700584988054, + "grad_norm": 0.7165763974189758, + "learning_rate": 1.993784408808413e-05, + "loss": 1.422, + "mean_token_accuracy": 0.6480233718951544, + "num_tokens": 202120925.0, + "step": 1209 + }, + { + "entropy": 1.6581164697806041, + "epoch": 0.13292686276125346, + "grad_norm": 0.6778578162193298, + "learning_rate": 1.993765609711702e-05, + "loss": 1.4294, + "mean_token_accuracy": 0.6534278045097986, + "num_tokens": 202282750.0, + "step": 1210 + }, + { + "entropy": 1.6809436281522114, + "epoch": 0.1330367196726264, + "grad_norm": 0.7304653525352478, + "learning_rate": 1.9937467823277122e-05, + "loss": 1.4323, + "mean_token_accuracy": 0.6725066850582758, + "num_tokens": 202462082.0, + "step": 1211 + }, + { + "entropy": 1.6749194264411926, + "epoch": 0.13314657658399934, + "grad_norm": 0.8034223914146423, + "learning_rate": 1.9937279266570395e-05, + "loss": 1.4945, + "mean_token_accuracy": 0.6492439856131872, + "num_tokens": 202629513.0, + "step": 1212 + }, + { + "entropy": 1.7574256658554077, + "epoch": 0.13325643349537228, + "grad_norm": 0.741016149520874, + "learning_rate": 1.9937090427002806e-05, + "loss": 1.3436, + "mean_token_accuracy": 0.6710640490055084, + "num_tokens": 202774386.0, + "step": 1213 + }, + { + "entropy": 1.7507571478684743, + "epoch": 0.13336629040674522, + "grad_norm": 0.7227981686592102, + "learning_rate": 1.993690130458033e-05, + "loss": 1.4202, + "mean_token_accuracy": 0.6539425303538641, + "num_tokens": 202919220.0, + "step": 1214 + }, + { + "entropy": 1.784160981575648, + "epoch": 0.13347614731811816, + "grad_norm": 0.7705137133598328, + "learning_rate": 1.993671189930896e-05, + "loss": 1.3467, + "mean_token_accuracy": 0.6598477313915888, + "num_tokens": 203027863.0, + "step": 1215 + }, + { + "entropy": 1.642378608385722, + "epoch": 0.1335860042294911, + "grad_norm": 0.5922091603279114, + "learning_rate": 1.993652221119468e-05, + "loss": 1.232, + "mean_token_accuracy": 0.6796272893746694, + "num_tokens": 203157757.0, + "step": 1216 + }, + { + "entropy": 1.8348711729049683, + "epoch": 0.13369586114086401, + "grad_norm": 0.6379344463348389, + "learning_rate": 1.9936332240243503e-05, + "loss": 1.516, + "mean_token_accuracy": 0.6400525023539861, + "num_tokens": 203348155.0, + "step": 1217 + }, + { + "entropy": 1.695642501115799, + "epoch": 0.13380571805223695, + "grad_norm": 0.6818593740463257, + "learning_rate": 1.9936141986461434e-05, + "loss": 1.3798, + "mean_token_accuracy": 0.6709787100553513, + "num_tokens": 203499917.0, + "step": 1218 + }, + { + "entropy": 1.7197688619295757, + "epoch": 0.1339155749636099, + "grad_norm": 0.6506553292274475, + "learning_rate": 1.9935951449854502e-05, + "loss": 1.5437, + "mean_token_accuracy": 0.6586721042792002, + "num_tokens": 203678606.0, + "step": 1219 + }, + { + "entropy": 1.6951692700386047, + "epoch": 0.13402543187498284, + "grad_norm": 0.7173891067504883, + "learning_rate": 1.993576063042873e-05, + "loss": 1.3165, + "mean_token_accuracy": 0.6613740225632986, + "num_tokens": 203802059.0, + "step": 1220 + }, + { + "entropy": 1.7536791861057281, + "epoch": 0.13413528878635578, + "grad_norm": 0.8501359820365906, + "learning_rate": 1.993556952819016e-05, + "loss": 1.3935, + "mean_token_accuracy": 0.6629123538732529, + "num_tokens": 203961628.0, + "step": 1221 + }, + { + "entropy": 1.7680688103040059, + "epoch": 0.13424514569772872, + "grad_norm": 0.5954765677452087, + "learning_rate": 1.993537814314484e-05, + "loss": 1.386, + "mean_token_accuracy": 0.660725419720014, + "num_tokens": 204138349.0, + "step": 1222 + }, + { + "entropy": 1.8008166750272114, + "epoch": 0.13435500260910163, + "grad_norm": 0.7124298214912415, + "learning_rate": 1.993518647529883e-05, + "loss": 1.5329, + "mean_token_accuracy": 0.6388672788937887, + "num_tokens": 204301649.0, + "step": 1223 + }, + { + "entropy": 1.780760755141576, + "epoch": 0.13446485952047457, + "grad_norm": 0.6755843162536621, + "learning_rate": 1.9934994524658196e-05, + "loss": 1.3597, + "mean_token_accuracy": 0.6550761957963308, + "num_tokens": 204474630.0, + "step": 1224 + }, + { + "entropy": 1.7628767291704814, + "epoch": 0.1345747164318475, + "grad_norm": 0.6415227651596069, + "learning_rate": 1.993480229122901e-05, + "loss": 1.5455, + "mean_token_accuracy": 0.6402927239735922, + "num_tokens": 204680151.0, + "step": 1225 + }, + { + "entropy": 1.723549763361613, + "epoch": 0.13468457334322045, + "grad_norm": 0.605971097946167, + "learning_rate": 1.9934609775017357e-05, + "loss": 1.5025, + "mean_token_accuracy": 0.6315073023239771, + "num_tokens": 204876536.0, + "step": 1226 + }, + { + "entropy": 1.711145242055257, + "epoch": 0.1347944302545934, + "grad_norm": 0.696202278137207, + "learning_rate": 1.993441697602933e-05, + "loss": 1.4026, + "mean_token_accuracy": 0.6470880806446075, + "num_tokens": 205090640.0, + "step": 1227 + }, + { + "entropy": 1.7900481621424358, + "epoch": 0.13490428716596634, + "grad_norm": 0.7666972875595093, + "learning_rate": 1.9934223894271035e-05, + "loss": 1.4558, + "mean_token_accuracy": 0.6359467854102453, + "num_tokens": 205354906.0, + "step": 1228 + }, + { + "entropy": 1.7724877794583638, + "epoch": 0.13501414407733928, + "grad_norm": 0.6311783790588379, + "learning_rate": 1.993403052974858e-05, + "loss": 1.444, + "mean_token_accuracy": 0.6428662339846293, + "num_tokens": 205529189.0, + "step": 1229 + }, + { + "entropy": 1.7515977422396343, + "epoch": 0.1351240009887122, + "grad_norm": 0.706529974937439, + "learning_rate": 1.993383688246808e-05, + "loss": 1.3015, + "mean_token_accuracy": 0.6625747283299764, + "num_tokens": 205683718.0, + "step": 1230 + }, + { + "entropy": 1.7582630415757496, + "epoch": 0.13523385790008513, + "grad_norm": 0.6772528886795044, + "learning_rate": 1.993364295243567e-05, + "loss": 1.405, + "mean_token_accuracy": 0.6459440638621649, + "num_tokens": 205812951.0, + "step": 1231 + }, + { + "entropy": 1.7285764515399933, + "epoch": 0.13534371481145807, + "grad_norm": 0.6248936057090759, + "learning_rate": 1.9933448739657487e-05, + "loss": 1.3699, + "mean_token_accuracy": 0.6622784286737442, + "num_tokens": 205944308.0, + "step": 1232 + }, + { + "entropy": 1.6804889539877574, + "epoch": 0.135453571722831, + "grad_norm": 0.8179889917373657, + "learning_rate": 1.9933254244139675e-05, + "loss": 1.341, + "mean_token_accuracy": 0.6590113043785095, + "num_tokens": 206136941.0, + "step": 1233 + }, + { + "entropy": 1.7611981630325317, + "epoch": 0.13556342863420395, + "grad_norm": 0.7938576340675354, + "learning_rate": 1.9933059465888394e-05, + "loss": 1.4829, + "mean_token_accuracy": 0.6553379346927007, + "num_tokens": 206305282.0, + "step": 1234 + }, + { + "entropy": 1.7320065299669902, + "epoch": 0.1356732855455769, + "grad_norm": 0.6579363942146301, + "learning_rate": 1.9932864404909808e-05, + "loss": 1.399, + "mean_token_accuracy": 0.6568387846151987, + "num_tokens": 206530940.0, + "step": 1235 + }, + { + "entropy": 1.8249189853668213, + "epoch": 0.13578314245694983, + "grad_norm": 0.6946649551391602, + "learning_rate": 1.9932669061210082e-05, + "loss": 1.4977, + "mean_token_accuracy": 0.6561338355143865, + "num_tokens": 206728889.0, + "step": 1236 + }, + { + "entropy": 1.7712652484575908, + "epoch": 0.13589299936832275, + "grad_norm": 0.8788438439369202, + "learning_rate": 1.993247343479541e-05, + "loss": 1.2986, + "mean_token_accuracy": 0.6657395313183466, + "num_tokens": 206838081.0, + "step": 1237 + }, + { + "entropy": 1.7870614627997081, + "epoch": 0.1360028562796957, + "grad_norm": 0.9152439832687378, + "learning_rate": 1.993227752567198e-05, + "loss": 1.4701, + "mean_token_accuracy": 0.648246243596077, + "num_tokens": 206974371.0, + "step": 1238 + }, + { + "entropy": 1.7998821139335632, + "epoch": 0.13611271319106863, + "grad_norm": 0.6228598952293396, + "learning_rate": 1.9932081333845988e-05, + "loss": 1.3967, + "mean_token_accuracy": 0.6436318705479304, + "num_tokens": 207125473.0, + "step": 1239 + }, + { + "entropy": 1.7932134866714478, + "epoch": 0.13622257010244157, + "grad_norm": 0.8686763048171997, + "learning_rate": 1.993188485932365e-05, + "loss": 1.6097, + "mean_token_accuracy": 0.6425013393163681, + "num_tokens": 207272468.0, + "step": 1240 + }, + { + "entropy": 1.7816320955753326, + "epoch": 0.1363324270138145, + "grad_norm": 0.8052454590797424, + "learning_rate": 1.993168810211118e-05, + "loss": 1.2604, + "mean_token_accuracy": 0.6674903134504954, + "num_tokens": 207394408.0, + "step": 1241 + }, + { + "entropy": 1.7591705024242401, + "epoch": 0.13644228392518745, + "grad_norm": 0.8733900189399719, + "learning_rate": 1.9931491062214806e-05, + "loss": 1.326, + "mean_token_accuracy": 0.670804500579834, + "num_tokens": 207520568.0, + "step": 1242 + }, + { + "entropy": 1.7159309188524883, + "epoch": 0.1365521408365604, + "grad_norm": 0.6903548240661621, + "learning_rate": 1.993129373964076e-05, + "loss": 1.4192, + "mean_token_accuracy": 0.668173685669899, + "num_tokens": 207663994.0, + "step": 1243 + }, + { + "entropy": 1.7172527611255646, + "epoch": 0.1366619977479333, + "grad_norm": 0.8254104256629944, + "learning_rate": 1.9931096134395298e-05, + "loss": 1.366, + "mean_token_accuracy": 0.664167582988739, + "num_tokens": 207861655.0, + "step": 1244 + }, + { + "entropy": 1.7482962310314178, + "epoch": 0.13677185465930625, + "grad_norm": 0.805140495300293, + "learning_rate": 1.9930898246484664e-05, + "loss": 1.4019, + "mean_token_accuracy": 0.6474734991788864, + "num_tokens": 208035591.0, + "step": 1245 + }, + { + "entropy": 1.7081841230392456, + "epoch": 0.1368817115706792, + "grad_norm": 0.7142578959465027, + "learning_rate": 1.9930700075915127e-05, + "loss": 1.4685, + "mean_token_accuracy": 0.6366176108519236, + "num_tokens": 208215389.0, + "step": 1246 + }, + { + "entropy": 1.7701348463694255, + "epoch": 0.13699156848205213, + "grad_norm": 0.6305694580078125, + "learning_rate": 1.9930501622692955e-05, + "loss": 1.4108, + "mean_token_accuracy": 0.6463577598333359, + "num_tokens": 208432404.0, + "step": 1247 + }, + { + "entropy": 1.805136779944102, + "epoch": 0.13710142539342507, + "grad_norm": 0.9272505640983582, + "learning_rate": 1.9930302886824434e-05, + "loss": 1.4094, + "mean_token_accuracy": 0.641325443983078, + "num_tokens": 208584833.0, + "step": 1248 + }, + { + "entropy": 1.760273923476537, + "epoch": 0.137211282304798, + "grad_norm": 0.6633432507514954, + "learning_rate": 1.9930103868315845e-05, + "loss": 1.3002, + "mean_token_accuracy": 0.6639900704224905, + "num_tokens": 208717845.0, + "step": 1249 + }, + { + "entropy": 1.688180943330129, + "epoch": 0.13732113921617095, + "grad_norm": 0.662477433681488, + "learning_rate": 1.99299045671735e-05, + "loss": 1.5009, + "mean_token_accuracy": 0.6593229522307714, + "num_tokens": 208926014.0, + "step": 1250 + }, + { + "entropy": 1.8261633117993672, + "epoch": 0.13743099612754386, + "grad_norm": 0.8224329948425293, + "learning_rate": 1.9929704983403694e-05, + "loss": 1.2925, + "mean_token_accuracy": 0.6615334004163742, + "num_tokens": 209031355.0, + "step": 1251 + }, + { + "entropy": 1.7323819696903229, + "epoch": 0.1375408530389168, + "grad_norm": 0.5979375243186951, + "learning_rate": 1.9929505117012753e-05, + "loss": 1.5184, + "mean_token_accuracy": 0.6293915957212448, + "num_tokens": 209224580.0, + "step": 1252 + }, + { + "entropy": 1.6968580385049183, + "epoch": 0.13765070995028975, + "grad_norm": 0.6671169996261597, + "learning_rate": 1.9929304968006996e-05, + "loss": 1.3719, + "mean_token_accuracy": 0.6572729150454203, + "num_tokens": 209431303.0, + "step": 1253 + }, + { + "entropy": 1.7308607598145802, + "epoch": 0.1377605668616627, + "grad_norm": 0.7142148017883301, + "learning_rate": 1.992910453639276e-05, + "loss": 1.4229, + "mean_token_accuracy": 0.6546590526898702, + "num_tokens": 209596420.0, + "step": 1254 + }, + { + "entropy": 1.787158230940501, + "epoch": 0.13787042377303563, + "grad_norm": 0.6323195695877075, + "learning_rate": 1.9928903822176392e-05, + "loss": 1.5243, + "mean_token_accuracy": 0.6278480341037115, + "num_tokens": 209779786.0, + "step": 1255 + }, + { + "entropy": 1.6387408177057903, + "epoch": 0.13798028068440857, + "grad_norm": 3.0804104804992676, + "learning_rate": 1.992870282536424e-05, + "loss": 1.4143, + "mean_token_accuracy": 0.6523379882176717, + "num_tokens": 209938901.0, + "step": 1256 + }, + { + "entropy": 1.7122711837291718, + "epoch": 0.13809013759578148, + "grad_norm": 0.669571042060852, + "learning_rate": 1.9928501545962666e-05, + "loss": 1.3667, + "mean_token_accuracy": 0.6676834921042124, + "num_tokens": 210133192.0, + "step": 1257 + }, + { + "entropy": 1.8118088046709697, + "epoch": 0.13819999450715442, + "grad_norm": 0.7653446197509766, + "learning_rate": 1.992829998397804e-05, + "loss": 1.5617, + "mean_token_accuracy": 0.6499693269530932, + "num_tokens": 210305749.0, + "step": 1258 + }, + { + "entropy": 1.7411625186602275, + "epoch": 0.13830985141852736, + "grad_norm": 0.7534335255622864, + "learning_rate": 1.9928098139416745e-05, + "loss": 1.457, + "mean_token_accuracy": 0.6617699215809504, + "num_tokens": 210448909.0, + "step": 1259 + }, + { + "entropy": 1.742474267880122, + "epoch": 0.1384197083299003, + "grad_norm": 0.6153781414031982, + "learning_rate": 1.9927896012285168e-05, + "loss": 1.4539, + "mean_token_accuracy": 0.6413547496000925, + "num_tokens": 210620811.0, + "step": 1260 + }, + { + "entropy": 1.7018092572689056, + "epoch": 0.13852956524127324, + "grad_norm": 0.6760329008102417, + "learning_rate": 1.99276936025897e-05, + "loss": 1.3766, + "mean_token_accuracy": 0.6578912138938904, + "num_tokens": 210781045.0, + "step": 1261 + }, + { + "entropy": 1.7072451611359913, + "epoch": 0.13863942215264619, + "grad_norm": 0.6856552958488464, + "learning_rate": 1.992749091033676e-05, + "loss": 1.3279, + "mean_token_accuracy": 0.6731646209955215, + "num_tokens": 210906125.0, + "step": 1262 + }, + { + "entropy": 1.758314996957779, + "epoch": 0.13874927906401913, + "grad_norm": 0.7295485138893127, + "learning_rate": 1.9927287935532748e-05, + "loss": 1.6247, + "mean_token_accuracy": 0.6324874858061472, + "num_tokens": 211094941.0, + "step": 1263 + }, + { + "entropy": 1.7821235358715057, + "epoch": 0.13885913597539204, + "grad_norm": 0.7965490221977234, + "learning_rate": 1.99270846781841e-05, + "loss": 1.4094, + "mean_token_accuracy": 0.654320701956749, + "num_tokens": 211215450.0, + "step": 1264 + }, + { + "entropy": 1.7328162292639415, + "epoch": 0.13896899288676498, + "grad_norm": 0.7370543479919434, + "learning_rate": 1.9926881138297246e-05, + "loss": 1.4465, + "mean_token_accuracy": 0.6546304225921631, + "num_tokens": 211399399.0, + "step": 1265 + }, + { + "entropy": 1.7225729425748189, + "epoch": 0.13907884979813792, + "grad_norm": 0.7675600051879883, + "learning_rate": 1.9926677315878624e-05, + "loss": 1.2386, + "mean_token_accuracy": 0.6786759148041407, + "num_tokens": 211565381.0, + "step": 1266 + }, + { + "entropy": 1.8305182953675587, + "epoch": 0.13918870670951086, + "grad_norm": 0.710245668888092, + "learning_rate": 1.9926473210934686e-05, + "loss": 1.5366, + "mean_token_accuracy": 0.6416638592878977, + "num_tokens": 211706810.0, + "step": 1267 + }, + { + "entropy": 1.773509681224823, + "epoch": 0.1392985636208838, + "grad_norm": 0.6955118775367737, + "learning_rate": 1.9926268823471894e-05, + "loss": 1.4023, + "mean_token_accuracy": 0.6475071211655935, + "num_tokens": 211901076.0, + "step": 1268 + }, + { + "entropy": 1.7048786679903667, + "epoch": 0.13940842053225674, + "grad_norm": 0.6570739150047302, + "learning_rate": 1.992606415349672e-05, + "loss": 1.3676, + "mean_token_accuracy": 0.6701171100139618, + "num_tokens": 212066863.0, + "step": 1269 + }, + { + "entropy": 1.7483859062194824, + "epoch": 0.13951827744362968, + "grad_norm": 0.6274306774139404, + "learning_rate": 1.9925859201015633e-05, + "loss": 1.4419, + "mean_token_accuracy": 0.649980386098226, + "num_tokens": 212264417.0, + "step": 1270 + }, + { + "entropy": 1.6661617755889893, + "epoch": 0.1396281343550026, + "grad_norm": 0.8265875577926636, + "learning_rate": 1.9925653966035126e-05, + "loss": 1.1776, + "mean_token_accuracy": 0.6817067364851633, + "num_tokens": 212372748.0, + "step": 1271 + }, + { + "entropy": 1.7644007603327434, + "epoch": 0.13973799126637554, + "grad_norm": 0.653523862361908, + "learning_rate": 1.992544844856169e-05, + "loss": 1.3228, + "mean_token_accuracy": 0.6687373667955399, + "num_tokens": 212509269.0, + "step": 1272 + }, + { + "entropy": 1.750530868768692, + "epoch": 0.13984784817774848, + "grad_norm": 0.7181108593940735, + "learning_rate": 1.9925242648601837e-05, + "loss": 1.5806, + "mean_token_accuracy": 0.6340227698286375, + "num_tokens": 212692270.0, + "step": 1273 + }, + { + "entropy": 1.7549054125944774, + "epoch": 0.13995770508912142, + "grad_norm": 0.7157692909240723, + "learning_rate": 1.992503656616208e-05, + "loss": 1.3865, + "mean_token_accuracy": 0.6512027333180109, + "num_tokens": 212868340.0, + "step": 1274 + }, + { + "entropy": 1.7521459460258484, + "epoch": 0.14006756200049436, + "grad_norm": 0.8038479089736938, + "learning_rate": 1.9924830201248928e-05, + "loss": 1.3245, + "mean_token_accuracy": 0.6748414585987726, + "num_tokens": 212980640.0, + "step": 1275 + }, + { + "entropy": 1.7466616133848827, + "epoch": 0.1401774189118673, + "grad_norm": 0.6825430989265442, + "learning_rate": 1.9924623553868927e-05, + "loss": 1.3675, + "mean_token_accuracy": 0.6585159202416738, + "num_tokens": 213130813.0, + "step": 1276 + }, + { + "entropy": 1.7830199599266052, + "epoch": 0.14028727582324024, + "grad_norm": 0.7503454089164734, + "learning_rate": 1.992441662402861e-05, + "loss": 1.3358, + "mean_token_accuracy": 0.6596743067105612, + "num_tokens": 213304286.0, + "step": 1277 + }, + { + "entropy": 1.7447825769583385, + "epoch": 0.14039713273461316, + "grad_norm": 0.7130751013755798, + "learning_rate": 1.9924209411734526e-05, + "loss": 1.5629, + "mean_token_accuracy": 0.6291346848011017, + "num_tokens": 213532550.0, + "step": 1278 + }, + { + "entropy": 1.8167424301306407, + "epoch": 0.1405069896459861, + "grad_norm": 0.7561038732528687, + "learning_rate": 1.9924001916993238e-05, + "loss": 1.4062, + "mean_token_accuracy": 0.6401105572779974, + "num_tokens": 213737731.0, + "step": 1279 + }, + { + "entropy": 1.7469572722911835, + "epoch": 0.14061684655735904, + "grad_norm": 0.7100579142570496, + "learning_rate": 1.9923794139811313e-05, + "loss": 1.4572, + "mean_token_accuracy": 0.6476842015981674, + "num_tokens": 213912106.0, + "step": 1280 + }, + { + "entropy": 1.7979302604993184, + "epoch": 0.14072670346873198, + "grad_norm": 0.6927300691604614, + "learning_rate": 1.9923586080195323e-05, + "loss": 1.36, + "mean_token_accuracy": 0.6531442552804947, + "num_tokens": 214069371.0, + "step": 1281 + }, + { + "entropy": 1.6450412273406982, + "epoch": 0.14083656038010492, + "grad_norm": 0.689548671245575, + "learning_rate": 1.9923377738151856e-05, + "loss": 1.2596, + "mean_token_accuracy": 0.6867117385069529, + "num_tokens": 214201260.0, + "step": 1282 + }, + { + "entropy": 1.762027770280838, + "epoch": 0.14094641729147786, + "grad_norm": 0.709928572177887, + "learning_rate": 1.9923169113687503e-05, + "loss": 1.5263, + "mean_token_accuracy": 0.6340511639912924, + "num_tokens": 214406160.0, + "step": 1283 + }, + { + "entropy": 1.7095829248428345, + "epoch": 0.14105627420285077, + "grad_norm": 8.520343780517578, + "learning_rate": 1.9922960206808867e-05, + "loss": 1.0589, + "mean_token_accuracy": 0.6897023518880209, + "num_tokens": 214592081.0, + "step": 1284 + }, + { + "entropy": 1.7550367613633473, + "epoch": 0.14116613111422371, + "grad_norm": 0.6775065064430237, + "learning_rate": 1.992275101752256e-05, + "loss": 1.4911, + "mean_token_accuracy": 0.6530732462803522, + "num_tokens": 214764303.0, + "step": 1285 + }, + { + "entropy": 1.7507551113764446, + "epoch": 0.14127598802559665, + "grad_norm": 0.6907753944396973, + "learning_rate": 1.992254154583521e-05, + "loss": 1.4727, + "mean_token_accuracy": 0.6412150710821152, + "num_tokens": 214920143.0, + "step": 1286 + }, + { + "entropy": 1.7401640017827351, + "epoch": 0.1413858449369696, + "grad_norm": 0.7834397554397583, + "learning_rate": 1.9922331791753435e-05, + "loss": 1.3464, + "mean_token_accuracy": 0.6618963032960892, + "num_tokens": 215046344.0, + "step": 1287 + }, + { + "entropy": 1.7570099631945293, + "epoch": 0.14149570184834254, + "grad_norm": 0.70011967420578, + "learning_rate": 1.992212175528388e-05, + "loss": 1.3707, + "mean_token_accuracy": 0.6539260894060135, + "num_tokens": 215180311.0, + "step": 1288 + }, + { + "entropy": 1.6938535173734028, + "epoch": 0.14160555875971548, + "grad_norm": 0.7015315294265747, + "learning_rate": 1.9921911436433194e-05, + "loss": 1.5209, + "mean_token_accuracy": 0.6413632233937582, + "num_tokens": 215372204.0, + "step": 1289 + }, + { + "entropy": 1.7938569088776906, + "epoch": 0.14171541567108842, + "grad_norm": 0.6672449707984924, + "learning_rate": 1.992170083520803e-05, + "loss": 1.4559, + "mean_token_accuracy": 0.6347246567408243, + "num_tokens": 215554411.0, + "step": 1290 + }, + { + "entropy": 1.7035801708698273, + "epoch": 0.14182527258246133, + "grad_norm": 0.8814971446990967, + "learning_rate": 1.9921489951615057e-05, + "loss": 1.3534, + "mean_token_accuracy": 0.6513211578130722, + "num_tokens": 215689080.0, + "step": 1291 + }, + { + "entropy": 1.7192172209421794, + "epoch": 0.14193512949383427, + "grad_norm": 0.690263569355011, + "learning_rate": 1.9921278785660946e-05, + "loss": 1.4102, + "mean_token_accuracy": 0.6569246202707291, + "num_tokens": 215849803.0, + "step": 1292 + }, + { + "entropy": 1.7479885419209797, + "epoch": 0.1420449864052072, + "grad_norm": 0.6556616425514221, + "learning_rate": 1.9921067337352384e-05, + "loss": 1.4046, + "mean_token_accuracy": 0.6546976268291473, + "num_tokens": 216019645.0, + "step": 1293 + }, + { + "entropy": 1.7502675553162892, + "epoch": 0.14215484331658015, + "grad_norm": 0.7258479595184326, + "learning_rate": 1.9920855606696054e-05, + "loss": 1.5266, + "mean_token_accuracy": 0.6518742392460505, + "num_tokens": 216187505.0, + "step": 1294 + }, + { + "entropy": 1.7436367273330688, + "epoch": 0.1422647002279531, + "grad_norm": 0.8124620914459229, + "learning_rate": 1.992064359369867e-05, + "loss": 1.3587, + "mean_token_accuracy": 0.6601897577444712, + "num_tokens": 216323904.0, + "step": 1295 + }, + { + "entropy": 1.706505278746287, + "epoch": 0.14237455713932604, + "grad_norm": 0.7671903371810913, + "learning_rate": 1.992043129836693e-05, + "loss": 1.2957, + "mean_token_accuracy": 0.6700823853413264, + "num_tokens": 216450218.0, + "step": 1296 + }, + { + "entropy": 1.6708903312683105, + "epoch": 0.14248441405069898, + "grad_norm": 0.6247614026069641, + "learning_rate": 1.9920218720707563e-05, + "loss": 1.2455, + "mean_token_accuracy": 0.683778112133344, + "num_tokens": 216583167.0, + "step": 1297 + }, + { + "entropy": 1.755852033694585, + "epoch": 0.1425942709620719, + "grad_norm": 0.8267444968223572, + "learning_rate": 1.992000586072729e-05, + "loss": 1.2248, + "mean_token_accuracy": 0.6767070343097051, + "num_tokens": 216706610.0, + "step": 1298 + }, + { + "entropy": 1.7532562911510468, + "epoch": 0.14270412787344483, + "grad_norm": 0.7160501480102539, + "learning_rate": 1.9919792718432858e-05, + "loss": 1.4451, + "mean_token_accuracy": 0.6385903209447861, + "num_tokens": 216851311.0, + "step": 1299 + }, + { + "entropy": 1.7782372931639354, + "epoch": 0.14281398478481777, + "grad_norm": 0.8023732900619507, + "learning_rate": 1.9919579293831e-05, + "loss": 1.4043, + "mean_token_accuracy": 0.6392123301823934, + "num_tokens": 217002266.0, + "step": 1300 + }, + { + "entropy": 1.7266008655230205, + "epoch": 0.1429238416961907, + "grad_norm": 0.706461489200592, + "learning_rate": 1.9919365586928477e-05, + "loss": 1.431, + "mean_token_accuracy": 0.6418863981962204, + "num_tokens": 217151987.0, + "step": 1301 + }, + { + "entropy": 1.6731161773204803, + "epoch": 0.14303369860756365, + "grad_norm": 0.5926313400268555, + "learning_rate": 1.9919151597732055e-05, + "loss": 1.5354, + "mean_token_accuracy": 0.6458988140026728, + "num_tokens": 217370579.0, + "step": 1302 + }, + { + "entropy": 1.7368605931599934, + "epoch": 0.1431435555189366, + "grad_norm": 0.7459754943847656, + "learning_rate": 1.9918937326248503e-05, + "loss": 1.3562, + "mean_token_accuracy": 0.6788886686166128, + "num_tokens": 217495749.0, + "step": 1303 + }, + { + "entropy": 1.77315154671669, + "epoch": 0.14325341243030953, + "grad_norm": 0.9817308187484741, + "learning_rate": 1.99187227724846e-05, + "loss": 1.4833, + "mean_token_accuracy": 0.6524508446455002, + "num_tokens": 217617122.0, + "step": 1304 + }, + { + "entropy": 1.6556137005488079, + "epoch": 0.14336326934168245, + "grad_norm": 0.6126701235771179, + "learning_rate": 1.9918507936447146e-05, + "loss": 1.3457, + "mean_token_accuracy": 0.657344122727712, + "num_tokens": 217822669.0, + "step": 1305 + }, + { + "entropy": 1.7017977635065715, + "epoch": 0.1434731262530554, + "grad_norm": 0.8157427906990051, + "learning_rate": 1.9918292818142934e-05, + "loss": 1.3017, + "mean_token_accuracy": 0.6645906120538712, + "num_tokens": 217935144.0, + "step": 1306 + }, + { + "entropy": 1.6964992980162303, + "epoch": 0.14358298316442833, + "grad_norm": 0.664656400680542, + "learning_rate": 1.9918077417578768e-05, + "loss": 1.3651, + "mean_token_accuracy": 0.6545301824808121, + "num_tokens": 218130755.0, + "step": 1307 + }, + { + "entropy": 1.7069012820720673, + "epoch": 0.14369284007580127, + "grad_norm": 0.5952320694923401, + "learning_rate": 1.9917861734761476e-05, + "loss": 1.3134, + "mean_token_accuracy": 0.6620885580778122, + "num_tokens": 218293252.0, + "step": 1308 + }, + { + "entropy": 1.7563343445460002, + "epoch": 0.1438026969871742, + "grad_norm": 0.710450291633606, + "learning_rate": 1.9917645769697874e-05, + "loss": 1.2887, + "mean_token_accuracy": 0.6759348313013712, + "num_tokens": 218437112.0, + "step": 1309 + }, + { + "entropy": 1.6977879603703816, + "epoch": 0.14391255389854715, + "grad_norm": 0.5810772776603699, + "learning_rate": 1.99174295223948e-05, + "loss": 1.4859, + "mean_token_accuracy": 0.6420779774586359, + "num_tokens": 218652082.0, + "step": 1310 + }, + { + "entropy": 1.729242612918218, + "epoch": 0.1440224108099201, + "grad_norm": 0.6845049858093262, + "learning_rate": 1.9917212992859104e-05, + "loss": 1.5988, + "mean_token_accuracy": 0.6266419490178426, + "num_tokens": 218867489.0, + "step": 1311 + }, + { + "entropy": 1.7926994264125824, + "epoch": 0.144132267721293, + "grad_norm": 0.6462848782539368, + "learning_rate": 1.9916996181097635e-05, + "loss": 1.4226, + "mean_token_accuracy": 0.6535110026597977, + "num_tokens": 219003067.0, + "step": 1312 + }, + { + "entropy": 1.7463144659996033, + "epoch": 0.14424212463266595, + "grad_norm": 0.7083527445793152, + "learning_rate": 1.9916779087117255e-05, + "loss": 1.4073, + "mean_token_accuracy": 0.654004101951917, + "num_tokens": 219137606.0, + "step": 1313 + }, + { + "entropy": 1.7422666052977245, + "epoch": 0.1443519815440389, + "grad_norm": 0.6640010476112366, + "learning_rate": 1.9916561710924834e-05, + "loss": 1.4153, + "mean_token_accuracy": 0.6418075213829676, + "num_tokens": 219327443.0, + "step": 1314 + }, + { + "entropy": 1.7220148543516796, + "epoch": 0.14446183845541183, + "grad_norm": 0.7796133756637573, + "learning_rate": 1.9916344052527252e-05, + "loss": 1.2958, + "mean_token_accuracy": 0.6594913254181544, + "num_tokens": 219472432.0, + "step": 1315 + }, + { + "entropy": 1.7227765917778015, + "epoch": 0.14457169536678477, + "grad_norm": 0.8143858909606934, + "learning_rate": 1.99161261119314e-05, + "loss": 1.5051, + "mean_token_accuracy": 0.6485525220632553, + "num_tokens": 219634962.0, + "step": 1316 + }, + { + "entropy": 1.687673379977544, + "epoch": 0.1446815522781577, + "grad_norm": 0.7880772352218628, + "learning_rate": 1.9915907889144175e-05, + "loss": 1.2464, + "mean_token_accuracy": 0.6796143154303232, + "num_tokens": 219760215.0, + "step": 1317 + }, + { + "entropy": 1.69181493918101, + "epoch": 0.14479140918953062, + "grad_norm": 0.6202834248542786, + "learning_rate": 1.991568938417248e-05, + "loss": 1.3949, + "mean_token_accuracy": 0.6500192880630493, + "num_tokens": 220040624.0, + "step": 1318 + }, + { + "entropy": 1.7221740285555522, + "epoch": 0.14490126610090356, + "grad_norm": 0.6479632258415222, + "learning_rate": 1.9915470597023235e-05, + "loss": 1.4433, + "mean_token_accuracy": 0.6498556931813558, + "num_tokens": 220187439.0, + "step": 1319 + }, + { + "entropy": 1.8096940517425537, + "epoch": 0.1450111230122765, + "grad_norm": 0.8540252447128296, + "learning_rate": 1.9915251527703364e-05, + "loss": 1.4752, + "mean_token_accuracy": 0.6390475134054819, + "num_tokens": 220368796.0, + "step": 1320 + }, + { + "entropy": 1.7524564564228058, + "epoch": 0.14512097992364945, + "grad_norm": 0.6428526043891907, + "learning_rate": 1.9915032176219796e-05, + "loss": 1.4522, + "mean_token_accuracy": 0.638989175359408, + "num_tokens": 220553587.0, + "step": 1321 + }, + { + "entropy": 1.7355947295824687, + "epoch": 0.1452308368350224, + "grad_norm": 0.6749572157859802, + "learning_rate": 1.991481254257948e-05, + "loss": 1.4713, + "mean_token_accuracy": 0.6340489635864893, + "num_tokens": 220841767.0, + "step": 1322 + }, + { + "entropy": 1.7015680869420369, + "epoch": 0.14534069374639533, + "grad_norm": 0.6925226449966431, + "learning_rate": 1.9914592626789364e-05, + "loss": 1.5523, + "mean_token_accuracy": 0.6562323371569315, + "num_tokens": 221024974.0, + "step": 1323 + }, + { + "entropy": 1.712952087322871, + "epoch": 0.14545055065776827, + "grad_norm": 0.6849307417869568, + "learning_rate": 1.9914372428856407e-05, + "loss": 1.3889, + "mean_token_accuracy": 0.652979368964831, + "num_tokens": 221204468.0, + "step": 1324 + }, + { + "entropy": 1.6939956446488698, + "epoch": 0.14556040756914118, + "grad_norm": 0.877672553062439, + "learning_rate": 1.991415194878758e-05, + "loss": 1.2828, + "mean_token_accuracy": 0.6566557884216309, + "num_tokens": 221341403.0, + "step": 1325 + }, + { + "entropy": 1.674074947834015, + "epoch": 0.14567026448051412, + "grad_norm": 0.655870795249939, + "learning_rate": 1.9913931186589863e-05, + "loss": 1.3431, + "mean_token_accuracy": 0.6632597943147024, + "num_tokens": 221473989.0, + "step": 1326 + }, + { + "entropy": 1.8000195523103077, + "epoch": 0.14578012139188706, + "grad_norm": 0.6739341616630554, + "learning_rate": 1.991371014227024e-05, + "loss": 1.4806, + "mean_token_accuracy": 0.6396994342406591, + "num_tokens": 221646735.0, + "step": 1327 + }, + { + "entropy": 1.7230163713296254, + "epoch": 0.14588997830326, + "grad_norm": 0.6686187386512756, + "learning_rate": 1.9913488815835703e-05, + "loss": 1.3263, + "mean_token_accuracy": 0.65727499127388, + "num_tokens": 221804801.0, + "step": 1328 + }, + { + "entropy": 1.7675328155358632, + "epoch": 0.14599983521463294, + "grad_norm": 0.7255124449729919, + "learning_rate": 1.9913267207293266e-05, + "loss": 1.4374, + "mean_token_accuracy": 0.665775845448176, + "num_tokens": 221967384.0, + "step": 1329 + }, + { + "entropy": 1.689390778541565, + "epoch": 0.14610969212600589, + "grad_norm": 0.6025556921958923, + "learning_rate": 1.991304531664994e-05, + "loss": 1.3426, + "mean_token_accuracy": 0.6495958268642426, + "num_tokens": 222153590.0, + "step": 1330 + }, + { + "entropy": 1.7810916304588318, + "epoch": 0.14621954903737883, + "grad_norm": 0.7592765688896179, + "learning_rate": 1.991282314391274e-05, + "loss": 1.4854, + "mean_token_accuracy": 0.6385663896799088, + "num_tokens": 222342691.0, + "step": 1331 + }, + { + "entropy": 1.7684324781099956, + "epoch": 0.14632940594875174, + "grad_norm": 0.6830516457557678, + "learning_rate": 1.9912600689088706e-05, + "loss": 1.4813, + "mean_token_accuracy": 0.6428666114807129, + "num_tokens": 222508014.0, + "step": 1332 + }, + { + "entropy": 1.7351989547411601, + "epoch": 0.14643926286012468, + "grad_norm": 0.632705569267273, + "learning_rate": 1.9912377952184877e-05, + "loss": 1.3404, + "mean_token_accuracy": 0.6745259712139765, + "num_tokens": 222669382.0, + "step": 1333 + }, + { + "entropy": 1.7773430248101552, + "epoch": 0.14654911977149762, + "grad_norm": 0.7237338423728943, + "learning_rate": 1.9912154933208304e-05, + "loss": 1.3179, + "mean_token_accuracy": 0.669169470667839, + "num_tokens": 222795017.0, + "step": 1334 + }, + { + "entropy": 1.718500792980194, + "epoch": 0.14665897668287056, + "grad_norm": 0.756379246711731, + "learning_rate": 1.991193163216604e-05, + "loss": 1.3977, + "mean_token_accuracy": 0.6572215805451075, + "num_tokens": 222920812.0, + "step": 1335 + }, + { + "entropy": 1.7016732394695282, + "epoch": 0.1467688335942435, + "grad_norm": 0.6574013829231262, + "learning_rate": 1.9911708049065156e-05, + "loss": 1.4359, + "mean_token_accuracy": 0.6623116632302603, + "num_tokens": 223102526.0, + "step": 1336 + }, + { + "entropy": 1.7057139774163563, + "epoch": 0.14687869050561644, + "grad_norm": 0.6829228401184082, + "learning_rate": 1.991148418391273e-05, + "loss": 1.4427, + "mean_token_accuracy": 0.6601560066143671, + "num_tokens": 223284435.0, + "step": 1337 + }, + { + "entropy": 1.7417578597863514, + "epoch": 0.14698854741698938, + "grad_norm": 0.7731585502624512, + "learning_rate": 1.9911260036715847e-05, + "loss": 1.2842, + "mean_token_accuracy": 0.6613179345925649, + "num_tokens": 223436682.0, + "step": 1338 + }, + { + "entropy": 1.7527393798033397, + "epoch": 0.1470984043283623, + "grad_norm": 0.741357147693634, + "learning_rate": 1.9911035607481593e-05, + "loss": 1.4087, + "mean_token_accuracy": 0.6580107063055038, + "num_tokens": 223598162.0, + "step": 1339 + }, + { + "entropy": 1.7267196973164876, + "epoch": 0.14720826123973524, + "grad_norm": 0.5621564388275146, + "learning_rate": 1.991081089621708e-05, + "loss": 1.3998, + "mean_token_accuracy": 0.6478342314561208, + "num_tokens": 223756258.0, + "step": 1340 + }, + { + "entropy": 1.741438736518224, + "epoch": 0.14731811815110818, + "grad_norm": 0.7508729100227356, + "learning_rate": 1.991058590292942e-05, + "loss": 1.6231, + "mean_token_accuracy": 0.6208352545897166, + "num_tokens": 223971677.0, + "step": 1341 + }, + { + "entropy": 1.6777463555335999, + "epoch": 0.14742797506248112, + "grad_norm": 0.7933493852615356, + "learning_rate": 1.9910360627625727e-05, + "loss": 1.3107, + "mean_token_accuracy": 0.6797795047362646, + "num_tokens": 224146005.0, + "step": 1342 + }, + { + "entropy": 1.7229306896527607, + "epoch": 0.14753783197385406, + "grad_norm": 0.8197740316390991, + "learning_rate": 1.991013507031314e-05, + "loss": 1.4139, + "mean_token_accuracy": 0.6649947216113409, + "num_tokens": 224323644.0, + "step": 1343 + }, + { + "entropy": 1.748872071504593, + "epoch": 0.147647688885227, + "grad_norm": 0.6963921785354614, + "learning_rate": 1.9909909230998792e-05, + "loss": 1.3859, + "mean_token_accuracy": 0.6595577448606491, + "num_tokens": 224447585.0, + "step": 1344 + }, + { + "entropy": 1.778872420390447, + "epoch": 0.14775754579659992, + "grad_norm": 0.7228877544403076, + "learning_rate": 1.9909683109689832e-05, + "loss": 1.4321, + "mean_token_accuracy": 0.6432789415121078, + "num_tokens": 224622414.0, + "step": 1345 + }, + { + "entropy": 1.7511253654956818, + "epoch": 0.14786740270797286, + "grad_norm": 0.6408666968345642, + "learning_rate": 1.9909456706393412e-05, + "loss": 1.432, + "mean_token_accuracy": 0.6518001953760783, + "num_tokens": 224786040.0, + "step": 1346 + }, + { + "entropy": 1.710090051094691, + "epoch": 0.1479772596193458, + "grad_norm": 0.5858021974563599, + "learning_rate": 1.990923002111671e-05, + "loss": 1.4401, + "mean_token_accuracy": 0.6551679819822311, + "num_tokens": 225004135.0, + "step": 1347 + }, + { + "entropy": 1.7244882980982463, + "epoch": 0.14808711653071874, + "grad_norm": 0.6452533006668091, + "learning_rate": 1.9909003053866884e-05, + "loss": 1.3192, + "mean_token_accuracy": 0.6712134927511215, + "num_tokens": 225190143.0, + "step": 1348 + }, + { + "entropy": 1.7229234476884205, + "epoch": 0.14819697344209168, + "grad_norm": 0.6897783875465393, + "learning_rate": 1.990877580465113e-05, + "loss": 1.3271, + "mean_token_accuracy": 0.6669703423976898, + "num_tokens": 225357614.0, + "step": 1349 + }, + { + "entropy": 1.7231020828088124, + "epoch": 0.14830683035346462, + "grad_norm": 0.7694151997566223, + "learning_rate": 1.9908548273476634e-05, + "loss": 1.3355, + "mean_token_accuracy": 0.6548380752404531, + "num_tokens": 225467576.0, + "step": 1350 + }, + { + "entropy": 1.6866406897703807, + "epoch": 0.14841668726483756, + "grad_norm": 0.6810007095336914, + "learning_rate": 1.9908320460350604e-05, + "loss": 1.3045, + "mean_token_accuracy": 0.681728353103002, + "num_tokens": 225624930.0, + "step": 1351 + }, + { + "entropy": 1.7498182157675426, + "epoch": 0.14852654417621047, + "grad_norm": 0.6989384293556213, + "learning_rate": 1.990809236528024e-05, + "loss": 1.527, + "mean_token_accuracy": 0.6339794049660364, + "num_tokens": 225834299.0, + "step": 1352 + }, + { + "entropy": 1.7136845489343007, + "epoch": 0.14863640108758341, + "grad_norm": 0.8216297626495361, + "learning_rate": 1.990786398827277e-05, + "loss": 1.3575, + "mean_token_accuracy": 0.6578425218661627, + "num_tokens": 226001216.0, + "step": 1353 + }, + { + "entropy": 1.7596096694469452, + "epoch": 0.14874625799895635, + "grad_norm": 0.7053200006484985, + "learning_rate": 1.9907635329335417e-05, + "loss": 1.39, + "mean_token_accuracy": 0.6508052945137024, + "num_tokens": 226164348.0, + "step": 1354 + }, + { + "entropy": 1.7615208824475606, + "epoch": 0.1488561149103293, + "grad_norm": 0.6843299269676208, + "learning_rate": 1.990740638847542e-05, + "loss": 1.443, + "mean_token_accuracy": 0.648722713192304, + "num_tokens": 226324003.0, + "step": 1355 + }, + { + "entropy": 1.73859507838885, + "epoch": 0.14896597182170224, + "grad_norm": 0.7353606820106506, + "learning_rate": 1.9907177165700026e-05, + "loss": 1.4766, + "mean_token_accuracy": 0.6503659536441168, + "num_tokens": 226517283.0, + "step": 1356 + }, + { + "entropy": 1.681709756453832, + "epoch": 0.14907582873307518, + "grad_norm": 0.709335207939148, + "learning_rate": 1.9906947661016488e-05, + "loss": 1.3196, + "mean_token_accuracy": 0.6770564814408621, + "num_tokens": 226653702.0, + "step": 1357 + }, + { + "entropy": 1.7128639618555705, + "epoch": 0.14918568564444812, + "grad_norm": 0.6673030257225037, + "learning_rate": 1.9906717874432068e-05, + "loss": 1.3176, + "mean_token_accuracy": 0.6593418667713801, + "num_tokens": 226783131.0, + "step": 1358 + }, + { + "entropy": 1.6683493653933208, + "epoch": 0.14929554255582103, + "grad_norm": 0.7545623183250427, + "learning_rate": 1.9906487805954046e-05, + "loss": 1.332, + "mean_token_accuracy": 0.6640367060899734, + "num_tokens": 226903165.0, + "step": 1359 + }, + { + "entropy": 1.7147560715675354, + "epoch": 0.14940539946719397, + "grad_norm": 0.6294198632240295, + "learning_rate": 1.9906257455589693e-05, + "loss": 1.3845, + "mean_token_accuracy": 0.6489193687836329, + "num_tokens": 227123559.0, + "step": 1360 + }, + { + "entropy": 1.7493448158105214, + "epoch": 0.1495152563785669, + "grad_norm": 0.7184653878211975, + "learning_rate": 1.9906026823346304e-05, + "loss": 1.3539, + "mean_token_accuracy": 0.6593509018421173, + "num_tokens": 227249956.0, + "step": 1361 + }, + { + "entropy": 1.8085836668809254, + "epoch": 0.14962511328993985, + "grad_norm": 0.751181960105896, + "learning_rate": 1.9905795909231184e-05, + "loss": 1.4853, + "mean_token_accuracy": 0.648095632592837, + "num_tokens": 227436120.0, + "step": 1362 + }, + { + "entropy": 1.8024127682050068, + "epoch": 0.1497349702013128, + "grad_norm": 0.6900661587715149, + "learning_rate": 1.990556471325163e-05, + "loss": 1.477, + "mean_token_accuracy": 0.6429435362418493, + "num_tokens": 227581787.0, + "step": 1363 + }, + { + "entropy": 1.761184275150299, + "epoch": 0.14984482711268574, + "grad_norm": 0.6370431184768677, + "learning_rate": 1.9905333235414974e-05, + "loss": 1.4608, + "mean_token_accuracy": 0.6300752957661947, + "num_tokens": 227798907.0, + "step": 1364 + }, + { + "entropy": 1.729689121246338, + "epoch": 0.14995468402405868, + "grad_norm": 0.6400964856147766, + "learning_rate": 1.990510147572853e-05, + "loss": 1.3613, + "mean_token_accuracy": 0.6598215152819952, + "num_tokens": 227928221.0, + "step": 1365 + }, + { + "entropy": 1.7528914511203766, + "epoch": 0.1500645409354316, + "grad_norm": 0.6673919558525085, + "learning_rate": 1.9904869434199638e-05, + "loss": 1.3918, + "mean_token_accuracy": 0.6615985929965973, + "num_tokens": 228047390.0, + "step": 1366 + }, + { + "entropy": 1.7323197424411774, + "epoch": 0.15017439784680453, + "grad_norm": 0.9341157674789429, + "learning_rate": 1.9904637110835637e-05, + "loss": 1.4983, + "mean_token_accuracy": 0.651843269666036, + "num_tokens": 228240212.0, + "step": 1367 + }, + { + "entropy": 1.7418803771336873, + "epoch": 0.15028425475817747, + "grad_norm": 0.7539012432098389, + "learning_rate": 1.990440450564389e-05, + "loss": 1.3946, + "mean_token_accuracy": 0.652935266494751, + "num_tokens": 228398736.0, + "step": 1368 + }, + { + "entropy": 1.7218880355358124, + "epoch": 0.1503941116695504, + "grad_norm": 0.6365805268287659, + "learning_rate": 1.9904171618631745e-05, + "loss": 1.3038, + "mean_token_accuracy": 0.6718200296163559, + "num_tokens": 228572349.0, + "step": 1369 + }, + { + "entropy": 1.7416918476422627, + "epoch": 0.15050396858092335, + "grad_norm": 0.6741893887519836, + "learning_rate": 1.990393844980659e-05, + "loss": 1.4381, + "mean_token_accuracy": 0.6531198918819427, + "num_tokens": 228733406.0, + "step": 1370 + }, + { + "entropy": 1.7413328488667805, + "epoch": 0.1506138254922963, + "grad_norm": 0.7957093715667725, + "learning_rate": 1.9903704999175787e-05, + "loss": 1.4825, + "mean_token_accuracy": 0.6500815153121948, + "num_tokens": 228933666.0, + "step": 1371 + }, + { + "entropy": 1.7816022833188374, + "epoch": 0.15072368240366923, + "grad_norm": 0.7574257850646973, + "learning_rate": 1.990347126674674e-05, + "loss": 1.4431, + "mean_token_accuracy": 0.6395841191212336, + "num_tokens": 229105787.0, + "step": 1372 + }, + { + "entropy": 1.7355500161647797, + "epoch": 0.15083353931504215, + "grad_norm": 0.7089441418647766, + "learning_rate": 1.9903237252526834e-05, + "loss": 1.3002, + "mean_token_accuracy": 0.665576363603274, + "num_tokens": 229232132.0, + "step": 1373 + }, + { + "entropy": 1.7314164439837139, + "epoch": 0.1509433962264151, + "grad_norm": 0.6388537287712097, + "learning_rate": 1.9903002956523483e-05, + "loss": 1.378, + "mean_token_accuracy": 0.649604876836141, + "num_tokens": 229425376.0, + "step": 1374 + }, + { + "entropy": 1.7354730864365895, + "epoch": 0.15105325313778803, + "grad_norm": 0.6285607218742371, + "learning_rate": 1.99027683787441e-05, + "loss": 1.3263, + "mean_token_accuracy": 0.6639270832141241, + "num_tokens": 229590289.0, + "step": 1375 + }, + { + "entropy": 1.7280907134215038, + "epoch": 0.15116311004916097, + "grad_norm": 0.6440637111663818, + "learning_rate": 1.990253351919611e-05, + "loss": 1.3824, + "mean_token_accuracy": 0.6578231900930405, + "num_tokens": 229803918.0, + "step": 1376 + }, + { + "entropy": 1.7382831076780956, + "epoch": 0.1512729669605339, + "grad_norm": 0.6672256588935852, + "learning_rate": 1.9902298377886946e-05, + "loss": 1.3339, + "mean_token_accuracy": 0.6681515922149023, + "num_tokens": 229969814.0, + "step": 1377 + }, + { + "entropy": 1.7324320872624714, + "epoch": 0.15138282387190685, + "grad_norm": 0.7574326992034912, + "learning_rate": 1.990206295482405e-05, + "loss": 1.4158, + "mean_token_accuracy": 0.6632709354162216, + "num_tokens": 230122719.0, + "step": 1378 + }, + { + "entropy": 1.6780237257480621, + "epoch": 0.15149268078327977, + "grad_norm": 0.6588215231895447, + "learning_rate": 1.990182725001487e-05, + "loss": 1.2503, + "mean_token_accuracy": 0.6718401412169138, + "num_tokens": 230236303.0, + "step": 1379 + }, + { + "entropy": 1.711432198683421, + "epoch": 0.1516025376946527, + "grad_norm": 0.6185994744300842, + "learning_rate": 1.9901591263466872e-05, + "loss": 1.4058, + "mean_token_accuracy": 0.6546655098597208, + "num_tokens": 230396650.0, + "step": 1380 + }, + { + "entropy": 1.7140692472457886, + "epoch": 0.15171239460602565, + "grad_norm": 0.6763261556625366, + "learning_rate": 1.9901354995187517e-05, + "loss": 1.3125, + "mean_token_accuracy": 0.6666051745414734, + "num_tokens": 230512660.0, + "step": 1381 + }, + { + "entropy": 1.7215290268262227, + "epoch": 0.1518222515173986, + "grad_norm": 0.6929253935813904, + "learning_rate": 1.9901118445184292e-05, + "loss": 1.2935, + "mean_token_accuracy": 0.6653975496689478, + "num_tokens": 230644801.0, + "step": 1382 + }, + { + "entropy": 1.744094043970108, + "epoch": 0.15193210842877153, + "grad_norm": 0.67906653881073, + "learning_rate": 1.990088161346468e-05, + "loss": 1.5015, + "mean_token_accuracy": 0.633465126156807, + "num_tokens": 230843687.0, + "step": 1383 + }, + { + "entropy": 1.714278946320216, + "epoch": 0.15204196534014447, + "grad_norm": 0.6904776692390442, + "learning_rate": 1.9900644500036174e-05, + "loss": 1.3465, + "mean_token_accuracy": 0.6527802546819051, + "num_tokens": 231002297.0, + "step": 1384 + }, + { + "entropy": 1.7312476833661397, + "epoch": 0.1521518222515174, + "grad_norm": 0.6808450818061829, + "learning_rate": 1.990040710490628e-05, + "loss": 1.3135, + "mean_token_accuracy": 0.6642954846223196, + "num_tokens": 231185181.0, + "step": 1385 + }, + { + "entropy": 1.7672143479188283, + "epoch": 0.15226167916289032, + "grad_norm": 0.7930114269256592, + "learning_rate": 1.990016942808251e-05, + "loss": 1.3884, + "mean_token_accuracy": 0.6474938144286474, + "num_tokens": 231335247.0, + "step": 1386 + }, + { + "entropy": 1.7301386694113414, + "epoch": 0.15237153607426326, + "grad_norm": 0.7413761615753174, + "learning_rate": 1.989993146957239e-05, + "loss": 1.5081, + "mean_token_accuracy": 0.6514854778846105, + "num_tokens": 231490527.0, + "step": 1387 + }, + { + "entropy": 1.6940825978914897, + "epoch": 0.1524813929856362, + "grad_norm": 0.6954035758972168, + "learning_rate": 1.9899693229383447e-05, + "loss": 1.3801, + "mean_token_accuracy": 0.6533026595910391, + "num_tokens": 231707365.0, + "step": 1388 + }, + { + "entropy": 1.7286064724127452, + "epoch": 0.15259124989700915, + "grad_norm": 0.7006916999816895, + "learning_rate": 1.9899454707523228e-05, + "loss": 1.3657, + "mean_token_accuracy": 0.6559472481409708, + "num_tokens": 231838489.0, + "step": 1389 + }, + { + "entropy": 1.689795325199763, + "epoch": 0.1527011068083821, + "grad_norm": 0.7353735566139221, + "learning_rate": 1.9899215903999272e-05, + "loss": 1.4271, + "mean_token_accuracy": 0.6647296249866486, + "num_tokens": 231989155.0, + "step": 1390 + }, + { + "entropy": 1.8015301525592804, + "epoch": 0.15281096371975503, + "grad_norm": 0.8433383703231812, + "learning_rate": 1.989897681881915e-05, + "loss": 1.4108, + "mean_token_accuracy": 0.659388080239296, + "num_tokens": 232130833.0, + "step": 1391 + }, + { + "entropy": 1.7052685618400574, + "epoch": 0.15292082063112797, + "grad_norm": 0.7113513946533203, + "learning_rate": 1.989873745199042e-05, + "loss": 1.3169, + "mean_token_accuracy": 0.6690341283877691, + "num_tokens": 232299980.0, + "step": 1392 + }, + { + "entropy": 1.7220211327075958, + "epoch": 0.15303067754250088, + "grad_norm": 0.6374592781066895, + "learning_rate": 1.9898497803520652e-05, + "loss": 1.3122, + "mean_token_accuracy": 0.6635673840840658, + "num_tokens": 232454219.0, + "step": 1393 + }, + { + "entropy": 1.6884993215401967, + "epoch": 0.15314053445387382, + "grad_norm": 0.5745070576667786, + "learning_rate": 1.9898257873417445e-05, + "loss": 1.4229, + "mean_token_accuracy": 0.6347446690003077, + "num_tokens": 232668443.0, + "step": 1394 + }, + { + "entropy": 1.7229583462079365, + "epoch": 0.15325039136524676, + "grad_norm": 0.6316061615943909, + "learning_rate": 1.9898017661688384e-05, + "loss": 1.4632, + "mean_token_accuracy": 0.651705930630366, + "num_tokens": 232827806.0, + "step": 1395 + }, + { + "entropy": 1.7420443991820018, + "epoch": 0.1533602482766197, + "grad_norm": 0.8270453810691833, + "learning_rate": 1.9897777168341078e-05, + "loss": 1.3791, + "mean_token_accuracy": 0.6554538011550903, + "num_tokens": 232976688.0, + "step": 1396 + }, + { + "entropy": 1.7458803057670593, + "epoch": 0.15347010518799264, + "grad_norm": 0.725121021270752, + "learning_rate": 1.9897536393383126e-05, + "loss": 1.4105, + "mean_token_accuracy": 0.6579022953907648, + "num_tokens": 233112726.0, + "step": 1397 + }, + { + "entropy": 1.7431099613507588, + "epoch": 0.15357996209936559, + "grad_norm": 0.7982557415962219, + "learning_rate": 1.9897295336822163e-05, + "loss": 1.2854, + "mean_token_accuracy": 0.6716776788234711, + "num_tokens": 233223044.0, + "step": 1398 + }, + { + "entropy": 1.7481550176938374, + "epoch": 0.15368981901073853, + "grad_norm": 0.7132703065872192, + "learning_rate": 1.989705399866581e-05, + "loss": 1.5079, + "mean_token_accuracy": 0.6489944805701574, + "num_tokens": 233380580.0, + "step": 1399 + }, + { + "entropy": 1.7383308410644531, + "epoch": 0.15379967592211144, + "grad_norm": 0.7662017941474915, + "learning_rate": 1.9896812378921705e-05, + "loss": 1.5184, + "mean_token_accuracy": 0.6418096820513407, + "num_tokens": 233544584.0, + "step": 1400 + }, + { + "entropy": 1.7894498109817505, + "epoch": 0.15390953283348438, + "grad_norm": 0.6829231977462769, + "learning_rate": 1.98965704775975e-05, + "loss": 1.4861, + "mean_token_accuracy": 0.6324234555164973, + "num_tokens": 233743461.0, + "step": 1401 + }, + { + "entropy": 1.7661231060822804, + "epoch": 0.15401938974485732, + "grad_norm": 0.758860170841217, + "learning_rate": 1.989632829470085e-05, + "loss": 1.3311, + "mean_token_accuracy": 0.6602408438920975, + "num_tokens": 233886963.0, + "step": 1402 + }, + { + "entropy": 1.8040996094544728, + "epoch": 0.15412924665623026, + "grad_norm": 0.7048920392990112, + "learning_rate": 1.989608583023941e-05, + "loss": 1.4562, + "mean_token_accuracy": 0.6474844366312027, + "num_tokens": 234039711.0, + "step": 1403 + }, + { + "entropy": 1.6952104270458221, + "epoch": 0.1542391035676032, + "grad_norm": 0.6951699256896973, + "learning_rate": 1.989584308422087e-05, + "loss": 1.5166, + "mean_token_accuracy": 0.666146586338679, + "num_tokens": 234184971.0, + "step": 1404 + }, + { + "entropy": 1.7021946410338085, + "epoch": 0.15434896047897614, + "grad_norm": 0.6730145812034607, + "learning_rate": 1.9895600056652904e-05, + "loss": 1.3321, + "mean_token_accuracy": 0.6629207084576288, + "num_tokens": 234305549.0, + "step": 1405 + }, + { + "entropy": 1.7574187914530437, + "epoch": 0.15445881739034906, + "grad_norm": 0.6781946420669556, + "learning_rate": 1.98953567475432e-05, + "loss": 1.3731, + "mean_token_accuracy": 0.6600083112716675, + "num_tokens": 234429812.0, + "step": 1406 + }, + { + "entropy": 1.7185521523157756, + "epoch": 0.154568674301722, + "grad_norm": 0.6366341710090637, + "learning_rate": 1.9895113156899468e-05, + "loss": 1.4108, + "mean_token_accuracy": 0.6553384065628052, + "num_tokens": 234591666.0, + "step": 1407 + }, + { + "entropy": 1.7048729260762532, + "epoch": 0.15467853121309494, + "grad_norm": 0.7763659358024597, + "learning_rate": 1.989486928472941e-05, + "loss": 1.2446, + "mean_token_accuracy": 0.682253509759903, + "num_tokens": 234729738.0, + "step": 1408 + }, + { + "entropy": 1.7599883476893108, + "epoch": 0.15478838812446788, + "grad_norm": 0.6926746368408203, + "learning_rate": 1.9894625131040746e-05, + "loss": 1.3447, + "mean_token_accuracy": 0.658067504564921, + "num_tokens": 234885495.0, + "step": 1409 + }, + { + "entropy": 1.7993106842041016, + "epoch": 0.15489824503584082, + "grad_norm": 0.895706295967102, + "learning_rate": 1.9894380695841207e-05, + "loss": 1.6742, + "mean_token_accuracy": 0.6284699141979218, + "num_tokens": 235065714.0, + "step": 1410 + }, + { + "entropy": 1.7256481846173604, + "epoch": 0.15500810194721376, + "grad_norm": 0.6546118855476379, + "learning_rate": 1.989413597913853e-05, + "loss": 1.3875, + "mean_token_accuracy": 0.6590452939271927, + "num_tokens": 235210267.0, + "step": 1411 + }, + { + "entropy": 1.7416211764017742, + "epoch": 0.1551179588585867, + "grad_norm": 0.79004967212677, + "learning_rate": 1.9893890980940456e-05, + "loss": 1.388, + "mean_token_accuracy": 0.6537879854440689, + "num_tokens": 235364422.0, + "step": 1412 + }, + { + "entropy": 1.7044924398263295, + "epoch": 0.15522781576995962, + "grad_norm": 0.5850828289985657, + "learning_rate": 1.9893645701254737e-05, + "loss": 1.3368, + "mean_token_accuracy": 0.666097084681193, + "num_tokens": 235548464.0, + "step": 1413 + }, + { + "entropy": 1.757189800341924, + "epoch": 0.15533767268133256, + "grad_norm": 0.6978262066841125, + "learning_rate": 1.9893400140089138e-05, + "loss": 1.3671, + "mean_token_accuracy": 0.651861180861791, + "num_tokens": 235702467.0, + "step": 1414 + }, + { + "entropy": 1.7016917367776234, + "epoch": 0.1554475295927055, + "grad_norm": 0.934744656085968, + "learning_rate": 1.9893154297451437e-05, + "loss": 1.4425, + "mean_token_accuracy": 0.6469552119572958, + "num_tokens": 235865582.0, + "step": 1415 + }, + { + "entropy": 1.6989723841349285, + "epoch": 0.15555738650407844, + "grad_norm": 0.7889364361763, + "learning_rate": 1.9892908173349405e-05, + "loss": 1.3199, + "mean_token_accuracy": 0.6727441449960073, + "num_tokens": 235991724.0, + "step": 1416 + }, + { + "entropy": 1.6890127261479695, + "epoch": 0.15566724341545138, + "grad_norm": 0.8580669164657593, + "learning_rate": 1.989266176779084e-05, + "loss": 1.3835, + "mean_token_accuracy": 0.6598442941904068, + "num_tokens": 236149820.0, + "step": 1417 + }, + { + "entropy": 1.765044758717219, + "epoch": 0.15577710032682432, + "grad_norm": 0.7060015797615051, + "learning_rate": 1.9892415080783535e-05, + "loss": 1.397, + "mean_token_accuracy": 0.6596352259318033, + "num_tokens": 236275889.0, + "step": 1418 + }, + { + "entropy": 1.7240539093812306, + "epoch": 0.15588695723819726, + "grad_norm": 0.6542495489120483, + "learning_rate": 1.9892168112335303e-05, + "loss": 1.4078, + "mean_token_accuracy": 0.6570458362499872, + "num_tokens": 236448746.0, + "step": 1419 + }, + { + "entropy": 1.686035692691803, + "epoch": 0.15599681414957017, + "grad_norm": 0.7507334351539612, + "learning_rate": 1.9891920862453954e-05, + "loss": 1.5029, + "mean_token_accuracy": 0.6390158931414286, + "num_tokens": 236653518.0, + "step": 1420 + }, + { + "entropy": 1.751685917377472, + "epoch": 0.15610667106094311, + "grad_norm": 0.7609370946884155, + "learning_rate": 1.9891673331147315e-05, + "loss": 1.3791, + "mean_token_accuracy": 0.6601094206174215, + "num_tokens": 236785623.0, + "step": 1421 + }, + { + "entropy": 1.7528755863507588, + "epoch": 0.15621652797231605, + "grad_norm": 0.6571503281593323, + "learning_rate": 1.9891425518423225e-05, + "loss": 1.329, + "mean_token_accuracy": 0.6659893939892451, + "num_tokens": 236959510.0, + "step": 1422 + }, + { + "entropy": 1.8252331515153248, + "epoch": 0.156326384883689, + "grad_norm": 0.6614378094673157, + "learning_rate": 1.9891177424289524e-05, + "loss": 1.4472, + "mean_token_accuracy": 0.6394238173961639, + "num_tokens": 237176421.0, + "step": 1423 + }, + { + "entropy": 1.7615261673927307, + "epoch": 0.15643624179506194, + "grad_norm": 0.6956297755241394, + "learning_rate": 1.989092904875406e-05, + "loss": 1.5164, + "mean_token_accuracy": 0.6591284970442454, + "num_tokens": 237347871.0, + "step": 1424 + }, + { + "entropy": 1.7877203325430553, + "epoch": 0.15654609870643488, + "grad_norm": 0.6468766331672668, + "learning_rate": 1.9890680391824703e-05, + "loss": 1.5098, + "mean_token_accuracy": 0.6396308938662211, + "num_tokens": 237543259.0, + "step": 1425 + }, + { + "entropy": 1.7254225611686707, + "epoch": 0.15665595561780782, + "grad_norm": 0.6719499826431274, + "learning_rate": 1.9890431453509317e-05, + "loss": 1.5274, + "mean_token_accuracy": 0.6330312093098959, + "num_tokens": 237710891.0, + "step": 1426 + }, + { + "entropy": 1.6991487741470337, + "epoch": 0.15676581252918073, + "grad_norm": 0.7204530835151672, + "learning_rate": 1.9890182233815777e-05, + "loss": 1.2901, + "mean_token_accuracy": 0.6699622919162115, + "num_tokens": 237850728.0, + "step": 1427 + }, + { + "entropy": 1.7319742838541667, + "epoch": 0.15687566944055367, + "grad_norm": 0.7595884203910828, + "learning_rate": 1.988993273275198e-05, + "loss": 1.3026, + "mean_token_accuracy": 0.6677504330873489, + "num_tokens": 238048528.0, + "step": 1428 + }, + { + "entropy": 1.7359434564908345, + "epoch": 0.1569855263519266, + "grad_norm": 0.6204552054405212, + "learning_rate": 1.9889682950325814e-05, + "loss": 1.2607, + "mean_token_accuracy": 0.6708816637595495, + "num_tokens": 238201749.0, + "step": 1429 + }, + { + "entropy": 1.7319020132223766, + "epoch": 0.15709538326329955, + "grad_norm": 0.7870994806289673, + "learning_rate": 1.988943288654519e-05, + "loss": 1.4783, + "mean_token_accuracy": 0.6482026080290476, + "num_tokens": 238388738.0, + "step": 1430 + }, + { + "entropy": 1.7651021579901378, + "epoch": 0.1572052401746725, + "grad_norm": 0.6856090426445007, + "learning_rate": 1.9889182541418025e-05, + "loss": 1.4452, + "mean_token_accuracy": 0.6410435736179352, + "num_tokens": 238573579.0, + "step": 1431 + }, + { + "entropy": 1.7478333910306294, + "epoch": 0.15731509708604544, + "grad_norm": 0.8158244490623474, + "learning_rate": 1.9888931914952233e-05, + "loss": 1.391, + "mean_token_accuracy": 0.6592821230491003, + "num_tokens": 238712138.0, + "step": 1432 + }, + { + "entropy": 1.7397787074247997, + "epoch": 0.15742495399741835, + "grad_norm": 0.7736004590988159, + "learning_rate": 1.9888681007155754e-05, + "loss": 1.3865, + "mean_token_accuracy": 0.6538245578606924, + "num_tokens": 238883329.0, + "step": 1433 + }, + { + "entropy": 1.7427138984203339, + "epoch": 0.1575348109087913, + "grad_norm": 0.6095617413520813, + "learning_rate": 1.9888429818036526e-05, + "loss": 1.456, + "mean_token_accuracy": 0.6447610855102539, + "num_tokens": 239158770.0, + "step": 1434 + }, + { + "entropy": 1.7088470856348674, + "epoch": 0.15764466782016423, + "grad_norm": 0.6134941577911377, + "learning_rate": 1.98881783476025e-05, + "loss": 1.3449, + "mean_token_accuracy": 0.6597955723603567, + "num_tokens": 239303140.0, + "step": 1435 + }, + { + "entropy": 1.731537361939748, + "epoch": 0.15775452473153717, + "grad_norm": 0.6490273475646973, + "learning_rate": 1.988792659586163e-05, + "loss": 1.2962, + "mean_token_accuracy": 0.6675442407528559, + "num_tokens": 239429104.0, + "step": 1436 + }, + { + "entropy": 1.7000961601734161, + "epoch": 0.1578643816429101, + "grad_norm": 0.6515488028526306, + "learning_rate": 1.9887674562821892e-05, + "loss": 1.4389, + "mean_token_accuracy": 0.659287025531133, + "num_tokens": 239630290.0, + "step": 1437 + }, + { + "entropy": 1.6991292238235474, + "epoch": 0.15797423855428305, + "grad_norm": 0.630832850933075, + "learning_rate": 1.9887422248491263e-05, + "loss": 1.3143, + "mean_token_accuracy": 0.6765478601058325, + "num_tokens": 239751335.0, + "step": 1438 + }, + { + "entropy": 1.7824784815311432, + "epoch": 0.158084095465656, + "grad_norm": 0.7180033326148987, + "learning_rate": 1.988716965287772e-05, + "loss": 1.373, + "mean_token_accuracy": 0.6581806441148123, + "num_tokens": 239933924.0, + "step": 1439 + }, + { + "entropy": 1.732376217842102, + "epoch": 0.1581939523770289, + "grad_norm": 0.7710111141204834, + "learning_rate": 1.9886916775989263e-05, + "loss": 1.3055, + "mean_token_accuracy": 0.674410010377566, + "num_tokens": 240093624.0, + "step": 1440 + }, + { + "entropy": 1.6590780516465504, + "epoch": 0.15830380928840185, + "grad_norm": 0.702910840511322, + "learning_rate": 1.988666361783389e-05, + "loss": 1.233, + "mean_token_accuracy": 0.6768196622530619, + "num_tokens": 240232409.0, + "step": 1441 + }, + { + "entropy": 1.7165300846099854, + "epoch": 0.1584136661997748, + "grad_norm": 0.6681031584739685, + "learning_rate": 1.9886410178419624e-05, + "loss": 1.32, + "mean_token_accuracy": 0.6700728883345922, + "num_tokens": 240390864.0, + "step": 1442 + }, + { + "entropy": 1.7243541578451793, + "epoch": 0.15852352311114773, + "grad_norm": 0.6899517178535461, + "learning_rate": 1.9886156457754476e-05, + "loss": 1.2152, + "mean_token_accuracy": 0.6867374628782272, + "num_tokens": 240554611.0, + "step": 1443 + }, + { + "entropy": 1.6928447286287944, + "epoch": 0.15863338002252067, + "grad_norm": 0.698421835899353, + "learning_rate": 1.9885902455846486e-05, + "loss": 1.3928, + "mean_token_accuracy": 0.6612624774376551, + "num_tokens": 240762372.0, + "step": 1444 + }, + { + "entropy": 1.6924518247445424, + "epoch": 0.1587432369338936, + "grad_norm": 0.6793832182884216, + "learning_rate": 1.988564817270368e-05, + "loss": 1.3986, + "mean_token_accuracy": 0.649079958597819, + "num_tokens": 240915721.0, + "step": 1445 + }, + { + "entropy": 1.7608232696851094, + "epoch": 0.15885309384526655, + "grad_norm": 0.7623583078384399, + "learning_rate": 1.988539360833412e-05, + "loss": 1.4448, + "mean_token_accuracy": 0.6457181026538213, + "num_tokens": 241082642.0, + "step": 1446 + }, + { + "entropy": 1.709537297487259, + "epoch": 0.15896295075663947, + "grad_norm": 0.7644019722938538, + "learning_rate": 1.988513876274585e-05, + "loss": 1.5179, + "mean_token_accuracy": 0.6708864470322927, + "num_tokens": 241260001.0, + "step": 1447 + }, + { + "entropy": 1.7161897718906403, + "epoch": 0.1590728076680124, + "grad_norm": 0.7442562580108643, + "learning_rate": 1.9884883635946946e-05, + "loss": 1.4027, + "mean_token_accuracy": 0.6529113352298737, + "num_tokens": 241416125.0, + "step": 1448 + }, + { + "entropy": 1.784531682729721, + "epoch": 0.15918266457938535, + "grad_norm": 0.7441882491111755, + "learning_rate": 1.988462822794548e-05, + "loss": 1.5471, + "mean_token_accuracy": 0.6351829022169113, + "num_tokens": 241538589.0, + "step": 1449 + }, + { + "entropy": 1.8008837799231212, + "epoch": 0.1592925214907583, + "grad_norm": 0.6900771260261536, + "learning_rate": 1.988437253874953e-05, + "loss": 1.3957, + "mean_token_accuracy": 0.6521695852279663, + "num_tokens": 241676766.0, + "step": 1450 + }, + { + "entropy": 1.7292368113994598, + "epoch": 0.15940237840213123, + "grad_norm": 0.7718958854675293, + "learning_rate": 1.9884116568367197e-05, + "loss": 1.3787, + "mean_token_accuracy": 0.6554831564426422, + "num_tokens": 241893683.0, + "step": 1451 + }, + { + "entropy": 1.722067544857661, + "epoch": 0.15951223531350417, + "grad_norm": 0.7179570198059082, + "learning_rate": 1.9883860316806574e-05, + "loss": 1.3341, + "mean_token_accuracy": 0.6695725172758102, + "num_tokens": 242045379.0, + "step": 1452 + }, + { + "entropy": 1.7238109707832336, + "epoch": 0.1596220922248771, + "grad_norm": 0.7079585790634155, + "learning_rate": 1.9883603784075775e-05, + "loss": 1.2653, + "mean_token_accuracy": 0.668131892879804, + "num_tokens": 242161184.0, + "step": 1453 + }, + { + "entropy": 1.7084789176781972, + "epoch": 0.15973194913625002, + "grad_norm": 0.7071990966796875, + "learning_rate": 1.988334697018292e-05, + "loss": 1.2958, + "mean_token_accuracy": 0.6706744233767191, + "num_tokens": 242318729.0, + "step": 1454 + }, + { + "entropy": 1.6312975188096364, + "epoch": 0.15984180604762296, + "grad_norm": 0.6232081055641174, + "learning_rate": 1.9883089875136138e-05, + "loss": 1.524, + "mean_token_accuracy": 0.6439757943153381, + "num_tokens": 242585751.0, + "step": 1455 + }, + { + "entropy": 1.730732500553131, + "epoch": 0.1599516629589959, + "grad_norm": 0.8543137311935425, + "learning_rate": 1.9882832498943565e-05, + "loss": 1.5473, + "mean_token_accuracy": 0.650491843620936, + "num_tokens": 242744635.0, + "step": 1456 + }, + { + "entropy": 1.7197604576746623, + "epoch": 0.16006151987036885, + "grad_norm": 0.6170863509178162, + "learning_rate": 1.9882574841613343e-05, + "loss": 1.3721, + "mean_token_accuracy": 0.6532426675160726, + "num_tokens": 242906122.0, + "step": 1457 + }, + { + "entropy": 1.6996217370033264, + "epoch": 0.1601713767817418, + "grad_norm": 0.6345753073692322, + "learning_rate": 1.988231690315363e-05, + "loss": 1.4325, + "mean_token_accuracy": 0.6547591636578242, + "num_tokens": 243090125.0, + "step": 1458 + }, + { + "entropy": 1.7933961947758992, + "epoch": 0.16028123369311473, + "grad_norm": 0.6274416446685791, + "learning_rate": 1.9882058683572592e-05, + "loss": 1.5511, + "mean_token_accuracy": 0.6325584451357523, + "num_tokens": 243343304.0, + "step": 1459 + }, + { + "entropy": 1.6818243861198425, + "epoch": 0.16039109060448767, + "grad_norm": 0.8472453355789185, + "learning_rate": 1.9881800182878398e-05, + "loss": 1.3233, + "mean_token_accuracy": 0.6590960721174876, + "num_tokens": 243460895.0, + "step": 1460 + }, + { + "entropy": 1.7377244929472606, + "epoch": 0.16050094751586058, + "grad_norm": 0.6753596067428589, + "learning_rate": 1.988154140107923e-05, + "loss": 1.3971, + "mean_token_accuracy": 0.6536543518304825, + "num_tokens": 243660279.0, + "step": 1461 + }, + { + "entropy": 1.7900232076644897, + "epoch": 0.16061080442723352, + "grad_norm": 0.7756820917129517, + "learning_rate": 1.9881282338183277e-05, + "loss": 1.3319, + "mean_token_accuracy": 0.6654796799023946, + "num_tokens": 243799068.0, + "step": 1462 + }, + { + "entropy": 1.659742573897044, + "epoch": 0.16072066133860646, + "grad_norm": 0.6284655332565308, + "learning_rate": 1.9881022994198744e-05, + "loss": 1.4305, + "mean_token_accuracy": 0.65923244257768, + "num_tokens": 243991348.0, + "step": 1463 + }, + { + "entropy": 1.7306521832942963, + "epoch": 0.1608305182499794, + "grad_norm": 0.6783806681632996, + "learning_rate": 1.988076336913383e-05, + "loss": 1.347, + "mean_token_accuracy": 0.6588092744350433, + "num_tokens": 244144588.0, + "step": 1464 + }, + { + "entropy": 1.690244237581889, + "epoch": 0.16094037516135234, + "grad_norm": 0.7397460341453552, + "learning_rate": 1.9880503462996763e-05, + "loss": 1.5079, + "mean_token_accuracy": 0.6375847011804581, + "num_tokens": 244393970.0, + "step": 1465 + }, + { + "entropy": 1.6876225968201954, + "epoch": 0.16105023207272529, + "grad_norm": 0.7748399376869202, + "learning_rate": 1.9880243275795758e-05, + "loss": 1.1538, + "mean_token_accuracy": 0.6858840386072794, + "num_tokens": 244502713.0, + "step": 1466 + }, + { + "entropy": 1.6742197672526042, + "epoch": 0.1611600889840982, + "grad_norm": 0.6609341502189636, + "learning_rate": 1.987998280753906e-05, + "loss": 1.2905, + "mean_token_accuracy": 0.6699156562487284, + "num_tokens": 244680505.0, + "step": 1467 + }, + { + "entropy": 1.8177895247936249, + "epoch": 0.16126994589547114, + "grad_norm": 0.6985216736793518, + "learning_rate": 1.9879722058234903e-05, + "loss": 1.419, + "mean_token_accuracy": 0.6576731552680334, + "num_tokens": 244816173.0, + "step": 1468 + }, + { + "entropy": 1.7414447963237762, + "epoch": 0.16137980280684408, + "grad_norm": 0.7506465911865234, + "learning_rate": 1.9879461027891546e-05, + "loss": 1.3904, + "mean_token_accuracy": 0.6590597579876581, + "num_tokens": 244951775.0, + "step": 1469 + }, + { + "entropy": 1.676265945037206, + "epoch": 0.16148965971821702, + "grad_norm": 0.5871666669845581, + "learning_rate": 1.9879199716517247e-05, + "loss": 1.2981, + "mean_token_accuracy": 0.6679906199375788, + "num_tokens": 245154813.0, + "step": 1470 + }, + { + "entropy": 1.7278977930545807, + "epoch": 0.16159951662958996, + "grad_norm": 0.6722689270973206, + "learning_rate": 1.987893812412028e-05, + "loss": 1.5139, + "mean_token_accuracy": 0.6363293901085854, + "num_tokens": 245349545.0, + "step": 1471 + }, + { + "entropy": 1.6537324488162994, + "epoch": 0.1617093735409629, + "grad_norm": 0.7102640867233276, + "learning_rate": 1.9878676250708922e-05, + "loss": 1.3051, + "mean_token_accuracy": 0.6738253484169642, + "num_tokens": 245523769.0, + "step": 1472 + }, + { + "entropy": 1.7391831477483113, + "epoch": 0.16181923045233584, + "grad_norm": 0.7415077090263367, + "learning_rate": 1.9878414096291462e-05, + "loss": 1.3022, + "mean_token_accuracy": 0.6666351109743118, + "num_tokens": 245682976.0, + "step": 1473 + }, + { + "entropy": 1.7627001007397969, + "epoch": 0.16192908736370876, + "grad_norm": 0.6851019263267517, + "learning_rate": 1.9878151660876195e-05, + "loss": 1.4372, + "mean_token_accuracy": 0.6531008581320444, + "num_tokens": 245828968.0, + "step": 1474 + }, + { + "entropy": 1.7610424359639485, + "epoch": 0.1620389442750817, + "grad_norm": 0.7833350896835327, + "learning_rate": 1.9877888944471432e-05, + "loss": 1.3799, + "mean_token_accuracy": 0.6726367622613907, + "num_tokens": 245963753.0, + "step": 1475 + }, + { + "entropy": 1.7145276069641113, + "epoch": 0.16214880118645464, + "grad_norm": 0.7759976983070374, + "learning_rate": 1.9877625947085478e-05, + "loss": 1.5535, + "mean_token_accuracy": 0.6358107725779215, + "num_tokens": 246109245.0, + "step": 1476 + }, + { + "entropy": 1.717542956272761, + "epoch": 0.16225865809782758, + "grad_norm": 0.6734223961830139, + "learning_rate": 1.987736266872667e-05, + "loss": 1.535, + "mean_token_accuracy": 0.643081416686376, + "num_tokens": 246266085.0, + "step": 1477 + }, + { + "entropy": 1.7120660841464996, + "epoch": 0.16236851500920052, + "grad_norm": 0.6512724161148071, + "learning_rate": 1.987709910940333e-05, + "loss": 1.3746, + "mean_token_accuracy": 0.6628182381391525, + "num_tokens": 246450821.0, + "step": 1478 + }, + { + "entropy": 1.7386384705702465, + "epoch": 0.16247837192057346, + "grad_norm": 1.0366206169128418, + "learning_rate": 1.9876835269123806e-05, + "loss": 1.4757, + "mean_token_accuracy": 0.6452366163333257, + "num_tokens": 246654303.0, + "step": 1479 + }, + { + "entropy": 1.7971782286961873, + "epoch": 0.1625882288319464, + "grad_norm": 0.6871124505996704, + "learning_rate": 1.987657114789644e-05, + "loss": 1.3246, + "mean_token_accuracy": 0.6647598246733347, + "num_tokens": 246766857.0, + "step": 1480 + }, + { + "entropy": 1.7070013185342152, + "epoch": 0.16269808574331932, + "grad_norm": 0.7218469381332397, + "learning_rate": 1.98763067457296e-05, + "loss": 1.4415, + "mean_token_accuracy": 0.6488531132539114, + "num_tokens": 246930724.0, + "step": 1481 + }, + { + "entropy": 1.6882566809654236, + "epoch": 0.16280794265469226, + "grad_norm": 0.6939437389373779, + "learning_rate": 1.9876042062631655e-05, + "loss": 1.3713, + "mean_token_accuracy": 0.6539578934510549, + "num_tokens": 247155211.0, + "step": 1482 + }, + { + "entropy": 1.6750567058722179, + "epoch": 0.1629177995660652, + "grad_norm": 0.5919098258018494, + "learning_rate": 1.9875777098610973e-05, + "loss": 1.3869, + "mean_token_accuracy": 0.6595859378576279, + "num_tokens": 247370725.0, + "step": 1483 + }, + { + "entropy": 1.6641695896784465, + "epoch": 0.16302765647743814, + "grad_norm": 0.6180868148803711, + "learning_rate": 1.9875511853675952e-05, + "loss": 1.5215, + "mean_token_accuracy": 0.6372140099604925, + "num_tokens": 247577638.0, + "step": 1484 + }, + { + "entropy": 1.641531765460968, + "epoch": 0.16313751338881108, + "grad_norm": 0.6315323710441589, + "learning_rate": 1.9875246327834973e-05, + "loss": 1.4135, + "mean_token_accuracy": 0.647536481420199, + "num_tokens": 247794309.0, + "step": 1485 + }, + { + "entropy": 1.7311479051907857, + "epoch": 0.16324737030018402, + "grad_norm": 0.6436353921890259, + "learning_rate": 1.987498052109645e-05, + "loss": 1.3109, + "mean_token_accuracy": 0.6649970014890035, + "num_tokens": 247932675.0, + "step": 1486 + }, + { + "entropy": 1.6989285945892334, + "epoch": 0.16335722721155696, + "grad_norm": 0.7341669201850891, + "learning_rate": 1.9874714433468792e-05, + "loss": 1.3223, + "mean_token_accuracy": 0.6625909308592478, + "num_tokens": 248085847.0, + "step": 1487 + }, + { + "entropy": 1.729597012201945, + "epoch": 0.16346708412292987, + "grad_norm": 0.6755861639976501, + "learning_rate": 1.9874448064960422e-05, + "loss": 1.502, + "mean_token_accuracy": 0.6398074775934219, + "num_tokens": 248277427.0, + "step": 1488 + }, + { + "entropy": 1.6631129284699757, + "epoch": 0.16357694103430281, + "grad_norm": 0.6498894095420837, + "learning_rate": 1.987418141557977e-05, + "loss": 1.2334, + "mean_token_accuracy": 0.6779984682798386, + "num_tokens": 248423875.0, + "step": 1489 + }, + { + "entropy": 1.7226392328739166, + "epoch": 0.16368679794567575, + "grad_norm": 0.5937130451202393, + "learning_rate": 1.9873914485335274e-05, + "loss": 1.4005, + "mean_token_accuracy": 0.6503377507130305, + "num_tokens": 248638269.0, + "step": 1490 + }, + { + "entropy": 1.7222268283367157, + "epoch": 0.1637966548570487, + "grad_norm": 0.7151001691818237, + "learning_rate": 1.9873647274235384e-05, + "loss": 1.4244, + "mean_token_accuracy": 0.6669965138038, + "num_tokens": 248764528.0, + "step": 1491 + }, + { + "entropy": 1.7631146212418873, + "epoch": 0.16390651176842164, + "grad_norm": 0.7749632000923157, + "learning_rate": 1.9873379782288555e-05, + "loss": 1.3909, + "mean_token_accuracy": 0.6470388472080231, + "num_tokens": 248911474.0, + "step": 1492 + }, + { + "entropy": 1.7954902946949005, + "epoch": 0.16401636867979458, + "grad_norm": 0.6423087120056152, + "learning_rate": 1.9873112009503256e-05, + "loss": 1.4206, + "mean_token_accuracy": 0.649329255024592, + "num_tokens": 249077416.0, + "step": 1493 + }, + { + "entropy": 1.6648548245429993, + "epoch": 0.1641262255911675, + "grad_norm": 0.9466790556907654, + "learning_rate": 1.987284395588796e-05, + "loss": 1.4267, + "mean_token_accuracy": 0.6575258076190948, + "num_tokens": 249211722.0, + "step": 1494 + }, + { + "entropy": 1.767043113708496, + "epoch": 0.16423608250254043, + "grad_norm": 0.6665019989013672, + "learning_rate": 1.987257562145115e-05, + "loss": 1.3619, + "mean_token_accuracy": 0.6605860988299052, + "num_tokens": 249442176.0, + "step": 1495 + }, + { + "entropy": 1.7210952043533325, + "epoch": 0.16434593941391337, + "grad_norm": 0.769982099533081, + "learning_rate": 1.987230700620132e-05, + "loss": 1.3586, + "mean_token_accuracy": 0.6576197892427444, + "num_tokens": 249569258.0, + "step": 1496 + }, + { + "entropy": 1.7499938607215881, + "epoch": 0.1644557963252863, + "grad_norm": 0.8687669634819031, + "learning_rate": 1.987203811014697e-05, + "loss": 1.4874, + "mean_token_accuracy": 0.6517137040694555, + "num_tokens": 249739841.0, + "step": 1497 + }, + { + "entropy": 1.7234773536523182, + "epoch": 0.16456565323665925, + "grad_norm": 0.7829402089118958, + "learning_rate": 1.9871768933296616e-05, + "loss": 1.3424, + "mean_token_accuracy": 0.6686098178227743, + "num_tokens": 249887148.0, + "step": 1498 + }, + { + "entropy": 1.7844958702723186, + "epoch": 0.1646755101480322, + "grad_norm": 0.7004623413085938, + "learning_rate": 1.987149947565877e-05, + "loss": 1.5753, + "mean_token_accuracy": 0.6275525540113449, + "num_tokens": 250106989.0, + "step": 1499 + }, + { + "entropy": 1.7767977714538574, + "epoch": 0.16478536705940514, + "grad_norm": 0.8283874988555908, + "learning_rate": 1.9871229737241963e-05, + "loss": 1.4238, + "mean_token_accuracy": 0.6419435540835062, + "num_tokens": 250244200.0, + "step": 1500 + }, + { + "entropy": 1.7311366498470306, + "epoch": 0.16489522397077805, + "grad_norm": 0.6668105125427246, + "learning_rate": 1.9870959718054733e-05, + "loss": 1.4291, + "mean_token_accuracy": 0.6505243728558222, + "num_tokens": 250418231.0, + "step": 1501 + }, + { + "entropy": 1.7808765669663746, + "epoch": 0.165005080882151, + "grad_norm": 0.6634812951087952, + "learning_rate": 1.9870689418105623e-05, + "loss": 1.3597, + "mean_token_accuracy": 0.6567443857590357, + "num_tokens": 250544059.0, + "step": 1502 + }, + { + "entropy": 1.7454225818316143, + "epoch": 0.16511493779352393, + "grad_norm": 0.7223145961761475, + "learning_rate": 1.9870418837403194e-05, + "loss": 1.4386, + "mean_token_accuracy": 0.6378096987803777, + "num_tokens": 250731948.0, + "step": 1503 + }, + { + "entropy": 1.7013043363889058, + "epoch": 0.16522479470489687, + "grad_norm": 0.712053656578064, + "learning_rate": 1.9870147975956004e-05, + "loss": 1.3979, + "mean_token_accuracy": 0.6579237480958303, + "num_tokens": 250889151.0, + "step": 1504 + }, + { + "entropy": 1.692932019631068, + "epoch": 0.1653346516162698, + "grad_norm": 0.7106614112854004, + "learning_rate": 1.9869876833772625e-05, + "loss": 1.5012, + "mean_token_accuracy": 0.6438654214143753, + "num_tokens": 251113024.0, + "step": 1505 + }, + { + "entropy": 1.7009440064430237, + "epoch": 0.16544450852764275, + "grad_norm": 0.7013898491859436, + "learning_rate": 1.9869605410861646e-05, + "loss": 1.2613, + "mean_token_accuracy": 0.6773126920064291, + "num_tokens": 251247569.0, + "step": 1506 + }, + { + "entropy": 1.7212568124135335, + "epoch": 0.1655543654390157, + "grad_norm": 0.6236050724983215, + "learning_rate": 1.986933370723165e-05, + "loss": 1.3193, + "mean_token_accuracy": 0.6535242249568304, + "num_tokens": 251385146.0, + "step": 1507 + }, + { + "entropy": 1.669597287972768, + "epoch": 0.1656642223503886, + "grad_norm": 0.7364504933357239, + "learning_rate": 1.9869061722891235e-05, + "loss": 1.2212, + "mean_token_accuracy": 0.6872139424085617, + "num_tokens": 251535729.0, + "step": 1508 + }, + { + "entropy": 1.7369881371657054, + "epoch": 0.16577407926176155, + "grad_norm": 0.7119384407997131, + "learning_rate": 1.9868789457849018e-05, + "loss": 1.4191, + "mean_token_accuracy": 0.6483266254266103, + "num_tokens": 251685078.0, + "step": 1509 + }, + { + "entropy": 1.7074172000090282, + "epoch": 0.1658839361731345, + "grad_norm": 0.5950348377227783, + "learning_rate": 1.986851691211361e-05, + "loss": 1.3744, + "mean_token_accuracy": 0.6569475283225378, + "num_tokens": 251863638.0, + "step": 1510 + }, + { + "entropy": 1.7139769693215687, + "epoch": 0.16599379308450743, + "grad_norm": 0.7062231302261353, + "learning_rate": 1.986824408569364e-05, + "loss": 1.3153, + "mean_token_accuracy": 0.6584917455911636, + "num_tokens": 252028124.0, + "step": 1511 + }, + { + "entropy": 1.774364709854126, + "epoch": 0.16610364999588037, + "grad_norm": 0.6657449007034302, + "learning_rate": 1.9867970978597738e-05, + "loss": 1.3523, + "mean_token_accuracy": 0.6527550766865412, + "num_tokens": 252189853.0, + "step": 1512 + }, + { + "entropy": 1.645888904730479, + "epoch": 0.1662135069072533, + "grad_norm": 1.011468768119812, + "learning_rate": 1.9867697590834552e-05, + "loss": 1.4475, + "mean_token_accuracy": 0.6760109663009644, + "num_tokens": 252391083.0, + "step": 1513 + }, + { + "entropy": 1.7124705612659454, + "epoch": 0.16632336381862625, + "grad_norm": 0.866172194480896, + "learning_rate": 1.9867423922412732e-05, + "loss": 1.3624, + "mean_token_accuracy": 0.6526060750087103, + "num_tokens": 252555620.0, + "step": 1514 + }, + { + "entropy": 1.7539417843023937, + "epoch": 0.16643322072999917, + "grad_norm": 0.7852609753608704, + "learning_rate": 1.986714997334094e-05, + "loss": 1.3325, + "mean_token_accuracy": 0.6687429994344711, + "num_tokens": 252705040.0, + "step": 1515 + }, + { + "entropy": 1.7840530971686046, + "epoch": 0.1665430776413721, + "grad_norm": 0.7107824087142944, + "learning_rate": 1.9866875743627845e-05, + "loss": 1.5625, + "mean_token_accuracy": 0.6361605624357859, + "num_tokens": 252910429.0, + "step": 1516 + }, + { + "entropy": 1.6945801079273224, + "epoch": 0.16665293455274505, + "grad_norm": 0.7252150177955627, + "learning_rate": 1.9866601233282133e-05, + "loss": 1.3175, + "mean_token_accuracy": 0.6691045463085175, + "num_tokens": 253035185.0, + "step": 1517 + }, + { + "entropy": 1.738978087902069, + "epoch": 0.166762791464118, + "grad_norm": 1.032089352607727, + "learning_rate": 1.9866326442312485e-05, + "loss": 1.574, + "mean_token_accuracy": 0.6497288842995962, + "num_tokens": 253196871.0, + "step": 1518 + }, + { + "entropy": 1.7240648766358693, + "epoch": 0.16687264837549093, + "grad_norm": 0.6749817132949829, + "learning_rate": 1.9866051370727604e-05, + "loss": 1.3699, + "mean_token_accuracy": 0.6552683015664419, + "num_tokens": 253352031.0, + "step": 1519 + }, + { + "entropy": 1.7319549322128296, + "epoch": 0.16698250528686387, + "grad_norm": 0.6335762143135071, + "learning_rate": 1.9865776018536188e-05, + "loss": 1.4646, + "mean_token_accuracy": 0.6430019934972128, + "num_tokens": 253511981.0, + "step": 1520 + }, + { + "entropy": 1.7338798642158508, + "epoch": 0.1670923621982368, + "grad_norm": 0.6900236010551453, + "learning_rate": 1.9865500385746954e-05, + "loss": 1.3692, + "mean_token_accuracy": 0.6590522130330404, + "num_tokens": 253663727.0, + "step": 1521 + }, + { + "entropy": 1.7445284326871235, + "epoch": 0.16720221910960972, + "grad_norm": 0.7063678503036499, + "learning_rate": 1.9865224472368634e-05, + "loss": 1.5004, + "mean_token_accuracy": 0.6508728663126627, + "num_tokens": 253860814.0, + "step": 1522 + }, + { + "entropy": 1.7685929238796234, + "epoch": 0.16731207602098266, + "grad_norm": 0.7194231152534485, + "learning_rate": 1.986494827840995e-05, + "loss": 1.3029, + "mean_token_accuracy": 0.6638128211100897, + "num_tokens": 253980941.0, + "step": 1523 + }, + { + "entropy": 1.750822017590205, + "epoch": 0.1674219329323556, + "grad_norm": 0.6481338143348694, + "learning_rate": 1.9864671803879648e-05, + "loss": 1.3924, + "mean_token_accuracy": 0.6541791011889776, + "num_tokens": 254148839.0, + "step": 1524 + }, + { + "entropy": 1.6678146918614705, + "epoch": 0.16753178984372855, + "grad_norm": 0.9855983257293701, + "learning_rate": 1.9864395048786477e-05, + "loss": 1.6043, + "mean_token_accuracy": 0.6395136813322703, + "num_tokens": 254354163.0, + "step": 1525 + }, + { + "entropy": 1.718589961528778, + "epoch": 0.1676416467551015, + "grad_norm": 0.8905704617500305, + "learning_rate": 1.98641180131392e-05, + "loss": 1.2566, + "mean_token_accuracy": 0.6782409648100535, + "num_tokens": 254504599.0, + "step": 1526 + }, + { + "entropy": 1.7314582268397014, + "epoch": 0.16775150366647443, + "grad_norm": 0.6675595641136169, + "learning_rate": 1.986384069694658e-05, + "loss": 1.4248, + "mean_token_accuracy": 0.660760889450709, + "num_tokens": 254671668.0, + "step": 1527 + }, + { + "entropy": 1.819986879825592, + "epoch": 0.16786136057784734, + "grad_norm": 0.7095764875411987, + "learning_rate": 1.9863563100217397e-05, + "loss": 1.5101, + "mean_token_accuracy": 0.6234359592199326, + "num_tokens": 254837716.0, + "step": 1528 + }, + { + "entropy": 1.6930086314678192, + "epoch": 0.16797121748922028, + "grad_norm": 0.7223114371299744, + "learning_rate": 1.9863285222960436e-05, + "loss": 1.3947, + "mean_token_accuracy": 0.6587740182876587, + "num_tokens": 255038553.0, + "step": 1529 + }, + { + "entropy": 1.7511506875356038, + "epoch": 0.16808107440059322, + "grad_norm": 0.6249548196792603, + "learning_rate": 1.986300706518449e-05, + "loss": 1.4064, + "mean_token_accuracy": 0.6461884180704752, + "num_tokens": 255246012.0, + "step": 1530 + }, + { + "entropy": 1.7595790127913158, + "epoch": 0.16819093131196616, + "grad_norm": 0.8911905288696289, + "learning_rate": 1.9862728626898363e-05, + "loss": 1.3936, + "mean_token_accuracy": 0.6536350101232529, + "num_tokens": 255406852.0, + "step": 1531 + }, + { + "entropy": 1.7233379284540813, + "epoch": 0.1683007882233391, + "grad_norm": 0.6526165008544922, + "learning_rate": 1.9862449908110876e-05, + "loss": 1.5453, + "mean_token_accuracy": 0.6314892421166102, + "num_tokens": 255600884.0, + "step": 1532 + }, + { + "entropy": 1.7559981842835743, + "epoch": 0.16841064513471204, + "grad_norm": 0.7608028054237366, + "learning_rate": 1.9862170908830837e-05, + "loss": 1.5887, + "mean_token_accuracy": 0.6392476956049601, + "num_tokens": 255762853.0, + "step": 1533 + }, + { + "entropy": 1.7382706304391224, + "epoch": 0.16852050204608499, + "grad_norm": 0.651728630065918, + "learning_rate": 1.986189162906708e-05, + "loss": 1.5546, + "mean_token_accuracy": 0.6259034971396128, + "num_tokens": 255989620.0, + "step": 1534 + }, + { + "entropy": 1.7760844230651855, + "epoch": 0.1686303589574579, + "grad_norm": 0.8464280366897583, + "learning_rate": 1.986161206882845e-05, + "loss": 1.309, + "mean_token_accuracy": 0.6692384978135427, + "num_tokens": 256139502.0, + "step": 1535 + }, + { + "entropy": 1.7130251824855804, + "epoch": 0.16874021586883084, + "grad_norm": 0.6570628881454468, + "learning_rate": 1.986133222812379e-05, + "loss": 1.3684, + "mean_token_accuracy": 0.6575349122285843, + "num_tokens": 256303217.0, + "step": 1536 + }, + { + "entropy": 1.746040016412735, + "epoch": 0.16885007278020378, + "grad_norm": 0.7733869552612305, + "learning_rate": 1.986105210696196e-05, + "loss": 1.4488, + "mean_token_accuracy": 0.6521646479765574, + "num_tokens": 256480152.0, + "step": 1537 + }, + { + "entropy": 1.7764933109283447, + "epoch": 0.16895992969157672, + "grad_norm": 0.8233133554458618, + "learning_rate": 1.9860771705351822e-05, + "loss": 1.453, + "mean_token_accuracy": 0.6571964025497437, + "num_tokens": 256621819.0, + "step": 1538 + }, + { + "entropy": 1.70499520500501, + "epoch": 0.16906978660294966, + "grad_norm": 0.7217221260070801, + "learning_rate": 1.9860491023302252e-05, + "loss": 1.4539, + "mean_token_accuracy": 0.6460290650526682, + "num_tokens": 256864065.0, + "step": 1539 + }, + { + "entropy": 1.6891511678695679, + "epoch": 0.1691796435143226, + "grad_norm": 0.7945336103439331, + "learning_rate": 1.9860210060822137e-05, + "loss": 1.4004, + "mean_token_accuracy": 0.6728538970152537, + "num_tokens": 257076992.0, + "step": 1540 + }, + { + "entropy": 1.7396195034186046, + "epoch": 0.16928950042569554, + "grad_norm": 0.7375956773757935, + "learning_rate": 1.9859928817920363e-05, + "loss": 1.3562, + "mean_token_accuracy": 0.6567103415727615, + "num_tokens": 257245992.0, + "step": 1541 + }, + { + "entropy": 1.7729649543762207, + "epoch": 0.16939935733706846, + "grad_norm": 0.5919457674026489, + "learning_rate": 1.9859647294605832e-05, + "loss": 1.3707, + "mean_token_accuracy": 0.6635322074095408, + "num_tokens": 257484248.0, + "step": 1542 + }, + { + "entropy": 1.7026795248190563, + "epoch": 0.1695092142484414, + "grad_norm": 0.7219969630241394, + "learning_rate": 1.985936549088746e-05, + "loss": 1.3216, + "mean_token_accuracy": 0.6651143580675125, + "num_tokens": 257612083.0, + "step": 1543 + }, + { + "entropy": 1.6986474494139354, + "epoch": 0.16961907115981434, + "grad_norm": 0.6384711861610413, + "learning_rate": 1.985908340677416e-05, + "loss": 1.2761, + "mean_token_accuracy": 0.6821579784154892, + "num_tokens": 257801207.0, + "step": 1544 + }, + { + "entropy": 1.740968902905782, + "epoch": 0.16972892807118728, + "grad_norm": 1.006049633026123, + "learning_rate": 1.9858801042274865e-05, + "loss": 1.6709, + "mean_token_accuracy": 0.6359262764453888, + "num_tokens": 257943835.0, + "step": 1545 + }, + { + "entropy": 1.761252890030543, + "epoch": 0.16983878498256022, + "grad_norm": 0.678752601146698, + "learning_rate": 1.9858518397398506e-05, + "loss": 1.3871, + "mean_token_accuracy": 0.6560403803984324, + "num_tokens": 258112452.0, + "step": 1546 + }, + { + "entropy": 1.7248517572879791, + "epoch": 0.16994864189393316, + "grad_norm": 0.6440653204917908, + "learning_rate": 1.9858235472154035e-05, + "loss": 1.3485, + "mean_token_accuracy": 0.6657489885886511, + "num_tokens": 258272795.0, + "step": 1547 + }, + { + "entropy": 1.7436510026454926, + "epoch": 0.1700584988053061, + "grad_norm": 0.6682828068733215, + "learning_rate": 1.98579522665504e-05, + "loss": 1.2871, + "mean_token_accuracy": 0.6653337130943934, + "num_tokens": 258421193.0, + "step": 1548 + }, + { + "entropy": 1.768191655476888, + "epoch": 0.17016835571667902, + "grad_norm": 0.6639819145202637, + "learning_rate": 1.9857668780596566e-05, + "loss": 1.3542, + "mean_token_accuracy": 0.6570809185504913, + "num_tokens": 258586838.0, + "step": 1549 + }, + { + "entropy": 1.7020961840947468, + "epoch": 0.17027821262805196, + "grad_norm": 0.713425874710083, + "learning_rate": 1.985738501430151e-05, + "loss": 1.4274, + "mean_token_accuracy": 0.6487694978713989, + "num_tokens": 258785508.0, + "step": 1550 + }, + { + "entropy": 1.7292237877845764, + "epoch": 0.1703880695394249, + "grad_norm": 0.6867907047271729, + "learning_rate": 1.9857100967674207e-05, + "loss": 1.4103, + "mean_token_accuracy": 0.646657998363177, + "num_tokens": 258951979.0, + "step": 1551 + }, + { + "entropy": 1.7403623362382252, + "epoch": 0.17049792645079784, + "grad_norm": 0.6921045780181885, + "learning_rate": 1.985681664072365e-05, + "loss": 1.4913, + "mean_token_accuracy": 0.6297862927118937, + "num_tokens": 259176216.0, + "step": 1552 + }, + { + "entropy": 1.7630130350589752, + "epoch": 0.17060778336217078, + "grad_norm": 0.6721217632293701, + "learning_rate": 1.9856532033458838e-05, + "loss": 1.349, + "mean_token_accuracy": 0.6710087656974792, + "num_tokens": 259352224.0, + "step": 1553 + }, + { + "entropy": 1.8647314310073853, + "epoch": 0.17071764027354372, + "grad_norm": 0.8853358626365662, + "learning_rate": 1.985624714588878e-05, + "loss": 1.4459, + "mean_token_accuracy": 0.6719841261704763, + "num_tokens": 259450105.0, + "step": 1554 + }, + { + "entropy": 1.7859836916128795, + "epoch": 0.17082749718491663, + "grad_norm": 0.7029029726982117, + "learning_rate": 1.9855961978022487e-05, + "loss": 1.3664, + "mean_token_accuracy": 0.6556002298990885, + "num_tokens": 259563607.0, + "step": 1555 + }, + { + "entropy": 1.7396026750405629, + "epoch": 0.17093735409628957, + "grad_norm": 0.9122359156608582, + "learning_rate": 1.9855676529868987e-05, + "loss": 1.4244, + "mean_token_accuracy": 0.6562978277603785, + "num_tokens": 259724551.0, + "step": 1556 + }, + { + "entropy": 1.6989874243736267, + "epoch": 0.17104721100766251, + "grad_norm": 0.7166799306869507, + "learning_rate": 1.985539080143732e-05, + "loss": 1.4114, + "mean_token_accuracy": 0.6575746287902197, + "num_tokens": 259892519.0, + "step": 1557 + }, + { + "entropy": 1.763812651236852, + "epoch": 0.17115706791903545, + "grad_norm": 0.8969507217407227, + "learning_rate": 1.9855104792736523e-05, + "loss": 1.2682, + "mean_token_accuracy": 0.6701969256003698, + "num_tokens": 260020525.0, + "step": 1558 + }, + { + "entropy": 1.6851574281851451, + "epoch": 0.1712669248304084, + "grad_norm": 0.6716583967208862, + "learning_rate": 1.985481850377565e-05, + "loss": 1.2937, + "mean_token_accuracy": 0.6693290968736013, + "num_tokens": 260173052.0, + "step": 1559 + }, + { + "entropy": 1.7508944670359294, + "epoch": 0.17137678174178134, + "grad_norm": 0.7718809247016907, + "learning_rate": 1.9854531934563756e-05, + "loss": 1.3227, + "mean_token_accuracy": 0.6782904316981634, + "num_tokens": 260302029.0, + "step": 1560 + }, + { + "entropy": 1.725020448366801, + "epoch": 0.17148663865315428, + "grad_norm": 0.6612850427627563, + "learning_rate": 1.985424508510992e-05, + "loss": 1.5275, + "mean_token_accuracy": 0.6432696729898453, + "num_tokens": 260518768.0, + "step": 1561 + }, + { + "entropy": 1.706325650215149, + "epoch": 0.1715964955645272, + "grad_norm": 0.6583466529846191, + "learning_rate": 1.985395795542322e-05, + "loss": 1.382, + "mean_token_accuracy": 0.6552939414978027, + "num_tokens": 260747412.0, + "step": 1562 + }, + { + "entropy": 1.7528244455655415, + "epoch": 0.17170635247590013, + "grad_norm": 0.6246992349624634, + "learning_rate": 1.985367054551274e-05, + "loss": 1.4976, + "mean_token_accuracy": 0.6324788878361384, + "num_tokens": 260942417.0, + "step": 1563 + }, + { + "entropy": 1.658085564772288, + "epoch": 0.17181620938727307, + "grad_norm": 0.8413445353507996, + "learning_rate": 1.985338285538757e-05, + "loss": 1.2427, + "mean_token_accuracy": 0.6698757459719976, + "num_tokens": 261082216.0, + "step": 1564 + }, + { + "entropy": 1.7026270429293315, + "epoch": 0.171926066298646, + "grad_norm": 0.6878015995025635, + "learning_rate": 1.9853094885056824e-05, + "loss": 1.3236, + "mean_token_accuracy": 0.6607059886058172, + "num_tokens": 261265341.0, + "step": 1565 + }, + { + "entropy": 1.7412429749965668, + "epoch": 0.17203592321001895, + "grad_norm": 0.6753911375999451, + "learning_rate": 1.9852806634529617e-05, + "loss": 1.5171, + "mean_token_accuracy": 0.645161176721255, + "num_tokens": 261451694.0, + "step": 1566 + }, + { + "entropy": 1.7042547861735027, + "epoch": 0.1721457801213919, + "grad_norm": 0.6290127635002136, + "learning_rate": 1.985251810381507e-05, + "loss": 1.3632, + "mean_token_accuracy": 0.6557406087716421, + "num_tokens": 261608260.0, + "step": 1567 + }, + { + "entropy": 1.707568456729253, + "epoch": 0.17225563703276484, + "grad_norm": 0.7598758935928345, + "learning_rate": 1.985222929292231e-05, + "loss": 1.5189, + "mean_token_accuracy": 0.643217921257019, + "num_tokens": 261830010.0, + "step": 1568 + }, + { + "entropy": 1.6917735834916432, + "epoch": 0.17236549394413775, + "grad_norm": 0.7184498906135559, + "learning_rate": 1.9851940201860486e-05, + "loss": 1.3412, + "mean_token_accuracy": 0.656810333331426, + "num_tokens": 261946266.0, + "step": 1569 + }, + { + "entropy": 1.723334978024165, + "epoch": 0.1724753508555107, + "grad_norm": 0.5953544974327087, + "learning_rate": 1.985165083063874e-05, + "loss": 1.437, + "mean_token_accuracy": 0.638761967420578, + "num_tokens": 262116192.0, + "step": 1570 + }, + { + "entropy": 1.7972069382667542, + "epoch": 0.17258520776688363, + "grad_norm": 0.8096028566360474, + "learning_rate": 1.985136117926624e-05, + "loss": 1.3684, + "mean_token_accuracy": 0.6510176906983057, + "num_tokens": 262310188.0, + "step": 1571 + }, + { + "entropy": 1.7401937345663707, + "epoch": 0.17269506467825657, + "grad_norm": 0.699908435344696, + "learning_rate": 1.9851071247752144e-05, + "loss": 1.512, + "mean_token_accuracy": 0.6426756829023361, + "num_tokens": 262485269.0, + "step": 1572 + }, + { + "entropy": 1.739349255959193, + "epoch": 0.1728049215896295, + "grad_norm": 0.7645807266235352, + "learning_rate": 1.9850781036105628e-05, + "loss": 1.3933, + "mean_token_accuracy": 0.6666643818219503, + "num_tokens": 262634754.0, + "step": 1573 + }, + { + "entropy": 1.7648814817269642, + "epoch": 0.17291477850100245, + "grad_norm": 0.5633745789527893, + "learning_rate": 1.9850490544335883e-05, + "loss": 1.4968, + "mean_token_accuracy": 0.6367639104525248, + "num_tokens": 262848023.0, + "step": 1574 + }, + { + "entropy": 1.7256119847297668, + "epoch": 0.1730246354123754, + "grad_norm": 0.7680408358573914, + "learning_rate": 1.9850199772452102e-05, + "loss": 1.2459, + "mean_token_accuracy": 0.6759609977404276, + "num_tokens": 262952079.0, + "step": 1575 + }, + { + "entropy": 1.7688967287540436, + "epoch": 0.1731344923237483, + "grad_norm": 0.7607701420783997, + "learning_rate": 1.9849908720463483e-05, + "loss": 1.686, + "mean_token_accuracy": 0.6304403940836588, + "num_tokens": 263123172.0, + "step": 1576 + }, + { + "entropy": 1.7065779368082683, + "epoch": 0.17324434923512125, + "grad_norm": 0.7314710021018982, + "learning_rate": 1.9849617388379243e-05, + "loss": 1.3961, + "mean_token_accuracy": 0.662748172879219, + "num_tokens": 263287696.0, + "step": 1577 + }, + { + "entropy": 1.7852273086706798, + "epoch": 0.1733542061464942, + "grad_norm": 0.7075253129005432, + "learning_rate": 1.9849325776208597e-05, + "loss": 1.5109, + "mean_token_accuracy": 0.6389070451259613, + "num_tokens": 263463542.0, + "step": 1578 + }, + { + "entropy": 1.759129822254181, + "epoch": 0.17346406305786713, + "grad_norm": 0.6782553791999817, + "learning_rate": 1.984903388396078e-05, + "loss": 1.3754, + "mean_token_accuracy": 0.66612375775973, + "num_tokens": 263611324.0, + "step": 1579 + }, + { + "entropy": 1.7275327742099762, + "epoch": 0.17357391996924007, + "grad_norm": 0.6759166121482849, + "learning_rate": 1.984874171164503e-05, + "loss": 1.4564, + "mean_token_accuracy": 0.6484881341457367, + "num_tokens": 263803481.0, + "step": 1580 + }, + { + "entropy": 1.7481872042020161, + "epoch": 0.173683776880613, + "grad_norm": 0.7830897569656372, + "learning_rate": 1.9848449259270594e-05, + "loss": 1.4403, + "mean_token_accuracy": 0.6442924290895462, + "num_tokens": 263980547.0, + "step": 1581 + }, + { + "entropy": 1.6898111701011658, + "epoch": 0.17379363379198595, + "grad_norm": 0.7016710638999939, + "learning_rate": 1.984815652684672e-05, + "loss": 1.2634, + "mean_token_accuracy": 0.677551324168841, + "num_tokens": 264100476.0, + "step": 1582 + }, + { + "entropy": 1.756488412618637, + "epoch": 0.17390349070335887, + "grad_norm": 0.725829005241394, + "learning_rate": 1.9847863514382684e-05, + "loss": 1.3736, + "mean_token_accuracy": 0.6665743341048559, + "num_tokens": 264267717.0, + "step": 1583 + }, + { + "entropy": 1.7239744464556377, + "epoch": 0.1740133476147318, + "grad_norm": 0.7309584617614746, + "learning_rate": 1.9847570221887752e-05, + "loss": 1.4512, + "mean_token_accuracy": 0.6631620625654856, + "num_tokens": 264435736.0, + "step": 1584 + }, + { + "entropy": 1.754454771677653, + "epoch": 0.17412320452610475, + "grad_norm": 0.7535955905914307, + "learning_rate": 1.984727664937121e-05, + "loss": 1.3803, + "mean_token_accuracy": 0.6569480895996094, + "num_tokens": 264575234.0, + "step": 1585 + }, + { + "entropy": 1.7318914433320363, + "epoch": 0.1742330614374777, + "grad_norm": 0.7644667625427246, + "learning_rate": 1.9846982796842348e-05, + "loss": 1.3139, + "mean_token_accuracy": 0.6594865024089813, + "num_tokens": 264726884.0, + "step": 1586 + }, + { + "entropy": 1.716256360212962, + "epoch": 0.17434291834885063, + "grad_norm": 0.9378094673156738, + "learning_rate": 1.9846688664310466e-05, + "loss": 1.3089, + "mean_token_accuracy": 0.6705767214298248, + "num_tokens": 264888577.0, + "step": 1587 + }, + { + "entropy": 1.7499485909938812, + "epoch": 0.17445277526022357, + "grad_norm": 0.6963350772857666, + "learning_rate": 1.9846394251784878e-05, + "loss": 1.3224, + "mean_token_accuracy": 0.6636629452308019, + "num_tokens": 265017805.0, + "step": 1588 + }, + { + "entropy": 1.6987595359484355, + "epoch": 0.17456263217159648, + "grad_norm": 0.549950122833252, + "learning_rate": 1.9846099559274896e-05, + "loss": 1.445, + "mean_token_accuracy": 0.6353604396184286, + "num_tokens": 265263719.0, + "step": 1589 + }, + { + "entropy": 1.663997044165929, + "epoch": 0.17467248908296942, + "grad_norm": 0.6349430680274963, + "learning_rate": 1.9845804586789846e-05, + "loss": 1.497, + "mean_token_accuracy": 0.6526202807823817, + "num_tokens": 265452023.0, + "step": 1590 + }, + { + "entropy": 1.7305479149023693, + "epoch": 0.17478234599434236, + "grad_norm": 0.641980767250061, + "learning_rate": 1.984550933433907e-05, + "loss": 1.5094, + "mean_token_accuracy": 0.6398686319589615, + "num_tokens": 265639435.0, + "step": 1591 + }, + { + "entropy": 1.7175764441490173, + "epoch": 0.1748922029057153, + "grad_norm": 0.6882075071334839, + "learning_rate": 1.9845213801931912e-05, + "loss": 1.5512, + "mean_token_accuracy": 0.6555089851220449, + "num_tokens": 265805955.0, + "step": 1592 + }, + { + "entropy": 1.7594363292058308, + "epoch": 0.17500205981708825, + "grad_norm": 0.6819611191749573, + "learning_rate": 1.984491798957772e-05, + "loss": 1.6119, + "mean_token_accuracy": 0.623213991522789, + "num_tokens": 265979623.0, + "step": 1593 + }, + { + "entropy": 1.6993054151535034, + "epoch": 0.1751119167284612, + "grad_norm": 0.6733065843582153, + "learning_rate": 1.9844621897285857e-05, + "loss": 1.3148, + "mean_token_accuracy": 0.6543001731236776, + "num_tokens": 266115341.0, + "step": 1594 + }, + { + "entropy": 1.633053998152415, + "epoch": 0.17522177363983413, + "grad_norm": 0.7140861749649048, + "learning_rate": 1.9844325525065703e-05, + "loss": 1.3898, + "mean_token_accuracy": 0.6798456112543741, + "num_tokens": 266245546.0, + "step": 1595 + }, + { + "entropy": 1.7286945780118306, + "epoch": 0.17533163055120704, + "grad_norm": 0.7508774399757385, + "learning_rate": 1.9844028872926624e-05, + "loss": 1.5096, + "mean_token_accuracy": 0.6422994434833527, + "num_tokens": 266441953.0, + "step": 1596 + }, + { + "entropy": 1.6760883927345276, + "epoch": 0.17544148746257998, + "grad_norm": 0.6171284317970276, + "learning_rate": 1.984373194087802e-05, + "loss": 1.3983, + "mean_token_accuracy": 0.6639473040898641, + "num_tokens": 266596780.0, + "step": 1597 + }, + { + "entropy": 1.6912609040737152, + "epoch": 0.17555134437395292, + "grad_norm": 0.6284732818603516, + "learning_rate": 1.9843434728929287e-05, + "loss": 1.2327, + "mean_token_accuracy": 0.6864048341910044, + "num_tokens": 266738616.0, + "step": 1598 + }, + { + "entropy": 1.6915510594844818, + "epoch": 0.17566120128532586, + "grad_norm": 0.6584773063659668, + "learning_rate": 1.9843137237089825e-05, + "loss": 1.3557, + "mean_token_accuracy": 0.6557959119478861, + "num_tokens": 266897766.0, + "step": 1599 + }, + { + "entropy": 1.7651262879371643, + "epoch": 0.1757710581966988, + "grad_norm": 0.8828444480895996, + "learning_rate": 1.984283946536906e-05, + "loss": 1.4619, + "mean_token_accuracy": 0.6481021742026011, + "num_tokens": 267065155.0, + "step": 1600 + }, + { + "entropy": 1.8013077477614086, + "epoch": 0.17588091510807174, + "grad_norm": 0.685353696346283, + "learning_rate": 1.9842541413776405e-05, + "loss": 1.3999, + "mean_token_accuracy": 0.6464814196030298, + "num_tokens": 267229641.0, + "step": 1601 + }, + { + "entropy": 1.8050518830617268, + "epoch": 0.17599077201944469, + "grad_norm": 0.8125794529914856, + "learning_rate": 1.98422430823213e-05, + "loss": 1.5728, + "mean_token_accuracy": 0.6321427176396052, + "num_tokens": 267378156.0, + "step": 1602 + }, + { + "entropy": 1.651296724875768, + "epoch": 0.1761006289308176, + "grad_norm": 0.6428789496421814, + "learning_rate": 1.984194447101319e-05, + "loss": 1.3641, + "mean_token_accuracy": 0.6623906741539637, + "num_tokens": 267590605.0, + "step": 1603 + }, + { + "entropy": 1.7778548002243042, + "epoch": 0.17621048584219054, + "grad_norm": 0.6255530118942261, + "learning_rate": 1.984164557986152e-05, + "loss": 1.4539, + "mean_token_accuracy": 0.6396249979734421, + "num_tokens": 267756215.0, + "step": 1604 + }, + { + "entropy": 1.6877602239449818, + "epoch": 0.17632034275356348, + "grad_norm": 0.7188239693641663, + "learning_rate": 1.984134640887575e-05, + "loss": 1.4997, + "mean_token_accuracy": 0.6404122064510981, + "num_tokens": 268034363.0, + "step": 1605 + }, + { + "entropy": 1.760389010111491, + "epoch": 0.17643019966493642, + "grad_norm": 0.758726954460144, + "learning_rate": 1.984104695806535e-05, + "loss": 1.3262, + "mean_token_accuracy": 0.6760849605003992, + "num_tokens": 268157355.0, + "step": 1606 + }, + { + "entropy": 1.6769930223623912, + "epoch": 0.17654005657630936, + "grad_norm": 0.6320821642875671, + "learning_rate": 1.98407472274398e-05, + "loss": 1.4481, + "mean_token_accuracy": 0.6526689926783243, + "num_tokens": 268370854.0, + "step": 1607 + }, + { + "entropy": 1.7059557735919952, + "epoch": 0.1766499134876823, + "grad_norm": 0.5866220593452454, + "learning_rate": 1.9840447217008583e-05, + "loss": 1.4575, + "mean_token_accuracy": 0.6406320333480835, + "num_tokens": 268603959.0, + "step": 1608 + }, + { + "entropy": 1.6889616250991821, + "epoch": 0.17675977039905524, + "grad_norm": 0.802364706993103, + "learning_rate": 1.9840146926781193e-05, + "loss": 1.3417, + "mean_token_accuracy": 0.6644609669844309, + "num_tokens": 268784211.0, + "step": 1609 + }, + { + "entropy": 1.7760377724965413, + "epoch": 0.17686962731042816, + "grad_norm": 0.760371744632721, + "learning_rate": 1.9839846356767135e-05, + "loss": 1.5012, + "mean_token_accuracy": 0.6282084981600443, + "num_tokens": 268997493.0, + "step": 1610 + }, + { + "entropy": 1.750706394513448, + "epoch": 0.1769794842218011, + "grad_norm": 0.8299148678779602, + "learning_rate": 1.983954550697593e-05, + "loss": 1.3053, + "mean_token_accuracy": 0.6595065792401632, + "num_tokens": 269102555.0, + "step": 1611 + }, + { + "entropy": 1.7330009837945302, + "epoch": 0.17708934113317404, + "grad_norm": 0.6234577298164368, + "learning_rate": 1.9839244377417087e-05, + "loss": 1.4342, + "mean_token_accuracy": 0.6495455453793207, + "num_tokens": 269270569.0, + "step": 1612 + }, + { + "entropy": 1.712340384721756, + "epoch": 0.17719919804454698, + "grad_norm": 0.6255055069923401, + "learning_rate": 1.9838942968100145e-05, + "loss": 1.4671, + "mean_token_accuracy": 0.6483869006236395, + "num_tokens": 269488489.0, + "step": 1613 + }, + { + "entropy": 1.6904551486174266, + "epoch": 0.17730905495591992, + "grad_norm": 1.2697219848632812, + "learning_rate": 1.983864127903464e-05, + "loss": 1.229, + "mean_token_accuracy": 0.6800280114014944, + "num_tokens": 269745519.0, + "step": 1614 + }, + { + "entropy": 1.7634007533391316, + "epoch": 0.17741891186729286, + "grad_norm": 0.6574758291244507, + "learning_rate": 1.9838339310230123e-05, + "loss": 1.4662, + "mean_token_accuracy": 0.6376509219408035, + "num_tokens": 269926677.0, + "step": 1615 + }, + { + "entropy": 1.6982669830322266, + "epoch": 0.17752876877866577, + "grad_norm": 1.7531147003173828, + "learning_rate": 1.983803706169615e-05, + "loss": 1.0775, + "mean_token_accuracy": 0.6754929721355438, + "num_tokens": 270145004.0, + "step": 1616 + }, + { + "entropy": 1.7733216385046642, + "epoch": 0.17763862569003872, + "grad_norm": 0.7520893216133118, + "learning_rate": 1.983773453344228e-05, + "loss": 1.4172, + "mean_token_accuracy": 0.6514776547749838, + "num_tokens": 270295428.0, + "step": 1617 + }, + { + "entropy": 1.7757883270581563, + "epoch": 0.17774848260141166, + "grad_norm": 0.6765945553779602, + "learning_rate": 1.98374317254781e-05, + "loss": 1.3595, + "mean_token_accuracy": 0.661077231168747, + "num_tokens": 270507846.0, + "step": 1618 + }, + { + "entropy": 1.7714926997820537, + "epoch": 0.1778583395127846, + "grad_norm": 0.8542430400848389, + "learning_rate": 1.9837128637813187e-05, + "loss": 1.4031, + "mean_token_accuracy": 0.6585122595230738, + "num_tokens": 270682825.0, + "step": 1619 + }, + { + "entropy": 1.7776615619659424, + "epoch": 0.17796819642415754, + "grad_norm": 0.7186983823776245, + "learning_rate": 1.9836825270457133e-05, + "loss": 1.3677, + "mean_token_accuracy": 0.6544285813967387, + "num_tokens": 270818145.0, + "step": 1620 + }, + { + "entropy": 1.7131075461705525, + "epoch": 0.17807805333553048, + "grad_norm": 0.5991750359535217, + "learning_rate": 1.9836521623419546e-05, + "loss": 1.3429, + "mean_token_accuracy": 0.6644314974546432, + "num_tokens": 270978762.0, + "step": 1621 + }, + { + "entropy": 1.704333871603012, + "epoch": 0.17818791024690342, + "grad_norm": 0.9044831395149231, + "learning_rate": 1.983621769671003e-05, + "loss": 1.4033, + "mean_token_accuracy": 0.6686131457487742, + "num_tokens": 271125197.0, + "step": 1622 + }, + { + "entropy": 1.7072937885920207, + "epoch": 0.17829776715827633, + "grad_norm": 0.6216189861297607, + "learning_rate": 1.98359134903382e-05, + "loss": 1.4799, + "mean_token_accuracy": 0.6512039552132288, + "num_tokens": 271319205.0, + "step": 1623 + }, + { + "entropy": 1.6969341238339741, + "epoch": 0.17840762406964927, + "grad_norm": 0.8598399758338928, + "learning_rate": 1.9835609004313693e-05, + "loss": 1.3197, + "mean_token_accuracy": 0.6660919090112051, + "num_tokens": 271482991.0, + "step": 1624 + }, + { + "entropy": 1.7475373148918152, + "epoch": 0.17851748098102221, + "grad_norm": 0.8220011591911316, + "learning_rate": 1.9835304238646146e-05, + "loss": 1.3325, + "mean_token_accuracy": 0.6602373421192169, + "num_tokens": 271615079.0, + "step": 1625 + }, + { + "entropy": 1.7845442990461986, + "epoch": 0.17862733789239515, + "grad_norm": 0.7265953421592712, + "learning_rate": 1.9834999193345197e-05, + "loss": 1.2799, + "mean_token_accuracy": 0.6676252981026968, + "num_tokens": 271737120.0, + "step": 1626 + }, + { + "entropy": 1.7501579523086548, + "epoch": 0.1787371948037681, + "grad_norm": 0.6195200681686401, + "learning_rate": 1.9834693868420505e-05, + "loss": 1.4889, + "mean_token_accuracy": 0.643327941497167, + "num_tokens": 271926352.0, + "step": 1627 + }, + { + "entropy": 1.7393087645371754, + "epoch": 0.17884705171514104, + "grad_norm": 0.8133971095085144, + "learning_rate": 1.9834388263881736e-05, + "loss": 1.4181, + "mean_token_accuracy": 0.6791380792856216, + "num_tokens": 272038740.0, + "step": 1628 + }, + { + "entropy": 1.6529111862182617, + "epoch": 0.17895690862651398, + "grad_norm": 0.653113603591919, + "learning_rate": 1.9834082379738556e-05, + "loss": 1.4357, + "mean_token_accuracy": 0.6570485532283783, + "num_tokens": 272226850.0, + "step": 1629 + }, + { + "entropy": 1.701356291770935, + "epoch": 0.1790667655378869, + "grad_norm": 0.7334955334663391, + "learning_rate": 1.983377621600065e-05, + "loss": 1.3291, + "mean_token_accuracy": 0.6579341193040212, + "num_tokens": 272393860.0, + "step": 1630 + }, + { + "entropy": 1.7137031455834706, + "epoch": 0.17917662244925983, + "grad_norm": 0.6799026727676392, + "learning_rate": 1.983346977267771e-05, + "loss": 1.4359, + "mean_token_accuracy": 0.6577520171801249, + "num_tokens": 272577442.0, + "step": 1631 + }, + { + "entropy": 1.7033151189486186, + "epoch": 0.17928647936063277, + "grad_norm": 0.7878915071487427, + "learning_rate": 1.983316304977943e-05, + "loss": 1.4819, + "mean_token_accuracy": 0.6430552899837494, + "num_tokens": 272770738.0, + "step": 1632 + }, + { + "entropy": 1.6933635870615642, + "epoch": 0.1793963362720057, + "grad_norm": 0.7030259966850281, + "learning_rate": 1.9832856047315522e-05, + "loss": 1.3218, + "mean_token_accuracy": 0.6660377085208893, + "num_tokens": 272972512.0, + "step": 1633 + }, + { + "entropy": 1.767316649357478, + "epoch": 0.17950619318337865, + "grad_norm": 0.6197423934936523, + "learning_rate": 1.9832548765295696e-05, + "loss": 1.4417, + "mean_token_accuracy": 0.6372250666220983, + "num_tokens": 273161984.0, + "step": 1634 + }, + { + "entropy": 1.6606941322485607, + "epoch": 0.1796160500947516, + "grad_norm": 0.6114673614501953, + "learning_rate": 1.9832241203729684e-05, + "loss": 1.4749, + "mean_token_accuracy": 0.649383544921875, + "num_tokens": 273362338.0, + "step": 1635 + }, + { + "entropy": 1.757198413213094, + "epoch": 0.17972590700612454, + "grad_norm": 0.7329999208450317, + "learning_rate": 1.9831933362627215e-05, + "loss": 1.5256, + "mean_token_accuracy": 0.6449063618977865, + "num_tokens": 273550420.0, + "step": 1636 + }, + { + "entropy": 1.7132985492547352, + "epoch": 0.17983576391749745, + "grad_norm": 0.72648024559021, + "learning_rate": 1.983162524199804e-05, + "loss": 1.5848, + "mean_token_accuracy": 0.6244159291187922, + "num_tokens": 273767744.0, + "step": 1637 + }, + { + "entropy": 1.7032330830891926, + "epoch": 0.1799456208288704, + "grad_norm": 0.7133116126060486, + "learning_rate": 1.9831316841851906e-05, + "loss": 1.4667, + "mean_token_accuracy": 0.6442908545335134, + "num_tokens": 274002746.0, + "step": 1638 + }, + { + "entropy": 1.700139433145523, + "epoch": 0.18005547774024333, + "grad_norm": 0.662652850151062, + "learning_rate": 1.9831008162198565e-05, + "loss": 1.3707, + "mean_token_accuracy": 0.6429425726334254, + "num_tokens": 274183020.0, + "step": 1639 + }, + { + "entropy": 1.700132042169571, + "epoch": 0.18016533465161627, + "grad_norm": 0.7111302018165588, + "learning_rate": 1.9830699203047804e-05, + "loss": 1.4556, + "mean_token_accuracy": 0.6517406602700552, + "num_tokens": 274383586.0, + "step": 1640 + }, + { + "entropy": 1.7377035915851593, + "epoch": 0.1802751915629892, + "grad_norm": 0.7069066762924194, + "learning_rate": 1.983038996440939e-05, + "loss": 1.3404, + "mean_token_accuracy": 0.6645782341559728, + "num_tokens": 274524820.0, + "step": 1641 + }, + { + "entropy": 1.7640187640984852, + "epoch": 0.18038504847436215, + "grad_norm": 0.7231025099754333, + "learning_rate": 1.983008044629311e-05, + "loss": 1.3854, + "mean_token_accuracy": 0.6507704704999924, + "num_tokens": 274656672.0, + "step": 1642 + }, + { + "entropy": 1.728807379802068, + "epoch": 0.1804949053857351, + "grad_norm": 0.6958527565002441, + "learning_rate": 1.9829770648708764e-05, + "loss": 1.4108, + "mean_token_accuracy": 0.6552976568539938, + "num_tokens": 274853709.0, + "step": 1643 + }, + { + "entropy": 1.7781054377555847, + "epoch": 0.180604762297108, + "grad_norm": 0.8837335705757141, + "learning_rate": 1.9829460571666156e-05, + "loss": 1.4283, + "mean_token_accuracy": 0.6518157124519348, + "num_tokens": 274988556.0, + "step": 1644 + }, + { + "entropy": 1.7898385723431904, + "epoch": 0.18071461920848095, + "grad_norm": 0.801447331905365, + "learning_rate": 1.9829150215175103e-05, + "loss": 1.5257, + "mean_token_accuracy": 0.6439861307541529, + "num_tokens": 275141877.0, + "step": 1645 + }, + { + "entropy": 1.6892934044202168, + "epoch": 0.1808244761198539, + "grad_norm": 0.621210515499115, + "learning_rate": 1.982883957924542e-05, + "loss": 1.337, + "mean_token_accuracy": 0.6619320660829544, + "num_tokens": 275286921.0, + "step": 1646 + }, + { + "entropy": 1.7738712231318157, + "epoch": 0.18093433303122683, + "grad_norm": 0.7250021696090698, + "learning_rate": 1.9828528663886946e-05, + "loss": 1.4375, + "mean_token_accuracy": 0.6405630757411321, + "num_tokens": 275465643.0, + "step": 1647 + }, + { + "entropy": 1.7501648565133412, + "epoch": 0.18104418994259977, + "grad_norm": 0.7406297326087952, + "learning_rate": 1.9828217469109514e-05, + "loss": 1.6333, + "mean_token_accuracy": 0.631665957470735, + "num_tokens": 275636335.0, + "step": 1648 + }, + { + "entropy": 1.7328572471936543, + "epoch": 0.1811540468539727, + "grad_norm": 0.7023396492004395, + "learning_rate": 1.982790599492298e-05, + "loss": 1.3489, + "mean_token_accuracy": 0.6668912867705027, + "num_tokens": 275778888.0, + "step": 1649 + }, + { + "entropy": 1.7373796900113423, + "epoch": 0.18126390376534562, + "grad_norm": 0.6394554376602173, + "learning_rate": 1.9827594241337196e-05, + "loss": 1.2949, + "mean_token_accuracy": 0.6552981982628504, + "num_tokens": 275936865.0, + "step": 1650 + }, + { + "entropy": 1.6960271497567494, + "epoch": 0.18137376067671857, + "grad_norm": 0.6193222403526306, + "learning_rate": 1.9827282208362034e-05, + "loss": 1.3971, + "mean_token_accuracy": 0.6560710817575455, + "num_tokens": 276084652.0, + "step": 1651 + }, + { + "entropy": 1.747694154580434, + "epoch": 0.1814836175880915, + "grad_norm": 0.6936965584754944, + "learning_rate": 1.982696989600737e-05, + "loss": 1.3516, + "mean_token_accuracy": 0.6600579669078191, + "num_tokens": 276264605.0, + "step": 1652 + }, + { + "entropy": 1.7094822824001312, + "epoch": 0.18159347449946445, + "grad_norm": 0.7348982095718384, + "learning_rate": 1.9826657304283085e-05, + "loss": 1.3374, + "mean_token_accuracy": 0.6641669621070226, + "num_tokens": 276425397.0, + "step": 1653 + }, + { + "entropy": 1.7678433259328206, + "epoch": 0.1817033314108374, + "grad_norm": 0.6787462830543518, + "learning_rate": 1.982634443319907e-05, + "loss": 1.4428, + "mean_token_accuracy": 0.6486629645029703, + "num_tokens": 276586282.0, + "step": 1654 + }, + { + "entropy": 1.7211858630180359, + "epoch": 0.18181318832221033, + "grad_norm": 0.6997633576393127, + "learning_rate": 1.9826031282765233e-05, + "loss": 1.3731, + "mean_token_accuracy": 0.6508075048526129, + "num_tokens": 276786867.0, + "step": 1655 + }, + { + "entropy": 1.7509803275267284, + "epoch": 0.18192304523358327, + "grad_norm": 0.7596442699432373, + "learning_rate": 1.9825717852991487e-05, + "loss": 1.4368, + "mean_token_accuracy": 0.6510319958130518, + "num_tokens": 276926610.0, + "step": 1656 + }, + { + "entropy": 1.7846331695715587, + "epoch": 0.18203290214495618, + "grad_norm": 0.7824363708496094, + "learning_rate": 1.9825404143887746e-05, + "loss": 1.5756, + "mean_token_accuracy": 0.6446571896473566, + "num_tokens": 277112438.0, + "step": 1657 + }, + { + "entropy": 1.7570526401201885, + "epoch": 0.18214275905632912, + "grad_norm": 0.7908049821853638, + "learning_rate": 1.9825090155463936e-05, + "loss": 1.3862, + "mean_token_accuracy": 0.6663278043270111, + "num_tokens": 277257253.0, + "step": 1658 + }, + { + "entropy": 1.6966181596120198, + "epoch": 0.18225261596770206, + "grad_norm": 0.7348120808601379, + "learning_rate": 1.9824775887730006e-05, + "loss": 1.2518, + "mean_token_accuracy": 0.6801349520683289, + "num_tokens": 277366545.0, + "step": 1659 + }, + { + "entropy": 1.7101906538009644, + "epoch": 0.182362472879075, + "grad_norm": 0.7664896845817566, + "learning_rate": 1.9824461340695892e-05, + "loss": 1.6055, + "mean_token_accuracy": 0.6423424060146014, + "num_tokens": 277553846.0, + "step": 1660 + }, + { + "entropy": 1.6806841989358265, + "epoch": 0.18247232979044795, + "grad_norm": 0.5992354154586792, + "learning_rate": 1.9824146514371553e-05, + "loss": 1.4604, + "mean_token_accuracy": 0.6359513700008392, + "num_tokens": 277779533.0, + "step": 1661 + }, + { + "entropy": 1.7670801480611165, + "epoch": 0.1825821867018209, + "grad_norm": 0.8071349263191223, + "learning_rate": 1.9823831408766953e-05, + "loss": 1.703, + "mean_token_accuracy": 0.6325116107861201, + "num_tokens": 277985352.0, + "step": 1662 + }, + { + "entropy": 1.6910718381404877, + "epoch": 0.18269204361319383, + "grad_norm": 0.6777242422103882, + "learning_rate": 1.9823516023892067e-05, + "loss": 1.5038, + "mean_token_accuracy": 0.6417184472084045, + "num_tokens": 278220879.0, + "step": 1663 + }, + { + "entropy": 1.728562315305074, + "epoch": 0.18280190052456674, + "grad_norm": 0.6362787485122681, + "learning_rate": 1.9823200359756875e-05, + "loss": 1.4164, + "mean_token_accuracy": 0.6453818678855896, + "num_tokens": 278368023.0, + "step": 1664 + }, + { + "entropy": 1.7313311994075775, + "epoch": 0.18291175743593968, + "grad_norm": 0.6767556071281433, + "learning_rate": 1.9822884416371364e-05, + "loss": 1.3056, + "mean_token_accuracy": 0.6659288257360458, + "num_tokens": 278496388.0, + "step": 1665 + }, + { + "entropy": 1.6881966690222423, + "epoch": 0.18302161434731262, + "grad_norm": 0.6294616460800171, + "learning_rate": 1.982256819374554e-05, + "loss": 1.4921, + "mean_token_accuracy": 0.6462472081184387, + "num_tokens": 278674023.0, + "step": 1666 + }, + { + "entropy": 1.6751320759455364, + "epoch": 0.18313147125868556, + "grad_norm": 0.5912817120552063, + "learning_rate": 1.9822251691889408e-05, + "loss": 1.3839, + "mean_token_accuracy": 0.6660457054773966, + "num_tokens": 278837833.0, + "step": 1667 + }, + { + "entropy": 1.7794308066368103, + "epoch": 0.1832413281700585, + "grad_norm": 1.0613802671432495, + "learning_rate": 1.9821934910812984e-05, + "loss": 1.5629, + "mean_token_accuracy": 0.6530888924996058, + "num_tokens": 279020010.0, + "step": 1668 + }, + { + "entropy": 1.6890638172626495, + "epoch": 0.18335118508143144, + "grad_norm": 0.9429357647895813, + "learning_rate": 1.9821617850526297e-05, + "loss": 1.7154, + "mean_token_accuracy": 0.6422553857167562, + "num_tokens": 279192730.0, + "step": 1669 + }, + { + "entropy": 1.7527574300765991, + "epoch": 0.18346104199280439, + "grad_norm": 0.6539662480354309, + "learning_rate": 1.9821300511039378e-05, + "loss": 1.3789, + "mean_token_accuracy": 0.6461327920357386, + "num_tokens": 279341934.0, + "step": 1670 + }, + { + "entropy": 1.7701294422149658, + "epoch": 0.1835708989041773, + "grad_norm": 0.7200759053230286, + "learning_rate": 1.9820982892362274e-05, + "loss": 1.3941, + "mean_token_accuracy": 0.65191750228405, + "num_tokens": 279491511.0, + "step": 1671 + }, + { + "entropy": 1.8013378481070201, + "epoch": 0.18368075581555024, + "grad_norm": 0.7267040610313416, + "learning_rate": 1.9820664994505035e-05, + "loss": 1.4202, + "mean_token_accuracy": 0.6435056875149409, + "num_tokens": 279678965.0, + "step": 1672 + }, + { + "entropy": 1.723004271586736, + "epoch": 0.18379061272692318, + "grad_norm": 0.7009273171424866, + "learning_rate": 1.9820346817477725e-05, + "loss": 1.4127, + "mean_token_accuracy": 0.6442108601331711, + "num_tokens": 279853720.0, + "step": 1673 + }, + { + "entropy": 1.710123598575592, + "epoch": 0.18390046963829612, + "grad_norm": 0.8877512812614441, + "learning_rate": 1.982002836129041e-05, + "loss": 1.4214, + "mean_token_accuracy": 0.6699336767196655, + "num_tokens": 279997611.0, + "step": 1674 + }, + { + "entropy": 1.7392498552799225, + "epoch": 0.18401032654966906, + "grad_norm": 0.8386234641075134, + "learning_rate": 1.9819709625953174e-05, + "loss": 1.285, + "mean_token_accuracy": 0.6787472317616144, + "num_tokens": 280114602.0, + "step": 1675 + }, + { + "entropy": 1.763936976591746, + "epoch": 0.184120183461042, + "grad_norm": 0.8790518641471863, + "learning_rate": 1.9819390611476105e-05, + "loss": 1.3905, + "mean_token_accuracy": 0.6481430331865946, + "num_tokens": 280325517.0, + "step": 1676 + }, + { + "entropy": 1.725318839152654, + "epoch": 0.18423004037241492, + "grad_norm": 0.5728416442871094, + "learning_rate": 1.9819071317869295e-05, + "loss": 1.5904, + "mean_token_accuracy": 0.6158607254425684, + "num_tokens": 280595902.0, + "step": 1677 + }, + { + "entropy": 1.7371946076552074, + "epoch": 0.18433989728378786, + "grad_norm": 0.5821994543075562, + "learning_rate": 1.9818751745142853e-05, + "loss": 1.4596, + "mean_token_accuracy": 0.645810733238856, + "num_tokens": 280830692.0, + "step": 1678 + }, + { + "entropy": 1.8042426307996113, + "epoch": 0.1844497541951608, + "grad_norm": 0.6637836694717407, + "learning_rate": 1.9818431893306887e-05, + "loss": 1.5292, + "mean_token_accuracy": 0.6280421316623688, + "num_tokens": 281004533.0, + "step": 1679 + }, + { + "entropy": 1.7169641653696697, + "epoch": 0.18455961110653374, + "grad_norm": 0.7053066492080688, + "learning_rate": 1.981811176237153e-05, + "loss": 1.3631, + "mean_token_accuracy": 0.6582474460204443, + "num_tokens": 281160638.0, + "step": 1680 + }, + { + "entropy": 1.777447024981181, + "epoch": 0.18466946801790668, + "grad_norm": 0.7402387857437134, + "learning_rate": 1.981779135234691e-05, + "loss": 1.5166, + "mean_token_accuracy": 0.6345723768075308, + "num_tokens": 281338443.0, + "step": 1681 + }, + { + "entropy": 1.7175538738568623, + "epoch": 0.18477932492927962, + "grad_norm": 0.5362944602966309, + "learning_rate": 1.9817470663243165e-05, + "loss": 0.9837, + "mean_token_accuracy": 0.6940766374270121, + "num_tokens": 281502174.0, + "step": 1682 + }, + { + "entropy": 1.77180611093839, + "epoch": 0.18488918184065256, + "grad_norm": 0.9495187401771545, + "learning_rate": 1.9817149695070447e-05, + "loss": 1.4066, + "mean_token_accuracy": 0.6637826462586721, + "num_tokens": 281636795.0, + "step": 1683 + }, + { + "entropy": 1.7006933093070984, + "epoch": 0.18499903875202547, + "grad_norm": 0.5615609884262085, + "learning_rate": 1.9816828447838913e-05, + "loss": 1.3676, + "mean_token_accuracy": 0.6498903632164001, + "num_tokens": 281795185.0, + "step": 1684 + }, + { + "entropy": 1.8015896479288738, + "epoch": 0.18510889566339842, + "grad_norm": 0.7910170555114746, + "learning_rate": 1.9816506921558733e-05, + "loss": 1.4873, + "mean_token_accuracy": 0.6395314981540045, + "num_tokens": 281967043.0, + "step": 1685 + }, + { + "entropy": 1.7635705371697743, + "epoch": 0.18521875257477136, + "grad_norm": 0.6194027066230774, + "learning_rate": 1.9816185116240084e-05, + "loss": 1.4286, + "mean_token_accuracy": 0.6360458632310232, + "num_tokens": 282209695.0, + "step": 1686 + }, + { + "entropy": 1.727480669816335, + "epoch": 0.1853286094861443, + "grad_norm": 0.6732087135314941, + "learning_rate": 1.981586303189315e-05, + "loss": 1.2988, + "mean_token_accuracy": 0.6633017708857855, + "num_tokens": 282393134.0, + "step": 1687 + }, + { + "entropy": 1.7175672849019368, + "epoch": 0.18543846639751724, + "grad_norm": 0.7200150489807129, + "learning_rate": 1.9815540668528116e-05, + "loss": 1.4865, + "mean_token_accuracy": 0.652409682671229, + "num_tokens": 282586386.0, + "step": 1688 + }, + { + "entropy": 1.7157348195711772, + "epoch": 0.18554832330889018, + "grad_norm": 0.6159201264381409, + "learning_rate": 1.9815218026155194e-05, + "loss": 1.4005, + "mean_token_accuracy": 0.6451119929552078, + "num_tokens": 282769139.0, + "step": 1689 + }, + { + "entropy": 1.7268279194831848, + "epoch": 0.18565818022026312, + "grad_norm": 0.8370085954666138, + "learning_rate": 1.9814895104784598e-05, + "loss": 1.3224, + "mean_token_accuracy": 0.6635098308324814, + "num_tokens": 282906774.0, + "step": 1690 + }, + { + "entropy": 1.7369357744852703, + "epoch": 0.18576803713163603, + "grad_norm": 0.8012304306030273, + "learning_rate": 1.9814571904426543e-05, + "loss": 1.3494, + "mean_token_accuracy": 0.6629086136817932, + "num_tokens": 283026160.0, + "step": 1691 + }, + { + "entropy": 1.766795575618744, + "epoch": 0.18587789404300897, + "grad_norm": 0.6900972127914429, + "learning_rate": 1.9814248425091256e-05, + "loss": 1.4208, + "mean_token_accuracy": 0.6468196511268616, + "num_tokens": 283225811.0, + "step": 1692 + }, + { + "entropy": 1.739597777525584, + "epoch": 0.18598775095438191, + "grad_norm": 0.6812617778778076, + "learning_rate": 1.981392466678898e-05, + "loss": 1.4982, + "mean_token_accuracy": 0.6387915263573328, + "num_tokens": 283443567.0, + "step": 1693 + }, + { + "entropy": 1.6632297138373058, + "epoch": 0.18609760786575485, + "grad_norm": 0.7416886687278748, + "learning_rate": 1.981360062952996e-05, + "loss": 1.4409, + "mean_token_accuracy": 0.6323288530111313, + "num_tokens": 283658416.0, + "step": 1694 + }, + { + "entropy": 1.7458237608273823, + "epoch": 0.1862074647771278, + "grad_norm": 0.5728136897087097, + "learning_rate": 1.9813276313324453e-05, + "loss": 1.3206, + "mean_token_accuracy": 0.6567148516575495, + "num_tokens": 283807479.0, + "step": 1695 + }, + { + "entropy": 1.7193138301372528, + "epoch": 0.18631732168850074, + "grad_norm": 0.6321941018104553, + "learning_rate": 1.981295171818272e-05, + "loss": 1.3395, + "mean_token_accuracy": 0.6582736521959305, + "num_tokens": 284013422.0, + "step": 1696 + }, + { + "entropy": 1.7184888124465942, + "epoch": 0.18642717859987368, + "grad_norm": 0.579247236251831, + "learning_rate": 1.981262684411504e-05, + "loss": 1.4342, + "mean_token_accuracy": 0.6520635535319647, + "num_tokens": 284242706.0, + "step": 1697 + }, + { + "entropy": 1.7161762118339539, + "epoch": 0.1865370355112466, + "grad_norm": 0.6291903853416443, + "learning_rate": 1.9812301691131688e-05, + "loss": 1.3629, + "mean_token_accuracy": 0.663032611211141, + "num_tokens": 284413353.0, + "step": 1698 + }, + { + "entropy": 1.7629437744617462, + "epoch": 0.18664689242261953, + "grad_norm": 0.7114847898483276, + "learning_rate": 1.981197625924296e-05, + "loss": 1.3757, + "mean_token_accuracy": 0.6550849924484888, + "num_tokens": 284543844.0, + "step": 1699 + }, + { + "entropy": 1.7381982902685802, + "epoch": 0.18675674933399247, + "grad_norm": 0.7382696866989136, + "learning_rate": 1.9811650548459155e-05, + "loss": 1.4664, + "mean_token_accuracy": 0.6513356864452362, + "num_tokens": 284731517.0, + "step": 1700 + }, + { + "entropy": 1.674417903025945, + "epoch": 0.1868666062453654, + "grad_norm": 0.7865607142448425, + "learning_rate": 1.9811324558790573e-05, + "loss": 1.3178, + "mean_token_accuracy": 0.6622059692939123, + "num_tokens": 284848458.0, + "step": 1701 + }, + { + "entropy": 1.6967013478279114, + "epoch": 0.18697646315673835, + "grad_norm": 0.6472486853599548, + "learning_rate": 1.9810998290247547e-05, + "loss": 1.4338, + "mean_token_accuracy": 0.6450558453798294, + "num_tokens": 285063455.0, + "step": 1702 + }, + { + "entropy": 1.701272616783778, + "epoch": 0.1870863200681113, + "grad_norm": 0.6370506882667542, + "learning_rate": 1.9810671742840394e-05, + "loss": 1.3558, + "mean_token_accuracy": 0.6611084739367167, + "num_tokens": 285219438.0, + "step": 1703 + }, + { + "entropy": 1.7011124789714813, + "epoch": 0.18719617697948424, + "grad_norm": 0.7365835309028625, + "learning_rate": 1.981034491657945e-05, + "loss": 1.4027, + "mean_token_accuracy": 0.6517395476500193, + "num_tokens": 285421503.0, + "step": 1704 + }, + { + "entropy": 1.7084963818391163, + "epoch": 0.18730603389085715, + "grad_norm": 0.7742033004760742, + "learning_rate": 1.9810017811475058e-05, + "loss": 1.3874, + "mean_token_accuracy": 0.6629381775856018, + "num_tokens": 285546599.0, + "step": 1705 + }, + { + "entropy": 1.7513247827688854, + "epoch": 0.1874158908022301, + "grad_norm": 0.6956959962844849, + "learning_rate": 1.9809690427537577e-05, + "loss": 1.436, + "mean_token_accuracy": 0.6494058966636658, + "num_tokens": 285726907.0, + "step": 1706 + }, + { + "entropy": 1.6769114037354786, + "epoch": 0.18752574771360303, + "grad_norm": 0.6903713345527649, + "learning_rate": 1.9809362764777357e-05, + "loss": 1.3839, + "mean_token_accuracy": 0.656981165210406, + "num_tokens": 285913198.0, + "step": 1707 + }, + { + "entropy": 1.7068938712279003, + "epoch": 0.18763560462497597, + "grad_norm": 0.6433144807815552, + "learning_rate": 1.980903482320478e-05, + "loss": 1.4031, + "mean_token_accuracy": 0.6512850423653921, + "num_tokens": 286084913.0, + "step": 1708 + }, + { + "entropy": 1.7556909918785095, + "epoch": 0.1877454615363489, + "grad_norm": 0.6782954335212708, + "learning_rate": 1.980870660283022e-05, + "loss": 1.3747, + "mean_token_accuracy": 0.6523662805557251, + "num_tokens": 286225973.0, + "step": 1709 + }, + { + "entropy": 1.7448440194129944, + "epoch": 0.18785531844772185, + "grad_norm": 0.7479444742202759, + "learning_rate": 1.9808378103664064e-05, + "loss": 1.3416, + "mean_token_accuracy": 0.6604155600070953, + "num_tokens": 286384384.0, + "step": 1710 + }, + { + "entropy": 1.7512960731983185, + "epoch": 0.18796517535909477, + "grad_norm": 0.7333407402038574, + "learning_rate": 1.980804932571671e-05, + "loss": 1.3877, + "mean_token_accuracy": 0.6567022105058035, + "num_tokens": 286546675.0, + "step": 1711 + }, + { + "entropy": 1.6740979949633281, + "epoch": 0.1880750322704677, + "grad_norm": 0.8082526922225952, + "learning_rate": 1.9807720268998563e-05, + "loss": 1.3267, + "mean_token_accuracy": 0.6659936855236689, + "num_tokens": 286679729.0, + "step": 1712 + }, + { + "entropy": 1.7247611383597057, + "epoch": 0.18818488918184065, + "grad_norm": 0.6447880268096924, + "learning_rate": 1.980739093352004e-05, + "loss": 1.3205, + "mean_token_accuracy": 0.6574007123708725, + "num_tokens": 286806107.0, + "step": 1713 + }, + { + "entropy": 1.683200756708781, + "epoch": 0.1882947460932136, + "grad_norm": 0.7221059203147888, + "learning_rate": 1.9807061319291562e-05, + "loss": 1.4732, + "mean_token_accuracy": 0.6614238594969114, + "num_tokens": 286932425.0, + "step": 1714 + }, + { + "entropy": 1.733568549156189, + "epoch": 0.18840460300458653, + "grad_norm": 0.8345680832862854, + "learning_rate": 1.980673142632356e-05, + "loss": 1.3026, + "mean_token_accuracy": 0.6688729921976725, + "num_tokens": 287051397.0, + "step": 1715 + }, + { + "entropy": 1.700208157300949, + "epoch": 0.18851445991595947, + "grad_norm": 0.7973415851593018, + "learning_rate": 1.9806401254626483e-05, + "loss": 1.3171, + "mean_token_accuracy": 0.6919489403565725, + "num_tokens": 287216101.0, + "step": 1716 + }, + { + "entropy": 1.7363602022329967, + "epoch": 0.1886243168273324, + "grad_norm": 0.798485517501831, + "learning_rate": 1.9806070804210768e-05, + "loss": 1.4979, + "mean_token_accuracy": 0.6549982378880183, + "num_tokens": 287384291.0, + "step": 1717 + }, + { + "entropy": 1.6927721202373505, + "epoch": 0.18873417373870532, + "grad_norm": 0.6318584084510803, + "learning_rate": 1.9805740075086884e-05, + "loss": 1.2625, + "mean_token_accuracy": 0.6866245418787003, + "num_tokens": 287521194.0, + "step": 1718 + }, + { + "entropy": 1.7616549928983052, + "epoch": 0.18884403065007827, + "grad_norm": 0.6952632069587708, + "learning_rate": 1.980540906726529e-05, + "loss": 1.5152, + "mean_token_accuracy": 0.6423781365156174, + "num_tokens": 287695130.0, + "step": 1719 + }, + { + "entropy": 1.7475587129592896, + "epoch": 0.1889538875614512, + "grad_norm": 0.8023023009300232, + "learning_rate": 1.9805077780756473e-05, + "loss": 1.2237, + "mean_token_accuracy": 0.6784818867842356, + "num_tokens": 287806319.0, + "step": 1720 + }, + { + "entropy": 1.7310992081960042, + "epoch": 0.18906374447282415, + "grad_norm": 0.7073454856872559, + "learning_rate": 1.9804746215570908e-05, + "loss": 1.4297, + "mean_token_accuracy": 0.6600957165161768, + "num_tokens": 288006487.0, + "step": 1721 + }, + { + "entropy": 1.6791634062925975, + "epoch": 0.1891736013841971, + "grad_norm": 0.7400916218757629, + "learning_rate": 1.9804414371719096e-05, + "loss": 1.2141, + "mean_token_accuracy": 0.6767335186402003, + "num_tokens": 288109036.0, + "step": 1722 + }, + { + "entropy": 1.7825380861759186, + "epoch": 0.18928345829557003, + "grad_norm": 0.786323070526123, + "learning_rate": 1.9804082249211533e-05, + "loss": 1.4554, + "mean_token_accuracy": 0.6546699553728104, + "num_tokens": 288234316.0, + "step": 1723 + }, + { + "entropy": 1.7622264524300892, + "epoch": 0.18939331520694297, + "grad_norm": 0.7630921602249146, + "learning_rate": 1.9803749848058733e-05, + "loss": 1.2852, + "mean_token_accuracy": 0.6784159690141678, + "num_tokens": 288358675.0, + "step": 1724 + }, + { + "entropy": 1.6970917185147603, + "epoch": 0.18950317211831588, + "grad_norm": 0.9257987141609192, + "learning_rate": 1.980341716827122e-05, + "loss": 1.3535, + "mean_token_accuracy": 0.658329596122106, + "num_tokens": 288525891.0, + "step": 1725 + }, + { + "entropy": 1.7701091667016347, + "epoch": 0.18961302902968882, + "grad_norm": 0.7394087910652161, + "learning_rate": 1.980308420985952e-05, + "loss": 1.3935, + "mean_token_accuracy": 0.6555753747622172, + "num_tokens": 288695061.0, + "step": 1726 + }, + { + "entropy": 1.7800631125768025, + "epoch": 0.18972288594106176, + "grad_norm": 0.8137099742889404, + "learning_rate": 1.980275097283417e-05, + "loss": 1.5755, + "mean_token_accuracy": 0.6337922463814417, + "num_tokens": 288879071.0, + "step": 1727 + }, + { + "entropy": 1.6966053247451782, + "epoch": 0.1898327428524347, + "grad_norm": 0.6805859804153442, + "learning_rate": 1.980241745720572e-05, + "loss": 1.56, + "mean_token_accuracy": 0.6277511119842529, + "num_tokens": 289092246.0, + "step": 1728 + }, + { + "entropy": 1.7272930939992268, + "epoch": 0.18994259976380765, + "grad_norm": 0.6892310976982117, + "learning_rate": 1.9802083662984727e-05, + "loss": 1.5014, + "mean_token_accuracy": 0.645158996184667, + "num_tokens": 289246411.0, + "step": 1729 + }, + { + "entropy": 1.718581090370814, + "epoch": 0.1900524566751806, + "grad_norm": 0.7332895994186401, + "learning_rate": 1.9801749590181747e-05, + "loss": 1.4741, + "mean_token_accuracy": 0.6542643109957377, + "num_tokens": 289450051.0, + "step": 1730 + }, + { + "entropy": 1.7375418742497761, + "epoch": 0.19016231358655353, + "grad_norm": 0.6508983969688416, + "learning_rate": 1.980141523880736e-05, + "loss": 1.4436, + "mean_token_accuracy": 0.6606499453385671, + "num_tokens": 289635594.0, + "step": 1731 + }, + { + "entropy": 1.749743362267812, + "epoch": 0.19027217049792644, + "grad_norm": 0.6622723937034607, + "learning_rate": 1.980108060887215e-05, + "loss": 1.3431, + "mean_token_accuracy": 0.6567785541216532, + "num_tokens": 289785489.0, + "step": 1732 + }, + { + "entropy": 1.7038879295190175, + "epoch": 0.19038202740929938, + "grad_norm": 0.7151694297790527, + "learning_rate": 1.98007457003867e-05, + "loss": 1.4463, + "mean_token_accuracy": 0.6599321961402893, + "num_tokens": 289955840.0, + "step": 1733 + }, + { + "entropy": 1.6714328130086262, + "epoch": 0.19049188432067232, + "grad_norm": 0.5718501210212708, + "learning_rate": 1.980041051336162e-05, + "loss": 1.3941, + "mean_token_accuracy": 0.6486310015122095, + "num_tokens": 290155918.0, + "step": 1734 + }, + { + "entropy": 1.7431990305582683, + "epoch": 0.19060174123204526, + "grad_norm": 0.7136338949203491, + "learning_rate": 1.9800075047807507e-05, + "loss": 1.4286, + "mean_token_accuracy": 0.6443975865840912, + "num_tokens": 290318477.0, + "step": 1735 + }, + { + "entropy": 1.7279701729615529, + "epoch": 0.1907115981434182, + "grad_norm": 0.8319575786590576, + "learning_rate": 1.9799739303734986e-05, + "loss": 1.3872, + "mean_token_accuracy": 0.6527506609757742, + "num_tokens": 290442722.0, + "step": 1736 + }, + { + "entropy": 1.6454756160577138, + "epoch": 0.19082145505479114, + "grad_norm": 0.6253258585929871, + "learning_rate": 1.9799403281154684e-05, + "loss": 1.2394, + "mean_token_accuracy": 0.6801058252652487, + "num_tokens": 290589905.0, + "step": 1737 + }, + { + "entropy": 1.7290644546349843, + "epoch": 0.19093131196616406, + "grad_norm": 0.6695640087127686, + "learning_rate": 1.9799066980077227e-05, + "loss": 1.4237, + "mean_token_accuracy": 0.6563388605912527, + "num_tokens": 290762031.0, + "step": 1738 + }, + { + "entropy": 1.7278599540392559, + "epoch": 0.191041168877537, + "grad_norm": 0.7530442476272583, + "learning_rate": 1.979873040051327e-05, + "loss": 1.5137, + "mean_token_accuracy": 0.6513047764698664, + "num_tokens": 290924823.0, + "step": 1739 + }, + { + "entropy": 1.79681396484375, + "epoch": 0.19115102578890994, + "grad_norm": 0.7993313074111938, + "learning_rate": 1.9798393542473456e-05, + "loss": 1.4511, + "mean_token_accuracy": 0.6647708465655645, + "num_tokens": 291080098.0, + "step": 1740 + }, + { + "entropy": 1.7253247797489166, + "epoch": 0.19126088270028288, + "grad_norm": 0.6776132583618164, + "learning_rate": 1.9798056405968457e-05, + "loss": 1.3921, + "mean_token_accuracy": 0.6470450113217036, + "num_tokens": 291206890.0, + "step": 1741 + }, + { + "entropy": 1.7353723645210266, + "epoch": 0.19137073961165582, + "grad_norm": 0.8562172055244446, + "learning_rate": 1.9797718991008936e-05, + "loss": 1.3435, + "mean_token_accuracy": 0.6580035636822382, + "num_tokens": 291343320.0, + "step": 1742 + }, + { + "entropy": 1.7603330214818318, + "epoch": 0.19148059652302876, + "grad_norm": 0.7309443950653076, + "learning_rate": 1.979738129760557e-05, + "loss": 1.5365, + "mean_token_accuracy": 0.622960185011228, + "num_tokens": 291550070.0, + "step": 1743 + }, + { + "entropy": 1.7653774221738179, + "epoch": 0.1915904534344017, + "grad_norm": 0.833625316619873, + "learning_rate": 1.9797043325769056e-05, + "loss": 1.3869, + "mean_token_accuracy": 0.6533713638782501, + "num_tokens": 291680108.0, + "step": 1744 + }, + { + "entropy": 1.715358128150304, + "epoch": 0.19170031034577462, + "grad_norm": 0.7196187973022461, + "learning_rate": 1.979670507551008e-05, + "loss": 1.4072, + "mean_token_accuracy": 0.6509930094083151, + "num_tokens": 291852862.0, + "step": 1745 + }, + { + "entropy": 1.7125201920668285, + "epoch": 0.19181016725714756, + "grad_norm": 0.6884719729423523, + "learning_rate": 1.9796366546839354e-05, + "loss": 1.3614, + "mean_token_accuracy": 0.655213917295138, + "num_tokens": 292033459.0, + "step": 1746 + }, + { + "entropy": 1.6897228856881459, + "epoch": 0.1919200241685205, + "grad_norm": 0.6630612015724182, + "learning_rate": 1.9796027739767587e-05, + "loss": 1.5917, + "mean_token_accuracy": 0.6360281805197397, + "num_tokens": 292222658.0, + "step": 1747 + }, + { + "entropy": 1.6932558019955952, + "epoch": 0.19202988107989344, + "grad_norm": 0.6871110200881958, + "learning_rate": 1.979568865430551e-05, + "loss": 1.3225, + "mean_token_accuracy": 0.6635150760412216, + "num_tokens": 292398658.0, + "step": 1748 + }, + { + "entropy": 1.7077955702940624, + "epoch": 0.19213973799126638, + "grad_norm": 0.6830503344535828, + "learning_rate": 1.979534929046385e-05, + "loss": 1.4689, + "mean_token_accuracy": 0.6499410420656204, + "num_tokens": 292556109.0, + "step": 1749 + }, + { + "entropy": 1.6917288800080617, + "epoch": 0.19224959490263932, + "grad_norm": 0.7428691983222961, + "learning_rate": 1.9795009648253346e-05, + "loss": 1.4188, + "mean_token_accuracy": 0.6616235027710596, + "num_tokens": 292701727.0, + "step": 1750 + }, + { + "entropy": 1.7118379374345143, + "epoch": 0.19235945181401226, + "grad_norm": 0.7000189423561096, + "learning_rate": 1.979466972768475e-05, + "loss": 1.399, + "mean_token_accuracy": 0.6520606428384781, + "num_tokens": 292871798.0, + "step": 1751 + }, + { + "entropy": 1.6876471141974132, + "epoch": 0.19246930872538517, + "grad_norm": 0.7701053619384766, + "learning_rate": 1.9794329528768822e-05, + "loss": 1.2992, + "mean_token_accuracy": 0.665013869603475, + "num_tokens": 293021064.0, + "step": 1752 + }, + { + "entropy": 1.7469976941744487, + "epoch": 0.19257916563675812, + "grad_norm": 0.6615880727767944, + "learning_rate": 1.9793989051516327e-05, + "loss": 1.4366, + "mean_token_accuracy": 0.6483164032300314, + "num_tokens": 293149174.0, + "step": 1753 + }, + { + "entropy": 1.7197850545247395, + "epoch": 0.19268902254813106, + "grad_norm": 0.714011549949646, + "learning_rate": 1.979364829593804e-05, + "loss": 1.4344, + "mean_token_accuracy": 0.639839842915535, + "num_tokens": 293300752.0, + "step": 1754 + }, + { + "entropy": 1.7304006119569142, + "epoch": 0.192798879459504, + "grad_norm": 0.7182620763778687, + "learning_rate": 1.9793307262044748e-05, + "loss": 1.4202, + "mean_token_accuracy": 0.6542019993066788, + "num_tokens": 293456002.0, + "step": 1755 + }, + { + "entropy": 1.7409031490484874, + "epoch": 0.19290873637087694, + "grad_norm": 0.6725859045982361, + "learning_rate": 1.9792965949847242e-05, + "loss": 1.3865, + "mean_token_accuracy": 0.6448834588130316, + "num_tokens": 293628935.0, + "step": 1756 + }, + { + "entropy": 1.6798570553461711, + "epoch": 0.19301859328224988, + "grad_norm": 0.7474890351295471, + "learning_rate": 1.9792624359356326e-05, + "loss": 1.2733, + "mean_token_accuracy": 0.6787517368793488, + "num_tokens": 293755756.0, + "step": 1757 + }, + { + "entropy": 1.7863287031650543, + "epoch": 0.19312845019362282, + "grad_norm": 0.8425063490867615, + "learning_rate": 1.9792282490582812e-05, + "loss": 1.4917, + "mean_token_accuracy": 0.6436772296826044, + "num_tokens": 293919313.0, + "step": 1758 + }, + { + "entropy": 1.6916013459364574, + "epoch": 0.19323830710499573, + "grad_norm": 0.7149840593338013, + "learning_rate": 1.9791940343537517e-05, + "loss": 1.4658, + "mean_token_accuracy": 0.653274287780126, + "num_tokens": 294076984.0, + "step": 1759 + }, + { + "entropy": 1.7235734164714813, + "epoch": 0.19334816401636867, + "grad_norm": 0.7532820701599121, + "learning_rate": 1.9791597918231278e-05, + "loss": 1.2797, + "mean_token_accuracy": 0.6706577440102895, + "num_tokens": 294259181.0, + "step": 1760 + }, + { + "entropy": 1.727324555317561, + "epoch": 0.19345802092774161, + "grad_norm": 0.711613655090332, + "learning_rate": 1.9791255214674922e-05, + "loss": 1.4411, + "mean_token_accuracy": 0.6571811487277349, + "num_tokens": 294419560.0, + "step": 1761 + }, + { + "entropy": 1.7226892411708832, + "epoch": 0.19356787783911455, + "grad_norm": 0.6647672653198242, + "learning_rate": 1.97909122328793e-05, + "loss": 1.5226, + "mean_token_accuracy": 0.6560903539260229, + "num_tokens": 294579172.0, + "step": 1762 + }, + { + "entropy": 1.719457467397054, + "epoch": 0.1936777347504875, + "grad_norm": 0.6270412802696228, + "learning_rate": 1.9790568972855266e-05, + "loss": 1.3127, + "mean_token_accuracy": 0.6619095156590143, + "num_tokens": 294728030.0, + "step": 1763 + }, + { + "entropy": 1.6665961543718975, + "epoch": 0.19378759166186044, + "grad_norm": 0.7340520620346069, + "learning_rate": 1.9790225434613687e-05, + "loss": 1.3513, + "mean_token_accuracy": 0.681754027803739, + "num_tokens": 294866688.0, + "step": 1764 + }, + { + "entropy": 1.8049577971299489, + "epoch": 0.19389744857323335, + "grad_norm": 0.7559868097305298, + "learning_rate": 1.9789881618165434e-05, + "loss": 1.3979, + "mean_token_accuracy": 0.6505992064873377, + "num_tokens": 295021152.0, + "step": 1765 + }, + { + "entropy": 1.6748623251914978, + "epoch": 0.1940073054846063, + "grad_norm": 0.5827370882034302, + "learning_rate": 1.9789537523521387e-05, + "loss": 1.3721, + "mean_token_accuracy": 0.6543336113293966, + "num_tokens": 295229519.0, + "step": 1766 + }, + { + "entropy": 1.7147534688313801, + "epoch": 0.19411716239597923, + "grad_norm": 6.890192031860352, + "learning_rate": 1.9789193150692438e-05, + "loss": 1.4899, + "mean_token_accuracy": 0.6559451967477798, + "num_tokens": 295384686.0, + "step": 1767 + }, + { + "entropy": 1.707640786965688, + "epoch": 0.19422701930735217, + "grad_norm": 0.6139290928840637, + "learning_rate": 1.978884849968949e-05, + "loss": 1.2937, + "mean_token_accuracy": 0.6562622785568237, + "num_tokens": 295563230.0, + "step": 1768 + }, + { + "entropy": 1.7182945013046265, + "epoch": 0.1943368762187251, + "grad_norm": 0.6669062972068787, + "learning_rate": 1.9788503570523443e-05, + "loss": 1.4615, + "mean_token_accuracy": 0.6506092747052511, + "num_tokens": 295726034.0, + "step": 1769 + }, + { + "entropy": 1.7365097900231679, + "epoch": 0.19444673313009805, + "grad_norm": 0.7182427048683167, + "learning_rate": 1.978815836320522e-05, + "loss": 1.4676, + "mean_token_accuracy": 0.651889423529307, + "num_tokens": 295918492.0, + "step": 1770 + }, + { + "entropy": 1.7327903906504314, + "epoch": 0.194556590041471, + "grad_norm": 0.6817887425422668, + "learning_rate": 1.9787812877745745e-05, + "loss": 1.3728, + "mean_token_accuracy": 0.6570483843485514, + "num_tokens": 296085849.0, + "step": 1771 + }, + { + "entropy": 1.693399171034495, + "epoch": 0.1946664469528439, + "grad_norm": 0.715168833732605, + "learning_rate": 1.978746711415595e-05, + "loss": 1.3975, + "mean_token_accuracy": 0.6593023041884104, + "num_tokens": 296266624.0, + "step": 1772 + }, + { + "entropy": 1.7591918508211772, + "epoch": 0.19477630386421685, + "grad_norm": 0.7243099808692932, + "learning_rate": 1.9787121072446785e-05, + "loss": 1.4259, + "mean_token_accuracy": 0.6468930890162786, + "num_tokens": 296396972.0, + "step": 1773 + }, + { + "entropy": 1.6918930610020955, + "epoch": 0.1948861607755898, + "grad_norm": 0.7280333042144775, + "learning_rate": 1.9786774752629195e-05, + "loss": 1.3678, + "mean_token_accuracy": 0.6707959572474161, + "num_tokens": 296548243.0, + "step": 1774 + }, + { + "entropy": 1.7696085572242737, + "epoch": 0.19499601768696273, + "grad_norm": 0.5832480788230896, + "learning_rate": 1.9786428154714143e-05, + "loss": 1.5862, + "mean_token_accuracy": 0.6292986472447714, + "num_tokens": 296747351.0, + "step": 1775 + }, + { + "entropy": 1.7673150698343914, + "epoch": 0.19510587459833567, + "grad_norm": 0.6457276940345764, + "learning_rate": 1.9786081278712598e-05, + "loss": 1.4639, + "mean_token_accuracy": 0.6444855431715647, + "num_tokens": 296935015.0, + "step": 1776 + }, + { + "entropy": 1.6158926784992218, + "epoch": 0.1952157315097086, + "grad_norm": 0.5862371921539307, + "learning_rate": 1.9785734124635544e-05, + "loss": 1.3359, + "mean_token_accuracy": 0.6693507929642996, + "num_tokens": 297145524.0, + "step": 1777 + }, + { + "entropy": 1.7320951322714488, + "epoch": 0.19532558842108155, + "grad_norm": 0.7013179659843445, + "learning_rate": 1.978538669249396e-05, + "loss": 1.4895, + "mean_token_accuracy": 0.6407797584931055, + "num_tokens": 297288935.0, + "step": 1778 + }, + { + "entropy": 1.7831117709477742, + "epoch": 0.19543544533245447, + "grad_norm": 0.7159624099731445, + "learning_rate": 1.978503898229885e-05, + "loss": 1.4075, + "mean_token_accuracy": 0.6433553198973337, + "num_tokens": 297459113.0, + "step": 1779 + }, + { + "entropy": 1.6679442922274272, + "epoch": 0.1955453022438274, + "grad_norm": 0.725671112537384, + "learning_rate": 1.978469099406121e-05, + "loss": 1.3892, + "mean_token_accuracy": 0.655972421169281, + "num_tokens": 297624207.0, + "step": 1780 + }, + { + "entropy": 1.7021668950716655, + "epoch": 0.19565515915520035, + "grad_norm": 0.6634953022003174, + "learning_rate": 1.978434272779206e-05, + "loss": 1.3119, + "mean_token_accuracy": 0.6691978275775909, + "num_tokens": 297786571.0, + "step": 1781 + }, + { + "entropy": 1.7528183460235596, + "epoch": 0.1957650160665733, + "grad_norm": 0.7583564519882202, + "learning_rate": 1.9783994183502423e-05, + "loss": 1.5388, + "mean_token_accuracy": 0.626950333515803, + "num_tokens": 298018826.0, + "step": 1782 + }, + { + "entropy": 1.6614128748575847, + "epoch": 0.19587487297794623, + "grad_norm": 0.5320108532905579, + "learning_rate": 1.9783645361203324e-05, + "loss": 1.3674, + "mean_token_accuracy": 0.6552481253941854, + "num_tokens": 298245649.0, + "step": 1783 + }, + { + "entropy": 1.770830233891805, + "epoch": 0.19598472988931917, + "grad_norm": 0.6707810163497925, + "learning_rate": 1.9783296260905812e-05, + "loss": 1.3718, + "mean_token_accuracy": 0.6541836063067118, + "num_tokens": 298382824.0, + "step": 1784 + }, + { + "entropy": 1.759553889433543, + "epoch": 0.1960945868006921, + "grad_norm": 0.7541280388832092, + "learning_rate": 1.978294688262093e-05, + "loss": 1.2791, + "mean_token_accuracy": 0.6692831069231033, + "num_tokens": 298517297.0, + "step": 1785 + }, + { + "entropy": 1.7564668953418732, + "epoch": 0.19620444371206502, + "grad_norm": 0.6617575287818909, + "learning_rate": 1.9782597226359737e-05, + "loss": 1.4138, + "mean_token_accuracy": 0.6551504383484522, + "num_tokens": 298666505.0, + "step": 1786 + }, + { + "entropy": 1.760979433854421, + "epoch": 0.19631430062343797, + "grad_norm": 0.8420041799545288, + "learning_rate": 1.97822472921333e-05, + "loss": 1.2352, + "mean_token_accuracy": 0.6695485363403956, + "num_tokens": 298824770.0, + "step": 1787 + }, + { + "entropy": 1.7239550054073334, + "epoch": 0.1964241575348109, + "grad_norm": 0.7422584295272827, + "learning_rate": 1.9781897079952693e-05, + "loss": 1.4415, + "mean_token_accuracy": 0.6553243845701218, + "num_tokens": 298993460.0, + "step": 1788 + }, + { + "entropy": 1.7003070811430614, + "epoch": 0.19653401444618385, + "grad_norm": 0.6796644926071167, + "learning_rate": 1.9781546589828993e-05, + "loss": 1.4076, + "mean_token_accuracy": 0.6579789767662684, + "num_tokens": 299145182.0, + "step": 1789 + }, + { + "entropy": 1.7247399985790253, + "epoch": 0.1966438713575568, + "grad_norm": 0.7494299411773682, + "learning_rate": 1.9781195821773313e-05, + "loss": 1.2761, + "mean_token_accuracy": 0.6815857142210007, + "num_tokens": 299274678.0, + "step": 1790 + }, + { + "entropy": 1.7042510112126668, + "epoch": 0.19675372826892973, + "grad_norm": 0.6861709952354431, + "learning_rate": 1.9780844775796733e-05, + "loss": 1.3178, + "mean_token_accuracy": 0.6655194312334061, + "num_tokens": 299407876.0, + "step": 1791 + }, + { + "entropy": 1.6969355642795563, + "epoch": 0.19686358518030267, + "grad_norm": 0.7215892672538757, + "learning_rate": 1.978049345191038e-05, + "loss": 1.3921, + "mean_token_accuracy": 0.6600968192021052, + "num_tokens": 299560247.0, + "step": 1792 + }, + { + "entropy": 1.7007923424243927, + "epoch": 0.19697344209167558, + "grad_norm": 0.5989768505096436, + "learning_rate": 1.9780141850125362e-05, + "loss": 1.3689, + "mean_token_accuracy": 0.6636965026458105, + "num_tokens": 299788837.0, + "step": 1793 + }, + { + "entropy": 1.672113170226415, + "epoch": 0.19708329900304852, + "grad_norm": 0.751150906085968, + "learning_rate": 1.977978997045281e-05, + "loss": 1.3912, + "mean_token_accuracy": 0.6588180909554163, + "num_tokens": 299966636.0, + "step": 1794 + }, + { + "entropy": 1.6974561214447021, + "epoch": 0.19719315591442146, + "grad_norm": 0.6662459969520569, + "learning_rate": 1.9779437812903862e-05, + "loss": 1.3124, + "mean_token_accuracy": 0.6676592777172724, + "num_tokens": 300121520.0, + "step": 1795 + }, + { + "entropy": 1.7895707885424297, + "epoch": 0.1973030128257944, + "grad_norm": 0.6926854252815247, + "learning_rate": 1.9779085377489663e-05, + "loss": 1.4074, + "mean_token_accuracy": 0.6511611789464951, + "num_tokens": 300249227.0, + "step": 1796 + }, + { + "entropy": 1.6819771826267242, + "epoch": 0.19741286973716735, + "grad_norm": 3.198624849319458, + "learning_rate": 1.977873266422137e-05, + "loss": 1.1345, + "mean_token_accuracy": 0.6746688683827718, + "num_tokens": 300417964.0, + "step": 1797 + }, + { + "entropy": 1.7427258292833965, + "epoch": 0.1975227266485403, + "grad_norm": 0.7579461336135864, + "learning_rate": 1.977837967311014e-05, + "loss": 1.3167, + "mean_token_accuracy": 0.6665651847918829, + "num_tokens": 300538034.0, + "step": 1798 + }, + { + "entropy": 1.7113410731156666, + "epoch": 0.1976325835599132, + "grad_norm": 0.7278785705566406, + "learning_rate": 1.977802640416715e-05, + "loss": 1.3031, + "mean_token_accuracy": 0.6744206001361212, + "num_tokens": 300652123.0, + "step": 1799 + }, + { + "entropy": 1.7057754894097645, + "epoch": 0.19774244047128614, + "grad_norm": 0.6383393406867981, + "learning_rate": 1.9777672857403584e-05, + "loss": 1.5941, + "mean_token_accuracy": 0.6359094778696696, + "num_tokens": 300918339.0, + "step": 1800 + }, + { + "entropy": 1.796320726474126, + "epoch": 0.19785229738265908, + "grad_norm": 0.8345091342926025, + "learning_rate": 1.9777319032830624e-05, + "loss": 1.433, + "mean_token_accuracy": 0.6534850498040518, + "num_tokens": 301084973.0, + "step": 1801 + }, + { + "entropy": 1.7382001678148906, + "epoch": 0.19796215429403202, + "grad_norm": 0.8948132395744324, + "learning_rate": 1.9776964930459474e-05, + "loss": 1.3021, + "mean_token_accuracy": 0.6653626014788946, + "num_tokens": 301208519.0, + "step": 1802 + }, + { + "entropy": 1.7888563871383667, + "epoch": 0.19807201120540496, + "grad_norm": 0.6303662061691284, + "learning_rate": 1.9776610550301338e-05, + "loss": 1.4763, + "mean_token_accuracy": 0.6427715172370275, + "num_tokens": 301351718.0, + "step": 1803 + }, + { + "entropy": 1.6515491306781769, + "epoch": 0.1981818681167779, + "grad_norm": 0.7126901745796204, + "learning_rate": 1.977625589236743e-05, + "loss": 1.2714, + "mean_token_accuracy": 0.6703294515609741, + "num_tokens": 301507032.0, + "step": 1804 + }, + { + "entropy": 1.7583887577056885, + "epoch": 0.19829172502815084, + "grad_norm": 0.68550044298172, + "learning_rate": 1.977590095666898e-05, + "loss": 1.39, + "mean_token_accuracy": 0.6463887542486191, + "num_tokens": 301646202.0, + "step": 1805 + }, + { + "entropy": 1.68940003712972, + "epoch": 0.19840158193952376, + "grad_norm": 0.6771380305290222, + "learning_rate": 1.977554574321722e-05, + "loss": 1.482, + "mean_token_accuracy": 0.6526958445707957, + "num_tokens": 301804123.0, + "step": 1806 + }, + { + "entropy": 1.7353551983833313, + "epoch": 0.1985114388508967, + "grad_norm": 0.7520933747291565, + "learning_rate": 1.977519025202339e-05, + "loss": 1.5235, + "mean_token_accuracy": 0.6432921489079794, + "num_tokens": 301974575.0, + "step": 1807 + }, + { + "entropy": 1.7340351243813832, + "epoch": 0.19862129576226964, + "grad_norm": 0.6563875675201416, + "learning_rate": 1.9774834483098745e-05, + "loss": 1.3366, + "mean_token_accuracy": 0.6615680456161499, + "num_tokens": 302144787.0, + "step": 1808 + }, + { + "entropy": 1.6780883272488911, + "epoch": 0.19873115267364258, + "grad_norm": 0.6213021278381348, + "learning_rate": 1.977447843645454e-05, + "loss": 1.3939, + "mean_token_accuracy": 0.6571485896905264, + "num_tokens": 302332010.0, + "step": 1809 + }, + { + "entropy": 1.7402231593926747, + "epoch": 0.19884100958501552, + "grad_norm": 0.7091407775878906, + "learning_rate": 1.9774122112102047e-05, + "loss": 1.4103, + "mean_token_accuracy": 0.6477878441413244, + "num_tokens": 302472018.0, + "step": 1810 + }, + { + "entropy": 1.6822114984194438, + "epoch": 0.19895086649638846, + "grad_norm": 0.7527471780776978, + "learning_rate": 1.9773765510052546e-05, + "loss": 1.5214, + "mean_token_accuracy": 0.6414872830112776, + "num_tokens": 302690599.0, + "step": 1811 + }, + { + "entropy": 1.7382448216279347, + "epoch": 0.1990607234077614, + "grad_norm": 0.8488646149635315, + "learning_rate": 1.9773408630317316e-05, + "loss": 1.4812, + "mean_token_accuracy": 0.6391682376464208, + "num_tokens": 302884159.0, + "step": 1812 + }, + { + "entropy": 1.7655917604764302, + "epoch": 0.19917058031913432, + "grad_norm": 0.8058510422706604, + "learning_rate": 1.9773051472907657e-05, + "loss": 1.558, + "mean_token_accuracy": 0.6351256171862284, + "num_tokens": 303045104.0, + "step": 1813 + }, + { + "entropy": 1.7527148922284443, + "epoch": 0.19928043723050726, + "grad_norm": 0.7928998470306396, + "learning_rate": 1.9772694037834873e-05, + "loss": 1.5035, + "mean_token_accuracy": 0.64784603814284, + "num_tokens": 303195849.0, + "step": 1814 + }, + { + "entropy": 1.7500303983688354, + "epoch": 0.1993902941418802, + "grad_norm": 0.6854771375656128, + "learning_rate": 1.977233632511028e-05, + "loss": 1.3672, + "mean_token_accuracy": 0.6633496433496475, + "num_tokens": 303362204.0, + "step": 1815 + }, + { + "entropy": 1.7209295729796092, + "epoch": 0.19950015105325314, + "grad_norm": 0.7351916432380676, + "learning_rate": 1.9771978334745184e-05, + "loss": 1.4907, + "mean_token_accuracy": 0.6455176422993342, + "num_tokens": 303576314.0, + "step": 1816 + }, + { + "entropy": 1.7865357597668965, + "epoch": 0.19961000796462608, + "grad_norm": 0.79336017370224, + "learning_rate": 1.9771620066750937e-05, + "loss": 1.4667, + "mean_token_accuracy": 0.6478342215220133, + "num_tokens": 303792834.0, + "step": 1817 + }, + { + "entropy": 1.6429633895556133, + "epoch": 0.19971986487599902, + "grad_norm": 0.6706152558326721, + "learning_rate": 1.9771261521138862e-05, + "loss": 1.2351, + "mean_token_accuracy": 0.6809234966834387, + "num_tokens": 303906864.0, + "step": 1818 + }, + { + "entropy": 1.8072155614693959, + "epoch": 0.19982972178737196, + "grad_norm": 0.8159286975860596, + "learning_rate": 1.9770902697920315e-05, + "loss": 1.5045, + "mean_token_accuracy": 0.6369387259085973, + "num_tokens": 304084742.0, + "step": 1819 + }, + { + "entropy": 1.7179032564163208, + "epoch": 0.19993957869874487, + "grad_norm": 0.770172655582428, + "learning_rate": 1.977054359710665e-05, + "loss": 1.2617, + "mean_token_accuracy": 0.6699782609939575, + "num_tokens": 304223214.0, + "step": 1820 + }, + { + "entropy": 1.712960034608841, + "epoch": 0.20004943561011782, + "grad_norm": 0.9042562246322632, + "learning_rate": 1.977018421870923e-05, + "loss": 1.3977, + "mean_token_accuracy": 0.6594905803600947, + "num_tokens": 304405961.0, + "step": 1821 + }, + { + "entropy": 1.6442652344703674, + "epoch": 0.20015929252149076, + "grad_norm": 0.609815776348114, + "learning_rate": 1.976982456273943e-05, + "loss": 1.3585, + "mean_token_accuracy": 0.6593890488147736, + "num_tokens": 304574118.0, + "step": 1822 + }, + { + "entropy": 1.6892712612946827, + "epoch": 0.2002691494328637, + "grad_norm": 0.7102997303009033, + "learning_rate": 1.9769464629208643e-05, + "loss": 1.2489, + "mean_token_accuracy": 0.6777701675891876, + "num_tokens": 304742861.0, + "step": 1823 + }, + { + "entropy": 1.6933867732683818, + "epoch": 0.20037900634423664, + "grad_norm": 0.596263587474823, + "learning_rate": 1.976910441812824e-05, + "loss": 1.5056, + "mean_token_accuracy": 0.6288742969433466, + "num_tokens": 305020688.0, + "step": 1824 + }, + { + "entropy": 1.7255509893099468, + "epoch": 0.20048886325560958, + "grad_norm": 0.730722188949585, + "learning_rate": 1.9768743929509643e-05, + "loss": 1.425, + "mean_token_accuracy": 0.6644620796044668, + "num_tokens": 305151455.0, + "step": 1825 + }, + { + "entropy": 1.7349488337834675, + "epoch": 0.2005987201669825, + "grad_norm": 0.6531474590301514, + "learning_rate": 1.9768383163364248e-05, + "loss": 1.6278, + "mean_token_accuracy": 0.6154864877462387, + "num_tokens": 305348902.0, + "step": 1826 + }, + { + "entropy": 1.7766931653022766, + "epoch": 0.20070857707835543, + "grad_norm": 0.7823039293289185, + "learning_rate": 1.9768022119703477e-05, + "loss": 1.3906, + "mean_token_accuracy": 0.6641748547554016, + "num_tokens": 305467608.0, + "step": 1827 + }, + { + "entropy": 1.7294628620147705, + "epoch": 0.20081843398972837, + "grad_norm": 0.600829541683197, + "learning_rate": 1.9767660798538757e-05, + "loss": 1.3125, + "mean_token_accuracy": 0.6706470201412836, + "num_tokens": 305626175.0, + "step": 1828 + }, + { + "entropy": 1.7608105738957722, + "epoch": 0.20092829090110131, + "grad_norm": 0.6462728977203369, + "learning_rate": 1.9767299199881524e-05, + "loss": 1.4496, + "mean_token_accuracy": 0.6452651371558508, + "num_tokens": 305863805.0, + "step": 1829 + }, + { + "entropy": 1.7059124012788136, + "epoch": 0.20103814781247425, + "grad_norm": 0.5904053449630737, + "learning_rate": 1.9766937323743226e-05, + "loss": 1.2778, + "mean_token_accuracy": 0.6797713836034139, + "num_tokens": 306001168.0, + "step": 1830 + }, + { + "entropy": 1.6253532270590465, + "epoch": 0.2011480047238472, + "grad_norm": 0.7917356491088867, + "learning_rate": 1.976657517013531e-05, + "loss": 1.2863, + "mean_token_accuracy": 0.6701359699169794, + "num_tokens": 306199106.0, + "step": 1831 + }, + { + "entropy": 1.7297336757183075, + "epoch": 0.20125786163522014, + "grad_norm": 0.7113462686538696, + "learning_rate": 1.9766212739069233e-05, + "loss": 1.4775, + "mean_token_accuracy": 0.6459426383177439, + "num_tokens": 306362651.0, + "step": 1832 + }, + { + "entropy": 1.7070247530937195, + "epoch": 0.20136771854659305, + "grad_norm": 0.6649527549743652, + "learning_rate": 1.976585003055648e-05, + "loss": 1.4208, + "mean_token_accuracy": 0.6574066330989202, + "num_tokens": 306544467.0, + "step": 1833 + }, + { + "entropy": 1.7177915970484416, + "epoch": 0.201477575457966, + "grad_norm": 0.742743730545044, + "learning_rate": 1.976548704460852e-05, + "loss": 1.2361, + "mean_token_accuracy": 0.6762651801109314, + "num_tokens": 306659110.0, + "step": 1834 + }, + { + "entropy": 1.706160436073939, + "epoch": 0.20158743236933893, + "grad_norm": 0.907835841178894, + "learning_rate": 1.976512378123685e-05, + "loss": 1.3505, + "mean_token_accuracy": 0.6621433893839518, + "num_tokens": 306810908.0, + "step": 1835 + }, + { + "entropy": 1.700301080942154, + "epoch": 0.20169728928071187, + "grad_norm": 0.7097511291503906, + "learning_rate": 1.9764760240452957e-05, + "loss": 1.4589, + "mean_token_accuracy": 0.6501483097672462, + "num_tokens": 306986279.0, + "step": 1836 + }, + { + "entropy": 1.7191430628299713, + "epoch": 0.2018071461920848, + "grad_norm": 0.607349157333374, + "learning_rate": 1.9764396422268356e-05, + "loss": 1.3485, + "mean_token_accuracy": 0.662805438041687, + "num_tokens": 307116996.0, + "step": 1837 + }, + { + "entropy": 1.7494115829467773, + "epoch": 0.20191700310345775, + "grad_norm": 0.7002793550491333, + "learning_rate": 1.976403232669455e-05, + "loss": 1.5679, + "mean_token_accuracy": 0.6226314206918081, + "num_tokens": 307295127.0, + "step": 1838 + }, + { + "entropy": 1.7843216558297474, + "epoch": 0.2020268600148307, + "grad_norm": 0.6927679777145386, + "learning_rate": 1.9763667953743078e-05, + "loss": 1.4433, + "mean_token_accuracy": 0.650288388133049, + "num_tokens": 307474847.0, + "step": 1839 + }, + { + "entropy": 1.7510626216729481, + "epoch": 0.2021367169262036, + "grad_norm": 0.652125895023346, + "learning_rate": 1.9763303303425463e-05, + "loss": 1.3464, + "mean_token_accuracy": 0.6607447812954584, + "num_tokens": 307652843.0, + "step": 1840 + }, + { + "entropy": 1.7755654851595561, + "epoch": 0.20224657383757655, + "grad_norm": 0.9156058430671692, + "learning_rate": 1.9762938375753245e-05, + "loss": 1.3372, + "mean_token_accuracy": 0.6591128408908844, + "num_tokens": 307769467.0, + "step": 1841 + }, + { + "entropy": 1.7441391746203105, + "epoch": 0.2023564307489495, + "grad_norm": 0.7158201932907104, + "learning_rate": 1.976257317073798e-05, + "loss": 1.366, + "mean_token_accuracy": 0.6583238691091537, + "num_tokens": 307913812.0, + "step": 1842 + }, + { + "entropy": 1.7281207144260406, + "epoch": 0.20246628766032243, + "grad_norm": 0.5468607544898987, + "learning_rate": 1.9762207688391216e-05, + "loss": 1.4142, + "mean_token_accuracy": 0.6494698971509933, + "num_tokens": 308126776.0, + "step": 1843 + }, + { + "entropy": 1.7128788232803345, + "epoch": 0.20257614457169537, + "grad_norm": 0.6986768841743469, + "learning_rate": 1.976184192872453e-05, + "loss": 1.3821, + "mean_token_accuracy": 0.650952065984408, + "num_tokens": 308296663.0, + "step": 1844 + }, + { + "entropy": 1.7056085566679637, + "epoch": 0.2026860014830683, + "grad_norm": 0.6520532369613647, + "learning_rate": 1.9761475891749496e-05, + "loss": 1.3298, + "mean_token_accuracy": 0.6659070352713267, + "num_tokens": 308484182.0, + "step": 1845 + }, + { + "entropy": 1.7646136184533436, + "epoch": 0.20279585839444125, + "grad_norm": 0.6946887373924255, + "learning_rate": 1.9761109577477696e-05, + "loss": 1.5495, + "mean_token_accuracy": 0.6431872049967448, + "num_tokens": 308660541.0, + "step": 1846 + }, + { + "entropy": 1.716064860423406, + "epoch": 0.20290571530581417, + "grad_norm": 0.8077802062034607, + "learning_rate": 1.9760742985920726e-05, + "loss": 1.5907, + "mean_token_accuracy": 0.6451023171345392, + "num_tokens": 308825957.0, + "step": 1847 + }, + { + "entropy": 1.7179016371568043, + "epoch": 0.2030155722171871, + "grad_norm": 0.7027926445007324, + "learning_rate": 1.976037611709019e-05, + "loss": 1.4141, + "mean_token_accuracy": 0.6579538484414419, + "num_tokens": 308979266.0, + "step": 1848 + }, + { + "entropy": 1.724165548880895, + "epoch": 0.20312542912856005, + "grad_norm": 0.5816169381141663, + "learning_rate": 1.9760008970997702e-05, + "loss": 1.3984, + "mean_token_accuracy": 0.6541108936071396, + "num_tokens": 309144227.0, + "step": 1849 + }, + { + "entropy": 1.7344149947166443, + "epoch": 0.203235286039933, + "grad_norm": 0.7658873796463013, + "learning_rate": 1.975964154765487e-05, + "loss": 1.5109, + "mean_token_accuracy": 0.6461022893587748, + "num_tokens": 309307786.0, + "step": 1850 + }, + { + "entropy": 1.7166369756062825, + "epoch": 0.20334514295130593, + "grad_norm": 0.6772670149803162, + "learning_rate": 1.975927384707333e-05, + "loss": 1.4384, + "mean_token_accuracy": 0.6566036691268285, + "num_tokens": 309459208.0, + "step": 1851 + }, + { + "entropy": 1.6875809729099274, + "epoch": 0.20345499986267887, + "grad_norm": 0.7443994283676147, + "learning_rate": 1.9758905869264725e-05, + "loss": 1.5135, + "mean_token_accuracy": 0.6605640351772308, + "num_tokens": 309607725.0, + "step": 1852 + }, + { + "entropy": 1.775184839963913, + "epoch": 0.2035648567740518, + "grad_norm": 0.7913832664489746, + "learning_rate": 1.9758537614240692e-05, + "loss": 1.3856, + "mean_token_accuracy": 0.6432444254557291, + "num_tokens": 309776782.0, + "step": 1853 + }, + { + "entropy": 1.7546232839425404, + "epoch": 0.20367471368542472, + "grad_norm": 0.7907248139381409, + "learning_rate": 1.9758169082012893e-05, + "loss": 1.3165, + "mean_token_accuracy": 0.6588219453891119, + "num_tokens": 309925121.0, + "step": 1854 + }, + { + "entropy": 1.7209657728672028, + "epoch": 0.20378457059679767, + "grad_norm": 0.770335853099823, + "learning_rate": 1.975780027259299e-05, + "loss": 1.4543, + "mean_token_accuracy": 0.6663737197717031, + "num_tokens": 310094118.0, + "step": 1855 + }, + { + "entropy": 1.6761006116867065, + "epoch": 0.2038944275081706, + "grad_norm": 0.7291706204414368, + "learning_rate": 1.975743118599265e-05, + "loss": 1.322, + "mean_token_accuracy": 0.672456423441569, + "num_tokens": 310234093.0, + "step": 1856 + }, + { + "entropy": 1.7500018080075581, + "epoch": 0.20400428441954355, + "grad_norm": 0.7051176428794861, + "learning_rate": 1.975706182222356e-05, + "loss": 1.4993, + "mean_token_accuracy": 0.6471735189358393, + "num_tokens": 310438381.0, + "step": 1857 + }, + { + "entropy": 1.7876022458076477, + "epoch": 0.2041141413309165, + "grad_norm": 0.7112467288970947, + "learning_rate": 1.9756692181297412e-05, + "loss": 1.5468, + "mean_token_accuracy": 0.6419008473555247, + "num_tokens": 310624466.0, + "step": 1858 + }, + { + "entropy": 1.7213526765505474, + "epoch": 0.20422399824228943, + "grad_norm": 0.6644498705863953, + "learning_rate": 1.9756322263225903e-05, + "loss": 1.3655, + "mean_token_accuracy": 0.6711994310220083, + "num_tokens": 310774375.0, + "step": 1859 + }, + { + "entropy": 1.6958336333433788, + "epoch": 0.20433385515366234, + "grad_norm": 0.6671877503395081, + "learning_rate": 1.9755952068020737e-05, + "loss": 1.3116, + "mean_token_accuracy": 0.6657513082027435, + "num_tokens": 310940187.0, + "step": 1860 + }, + { + "entropy": 1.6833363076051076, + "epoch": 0.20444371206503528, + "grad_norm": 0.7353310585021973, + "learning_rate": 1.9755581595693636e-05, + "loss": 1.4442, + "mean_token_accuracy": 0.6834416141112646, + "num_tokens": 311079868.0, + "step": 1861 + }, + { + "entropy": 1.7643102804819744, + "epoch": 0.20455356897640822, + "grad_norm": 0.7783114910125732, + "learning_rate": 1.975521084625632e-05, + "loss": 1.3208, + "mean_token_accuracy": 0.6558940261602402, + "num_tokens": 311220945.0, + "step": 1862 + }, + { + "entropy": 1.7230792840321858, + "epoch": 0.20466342588778116, + "grad_norm": 0.6682336926460266, + "learning_rate": 1.975483981972053e-05, + "loss": 1.426, + "mean_token_accuracy": 0.6528936872879664, + "num_tokens": 311384579.0, + "step": 1863 + }, + { + "entropy": 1.7676608264446259, + "epoch": 0.2047732827991541, + "grad_norm": 0.7327151298522949, + "learning_rate": 1.9754468516098003e-05, + "loss": 1.3778, + "mean_token_accuracy": 0.6655658980210623, + "num_tokens": 311529478.0, + "step": 1864 + }, + { + "entropy": 1.773693968852361, + "epoch": 0.20488313971052705, + "grad_norm": 0.8596065044403076, + "learning_rate": 1.975409693540049e-05, + "loss": 1.2687, + "mean_token_accuracy": 0.6674908697605133, + "num_tokens": 311648679.0, + "step": 1865 + }, + { + "entropy": 1.682075430949529, + "epoch": 0.2049929966219, + "grad_norm": 0.7088510990142822, + "learning_rate": 1.9753725077639757e-05, + "loss": 1.3837, + "mean_token_accuracy": 0.6538609862327576, + "num_tokens": 311807028.0, + "step": 1866 + }, + { + "entropy": 1.7813390990098317, + "epoch": 0.2051028535332729, + "grad_norm": 0.7097972631454468, + "learning_rate": 1.9753352942827568e-05, + "loss": 1.6051, + "mean_token_accuracy": 0.6367628425359726, + "num_tokens": 312009929.0, + "step": 1867 + }, + { + "entropy": 1.7026053667068481, + "epoch": 0.20521271044464584, + "grad_norm": 0.7793052196502686, + "learning_rate": 1.9752980530975702e-05, + "loss": 1.3475, + "mean_token_accuracy": 0.6695610036452612, + "num_tokens": 312142042.0, + "step": 1868 + }, + { + "entropy": 1.7217031021912892, + "epoch": 0.20532256735601878, + "grad_norm": 0.6053064465522766, + "learning_rate": 1.975260784209595e-05, + "loss": 1.4688, + "mean_token_accuracy": 0.6375043392181396, + "num_tokens": 312356696.0, + "step": 1869 + }, + { + "entropy": 1.7617976367473602, + "epoch": 0.20543242426739172, + "grad_norm": 1.0582380294799805, + "learning_rate": 1.9752234876200097e-05, + "loss": 1.5011, + "mean_token_accuracy": 0.6502614468336105, + "num_tokens": 312534643.0, + "step": 1870 + }, + { + "entropy": 1.6304903427759807, + "epoch": 0.20554228117876466, + "grad_norm": 0.5571168065071106, + "learning_rate": 1.975186163329996e-05, + "loss": 1.2778, + "mean_token_accuracy": 0.6664902319510778, + "num_tokens": 312700073.0, + "step": 1871 + }, + { + "entropy": 1.6912108063697815, + "epoch": 0.2056521380901376, + "grad_norm": 0.6295304894447327, + "learning_rate": 1.9751488113407343e-05, + "loss": 1.4227, + "mean_token_accuracy": 0.6689607550700506, + "num_tokens": 312847666.0, + "step": 1872 + }, + { + "entropy": 1.7157114446163177, + "epoch": 0.20576199500151054, + "grad_norm": 0.8937650918960571, + "learning_rate": 1.975111431653407e-05, + "loss": 1.5174, + "mean_token_accuracy": 0.6386149227619171, + "num_tokens": 313025199.0, + "step": 1873 + }, + { + "entropy": 1.7388854622840881, + "epoch": 0.20587185191288346, + "grad_norm": 0.5921317934989929, + "learning_rate": 1.9750740242691978e-05, + "loss": 1.5385, + "mean_token_accuracy": 0.6445744981368383, + "num_tokens": 313268338.0, + "step": 1874 + }, + { + "entropy": 1.6713208258152008, + "epoch": 0.2059817088242564, + "grad_norm": 0.7296470403671265, + "learning_rate": 1.9750365891892894e-05, + "loss": 1.4052, + "mean_token_accuracy": 0.6561514039834341, + "num_tokens": 313477471.0, + "step": 1875 + }, + { + "entropy": 1.6889991958936055, + "epoch": 0.20609156573562934, + "grad_norm": 0.591461181640625, + "learning_rate": 1.9749991264148676e-05, + "loss": 1.3788, + "mean_token_accuracy": 0.6569076975186666, + "num_tokens": 313703842.0, + "step": 1876 + }, + { + "entropy": 1.710012932618459, + "epoch": 0.20620142264700228, + "grad_norm": 0.7003285884857178, + "learning_rate": 1.9749616359471176e-05, + "loss": 1.2286, + "mean_token_accuracy": 0.6813757121562958, + "num_tokens": 313828662.0, + "step": 1877 + }, + { + "entropy": 1.7340795695781708, + "epoch": 0.20631127955837522, + "grad_norm": 0.7556050419807434, + "learning_rate": 1.974924117787226e-05, + "loss": 1.3818, + "mean_token_accuracy": 0.6705377250909805, + "num_tokens": 313992143.0, + "step": 1878 + }, + { + "entropy": 1.6594361861546834, + "epoch": 0.20642113646974816, + "grad_norm": 0.67914217710495, + "learning_rate": 1.974886571936381e-05, + "loss": 1.4848, + "mean_token_accuracy": 0.655074879527092, + "num_tokens": 314195437.0, + "step": 1879 + }, + { + "entropy": 1.7556609710057576, + "epoch": 0.2065309933811211, + "grad_norm": 0.7271071672439575, + "learning_rate": 1.9748489983957692e-05, + "loss": 1.4298, + "mean_token_accuracy": 0.641119142373403, + "num_tokens": 314352057.0, + "step": 1880 + }, + { + "entropy": 1.747815767923991, + "epoch": 0.20664085029249402, + "grad_norm": 0.6552515625953674, + "learning_rate": 1.9748113971665816e-05, + "loss": 1.3886, + "mean_token_accuracy": 0.6543693294127783, + "num_tokens": 314532136.0, + "step": 1881 + }, + { + "entropy": 1.723093291123708, + "epoch": 0.20675070720386696, + "grad_norm": 0.6786137223243713, + "learning_rate": 1.9747737682500072e-05, + "loss": 1.5003, + "mean_token_accuracy": 0.6459396133820215, + "num_tokens": 314696517.0, + "step": 1882 + }, + { + "entropy": 1.7882917523384094, + "epoch": 0.2068605641152399, + "grad_norm": 0.7087535262107849, + "learning_rate": 1.9747361116472373e-05, + "loss": 1.4855, + "mean_token_accuracy": 0.6424766977628072, + "num_tokens": 314906715.0, + "step": 1883 + }, + { + "entropy": 1.7414319415887196, + "epoch": 0.20697042102661284, + "grad_norm": 0.8721282482147217, + "learning_rate": 1.9746984273594632e-05, + "loss": 1.4097, + "mean_token_accuracy": 0.654596209526062, + "num_tokens": 315155191.0, + "step": 1884 + }, + { + "entropy": 1.7770436803499858, + "epoch": 0.20708027793798578, + "grad_norm": 0.7305892705917358, + "learning_rate": 1.9746607153878786e-05, + "loss": 1.4086, + "mean_token_accuracy": 0.6574834038813909, + "num_tokens": 315321135.0, + "step": 1885 + }, + { + "entropy": 1.7179120083649952, + "epoch": 0.20719013484935872, + "grad_norm": 0.7217333912849426, + "learning_rate": 1.9746229757336763e-05, + "loss": 1.4068, + "mean_token_accuracy": 0.6624855548143387, + "num_tokens": 315492089.0, + "step": 1886 + }, + { + "entropy": 1.7504285176595051, + "epoch": 0.20729999176073163, + "grad_norm": 0.6315979957580566, + "learning_rate": 1.9745852083980507e-05, + "loss": 1.5327, + "mean_token_accuracy": 0.6477139939864477, + "num_tokens": 315664077.0, + "step": 1887 + }, + { + "entropy": 1.6375745435555775, + "epoch": 0.20740984867210457, + "grad_norm": 0.7350934147834778, + "learning_rate": 1.9745474133821978e-05, + "loss": 1.379, + "mean_token_accuracy": 0.6684954961140951, + "num_tokens": 315862163.0, + "step": 1888 + }, + { + "entropy": 1.6826703051726024, + "epoch": 0.20751970558347752, + "grad_norm": 0.6202029585838318, + "learning_rate": 1.974509590687313e-05, + "loss": 1.3122, + "mean_token_accuracy": 0.6682248959938685, + "num_tokens": 316025052.0, + "step": 1889 + }, + { + "entropy": 1.724346548318863, + "epoch": 0.20762956249485046, + "grad_norm": 0.7694988250732422, + "learning_rate": 1.9744717403145935e-05, + "loss": 1.3031, + "mean_token_accuracy": 0.6729477494955063, + "num_tokens": 316128499.0, + "step": 1890 + }, + { + "entropy": 1.7621622681617737, + "epoch": 0.2077394194062234, + "grad_norm": 0.6370652318000793, + "learning_rate": 1.974433862265238e-05, + "loss": 1.4028, + "mean_token_accuracy": 0.653249795238177, + "num_tokens": 316363618.0, + "step": 1891 + }, + { + "entropy": 1.733237236738205, + "epoch": 0.20784927631759634, + "grad_norm": 0.7276230454444885, + "learning_rate": 1.9743959565404444e-05, + "loss": 1.2583, + "mean_token_accuracy": 0.6862328201532364, + "num_tokens": 316514946.0, + "step": 1892 + }, + { + "entropy": 1.6722540160020192, + "epoch": 0.20795913322896928, + "grad_norm": 0.6568346619606018, + "learning_rate": 1.974358023141413e-05, + "loss": 1.3413, + "mean_token_accuracy": 0.6587617894013723, + "num_tokens": 316683060.0, + "step": 1893 + }, + { + "entropy": 1.7204617460568745, + "epoch": 0.2080689901403422, + "grad_norm": 0.7202989459037781, + "learning_rate": 1.9743200620693442e-05, + "loss": 1.3252, + "mean_token_accuracy": 0.6609460512797037, + "num_tokens": 316879147.0, + "step": 1894 + }, + { + "entropy": 1.703949401775996, + "epoch": 0.20817884705171513, + "grad_norm": 0.7430747747421265, + "learning_rate": 1.9742820733254394e-05, + "loss": 1.3856, + "mean_token_accuracy": 0.6529827465613683, + "num_tokens": 317073037.0, + "step": 1895 + }, + { + "entropy": 1.7440508604049683, + "epoch": 0.20828870396308807, + "grad_norm": 0.6155939698219299, + "learning_rate": 1.9742440569109008e-05, + "loss": 1.4088, + "mean_token_accuracy": 0.6512663116057714, + "num_tokens": 317300611.0, + "step": 1896 + }, + { + "entropy": 1.7047240138053894, + "epoch": 0.20839856087446101, + "grad_norm": 0.7423590421676636, + "learning_rate": 1.974206012826932e-05, + "loss": 1.4194, + "mean_token_accuracy": 0.6572458644707998, + "num_tokens": 317448579.0, + "step": 1897 + }, + { + "entropy": 1.7306395471096039, + "epoch": 0.20850841778583395, + "grad_norm": 0.644149661064148, + "learning_rate": 1.9741679410747364e-05, + "loss": 1.4372, + "mean_token_accuracy": 0.6524844119946162, + "num_tokens": 317628610.0, + "step": 1898 + }, + { + "entropy": 1.7001280784606934, + "epoch": 0.2086182746972069, + "grad_norm": 0.6855803728103638, + "learning_rate": 1.9741298416555196e-05, + "loss": 1.4569, + "mean_token_accuracy": 0.6421359926462173, + "num_tokens": 317810190.0, + "step": 1899 + }, + { + "entropy": 1.7195194760958354, + "epoch": 0.20872813160857984, + "grad_norm": 0.7779269218444824, + "learning_rate": 1.974091714570487e-05, + "loss": 1.5581, + "mean_token_accuracy": 0.6334304213523865, + "num_tokens": 318001402.0, + "step": 1900 + }, + { + "entropy": 1.7282605667908986, + "epoch": 0.20883798851995275, + "grad_norm": 0.6376639604568481, + "learning_rate": 1.9740535598208458e-05, + "loss": 1.3576, + "mean_token_accuracy": 0.6543524712324142, + "num_tokens": 318167363.0, + "step": 1901 + }, + { + "entropy": 1.7092109819253285, + "epoch": 0.2089478454313257, + "grad_norm": 0.6729239821434021, + "learning_rate": 1.9740153774078033e-05, + "loss": 1.346, + "mean_token_accuracy": 0.658411035935084, + "num_tokens": 318317612.0, + "step": 1902 + }, + { + "entropy": 1.7186111609141033, + "epoch": 0.20905770234269863, + "grad_norm": 0.8159520030021667, + "learning_rate": 1.9739771673325678e-05, + "loss": 1.4808, + "mean_token_accuracy": 0.6539090524117152, + "num_tokens": 318481892.0, + "step": 1903 + }, + { + "entropy": 1.7617724239826202, + "epoch": 0.20916755925407157, + "grad_norm": 0.6777756214141846, + "learning_rate": 1.9739389295963486e-05, + "loss": 1.5622, + "mean_token_accuracy": 0.6342104425032934, + "num_tokens": 318709241.0, + "step": 1904 + }, + { + "entropy": 1.719315081834793, + "epoch": 0.2092774161654445, + "grad_norm": 0.663506805896759, + "learning_rate": 1.9739006642003566e-05, + "loss": 1.3675, + "mean_token_accuracy": 0.668515016635259, + "num_tokens": 318948897.0, + "step": 1905 + }, + { + "entropy": 1.7113699913024902, + "epoch": 0.20938727307681745, + "grad_norm": 0.6316990256309509, + "learning_rate": 1.973862371145802e-05, + "loss": 1.4644, + "mean_token_accuracy": 0.6517230321963629, + "num_tokens": 319134577.0, + "step": 1906 + }, + { + "entropy": 1.7475859622160594, + "epoch": 0.2094971299881904, + "grad_norm": 0.7789897322654724, + "learning_rate": 1.973824050433897e-05, + "loss": 1.4932, + "mean_token_accuracy": 0.6495751440525055, + "num_tokens": 319295735.0, + "step": 1907 + }, + { + "entropy": 1.685575932264328, + "epoch": 0.2096069868995633, + "grad_norm": 0.5883545279502869, + "learning_rate": 1.973785702065855e-05, + "loss": 1.4238, + "mean_token_accuracy": 0.6530423561731974, + "num_tokens": 319516027.0, + "step": 1908 + }, + { + "entropy": 1.7320783734321594, + "epoch": 0.20971684381093625, + "grad_norm": 0.8050070405006409, + "learning_rate": 1.9737473260428894e-05, + "loss": 1.337, + "mean_token_accuracy": 0.6576566646496455, + "num_tokens": 319616804.0, + "step": 1909 + }, + { + "entropy": 1.7201977968215942, + "epoch": 0.2098267007223092, + "grad_norm": 0.7059934139251709, + "learning_rate": 1.973708922366214e-05, + "loss": 1.2972, + "mean_token_accuracy": 0.6773047844568888, + "num_tokens": 319738775.0, + "step": 1910 + }, + { + "entropy": 1.7452878654003143, + "epoch": 0.20993655763368213, + "grad_norm": 0.6112817525863647, + "learning_rate": 1.973670491037045e-05, + "loss": 1.4088, + "mean_token_accuracy": 0.6482658833265305, + "num_tokens": 319930662.0, + "step": 1911 + }, + { + "entropy": 1.710121254126231, + "epoch": 0.21004641454505507, + "grad_norm": 0.7919174432754517, + "learning_rate": 1.973632032056599e-05, + "loss": 1.4035, + "mean_token_accuracy": 0.6597762157519659, + "num_tokens": 320076366.0, + "step": 1912 + }, + { + "entropy": 1.7030129532019298, + "epoch": 0.210156271456428, + "grad_norm": 0.700587272644043, + "learning_rate": 1.9735935454260925e-05, + "loss": 1.3965, + "mean_token_accuracy": 0.6516723334789276, + "num_tokens": 320247392.0, + "step": 1913 + }, + { + "entropy": 1.7211789786815643, + "epoch": 0.21026612836780095, + "grad_norm": 0.7128605842590332, + "learning_rate": 1.9735550311467443e-05, + "loss": 1.4136, + "mean_token_accuracy": 0.6487771173318228, + "num_tokens": 320408383.0, + "step": 1914 + }, + { + "entropy": 1.7001396020253499, + "epoch": 0.21037598527917387, + "grad_norm": 0.7244299650192261, + "learning_rate": 1.973516489219773e-05, + "loss": 1.499, + "mean_token_accuracy": 0.6356903513272604, + "num_tokens": 320659297.0, + "step": 1915 + }, + { + "entropy": 1.7240975697835286, + "epoch": 0.2104858421905468, + "grad_norm": 0.5836021900177002, + "learning_rate": 1.973477919646398e-05, + "loss": 1.316, + "mean_token_accuracy": 0.6605116327603658, + "num_tokens": 320823387.0, + "step": 1916 + }, + { + "entropy": 1.724854737520218, + "epoch": 0.21059569910191975, + "grad_norm": 0.7972742319107056, + "learning_rate": 1.9734393224278406e-05, + "loss": 1.3694, + "mean_token_accuracy": 0.6639392127593359, + "num_tokens": 320951543.0, + "step": 1917 + }, + { + "entropy": 1.696036696434021, + "epoch": 0.2107055560132927, + "grad_norm": 0.8939576745033264, + "learning_rate": 1.9734006975653224e-05, + "loss": 1.2696, + "mean_token_accuracy": 0.6718998452027639, + "num_tokens": 321087040.0, + "step": 1918 + }, + { + "entropy": 1.6345807611942291, + "epoch": 0.21081541292466563, + "grad_norm": 0.5838897824287415, + "learning_rate": 1.9733620450600655e-05, + "loss": 1.3427, + "mean_token_accuracy": 0.6704151580731074, + "num_tokens": 321269169.0, + "step": 1919 + }, + { + "entropy": 1.7143397529919941, + "epoch": 0.21092526983603857, + "grad_norm": 0.7261598110198975, + "learning_rate": 1.9733233649132938e-05, + "loss": 1.4234, + "mean_token_accuracy": 0.6666987985372543, + "num_tokens": 321418860.0, + "step": 1920 + }, + { + "entropy": 1.7327088514963787, + "epoch": 0.21103512674741148, + "grad_norm": 0.7012075185775757, + "learning_rate": 1.9732846571262304e-05, + "loss": 1.4299, + "mean_token_accuracy": 0.6525350759426752, + "num_tokens": 321598118.0, + "step": 1921 + }, + { + "entropy": 1.7444495658079784, + "epoch": 0.21114498365878442, + "grad_norm": 0.6507946252822876, + "learning_rate": 1.9732459217001017e-05, + "loss": 1.4639, + "mean_token_accuracy": 0.6573313424984614, + "num_tokens": 321804284.0, + "step": 1922 + }, + { + "entropy": 1.724996030330658, + "epoch": 0.21125484057015737, + "grad_norm": 0.620772123336792, + "learning_rate": 1.9732071586361334e-05, + "loss": 1.5714, + "mean_token_accuracy": 0.625721663236618, + "num_tokens": 322021779.0, + "step": 1923 + }, + { + "entropy": 1.6913380126158397, + "epoch": 0.2113646974815303, + "grad_norm": 0.5675688982009888, + "learning_rate": 1.973168367935551e-05, + "loss": 1.4439, + "mean_token_accuracy": 0.6470164060592651, + "num_tokens": 322200625.0, + "step": 1924 + }, + { + "entropy": 1.647382875283559, + "epoch": 0.21147455439290325, + "grad_norm": 0.6393111348152161, + "learning_rate": 1.9731295495995838e-05, + "loss": 1.3366, + "mean_token_accuracy": 0.6661768307288488, + "num_tokens": 322380416.0, + "step": 1925 + }, + { + "entropy": 1.6876880129178364, + "epoch": 0.2115844113042762, + "grad_norm": 2.3705599308013916, + "learning_rate": 1.97309070362946e-05, + "loss": 1.175, + "mean_token_accuracy": 0.6751070966323217, + "num_tokens": 322560122.0, + "step": 1926 + }, + { + "entropy": 1.7260343929131825, + "epoch": 0.21169426821564913, + "grad_norm": 0.689630389213562, + "learning_rate": 1.9730518300264086e-05, + "loss": 1.4034, + "mean_token_accuracy": 0.66618379453818, + "num_tokens": 322702668.0, + "step": 1927 + }, + { + "entropy": 1.6774284541606903, + "epoch": 0.21180412512702204, + "grad_norm": 0.6801313757896423, + "learning_rate": 1.97301292879166e-05, + "loss": 1.3852, + "mean_token_accuracy": 0.6520472516616186, + "num_tokens": 322920370.0, + "step": 1928 + }, + { + "entropy": 1.6716736455758412, + "epoch": 0.21191398203839498, + "grad_norm": 0.7043926119804382, + "learning_rate": 1.9729739999264458e-05, + "loss": 1.4412, + "mean_token_accuracy": 0.6420286248127619, + "num_tokens": 323090618.0, + "step": 1929 + }, + { + "entropy": 1.7062031924724579, + "epoch": 0.21202383894976792, + "grad_norm": 0.6019328832626343, + "learning_rate": 1.9729350434319977e-05, + "loss": 1.5193, + "mean_token_accuracy": 0.6340185602506002, + "num_tokens": 323303523.0, + "step": 1930 + }, + { + "entropy": 1.71775417526563, + "epoch": 0.21213369586114086, + "grad_norm": 0.655889093875885, + "learning_rate": 1.9728960593095493e-05, + "loss": 1.3497, + "mean_token_accuracy": 0.6655166298151016, + "num_tokens": 323446877.0, + "step": 1931 + }, + { + "entropy": 1.6558316747347515, + "epoch": 0.2122435527725138, + "grad_norm": 0.703708291053772, + "learning_rate": 1.9728570475603336e-05, + "loss": 1.3943, + "mean_token_accuracy": 0.6581457406282425, + "num_tokens": 323604718.0, + "step": 1932 + }, + { + "entropy": 1.7211906711260478, + "epoch": 0.21235340968388675, + "grad_norm": 0.7779010534286499, + "learning_rate": 1.9728180081855855e-05, + "loss": 1.4799, + "mean_token_accuracy": 0.6504116902748743, + "num_tokens": 323744606.0, + "step": 1933 + }, + { + "entropy": 1.636175235112508, + "epoch": 0.2124632665952597, + "grad_norm": 0.6399657726287842, + "learning_rate": 1.972778941186541e-05, + "loss": 1.2702, + "mean_token_accuracy": 0.6657491226991018, + "num_tokens": 323939422.0, + "step": 1934 + }, + { + "entropy": 1.7119918167591095, + "epoch": 0.2125731235066326, + "grad_norm": 0.6368902921676636, + "learning_rate": 1.9727398465644363e-05, + "loss": 1.3036, + "mean_token_accuracy": 0.668227881193161, + "num_tokens": 324097047.0, + "step": 1935 + }, + { + "entropy": 1.6312094032764435, + "epoch": 0.21268298041800554, + "grad_norm": 0.6294612884521484, + "learning_rate": 1.972700724320509e-05, + "loss": 1.2651, + "mean_token_accuracy": 0.6779163281122843, + "num_tokens": 324248984.0, + "step": 1936 + }, + { + "entropy": 1.7897210717201233, + "epoch": 0.21279283732937848, + "grad_norm": 0.7651355266571045, + "learning_rate": 1.9726615744559965e-05, + "loss": 1.4585, + "mean_token_accuracy": 0.6460629999637604, + "num_tokens": 324427567.0, + "step": 1937 + }, + { + "entropy": 1.760125567515691, + "epoch": 0.21290269424075142, + "grad_norm": 0.9342473745346069, + "learning_rate": 1.9726223969721384e-05, + "loss": 1.3453, + "mean_token_accuracy": 0.6497061004241308, + "num_tokens": 324608248.0, + "step": 1938 + }, + { + "entropy": 1.7319878935813904, + "epoch": 0.21301255115212436, + "grad_norm": 0.6311997771263123, + "learning_rate": 1.972583191870175e-05, + "loss": 1.3187, + "mean_token_accuracy": 0.6541954825321833, + "num_tokens": 324767775.0, + "step": 1939 + }, + { + "entropy": 1.681753158569336, + "epoch": 0.2131224080634973, + "grad_norm": 0.7324146032333374, + "learning_rate": 1.9725439591513467e-05, + "loss": 1.3592, + "mean_token_accuracy": 0.6560372710227966, + "num_tokens": 324951412.0, + "step": 1940 + }, + { + "entropy": 1.7503413657347362, + "epoch": 0.21323226497487024, + "grad_norm": 0.7484403252601624, + "learning_rate": 1.972504698816895e-05, + "loss": 1.3736, + "mean_token_accuracy": 0.662136490146319, + "num_tokens": 325081774.0, + "step": 1941 + }, + { + "entropy": 1.6506099005540211, + "epoch": 0.21334212188624316, + "grad_norm": 0.6231799125671387, + "learning_rate": 1.972465410868063e-05, + "loss": 1.2779, + "mean_token_accuracy": 0.6839319914579391, + "num_tokens": 325248232.0, + "step": 1942 + }, + { + "entropy": 1.7160189251104991, + "epoch": 0.2134519787976161, + "grad_norm": 0.7348440289497375, + "learning_rate": 1.972426095306094e-05, + "loss": 1.488, + "mean_token_accuracy": 0.6483329683542252, + "num_tokens": 325456430.0, + "step": 1943 + }, + { + "entropy": 1.6814146141211193, + "epoch": 0.21356183570898904, + "grad_norm": 0.6065512299537659, + "learning_rate": 1.972386752132232e-05, + "loss": 1.3807, + "mean_token_accuracy": 0.6708929588397344, + "num_tokens": 325625943.0, + "step": 1944 + }, + { + "entropy": 1.7090165813763936, + "epoch": 0.21367169262036198, + "grad_norm": 0.6108605861663818, + "learning_rate": 1.9723473813477223e-05, + "loss": 1.363, + "mean_token_accuracy": 0.6590655495723089, + "num_tokens": 325806626.0, + "step": 1945 + }, + { + "entropy": 1.7288100719451904, + "epoch": 0.21378154953173492, + "grad_norm": 0.814892590045929, + "learning_rate": 1.9723079829538115e-05, + "loss": 1.4412, + "mean_token_accuracy": 0.6585568189620972, + "num_tokens": 325963539.0, + "step": 1946 + }, + { + "entropy": 1.7768865327040355, + "epoch": 0.21389140644310786, + "grad_norm": 0.8345417976379395, + "learning_rate": 1.9722685569517455e-05, + "loss": 1.4554, + "mean_token_accuracy": 0.6512434879938761, + "num_tokens": 326093531.0, + "step": 1947 + }, + { + "entropy": 1.6884494721889496, + "epoch": 0.21400126335448078, + "grad_norm": 0.6763792634010315, + "learning_rate": 1.9722291033427733e-05, + "loss": 1.3025, + "mean_token_accuracy": 0.6711457918087641, + "num_tokens": 326244680.0, + "step": 1948 + }, + { + "entropy": 1.6721834540367126, + "epoch": 0.21411112026585372, + "grad_norm": 0.7668681144714355, + "learning_rate": 1.9721896221281426e-05, + "loss": 1.3331, + "mean_token_accuracy": 0.6610443890094757, + "num_tokens": 326351781.0, + "step": 1949 + }, + { + "entropy": 1.6889389057954152, + "epoch": 0.21422097717722666, + "grad_norm": 0.6436994075775146, + "learning_rate": 1.9721501133091035e-05, + "loss": 1.3498, + "mean_token_accuracy": 0.6683029731114706, + "num_tokens": 326496856.0, + "step": 1950 + }, + { + "entropy": 1.726958990097046, + "epoch": 0.2143308340885996, + "grad_norm": 0.7337366342544556, + "learning_rate": 1.9721105768869066e-05, + "loss": 1.5077, + "mean_token_accuracy": 0.6476754397153854, + "num_tokens": 326642845.0, + "step": 1951 + }, + { + "entropy": 1.750323196252187, + "epoch": 0.21444069099997254, + "grad_norm": 0.6473729610443115, + "learning_rate": 1.972071012862802e-05, + "loss": 1.5035, + "mean_token_accuracy": 0.6414787570635477, + "num_tokens": 326801852.0, + "step": 1952 + }, + { + "entropy": 1.6994662880897522, + "epoch": 0.21455054791134548, + "grad_norm": 0.6995154023170471, + "learning_rate": 1.9720314212380437e-05, + "loss": 1.3645, + "mean_token_accuracy": 0.6681473056475321, + "num_tokens": 327002498.0, + "step": 1953 + }, + { + "entropy": 1.7458200256029766, + "epoch": 0.21466040482271842, + "grad_norm": 0.7394571900367737, + "learning_rate": 1.971991802013884e-05, + "loss": 1.6005, + "mean_token_accuracy": 0.6400333990653356, + "num_tokens": 327189288.0, + "step": 1954 + }, + { + "entropy": 1.7399512827396393, + "epoch": 0.21477026173409133, + "grad_norm": 1.1217888593673706, + "learning_rate": 1.9719521551915763e-05, + "loss": 1.3252, + "mean_token_accuracy": 0.6774489680926005, + "num_tokens": 327317080.0, + "step": 1955 + }, + { + "entropy": 1.7320358057816823, + "epoch": 0.21488011864546427, + "grad_norm": 0.6957964897155762, + "learning_rate": 1.971912480772376e-05, + "loss": 1.4162, + "mean_token_accuracy": 0.6480669528245926, + "num_tokens": 327514283.0, + "step": 1956 + }, + { + "entropy": 1.6627892553806305, + "epoch": 0.21498997555683722, + "grad_norm": 0.7532937526702881, + "learning_rate": 1.9718727787575383e-05, + "loss": 1.3621, + "mean_token_accuracy": 0.6672651420036951, + "num_tokens": 327668545.0, + "step": 1957 + }, + { + "entropy": 1.6754935681819916, + "epoch": 0.21509983246821016, + "grad_norm": 0.6984850764274597, + "learning_rate": 1.97183304914832e-05, + "loss": 1.3214, + "mean_token_accuracy": 0.6585142463445663, + "num_tokens": 327835589.0, + "step": 1958 + }, + { + "entropy": 1.7026499410470326, + "epoch": 0.2152096893795831, + "grad_norm": 0.558738112449646, + "learning_rate": 1.9717932919459784e-05, + "loss": 1.4541, + "mean_token_accuracy": 0.6451480984687805, + "num_tokens": 328089156.0, + "step": 1959 + }, + { + "entropy": 1.6809870799382527, + "epoch": 0.21531954629095604, + "grad_norm": 0.6252606511116028, + "learning_rate": 1.9717535071517724e-05, + "loss": 1.4261, + "mean_token_accuracy": 0.6565803388754526, + "num_tokens": 328295200.0, + "step": 1960 + }, + { + "entropy": 1.736928681532542, + "epoch": 0.21542940320232898, + "grad_norm": 0.7609971165657043, + "learning_rate": 1.9717136947669606e-05, + "loss": 1.3809, + "mean_token_accuracy": 0.6587243974208832, + "num_tokens": 328509731.0, + "step": 1961 + }, + { + "entropy": 1.6860435704390209, + "epoch": 0.2155392601137019, + "grad_norm": 0.751956045627594, + "learning_rate": 1.971673854792803e-05, + "loss": 1.2261, + "mean_token_accuracy": 0.6756429572900137, + "num_tokens": 328612622.0, + "step": 1962 + }, + { + "entropy": 1.7120301028092701, + "epoch": 0.21564911702507483, + "grad_norm": 0.700073778629303, + "learning_rate": 1.971633987230561e-05, + "loss": 1.4035, + "mean_token_accuracy": 0.6547966102759043, + "num_tokens": 328845263.0, + "step": 1963 + }, + { + "entropy": 1.7697708209355671, + "epoch": 0.21575897393644777, + "grad_norm": 0.7105919718742371, + "learning_rate": 1.971594092081496e-05, + "loss": 1.312, + "mean_token_accuracy": 0.6595413237810135, + "num_tokens": 328975866.0, + "step": 1964 + }, + { + "entropy": 1.7790792485078175, + "epoch": 0.21586883084782071, + "grad_norm": 0.8269145488739014, + "learning_rate": 1.9715541693468703e-05, + "loss": 1.3614, + "mean_token_accuracy": 0.6634324043989182, + "num_tokens": 329095429.0, + "step": 1965 + }, + { + "entropy": 1.6846925516923268, + "epoch": 0.21597868775919365, + "grad_norm": 0.5915414094924927, + "learning_rate": 1.9715142190279482e-05, + "loss": 1.3213, + "mean_token_accuracy": 0.6567167490720749, + "num_tokens": 329249605.0, + "step": 1966 + }, + { + "entropy": 1.744320313135783, + "epoch": 0.2160885446705666, + "grad_norm": 0.6608003973960876, + "learning_rate": 1.971474241125994e-05, + "loss": 1.4533, + "mean_token_accuracy": 0.6462043275435766, + "num_tokens": 329411071.0, + "step": 1967 + }, + { + "entropy": 1.6898845732212067, + "epoch": 0.21619840158193954, + "grad_norm": 0.6290837526321411, + "learning_rate": 1.9714342356422723e-05, + "loss": 1.4013, + "mean_token_accuracy": 0.6717150410016378, + "num_tokens": 329570535.0, + "step": 1968 + }, + { + "entropy": 1.7154962023099263, + "epoch": 0.21630825849331245, + "grad_norm": 0.7625136971473694, + "learning_rate": 1.97139420257805e-05, + "loss": 1.3062, + "mean_token_accuracy": 0.6675903101762136, + "num_tokens": 329703050.0, + "step": 1969 + }, + { + "entropy": 1.7551279962062836, + "epoch": 0.2164181154046854, + "grad_norm": 0.736630380153656, + "learning_rate": 1.971354141934594e-05, + "loss": 1.4062, + "mean_token_accuracy": 0.6460110992193222, + "num_tokens": 329872834.0, + "step": 1970 + }, + { + "entropy": 1.7381382485230763, + "epoch": 0.21652797231605833, + "grad_norm": 0.6895781755447388, + "learning_rate": 1.9713140537131715e-05, + "loss": 1.4274, + "mean_token_accuracy": 0.6558477779229482, + "num_tokens": 330013651.0, + "step": 1971 + }, + { + "entropy": 1.7030009031295776, + "epoch": 0.21663782922743127, + "grad_norm": 0.7224587798118591, + "learning_rate": 1.9712739379150523e-05, + "loss": 1.4991, + "mean_token_accuracy": 0.6588802685340246, + "num_tokens": 330178313.0, + "step": 1972 + }, + { + "entropy": 1.681588480869929, + "epoch": 0.2167476861388042, + "grad_norm": 0.5660827159881592, + "learning_rate": 1.9712337945415054e-05, + "loss": 1.4877, + "mean_token_accuracy": 0.6458199421564738, + "num_tokens": 330385757.0, + "step": 1973 + }, + { + "entropy": 1.6638726989428203, + "epoch": 0.21685754305017715, + "grad_norm": 0.623779296875, + "learning_rate": 1.9711936235938014e-05, + "loss": 1.3621, + "mean_token_accuracy": 0.659187431136767, + "num_tokens": 330589361.0, + "step": 1974 + }, + { + "entropy": 1.691465973854065, + "epoch": 0.2169673999615501, + "grad_norm": 0.6252658367156982, + "learning_rate": 1.971153425073212e-05, + "loss": 1.3112, + "mean_token_accuracy": 0.6599445144335429, + "num_tokens": 330753043.0, + "step": 1975 + }, + { + "entropy": 1.7143527666727703, + "epoch": 0.217077256872923, + "grad_norm": 0.7959213256835938, + "learning_rate": 1.971113198981009e-05, + "loss": 1.4267, + "mean_token_accuracy": 0.6643683314323425, + "num_tokens": 330914792.0, + "step": 1976 + }, + { + "entropy": 1.7403099636236827, + "epoch": 0.21718711378429595, + "grad_norm": 0.7242742776870728, + "learning_rate": 1.9710729453184663e-05, + "loss": 1.4078, + "mean_token_accuracy": 0.6562838604052862, + "num_tokens": 331075502.0, + "step": 1977 + }, + { + "entropy": 1.721234291791916, + "epoch": 0.2172969706956689, + "grad_norm": 0.6983941197395325, + "learning_rate": 1.9710326640868568e-05, + "loss": 1.429, + "mean_token_accuracy": 0.6565836171309153, + "num_tokens": 331268768.0, + "step": 1978 + }, + { + "entropy": 1.728889485200246, + "epoch": 0.21740682760704183, + "grad_norm": 0.7118070125579834, + "learning_rate": 1.9709923552874565e-05, + "loss": 1.3784, + "mean_token_accuracy": 0.6568760176499685, + "num_tokens": 331436142.0, + "step": 1979 + }, + { + "entropy": 1.7155893842379253, + "epoch": 0.21751668451841477, + "grad_norm": 0.738287091255188, + "learning_rate": 1.9709520189215403e-05, + "loss": 1.4332, + "mean_token_accuracy": 0.6453232914209366, + "num_tokens": 331580092.0, + "step": 1980 + }, + { + "entropy": 1.7075227002302806, + "epoch": 0.2176265414297877, + "grad_norm": 0.6078463792800903, + "learning_rate": 1.970911654990385e-05, + "loss": 1.501, + "mean_token_accuracy": 0.6402676453193029, + "num_tokens": 331803746.0, + "step": 1981 + }, + { + "entropy": 1.6620681583881378, + "epoch": 0.21773639834116063, + "grad_norm": 0.6076948046684265, + "learning_rate": 1.9708712634952688e-05, + "loss": 1.4018, + "mean_token_accuracy": 0.6624077359835306, + "num_tokens": 331958945.0, + "step": 1982 + }, + { + "entropy": 1.6899429162343342, + "epoch": 0.21784625525253357, + "grad_norm": 0.6573540568351746, + "learning_rate": 1.970830844437469e-05, + "loss": 1.4148, + "mean_token_accuracy": 0.6478807330131531, + "num_tokens": 332191045.0, + "step": 1983 + }, + { + "entropy": 1.705435295899709, + "epoch": 0.2179561121639065, + "grad_norm": 0.6898738145828247, + "learning_rate": 1.970790397818266e-05, + "loss": 1.4029, + "mean_token_accuracy": 0.6507929762204488, + "num_tokens": 332347935.0, + "step": 1984 + }, + { + "entropy": 1.7328907350699108, + "epoch": 0.21806596907527945, + "grad_norm": 0.6054257750511169, + "learning_rate": 1.9707499236389384e-05, + "loss": 1.4292, + "mean_token_accuracy": 0.6487182925144831, + "num_tokens": 332523846.0, + "step": 1985 + }, + { + "entropy": 1.6892162561416626, + "epoch": 0.2181758259866524, + "grad_norm": 0.7589190006256104, + "learning_rate": 1.9707094219007687e-05, + "loss": 1.2616, + "mean_token_accuracy": 0.6733472148577372, + "num_tokens": 332684862.0, + "step": 1986 + }, + { + "entropy": 1.6447477738062541, + "epoch": 0.21828568289802533, + "grad_norm": 0.7766509056091309, + "learning_rate": 1.970668892605038e-05, + "loss": 1.3476, + "mean_token_accuracy": 0.6714780976374944, + "num_tokens": 332864922.0, + "step": 1987 + }, + { + "entropy": 1.7524027526378632, + "epoch": 0.21839553980939827, + "grad_norm": 0.6996143460273743, + "learning_rate": 1.9706283357530294e-05, + "loss": 1.3252, + "mean_token_accuracy": 0.66605643928051, + "num_tokens": 333030584.0, + "step": 1988 + }, + { + "entropy": 1.7636100550492604, + "epoch": 0.21850539672077118, + "grad_norm": 0.8666709661483765, + "learning_rate": 1.9705877513460257e-05, + "loss": 1.4356, + "mean_token_accuracy": 0.64682570596536, + "num_tokens": 333186942.0, + "step": 1989 + }, + { + "entropy": 1.7297316590944927, + "epoch": 0.21861525363214412, + "grad_norm": 0.8126122951507568, + "learning_rate": 1.9705471393853126e-05, + "loss": 1.3266, + "mean_token_accuracy": 0.6661059657732645, + "num_tokens": 333316991.0, + "step": 1990 + }, + { + "entropy": 1.7401694059371948, + "epoch": 0.21872511054351707, + "grad_norm": 0.7024965286254883, + "learning_rate": 1.9705064998721742e-05, + "loss": 1.2493, + "mean_token_accuracy": 0.6787941058476766, + "num_tokens": 333439176.0, + "step": 1991 + }, + { + "entropy": 1.708668867746989, + "epoch": 0.21883496745489, + "grad_norm": 0.6795996427536011, + "learning_rate": 1.970465832807898e-05, + "loss": 1.4659, + "mean_token_accuracy": 0.660365030169487, + "num_tokens": 333613726.0, + "step": 1992 + }, + { + "entropy": 1.696395069360733, + "epoch": 0.21894482436626295, + "grad_norm": 0.7904760837554932, + "learning_rate": 1.9704251381937703e-05, + "loss": 1.2613, + "mean_token_accuracy": 0.6752869784832001, + "num_tokens": 333778956.0, + "step": 1993 + }, + { + "entropy": 1.698264519373576, + "epoch": 0.2190546812776359, + "grad_norm": 0.5874459147453308, + "learning_rate": 1.970384416031079e-05, + "loss": 1.4253, + "mean_token_accuracy": 0.6463806182146072, + "num_tokens": 333982509.0, + "step": 1994 + }, + { + "entropy": 1.7498473624388378, + "epoch": 0.21916453818900883, + "grad_norm": 0.7056718468666077, + "learning_rate": 1.970343666321113e-05, + "loss": 1.3865, + "mean_token_accuracy": 0.6528016924858093, + "num_tokens": 334137289.0, + "step": 1995 + }, + { + "entropy": 1.7133673230806987, + "epoch": 0.21927439510038174, + "grad_norm": 0.7165104150772095, + "learning_rate": 1.9703028890651625e-05, + "loss": 1.3948, + "mean_token_accuracy": 0.6589706838130951, + "num_tokens": 334332222.0, + "step": 1996 + }, + { + "entropy": 1.704959104458491, + "epoch": 0.21938425201175468, + "grad_norm": 0.6553063988685608, + "learning_rate": 1.9702620842645176e-05, + "loss": 1.5619, + "mean_token_accuracy": 0.6511781016985575, + "num_tokens": 334525106.0, + "step": 1997 + }, + { + "entropy": 1.7078428268432617, + "epoch": 0.21949410892312762, + "grad_norm": 0.7418580651283264, + "learning_rate": 1.9702212519204697e-05, + "loss": 1.3736, + "mean_token_accuracy": 0.669340506196022, + "num_tokens": 334729517.0, + "step": 1998 + }, + { + "entropy": 1.71206929286321, + "epoch": 0.21960396583450056, + "grad_norm": 2.2254767417907715, + "learning_rate": 1.9701803920343117e-05, + "loss": 1.1656, + "mean_token_accuracy": 0.6783639788627625, + "num_tokens": 334926935.0, + "step": 1999 + }, + { + "entropy": 1.6726875305175781, + "epoch": 0.2197138227458735, + "grad_norm": 0.6199320554733276, + "learning_rate": 1.9701395046073358e-05, + "loss": 1.4867, + "mean_token_accuracy": 0.6402423232793808, + "num_tokens": 335119572.0, + "step": 2000 + }, + { + "entropy": 1.6810939411322277, + "epoch": 0.21982367965724645, + "grad_norm": 0.6892693638801575, + "learning_rate": 1.970098589640837e-05, + "loss": 1.4422, + "mean_token_accuracy": 0.6712821374336878, + "num_tokens": 335300516.0, + "step": 2001 + }, + { + "entropy": 1.7373451888561249, + "epoch": 0.2199335365686194, + "grad_norm": 0.652580201625824, + "learning_rate": 1.9700576471361103e-05, + "loss": 1.383, + "mean_token_accuracy": 0.6453498254219691, + "num_tokens": 335441164.0, + "step": 2002 + }, + { + "entropy": 1.7873657743136089, + "epoch": 0.2200433934799923, + "grad_norm": 0.6044803261756897, + "learning_rate": 1.9700166770944505e-05, + "loss": 1.4454, + "mean_token_accuracy": 0.6438331256310145, + "num_tokens": 335614767.0, + "step": 2003 + }, + { + "entropy": 1.7439679205417633, + "epoch": 0.22015325039136524, + "grad_norm": 0.718855619430542, + "learning_rate": 1.9699756795171553e-05, + "loss": 1.5, + "mean_token_accuracy": 0.6593141506115595, + "num_tokens": 335782527.0, + "step": 2004 + }, + { + "entropy": 1.680442641178767, + "epoch": 0.22026310730273818, + "grad_norm": 6.6189284324646, + "learning_rate": 1.9699346544055217e-05, + "loss": 1.3119, + "mean_token_accuracy": 0.6821538011233012, + "num_tokens": 335921470.0, + "step": 2005 + }, + { + "entropy": 1.7134496867656708, + "epoch": 0.22037296421411112, + "grad_norm": 0.749874472618103, + "learning_rate": 1.9698936017608484e-05, + "loss": 1.4309, + "mean_token_accuracy": 0.6579453547795614, + "num_tokens": 336076125.0, + "step": 2006 + }, + { + "entropy": 1.6650103032588959, + "epoch": 0.22048282112548406, + "grad_norm": 0.6054286956787109, + "learning_rate": 1.9698525215844347e-05, + "loss": 1.3048, + "mean_token_accuracy": 0.6650077700614929, + "num_tokens": 336246010.0, + "step": 2007 + }, + { + "entropy": 1.7242847979068756, + "epoch": 0.220592678036857, + "grad_norm": 0.8481932878494263, + "learning_rate": 1.96981141387758e-05, + "loss": 1.1978, + "mean_token_accuracy": 0.6881664743026098, + "num_tokens": 336384043.0, + "step": 2008 + }, + { + "entropy": 1.6481478810310364, + "epoch": 0.22070253494822992, + "grad_norm": 0.8285883665084839, + "learning_rate": 1.9697702786415866e-05, + "loss": 1.4015, + "mean_token_accuracy": 0.6562249759833018, + "num_tokens": 336584871.0, + "step": 2009 + }, + { + "entropy": 1.7549095054467518, + "epoch": 0.22081239185960286, + "grad_norm": 0.6210580468177795, + "learning_rate": 1.969729115877756e-05, + "loss": 1.4517, + "mean_token_accuracy": 0.6447333445151647, + "num_tokens": 336785253.0, + "step": 2010 + }, + { + "entropy": 1.7175431450208027, + "epoch": 0.2209222487709758, + "grad_norm": 0.8552317023277283, + "learning_rate": 1.9696879255873902e-05, + "loss": 1.5219, + "mean_token_accuracy": 0.6434709678093592, + "num_tokens": 336988668.0, + "step": 2011 + }, + { + "entropy": 1.7095571756362915, + "epoch": 0.22103210568234874, + "grad_norm": 0.6405702233314514, + "learning_rate": 1.969646707771794e-05, + "loss": 1.3564, + "mean_token_accuracy": 0.6549317836761475, + "num_tokens": 337175572.0, + "step": 2012 + }, + { + "entropy": 1.7163640260696411, + "epoch": 0.22114196259372168, + "grad_norm": 0.8766898512840271, + "learning_rate": 1.969605462432271e-05, + "loss": 1.3813, + "mean_token_accuracy": 0.6668533583482107, + "num_tokens": 337325813.0, + "step": 2013 + }, + { + "entropy": 1.6816040774186451, + "epoch": 0.22125181950509462, + "grad_norm": 0.7182164192199707, + "learning_rate": 1.969564189570127e-05, + "loss": 1.4426, + "mean_token_accuracy": 0.6525221814711889, + "num_tokens": 337460554.0, + "step": 2014 + }, + { + "entropy": 1.768685221672058, + "epoch": 0.22136167641646756, + "grad_norm": 0.7436334490776062, + "learning_rate": 1.9695228891866683e-05, + "loss": 1.611, + "mean_token_accuracy": 0.6214992552995682, + "num_tokens": 337641696.0, + "step": 2015 + }, + { + "entropy": 1.656598299741745, + "epoch": 0.22147153332784048, + "grad_norm": 0.7697328925132751, + "learning_rate": 1.9694815612832018e-05, + "loss": 1.3634, + "mean_token_accuracy": 0.6659995466470718, + "num_tokens": 337771813.0, + "step": 2016 + }, + { + "entropy": 1.7032875816027324, + "epoch": 0.22158139023921342, + "grad_norm": 0.6816899180412292, + "learning_rate": 1.969440205861036e-05, + "loss": 1.483, + "mean_token_accuracy": 0.6475236068169276, + "num_tokens": 337973121.0, + "step": 2017 + }, + { + "entropy": 1.7433029512564342, + "epoch": 0.22169124715058636, + "grad_norm": 0.6455709934234619, + "learning_rate": 1.969398822921479e-05, + "loss": 1.4009, + "mean_token_accuracy": 0.6455973982810974, + "num_tokens": 338137737.0, + "step": 2018 + }, + { + "entropy": 1.7247349818547566, + "epoch": 0.2218011040619593, + "grad_norm": 0.6982224583625793, + "learning_rate": 1.9693574124658414e-05, + "loss": 1.4222, + "mean_token_accuracy": 0.6396381010611852, + "num_tokens": 338369135.0, + "step": 2019 + }, + { + "entropy": 1.67547611395518, + "epoch": 0.22191096097333224, + "grad_norm": 0.6213436722755432, + "learning_rate": 1.9693159744954335e-05, + "loss": 1.4448, + "mean_token_accuracy": 0.6563169062137604, + "num_tokens": 338550921.0, + "step": 2020 + }, + { + "entropy": 1.7012183169523876, + "epoch": 0.22202081788470518, + "grad_norm": 0.6709868311882019, + "learning_rate": 1.9692745090115664e-05, + "loss": 1.3577, + "mean_token_accuracy": 0.6605485628048579, + "num_tokens": 338739755.0, + "step": 2021 + }, + { + "entropy": 1.7212641338507335, + "epoch": 0.22213067479607812, + "grad_norm": 0.6741979122161865, + "learning_rate": 1.969233016015553e-05, + "loss": 1.4985, + "mean_token_accuracy": 0.6394399156173071, + "num_tokens": 338942812.0, + "step": 2022 + }, + { + "entropy": 1.7291043400764465, + "epoch": 0.22224053170745103, + "grad_norm": 0.7105062007904053, + "learning_rate": 1.9691914955087065e-05, + "loss": 1.4693, + "mean_token_accuracy": 0.6584506978591284, + "num_tokens": 339081060.0, + "step": 2023 + }, + { + "entropy": 1.7811701993147533, + "epoch": 0.22235038861882397, + "grad_norm": 0.7212976217269897, + "learning_rate": 1.9691499474923405e-05, + "loss": 1.4486, + "mean_token_accuracy": 0.644217719634374, + "num_tokens": 339226880.0, + "step": 2024 + }, + { + "entropy": 1.6993672450383503, + "epoch": 0.22246024553019692, + "grad_norm": 0.7805929780006409, + "learning_rate": 1.9691083719677707e-05, + "loss": 1.381, + "mean_token_accuracy": 0.672341451048851, + "num_tokens": 339366403.0, + "step": 2025 + }, + { + "entropy": 1.6640028357505798, + "epoch": 0.22257010244156986, + "grad_norm": 0.643774151802063, + "learning_rate": 1.969066768936312e-05, + "loss": 1.3309, + "mean_token_accuracy": 0.6650595118602117, + "num_tokens": 339535229.0, + "step": 2026 + }, + { + "entropy": 1.7151845892270405, + "epoch": 0.2226799593529428, + "grad_norm": 0.7019052505493164, + "learning_rate": 1.969025138399282e-05, + "loss": 1.3497, + "mean_token_accuracy": 0.6575596183538437, + "num_tokens": 339704603.0, + "step": 2027 + }, + { + "entropy": 1.6844234863917034, + "epoch": 0.22278981626431574, + "grad_norm": 0.7261092066764832, + "learning_rate": 1.9689834803579983e-05, + "loss": 1.4324, + "mean_token_accuracy": 0.652221143245697, + "num_tokens": 339868917.0, + "step": 2028 + }, + { + "entropy": 1.7535183529059093, + "epoch": 0.22289967317568868, + "grad_norm": 0.8210894465446472, + "learning_rate": 1.9689417948137786e-05, + "loss": 1.5589, + "mean_token_accuracy": 0.6526251584291458, + "num_tokens": 340013342.0, + "step": 2029 + }, + { + "entropy": 1.8341341416041057, + "epoch": 0.2230095300870616, + "grad_norm": 0.8437470197677612, + "learning_rate": 1.9689000817679428e-05, + "loss": 1.496, + "mean_token_accuracy": 0.6356715758641561, + "num_tokens": 340169704.0, + "step": 2030 + }, + { + "entropy": 1.6810695727666218, + "epoch": 0.22311938699843453, + "grad_norm": 0.6405393481254578, + "learning_rate": 1.9688583412218108e-05, + "loss": 1.36, + "mean_token_accuracy": 0.6580146849155426, + "num_tokens": 340359004.0, + "step": 2031 + }, + { + "entropy": 1.7621925572554271, + "epoch": 0.22322924390980747, + "grad_norm": 0.7428179979324341, + "learning_rate": 1.9688165731767037e-05, + "loss": 1.5521, + "mean_token_accuracy": 0.6292354067166647, + "num_tokens": 340577217.0, + "step": 2032 + }, + { + "entropy": 1.6683639585971832, + "epoch": 0.22333910082118041, + "grad_norm": 0.6185237765312195, + "learning_rate": 1.968774777633944e-05, + "loss": 1.3931, + "mean_token_accuracy": 0.6481966078281403, + "num_tokens": 340762287.0, + "step": 2033 + }, + { + "entropy": 1.6244226296742756, + "epoch": 0.22344895773255335, + "grad_norm": 0.6757908463478088, + "learning_rate": 1.9687329545948533e-05, + "loss": 1.3674, + "mean_token_accuracy": 0.6705109626054764, + "num_tokens": 340905555.0, + "step": 2034 + }, + { + "entropy": 1.6744134823481243, + "epoch": 0.2235588146439263, + "grad_norm": 0.6213631629943848, + "learning_rate": 1.968691104060757e-05, + "loss": 1.4006, + "mean_token_accuracy": 0.6517485429843267, + "num_tokens": 341092632.0, + "step": 2035 + }, + { + "entropy": 1.6906062265237172, + "epoch": 0.22366867155529924, + "grad_norm": 0.6319667100906372, + "learning_rate": 1.9686492260329783e-05, + "loss": 1.3007, + "mean_token_accuracy": 0.6612733155488968, + "num_tokens": 341223672.0, + "step": 2036 + }, + { + "entropy": 1.7172020475069683, + "epoch": 0.22377852846667215, + "grad_norm": 0.6903420090675354, + "learning_rate": 1.968607320512843e-05, + "loss": 1.3112, + "mean_token_accuracy": 0.6633612463871638, + "num_tokens": 341388402.0, + "step": 2037 + }, + { + "entropy": 1.7287559310595195, + "epoch": 0.2238883853780451, + "grad_norm": 0.7884252071380615, + "learning_rate": 1.9685653875016773e-05, + "loss": 1.252, + "mean_token_accuracy": 0.6740682969490687, + "num_tokens": 341510142.0, + "step": 2038 + }, + { + "entropy": 1.741033395131429, + "epoch": 0.22399824228941803, + "grad_norm": 0.6842942237854004, + "learning_rate": 1.9685234270008085e-05, + "loss": 1.378, + "mean_token_accuracy": 0.6639875521262487, + "num_tokens": 341675965.0, + "step": 2039 + }, + { + "entropy": 1.7549268503983815, + "epoch": 0.22410809920079097, + "grad_norm": 0.7328823208808899, + "learning_rate": 1.9684814390115644e-05, + "loss": 1.3624, + "mean_token_accuracy": 0.6532031744718552, + "num_tokens": 341811346.0, + "step": 2040 + }, + { + "entropy": 1.7577114800612132, + "epoch": 0.2242179561121639, + "grad_norm": 0.6929943561553955, + "learning_rate": 1.9684394235352744e-05, + "loss": 1.5978, + "mean_token_accuracy": 0.6406635567545891, + "num_tokens": 341991608.0, + "step": 2041 + }, + { + "entropy": 1.7216146389643352, + "epoch": 0.22432781302353685, + "grad_norm": 0.7165713310241699, + "learning_rate": 1.9683973805732684e-05, + "loss": 1.4438, + "mean_token_accuracy": 0.6414127250512441, + "num_tokens": 342162750.0, + "step": 2042 + }, + { + "entropy": 1.6384899119536083, + "epoch": 0.22443766993490977, + "grad_norm": 0.694940984249115, + "learning_rate": 1.9683553101268756e-05, + "loss": 1.3885, + "mean_token_accuracy": 0.6634632100661596, + "num_tokens": 342342619.0, + "step": 2043 + }, + { + "entropy": 1.721029241879781, + "epoch": 0.2245475268462827, + "grad_norm": 2.877837657928467, + "learning_rate": 1.968313212197429e-05, + "loss": 1.3462, + "mean_token_accuracy": 0.6564084043105444, + "num_tokens": 342549662.0, + "step": 2044 + }, + { + "entropy": 1.7353296478589375, + "epoch": 0.22465738375765565, + "grad_norm": 0.7393618226051331, + "learning_rate": 1.968271086786261e-05, + "loss": 1.3049, + "mean_token_accuracy": 0.6696875343720118, + "num_tokens": 342671498.0, + "step": 2045 + }, + { + "entropy": 1.6773792306582134, + "epoch": 0.2247672406690286, + "grad_norm": 0.6500130295753479, + "learning_rate": 1.9682289338947037e-05, + "loss": 1.3325, + "mean_token_accuracy": 0.6623169581095377, + "num_tokens": 342824666.0, + "step": 2046 + }, + { + "entropy": 1.647678832213084, + "epoch": 0.22487709758040153, + "grad_norm": 0.6050171256065369, + "learning_rate": 1.9681867535240924e-05, + "loss": 1.35, + "mean_token_accuracy": 0.665855829914411, + "num_tokens": 343013627.0, + "step": 2047 + }, + { + "entropy": 1.703715850909551, + "epoch": 0.22498695449177447, + "grad_norm": 0.7125741839408875, + "learning_rate": 1.968144545675761e-05, + "loss": 1.4769, + "mean_token_accuracy": 0.650515486796697, + "num_tokens": 343157329.0, + "step": 2048 + }, + { + "entropy": 1.717505931854248, + "epoch": 0.2250968114031474, + "grad_norm": 0.7336664795875549, + "learning_rate": 1.9681023103510465e-05, + "loss": 1.3677, + "mean_token_accuracy": 0.6719395915667216, + "num_tokens": 343305654.0, + "step": 2049 + }, + { + "entropy": 1.7339052259922028, + "epoch": 0.22520666831452033, + "grad_norm": 0.7247207164764404, + "learning_rate": 1.9680600475512844e-05, + "loss": 1.3452, + "mean_token_accuracy": 0.6586270729700724, + "num_tokens": 343467941.0, + "step": 2050 + }, + { + "entropy": 1.6390206515789032, + "epoch": 0.22531652522589327, + "grad_norm": 0.7296788096427917, + "learning_rate": 1.9680177572778135e-05, + "loss": 1.2363, + "mean_token_accuracy": 0.6724939694007238, + "num_tokens": 343596667.0, + "step": 2051 + }, + { + "entropy": 1.7341953416665394, + "epoch": 0.2254263821372662, + "grad_norm": 0.6788911819458008, + "learning_rate": 1.9679754395319714e-05, + "loss": 1.3616, + "mean_token_accuracy": 0.6549450208743414, + "num_tokens": 343744356.0, + "step": 2052 + }, + { + "entropy": 1.7105887234210968, + "epoch": 0.22553623904863915, + "grad_norm": 0.6413341760635376, + "learning_rate": 1.9679330943150982e-05, + "loss": 1.3892, + "mean_token_accuracy": 0.6598889281352361, + "num_tokens": 343920401.0, + "step": 2053 + }, + { + "entropy": 1.7920573552449544, + "epoch": 0.2256460959600121, + "grad_norm": 0.7013512849807739, + "learning_rate": 1.967890721628533e-05, + "loss": 1.4534, + "mean_token_accuracy": 0.6496995538473129, + "num_tokens": 344058015.0, + "step": 2054 + }, + { + "entropy": 1.6886359850565593, + "epoch": 0.22575595287138503, + "grad_norm": 0.6699264049530029, + "learning_rate": 1.967848321473618e-05, + "loss": 1.4349, + "mean_token_accuracy": 0.6515084008375803, + "num_tokens": 344246715.0, + "step": 2055 + }, + { + "entropy": 1.7019972801208496, + "epoch": 0.22586580978275797, + "grad_norm": 0.6247515678405762, + "learning_rate": 1.9678058938516946e-05, + "loss": 1.3938, + "mean_token_accuracy": 0.6596282968918482, + "num_tokens": 344416664.0, + "step": 2056 + }, + { + "entropy": 1.6917652984460194, + "epoch": 0.22597566669413088, + "grad_norm": 0.7000340223312378, + "learning_rate": 1.9677634387641056e-05, + "loss": 1.4938, + "mean_token_accuracy": 0.6429780870676041, + "num_tokens": 344609791.0, + "step": 2057 + }, + { + "entropy": 1.729089339574178, + "epoch": 0.22608552360550382, + "grad_norm": 0.7346123456954956, + "learning_rate": 1.967720956212195e-05, + "loss": 1.4291, + "mean_token_accuracy": 0.6589990357557932, + "num_tokens": 344777080.0, + "step": 2058 + }, + { + "entropy": 1.7153427302837372, + "epoch": 0.22619538051687677, + "grad_norm": 0.6257114410400391, + "learning_rate": 1.9676784461973068e-05, + "loss": 1.4968, + "mean_token_accuracy": 0.6334412743647894, + "num_tokens": 344995489.0, + "step": 2059 + }, + { + "entropy": 1.766806423664093, + "epoch": 0.2263052374282497, + "grad_norm": 0.7080755233764648, + "learning_rate": 1.967635908720787e-05, + "loss": 1.3861, + "mean_token_accuracy": 0.6605810771385828, + "num_tokens": 345155660.0, + "step": 2060 + }, + { + "entropy": 1.659007489681244, + "epoch": 0.22641509433962265, + "grad_norm": 0.728387713432312, + "learning_rate": 1.9675933437839817e-05, + "loss": 1.3944, + "mean_token_accuracy": 0.6693829894065857, + "num_tokens": 345282543.0, + "step": 2061 + }, + { + "entropy": 1.757192333539327, + "epoch": 0.2265249512509956, + "grad_norm": 0.6841723322868347, + "learning_rate": 1.967550751388238e-05, + "loss": 1.4359, + "mean_token_accuracy": 0.645816907286644, + "num_tokens": 345459681.0, + "step": 2062 + }, + { + "entropy": 1.7275842030843098, + "epoch": 0.22663480816236853, + "grad_norm": 0.7722262740135193, + "learning_rate": 1.9675081315349037e-05, + "loss": 1.3776, + "mean_token_accuracy": 0.6634769191344579, + "num_tokens": 345643276.0, + "step": 2063 + }, + { + "entropy": 1.6928351819515228, + "epoch": 0.22674466507374144, + "grad_norm": 0.7217462658882141, + "learning_rate": 1.9674654842253283e-05, + "loss": 1.3029, + "mean_token_accuracy": 0.6618945797284445, + "num_tokens": 345780782.0, + "step": 2064 + }, + { + "entropy": 1.7282683352629344, + "epoch": 0.22685452198511438, + "grad_norm": 0.6660979390144348, + "learning_rate": 1.967422809460861e-05, + "loss": 1.4675, + "mean_token_accuracy": 0.6451341956853867, + "num_tokens": 345949242.0, + "step": 2065 + }, + { + "entropy": 1.6747341950734456, + "epoch": 0.22696437889648732, + "grad_norm": 0.632645308971405, + "learning_rate": 1.9673801072428528e-05, + "loss": 1.402, + "mean_token_accuracy": 0.6539940188328425, + "num_tokens": 346131283.0, + "step": 2066 + }, + { + "entropy": 1.823501318693161, + "epoch": 0.22707423580786026, + "grad_norm": 0.8174815773963928, + "learning_rate": 1.967337377572655e-05, + "loss": 1.5313, + "mean_token_accuracy": 0.6354220509529114, + "num_tokens": 346348047.0, + "step": 2067 + }, + { + "entropy": 1.7198161383469899, + "epoch": 0.2271840927192332, + "grad_norm": 0.6577022671699524, + "learning_rate": 1.96729462045162e-05, + "loss": 1.3033, + "mean_token_accuracy": 0.6620743771394094, + "num_tokens": 346500237.0, + "step": 2068 + }, + { + "entropy": 1.7309946616490681, + "epoch": 0.22729394963060615, + "grad_norm": 0.641176164150238, + "learning_rate": 1.967251835881101e-05, + "loss": 1.4701, + "mean_token_accuracy": 0.6394395977258682, + "num_tokens": 346672848.0, + "step": 2069 + }, + { + "entropy": 1.758286048968633, + "epoch": 0.22740380654197906, + "grad_norm": 0.7229630947113037, + "learning_rate": 1.967209023862452e-05, + "loss": 1.3428, + "mean_token_accuracy": 0.6649421552817026, + "num_tokens": 346808156.0, + "step": 2070 + }, + { + "entropy": 1.6775102814038594, + "epoch": 0.227513663453352, + "grad_norm": 0.6443026661872864, + "learning_rate": 1.9671661843970283e-05, + "loss": 1.4133, + "mean_token_accuracy": 0.6403845498959223, + "num_tokens": 346997057.0, + "step": 2071 + }, + { + "entropy": 1.7017942468325298, + "epoch": 0.22762352036472494, + "grad_norm": 0.7220076322555542, + "learning_rate": 1.967123317486186e-05, + "loss": 1.4647, + "mean_token_accuracy": 0.6385684708754221, + "num_tokens": 347201593.0, + "step": 2072 + }, + { + "entropy": 1.729232649008433, + "epoch": 0.22773337727609788, + "grad_norm": 0.7669538259506226, + "learning_rate": 1.967080423131281e-05, + "loss": 1.28, + "mean_token_accuracy": 0.6778450608253479, + "num_tokens": 347340620.0, + "step": 2073 + }, + { + "entropy": 1.7673300007979076, + "epoch": 0.22784323418747082, + "grad_norm": 0.7315675616264343, + "learning_rate": 1.9670375013336716e-05, + "loss": 1.3434, + "mean_token_accuracy": 0.661798839767774, + "num_tokens": 347537266.0, + "step": 2074 + }, + { + "entropy": 1.7187786897023518, + "epoch": 0.22795309109884376, + "grad_norm": 0.6596832275390625, + "learning_rate": 1.966994552094716e-05, + "loss": 1.372, + "mean_token_accuracy": 0.6497376809517542, + "num_tokens": 347677741.0, + "step": 2075 + }, + { + "entropy": 1.6955298682053883, + "epoch": 0.2280629480102167, + "grad_norm": 0.698900580406189, + "learning_rate": 1.9669515754157732e-05, + "loss": 1.3391, + "mean_token_accuracy": 0.6492668986320496, + "num_tokens": 347890197.0, + "step": 2076 + }, + { + "entropy": 1.7692882418632507, + "epoch": 0.22817280492158962, + "grad_norm": 0.669717013835907, + "learning_rate": 1.9669085712982038e-05, + "loss": 1.6084, + "mean_token_accuracy": 0.6258754978577296, + "num_tokens": 348110513.0, + "step": 2077 + }, + { + "entropy": 1.7276891966660817, + "epoch": 0.22828266183296256, + "grad_norm": 0.6462763547897339, + "learning_rate": 1.966865539743369e-05, + "loss": 1.3711, + "mean_token_accuracy": 0.6671418696641922, + "num_tokens": 348238587.0, + "step": 2078 + }, + { + "entropy": 1.708987295627594, + "epoch": 0.2283925187443355, + "grad_norm": 0.7623677253723145, + "learning_rate": 1.9668224807526306e-05, + "loss": 1.3025, + "mean_token_accuracy": 0.6712601681550344, + "num_tokens": 348356342.0, + "step": 2079 + }, + { + "entropy": 1.6536230842272441, + "epoch": 0.22850237565570844, + "grad_norm": 0.5909548997879028, + "learning_rate": 1.9667793943273507e-05, + "loss": 1.3718, + "mean_token_accuracy": 0.6658143649498621, + "num_tokens": 348573661.0, + "step": 2080 + }, + { + "entropy": 1.6918116807937622, + "epoch": 0.22861223256708138, + "grad_norm": 0.6436260938644409, + "learning_rate": 1.966736280468894e-05, + "loss": 1.4668, + "mean_token_accuracy": 0.6581264610091845, + "num_tokens": 348730332.0, + "step": 2081 + }, + { + "entropy": 1.7079047163327534, + "epoch": 0.22872208947845432, + "grad_norm": 0.680583119392395, + "learning_rate": 1.966693139178624e-05, + "loss": 1.353, + "mean_token_accuracy": 0.6601024568080902, + "num_tokens": 348882457.0, + "step": 2082 + }, + { + "entropy": 1.655688891808192, + "epoch": 0.22883194638982726, + "grad_norm": 0.6544623970985413, + "learning_rate": 1.9666499704579074e-05, + "loss": 1.3916, + "mean_token_accuracy": 0.6687876433134079, + "num_tokens": 349018470.0, + "step": 2083 + }, + { + "entropy": 1.6593633393446605, + "epoch": 0.22894180330120018, + "grad_norm": 0.7558723092079163, + "learning_rate": 1.9666067743081094e-05, + "loss": 1.2681, + "mean_token_accuracy": 0.681985874970754, + "num_tokens": 349155041.0, + "step": 2084 + }, + { + "entropy": 1.755085527896881, + "epoch": 0.22905166021257312, + "grad_norm": 0.7257434725761414, + "learning_rate": 1.9665635507305975e-05, + "loss": 1.368, + "mean_token_accuracy": 0.6560467928647995, + "num_tokens": 349303770.0, + "step": 2085 + }, + { + "entropy": 1.7510084907213848, + "epoch": 0.22916151712394606, + "grad_norm": 0.7767149209976196, + "learning_rate": 1.9665202997267398e-05, + "loss": 1.33, + "mean_token_accuracy": 0.666937862833341, + "num_tokens": 349421045.0, + "step": 2086 + }, + { + "entropy": 1.7155856092770894, + "epoch": 0.229271374035319, + "grad_norm": 0.6380376219749451, + "learning_rate": 1.9664770212979048e-05, + "loss": 1.3557, + "mean_token_accuracy": 0.6549844940503439, + "num_tokens": 349573846.0, + "step": 2087 + }, + { + "entropy": 1.7313959101835887, + "epoch": 0.22938123094669194, + "grad_norm": 0.7793395519256592, + "learning_rate": 1.966433715445463e-05, + "loss": 1.4454, + "mean_token_accuracy": 0.6458447525898615, + "num_tokens": 349733574.0, + "step": 2088 + }, + { + "entropy": 1.7679253121217091, + "epoch": 0.22949108785806488, + "grad_norm": 0.7481524348258972, + "learning_rate": 1.9663903821707843e-05, + "loss": 1.4977, + "mean_token_accuracy": 0.646313488483429, + "num_tokens": 349914696.0, + "step": 2089 + }, + { + "entropy": 1.7201211750507355, + "epoch": 0.22960094476943782, + "grad_norm": 0.6787753701210022, + "learning_rate": 1.9663470214752404e-05, + "loss": 1.2731, + "mean_token_accuracy": 0.6699012964963913, + "num_tokens": 350019115.0, + "step": 2090 + }, + { + "entropy": 1.744313398996989, + "epoch": 0.22971080168081073, + "grad_norm": 0.8407695293426514, + "learning_rate": 1.966303633360204e-05, + "loss": 1.5798, + "mean_token_accuracy": 0.6501995325088501, + "num_tokens": 350204871.0, + "step": 2091 + }, + { + "entropy": 1.66973876953125, + "epoch": 0.22982065859218367, + "grad_norm": 0.6919146180152893, + "learning_rate": 1.9662602178270473e-05, + "loss": 1.4768, + "mean_token_accuracy": 0.6421976536512375, + "num_tokens": 350414385.0, + "step": 2092 + }, + { + "entropy": 1.710203657547633, + "epoch": 0.22993051550355662, + "grad_norm": 0.6580731272697449, + "learning_rate": 1.9662167748771456e-05, + "loss": 1.4226, + "mean_token_accuracy": 0.6441041479508082, + "num_tokens": 350563611.0, + "step": 2093 + }, + { + "entropy": 1.7262790699799855, + "epoch": 0.23004037241492956, + "grad_norm": 0.6261684894561768, + "learning_rate": 1.966173304511873e-05, + "loss": 1.3752, + "mean_token_accuracy": 0.6569176514943441, + "num_tokens": 350733461.0, + "step": 2094 + }, + { + "entropy": 1.7107460002104442, + "epoch": 0.2301502293263025, + "grad_norm": 0.722224235534668, + "learning_rate": 1.9661298067326057e-05, + "loss": 1.4182, + "mean_token_accuracy": 0.6536510388056437, + "num_tokens": 350896912.0, + "step": 2095 + }, + { + "entropy": 1.7100590467453003, + "epoch": 0.23026008623767544, + "grad_norm": 0.9301192760467529, + "learning_rate": 1.9660862815407203e-05, + "loss": 1.503, + "mean_token_accuracy": 0.6540075987577438, + "num_tokens": 351056942.0, + "step": 2096 + }, + { + "entropy": 1.6752377649148305, + "epoch": 0.23036994314904835, + "grad_norm": 0.6664115190505981, + "learning_rate": 1.9660427289375945e-05, + "loss": 1.4077, + "mean_token_accuracy": 0.6520341883103052, + "num_tokens": 351293001.0, + "step": 2097 + }, + { + "entropy": 1.7409682472546895, + "epoch": 0.2304798000604213, + "grad_norm": 0.6675509810447693, + "learning_rate": 1.965999148924606e-05, + "loss": 1.5078, + "mean_token_accuracy": 0.6427519768476486, + "num_tokens": 351525219.0, + "step": 2098 + }, + { + "entropy": 1.7048235932985942, + "epoch": 0.23058965697179423, + "grad_norm": 0.6759990453720093, + "learning_rate": 1.9659555415031352e-05, + "loss": 1.4459, + "mean_token_accuracy": 0.6511163661877314, + "num_tokens": 351698289.0, + "step": 2099 + }, + { + "entropy": 1.7245367964108784, + "epoch": 0.23069951388316717, + "grad_norm": 0.6386352181434631, + "learning_rate": 1.965911906674562e-05, + "loss": 1.477, + "mean_token_accuracy": 0.646999349196752, + "num_tokens": 351893291.0, + "step": 2100 + }, + { + "entropy": 1.6794813175996144, + "epoch": 0.23080937079454011, + "grad_norm": 0.7493569850921631, + "learning_rate": 1.9658682444402666e-05, + "loss": 1.0951, + "mean_token_accuracy": 0.6764346112807592, + "num_tokens": 352074217.0, + "step": 2101 + }, + { + "entropy": 1.6989248593648274, + "epoch": 0.23091922770591305, + "grad_norm": 0.6547640562057495, + "learning_rate": 1.9658245548016314e-05, + "loss": 1.3522, + "mean_token_accuracy": 0.661203866203626, + "num_tokens": 352207973.0, + "step": 2102 + }, + { + "entropy": 1.7575849791367848, + "epoch": 0.231029084617286, + "grad_norm": 0.75108802318573, + "learning_rate": 1.9657808377600395e-05, + "loss": 1.3971, + "mean_token_accuracy": 0.6579893529415131, + "num_tokens": 352402101.0, + "step": 2103 + }, + { + "entropy": 1.6805921494960785, + "epoch": 0.2311389415286589, + "grad_norm": 0.6265072226524353, + "learning_rate": 1.965737093316874e-05, + "loss": 1.246, + "mean_token_accuracy": 0.6822475343942642, + "num_tokens": 352530008.0, + "step": 2104 + }, + { + "entropy": 1.7218637466430664, + "epoch": 0.23124879844003185, + "grad_norm": 0.7253437042236328, + "learning_rate": 1.96569332147352e-05, + "loss": 1.3307, + "mean_token_accuracy": 0.6561064024766287, + "num_tokens": 352674554.0, + "step": 2105 + }, + { + "entropy": 1.6980949739615123, + "epoch": 0.2313586553514048, + "grad_norm": 0.5350558757781982, + "learning_rate": 1.965649522231362e-05, + "loss": 1.4637, + "mean_token_accuracy": 0.6409613688786825, + "num_tokens": 352954153.0, + "step": 2106 + }, + { + "entropy": 1.6509924431641896, + "epoch": 0.23146851226277773, + "grad_norm": 0.6137299537658691, + "learning_rate": 1.965605695591787e-05, + "loss": 1.3385, + "mean_token_accuracy": 0.6657126645247141, + "num_tokens": 353113713.0, + "step": 2107 + }, + { + "entropy": 1.690576394399007, + "epoch": 0.23157836917415067, + "grad_norm": 0.6963815689086914, + "learning_rate": 1.9655618415561816e-05, + "loss": 1.3486, + "mean_token_accuracy": 0.660049964984258, + "num_tokens": 353263381.0, + "step": 2108 + }, + { + "entropy": 1.7236856520175934, + "epoch": 0.2316882260855236, + "grad_norm": 0.678913950920105, + "learning_rate": 1.965517960125934e-05, + "loss": 1.3219, + "mean_token_accuracy": 0.6630937109390894, + "num_tokens": 353390958.0, + "step": 2109 + }, + { + "entropy": 1.8335295915603638, + "epoch": 0.23179808299689655, + "grad_norm": 0.8892464637756348, + "learning_rate": 1.965474051302433e-05, + "loss": 1.4938, + "mean_token_accuracy": 0.6548814475536346, + "num_tokens": 353538619.0, + "step": 2110 + }, + { + "entropy": 1.7106240193049114, + "epoch": 0.23190793990826947, + "grad_norm": 0.7260220646858215, + "learning_rate": 1.965430115087068e-05, + "loss": 1.3023, + "mean_token_accuracy": 0.6652000844478607, + "num_tokens": 353691828.0, + "step": 2111 + }, + { + "entropy": 1.7223450640837352, + "epoch": 0.2320177968196424, + "grad_norm": 0.6633872389793396, + "learning_rate": 1.9653861514812305e-05, + "loss": 1.439, + "mean_token_accuracy": 0.6730594833691915, + "num_tokens": 353900581.0, + "step": 2112 + }, + { + "entropy": 1.6289326747258503, + "epoch": 0.23212765373101535, + "grad_norm": 0.6929929256439209, + "learning_rate": 1.965342160486311e-05, + "loss": 1.3896, + "mean_token_accuracy": 0.6601720203955969, + "num_tokens": 354106680.0, + "step": 2113 + }, + { + "entropy": 1.6527433395385742, + "epoch": 0.2322375106423883, + "grad_norm": 0.6912239193916321, + "learning_rate": 1.9652981421037016e-05, + "loss": 1.3321, + "mean_token_accuracy": 0.6719396263360977, + "num_tokens": 354240940.0, + "step": 2114 + }, + { + "entropy": 1.7169695695241292, + "epoch": 0.23234736755376123, + "grad_norm": 0.682310938835144, + "learning_rate": 1.965254096334796e-05, + "loss": 1.3884, + "mean_token_accuracy": 0.6444130092859268, + "num_tokens": 354378597.0, + "step": 2115 + }, + { + "entropy": 1.794925073782603, + "epoch": 0.23245722446513417, + "grad_norm": 0.8051143288612366, + "learning_rate": 1.9652100231809886e-05, + "loss": 1.5086, + "mean_token_accuracy": 0.6416242470343908, + "num_tokens": 354547066.0, + "step": 2116 + }, + { + "entropy": 1.7723517616589863, + "epoch": 0.2325670813765071, + "grad_norm": 0.8096024990081787, + "learning_rate": 1.9651659226436736e-05, + "loss": 1.4075, + "mean_token_accuracy": 0.6405593703190485, + "num_tokens": 354728773.0, + "step": 2117 + }, + { + "entropy": 1.7235789597034454, + "epoch": 0.23267693828788003, + "grad_norm": 0.7707192897796631, + "learning_rate": 1.965121794724247e-05, + "loss": 1.6905, + "mean_token_accuracy": 0.6104727784792582, + "num_tokens": 354986725.0, + "step": 2118 + }, + { + "entropy": 1.6637963851292927, + "epoch": 0.23278679519925297, + "grad_norm": 0.7707030177116394, + "learning_rate": 1.9650776394241053e-05, + "loss": 1.244, + "mean_token_accuracy": 0.6805417140324911, + "num_tokens": 355113452.0, + "step": 2119 + }, + { + "entropy": 1.7366061508655548, + "epoch": 0.2328966521106259, + "grad_norm": 0.7351778149604797, + "learning_rate": 1.9650334567446464e-05, + "loss": 1.2731, + "mean_token_accuracy": 0.6736204822858175, + "num_tokens": 355213687.0, + "step": 2120 + }, + { + "entropy": 1.7788714170455933, + "epoch": 0.23300650902199885, + "grad_norm": 0.7859005928039551, + "learning_rate": 1.964989246687268e-05, + "loss": 1.3793, + "mean_token_accuracy": 0.651082048813502, + "num_tokens": 355321358.0, + "step": 2121 + }, + { + "entropy": 1.737849046786626, + "epoch": 0.2331163659333718, + "grad_norm": 0.7442585825920105, + "learning_rate": 1.96494500925337e-05, + "loss": 1.311, + "mean_token_accuracy": 0.6615156581004461, + "num_tokens": 355425618.0, + "step": 2122 + }, + { + "entropy": 1.7283195753892262, + "epoch": 0.23322622284474473, + "grad_norm": 0.638728141784668, + "learning_rate": 1.964900744444352e-05, + "loss": 1.4032, + "mean_token_accuracy": 0.6568774382273356, + "num_tokens": 355597017.0, + "step": 2123 + }, + { + "entropy": 1.6358279883861542, + "epoch": 0.23333607975611767, + "grad_norm": 0.6525757312774658, + "learning_rate": 1.9648564522616156e-05, + "loss": 1.2853, + "mean_token_accuracy": 0.6706103881200155, + "num_tokens": 355760536.0, + "step": 2124 + }, + { + "entropy": 1.737657070159912, + "epoch": 0.23344593666749058, + "grad_norm": 0.6344397664070129, + "learning_rate": 1.9648121327065618e-05, + "loss": 1.4552, + "mean_token_accuracy": 0.6427590002616247, + "num_tokens": 355950982.0, + "step": 2125 + }, + { + "entropy": 1.7366611162821453, + "epoch": 0.23355579357886352, + "grad_norm": 0.9384574294090271, + "learning_rate": 1.964767785780594e-05, + "loss": 1.3528, + "mean_token_accuracy": 0.6612386802832285, + "num_tokens": 356072860.0, + "step": 2126 + }, + { + "entropy": 1.696535994609197, + "epoch": 0.23366565049023647, + "grad_norm": 0.7266330718994141, + "learning_rate": 1.9647234114851152e-05, + "loss": 1.275, + "mean_token_accuracy": 0.6714907536904017, + "num_tokens": 356205844.0, + "step": 2127 + }, + { + "entropy": 1.7079964379469554, + "epoch": 0.2337755074016094, + "grad_norm": 0.7104028463363647, + "learning_rate": 1.9646790098215302e-05, + "loss": 1.4331, + "mean_token_accuracy": 0.6543066402276357, + "num_tokens": 356376146.0, + "step": 2128 + }, + { + "entropy": 1.6866773664951324, + "epoch": 0.23388536431298235, + "grad_norm": 1.0117084980010986, + "learning_rate": 1.964634580791244e-05, + "loss": 1.3217, + "mean_token_accuracy": 0.6756196220715841, + "num_tokens": 356535528.0, + "step": 2129 + }, + { + "entropy": 1.7522582213083904, + "epoch": 0.2339952212243553, + "grad_norm": 0.6908559799194336, + "learning_rate": 1.964590124395663e-05, + "loss": 1.4413, + "mean_token_accuracy": 0.6561037798722585, + "num_tokens": 356697797.0, + "step": 2130 + }, + { + "entropy": 1.7344152132670085, + "epoch": 0.2341050781357282, + "grad_norm": 0.6250041127204895, + "learning_rate": 1.9645456406361945e-05, + "loss": 1.4769, + "mean_token_accuracy": 0.6356958995262781, + "num_tokens": 356880210.0, + "step": 2131 + }, + { + "entropy": 1.76920285820961, + "epoch": 0.23421493504710114, + "grad_norm": 0.7835322618484497, + "learning_rate": 1.9645011295142456e-05, + "loss": 1.5479, + "mean_token_accuracy": 0.6402261257171631, + "num_tokens": 357043606.0, + "step": 2132 + }, + { + "entropy": 1.6776104768117268, + "epoch": 0.23432479195847408, + "grad_norm": 0.745183527469635, + "learning_rate": 1.9644565910312257e-05, + "loss": 1.4785, + "mean_token_accuracy": 0.6556303252776464, + "num_tokens": 357235188.0, + "step": 2133 + }, + { + "entropy": 1.7570060988267262, + "epoch": 0.23443464886984702, + "grad_norm": 1.126607060432434, + "learning_rate": 1.9644120251885442e-05, + "loss": 1.4158, + "mean_token_accuracy": 0.6616190771261851, + "num_tokens": 357369106.0, + "step": 2134 + }, + { + "entropy": 1.7477031548817952, + "epoch": 0.23454450578121996, + "grad_norm": 0.5550586581230164, + "learning_rate": 1.9643674319876116e-05, + "loss": 1.5428, + "mean_token_accuracy": 0.6342013676961263, + "num_tokens": 357630109.0, + "step": 2135 + }, + { + "entropy": 1.7409445345401764, + "epoch": 0.2346543626925929, + "grad_norm": 0.6409521102905273, + "learning_rate": 1.9643228114298394e-05, + "loss": 1.3992, + "mean_token_accuracy": 0.6574912518262863, + "num_tokens": 357876601.0, + "step": 2136 + }, + { + "entropy": 1.755222608645757, + "epoch": 0.23476421960396585, + "grad_norm": 0.8085409998893738, + "learning_rate": 1.9642781635166394e-05, + "loss": 1.3566, + "mean_token_accuracy": 0.6657391488552094, + "num_tokens": 358031405.0, + "step": 2137 + }, + { + "entropy": 1.7329784830411274, + "epoch": 0.23487407651533876, + "grad_norm": 0.7524353265762329, + "learning_rate": 1.9642334882494252e-05, + "loss": 1.2204, + "mean_token_accuracy": 0.6737546324729919, + "num_tokens": 358140931.0, + "step": 2138 + }, + { + "entropy": 1.7543505827585857, + "epoch": 0.2349839334267117, + "grad_norm": 0.673270046710968, + "learning_rate": 1.9641887856296103e-05, + "loss": 1.4576, + "mean_token_accuracy": 0.6485131829977036, + "num_tokens": 358374623.0, + "step": 2139 + }, + { + "entropy": 1.7349696457386017, + "epoch": 0.23509379033808464, + "grad_norm": 0.6543999314308167, + "learning_rate": 1.9641440556586103e-05, + "loss": 1.2942, + "mean_token_accuracy": 0.6671332617600759, + "num_tokens": 358489422.0, + "step": 2140 + }, + { + "entropy": 1.7005629340807598, + "epoch": 0.23520364724945758, + "grad_norm": 0.6191766262054443, + "learning_rate": 1.9640992983378396e-05, + "loss": 1.4521, + "mean_token_accuracy": 0.6530391176541647, + "num_tokens": 358675459.0, + "step": 2141 + }, + { + "entropy": 1.7502335608005524, + "epoch": 0.23531350416083052, + "grad_norm": 0.7959282994270325, + "learning_rate": 1.9640545136687163e-05, + "loss": 1.5671, + "mean_token_accuracy": 0.6383850276470184, + "num_tokens": 358841502.0, + "step": 2142 + }, + { + "entropy": 1.7522724668184917, + "epoch": 0.23542336107220346, + "grad_norm": 0.8718724846839905, + "learning_rate": 1.9640097016526562e-05, + "loss": 1.5196, + "mean_token_accuracy": 0.6397239714860916, + "num_tokens": 359026040.0, + "step": 2143 + }, + { + "entropy": 1.7374042570590973, + "epoch": 0.2355332179835764, + "grad_norm": 0.6595825552940369, + "learning_rate": 1.9639648622910786e-05, + "loss": 1.4752, + "mean_token_accuracy": 0.6529039045174917, + "num_tokens": 359254305.0, + "step": 2144 + }, + { + "entropy": 1.643284171819687, + "epoch": 0.23564307489494932, + "grad_norm": 0.6858879327774048, + "learning_rate": 1.963919995585403e-05, + "loss": 1.3472, + "mean_token_accuracy": 0.6681111405293146, + "num_tokens": 359394696.0, + "step": 2145 + }, + { + "entropy": 1.722949246565501, + "epoch": 0.23575293180632226, + "grad_norm": 0.687818706035614, + "learning_rate": 1.9638751015370482e-05, + "loss": 1.2756, + "mean_token_accuracy": 0.6662940879662832, + "num_tokens": 359529099.0, + "step": 2146 + }, + { + "entropy": 1.6889635523160298, + "epoch": 0.2358627887176952, + "grad_norm": 0.643974781036377, + "learning_rate": 1.963830180147436e-05, + "loss": 1.4641, + "mean_token_accuracy": 0.651663934191068, + "num_tokens": 359724038.0, + "step": 2147 + }, + { + "entropy": 1.6901886363824208, + "epoch": 0.23597264562906814, + "grad_norm": 0.5987870097160339, + "learning_rate": 1.9637852314179874e-05, + "loss": 1.4741, + "mean_token_accuracy": 0.6606296797593435, + "num_tokens": 359880863.0, + "step": 2148 + }, + { + "entropy": 1.716208666563034, + "epoch": 0.23608250254044108, + "grad_norm": 0.7272748351097107, + "learning_rate": 1.963740255350126e-05, + "loss": 1.5086, + "mean_token_accuracy": 0.6366155793269476, + "num_tokens": 360047860.0, + "step": 2149 + }, + { + "entropy": 1.7702193856239319, + "epoch": 0.23619235945181402, + "grad_norm": 0.664681613445282, + "learning_rate": 1.9636952519452744e-05, + "loss": 1.3891, + "mean_token_accuracy": 0.6544702003399531, + "num_tokens": 360202676.0, + "step": 2150 + }, + { + "entropy": 1.7394584218660991, + "epoch": 0.23630221636318696, + "grad_norm": 0.6830300688743591, + "learning_rate": 1.9636502212048572e-05, + "loss": 1.3563, + "mean_token_accuracy": 0.6578590472539266, + "num_tokens": 360355267.0, + "step": 2151 + }, + { + "entropy": 1.7491403818130493, + "epoch": 0.23641207327455988, + "grad_norm": 0.7265613675117493, + "learning_rate": 1.9636051631303e-05, + "loss": 1.477, + "mean_token_accuracy": 0.6517923126618067, + "num_tokens": 360478966.0, + "step": 2152 + }, + { + "entropy": 1.7051890293757122, + "epoch": 0.23652193018593282, + "grad_norm": 0.7303532361984253, + "learning_rate": 1.9635600777230282e-05, + "loss": 1.3862, + "mean_token_accuracy": 0.6609994073708853, + "num_tokens": 360660424.0, + "step": 2153 + }, + { + "entropy": 1.7975957592328389, + "epoch": 0.23663178709730576, + "grad_norm": 0.5887877345085144, + "learning_rate": 1.9635149649844692e-05, + "loss": 1.5907, + "mean_token_accuracy": 0.6157426983118057, + "num_tokens": 360924995.0, + "step": 2154 + }, + { + "entropy": 1.6898219386736553, + "epoch": 0.2367416440086787, + "grad_norm": 0.6663623452186584, + "learning_rate": 1.963469824916051e-05, + "loss": 1.3366, + "mean_token_accuracy": 0.6628438780705134, + "num_tokens": 361077147.0, + "step": 2155 + }, + { + "entropy": 1.6904780368010204, + "epoch": 0.23685150092005164, + "grad_norm": 0.6058080196380615, + "learning_rate": 1.9634246575192016e-05, + "loss": 1.5434, + "mean_token_accuracy": 0.6356114248434702, + "num_tokens": 361294273.0, + "step": 2156 + }, + { + "entropy": 1.7322813769181569, + "epoch": 0.23696135783142458, + "grad_norm": 0.6489390730857849, + "learning_rate": 1.963379462795351e-05, + "loss": 1.2153, + "mean_token_accuracy": 0.6865619271993637, + "num_tokens": 361438046.0, + "step": 2157 + }, + { + "entropy": 1.7179724077383678, + "epoch": 0.2370712147427975, + "grad_norm": 0.8556140661239624, + "learning_rate": 1.9633342407459293e-05, + "loss": 1.4153, + "mean_token_accuracy": 0.6623386641343435, + "num_tokens": 361588264.0, + "step": 2158 + }, + { + "entropy": 1.728331635395686, + "epoch": 0.23718107165417043, + "grad_norm": 0.8043140172958374, + "learning_rate": 1.963288991372368e-05, + "loss": 1.4934, + "mean_token_accuracy": 0.6493467340866724, + "num_tokens": 361753598.0, + "step": 2159 + }, + { + "entropy": 1.732284684975942, + "epoch": 0.23729092856554337, + "grad_norm": 0.7125091552734375, + "learning_rate": 1.963243714676099e-05, + "loss": 1.4851, + "mean_token_accuracy": 0.6488629430532455, + "num_tokens": 361902073.0, + "step": 2160 + }, + { + "entropy": 1.78290989001592, + "epoch": 0.23740078547691632, + "grad_norm": 0.7010405659675598, + "learning_rate": 1.9631984106585555e-05, + "loss": 1.3971, + "mean_token_accuracy": 0.6474642306566238, + "num_tokens": 362076912.0, + "step": 2161 + }, + { + "entropy": 1.7021795014540355, + "epoch": 0.23751064238828926, + "grad_norm": 0.6787356734275818, + "learning_rate": 1.9631530793211714e-05, + "loss": 1.4077, + "mean_token_accuracy": 0.648701603213946, + "num_tokens": 362238050.0, + "step": 2162 + }, + { + "entropy": 1.7377806107203166, + "epoch": 0.2376204992996622, + "grad_norm": 0.5525624752044678, + "learning_rate": 1.9631077206653813e-05, + "loss": 1.3933, + "mean_token_accuracy": 0.6540517012278239, + "num_tokens": 362480190.0, + "step": 2163 + }, + { + "entropy": 1.7550107041994731, + "epoch": 0.23773035621103514, + "grad_norm": 0.7044647336006165, + "learning_rate": 1.9630623346926204e-05, + "loss": 1.5253, + "mean_token_accuracy": 0.635568325718244, + "num_tokens": 362670764.0, + "step": 2164 + }, + { + "entropy": 1.7480690081914265, + "epoch": 0.23784021312240805, + "grad_norm": 0.7633349895477295, + "learning_rate": 1.9630169214043256e-05, + "loss": 1.4851, + "mean_token_accuracy": 0.6614658435185751, + "num_tokens": 362833342.0, + "step": 2165 + }, + { + "entropy": 1.7279286682605743, + "epoch": 0.237950070033781, + "grad_norm": 0.6929425597190857, + "learning_rate": 1.9629714808019346e-05, + "loss": 1.3254, + "mean_token_accuracy": 0.668560599287351, + "num_tokens": 363009578.0, + "step": 2166 + }, + { + "entropy": 1.6972400446732838, + "epoch": 0.23805992694515393, + "grad_norm": 0.6299599409103394, + "learning_rate": 1.9629260128868845e-05, + "loss": 1.4519, + "mean_token_accuracy": 0.6551623294750849, + "num_tokens": 363187419.0, + "step": 2167 + }, + { + "entropy": 1.7553909023602803, + "epoch": 0.23816978385652687, + "grad_norm": 0.8156332969665527, + "learning_rate": 1.9628805176606154e-05, + "loss": 1.4541, + "mean_token_accuracy": 0.647150124112765, + "num_tokens": 363328352.0, + "step": 2168 + }, + { + "entropy": 1.7071846425533295, + "epoch": 0.23827964076789981, + "grad_norm": 0.856622040271759, + "learning_rate": 1.9628349951245664e-05, + "loss": 1.3319, + "mean_token_accuracy": 0.6602184077103933, + "num_tokens": 363489818.0, + "step": 2169 + }, + { + "entropy": 1.739296982685725, + "epoch": 0.23838949767927275, + "grad_norm": 0.6858103275299072, + "learning_rate": 1.962789445280179e-05, + "loss": 1.2651, + "mean_token_accuracy": 0.6659305195013682, + "num_tokens": 363619436.0, + "step": 2170 + }, + { + "entropy": 1.6668515503406525, + "epoch": 0.2384993545906457, + "grad_norm": 0.6311803460121155, + "learning_rate": 1.962743868128894e-05, + "loss": 1.4142, + "mean_token_accuracy": 0.6425204674402872, + "num_tokens": 363811652.0, + "step": 2171 + }, + { + "entropy": 1.7442655265331268, + "epoch": 0.2386092115020186, + "grad_norm": 0.6944742202758789, + "learning_rate": 1.9626982636721545e-05, + "loss": 1.4477, + "mean_token_accuracy": 0.6392138053973516, + "num_tokens": 363977169.0, + "step": 2172 + }, + { + "entropy": 1.7125125726064045, + "epoch": 0.23871906841339155, + "grad_norm": 0.7651383876800537, + "learning_rate": 1.9626526319114036e-05, + "loss": 1.3283, + "mean_token_accuracy": 0.6706061810255051, + "num_tokens": 364116621.0, + "step": 2173 + }, + { + "entropy": 1.7354065477848053, + "epoch": 0.2388289253247645, + "grad_norm": 0.6315815448760986, + "learning_rate": 1.9626069728480858e-05, + "loss": 1.443, + "mean_token_accuracy": 0.6499165147542953, + "num_tokens": 364276980.0, + "step": 2174 + }, + { + "entropy": 1.735385040442149, + "epoch": 0.23893878223613743, + "grad_norm": 0.7463067173957825, + "learning_rate": 1.962561286483646e-05, + "loss": 1.3995, + "mean_token_accuracy": 0.6562738716602325, + "num_tokens": 364459519.0, + "step": 2175 + }, + { + "entropy": 1.7214009960492451, + "epoch": 0.23904863914751037, + "grad_norm": 0.7005197405815125, + "learning_rate": 1.9625155728195302e-05, + "loss": 1.3058, + "mean_token_accuracy": 0.666901707649231, + "num_tokens": 364587858.0, + "step": 2176 + }, + { + "entropy": 1.712921271721522, + "epoch": 0.2391584960588833, + "grad_norm": 0.6100747585296631, + "learning_rate": 1.962469831857185e-05, + "loss": 1.4452, + "mean_token_accuracy": 0.6609046856562296, + "num_tokens": 364753196.0, + "step": 2177 + }, + { + "entropy": 1.7237416009108226, + "epoch": 0.23926835297025625, + "grad_norm": 0.7957034111022949, + "learning_rate": 1.9624240635980584e-05, + "loss": 1.2262, + "mean_token_accuracy": 0.674635981520017, + "num_tokens": 364910423.0, + "step": 2178 + }, + { + "entropy": 1.700117399295171, + "epoch": 0.23937820988162917, + "grad_norm": 0.6685667037963867, + "learning_rate": 1.9623782680435987e-05, + "loss": 1.5193, + "mean_token_accuracy": 0.6441571215788523, + "num_tokens": 365077021.0, + "step": 2179 + }, + { + "entropy": 1.7847689390182495, + "epoch": 0.2394880667930021, + "grad_norm": 0.7147764563560486, + "learning_rate": 1.9623324451952553e-05, + "loss": 1.5083, + "mean_token_accuracy": 0.6502701590458552, + "num_tokens": 365244591.0, + "step": 2180 + }, + { + "entropy": 1.7398345371087391, + "epoch": 0.23959792370437505, + "grad_norm": 0.8563188910484314, + "learning_rate": 1.962286595054479e-05, + "loss": 1.3004, + "mean_token_accuracy": 0.6682943751414617, + "num_tokens": 365376513.0, + "step": 2181 + }, + { + "entropy": 1.7507529258728027, + "epoch": 0.239707780615748, + "grad_norm": 0.693526566028595, + "learning_rate": 1.9622407176227203e-05, + "loss": 1.2758, + "mean_token_accuracy": 0.6713242183128992, + "num_tokens": 365498732.0, + "step": 2182 + }, + { + "entropy": 1.7492507894833882, + "epoch": 0.23981763752712093, + "grad_norm": 0.6851478219032288, + "learning_rate": 1.9621948129014313e-05, + "loss": 1.4017, + "mean_token_accuracy": 0.6537040372689565, + "num_tokens": 365646829.0, + "step": 2183 + }, + { + "entropy": 1.6749079823493958, + "epoch": 0.23992749443849387, + "grad_norm": 0.6201784610748291, + "learning_rate": 1.962148880892065e-05, + "loss": 1.283, + "mean_token_accuracy": 0.6736765950918198, + "num_tokens": 365806583.0, + "step": 2184 + }, + { + "entropy": 1.709314078092575, + "epoch": 0.2400373513498668, + "grad_norm": 0.758945643901825, + "learning_rate": 1.9621029215960754e-05, + "loss": 1.4324, + "mean_token_accuracy": 0.6545686274766922, + "num_tokens": 365961748.0, + "step": 2185 + }, + { + "entropy": 1.6410066386063893, + "epoch": 0.24014720826123973, + "grad_norm": 0.5668028593063354, + "learning_rate": 1.9620569350149165e-05, + "loss": 1.4312, + "mean_token_accuracy": 0.6502448171377182, + "num_tokens": 366235040.0, + "step": 2186 + }, + { + "entropy": 1.6869204839070637, + "epoch": 0.24025706517261267, + "grad_norm": 0.648259162902832, + "learning_rate": 1.962010921150044e-05, + "loss": 1.4071, + "mean_token_accuracy": 0.653958131869634, + "num_tokens": 366404008.0, + "step": 2187 + }, + { + "entropy": 1.762471745411555, + "epoch": 0.2403669220839856, + "grad_norm": 0.730883002281189, + "learning_rate": 1.9619648800029147e-05, + "loss": 1.507, + "mean_token_accuracy": 0.6531516114870707, + "num_tokens": 366593272.0, + "step": 2188 + }, + { + "entropy": 1.7137305339177449, + "epoch": 0.24047677899535855, + "grad_norm": 0.6646022796630859, + "learning_rate": 1.961918811574985e-05, + "loss": 1.3821, + "mean_token_accuracy": 0.6573305775721868, + "num_tokens": 366745378.0, + "step": 2189 + }, + { + "entropy": 1.724080502986908, + "epoch": 0.2405866359067315, + "grad_norm": 0.7894087433815002, + "learning_rate": 1.9618727158677135e-05, + "loss": 1.2611, + "mean_token_accuracy": 0.6693530778090159, + "num_tokens": 366856467.0, + "step": 2190 + }, + { + "entropy": 1.6957077880700429, + "epoch": 0.24069649281810443, + "grad_norm": 0.7428924441337585, + "learning_rate": 1.9618265928825585e-05, + "loss": 1.478, + "mean_token_accuracy": 0.6377401451269785, + "num_tokens": 367087055.0, + "step": 2191 + }, + { + "entropy": 1.7296242912610371, + "epoch": 0.24080634972947734, + "grad_norm": 0.820216178894043, + "learning_rate": 1.9617804426209806e-05, + "loss": 1.4666, + "mean_token_accuracy": 0.6516137719154358, + "num_tokens": 367269105.0, + "step": 2192 + }, + { + "entropy": 1.7189955015977223, + "epoch": 0.24091620664085028, + "grad_norm": 0.6569713950157166, + "learning_rate": 1.96173426508444e-05, + "loss": 1.3206, + "mean_token_accuracy": 0.6575327118237814, + "num_tokens": 367475382.0, + "step": 2193 + }, + { + "entropy": 1.6480527619520824, + "epoch": 0.24102606355222322, + "grad_norm": 0.6472831964492798, + "learning_rate": 1.961688060274398e-05, + "loss": 1.3378, + "mean_token_accuracy": 0.6720782270034155, + "num_tokens": 367684314.0, + "step": 2194 + }, + { + "entropy": 1.7488195300102234, + "epoch": 0.24113592046359617, + "grad_norm": 0.6956831812858582, + "learning_rate": 1.9616418281923173e-05, + "loss": 1.3187, + "mean_token_accuracy": 0.6687667121489843, + "num_tokens": 367816401.0, + "step": 2195 + }, + { + "entropy": 1.7608485917250316, + "epoch": 0.2412457773749691, + "grad_norm": 0.7031329274177551, + "learning_rate": 1.9615955688396612e-05, + "loss": 1.3447, + "mean_token_accuracy": 0.6478245705366135, + "num_tokens": 367957070.0, + "step": 2196 + }, + { + "entropy": 1.6865754624207814, + "epoch": 0.24135563428634205, + "grad_norm": 0.604003369808197, + "learning_rate": 1.961549282217893e-05, + "loss": 1.4346, + "mean_token_accuracy": 0.650801420211792, + "num_tokens": 368155745.0, + "step": 2197 + }, + { + "entropy": 1.7110398511091869, + "epoch": 0.241465491197715, + "grad_norm": 0.6684320569038391, + "learning_rate": 1.961502968328479e-05, + "loss": 1.3022, + "mean_token_accuracy": 0.6626110126574835, + "num_tokens": 368276371.0, + "step": 2198 + }, + { + "entropy": 1.7122747302055359, + "epoch": 0.2415753481090879, + "grad_norm": 0.6755695343017578, + "learning_rate": 1.9614566271728837e-05, + "loss": 1.3599, + "mean_token_accuracy": 0.648558442791303, + "num_tokens": 368465965.0, + "step": 2199 + }, + { + "entropy": 1.8012695610523224, + "epoch": 0.24168520502046084, + "grad_norm": 0.684968888759613, + "learning_rate": 1.9614102587525747e-05, + "loss": 1.4568, + "mean_token_accuracy": 0.6433351039886475, + "num_tokens": 368636338.0, + "step": 2200 + }, + { + "entropy": 1.7449529965718586, + "epoch": 0.24179506193183378, + "grad_norm": 0.7659276127815247, + "learning_rate": 1.961363863069019e-05, + "loss": 1.5253, + "mean_token_accuracy": 0.633417159318924, + "num_tokens": 368846134.0, + "step": 2201 + }, + { + "entropy": 1.6875906387964885, + "epoch": 0.24190491884320672, + "grad_norm": 0.5927108526229858, + "learning_rate": 1.9613174401236854e-05, + "loss": 1.3356, + "mean_token_accuracy": 0.6597074568271637, + "num_tokens": 369033966.0, + "step": 2202 + }, + { + "entropy": 1.6861758530139923, + "epoch": 0.24201477575457966, + "grad_norm": 0.6627983450889587, + "learning_rate": 1.9612709899180426e-05, + "loss": 1.2911, + "mean_token_accuracy": 0.6747742146253586, + "num_tokens": 369165869.0, + "step": 2203 + }, + { + "entropy": 1.6657120088736217, + "epoch": 0.2421246326659526, + "grad_norm": 0.6609204411506653, + "learning_rate": 1.961224512453561e-05, + "loss": 1.3563, + "mean_token_accuracy": 0.6647985180219015, + "num_tokens": 369339600.0, + "step": 2204 + }, + { + "entropy": 1.7627909282843273, + "epoch": 0.24223448957732555, + "grad_norm": 0.6282365322113037, + "learning_rate": 1.961178007731712e-05, + "loss": 1.4089, + "mean_token_accuracy": 0.6540078123410543, + "num_tokens": 369538959.0, + "step": 2205 + }, + { + "entropy": 1.722537229458491, + "epoch": 0.24234434648869846, + "grad_norm": 0.7731828093528748, + "learning_rate": 1.961131475753967e-05, + "loss": 1.4734, + "mean_token_accuracy": 0.648734783132871, + "num_tokens": 369681113.0, + "step": 2206 + }, + { + "entropy": 1.7212282319863637, + "epoch": 0.2424542034000714, + "grad_norm": 0.8537726402282715, + "learning_rate": 1.9610849165217987e-05, + "loss": 1.4472, + "mean_token_accuracy": 0.6453748544057211, + "num_tokens": 369894724.0, + "step": 2207 + }, + { + "entropy": 1.6924510598182678, + "epoch": 0.24256406031144434, + "grad_norm": 0.6755207777023315, + "learning_rate": 1.9610383300366805e-05, + "loss": 1.4163, + "mean_token_accuracy": 0.6647496223449707, + "num_tokens": 370034227.0, + "step": 2208 + }, + { + "entropy": 1.649043579896291, + "epoch": 0.24267391722281728, + "grad_norm": 0.7188857793807983, + "learning_rate": 1.960991716300088e-05, + "loss": 1.3476, + "mean_token_accuracy": 0.6680084963639578, + "num_tokens": 370255912.0, + "step": 2209 + }, + { + "entropy": 1.6831135253111522, + "epoch": 0.24278377413419022, + "grad_norm": 0.6619141101837158, + "learning_rate": 1.960945075313495e-05, + "loss": 1.4331, + "mean_token_accuracy": 0.652848685781161, + "num_tokens": 370441362.0, + "step": 2210 + }, + { + "entropy": 1.7197281420230865, + "epoch": 0.24289363104556316, + "grad_norm": 0.710666298866272, + "learning_rate": 1.9608984070783783e-05, + "loss": 1.3113, + "mean_token_accuracy": 0.6746131976445516, + "num_tokens": 370578910.0, + "step": 2211 + }, + { + "entropy": 1.6992753148078918, + "epoch": 0.2430034879569361, + "grad_norm": 0.6412252187728882, + "learning_rate": 1.9608517115962155e-05, + "loss": 1.4755, + "mean_token_accuracy": 0.6443726023038229, + "num_tokens": 370759506.0, + "step": 2212 + }, + { + "entropy": 1.7628650764624278, + "epoch": 0.24311334486830902, + "grad_norm": 0.8076620101928711, + "learning_rate": 1.9608049888684834e-05, + "loss": 1.369, + "mean_token_accuracy": 0.6632248312234879, + "num_tokens": 370888869.0, + "step": 2213 + }, + { + "entropy": 1.7496830423672993, + "epoch": 0.24322320177968196, + "grad_norm": 0.6189358234405518, + "learning_rate": 1.9607582388966616e-05, + "loss": 1.3547, + "mean_token_accuracy": 0.6613173534472784, + "num_tokens": 371046402.0, + "step": 2214 + }, + { + "entropy": 1.7203446328639984, + "epoch": 0.2433330586910549, + "grad_norm": 0.6774999499320984, + "learning_rate": 1.960711461682229e-05, + "loss": 1.411, + "mean_token_accuracy": 0.6542786955833435, + "num_tokens": 371304026.0, + "step": 2215 + }, + { + "entropy": 1.6830947597821553, + "epoch": 0.24344291560242784, + "grad_norm": 0.6460142731666565, + "learning_rate": 1.960664657226667e-05, + "loss": 1.3369, + "mean_token_accuracy": 0.6676846394936243, + "num_tokens": 371470000.0, + "step": 2216 + }, + { + "entropy": 1.7166868448257446, + "epoch": 0.24355277251380078, + "grad_norm": 0.6478844285011292, + "learning_rate": 1.960617825531456e-05, + "loss": 1.2987, + "mean_token_accuracy": 0.6614114989837011, + "num_tokens": 371579838.0, + "step": 2217 + }, + { + "entropy": 1.7828513085842133, + "epoch": 0.24366262942517372, + "grad_norm": 1.131135106086731, + "learning_rate": 1.960570966598079e-05, + "loss": 1.5558, + "mean_token_accuracy": 0.6534903893868128, + "num_tokens": 371739465.0, + "step": 2218 + }, + { + "entropy": 1.728717068831126, + "epoch": 0.24377248633654663, + "grad_norm": 0.6150972247123718, + "learning_rate": 1.9605240804280185e-05, + "loss": 1.4331, + "mean_token_accuracy": 0.6512966354688009, + "num_tokens": 371937968.0, + "step": 2219 + }, + { + "entropy": 1.7227512498696644, + "epoch": 0.24388234324791958, + "grad_norm": 0.7741029262542725, + "learning_rate": 1.9604771670227586e-05, + "loss": 1.5728, + "mean_token_accuracy": 0.6268777251243591, + "num_tokens": 372118632.0, + "step": 2220 + }, + { + "entropy": 1.7896570165952046, + "epoch": 0.24399220015929252, + "grad_norm": 0.6654759049415588, + "learning_rate": 1.960430226383784e-05, + "loss": 1.4358, + "mean_token_accuracy": 0.6406222383181254, + "num_tokens": 372328145.0, + "step": 2221 + }, + { + "entropy": 1.7172939280668895, + "epoch": 0.24410205707066546, + "grad_norm": 0.759647011756897, + "learning_rate": 1.9603832585125807e-05, + "loss": 1.472, + "mean_token_accuracy": 0.6392781734466553, + "num_tokens": 372540925.0, + "step": 2222 + }, + { + "entropy": 1.7888973255952199, + "epoch": 0.2442119139820384, + "grad_norm": 0.7657260298728943, + "learning_rate": 1.960336263410635e-05, + "loss": 1.3073, + "mean_token_accuracy": 0.671954408288002, + "num_tokens": 372674433.0, + "step": 2223 + }, + { + "entropy": 1.6941000918547313, + "epoch": 0.24432177089341134, + "grad_norm": 0.7540859580039978, + "learning_rate": 1.960289241079434e-05, + "loss": 1.3743, + "mean_token_accuracy": 0.6615285774072012, + "num_tokens": 372864530.0, + "step": 2224 + }, + { + "entropy": 1.7387097477912903, + "epoch": 0.24443162780478428, + "grad_norm": 0.629091739654541, + "learning_rate": 1.960242191520466e-05, + "loss": 1.4295, + "mean_token_accuracy": 0.6414004961649576, + "num_tokens": 373026958.0, + "step": 2225 + }, + { + "entropy": 1.6648336052894592, + "epoch": 0.2445414847161572, + "grad_norm": 0.5851943492889404, + "learning_rate": 1.960195114735221e-05, + "loss": 1.4927, + "mean_token_accuracy": 0.6401997953653336, + "num_tokens": 373255586.0, + "step": 2226 + }, + { + "entropy": 1.6632899244626362, + "epoch": 0.24465134162753013, + "grad_norm": 0.5614564418792725, + "learning_rate": 1.9601480107251875e-05, + "loss": 1.3345, + "mean_token_accuracy": 0.6726627051830292, + "num_tokens": 373415841.0, + "step": 2227 + }, + { + "entropy": 1.7338934938112895, + "epoch": 0.24476119853890307, + "grad_norm": 0.6574128866195679, + "learning_rate": 1.960100879491857e-05, + "loss": 1.4711, + "mean_token_accuracy": 0.6444676717122396, + "num_tokens": 373600936.0, + "step": 2228 + }, + { + "entropy": 1.7459152539571126, + "epoch": 0.24487105545027602, + "grad_norm": 0.7730247974395752, + "learning_rate": 1.960053721036722e-05, + "loss": 1.3288, + "mean_token_accuracy": 0.6706405679384867, + "num_tokens": 373739586.0, + "step": 2229 + }, + { + "entropy": 1.7264153758684795, + "epoch": 0.24498091236164896, + "grad_norm": 0.6619213819503784, + "learning_rate": 1.9600065353612735e-05, + "loss": 1.5545, + "mean_token_accuracy": 0.6263764947652817, + "num_tokens": 373943250.0, + "step": 2230 + }, + { + "entropy": 1.7214798033237457, + "epoch": 0.2450907692730219, + "grad_norm": 0.7965298295021057, + "learning_rate": 1.959959322467006e-05, + "loss": 1.5338, + "mean_token_accuracy": 0.6338108479976654, + "num_tokens": 374145099.0, + "step": 2231 + }, + { + "entropy": 1.7392083803812664, + "epoch": 0.24520062618439484, + "grad_norm": 0.6499489545822144, + "learning_rate": 1.9599120823554137e-05, + "loss": 1.3013, + "mean_token_accuracy": 0.6649557749430338, + "num_tokens": 374267307.0, + "step": 2232 + }, + { + "entropy": 1.6417719821135204, + "epoch": 0.24531048309576775, + "grad_norm": 0.6339417099952698, + "learning_rate": 1.959864815027991e-05, + "loss": 1.4139, + "mean_token_accuracy": 0.6582342187563578, + "num_tokens": 374443719.0, + "step": 2233 + }, + { + "entropy": 1.7121461629867554, + "epoch": 0.2454203400071407, + "grad_norm": 0.6677350997924805, + "learning_rate": 1.9598175204862348e-05, + "loss": 1.3063, + "mean_token_accuracy": 0.6706344981988271, + "num_tokens": 374593607.0, + "step": 2234 + }, + { + "entropy": 1.7524688243865967, + "epoch": 0.24553019691851363, + "grad_norm": 0.7025663256645203, + "learning_rate": 1.959770198731641e-05, + "loss": 1.4846, + "mean_token_accuracy": 0.6432475497325262, + "num_tokens": 374766690.0, + "step": 2235 + }, + { + "entropy": 1.7159065902233124, + "epoch": 0.24564005382988657, + "grad_norm": 0.7714294791221619, + "learning_rate": 1.9597228497657084e-05, + "loss": 1.3186, + "mean_token_accuracy": 0.6638337969779968, + "num_tokens": 374890263.0, + "step": 2236 + }, + { + "entropy": 1.722405840953191, + "epoch": 0.24574991074125951, + "grad_norm": 0.863430380821228, + "learning_rate": 1.9596754735899347e-05, + "loss": 1.2672, + "mean_token_accuracy": 0.669483408331871, + "num_tokens": 375022168.0, + "step": 2237 + }, + { + "entropy": 1.7640716234842937, + "epoch": 0.24585976765263245, + "grad_norm": 0.790648877620697, + "learning_rate": 1.95962807020582e-05, + "loss": 1.5312, + "mean_token_accuracy": 0.6371087779601415, + "num_tokens": 375212695.0, + "step": 2238 + }, + { + "entropy": 1.6626634697119396, + "epoch": 0.2459696245640054, + "grad_norm": 0.6195372343063354, + "learning_rate": 1.959580639614864e-05, + "loss": 1.5048, + "mean_token_accuracy": 0.631678581237793, + "num_tokens": 375458846.0, + "step": 2239 + }, + { + "entropy": 1.7087614436944325, + "epoch": 0.2460794814753783, + "grad_norm": 0.7846135497093201, + "learning_rate": 1.959533181818568e-05, + "loss": 1.3327, + "mean_token_accuracy": 0.6647358934084574, + "num_tokens": 375608137.0, + "step": 2240 + }, + { + "entropy": 1.6494195957978566, + "epoch": 0.24618933838675125, + "grad_norm": 0.9152674674987793, + "learning_rate": 1.9594856968184338e-05, + "loss": 1.4468, + "mean_token_accuracy": 0.664206475019455, + "num_tokens": 375798556.0, + "step": 2241 + }, + { + "entropy": 1.7339473962783813, + "epoch": 0.2462991952981242, + "grad_norm": 0.7858838438987732, + "learning_rate": 1.959438184615965e-05, + "loss": 1.4387, + "mean_token_accuracy": 0.660494844118754, + "num_tokens": 375947922.0, + "step": 2242 + }, + { + "entropy": 1.7143746713797252, + "epoch": 0.24640905220949713, + "grad_norm": 0.6970986127853394, + "learning_rate": 1.9593906452126646e-05, + "loss": 1.6092, + "mean_token_accuracy": 0.6282972743113836, + "num_tokens": 376216590.0, + "step": 2243 + }, + { + "entropy": 1.6953730583190918, + "epoch": 0.24651890912087007, + "grad_norm": 0.6785817742347717, + "learning_rate": 1.9593430786100382e-05, + "loss": 1.2938, + "mean_token_accuracy": 0.6653183003266653, + "num_tokens": 376336959.0, + "step": 2244 + }, + { + "entropy": 1.722437173128128, + "epoch": 0.246628766032243, + "grad_norm": 0.6590582132339478, + "learning_rate": 1.9592954848095904e-05, + "loss": 1.65, + "mean_token_accuracy": 0.6332317143678665, + "num_tokens": 376538380.0, + "step": 2245 + }, + { + "entropy": 1.721213052670161, + "epoch": 0.24673862294361595, + "grad_norm": 0.8105208277702332, + "learning_rate": 1.9592478638128272e-05, + "loss": 1.3964, + "mean_token_accuracy": 0.6566237409909567, + "num_tokens": 376704949.0, + "step": 2246 + }, + { + "entropy": 1.7157046496868134, + "epoch": 0.24684847985498887, + "grad_norm": 0.6640613675117493, + "learning_rate": 1.9592002156212568e-05, + "loss": 1.3865, + "mean_token_accuracy": 0.6623821159203848, + "num_tokens": 376867333.0, + "step": 2247 + }, + { + "entropy": 1.6208436489105225, + "epoch": 0.2469583367663618, + "grad_norm": 0.6282928586006165, + "learning_rate": 1.9591525402363864e-05, + "loss": 1.309, + "mean_token_accuracy": 0.67738905052344, + "num_tokens": 377061118.0, + "step": 2248 + }, + { + "entropy": 1.7082207401593525, + "epoch": 0.24706819367773475, + "grad_norm": 0.6272217035293579, + "learning_rate": 1.9591048376597253e-05, + "loss": 1.5739, + "mean_token_accuracy": 0.636579230427742, + "num_tokens": 377247432.0, + "step": 2249 + }, + { + "entropy": 1.7537776231765747, + "epoch": 0.2471780505891077, + "grad_norm": 0.7704234719276428, + "learning_rate": 1.959057107892783e-05, + "loss": 1.6269, + "mean_token_accuracy": 0.626529390613238, + "num_tokens": 377454531.0, + "step": 2250 + }, + { + "entropy": 1.681376536687215, + "epoch": 0.24728790750048063, + "grad_norm": 0.65434330701828, + "learning_rate": 1.9590093509370708e-05, + "loss": 1.3295, + "mean_token_accuracy": 0.6573110024134318, + "num_tokens": 377664696.0, + "step": 2251 + }, + { + "entropy": 1.69319083293279, + "epoch": 0.24739776441185357, + "grad_norm": 0.7122017741203308, + "learning_rate": 1.9589615667940994e-05, + "loss": 1.48, + "mean_token_accuracy": 0.645169585943222, + "num_tokens": 377844199.0, + "step": 2252 + }, + { + "entropy": 1.7546374201774597, + "epoch": 0.24750762132322648, + "grad_norm": 0.6402483582496643, + "learning_rate": 1.958913755465382e-05, + "loss": 1.3988, + "mean_token_accuracy": 0.6497417340675989, + "num_tokens": 378018473.0, + "step": 2253 + }, + { + "entropy": 1.82759756843249, + "epoch": 0.24761747823459943, + "grad_norm": 0.7866818308830261, + "learning_rate": 1.958865916952431e-05, + "loss": 1.6695, + "mean_token_accuracy": 0.6300811717907587, + "num_tokens": 378212776.0, + "step": 2254 + }, + { + "entropy": 1.7436250348885853, + "epoch": 0.24772733514597237, + "grad_norm": 0.6628825068473816, + "learning_rate": 1.9588180512567604e-05, + "loss": 1.4227, + "mean_token_accuracy": 0.6468610117832819, + "num_tokens": 378394493.0, + "step": 2255 + }, + { + "entropy": 1.7446848253409069, + "epoch": 0.2478371920573453, + "grad_norm": 0.6798176765441895, + "learning_rate": 1.958770158379886e-05, + "loss": 1.4955, + "mean_token_accuracy": 0.6422973871231079, + "num_tokens": 378575118.0, + "step": 2256 + }, + { + "entropy": 1.7074936429659526, + "epoch": 0.24794704896871825, + "grad_norm": 1.042772650718689, + "learning_rate": 1.9587222383233228e-05, + "loss": 1.5275, + "mean_token_accuracy": 0.6391404122114182, + "num_tokens": 378807469.0, + "step": 2257 + }, + { + "entropy": 1.72696053981781, + "epoch": 0.2480569058800912, + "grad_norm": 0.766135573387146, + "learning_rate": 1.9586742910885874e-05, + "loss": 1.4886, + "mean_token_accuracy": 0.6411213676134745, + "num_tokens": 378956471.0, + "step": 2258 + }, + { + "entropy": 1.6875308553377788, + "epoch": 0.24816676279146413, + "grad_norm": 0.6208860874176025, + "learning_rate": 1.9586263166771976e-05, + "loss": 1.3516, + "mean_token_accuracy": 0.6571200539668401, + "num_tokens": 379099761.0, + "step": 2259 + }, + { + "entropy": 1.6892028748989105, + "epoch": 0.24827661970283704, + "grad_norm": 0.871165931224823, + "learning_rate": 1.958578315090672e-05, + "loss": 1.5336, + "mean_token_accuracy": 0.6406076997518539, + "num_tokens": 379257672.0, + "step": 2260 + }, + { + "entropy": 1.7612777749697368, + "epoch": 0.24838647661420998, + "grad_norm": 0.7290350198745728, + "learning_rate": 1.95853028633053e-05, + "loss": 1.3248, + "mean_token_accuracy": 0.6579936047395071, + "num_tokens": 379388678.0, + "step": 2261 + }, + { + "entropy": 1.7003148396809895, + "epoch": 0.24849633352558292, + "grad_norm": 0.652847170829773, + "learning_rate": 1.958482230398291e-05, + "loss": 1.3746, + "mean_token_accuracy": 0.6677578836679459, + "num_tokens": 379561397.0, + "step": 2262 + }, + { + "entropy": 1.7450473109881084, + "epoch": 0.24860619043695587, + "grad_norm": 0.6660063862800598, + "learning_rate": 1.958434147295476e-05, + "loss": 1.4158, + "mean_token_accuracy": 0.6531000037988027, + "num_tokens": 379754068.0, + "step": 2263 + }, + { + "entropy": 1.738626629114151, + "epoch": 0.2487160473483288, + "grad_norm": 0.6657839417457581, + "learning_rate": 1.9583860370236073e-05, + "loss": 1.401, + "mean_token_accuracy": 0.6500038256247839, + "num_tokens": 379929662.0, + "step": 2264 + }, + { + "entropy": 1.6692744890848796, + "epoch": 0.24882590425970175, + "grad_norm": 0.5950648188591003, + "learning_rate": 1.9583378995842073e-05, + "loss": 1.359, + "mean_token_accuracy": 0.6600983838240305, + "num_tokens": 380102552.0, + "step": 2265 + }, + { + "entropy": 1.6895807385444641, + "epoch": 0.2489357611710747, + "grad_norm": 0.6088550686836243, + "learning_rate": 1.9582897349788e-05, + "loss": 1.49, + "mean_token_accuracy": 0.6435723503430685, + "num_tokens": 380318259.0, + "step": 2266 + }, + { + "entropy": 1.660117010275523, + "epoch": 0.2490456180824476, + "grad_norm": 0.5880101919174194, + "learning_rate": 1.9582415432089086e-05, + "loss": 1.3985, + "mean_token_accuracy": 0.6589946647485098, + "num_tokens": 380484218.0, + "step": 2267 + }, + { + "entropy": 1.7047088046868641, + "epoch": 0.24915547499382054, + "grad_norm": 0.7071229815483093, + "learning_rate": 1.9581933242760595e-05, + "loss": 1.5435, + "mean_token_accuracy": 0.6445372601350149, + "num_tokens": 380657830.0, + "step": 2268 + }, + { + "entropy": 1.6842391391595204, + "epoch": 0.24926533190519348, + "grad_norm": 0.6728826761245728, + "learning_rate": 1.9581450781817782e-05, + "loss": 1.4207, + "mean_token_accuracy": 0.6447325150171915, + "num_tokens": 380870392.0, + "step": 2269 + }, + { + "entropy": 1.7044761975606282, + "epoch": 0.24937518881656642, + "grad_norm": 0.6908706426620483, + "learning_rate": 1.9580968049275918e-05, + "loss": 1.379, + "mean_token_accuracy": 0.6554999053478241, + "num_tokens": 381045560.0, + "step": 2270 + }, + { + "entropy": 1.7423875729242961, + "epoch": 0.24948504572793936, + "grad_norm": 0.6786364912986755, + "learning_rate": 1.9580485045150284e-05, + "loss": 1.3842, + "mean_token_accuracy": 0.6539370367924372, + "num_tokens": 381184792.0, + "step": 2271 + }, + { + "entropy": 1.7371763586997986, + "epoch": 0.2495949026393123, + "grad_norm": 0.6156378984451294, + "learning_rate": 1.9580001769456166e-05, + "loss": 1.5311, + "mean_token_accuracy": 0.6272181322177252, + "num_tokens": 381470099.0, + "step": 2272 + }, + { + "entropy": 1.7165015836556752, + "epoch": 0.24970475955068525, + "grad_norm": 0.6726257801055908, + "learning_rate": 1.9579518222208855e-05, + "loss": 1.3107, + "mean_token_accuracy": 0.6664568881193796, + "num_tokens": 381608195.0, + "step": 2273 + }, + { + "entropy": 1.7067488332589467, + "epoch": 0.24981461646205816, + "grad_norm": 0.7491520643234253, + "learning_rate": 1.957903440342366e-05, + "loss": 1.5999, + "mean_token_accuracy": 0.6128611117601395, + "num_tokens": 381845892.0, + "step": 2274 + }, + { + "entropy": 1.742664744456609, + "epoch": 0.2499244733734311, + "grad_norm": 0.6049597263336182, + "learning_rate": 1.9578550313115892e-05, + "loss": 1.3704, + "mean_token_accuracy": 0.6534209748109182, + "num_tokens": 382016391.0, + "step": 2275 + }, + { + "entropy": 1.6452626784642537, + "epoch": 0.25003433028480404, + "grad_norm": 0.6422023773193359, + "learning_rate": 1.9578065951300873e-05, + "loss": 1.4084, + "mean_token_accuracy": 0.6613962203264236, + "num_tokens": 382212478.0, + "step": 2276 + }, + { + "entropy": 1.6521940728028615, + "epoch": 0.250144187196177, + "grad_norm": 0.816700279712677, + "learning_rate": 1.957758131799393e-05, + "loss": 1.3941, + "mean_token_accuracy": 0.6591680943965912, + "num_tokens": 382412188.0, + "step": 2277 + }, + { + "entropy": 1.6978717148303986, + "epoch": 0.2502540441075499, + "grad_norm": 0.6817605495452881, + "learning_rate": 1.9577096413210405e-05, + "loss": 1.4719, + "mean_token_accuracy": 0.6562831451495489, + "num_tokens": 382571361.0, + "step": 2278 + }, + { + "entropy": 1.7319901784261067, + "epoch": 0.25036390101892286, + "grad_norm": 0.9123170971870422, + "learning_rate": 1.9576611236965644e-05, + "loss": 1.3404, + "mean_token_accuracy": 0.6749412715435028, + "num_tokens": 382719176.0, + "step": 2279 + }, + { + "entropy": 1.680397629737854, + "epoch": 0.2504737579302958, + "grad_norm": 0.6984215974807739, + "learning_rate": 1.9576125789275e-05, + "loss": 1.3374, + "mean_token_accuracy": 0.6687712669372559, + "num_tokens": 382873172.0, + "step": 2280 + }, + { + "entropy": 1.7036446233590443, + "epoch": 0.25058361484166874, + "grad_norm": 0.5962554216384888, + "learning_rate": 1.957564007015384e-05, + "loss": 1.464, + "mean_token_accuracy": 0.6451049596071243, + "num_tokens": 383057241.0, + "step": 2281 + }, + { + "entropy": 1.771241287390391, + "epoch": 0.2506934717530417, + "grad_norm": 0.7492452263832092, + "learning_rate": 1.9575154079617535e-05, + "loss": 1.3027, + "mean_token_accuracy": 0.6861212154229482, + "num_tokens": 383177887.0, + "step": 2282 + }, + { + "entropy": 1.6081528663635254, + "epoch": 0.2508033286644146, + "grad_norm": 0.6222009062767029, + "learning_rate": 1.957466781768147e-05, + "loss": 1.3991, + "mean_token_accuracy": 0.6638480375210444, + "num_tokens": 383382040.0, + "step": 2283 + }, + { + "entropy": 1.736664613087972, + "epoch": 0.2509131855757875, + "grad_norm": 0.7736158967018127, + "learning_rate": 1.957418128436103e-05, + "loss": 1.3175, + "mean_token_accuracy": 0.672918826341629, + "num_tokens": 383497583.0, + "step": 2284 + }, + { + "entropy": 1.7848374644915264, + "epoch": 0.25102304248716045, + "grad_norm": 0.70754075050354, + "learning_rate": 1.957369447967162e-05, + "loss": 1.3359, + "mean_token_accuracy": 0.6519061873356501, + "num_tokens": 383651660.0, + "step": 2285 + }, + { + "entropy": 1.7001720269521077, + "epoch": 0.2511328993985334, + "grad_norm": 0.6765170693397522, + "learning_rate": 1.9573207403628638e-05, + "loss": 1.2694, + "mean_token_accuracy": 0.6806812932093939, + "num_tokens": 383782790.0, + "step": 2286 + }, + { + "entropy": 1.723912199338277, + "epoch": 0.25124275630990633, + "grad_norm": 0.6241678595542908, + "learning_rate": 1.957272005624751e-05, + "loss": 1.4251, + "mean_token_accuracy": 0.6483699729045233, + "num_tokens": 383980808.0, + "step": 2287 + }, + { + "entropy": 1.7109817663828533, + "epoch": 0.2513526132212793, + "grad_norm": 0.6277598142623901, + "learning_rate": 1.957223243754365e-05, + "loss": 1.3538, + "mean_token_accuracy": 0.6669852336247762, + "num_tokens": 384136251.0, + "step": 2288 + }, + { + "entropy": 1.7253372172514598, + "epoch": 0.2514624701326522, + "grad_norm": 0.6478981375694275, + "learning_rate": 1.95717445475325e-05, + "loss": 1.3742, + "mean_token_accuracy": 0.6614843010902405, + "num_tokens": 384350465.0, + "step": 2289 + }, + { + "entropy": 1.7166267931461334, + "epoch": 0.25157232704402516, + "grad_norm": 0.8400107026100159, + "learning_rate": 1.9571256386229494e-05, + "loss": 1.3264, + "mean_token_accuracy": 0.6640830139319102, + "num_tokens": 384468885.0, + "step": 2290 + }, + { + "entropy": 1.663007269303004, + "epoch": 0.2516821839553981, + "grad_norm": 0.6767208576202393, + "learning_rate": 1.9570767953650088e-05, + "loss": 1.4499, + "mean_token_accuracy": 0.6597578575213751, + "num_tokens": 384655353.0, + "step": 2291 + }, + { + "entropy": 1.7039824028809865, + "epoch": 0.25179204086677104, + "grad_norm": 0.6444749236106873, + "learning_rate": 1.957027924980974e-05, + "loss": 1.2652, + "mean_token_accuracy": 0.6771525144577026, + "num_tokens": 384792003.0, + "step": 2292 + }, + { + "entropy": 1.7554666300614674, + "epoch": 0.251901897778144, + "grad_norm": 0.6999207139015198, + "learning_rate": 1.956979027472391e-05, + "loss": 1.4989, + "mean_token_accuracy": 0.6585345417261124, + "num_tokens": 384962519.0, + "step": 2293 + }, + { + "entropy": 1.7081598440806072, + "epoch": 0.2520117546895169, + "grad_norm": 0.6971555352210999, + "learning_rate": 1.9569301028408084e-05, + "loss": 1.5249, + "mean_token_accuracy": 0.6455909609794617, + "num_tokens": 385179673.0, + "step": 2294 + }, + { + "entropy": 1.7678824067115784, + "epoch": 0.25212161160088986, + "grad_norm": 0.6623879075050354, + "learning_rate": 1.9568811510877742e-05, + "loss": 1.461, + "mean_token_accuracy": 0.6422907660404841, + "num_tokens": 385361347.0, + "step": 2295 + }, + { + "entropy": 1.7030884822209675, + "epoch": 0.2522314685122628, + "grad_norm": 0.6273277401924133, + "learning_rate": 1.9568321722148376e-05, + "loss": 1.3993, + "mean_token_accuracy": 0.6431881437699, + "num_tokens": 385552421.0, + "step": 2296 + }, + { + "entropy": 1.710710922876994, + "epoch": 0.2523413254236357, + "grad_norm": 0.633685290813446, + "learning_rate": 1.9567831662235485e-05, + "loss": 1.4126, + "mean_token_accuracy": 0.6471965213616689, + "num_tokens": 385726239.0, + "step": 2297 + }, + { + "entropy": 1.7225467264652252, + "epoch": 0.25245118233500863, + "grad_norm": 0.6174577474594116, + "learning_rate": 1.956734133115459e-05, + "loss": 1.5379, + "mean_token_accuracy": 0.618008534113566, + "num_tokens": 386036858.0, + "step": 2298 + }, + { + "entropy": 1.6922851900259654, + "epoch": 0.25256103924638157, + "grad_norm": 0.6310000419616699, + "learning_rate": 1.9566850728921196e-05, + "loss": 1.5632, + "mean_token_accuracy": 0.6343136032422384, + "num_tokens": 386284124.0, + "step": 2299 + }, + { + "entropy": 1.7784220079580944, + "epoch": 0.2526708961577545, + "grad_norm": 0.6629165410995483, + "learning_rate": 1.9566359855550837e-05, + "loss": 1.4848, + "mean_token_accuracy": 0.642065703868866, + "num_tokens": 386461755.0, + "step": 2300 + }, + { + "entropy": 1.7046751876672108, + "epoch": 0.25278075306912745, + "grad_norm": 0.6529744863510132, + "learning_rate": 1.9565868711059054e-05, + "loss": 1.319, + "mean_token_accuracy": 0.675203874707222, + "num_tokens": 386579521.0, + "step": 2301 + }, + { + "entropy": 1.7891161839167278, + "epoch": 0.2528906099805004, + "grad_norm": 0.7346999645233154, + "learning_rate": 1.956537729546138e-05, + "loss": 1.3972, + "mean_token_accuracy": 0.6581176420052847, + "num_tokens": 386772847.0, + "step": 2302 + }, + { + "entropy": 1.6651329696178436, + "epoch": 0.25300046689187333, + "grad_norm": 0.7511558532714844, + "learning_rate": 1.956488560877338e-05, + "loss": 1.5864, + "mean_token_accuracy": 0.6338248377044996, + "num_tokens": 386982952.0, + "step": 2303 + }, + { + "entropy": 1.7310482561588287, + "epoch": 0.2531103238032463, + "grad_norm": 0.788306713104248, + "learning_rate": 1.9564393651010603e-05, + "loss": 1.5, + "mean_token_accuracy": 0.6546668658653895, + "num_tokens": 387175533.0, + "step": 2304 + }, + { + "entropy": 1.7673071126143138, + "epoch": 0.2532201807146192, + "grad_norm": 0.6239743828773499, + "learning_rate": 1.9563901422188635e-05, + "loss": 1.3626, + "mean_token_accuracy": 0.6721245894829432, + "num_tokens": 387353943.0, + "step": 2305 + }, + { + "entropy": 1.7524100144704182, + "epoch": 0.25333003762599215, + "grad_norm": 0.7127106189727783, + "learning_rate": 1.956340892232304e-05, + "loss": 1.4402, + "mean_token_accuracy": 0.6585116336743037, + "num_tokens": 387549620.0, + "step": 2306 + }, + { + "entropy": 1.7112055122852325, + "epoch": 0.2534398945373651, + "grad_norm": 0.8389919400215149, + "learning_rate": 1.956291615142941e-05, + "loss": 1.4082, + "mean_token_accuracy": 0.6524738470713297, + "num_tokens": 387680799.0, + "step": 2307 + }, + { + "entropy": 1.698311318953832, + "epoch": 0.25354975144873804, + "grad_norm": 0.7265674471855164, + "learning_rate": 1.9562423109523346e-05, + "loss": 1.3202, + "mean_token_accuracy": 0.6675619333982468, + "num_tokens": 387830972.0, + "step": 2308 + }, + { + "entropy": 1.741395463546117, + "epoch": 0.253659608360111, + "grad_norm": 0.7003577351570129, + "learning_rate": 1.956192979662045e-05, + "loss": 1.4395, + "mean_token_accuracy": 0.6541390617688497, + "num_tokens": 387975525.0, + "step": 2309 + }, + { + "entropy": 1.7452284097671509, + "epoch": 0.2537694652714839, + "grad_norm": 0.6919611096382141, + "learning_rate": 1.956143621273633e-05, + "loss": 1.4696, + "mean_token_accuracy": 0.6432512650887171, + "num_tokens": 388168464.0, + "step": 2310 + }, + { + "entropy": 1.7431021829446156, + "epoch": 0.2538793221828568, + "grad_norm": 0.7462813854217529, + "learning_rate": 1.9560942357886612e-05, + "loss": 1.4758, + "mean_token_accuracy": 0.6642291049162546, + "num_tokens": 388318737.0, + "step": 2311 + }, + { + "entropy": 1.7515191932519276, + "epoch": 0.25398917909422974, + "grad_norm": 0.7343306541442871, + "learning_rate": 1.9560448232086927e-05, + "loss": 1.2869, + "mean_token_accuracy": 0.6709674050410589, + "num_tokens": 388467029.0, + "step": 2312 + }, + { + "entropy": 1.7023037274678547, + "epoch": 0.2540990360056027, + "grad_norm": 0.7146450281143188, + "learning_rate": 1.9559953835352916e-05, + "loss": 1.2726, + "mean_token_accuracy": 0.6788142820199331, + "num_tokens": 388662606.0, + "step": 2313 + }, + { + "entropy": 1.755051185687383, + "epoch": 0.2542088929169756, + "grad_norm": 0.7248855233192444, + "learning_rate": 1.955945916770022e-05, + "loss": 1.36, + "mean_token_accuracy": 0.6611029158035914, + "num_tokens": 388797070.0, + "step": 2314 + }, + { + "entropy": 1.7301162481307983, + "epoch": 0.25431874982834857, + "grad_norm": 0.6761884689331055, + "learning_rate": 1.9558964229144498e-05, + "loss": 1.2911, + "mean_token_accuracy": 0.6739692836999893, + "num_tokens": 388932368.0, + "step": 2315 + }, + { + "entropy": 1.7212721010049183, + "epoch": 0.2544286067397215, + "grad_norm": 0.9040817022323608, + "learning_rate": 1.9558469019701415e-05, + "loss": 1.2838, + "mean_token_accuracy": 0.6708403180042902, + "num_tokens": 389061358.0, + "step": 2316 + }, + { + "entropy": 1.6814491947491963, + "epoch": 0.25453846365109445, + "grad_norm": 0.5825700759887695, + "learning_rate": 1.9557973539386648e-05, + "loss": 1.1823, + "mean_token_accuracy": 0.6723542312781016, + "num_tokens": 389254558.0, + "step": 2317 + }, + { + "entropy": 1.7602062424023945, + "epoch": 0.2546483205624674, + "grad_norm": 0.8323972821235657, + "learning_rate": 1.955747778821587e-05, + "loss": 1.5388, + "mean_token_accuracy": 0.636329710483551, + "num_tokens": 389451108.0, + "step": 2318 + }, + { + "entropy": 1.7078562676906586, + "epoch": 0.25475817747384033, + "grad_norm": 0.6664144992828369, + "learning_rate": 1.9556981766204778e-05, + "loss": 1.4438, + "mean_token_accuracy": 0.6484460184971491, + "num_tokens": 389610942.0, + "step": 2319 + }, + { + "entropy": 1.7442236840724945, + "epoch": 0.25486803438521327, + "grad_norm": 0.7855071425437927, + "learning_rate": 1.955648547336907e-05, + "loss": 1.4623, + "mean_token_accuracy": 0.660668358206749, + "num_tokens": 389720189.0, + "step": 2320 + }, + { + "entropy": 1.7352332770824432, + "epoch": 0.2549778912965862, + "grad_norm": 0.7771610021591187, + "learning_rate": 1.9555988909724452e-05, + "loss": 1.3173, + "mean_token_accuracy": 0.65856105585893, + "num_tokens": 389873275.0, + "step": 2321 + }, + { + "entropy": 1.7454373637835185, + "epoch": 0.25508774820795915, + "grad_norm": 0.8752892017364502, + "learning_rate": 1.9555492075286637e-05, + "loss": 1.2962, + "mean_token_accuracy": 0.6755866954723994, + "num_tokens": 389978431.0, + "step": 2322 + }, + { + "entropy": 1.752275397380193, + "epoch": 0.2551976051193321, + "grad_norm": 0.7649509906768799, + "learning_rate": 1.955499497007136e-05, + "loss": 1.5054, + "mean_token_accuracy": 0.6548398286104202, + "num_tokens": 390126888.0, + "step": 2323 + }, + { + "entropy": 1.7507530748844147, + "epoch": 0.255307462030705, + "grad_norm": 0.9283920526504517, + "learning_rate": 1.955449759409434e-05, + "loss": 1.5272, + "mean_token_accuracy": 0.6637701342503229, + "num_tokens": 390347120.0, + "step": 2324 + }, + { + "entropy": 1.7015658716360729, + "epoch": 0.2554173189420779, + "grad_norm": 0.6118226647377014, + "learning_rate": 1.955399994737133e-05, + "loss": 1.3858, + "mean_token_accuracy": 0.6548377076784769, + "num_tokens": 390538791.0, + "step": 2325 + }, + { + "entropy": 1.6828657388687134, + "epoch": 0.25552717585345086, + "grad_norm": 0.7431015968322754, + "learning_rate": 1.9553502029918075e-05, + "loss": 1.2795, + "mean_token_accuracy": 0.67412897447745, + "num_tokens": 390704930.0, + "step": 2326 + }, + { + "entropy": 1.7173655331134796, + "epoch": 0.2556370327648238, + "grad_norm": 0.6374660730361938, + "learning_rate": 1.9553003841750334e-05, + "loss": 1.4074, + "mean_token_accuracy": 0.6585131883621216, + "num_tokens": 390836480.0, + "step": 2327 + }, + { + "entropy": 1.6791688601175945, + "epoch": 0.25574688967619674, + "grad_norm": 0.6104090213775635, + "learning_rate": 1.9552505382883876e-05, + "loss": 1.4401, + "mean_token_accuracy": 0.6437318821748098, + "num_tokens": 391014632.0, + "step": 2328 + }, + { + "entropy": 1.7703735729058583, + "epoch": 0.2558567465875697, + "grad_norm": 0.7664080858230591, + "learning_rate": 1.9552006653334478e-05, + "loss": 1.3211, + "mean_token_accuracy": 0.6642909646034241, + "num_tokens": 391148660.0, + "step": 2329 + }, + { + "entropy": 1.6809141437212627, + "epoch": 0.2559666034989426, + "grad_norm": 0.5534673929214478, + "learning_rate": 1.955150765311792e-05, + "loss": 1.346, + "mean_token_accuracy": 0.6547142614920934, + "num_tokens": 391358170.0, + "step": 2330 + }, + { + "entropy": 1.7062424222628276, + "epoch": 0.25607646041031557, + "grad_norm": 0.7570271492004395, + "learning_rate": 1.9551008382250002e-05, + "loss": 1.435, + "mean_token_accuracy": 0.671400730808576, + "num_tokens": 391484679.0, + "step": 2331 + }, + { + "entropy": 1.7625846366087596, + "epoch": 0.2561863173216885, + "grad_norm": 0.7675605416297913, + "learning_rate": 1.955050884074652e-05, + "loss": 1.2715, + "mean_token_accuracy": 0.6735413372516632, + "num_tokens": 391603471.0, + "step": 2332 + }, + { + "entropy": 1.7328182061513264, + "epoch": 0.25629617423306145, + "grad_norm": 0.6863852739334106, + "learning_rate": 1.955000902862329e-05, + "loss": 1.4326, + "mean_token_accuracy": 0.6669427951176962, + "num_tokens": 391764718.0, + "step": 2333 + }, + { + "entropy": 1.685337871313095, + "epoch": 0.2564060311444344, + "grad_norm": 0.8060728311538696, + "learning_rate": 1.954950894589612e-05, + "loss": 1.3938, + "mean_token_accuracy": 0.658026655515035, + "num_tokens": 391928953.0, + "step": 2334 + }, + { + "entropy": 1.705341676870982, + "epoch": 0.25651588805580733, + "grad_norm": 0.7532424330711365, + "learning_rate": 1.9549008592580845e-05, + "loss": 1.3471, + "mean_token_accuracy": 0.6698052088419596, + "num_tokens": 392067702.0, + "step": 2335 + }, + { + "entropy": 1.6493215759595234, + "epoch": 0.25662574496718027, + "grad_norm": 0.715596616268158, + "learning_rate": 1.9548507968693306e-05, + "loss": 1.4487, + "mean_token_accuracy": 0.655690461397171, + "num_tokens": 392257962.0, + "step": 2336 + }, + { + "entropy": 1.6872890889644623, + "epoch": 0.2567356018785532, + "grad_norm": 0.8979227542877197, + "learning_rate": 1.954800707424934e-05, + "loss": 1.3481, + "mean_token_accuracy": 0.6613185554742813, + "num_tokens": 392388267.0, + "step": 2337 + }, + { + "entropy": 1.7475295166174571, + "epoch": 0.2568454587899261, + "grad_norm": 0.6289818286895752, + "learning_rate": 1.95475059092648e-05, + "loss": 1.3003, + "mean_token_accuracy": 0.6711251934369405, + "num_tokens": 392512381.0, + "step": 2338 + }, + { + "entropy": 1.7208695312341054, + "epoch": 0.25695531570129904, + "grad_norm": 0.6243081092834473, + "learning_rate": 1.9547004473755548e-05, + "loss": 1.4753, + "mean_token_accuracy": 0.6438407252232233, + "num_tokens": 392734191.0, + "step": 2339 + }, + { + "entropy": 1.715543230374654, + "epoch": 0.257065172612672, + "grad_norm": 0.5962228775024414, + "learning_rate": 1.954650276773746e-05, + "loss": 1.3443, + "mean_token_accuracy": 0.6543434709310532, + "num_tokens": 392910062.0, + "step": 2340 + }, + { + "entropy": 1.763334850470225, + "epoch": 0.2571750295240449, + "grad_norm": 0.7142077684402466, + "learning_rate": 1.9546000791226407e-05, + "loss": 1.4651, + "mean_token_accuracy": 0.634983961780866, + "num_tokens": 393052254.0, + "step": 2341 + }, + { + "entropy": 1.6993257999420166, + "epoch": 0.25728488643541786, + "grad_norm": 0.6107556819915771, + "learning_rate": 1.954549854423828e-05, + "loss": 1.3589, + "mean_token_accuracy": 0.6691757639249166, + "num_tokens": 393172773.0, + "step": 2342 + }, + { + "entropy": 1.6871144970258076, + "epoch": 0.2573947433467908, + "grad_norm": 0.8920505046844482, + "learning_rate": 1.9544996026788978e-05, + "loss": 1.2929, + "mean_token_accuracy": 0.6790098696947098, + "num_tokens": 393294814.0, + "step": 2343 + }, + { + "entropy": 1.7936709821224213, + "epoch": 0.25750460025816374, + "grad_norm": 0.8346240520477295, + "learning_rate": 1.95444932388944e-05, + "loss": 1.4746, + "mean_token_accuracy": 0.6549756973981857, + "num_tokens": 393466030.0, + "step": 2344 + }, + { + "entropy": 1.613057146469752, + "epoch": 0.2576144571695367, + "grad_norm": 0.8011279106140137, + "learning_rate": 1.9543990180570464e-05, + "loss": 1.2318, + "mean_token_accuracy": 0.6732364992300669, + "num_tokens": 393633963.0, + "step": 2345 + }, + { + "entropy": 1.6945912341276805, + "epoch": 0.2577243140809096, + "grad_norm": 0.6898323893547058, + "learning_rate": 1.9543486851833085e-05, + "loss": 1.3876, + "mean_token_accuracy": 0.6581351061662039, + "num_tokens": 393811077.0, + "step": 2346 + }, + { + "entropy": 1.7314487596352894, + "epoch": 0.25783417099228256, + "grad_norm": 0.6320956945419312, + "learning_rate": 1.9542983252698198e-05, + "loss": 1.4795, + "mean_token_accuracy": 0.627448558807373, + "num_tokens": 394006293.0, + "step": 2347 + }, + { + "entropy": 1.7345199584960938, + "epoch": 0.2579440279036555, + "grad_norm": 0.864216148853302, + "learning_rate": 1.954247938318174e-05, + "loss": 1.5121, + "mean_token_accuracy": 0.6495955387751261, + "num_tokens": 394196329.0, + "step": 2348 + }, + { + "entropy": 1.692550351222356, + "epoch": 0.25805388481502844, + "grad_norm": 2.546358108520508, + "learning_rate": 1.954197524329966e-05, + "loss": 1.189, + "mean_token_accuracy": 0.6847096085548401, + "num_tokens": 394355250.0, + "step": 2349 + }, + { + "entropy": 1.738300661245982, + "epoch": 0.2581637417264014, + "grad_norm": 0.6396709680557251, + "learning_rate": 1.9541470833067916e-05, + "loss": 1.3458, + "mean_token_accuracy": 0.6599492281675339, + "num_tokens": 394485210.0, + "step": 2350 + }, + { + "entropy": 1.7182751496632893, + "epoch": 0.2582735986377743, + "grad_norm": 0.7549358606338501, + "learning_rate": 1.9540966152502463e-05, + "loss": 1.2255, + "mean_token_accuracy": 0.6786756366491318, + "num_tokens": 394596219.0, + "step": 2351 + }, + { + "entropy": 1.7194179395834606, + "epoch": 0.2583834555491472, + "grad_norm": 0.6862432956695557, + "learning_rate": 1.9540461201619283e-05, + "loss": 1.223, + "mean_token_accuracy": 0.680388276775678, + "num_tokens": 394749092.0, + "step": 2352 + }, + { + "entropy": 1.72928582628568, + "epoch": 0.25849331246052015, + "grad_norm": 0.6295669078826904, + "learning_rate": 1.9539955980434354e-05, + "loss": 1.4289, + "mean_token_accuracy": 0.6479186564683914, + "num_tokens": 394905639.0, + "step": 2353 + }, + { + "entropy": 1.7089947859446208, + "epoch": 0.2586031693718931, + "grad_norm": 0.6928002238273621, + "learning_rate": 1.9539450488963665e-05, + "loss": 1.5692, + "mean_token_accuracy": 0.6375485310951868, + "num_tokens": 395055391.0, + "step": 2354 + }, + { + "entropy": 1.7104221880435944, + "epoch": 0.25871302628326603, + "grad_norm": 0.7009070515632629, + "learning_rate": 1.953894472722322e-05, + "loss": 1.4035, + "mean_token_accuracy": 0.6675273527701696, + "num_tokens": 395197551.0, + "step": 2355 + }, + { + "entropy": 1.6930966973304749, + "epoch": 0.258822883194639, + "grad_norm": 0.6557918787002563, + "learning_rate": 1.9538438695229017e-05, + "loss": 1.5066, + "mean_token_accuracy": 0.657018855214119, + "num_tokens": 395393178.0, + "step": 2356 + }, + { + "entropy": 1.7057071129480998, + "epoch": 0.2589327401060119, + "grad_norm": 0.6957089900970459, + "learning_rate": 1.9537932392997083e-05, + "loss": 1.4734, + "mean_token_accuracy": 0.6418667435646057, + "num_tokens": 395564794.0, + "step": 2357 + }, + { + "entropy": 1.766910860935847, + "epoch": 0.25904259701738486, + "grad_norm": 0.6810352802276611, + "learning_rate": 1.9537425820543427e-05, + "loss": 1.5344, + "mean_token_accuracy": 0.6410610576470693, + "num_tokens": 395751580.0, + "step": 2358 + }, + { + "entropy": 1.6266848941644032, + "epoch": 0.2591524539287578, + "grad_norm": 0.6686707139015198, + "learning_rate": 1.95369189778841e-05, + "loss": 1.2843, + "mean_token_accuracy": 0.6765497128168741, + "num_tokens": 395922802.0, + "step": 2359 + }, + { + "entropy": 1.7174728314081829, + "epoch": 0.25926231084013074, + "grad_norm": 0.6954838633537292, + "learning_rate": 1.9536411865035126e-05, + "loss": 1.5106, + "mean_token_accuracy": 0.6519478385647138, + "num_tokens": 396112141.0, + "step": 2360 + }, + { + "entropy": 1.7171052594979603, + "epoch": 0.2593721677515037, + "grad_norm": 0.715525209903717, + "learning_rate": 1.953590448201257e-05, + "loss": 1.3681, + "mean_token_accuracy": 0.6634097794691721, + "num_tokens": 396273450.0, + "step": 2361 + }, + { + "entropy": 1.7169589797655742, + "epoch": 0.2594820246628766, + "grad_norm": 0.7676208019256592, + "learning_rate": 1.953539682883248e-05, + "loss": 1.4953, + "mean_token_accuracy": 0.6550164272387823, + "num_tokens": 396436428.0, + "step": 2362 + }, + { + "entropy": 1.7600337266921997, + "epoch": 0.25959188157424956, + "grad_norm": 0.665710985660553, + "learning_rate": 1.953488890551093e-05, + "loss": 1.4444, + "mean_token_accuracy": 0.6462709407011668, + "num_tokens": 396612360.0, + "step": 2363 + }, + { + "entropy": 1.772403875986735, + "epoch": 0.2597017384856225, + "grad_norm": 0.7826092839241028, + "learning_rate": 1.953438071206399e-05, + "loss": 1.6427, + "mean_token_accuracy": 0.607773964603742, + "num_tokens": 396858383.0, + "step": 2364 + }, + { + "entropy": 1.7608717381954193, + "epoch": 0.2598115953969954, + "grad_norm": 0.7619585990905762, + "learning_rate": 1.9533872248507743e-05, + "loss": 1.4618, + "mean_token_accuracy": 0.6620588004589081, + "num_tokens": 396976485.0, + "step": 2365 + }, + { + "entropy": 1.7266217470169067, + "epoch": 0.25992145230836833, + "grad_norm": 0.7657930850982666, + "learning_rate": 1.9533363514858285e-05, + "loss": 1.4424, + "mean_token_accuracy": 0.655510276556015, + "num_tokens": 397162060.0, + "step": 2366 + }, + { + "entropy": 1.7765484750270844, + "epoch": 0.26003130921974127, + "grad_norm": 0.827907145023346, + "learning_rate": 1.9532854511131723e-05, + "loss": 1.4477, + "mean_token_accuracy": 0.6488782366116842, + "num_tokens": 397310390.0, + "step": 2367 + }, + { + "entropy": 1.7494306067625682, + "epoch": 0.2601411661311142, + "grad_norm": 0.808404803276062, + "learning_rate": 1.9532345237344154e-05, + "loss": 1.6514, + "mean_token_accuracy": 0.637022852897644, + "num_tokens": 397545166.0, + "step": 2368 + }, + { + "entropy": 1.7077105244000752, + "epoch": 0.26025102304248715, + "grad_norm": 0.6834400296211243, + "learning_rate": 1.9531835693511706e-05, + "loss": 1.4438, + "mean_token_accuracy": 0.6453288247187933, + "num_tokens": 397702370.0, + "step": 2369 + }, + { + "entropy": 1.7838773429393768, + "epoch": 0.2603608799538601, + "grad_norm": 0.6579210758209229, + "learning_rate": 1.95313258796505e-05, + "loss": 1.5525, + "mean_token_accuracy": 0.6188159386316935, + "num_tokens": 397913924.0, + "step": 2370 + }, + { + "entropy": 1.6815649668375652, + "epoch": 0.26047073686523303, + "grad_norm": 0.6755560636520386, + "learning_rate": 1.953081579577668e-05, + "loss": 1.3892, + "mean_token_accuracy": 0.6632423549890518, + "num_tokens": 398065506.0, + "step": 2371 + }, + { + "entropy": 1.7138336102167766, + "epoch": 0.260580593776606, + "grad_norm": 0.6806965470314026, + "learning_rate": 1.9530305441906384e-05, + "loss": 1.5041, + "mean_token_accuracy": 0.634550929069519, + "num_tokens": 398251769.0, + "step": 2372 + }, + { + "entropy": 1.7888704140981038, + "epoch": 0.2606904506879789, + "grad_norm": 0.7154719829559326, + "learning_rate": 1.952979481805576e-05, + "loss": 1.3385, + "mean_token_accuracy": 0.6612465778986613, + "num_tokens": 398355980.0, + "step": 2373 + }, + { + "entropy": 1.6614461243152618, + "epoch": 0.26080030759935185, + "grad_norm": 0.6078544855117798, + "learning_rate": 1.9529283924240976e-05, + "loss": 1.362, + "mean_token_accuracy": 0.660592312614123, + "num_tokens": 398553184.0, + "step": 2374 + }, + { + "entropy": 1.6379695137341816, + "epoch": 0.2609101645107248, + "grad_norm": 0.6618489623069763, + "learning_rate": 1.95287727604782e-05, + "loss": 1.3702, + "mean_token_accuracy": 0.6538491994142532, + "num_tokens": 398771887.0, + "step": 2375 + }, + { + "entropy": 1.7040774722894032, + "epoch": 0.26102002142209774, + "grad_norm": 0.6134635806083679, + "learning_rate": 1.9528261326783608e-05, + "loss": 1.2476, + "mean_token_accuracy": 0.6677992393573126, + "num_tokens": 398964496.0, + "step": 2376 + }, + { + "entropy": 1.6660463015238445, + "epoch": 0.2611298783334707, + "grad_norm": 0.8864745497703552, + "learning_rate": 1.9527749623173388e-05, + "loss": 1.2948, + "mean_token_accuracy": 0.6655771185954412, + "num_tokens": 399113614.0, + "step": 2377 + }, + { + "entropy": 1.721436192591985, + "epoch": 0.2612397352448436, + "grad_norm": 0.7737887501716614, + "learning_rate": 1.9527237649663736e-05, + "loss": 1.3916, + "mean_token_accuracy": 0.655788873632749, + "num_tokens": 399281508.0, + "step": 2378 + }, + { + "entropy": 1.6972166001796722, + "epoch": 0.2613495921562165, + "grad_norm": 0.6834520697593689, + "learning_rate": 1.952672540627085e-05, + "loss": 1.2625, + "mean_token_accuracy": 0.6793500383694967, + "num_tokens": 399438374.0, + "step": 2379 + }, + { + "entropy": 1.718246688445409, + "epoch": 0.26145944906758944, + "grad_norm": 0.5676518082618713, + "learning_rate": 1.9526212893010955e-05, + "loss": 1.4461, + "mean_token_accuracy": 0.6427881369988123, + "num_tokens": 399631632.0, + "step": 2380 + }, + { + "entropy": 1.731793224811554, + "epoch": 0.2615693059789624, + "grad_norm": 0.6785540580749512, + "learning_rate": 1.9525700109900257e-05, + "loss": 1.6035, + "mean_token_accuracy": 0.6242391665776571, + "num_tokens": 399854358.0, + "step": 2381 + }, + { + "entropy": 1.729330152273178, + "epoch": 0.2616791628903353, + "grad_norm": 0.675688624382019, + "learning_rate": 1.9525187056955e-05, + "loss": 1.428, + "mean_token_accuracy": 0.6407847801844279, + "num_tokens": 400026748.0, + "step": 2382 + }, + { + "entropy": 1.7342185775438945, + "epoch": 0.26178901980170827, + "grad_norm": 0.6857423782348633, + "learning_rate": 1.9524673734191407e-05, + "loss": 1.5219, + "mean_token_accuracy": 0.6373167236646017, + "num_tokens": 400228664.0, + "step": 2383 + }, + { + "entropy": 1.71209650238355, + "epoch": 0.2618988767130812, + "grad_norm": 0.8391352891921997, + "learning_rate": 1.952416014162573e-05, + "loss": 1.3521, + "mean_token_accuracy": 0.6658279597759247, + "num_tokens": 400383096.0, + "step": 2384 + }, + { + "entropy": 1.6978493134180705, + "epoch": 0.26200873362445415, + "grad_norm": 0.6963003873825073, + "learning_rate": 1.952364627927423e-05, + "loss": 1.3041, + "mean_token_accuracy": 0.6771473834911982, + "num_tokens": 400554043.0, + "step": 2385 + }, + { + "entropy": 1.7559028963247936, + "epoch": 0.2621185905358271, + "grad_norm": 0.6737483739852905, + "learning_rate": 1.9523132147153167e-05, + "loss": 1.3949, + "mean_token_accuracy": 0.6481351753075918, + "num_tokens": 400803149.0, + "step": 2386 + }, + { + "entropy": 1.7359568774700165, + "epoch": 0.26222844744720003, + "grad_norm": 0.8350893259048462, + "learning_rate": 1.952261774527881e-05, + "loss": 1.5246, + "mean_token_accuracy": 0.6469388355811437, + "num_tokens": 400981816.0, + "step": 2387 + }, + { + "entropy": 1.6978191137313843, + "epoch": 0.26233830435857297, + "grad_norm": 0.5784783959388733, + "learning_rate": 1.9522103073667444e-05, + "loss": 1.3718, + "mean_token_accuracy": 0.6644446154435476, + "num_tokens": 401135137.0, + "step": 2388 + }, + { + "entropy": 1.7019244929154713, + "epoch": 0.2624481612699459, + "grad_norm": 0.6432830095291138, + "learning_rate": 1.9521588132335352e-05, + "loss": 1.303, + "mean_token_accuracy": 0.6735948820908865, + "num_tokens": 401289759.0, + "step": 2389 + }, + { + "entropy": 1.7148842712243397, + "epoch": 0.26255801818131885, + "grad_norm": 0.7368968725204468, + "learning_rate": 1.952107292129884e-05, + "loss": 1.4875, + "mean_token_accuracy": 0.6443512886762619, + "num_tokens": 401476228.0, + "step": 2390 + }, + { + "entropy": 1.6623288591702778, + "epoch": 0.2626678750926918, + "grad_norm": 0.6117868423461914, + "learning_rate": 1.952055744057421e-05, + "loss": 1.316, + "mean_token_accuracy": 0.6717942307392756, + "num_tokens": 401697757.0, + "step": 2391 + }, + { + "entropy": 1.7370295623938243, + "epoch": 0.2627777320040647, + "grad_norm": 0.5878593921661377, + "learning_rate": 1.9520041690177775e-05, + "loss": 1.3522, + "mean_token_accuracy": 0.6460974911848704, + "num_tokens": 401882412.0, + "step": 2392 + }, + { + "entropy": 1.7986371119817097, + "epoch": 0.2628875889154376, + "grad_norm": 0.6486433744430542, + "learning_rate": 1.9519525670125857e-05, + "loss": 1.3154, + "mean_token_accuracy": 0.6704276005427042, + "num_tokens": 402013794.0, + "step": 2393 + }, + { + "entropy": 1.6841194530328114, + "epoch": 0.26299744582681056, + "grad_norm": 0.5892508625984192, + "learning_rate": 1.951900938043479e-05, + "loss": 1.4242, + "mean_token_accuracy": 0.6558033525943756, + "num_tokens": 402207088.0, + "step": 2394 + }, + { + "entropy": 1.7395052810509999, + "epoch": 0.2631073027381835, + "grad_norm": 0.7096725106239319, + "learning_rate": 1.951849282112092e-05, + "loss": 1.4639, + "mean_token_accuracy": 0.6478641132513682, + "num_tokens": 402393514.0, + "step": 2395 + }, + { + "entropy": 1.7166326642036438, + "epoch": 0.26321715964955644, + "grad_norm": 0.6767532229423523, + "learning_rate": 1.9517975992200588e-05, + "loss": 1.4591, + "mean_token_accuracy": 0.6498099068800608, + "num_tokens": 402598639.0, + "step": 2396 + }, + { + "entropy": 1.679230511188507, + "epoch": 0.2633270165609294, + "grad_norm": 0.6896412372589111, + "learning_rate": 1.9517458893690154e-05, + "loss": 1.4353, + "mean_token_accuracy": 0.6464412113030752, + "num_tokens": 402787828.0, + "step": 2397 + }, + { + "entropy": 1.7297282814979553, + "epoch": 0.2634368734723023, + "grad_norm": 0.8391108512878418, + "learning_rate": 1.9516941525605985e-05, + "loss": 1.5188, + "mean_token_accuracy": 0.6511440972487131, + "num_tokens": 402958227.0, + "step": 2398 + }, + { + "entropy": 1.7749987840652466, + "epoch": 0.26354673038367527, + "grad_norm": 0.7650160193443298, + "learning_rate": 1.9516423887964454e-05, + "loss": 1.3217, + "mean_token_accuracy": 0.6643730302651724, + "num_tokens": 403099927.0, + "step": 2399 + }, + { + "entropy": 1.6371342937151592, + "epoch": 0.2636565872950482, + "grad_norm": 0.6480301022529602, + "learning_rate": 1.9515905980781944e-05, + "loss": 1.3876, + "mean_token_accuracy": 0.6590510805447897, + "num_tokens": 403265885.0, + "step": 2400 + }, + { + "entropy": 1.686698744694392, + "epoch": 0.26376644420642115, + "grad_norm": 0.6983146667480469, + "learning_rate": 1.9515387804074845e-05, + "loss": 1.4885, + "mean_token_accuracy": 0.6543626685937246, + "num_tokens": 403427044.0, + "step": 2401 + }, + { + "entropy": 1.7273488442103069, + "epoch": 0.2638763011177941, + "grad_norm": 0.7281332015991211, + "learning_rate": 1.9514869357859565e-05, + "loss": 1.4179, + "mean_token_accuracy": 0.651305079460144, + "num_tokens": 403589146.0, + "step": 2402 + }, + { + "entropy": 1.6473909517129262, + "epoch": 0.26398615802916703, + "grad_norm": 0.6067411303520203, + "learning_rate": 1.95143506421525e-05, + "loss": 1.4189, + "mean_token_accuracy": 0.6576470931371053, + "num_tokens": 403788427.0, + "step": 2403 + }, + { + "entropy": 1.6839656432469685, + "epoch": 0.26409601494053997, + "grad_norm": 0.7861200571060181, + "learning_rate": 1.9513831656970078e-05, + "loss": 1.2481, + "mean_token_accuracy": 0.6709416210651398, + "num_tokens": 403935089.0, + "step": 2404 + }, + { + "entropy": 1.656613161166509, + "epoch": 0.2642058718519129, + "grad_norm": 0.5541518926620483, + "learning_rate": 1.951331240232872e-05, + "loss": 1.5037, + "mean_token_accuracy": 0.6444850116968155, + "num_tokens": 404176482.0, + "step": 2405 + }, + { + "entropy": 1.6887876590092976, + "epoch": 0.2643157287632858, + "grad_norm": 0.6820752620697021, + "learning_rate": 1.9512792878244863e-05, + "loss": 1.3964, + "mean_token_accuracy": 0.6626077542702357, + "num_tokens": 404329820.0, + "step": 2406 + }, + { + "entropy": 1.681986967722575, + "epoch": 0.26442558567465874, + "grad_norm": 0.6239963173866272, + "learning_rate": 1.9512273084734942e-05, + "loss": 1.3412, + "mean_token_accuracy": 0.6608551541964213, + "num_tokens": 404576812.0, + "step": 2407 + }, + { + "entropy": 1.695441444714864, + "epoch": 0.2645354425860317, + "grad_norm": 0.8196799755096436, + "learning_rate": 1.9511753021815418e-05, + "loss": 1.53, + "mean_token_accuracy": 0.6412807106971741, + "num_tokens": 404766940.0, + "step": 2408 + }, + { + "entropy": 1.6683510939280193, + "epoch": 0.2646452994974046, + "grad_norm": 0.6213725805282593, + "learning_rate": 1.9511232689502744e-05, + "loss": 1.403, + "mean_token_accuracy": 0.6513712704181671, + "num_tokens": 404960575.0, + "step": 2409 + }, + { + "entropy": 1.6807579199473064, + "epoch": 0.26475515640877756, + "grad_norm": 0.6994633078575134, + "learning_rate": 1.9510712087813392e-05, + "loss": 1.4742, + "mean_token_accuracy": 0.6587166041135788, + "num_tokens": 405117547.0, + "step": 2410 + }, + { + "entropy": 1.7289150754610698, + "epoch": 0.2648650133201505, + "grad_norm": 0.7993577122688293, + "learning_rate": 1.9510191216763836e-05, + "loss": 1.4124, + "mean_token_accuracy": 0.6584879656632742, + "num_tokens": 405246798.0, + "step": 2411 + }, + { + "entropy": 1.664443125327428, + "epoch": 0.26497487023152344, + "grad_norm": 0.6626906991004944, + "learning_rate": 1.9509670076370563e-05, + "loss": 1.3149, + "mean_token_accuracy": 0.6721779108047485, + "num_tokens": 405377290.0, + "step": 2412 + }, + { + "entropy": 1.7044403950373332, + "epoch": 0.2650847271428964, + "grad_norm": 0.7034863829612732, + "learning_rate": 1.9509148666650065e-05, + "loss": 1.3123, + "mean_token_accuracy": 0.6630191802978516, + "num_tokens": 405550836.0, + "step": 2413 + }, + { + "entropy": 1.7661688923835754, + "epoch": 0.2651945840542693, + "grad_norm": 0.7401782274246216, + "learning_rate": 1.9508626987618847e-05, + "loss": 1.3626, + "mean_token_accuracy": 0.6758743226528168, + "num_tokens": 405714724.0, + "step": 2414 + }, + { + "entropy": 1.65723983446757, + "epoch": 0.26530444096564226, + "grad_norm": 0.7574884295463562, + "learning_rate": 1.9508105039293422e-05, + "loss": 1.385, + "mean_token_accuracy": 0.6691581656535467, + "num_tokens": 405882381.0, + "step": 2415 + }, + { + "entropy": 1.7135234475135803, + "epoch": 0.2654142978770152, + "grad_norm": 0.670354425907135, + "learning_rate": 1.9507582821690308e-05, + "loss": 1.3597, + "mean_token_accuracy": 0.6573469589153925, + "num_tokens": 406058875.0, + "step": 2416 + }, + { + "entropy": 1.6622610290845234, + "epoch": 0.26552415478838814, + "grad_norm": 0.6127861142158508, + "learning_rate": 1.9507060334826024e-05, + "loss": 1.3034, + "mean_token_accuracy": 0.6667560587326685, + "num_tokens": 406201318.0, + "step": 2417 + }, + { + "entropy": 1.692472666501999, + "epoch": 0.2656340116997611, + "grad_norm": 0.6307097673416138, + "learning_rate": 1.9506537578717116e-05, + "loss": 1.3596, + "mean_token_accuracy": 0.6713347236315409, + "num_tokens": 406390930.0, + "step": 2418 + }, + { + "entropy": 1.7068506876627605, + "epoch": 0.26574386861113397, + "grad_norm": 0.6524941325187683, + "learning_rate": 1.9506014553380134e-05, + "loss": 1.3678, + "mean_token_accuracy": 0.6618810991446177, + "num_tokens": 406554961.0, + "step": 2419 + }, + { + "entropy": 1.7632996737957, + "epoch": 0.2658537255225069, + "grad_norm": 0.5874424576759338, + "learning_rate": 1.9505491258831615e-05, + "loss": 1.4096, + "mean_token_accuracy": 0.6508981684843699, + "num_tokens": 406756062.0, + "step": 2420 + }, + { + "entropy": 1.6941703458627064, + "epoch": 0.26596358243387985, + "grad_norm": 0.7010703086853027, + "learning_rate": 1.9504967695088135e-05, + "loss": 1.2203, + "mean_token_accuracy": 0.6803840696811676, + "num_tokens": 406885417.0, + "step": 2421 + }, + { + "entropy": 1.6123863955338795, + "epoch": 0.2660734393452528, + "grad_norm": 0.7297987341880798, + "learning_rate": 1.9504443862166258e-05, + "loss": 1.3876, + "mean_token_accuracy": 0.664091577132543, + "num_tokens": 407036264.0, + "step": 2422 + }, + { + "entropy": 1.6742206513881683, + "epoch": 0.26618329625662573, + "grad_norm": 0.6762830018997192, + "learning_rate": 1.9503919760082566e-05, + "loss": 1.4989, + "mean_token_accuracy": 0.6637553547819456, + "num_tokens": 407194733.0, + "step": 2423 + }, + { + "entropy": 1.6779598693052928, + "epoch": 0.2662931531679987, + "grad_norm": 0.6086080074310303, + "learning_rate": 1.9503395388853646e-05, + "loss": 1.3268, + "mean_token_accuracy": 0.6625054279963175, + "num_tokens": 407344026.0, + "step": 2424 + }, + { + "entropy": 1.7654085954030354, + "epoch": 0.2664030100793716, + "grad_norm": 0.7018662691116333, + "learning_rate": 1.950287074849609e-05, + "loss": 1.5278, + "mean_token_accuracy": 0.6402523169914881, + "num_tokens": 407520433.0, + "step": 2425 + }, + { + "entropy": 1.6793859700361888, + "epoch": 0.26651286699074456, + "grad_norm": 0.7431981563568115, + "learning_rate": 1.9502345839026508e-05, + "loss": 1.4142, + "mean_token_accuracy": 0.6648030032714208, + "num_tokens": 407670492.0, + "step": 2426 + }, + { + "entropy": 1.7211259802182515, + "epoch": 0.2666227239021175, + "grad_norm": 0.7325606346130371, + "learning_rate": 1.9501820660461515e-05, + "loss": 1.377, + "mean_token_accuracy": 0.6672718872626623, + "num_tokens": 407799323.0, + "step": 2427 + }, + { + "entropy": 1.6678573687871296, + "epoch": 0.26673258081349044, + "grad_norm": 0.7462669014930725, + "learning_rate": 1.9501295212817725e-05, + "loss": 1.3763, + "mean_token_accuracy": 0.6571807414293289, + "num_tokens": 408006005.0, + "step": 2428 + }, + { + "entropy": 1.7572451035181682, + "epoch": 0.2668424377248634, + "grad_norm": 0.5830179452896118, + "learning_rate": 1.9500769496111774e-05, + "loss": 1.5124, + "mean_token_accuracy": 0.640462522705396, + "num_tokens": 408199992.0, + "step": 2429 + }, + { + "entropy": 1.6478756566842396, + "epoch": 0.2669522946362363, + "grad_norm": 0.8301806449890137, + "learning_rate": 1.95002435103603e-05, + "loss": 1.2939, + "mean_token_accuracy": 0.6621855149666468, + "num_tokens": 408334984.0, + "step": 2430 + }, + { + "entropy": 1.7445407410462697, + "epoch": 0.26706215154760926, + "grad_norm": 0.67365562915802, + "learning_rate": 1.949971725557995e-05, + "loss": 1.3452, + "mean_token_accuracy": 0.6596545328696569, + "num_tokens": 408455952.0, + "step": 2431 + }, + { + "entropy": 1.6797068814436595, + "epoch": 0.2671720084589822, + "grad_norm": 0.7494860291481018, + "learning_rate": 1.9499190731787376e-05, + "loss": 1.4513, + "mean_token_accuracy": 0.6530092904965082, + "num_tokens": 408607105.0, + "step": 2432 + }, + { + "entropy": 1.7337973912556965, + "epoch": 0.2672818653703551, + "grad_norm": 0.6328655481338501, + "learning_rate": 1.9498663938999244e-05, + "loss": 1.4147, + "mean_token_accuracy": 0.6540708690881729, + "num_tokens": 408779812.0, + "step": 2433 + }, + { + "entropy": 1.7009416421254475, + "epoch": 0.26739172228172803, + "grad_norm": 0.6963172554969788, + "learning_rate": 1.949813687723223e-05, + "loss": 1.2986, + "mean_token_accuracy": 0.6816080609957377, + "num_tokens": 408938627.0, + "step": 2434 + }, + { + "entropy": 1.693843275308609, + "epoch": 0.26750157919310097, + "grad_norm": 0.7338165640830994, + "learning_rate": 1.9497609546503017e-05, + "loss": 1.2886, + "mean_token_accuracy": 0.6752390662829081, + "num_tokens": 409084769.0, + "step": 2435 + }, + { + "entropy": 1.7020284434159596, + "epoch": 0.2676114361044739, + "grad_norm": 0.6143628358840942, + "learning_rate": 1.9497081946828287e-05, + "loss": 1.5255, + "mean_token_accuracy": 0.648758257428805, + "num_tokens": 409265453.0, + "step": 2436 + }, + { + "entropy": 1.727043906847636, + "epoch": 0.26772129301584685, + "grad_norm": 0.5915109515190125, + "learning_rate": 1.9496554078224743e-05, + "loss": 1.5126, + "mean_token_accuracy": 0.6246416866779327, + "num_tokens": 409486969.0, + "step": 2437 + }, + { + "entropy": 1.6795673767725627, + "epoch": 0.2678311499272198, + "grad_norm": 0.6236563920974731, + "learning_rate": 1.949602594070909e-05, + "loss": 1.4403, + "mean_token_accuracy": 0.6531639695167542, + "num_tokens": 409666706.0, + "step": 2438 + }, + { + "entropy": 1.6891235609849293, + "epoch": 0.26794100683859273, + "grad_norm": 0.774161696434021, + "learning_rate": 1.949549753429804e-05, + "loss": 1.3583, + "mean_token_accuracy": 0.6626383264859518, + "num_tokens": 409838047.0, + "step": 2439 + }, + { + "entropy": 1.737920731306076, + "epoch": 0.2680508637499657, + "grad_norm": 0.9070193767547607, + "learning_rate": 1.949496885900833e-05, + "loss": 1.4545, + "mean_token_accuracy": 0.6440466195344925, + "num_tokens": 410038258.0, + "step": 2440 + }, + { + "entropy": 1.6907021900018055, + "epoch": 0.2681607206613386, + "grad_norm": 0.7839999198913574, + "learning_rate": 1.949443991485668e-05, + "loss": 1.2923, + "mean_token_accuracy": 0.6767375022172928, + "num_tokens": 410213875.0, + "step": 2441 + }, + { + "entropy": 1.6959629952907562, + "epoch": 0.26827057757271155, + "grad_norm": 0.7793353199958801, + "learning_rate": 1.9493910701859832e-05, + "loss": 1.2393, + "mean_token_accuracy": 0.6860839327176412, + "num_tokens": 410367133.0, + "step": 2442 + }, + { + "entropy": 1.7073424855868022, + "epoch": 0.2683804344840845, + "grad_norm": 0.688703179359436, + "learning_rate": 1.949338122003454e-05, + "loss": 1.4447, + "mean_token_accuracy": 0.6503096967935562, + "num_tokens": 410551050.0, + "step": 2443 + }, + { + "entropy": 1.7885093291600545, + "epoch": 0.26849029139545744, + "grad_norm": 0.6733722686767578, + "learning_rate": 1.949285146939756e-05, + "loss": 1.5393, + "mean_token_accuracy": 0.6366397589445114, + "num_tokens": 410772639.0, + "step": 2444 + }, + { + "entropy": 1.7265475789705913, + "epoch": 0.2686001483068304, + "grad_norm": 0.7780793905258179, + "learning_rate": 1.9492321449965657e-05, + "loss": 1.3903, + "mean_token_accuracy": 0.6643284608920416, + "num_tokens": 410931367.0, + "step": 2445 + }, + { + "entropy": 1.6977720061937969, + "epoch": 0.26871000521820326, + "grad_norm": 0.7139464020729065, + "learning_rate": 1.949179116175561e-05, + "loss": 1.3582, + "mean_token_accuracy": 0.6607271184523901, + "num_tokens": 411120350.0, + "step": 2446 + }, + { + "entropy": 1.7429304122924805, + "epoch": 0.2688198621295762, + "grad_norm": 0.7446684241294861, + "learning_rate": 1.9491260604784196e-05, + "loss": 1.5354, + "mean_token_accuracy": 0.6431114623943964, + "num_tokens": 411333636.0, + "step": 2447 + }, + { + "entropy": 1.6738013923168182, + "epoch": 0.26892971904094914, + "grad_norm": 0.7107298970222473, + "learning_rate": 1.949072977906821e-05, + "loss": 1.3707, + "mean_token_accuracy": 0.6652788172165552, + "num_tokens": 411496037.0, + "step": 2448 + }, + { + "entropy": 1.7873223821322124, + "epoch": 0.2690395759523221, + "grad_norm": 0.749606728553772, + "learning_rate": 1.9490198684624456e-05, + "loss": 1.5195, + "mean_token_accuracy": 0.6423113072911898, + "num_tokens": 411652722.0, + "step": 2449 + }, + { + "entropy": 1.7064524590969086, + "epoch": 0.269149432863695, + "grad_norm": 0.6809104084968567, + "learning_rate": 1.9489667321469733e-05, + "loss": 1.492, + "mean_token_accuracy": 0.6354218969742457, + "num_tokens": 411817928.0, + "step": 2450 + }, + { + "entropy": 1.6833800375461578, + "epoch": 0.26925928977506797, + "grad_norm": 0.7017293572425842, + "learning_rate": 1.948913568962087e-05, + "loss": 1.4219, + "mean_token_accuracy": 0.6560679723819097, + "num_tokens": 411959682.0, + "step": 2451 + }, + { + "entropy": 1.7220069666703541, + "epoch": 0.2693691466864409, + "grad_norm": 0.7343468070030212, + "learning_rate": 1.9488603789094687e-05, + "loss": 1.1667, + "mean_token_accuracy": 0.6866755535205206, + "num_tokens": 412080212.0, + "step": 2452 + }, + { + "entropy": 1.694863756497701, + "epoch": 0.26947900359781385, + "grad_norm": 0.5863001942634583, + "learning_rate": 1.9488071619908016e-05, + "loss": 1.3702, + "mean_token_accuracy": 0.6538634697596232, + "num_tokens": 412248760.0, + "step": 2453 + }, + { + "entropy": 1.7061160604159038, + "epoch": 0.2695888605091868, + "grad_norm": 0.6747974753379822, + "learning_rate": 1.9487539182077707e-05, + "loss": 1.4749, + "mean_token_accuracy": 0.6355902552604675, + "num_tokens": 412431548.0, + "step": 2454 + }, + { + "entropy": 1.7680587371190388, + "epoch": 0.26969871742055973, + "grad_norm": 0.6596987843513489, + "learning_rate": 1.9487006475620606e-05, + "loss": 1.4807, + "mean_token_accuracy": 0.6433413873116175, + "num_tokens": 412608344.0, + "step": 2455 + }, + { + "entropy": 1.6938173572222393, + "epoch": 0.26980857433193267, + "grad_norm": 0.6757233738899231, + "learning_rate": 1.9486473500553575e-05, + "loss": 1.4271, + "mean_token_accuracy": 0.6577209134896597, + "num_tokens": 412769029.0, + "step": 2456 + }, + { + "entropy": 1.666108379761378, + "epoch": 0.2699184312433056, + "grad_norm": 0.8024738430976868, + "learning_rate": 1.9485940256893477e-05, + "loss": 1.4351, + "mean_token_accuracy": 0.6482095420360565, + "num_tokens": 412906545.0, + "step": 2457 + }, + { + "entropy": 1.7094764014085133, + "epoch": 0.27002828815467855, + "grad_norm": 0.6070849299430847, + "learning_rate": 1.94854067446572e-05, + "loss": 1.3914, + "mean_token_accuracy": 0.658433347940445, + "num_tokens": 413051352.0, + "step": 2458 + }, + { + "entropy": 1.740122099717458, + "epoch": 0.2701381450660515, + "grad_norm": 0.7562992572784424, + "learning_rate": 1.948487296386162e-05, + "loss": 1.4434, + "mean_token_accuracy": 0.664603571097056, + "num_tokens": 413255124.0, + "step": 2459 + }, + { + "entropy": 1.7619945506254833, + "epoch": 0.2702480019774244, + "grad_norm": 0.6791596412658691, + "learning_rate": 1.9484338914523634e-05, + "loss": 1.4725, + "mean_token_accuracy": 0.6506190747022629, + "num_tokens": 413466529.0, + "step": 2460 + }, + { + "entropy": 1.6560695469379425, + "epoch": 0.2703578588887973, + "grad_norm": 0.6612438559532166, + "learning_rate": 1.9483804596660144e-05, + "loss": 1.2345, + "mean_token_accuracy": 0.6793079773585001, + "num_tokens": 413646644.0, + "step": 2461 + }, + { + "entropy": 1.7678084075450897, + "epoch": 0.27046771580017026, + "grad_norm": 0.7152573466300964, + "learning_rate": 1.9483270010288064e-05, + "loss": 1.5106, + "mean_token_accuracy": 0.6360708425442377, + "num_tokens": 413830566.0, + "step": 2462 + }, + { + "entropy": 1.715853621562322, + "epoch": 0.2705775727115432, + "grad_norm": 0.700097382068634, + "learning_rate": 1.948273515542431e-05, + "loss": 1.4053, + "mean_token_accuracy": 0.6485730955998102, + "num_tokens": 413993907.0, + "step": 2463 + }, + { + "entropy": 1.741620510816574, + "epoch": 0.27068742962291614, + "grad_norm": 0.6623098850250244, + "learning_rate": 1.948220003208581e-05, + "loss": 1.5757, + "mean_token_accuracy": 0.6438944588104883, + "num_tokens": 414233249.0, + "step": 2464 + }, + { + "entropy": 1.755933254957199, + "epoch": 0.2707972865342891, + "grad_norm": 0.7296027541160583, + "learning_rate": 1.9481664640289503e-05, + "loss": 1.6268, + "mean_token_accuracy": 0.628890261054039, + "num_tokens": 414414584.0, + "step": 2465 + }, + { + "entropy": 1.784269521633784, + "epoch": 0.270907143445662, + "grad_norm": 0.6351885795593262, + "learning_rate": 1.9481128980052328e-05, + "loss": 1.4174, + "mean_token_accuracy": 0.6424904266993204, + "num_tokens": 414551766.0, + "step": 2466 + }, + { + "entropy": 1.6945275962352753, + "epoch": 0.27101700035703497, + "grad_norm": 0.7583399415016174, + "learning_rate": 1.948059305139125e-05, + "loss": 1.2475, + "mean_token_accuracy": 0.6736636360486349, + "num_tokens": 414688590.0, + "step": 2467 + }, + { + "entropy": 1.7745845814545949, + "epoch": 0.2711268572684079, + "grad_norm": 0.7532066106796265, + "learning_rate": 1.9480056854323214e-05, + "loss": 1.3369, + "mean_token_accuracy": 0.6628710478544235, + "num_tokens": 414788942.0, + "step": 2468 + }, + { + "entropy": 1.6538518170515697, + "epoch": 0.27123671417978085, + "grad_norm": 0.5937896370887756, + "learning_rate": 1.9479520388865206e-05, + "loss": 1.4212, + "mean_token_accuracy": 0.6629629383484522, + "num_tokens": 414982964.0, + "step": 2469 + }, + { + "entropy": 1.7156404356161754, + "epoch": 0.2713465710911538, + "grad_norm": 0.7552958726882935, + "learning_rate": 1.9478983655034195e-05, + "loss": 1.4249, + "mean_token_accuracy": 0.6556862344344457, + "num_tokens": 415179129.0, + "step": 2470 + }, + { + "entropy": 1.677052636941274, + "epoch": 0.27145642800252673, + "grad_norm": 0.6952337622642517, + "learning_rate": 1.9478446652847177e-05, + "loss": 1.404, + "mean_token_accuracy": 0.6581928680340449, + "num_tokens": 415343003.0, + "step": 2471 + }, + { + "entropy": 1.7252871096134186, + "epoch": 0.27156628491389967, + "grad_norm": 0.6775010824203491, + "learning_rate": 1.9477909382321138e-05, + "loss": 1.2959, + "mean_token_accuracy": 0.6630641867717108, + "num_tokens": 415454975.0, + "step": 2472 + }, + { + "entropy": 1.7001695533593495, + "epoch": 0.2716761418252726, + "grad_norm": 0.6910304427146912, + "learning_rate": 1.947737184347309e-05, + "loss": 1.4267, + "mean_token_accuracy": 0.6569666018088659, + "num_tokens": 415659032.0, + "step": 2473 + }, + { + "entropy": 1.6541123191515605, + "epoch": 0.2717859987366455, + "grad_norm": 0.6405661106109619, + "learning_rate": 1.9476834036320044e-05, + "loss": 1.3477, + "mean_token_accuracy": 0.6658432185649872, + "num_tokens": 415812784.0, + "step": 2474 + }, + { + "entropy": 1.6903918882211049, + "epoch": 0.27189585564801844, + "grad_norm": 0.6007583141326904, + "learning_rate": 1.9476295960879015e-05, + "loss": 1.4172, + "mean_token_accuracy": 0.6355639646450678, + "num_tokens": 416109941.0, + "step": 2475 + }, + { + "entropy": 1.7218912939230602, + "epoch": 0.2720057125593914, + "grad_norm": 0.693682074546814, + "learning_rate": 1.947575761716704e-05, + "loss": 1.3423, + "mean_token_accuracy": 0.6594141821066538, + "num_tokens": 416216952.0, + "step": 2476 + }, + { + "entropy": 1.699045052131017, + "epoch": 0.2721155694707643, + "grad_norm": 0.884042501449585, + "learning_rate": 1.947521900520116e-05, + "loss": 1.4059, + "mean_token_accuracy": 0.6671479294697443, + "num_tokens": 416341238.0, + "step": 2477 + }, + { + "entropy": 1.7600041528542836, + "epoch": 0.27222542638213726, + "grad_norm": 0.7523099184036255, + "learning_rate": 1.9474680124998414e-05, + "loss": 1.4538, + "mean_token_accuracy": 0.645432690779368, + "num_tokens": 416550006.0, + "step": 2478 + }, + { + "entropy": 1.7305284440517426, + "epoch": 0.2723352832935102, + "grad_norm": 0.8291406631469727, + "learning_rate": 1.9474140976575862e-05, + "loss": 1.4125, + "mean_token_accuracy": 0.6581102510293325, + "num_tokens": 416699068.0, + "step": 2479 + }, + { + "entropy": 1.7285153965155284, + "epoch": 0.27244514020488314, + "grad_norm": 0.6339166164398193, + "learning_rate": 1.9473601559950566e-05, + "loss": 1.3232, + "mean_token_accuracy": 0.6745259314775467, + "num_tokens": 416836056.0, + "step": 2480 + }, + { + "entropy": 1.6818051934242249, + "epoch": 0.2725549971162561, + "grad_norm": 0.6953151226043701, + "learning_rate": 1.9473061875139603e-05, + "loss": 1.3799, + "mean_token_accuracy": 0.6700175007184347, + "num_tokens": 417010817.0, + "step": 2481 + }, + { + "entropy": 1.744279553492864, + "epoch": 0.272664854027629, + "grad_norm": 0.6217046976089478, + "learning_rate": 1.9472521922160044e-05, + "loss": 1.395, + "mean_token_accuracy": 0.6566085666418076, + "num_tokens": 417189250.0, + "step": 2482 + }, + { + "entropy": 1.7224168479442596, + "epoch": 0.27277471093900196, + "grad_norm": 0.6365125775337219, + "learning_rate": 1.9471981701028988e-05, + "loss": 1.4695, + "mean_token_accuracy": 0.6413618673880895, + "num_tokens": 417406226.0, + "step": 2483 + }, + { + "entropy": 1.7924610773722331, + "epoch": 0.2728845678503749, + "grad_norm": 0.9400225281715393, + "learning_rate": 1.9471441211763526e-05, + "loss": 1.4758, + "mean_token_accuracy": 0.6626063287258148, + "num_tokens": 417552783.0, + "step": 2484 + }, + { + "entropy": 1.718948523203532, + "epoch": 0.27299442476174784, + "grad_norm": 0.7052686214447021, + "learning_rate": 1.947090045438077e-05, + "loss": 1.418, + "mean_token_accuracy": 0.6460021386543909, + "num_tokens": 417764247.0, + "step": 2485 + }, + { + "entropy": 1.7755232155323029, + "epoch": 0.2731042816731208, + "grad_norm": 0.8216487169265747, + "learning_rate": 1.9470359428897827e-05, + "loss": 1.4258, + "mean_token_accuracy": 0.6543919444084167, + "num_tokens": 417922392.0, + "step": 2486 + }, + { + "entropy": 1.6645058989524841, + "epoch": 0.27321413858449367, + "grad_norm": 0.5918928980827332, + "learning_rate": 1.946981813533183e-05, + "loss": 1.449, + "mean_token_accuracy": 0.6550898949305216, + "num_tokens": 418126736.0, + "step": 2487 + }, + { + "entropy": 1.7120232780774434, + "epoch": 0.2733239954958666, + "grad_norm": 0.6520063877105713, + "learning_rate": 1.9469276573699902e-05, + "loss": 1.5254, + "mean_token_accuracy": 0.6442107409238815, + "num_tokens": 418302170.0, + "step": 2488 + }, + { + "entropy": 1.690334975719452, + "epoch": 0.27343385240723955, + "grad_norm": 0.6218593120574951, + "learning_rate": 1.9468734744019187e-05, + "loss": 1.5628, + "mean_token_accuracy": 0.6418820967276891, + "num_tokens": 418522765.0, + "step": 2489 + }, + { + "entropy": 1.771299680074056, + "epoch": 0.2735437093186125, + "grad_norm": 0.6886900663375854, + "learning_rate": 1.9468192646306836e-05, + "loss": 1.4371, + "mean_token_accuracy": 0.6507747322320938, + "num_tokens": 418675411.0, + "step": 2490 + }, + { + "entropy": 1.7038741906483967, + "epoch": 0.27365356622998543, + "grad_norm": 0.7144287824630737, + "learning_rate": 1.9467650280580002e-05, + "loss": 1.5871, + "mean_token_accuracy": 0.6539332419633865, + "num_tokens": 418883702.0, + "step": 2491 + }, + { + "entropy": 1.6580975651741028, + "epoch": 0.2737634231413584, + "grad_norm": 0.6173264384269714, + "learning_rate": 1.946710764685585e-05, + "loss": 1.4252, + "mean_token_accuracy": 0.6434942533572515, + "num_tokens": 419136995.0, + "step": 2492 + }, + { + "entropy": 1.6869386335213978, + "epoch": 0.2738732800527313, + "grad_norm": 8.725295066833496, + "learning_rate": 1.946656474515156e-05, + "loss": 1.3962, + "mean_token_accuracy": 0.6559255520502726, + "num_tokens": 419314327.0, + "step": 2493 + }, + { + "entropy": 1.664640615383784, + "epoch": 0.27398313696410426, + "grad_norm": 0.7240090370178223, + "learning_rate": 1.946602157548431e-05, + "loss": 1.2949, + "mean_token_accuracy": 0.6713322947422663, + "num_tokens": 419461578.0, + "step": 2494 + }, + { + "entropy": 1.6624565223852794, + "epoch": 0.2740929938754772, + "grad_norm": 1.0136929750442505, + "learning_rate": 1.946547813787129e-05, + "loss": 1.3312, + "mean_token_accuracy": 0.6643867244323095, + "num_tokens": 419636649.0, + "step": 2495 + }, + { + "entropy": 1.6829350888729095, + "epoch": 0.27420285078685014, + "grad_norm": 0.6138503551483154, + "learning_rate": 1.9464934432329706e-05, + "loss": 1.4834, + "mean_token_accuracy": 0.6375181674957275, + "num_tokens": 419815875.0, + "step": 2496 + }, + { + "entropy": 1.7264296412467957, + "epoch": 0.2743127076982231, + "grad_norm": 0.6041257977485657, + "learning_rate": 1.9464390458876757e-05, + "loss": 1.3408, + "mean_token_accuracy": 0.6562155981858572, + "num_tokens": 419975058.0, + "step": 2497 + }, + { + "entropy": 1.7106037835280101, + "epoch": 0.274422564609596, + "grad_norm": 0.7350174784660339, + "learning_rate": 1.9463846217529666e-05, + "loss": 1.4776, + "mean_token_accuracy": 0.6583420137564341, + "num_tokens": 420145254.0, + "step": 2498 + }, + { + "entropy": 1.7087813913822174, + "epoch": 0.27453242152096896, + "grad_norm": 0.652927577495575, + "learning_rate": 1.9463301708305654e-05, + "loss": 1.3691, + "mean_token_accuracy": 0.6640399495760599, + "num_tokens": 420326840.0, + "step": 2499 + }, + { + "entropy": 1.702909102042516, + "epoch": 0.2746422784323419, + "grad_norm": 0.6692368984222412, + "learning_rate": 1.946275693122196e-05, + "loss": 1.3357, + "mean_token_accuracy": 0.6595581869284312, + "num_tokens": 420463074.0, + "step": 2500 + }, + { + "entropy": 1.7041659752527873, + "epoch": 0.2747521353437148, + "grad_norm": 0.8389718532562256, + "learning_rate": 1.9462211886295823e-05, + "loss": 1.4709, + "mean_token_accuracy": 0.6703790177901586, + "num_tokens": 420626300.0, + "step": 2501 + }, + { + "entropy": 1.750393122434616, + "epoch": 0.27486199225508773, + "grad_norm": 0.6642520427703857, + "learning_rate": 1.9461666573544488e-05, + "loss": 1.3781, + "mean_token_accuracy": 0.6633904526631037, + "num_tokens": 420765456.0, + "step": 2502 + }, + { + "entropy": 1.7149128119150798, + "epoch": 0.27497184916646067, + "grad_norm": 0.6768452525138855, + "learning_rate": 1.9461120992985222e-05, + "loss": 1.4081, + "mean_token_accuracy": 0.6513208548227946, + "num_tokens": 420919832.0, + "step": 2503 + }, + { + "entropy": 1.7376088599363964, + "epoch": 0.2750817060778336, + "grad_norm": 0.6658154129981995, + "learning_rate": 1.946057514463529e-05, + "loss": 1.3253, + "mean_token_accuracy": 0.6688035577535629, + "num_tokens": 421060527.0, + "step": 2504 + }, + { + "entropy": 1.7594562570254009, + "epoch": 0.27519156298920655, + "grad_norm": 0.7218078374862671, + "learning_rate": 1.9460029028511965e-05, + "loss": 1.3467, + "mean_token_accuracy": 0.6551917244990667, + "num_tokens": 421228365.0, + "step": 2505 + }, + { + "entropy": 1.7568883796532948, + "epoch": 0.2753014199005795, + "grad_norm": 0.9153607487678528, + "learning_rate": 1.9459482644632537e-05, + "loss": 1.4104, + "mean_token_accuracy": 0.6578507423400879, + "num_tokens": 421371677.0, + "step": 2506 + }, + { + "entropy": 1.6713534692923229, + "epoch": 0.27541127681195243, + "grad_norm": 0.782477617263794, + "learning_rate": 1.9458935993014292e-05, + "loss": 1.219, + "mean_token_accuracy": 0.680430273214976, + "num_tokens": 421521194.0, + "step": 2507 + }, + { + "entropy": 1.7415608565012615, + "epoch": 0.2755211337233254, + "grad_norm": 0.8416798710823059, + "learning_rate": 1.9458389073674536e-05, + "loss": 1.3152, + "mean_token_accuracy": 0.6562491556008657, + "num_tokens": 421672704.0, + "step": 2508 + }, + { + "entropy": 1.6363226175308228, + "epoch": 0.2756309906346983, + "grad_norm": 0.730694055557251, + "learning_rate": 1.9457841886630576e-05, + "loss": 1.3778, + "mean_token_accuracy": 0.6615554342667261, + "num_tokens": 421828497.0, + "step": 2509 + }, + { + "entropy": 1.6957507530848186, + "epoch": 0.27574084754607125, + "grad_norm": 0.6770949363708496, + "learning_rate": 1.9457294431899733e-05, + "loss": 1.3599, + "mean_token_accuracy": 0.6570597738027573, + "num_tokens": 421983856.0, + "step": 2510 + }, + { + "entropy": 1.6833610932032268, + "epoch": 0.2758507044574442, + "grad_norm": 0.7206348776817322, + "learning_rate": 1.9456746709499332e-05, + "loss": 1.2937, + "mean_token_accuracy": 0.6710290809472402, + "num_tokens": 422128478.0, + "step": 2511 + }, + { + "entropy": 1.6595212817192078, + "epoch": 0.27596056136881714, + "grad_norm": 0.6251102685928345, + "learning_rate": 1.945619871944671e-05, + "loss": 1.3831, + "mean_token_accuracy": 0.6632163723309835, + "num_tokens": 422334158.0, + "step": 2512 + }, + { + "entropy": 1.7431277732054393, + "epoch": 0.2760704182801901, + "grad_norm": 0.7098563313484192, + "learning_rate": 1.9455650461759202e-05, + "loss": 1.3254, + "mean_token_accuracy": 0.6785684078931808, + "num_tokens": 422495092.0, + "step": 2513 + }, + { + "entropy": 1.7170985639095306, + "epoch": 0.27618027519156296, + "grad_norm": 0.6277499198913574, + "learning_rate": 1.9455101936454174e-05, + "loss": 1.2778, + "mean_token_accuracy": 0.6751286735137304, + "num_tokens": 422625201.0, + "step": 2514 + }, + { + "entropy": 1.7012461324532826, + "epoch": 0.2762901321029359, + "grad_norm": 0.608238697052002, + "learning_rate": 1.9454553143548977e-05, + "loss": 1.3602, + "mean_token_accuracy": 0.6591375966866811, + "num_tokens": 422785134.0, + "step": 2515 + }, + { + "entropy": 1.729185124238332, + "epoch": 0.27639998901430884, + "grad_norm": 0.6700869202613831, + "learning_rate": 1.945400408306098e-05, + "loss": 1.4432, + "mean_token_accuracy": 0.6543838481108347, + "num_tokens": 422931936.0, + "step": 2516 + }, + { + "entropy": 1.747061401605606, + "epoch": 0.2765098459256818, + "grad_norm": 0.9989476203918457, + "learning_rate": 1.945345475500757e-05, + "loss": 1.1749, + "mean_token_accuracy": 0.6789939254522324, + "num_tokens": 423050412.0, + "step": 2517 + }, + { + "entropy": 1.7638193666934967, + "epoch": 0.2766197028370547, + "grad_norm": 0.7275906205177307, + "learning_rate": 1.9452905159406124e-05, + "loss": 1.4777, + "mean_token_accuracy": 0.641241709391276, + "num_tokens": 423224244.0, + "step": 2518 + }, + { + "entropy": 1.686012178659439, + "epoch": 0.27672955974842767, + "grad_norm": 0.7584397792816162, + "learning_rate": 1.9452355296274036e-05, + "loss": 1.363, + "mean_token_accuracy": 0.6610731234153112, + "num_tokens": 423354122.0, + "step": 2519 + }, + { + "entropy": 1.7558989524841309, + "epoch": 0.2768394166598006, + "grad_norm": 0.8733357787132263, + "learning_rate": 1.9451805165628713e-05, + "loss": 1.4204, + "mean_token_accuracy": 0.6505479166905085, + "num_tokens": 423498784.0, + "step": 2520 + }, + { + "entropy": 1.7407618463039398, + "epoch": 0.27694927357117355, + "grad_norm": 0.6735062599182129, + "learning_rate": 1.9451254767487564e-05, + "loss": 1.3931, + "mean_token_accuracy": 0.6727576404809952, + "num_tokens": 423624180.0, + "step": 2521 + }, + { + "entropy": 1.7527087032794952, + "epoch": 0.2770591304825465, + "grad_norm": 0.7081560492515564, + "learning_rate": 1.9450704101868012e-05, + "loss": 1.3928, + "mean_token_accuracy": 0.652918224533399, + "num_tokens": 423776308.0, + "step": 2522 + }, + { + "entropy": 1.7728693286577861, + "epoch": 0.27716898739391943, + "grad_norm": 0.706057071685791, + "learning_rate": 1.945015316878748e-05, + "loss": 1.4191, + "mean_token_accuracy": 0.6447743972142538, + "num_tokens": 423957168.0, + "step": 2523 + }, + { + "entropy": 1.718264530102412, + "epoch": 0.27727884430529237, + "grad_norm": 0.666571319103241, + "learning_rate": 1.9449601968263413e-05, + "loss": 1.4477, + "mean_token_accuracy": 0.6623519708712896, + "num_tokens": 424136873.0, + "step": 2524 + }, + { + "entropy": 1.753323624531428, + "epoch": 0.2773887012166653, + "grad_norm": 0.7180684208869934, + "learning_rate": 1.9449050500313247e-05, + "loss": 1.2229, + "mean_token_accuracy": 0.680413618683815, + "num_tokens": 424229190.0, + "step": 2525 + }, + { + "entropy": 1.6851735214392345, + "epoch": 0.27749855812803825, + "grad_norm": 0.8312351703643799, + "learning_rate": 1.944849876495444e-05, + "loss": 1.318, + "mean_token_accuracy": 0.6703576147556305, + "num_tokens": 424367123.0, + "step": 2526 + }, + { + "entropy": 1.6827231248219807, + "epoch": 0.2776084150394112, + "grad_norm": 0.6143300533294678, + "learning_rate": 1.9447946762204454e-05, + "loss": 1.4498, + "mean_token_accuracy": 0.634101668993632, + "num_tokens": 424587988.0, + "step": 2527 + }, + { + "entropy": 1.6629830300807953, + "epoch": 0.2777182719507841, + "grad_norm": 0.6683552265167236, + "learning_rate": 1.944739449208076e-05, + "loss": 1.4396, + "mean_token_accuracy": 0.6639639983574549, + "num_tokens": 424786286.0, + "step": 2528 + }, + { + "entropy": 1.6775904496510823, + "epoch": 0.277828128862157, + "grad_norm": 0.6482076048851013, + "learning_rate": 1.944684195460084e-05, + "loss": 1.4997, + "mean_token_accuracy": 0.6430053263902664, + "num_tokens": 424967840.0, + "step": 2529 + }, + { + "entropy": 1.7745787998040516, + "epoch": 0.27793798577352996, + "grad_norm": 0.6843352913856506, + "learning_rate": 1.9446289149782175e-05, + "loss": 1.4148, + "mean_token_accuracy": 0.6597040891647339, + "num_tokens": 425137083.0, + "step": 2530 + }, + { + "entropy": 1.6696482102076213, + "epoch": 0.2780478426849029, + "grad_norm": 0.6596343517303467, + "learning_rate": 1.9445736077642266e-05, + "loss": 1.1965, + "mean_token_accuracy": 0.6828029155731201, + "num_tokens": 425253600.0, + "step": 2531 + }, + { + "entropy": 1.7446503738562267, + "epoch": 0.27815769959627584, + "grad_norm": 0.6544666290283203, + "learning_rate": 1.9445182738198614e-05, + "loss": 1.4899, + "mean_token_accuracy": 0.6364033321539561, + "num_tokens": 425425203.0, + "step": 2532 + }, + { + "entropy": 1.674493948618571, + "epoch": 0.2782675565076488, + "grad_norm": 0.6259612441062927, + "learning_rate": 1.944462913146874e-05, + "loss": 1.2664, + "mean_token_accuracy": 0.6673529297113419, + "num_tokens": 425549724.0, + "step": 2533 + }, + { + "entropy": 1.7295575936635335, + "epoch": 0.2783774134190217, + "grad_norm": 0.7398607730865479, + "learning_rate": 1.944407525747015e-05, + "loss": 1.4909, + "mean_token_accuracy": 0.6412216623624166, + "num_tokens": 425749328.0, + "step": 2534 + }, + { + "entropy": 1.759381393591563, + "epoch": 0.27848727033039467, + "grad_norm": 0.7434036135673523, + "learning_rate": 1.9443521116220386e-05, + "loss": 1.4622, + "mean_token_accuracy": 0.6621341158946356, + "num_tokens": 425963889.0, + "step": 2535 + }, + { + "entropy": 1.7146111925443013, + "epoch": 0.2785971272417676, + "grad_norm": 0.6938877105712891, + "learning_rate": 1.9442966707736987e-05, + "loss": 1.355, + "mean_token_accuracy": 0.657206580042839, + "num_tokens": 426114600.0, + "step": 2536 + }, + { + "entropy": 1.7992856403191884, + "epoch": 0.27870698415314055, + "grad_norm": 0.7209751009941101, + "learning_rate": 1.944241203203749e-05, + "loss": 1.5263, + "mean_token_accuracy": 0.6411692102750143, + "num_tokens": 426320358.0, + "step": 2537 + }, + { + "entropy": 1.688951204220454, + "epoch": 0.2788168410645135, + "grad_norm": 1.081633448600769, + "learning_rate": 1.9441857089139464e-05, + "loss": 1.2315, + "mean_token_accuracy": 0.6716073205073675, + "num_tokens": 426498576.0, + "step": 2538 + }, + { + "entropy": 1.7655569116274517, + "epoch": 0.27892669797588643, + "grad_norm": 0.8024057745933533, + "learning_rate": 1.944130187906046e-05, + "loss": 1.389, + "mean_token_accuracy": 0.6557512134313583, + "num_tokens": 426645970.0, + "step": 2539 + }, + { + "entropy": 1.6769147912661235, + "epoch": 0.27903655488725937, + "grad_norm": 0.7822548151016235, + "learning_rate": 1.944074640181806e-05, + "loss": 1.4512, + "mean_token_accuracy": 0.646538108587265, + "num_tokens": 426806650.0, + "step": 2540 + }, + { + "entropy": 1.7401223282019298, + "epoch": 0.27914641179863225, + "grad_norm": 0.753135085105896, + "learning_rate": 1.9440190657429833e-05, + "loss": 1.392, + "mean_token_accuracy": 0.661638930439949, + "num_tokens": 426943210.0, + "step": 2541 + }, + { + "entropy": 1.792117138703664, + "epoch": 0.2792562687100052, + "grad_norm": 1.012791633605957, + "learning_rate": 1.943963464591338e-05, + "loss": 1.4671, + "mean_token_accuracy": 0.6386220256487528, + "num_tokens": 427126900.0, + "step": 2542 + }, + { + "entropy": 1.6871282557646434, + "epoch": 0.27936612562137814, + "grad_norm": 0.6758045554161072, + "learning_rate": 1.943907836728629e-05, + "loss": 1.5413, + "mean_token_accuracy": 0.6594254424174627, + "num_tokens": 427294304.0, + "step": 2543 + }, + { + "entropy": 1.7226394315560658, + "epoch": 0.2794759825327511, + "grad_norm": 0.6298893094062805, + "learning_rate": 1.9438521821566178e-05, + "loss": 1.4598, + "mean_token_accuracy": 0.635049377878507, + "num_tokens": 427491263.0, + "step": 2544 + }, + { + "entropy": 1.7836333811283112, + "epoch": 0.279585839444124, + "grad_norm": 0.8018297553062439, + "learning_rate": 1.9437965008770647e-05, + "loss": 1.6433, + "mean_token_accuracy": 0.625967395802339, + "num_tokens": 427671894.0, + "step": 2545 + }, + { + "entropy": 1.731053650379181, + "epoch": 0.27969569635549696, + "grad_norm": 0.6557754278182983, + "learning_rate": 1.9437407928917327e-05, + "loss": 1.4259, + "mean_token_accuracy": 0.6569652110338211, + "num_tokens": 427880951.0, + "step": 2546 + }, + { + "entropy": 1.7691397269566853, + "epoch": 0.2798055532668699, + "grad_norm": 0.717713475227356, + "learning_rate": 1.943685058202385e-05, + "loss": 1.5615, + "mean_token_accuracy": 0.6480028629302979, + "num_tokens": 428026306.0, + "step": 2547 + }, + { + "entropy": 1.722131739060084, + "epoch": 0.27991541017824284, + "grad_norm": 0.706473708152771, + "learning_rate": 1.9436292968107854e-05, + "loss": 1.4702, + "mean_token_accuracy": 0.6523783256610235, + "num_tokens": 428229212.0, + "step": 2548 + }, + { + "entropy": 1.7450311680634816, + "epoch": 0.2800252670896158, + "grad_norm": 0.6654791831970215, + "learning_rate": 1.9435735087186985e-05, + "loss": 1.2768, + "mean_token_accuracy": 0.6689060380061468, + "num_tokens": 428410903.0, + "step": 2549 + }, + { + "entropy": 1.7161929905414581, + "epoch": 0.2801351240009887, + "grad_norm": 0.722743570804596, + "learning_rate": 1.9435176939278902e-05, + "loss": 1.4746, + "mean_token_accuracy": 0.6436196118593216, + "num_tokens": 428586998.0, + "step": 2550 + }, + { + "entropy": 1.7112940152486165, + "epoch": 0.28024498091236166, + "grad_norm": 0.6534221172332764, + "learning_rate": 1.9434618524401273e-05, + "loss": 1.3635, + "mean_token_accuracy": 0.654671644171079, + "num_tokens": 428723631.0, + "step": 2551 + }, + { + "entropy": 1.741028368473053, + "epoch": 0.2803548378237346, + "grad_norm": 0.763145387172699, + "learning_rate": 1.9434059842571766e-05, + "loss": 1.5508, + "mean_token_accuracy": 0.635222981373469, + "num_tokens": 428984871.0, + "step": 2552 + }, + { + "entropy": 1.7181775569915771, + "epoch": 0.28046469473510754, + "grad_norm": 0.6733216047286987, + "learning_rate": 1.9433500893808064e-05, + "loss": 1.3059, + "mean_token_accuracy": 0.6693119158347448, + "num_tokens": 429135765.0, + "step": 2553 + }, + { + "entropy": 1.7046631177266438, + "epoch": 0.2805745516464805, + "grad_norm": 0.7447198629379272, + "learning_rate": 1.9432941678127863e-05, + "loss": 1.2777, + "mean_token_accuracy": 0.6620823442935944, + "num_tokens": 429255761.0, + "step": 2554 + }, + { + "entropy": 1.7015548547108967, + "epoch": 0.28068440855785337, + "grad_norm": 0.7502123117446899, + "learning_rate": 1.943238219554885e-05, + "loss": 1.3332, + "mean_token_accuracy": 0.6637969613075256, + "num_tokens": 429404829.0, + "step": 2555 + }, + { + "entropy": 1.6994233131408691, + "epoch": 0.2807942654692263, + "grad_norm": 0.5920188426971436, + "learning_rate": 1.943182244608875e-05, + "loss": 1.4957, + "mean_token_accuracy": 0.6309523532787958, + "num_tokens": 429608954.0, + "step": 2556 + }, + { + "entropy": 1.6985510190327961, + "epoch": 0.28090412238059925, + "grad_norm": 0.6346762180328369, + "learning_rate": 1.943126242976526e-05, + "loss": 1.4234, + "mean_token_accuracy": 0.6668014178673426, + "num_tokens": 429760077.0, + "step": 2557 + }, + { + "entropy": 1.716838429371516, + "epoch": 0.2810139792919722, + "grad_norm": 0.6307840347290039, + "learning_rate": 1.943070214659612e-05, + "loss": 1.3829, + "mean_token_accuracy": 0.6482276519139608, + "num_tokens": 429933647.0, + "step": 2558 + }, + { + "entropy": 1.7555852731068928, + "epoch": 0.28112383620334513, + "grad_norm": 0.7939680218696594, + "learning_rate": 1.9430141596599055e-05, + "loss": 1.5112, + "mean_token_accuracy": 0.6550355777144432, + "num_tokens": 430065355.0, + "step": 2559 + }, + { + "entropy": 1.7047178248564403, + "epoch": 0.2812336931147181, + "grad_norm": 0.6013801097869873, + "learning_rate": 1.9429580779791806e-05, + "loss": 1.4673, + "mean_token_accuracy": 0.6566463013490041, + "num_tokens": 430241848.0, + "step": 2560 + }, + { + "entropy": 1.7068589230378468, + "epoch": 0.281343550026091, + "grad_norm": 0.6323118209838867, + "learning_rate": 1.9429019696192122e-05, + "loss": 1.5224, + "mean_token_accuracy": 0.6419420739014944, + "num_tokens": 430428484.0, + "step": 2561 + }, + { + "entropy": 1.6809816559155781, + "epoch": 0.28145340693746396, + "grad_norm": 0.754179060459137, + "learning_rate": 1.9428458345817762e-05, + "loss": 1.4271, + "mean_token_accuracy": 0.6452597826719284, + "num_tokens": 430591922.0, + "step": 2562 + }, + { + "entropy": 1.652672717968623, + "epoch": 0.2815632638488369, + "grad_norm": 0.7418878674507141, + "learning_rate": 1.94278967286865e-05, + "loss": 1.1637, + "mean_token_accuracy": 0.6897034098704656, + "num_tokens": 430712080.0, + "step": 2563 + }, + { + "entropy": 1.6429332792758942, + "epoch": 0.28167312076020984, + "grad_norm": 0.6739898324012756, + "learning_rate": 1.94273348448161e-05, + "loss": 1.3419, + "mean_token_accuracy": 0.6748927334944407, + "num_tokens": 430883503.0, + "step": 2564 + }, + { + "entropy": 1.697869877020518, + "epoch": 0.2817829776715828, + "grad_norm": 0.6139808297157288, + "learning_rate": 1.9426772694224346e-05, + "loss": 1.37, + "mean_token_accuracy": 0.656757061680158, + "num_tokens": 431038787.0, + "step": 2565 + }, + { + "entropy": 1.8296188414096832, + "epoch": 0.2818928345829557, + "grad_norm": 0.6441859006881714, + "learning_rate": 1.9426210276929038e-05, + "loss": 1.6558, + "mean_token_accuracy": 0.6172501345475515, + "num_tokens": 431232258.0, + "step": 2566 + }, + { + "entropy": 1.729474276304245, + "epoch": 0.28200269149432866, + "grad_norm": 0.6634087562561035, + "learning_rate": 1.942564759294797e-05, + "loss": 1.5779, + "mean_token_accuracy": 0.6198930492003759, + "num_tokens": 431501611.0, + "step": 2567 + }, + { + "entropy": 1.693379670381546, + "epoch": 0.28211254840570155, + "grad_norm": 0.7475607395172119, + "learning_rate": 1.9425084642298956e-05, + "loss": 1.3763, + "mean_token_accuracy": 0.668298656741778, + "num_tokens": 431651634.0, + "step": 2568 + }, + { + "entropy": 1.7199491361776988, + "epoch": 0.2822224053170745, + "grad_norm": 0.6126656532287598, + "learning_rate": 1.9424521424999805e-05, + "loss": 1.3842, + "mean_token_accuracy": 0.6540129085381826, + "num_tokens": 431811528.0, + "step": 2569 + }, + { + "entropy": 1.711053987344106, + "epoch": 0.28233226222844743, + "grad_norm": 0.8134360909461975, + "learning_rate": 1.942395794106835e-05, + "loss": 1.2594, + "mean_token_accuracy": 0.6801566729942957, + "num_tokens": 431973926.0, + "step": 2570 + }, + { + "entropy": 1.715971678495407, + "epoch": 0.28244211913982037, + "grad_norm": 0.623103678226471, + "learning_rate": 1.942339419052242e-05, + "loss": 1.5081, + "mean_token_accuracy": 0.6435102721055349, + "num_tokens": 432176408.0, + "step": 2571 + }, + { + "entropy": 1.6517931123574574, + "epoch": 0.2825519760511933, + "grad_norm": 0.7378969192504883, + "learning_rate": 1.942283017337986e-05, + "loss": 1.3061, + "mean_token_accuracy": 0.6691179027160009, + "num_tokens": 432306283.0, + "step": 2572 + }, + { + "entropy": 1.671871801217397, + "epoch": 0.28266183296256625, + "grad_norm": 0.6152805685997009, + "learning_rate": 1.942226588965852e-05, + "loss": 1.2702, + "mean_token_accuracy": 0.6683402210474014, + "num_tokens": 432466149.0, + "step": 2573 + }, + { + "entropy": 1.7728142738342285, + "epoch": 0.2827716898739392, + "grad_norm": 0.6616373658180237, + "learning_rate": 1.9421701339376263e-05, + "loss": 1.407, + "mean_token_accuracy": 0.6552805304527283, + "num_tokens": 432602942.0, + "step": 2574 + }, + { + "entropy": 1.7376613914966583, + "epoch": 0.28288154678531213, + "grad_norm": 0.6483553647994995, + "learning_rate": 1.942113652255095e-05, + "loss": 1.432, + "mean_token_accuracy": 0.6605344464381536, + "num_tokens": 432771571.0, + "step": 2575 + }, + { + "entropy": 1.67216690381368, + "epoch": 0.2829914036966851, + "grad_norm": 0.6230313181877136, + "learning_rate": 1.9420571439200463e-05, + "loss": 1.4043, + "mean_token_accuracy": 0.6436713586250941, + "num_tokens": 432970686.0, + "step": 2576 + }, + { + "entropy": 1.6591029067834218, + "epoch": 0.283101260608058, + "grad_norm": 0.6028016209602356, + "learning_rate": 1.942000608934268e-05, + "loss": 1.3898, + "mean_token_accuracy": 0.6501336942116419, + "num_tokens": 433158122.0, + "step": 2577 + }, + { + "entropy": 1.756819248199463, + "epoch": 0.28321111751943095, + "grad_norm": 0.6819401979446411, + "learning_rate": 1.9419440472995502e-05, + "loss": 1.2936, + "mean_token_accuracy": 0.6685625910758972, + "num_tokens": 433336335.0, + "step": 2578 + }, + { + "entropy": 1.7187687456607819, + "epoch": 0.2833209744308039, + "grad_norm": 0.8300583362579346, + "learning_rate": 1.9418874590176827e-05, + "loss": 1.5282, + "mean_token_accuracy": 0.6508554766575495, + "num_tokens": 433457129.0, + "step": 2579 + }, + { + "entropy": 1.6912387907505035, + "epoch": 0.28343083134217684, + "grad_norm": 0.8252399563789368, + "learning_rate": 1.9418308440904564e-05, + "loss": 1.4709, + "mean_token_accuracy": 0.6553937296072642, + "num_tokens": 433624991.0, + "step": 2580 + }, + { + "entropy": 1.7515579263369243, + "epoch": 0.2835406882535498, + "grad_norm": 0.7480166554450989, + "learning_rate": 1.9417742025196635e-05, + "loss": 1.5038, + "mean_token_accuracy": 0.6306808292865753, + "num_tokens": 433839600.0, + "step": 2581 + }, + { + "entropy": 1.724341442187627, + "epoch": 0.28365054516492266, + "grad_norm": 0.8125796914100647, + "learning_rate": 1.9417175343070962e-05, + "loss": 1.3742, + "mean_token_accuracy": 0.6564011871814728, + "num_tokens": 433995970.0, + "step": 2582 + }, + { + "entropy": 1.6599280138810475, + "epoch": 0.2837604020762956, + "grad_norm": 0.6576691269874573, + "learning_rate": 1.941660839454548e-05, + "loss": 1.365, + "mean_token_accuracy": 0.6600429564714432, + "num_tokens": 434197426.0, + "step": 2583 + }, + { + "entropy": 1.659876714150111, + "epoch": 0.28387025898766854, + "grad_norm": 0.6102942824363708, + "learning_rate": 1.9416041179638138e-05, + "loss": 1.3061, + "mean_token_accuracy": 0.67480997244517, + "num_tokens": 434399328.0, + "step": 2584 + }, + { + "entropy": 1.6985367238521576, + "epoch": 0.2839801158990415, + "grad_norm": 0.6151925921440125, + "learning_rate": 1.941547369836688e-05, + "loss": 1.2711, + "mean_token_accuracy": 0.6702224761247635, + "num_tokens": 434537957.0, + "step": 2585 + }, + { + "entropy": 1.673358827829361, + "epoch": 0.2840899728104144, + "grad_norm": 0.6189048886299133, + "learning_rate": 1.941490595074968e-05, + "loss": 1.3391, + "mean_token_accuracy": 0.6615988264481226, + "num_tokens": 434758718.0, + "step": 2586 + }, + { + "entropy": 1.7454047600428264, + "epoch": 0.28419982972178737, + "grad_norm": 0.6552925109863281, + "learning_rate": 1.941433793680449e-05, + "loss": 1.4267, + "mean_token_accuracy": 0.6499452342589697, + "num_tokens": 434917375.0, + "step": 2587 + }, + { + "entropy": 1.7055931588013966, + "epoch": 0.2843096866331603, + "grad_norm": 0.8626599311828613, + "learning_rate": 1.94137696565493e-05, + "loss": 1.4959, + "mean_token_accuracy": 0.6463074237108231, + "num_tokens": 435103637.0, + "step": 2588 + }, + { + "entropy": 1.7163316309452057, + "epoch": 0.28441954354453325, + "grad_norm": 0.6372457146644592, + "learning_rate": 1.9413201110002094e-05, + "loss": 1.5373, + "mean_token_accuracy": 0.6396234631538391, + "num_tokens": 435295478.0, + "step": 2589 + }, + { + "entropy": 1.7566253244876862, + "epoch": 0.2845294004559062, + "grad_norm": 0.6608404517173767, + "learning_rate": 1.941263229718086e-05, + "loss": 1.4931, + "mean_token_accuracy": 0.6418844411770502, + "num_tokens": 435479005.0, + "step": 2590 + }, + { + "entropy": 1.6568923095862071, + "epoch": 0.28463925736727913, + "grad_norm": 0.7189907431602478, + "learning_rate": 1.9412063218103607e-05, + "loss": 1.2423, + "mean_token_accuracy": 0.6729675034681956, + "num_tokens": 435628265.0, + "step": 2591 + }, + { + "entropy": 1.6527254382769268, + "epoch": 0.28474911427865207, + "grad_norm": 0.6984722018241882, + "learning_rate": 1.9411493872788342e-05, + "loss": 1.4279, + "mean_token_accuracy": 0.6531344701846441, + "num_tokens": 435898622.0, + "step": 2592 + }, + { + "entropy": 1.7152815461158752, + "epoch": 0.284858971190025, + "grad_norm": 0.7125999331474304, + "learning_rate": 1.941092426125309e-05, + "loss": 1.5202, + "mean_token_accuracy": 0.6567995101213455, + "num_tokens": 436075360.0, + "step": 2593 + }, + { + "entropy": 1.6898160974184673, + "epoch": 0.28496882810139795, + "grad_norm": 0.6797850728034973, + "learning_rate": 1.9410354383515872e-05, + "loss": 1.3609, + "mean_token_accuracy": 0.667990709344546, + "num_tokens": 436269293.0, + "step": 2594 + }, + { + "entropy": 1.6990408897399902, + "epoch": 0.28507868501277084, + "grad_norm": 0.6356927752494812, + "learning_rate": 1.9409784239594726e-05, + "loss": 1.4249, + "mean_token_accuracy": 0.659389058748881, + "num_tokens": 436420010.0, + "step": 2595 + }, + { + "entropy": 1.7738368213176727, + "epoch": 0.2851885419241438, + "grad_norm": 0.5506088137626648, + "learning_rate": 1.94092138295077e-05, + "loss": 1.5087, + "mean_token_accuracy": 0.621019164721171, + "num_tokens": 436673612.0, + "step": 2596 + }, + { + "entropy": 1.7091120680173237, + "epoch": 0.2852983988355167, + "grad_norm": 0.6980639100074768, + "learning_rate": 1.9408643153272845e-05, + "loss": 1.3243, + "mean_token_accuracy": 0.6670518765846888, + "num_tokens": 436835491.0, + "step": 2597 + }, + { + "entropy": 1.779990682999293, + "epoch": 0.28540825574688966, + "grad_norm": 0.6934612393379211, + "learning_rate": 1.9408072210908224e-05, + "loss": 1.4711, + "mean_token_accuracy": 0.6409854739904404, + "num_tokens": 437001428.0, + "step": 2598 + }, + { + "entropy": 1.6988802353541057, + "epoch": 0.2855181126582626, + "grad_norm": 0.7314718961715698, + "learning_rate": 1.9407501002431906e-05, + "loss": 1.5671, + "mean_token_accuracy": 0.6539310614267985, + "num_tokens": 437139054.0, + "step": 2599 + }, + { + "entropy": 1.7167824109395344, + "epoch": 0.28562796956963554, + "grad_norm": 0.8039875030517578, + "learning_rate": 1.940692952786197e-05, + "loss": 1.4672, + "mean_token_accuracy": 0.6653132339318594, + "num_tokens": 437270459.0, + "step": 2600 + }, + { + "entropy": 1.6449208458264668, + "epoch": 0.2857378264810085, + "grad_norm": 0.7214610576629639, + "learning_rate": 1.9406357787216504e-05, + "loss": 1.4112, + "mean_token_accuracy": 0.6606322924296061, + "num_tokens": 437421392.0, + "step": 2601 + }, + { + "entropy": 1.680547167857488, + "epoch": 0.2858476833923814, + "grad_norm": 0.7055097222328186, + "learning_rate": 1.94057857805136e-05, + "loss": 1.3738, + "mean_token_accuracy": 0.6694934616486231, + "num_tokens": 437561791.0, + "step": 2602 + }, + { + "entropy": 1.6792938709259033, + "epoch": 0.28595754030375437, + "grad_norm": 0.6724585890769958, + "learning_rate": 1.9405213507771363e-05, + "loss": 1.4334, + "mean_token_accuracy": 0.6348255177338918, + "num_tokens": 437791784.0, + "step": 2603 + }, + { + "entropy": 1.6977178752422333, + "epoch": 0.2860673972151273, + "grad_norm": 0.8410064578056335, + "learning_rate": 1.9404640969007907e-05, + "loss": 1.6249, + "mean_token_accuracy": 0.6277847041686376, + "num_tokens": 437997002.0, + "step": 2604 + }, + { + "entropy": 1.7250556250413258, + "epoch": 0.28617725412650025, + "grad_norm": 0.7040321230888367, + "learning_rate": 1.9404068164241354e-05, + "loss": 1.3341, + "mean_token_accuracy": 0.6668216039737066, + "num_tokens": 438129008.0, + "step": 2605 + }, + { + "entropy": 1.6219845215479534, + "epoch": 0.2862871110378732, + "grad_norm": 0.5418662428855896, + "learning_rate": 1.940349509348983e-05, + "loss": 1.3108, + "mean_token_accuracy": 0.6690922429164251, + "num_tokens": 438323789.0, + "step": 2606 + }, + { + "entropy": 1.6409521003564198, + "epoch": 0.28639696794924613, + "grad_norm": 0.6410753130912781, + "learning_rate": 1.9402921756771467e-05, + "loss": 1.3104, + "mean_token_accuracy": 0.6703230490287145, + "num_tokens": 438483486.0, + "step": 2607 + }, + { + "entropy": 1.7267462313175201, + "epoch": 0.28650682486061907, + "grad_norm": 0.8248845338821411, + "learning_rate": 1.940234815410442e-05, + "loss": 1.2461, + "mean_token_accuracy": 0.6791750093301138, + "num_tokens": 438633786.0, + "step": 2608 + }, + { + "entropy": 1.7135661741097767, + "epoch": 0.28661668177199195, + "grad_norm": 0.6371444463729858, + "learning_rate": 1.9401774285506844e-05, + "loss": 1.4584, + "mean_token_accuracy": 0.6644980758428574, + "num_tokens": 438804168.0, + "step": 2609 + }, + { + "entropy": 1.73434716463089, + "epoch": 0.2867265386833649, + "grad_norm": 0.7363563179969788, + "learning_rate": 1.9401200150996897e-05, + "loss": 1.4111, + "mean_token_accuracy": 0.6546731541554133, + "num_tokens": 439014763.0, + "step": 2610 + }, + { + "entropy": 1.6565505663553874, + "epoch": 0.28683639559473784, + "grad_norm": 0.9262635111808777, + "learning_rate": 1.940062575059275e-05, + "loss": 1.4191, + "mean_token_accuracy": 0.6618759582440058, + "num_tokens": 439230751.0, + "step": 2611 + }, + { + "entropy": 1.6508768200874329, + "epoch": 0.2869462525061108, + "grad_norm": 0.8240963220596313, + "learning_rate": 1.9400051084312582e-05, + "loss": 1.2587, + "mean_token_accuracy": 0.6758377949396769, + "num_tokens": 439369696.0, + "step": 2612 + }, + { + "entropy": 1.7037352323532104, + "epoch": 0.2870561094174837, + "grad_norm": 0.6460655927658081, + "learning_rate": 1.9399476152174582e-05, + "loss": 1.3585, + "mean_token_accuracy": 0.6592906763156255, + "num_tokens": 439513162.0, + "step": 2613 + }, + { + "entropy": 1.689726283152898, + "epoch": 0.28716596632885666, + "grad_norm": 0.621805727481842, + "learning_rate": 1.939890095419695e-05, + "loss": 1.5793, + "mean_token_accuracy": 0.6374916980663935, + "num_tokens": 439714986.0, + "step": 2614 + }, + { + "entropy": 1.7064528862635295, + "epoch": 0.2872758232402296, + "grad_norm": 0.6384214162826538, + "learning_rate": 1.9398325490397882e-05, + "loss": 1.3932, + "mean_token_accuracy": 0.6516889532407125, + "num_tokens": 439906645.0, + "step": 2615 + }, + { + "entropy": 1.80901434024175, + "epoch": 0.28738568015160254, + "grad_norm": 0.7344052195549011, + "learning_rate": 1.93977497607956e-05, + "loss": 1.3694, + "mean_token_accuracy": 0.6523802032073339, + "num_tokens": 440005633.0, + "step": 2616 + }, + { + "entropy": 1.7248846590518951, + "epoch": 0.2874955370629755, + "grad_norm": 0.7670570611953735, + "learning_rate": 1.939717376540832e-05, + "loss": 1.5491, + "mean_token_accuracy": 0.6424847940603892, + "num_tokens": 440210280.0, + "step": 2617 + }, + { + "entropy": 1.7063851058483124, + "epoch": 0.2876053939743484, + "grad_norm": 0.8078677654266357, + "learning_rate": 1.939659750425428e-05, + "loss": 1.4602, + "mean_token_accuracy": 0.6642357558012009, + "num_tokens": 440382539.0, + "step": 2618 + }, + { + "entropy": 1.7136432727177937, + "epoch": 0.28771525088572136, + "grad_norm": 0.6970177888870239, + "learning_rate": 1.9396020977351707e-05, + "loss": 1.456, + "mean_token_accuracy": 0.6549165745576223, + "num_tokens": 440542147.0, + "step": 2619 + }, + { + "entropy": 1.769452879826228, + "epoch": 0.2878251077970943, + "grad_norm": 0.6893923282623291, + "learning_rate": 1.9395444184718856e-05, + "loss": 1.361, + "mean_token_accuracy": 0.6599644472201666, + "num_tokens": 440687660.0, + "step": 2620 + }, + { + "entropy": 1.730559726556142, + "epoch": 0.28793496470846724, + "grad_norm": 0.6542354822158813, + "learning_rate": 1.9394867126373978e-05, + "loss": 1.473, + "mean_token_accuracy": 0.639530157049497, + "num_tokens": 440882125.0, + "step": 2621 + }, + { + "entropy": 1.7187020281950633, + "epoch": 0.2880448216198402, + "grad_norm": 0.6793704628944397, + "learning_rate": 1.939428980233534e-05, + "loss": 1.3705, + "mean_token_accuracy": 0.6617551843325297, + "num_tokens": 441024849.0, + "step": 2622 + }, + { + "entropy": 1.8135265906651814, + "epoch": 0.28815467853121307, + "grad_norm": 0.9341657757759094, + "learning_rate": 1.939371221262121e-05, + "loss": 1.4504, + "mean_token_accuracy": 0.6392282346884409, + "num_tokens": 441207195.0, + "step": 2623 + }, + { + "entropy": 1.705136905113856, + "epoch": 0.288264535442586, + "grad_norm": 0.623195469379425, + "learning_rate": 1.9393134357249873e-05, + "loss": 1.5089, + "mean_token_accuracy": 0.6382016837596893, + "num_tokens": 441406980.0, + "step": 2624 + }, + { + "entropy": 1.7127378980318706, + "epoch": 0.28837439235395895, + "grad_norm": 0.8345064520835876, + "learning_rate": 1.939255623623961e-05, + "loss": 1.4068, + "mean_token_accuracy": 0.6660717676083246, + "num_tokens": 441559825.0, + "step": 2625 + }, + { + "entropy": 1.658822198708852, + "epoch": 0.2884842492653319, + "grad_norm": 0.7187853455543518, + "learning_rate": 1.939197784960873e-05, + "loss": 1.3836, + "mean_token_accuracy": 0.6508085081974665, + "num_tokens": 441766932.0, + "step": 2626 + }, + { + "entropy": 1.704751859108607, + "epoch": 0.28859410617670483, + "grad_norm": 0.6862353682518005, + "learning_rate": 1.9391399197375532e-05, + "loss": 1.3871, + "mean_token_accuracy": 0.6600909431775411, + "num_tokens": 441913413.0, + "step": 2627 + }, + { + "entropy": 1.6826223929723103, + "epoch": 0.2887039630880778, + "grad_norm": 0.6673212647438049, + "learning_rate": 1.939082027955833e-05, + "loss": 1.3096, + "mean_token_accuracy": 0.6611541360616684, + "num_tokens": 442074956.0, + "step": 2628 + }, + { + "entropy": 1.7774581213792164, + "epoch": 0.2888138199994507, + "grad_norm": 0.7454673051834106, + "learning_rate": 1.9390241096175446e-05, + "loss": 1.3841, + "mean_token_accuracy": 0.6583873877922694, + "num_tokens": 442194901.0, + "step": 2629 + }, + { + "entropy": 1.7498468160629272, + "epoch": 0.28892367691082366, + "grad_norm": 0.6279725432395935, + "learning_rate": 1.9389661647245216e-05, + "loss": 1.4632, + "mean_token_accuracy": 0.6340119491020838, + "num_tokens": 442375203.0, + "step": 2630 + }, + { + "entropy": 1.6489406327406566, + "epoch": 0.2890335338221966, + "grad_norm": 2.050079345703125, + "learning_rate": 1.9389081932785972e-05, + "loss": 1.1596, + "mean_token_accuracy": 0.6808596501747767, + "num_tokens": 442579070.0, + "step": 2631 + }, + { + "entropy": 1.6880771319071453, + "epoch": 0.28914339073356954, + "grad_norm": 0.7001965045928955, + "learning_rate": 1.9388501952816065e-05, + "loss": 1.4539, + "mean_token_accuracy": 0.6618387450774511, + "num_tokens": 442764662.0, + "step": 2632 + }, + { + "entropy": 1.7235000828901927, + "epoch": 0.2892532476449425, + "grad_norm": 0.7411707639694214, + "learning_rate": 1.9387921707353852e-05, + "loss": 1.3913, + "mean_token_accuracy": 0.661471888422966, + "num_tokens": 442937159.0, + "step": 2633 + }, + { + "entropy": 1.7213360567887623, + "epoch": 0.2893631045563154, + "grad_norm": 0.7139886021614075, + "learning_rate": 1.9387341196417693e-05, + "loss": 1.369, + "mean_token_accuracy": 0.6659169793128967, + "num_tokens": 443095204.0, + "step": 2634 + }, + { + "entropy": 1.7081999878088634, + "epoch": 0.28947296146768836, + "grad_norm": 0.6855827569961548, + "learning_rate": 1.938676042002597e-05, + "loss": 1.4566, + "mean_token_accuracy": 0.6534863263368607, + "num_tokens": 443244399.0, + "step": 2635 + }, + { + "entropy": 1.732550968726476, + "epoch": 0.28958281837906125, + "grad_norm": 0.7183929681777954, + "learning_rate": 1.9386179378197057e-05, + "loss": 1.4231, + "mean_token_accuracy": 0.6524986227353414, + "num_tokens": 443443407.0, + "step": 2636 + }, + { + "entropy": 1.7156991362571716, + "epoch": 0.2896926752904342, + "grad_norm": 0.8171935677528381, + "learning_rate": 1.9385598070949344e-05, + "loss": 1.2888, + "mean_token_accuracy": 0.6588062097628912, + "num_tokens": 443554465.0, + "step": 2637 + }, + { + "entropy": 1.7135216891765594, + "epoch": 0.28980253220180713, + "grad_norm": 0.6728255748748779, + "learning_rate": 1.938501649830123e-05, + "loss": 1.5447, + "mean_token_accuracy": 0.6269475817680359, + "num_tokens": 443758902.0, + "step": 2638 + }, + { + "entropy": 1.6548403700192769, + "epoch": 0.28991238911318007, + "grad_norm": 0.5925636887550354, + "learning_rate": 1.9384434660271127e-05, + "loss": 1.4282, + "mean_token_accuracy": 0.6455462525288264, + "num_tokens": 443975978.0, + "step": 2639 + }, + { + "entropy": 1.6577099462350209, + "epoch": 0.290022246024553, + "grad_norm": 0.6581486463546753, + "learning_rate": 1.9383852556877442e-05, + "loss": 1.4118, + "mean_token_accuracy": 0.6570763885974884, + "num_tokens": 444185925.0, + "step": 2640 + }, + { + "entropy": 1.6643743515014648, + "epoch": 0.29013210293592595, + "grad_norm": 0.581012487411499, + "learning_rate": 1.93832701881386e-05, + "loss": 1.2362, + "mean_token_accuracy": 0.6839745144049326, + "num_tokens": 444347755.0, + "step": 2641 + }, + { + "entropy": 1.6794247229894002, + "epoch": 0.2902419598472989, + "grad_norm": 0.6780709028244019, + "learning_rate": 1.9382687554073037e-05, + "loss": 1.494, + "mean_token_accuracy": 0.6450832734505335, + "num_tokens": 444602974.0, + "step": 2642 + }, + { + "entropy": 1.732841948668162, + "epoch": 0.29035181675867183, + "grad_norm": 0.6316855549812317, + "learning_rate": 1.9382104654699188e-05, + "loss": 1.4737, + "mean_token_accuracy": 0.6366277585426966, + "num_tokens": 444861153.0, + "step": 2643 + }, + { + "entropy": 1.7282691299915314, + "epoch": 0.2904616736700448, + "grad_norm": 0.6324109435081482, + "learning_rate": 1.9381521490035507e-05, + "loss": 1.4287, + "mean_token_accuracy": 0.64259501794974, + "num_tokens": 445064095.0, + "step": 2644 + }, + { + "entropy": 1.7289496064186096, + "epoch": 0.2905715305814177, + "grad_norm": 0.6488730311393738, + "learning_rate": 1.9380938060100444e-05, + "loss": 1.4234, + "mean_token_accuracy": 0.6523800839980444, + "num_tokens": 445277916.0, + "step": 2645 + }, + { + "entropy": 1.7048724194367726, + "epoch": 0.29068138749279065, + "grad_norm": 0.8301265835762024, + "learning_rate": 1.938035436491247e-05, + "loss": 1.4233, + "mean_token_accuracy": 0.6592496484518051, + "num_tokens": 445421134.0, + "step": 2646 + }, + { + "entropy": 1.6666546662648518, + "epoch": 0.2907912444041636, + "grad_norm": 0.6903906464576721, + "learning_rate": 1.9379770404490055e-05, + "loss": 1.434, + "mean_token_accuracy": 0.6567947318156561, + "num_tokens": 445598132.0, + "step": 2647 + }, + { + "entropy": 1.6847756405671437, + "epoch": 0.29090110131553654, + "grad_norm": 0.7805249691009521, + "learning_rate": 1.9379186178851682e-05, + "loss": 1.56, + "mean_token_accuracy": 0.6398202478885651, + "num_tokens": 445778211.0, + "step": 2648 + }, + { + "entropy": 1.7226019004980724, + "epoch": 0.2910109582269095, + "grad_norm": 0.7891348004341125, + "learning_rate": 1.9378601688015844e-05, + "loss": 1.3392, + "mean_token_accuracy": 0.6544974197944006, + "num_tokens": 445908005.0, + "step": 2649 + }, + { + "entropy": 1.7356652915477753, + "epoch": 0.29112081513828236, + "grad_norm": 0.6843798160552979, + "learning_rate": 1.9378016932001038e-05, + "loss": 1.4653, + "mean_token_accuracy": 0.6467985957860947, + "num_tokens": 446039417.0, + "step": 2650 + }, + { + "entropy": 1.6962276101112366, + "epoch": 0.2912306720496553, + "grad_norm": 0.8159408569335938, + "learning_rate": 1.937743191082577e-05, + "loss": 1.2746, + "mean_token_accuracy": 0.6776145696640015, + "num_tokens": 446183804.0, + "step": 2651 + }, + { + "entropy": 1.6974614063898723, + "epoch": 0.29134052896102824, + "grad_norm": 0.6378005743026733, + "learning_rate": 1.937684662450856e-05, + "loss": 1.4558, + "mean_token_accuracy": 0.6375326613585154, + "num_tokens": 446395323.0, + "step": 2652 + }, + { + "entropy": 1.708219935496648, + "epoch": 0.2914503858724012, + "grad_norm": 0.6746168732643127, + "learning_rate": 1.9376261073067924e-05, + "loss": 1.4131, + "mean_token_accuracy": 0.6394032041231791, + "num_tokens": 446585024.0, + "step": 2653 + }, + { + "entropy": 1.7100668549537659, + "epoch": 0.2915602427837741, + "grad_norm": 0.7768794298171997, + "learning_rate": 1.9375675256522407e-05, + "loss": 1.4283, + "mean_token_accuracy": 0.6499427556991577, + "num_tokens": 446745055.0, + "step": 2654 + }, + { + "entropy": 1.6924604872862499, + "epoch": 0.29167009969514707, + "grad_norm": 0.8306671977043152, + "learning_rate": 1.9375089174890535e-05, + "loss": 1.3603, + "mean_token_accuracy": 0.6594620595375696, + "num_tokens": 446914924.0, + "step": 2655 + }, + { + "entropy": 1.7755940457185109, + "epoch": 0.29177995660652, + "grad_norm": 0.7717196941375732, + "learning_rate": 1.937450282819087e-05, + "loss": 1.485, + "mean_token_accuracy": 0.6404218624035517, + "num_tokens": 447059653.0, + "step": 2656 + }, + { + "entropy": 1.7116881906986237, + "epoch": 0.29188981351789295, + "grad_norm": 0.7639763951301575, + "learning_rate": 1.937391621644196e-05, + "loss": 1.3773, + "mean_token_accuracy": 0.653444285194079, + "num_tokens": 447200194.0, + "step": 2657 + }, + { + "entropy": 1.6779767175515492, + "epoch": 0.2919996704292659, + "grad_norm": 0.6334971189498901, + "learning_rate": 1.9373329339662376e-05, + "loss": 1.4107, + "mean_token_accuracy": 0.6518335590759913, + "num_tokens": 447396983.0, + "step": 2658 + }, + { + "entropy": 1.682463566462199, + "epoch": 0.29210952734063883, + "grad_norm": 0.673891007900238, + "learning_rate": 1.9372742197870694e-05, + "loss": 1.3052, + "mean_token_accuracy": 0.6681603988011678, + "num_tokens": 447551504.0, + "step": 2659 + }, + { + "entropy": 1.6600935558478038, + "epoch": 0.29221938425201177, + "grad_norm": 0.6958986520767212, + "learning_rate": 1.9372154791085494e-05, + "loss": 1.3451, + "mean_token_accuracy": 0.6691215733687083, + "num_tokens": 447691507.0, + "step": 2660 + }, + { + "entropy": 1.6572713057200115, + "epoch": 0.2923292411633847, + "grad_norm": 0.6040387153625488, + "learning_rate": 1.9371567119325366e-05, + "loss": 1.4619, + "mean_token_accuracy": 0.641932855049769, + "num_tokens": 447887087.0, + "step": 2661 + }, + { + "entropy": 1.6765493253866832, + "epoch": 0.29243909807475765, + "grad_norm": 0.6503280401229858, + "learning_rate": 1.937097918260891e-05, + "loss": 1.3431, + "mean_token_accuracy": 0.6661610007286072, + "num_tokens": 448043292.0, + "step": 2662 + }, + { + "entropy": 1.681261380513509, + "epoch": 0.29254895498613054, + "grad_norm": 0.7642120718955994, + "learning_rate": 1.9370390980954734e-05, + "loss": 1.4735, + "mean_token_accuracy": 0.653852661450704, + "num_tokens": 448207574.0, + "step": 2663 + }, + { + "entropy": 1.6833062370618184, + "epoch": 0.2926588118975035, + "grad_norm": 0.670602023601532, + "learning_rate": 1.936980251438146e-05, + "loss": 1.3351, + "mean_token_accuracy": 0.6685332159201304, + "num_tokens": 448357135.0, + "step": 2664 + }, + { + "entropy": 1.7469529509544373, + "epoch": 0.2927686688088764, + "grad_norm": 0.7529553771018982, + "learning_rate": 1.9369213782907704e-05, + "loss": 1.5406, + "mean_token_accuracy": 0.6347383807102839, + "num_tokens": 448561405.0, + "step": 2665 + }, + { + "entropy": 1.7639080087343852, + "epoch": 0.29287852572024936, + "grad_norm": 0.7545667290687561, + "learning_rate": 1.9368624786552103e-05, + "loss": 1.5237, + "mean_token_accuracy": 0.6413746724526087, + "num_tokens": 448745327.0, + "step": 2666 + }, + { + "entropy": 1.694331020116806, + "epoch": 0.2929883826316223, + "grad_norm": 0.6679127216339111, + "learning_rate": 1.93680355253333e-05, + "loss": 1.3199, + "mean_token_accuracy": 0.6663316388924917, + "num_tokens": 448887664.0, + "step": 2667 + }, + { + "entropy": 1.7083205878734589, + "epoch": 0.29309823954299524, + "grad_norm": 0.6043710708618164, + "learning_rate": 1.9367445999269942e-05, + "loss": 1.3662, + "mean_token_accuracy": 0.6552055925130844, + "num_tokens": 449053784.0, + "step": 2668 + }, + { + "entropy": 1.7981708248456318, + "epoch": 0.2932080964543682, + "grad_norm": 0.5984474420547485, + "learning_rate": 1.9366856208380692e-05, + "loss": 1.3922, + "mean_token_accuracy": 0.6492073982954025, + "num_tokens": 449226912.0, + "step": 2669 + }, + { + "entropy": 1.7615818778673809, + "epoch": 0.2933179533657411, + "grad_norm": 0.7663693428039551, + "learning_rate": 1.936626615268421e-05, + "loss": 1.2619, + "mean_token_accuracy": 0.6588031202554703, + "num_tokens": 449331942.0, + "step": 2670 + }, + { + "entropy": 1.7315702736377716, + "epoch": 0.29342781027711407, + "grad_norm": 0.7021830677986145, + "learning_rate": 1.9365675832199173e-05, + "loss": 1.3047, + "mean_token_accuracy": 0.6822420656681061, + "num_tokens": 449455287.0, + "step": 2671 + }, + { + "entropy": 1.6805367469787598, + "epoch": 0.293537667188487, + "grad_norm": 0.5745053887367249, + "learning_rate": 1.936508524694427e-05, + "loss": 1.3666, + "mean_token_accuracy": 0.6640961915254593, + "num_tokens": 449670325.0, + "step": 2672 + }, + { + "entropy": 1.6514671444892883, + "epoch": 0.29364752409985995, + "grad_norm": 0.727182924747467, + "learning_rate": 1.9364494396938183e-05, + "loss": 1.2007, + "mean_token_accuracy": 0.6855193028847376, + "num_tokens": 449786535.0, + "step": 2673 + }, + { + "entropy": 1.7635166545708973, + "epoch": 0.2937573810112329, + "grad_norm": 0.7026004791259766, + "learning_rate": 1.9363903282199622e-05, + "loss": 1.5577, + "mean_token_accuracy": 0.6341749678055445, + "num_tokens": 449959641.0, + "step": 2674 + }, + { + "entropy": 1.6938972075780232, + "epoch": 0.29386723792260583, + "grad_norm": 0.7433229088783264, + "learning_rate": 1.936331190274729e-05, + "loss": 1.4235, + "mean_token_accuracy": 0.6720124930143356, + "num_tokens": 450131900.0, + "step": 2675 + }, + { + "entropy": 1.683308909336726, + "epoch": 0.29397709483397877, + "grad_norm": 0.7515206933021545, + "learning_rate": 1.9362720258599906e-05, + "loss": 1.433, + "mean_token_accuracy": 0.6503423452377319, + "num_tokens": 450296861.0, + "step": 2676 + }, + { + "entropy": 1.6832621296246846, + "epoch": 0.29408695174535165, + "grad_norm": 0.7027547955513, + "learning_rate": 1.936212834977619e-05, + "loss": 1.2631, + "mean_token_accuracy": 0.6792778372764587, + "num_tokens": 450446299.0, + "step": 2677 + }, + { + "entropy": 1.7150346239407857, + "epoch": 0.2941968086567246, + "grad_norm": 0.6275519132614136, + "learning_rate": 1.9361536176294884e-05, + "loss": 1.4966, + "mean_token_accuracy": 0.6483021924893061, + "num_tokens": 450598627.0, + "step": 2678 + }, + { + "entropy": 1.7051092684268951, + "epoch": 0.29430666556809754, + "grad_norm": 0.7069133520126343, + "learning_rate": 1.9360943738174723e-05, + "loss": 1.4622, + "mean_token_accuracy": 0.6559437364339828, + "num_tokens": 450730927.0, + "step": 2679 + }, + { + "entropy": 1.684261292219162, + "epoch": 0.2944165224794705, + "grad_norm": 1.7670702934265137, + "learning_rate": 1.9360351035434462e-05, + "loss": 1.3459, + "mean_token_accuracy": 0.6596929530302683, + "num_tokens": 450949743.0, + "step": 2680 + }, + { + "entropy": 1.711377779642741, + "epoch": 0.2945263793908434, + "grad_norm": 0.6422090530395508, + "learning_rate": 1.9359758068092856e-05, + "loss": 1.4483, + "mean_token_accuracy": 0.6548082033793131, + "num_tokens": 451108520.0, + "step": 2681 + }, + { + "entropy": 1.6563451290130615, + "epoch": 0.29463623630221636, + "grad_norm": 0.7376955151557922, + "learning_rate": 1.9359164836168673e-05, + "loss": 1.1897, + "mean_token_accuracy": 0.6866026818752289, + "num_tokens": 451255541.0, + "step": 2682 + }, + { + "entropy": 1.7390979429086049, + "epoch": 0.2947460932135893, + "grad_norm": 0.78875732421875, + "learning_rate": 1.9358571339680695e-05, + "loss": 1.3601, + "mean_token_accuracy": 0.6453971515099207, + "num_tokens": 451456051.0, + "step": 2683 + }, + { + "entropy": 1.7441952129205067, + "epoch": 0.29485595012496224, + "grad_norm": 0.7171877026557922, + "learning_rate": 1.93579775786477e-05, + "loss": 1.4548, + "mean_token_accuracy": 0.6368372937043508, + "num_tokens": 451669627.0, + "step": 2684 + }, + { + "entropy": 1.75757697224617, + "epoch": 0.2949658070363352, + "grad_norm": 0.7782573103904724, + "learning_rate": 1.9357383553088475e-05, + "loss": 1.4544, + "mean_token_accuracy": 0.645165205001831, + "num_tokens": 451796991.0, + "step": 2685 + }, + { + "entropy": 1.6625976363817851, + "epoch": 0.2950756639477081, + "grad_norm": 0.8164569735527039, + "learning_rate": 1.935678926302183e-05, + "loss": 1.3995, + "mean_token_accuracy": 0.6670281787713369, + "num_tokens": 451954998.0, + "step": 2686 + }, + { + "entropy": 1.7122354706128438, + "epoch": 0.29518552085908106, + "grad_norm": 0.6264376044273376, + "learning_rate": 1.935619470846657e-05, + "loss": 1.3778, + "mean_token_accuracy": 0.6613652606805166, + "num_tokens": 452135263.0, + "step": 2687 + }, + { + "entropy": 1.7165345946947734, + "epoch": 0.295295377770454, + "grad_norm": 0.6703044176101685, + "learning_rate": 1.9355599889441514e-05, + "loss": 1.3914, + "mean_token_accuracy": 0.6461621175209681, + "num_tokens": 452312831.0, + "step": 2688 + }, + { + "entropy": 1.6509647866090138, + "epoch": 0.29540523468182694, + "grad_norm": 0.6027291417121887, + "learning_rate": 1.9355004805965488e-05, + "loss": 1.4686, + "mean_token_accuracy": 0.6490070174137751, + "num_tokens": 452490265.0, + "step": 2689 + }, + { + "entropy": 1.7152677079041798, + "epoch": 0.29551509159319983, + "grad_norm": 0.6414744257926941, + "learning_rate": 1.935440945805732e-05, + "loss": 1.4622, + "mean_token_accuracy": 0.6658103515704473, + "num_tokens": 452695410.0, + "step": 2690 + }, + { + "entropy": 1.7520911594231923, + "epoch": 0.29562494850457277, + "grad_norm": 0.701274037361145, + "learning_rate": 1.935381384573586e-05, + "loss": 1.3646, + "mean_token_accuracy": 0.6607696761687597, + "num_tokens": 452872274.0, + "step": 2691 + }, + { + "entropy": 1.646621435880661, + "epoch": 0.2957348054159457, + "grad_norm": 0.6311284303665161, + "learning_rate": 1.9353217969019955e-05, + "loss": 1.3512, + "mean_token_accuracy": 0.6649795571962992, + "num_tokens": 453021211.0, + "step": 2692 + }, + { + "entropy": 1.7599443395932515, + "epoch": 0.29584466232731865, + "grad_norm": 0.6941083073616028, + "learning_rate": 1.9352621827928467e-05, + "loss": 1.3329, + "mean_token_accuracy": 0.6627233326435089, + "num_tokens": 453146565.0, + "step": 2693 + }, + { + "entropy": 1.7353703478972118, + "epoch": 0.2959545192386916, + "grad_norm": 0.659747838973999, + "learning_rate": 1.9352025422480263e-05, + "loss": 1.4226, + "mean_token_accuracy": 0.6492767333984375, + "num_tokens": 453362565.0, + "step": 2694 + }, + { + "entropy": 1.7668009400367737, + "epoch": 0.29606437615006453, + "grad_norm": 0.6181442737579346, + "learning_rate": 1.9351428752694215e-05, + "loss": 1.4072, + "mean_token_accuracy": 0.6527328540881475, + "num_tokens": 453547775.0, + "step": 2695 + }, + { + "entropy": 1.7386986712614696, + "epoch": 0.2961742330614375, + "grad_norm": 0.7841033339500427, + "learning_rate": 1.9350831818589207e-05, + "loss": 1.4427, + "mean_token_accuracy": 0.6478182226419449, + "num_tokens": 453707336.0, + "step": 2696 + }, + { + "entropy": 1.7775764167308807, + "epoch": 0.2962840899728104, + "grad_norm": 0.7095794081687927, + "learning_rate": 1.935023462018414e-05, + "loss": 1.3005, + "mean_token_accuracy": 0.6672998617092768, + "num_tokens": 453824420.0, + "step": 2697 + }, + { + "entropy": 1.6585584084192913, + "epoch": 0.29639394688418336, + "grad_norm": 0.6653675436973572, + "learning_rate": 1.9349637157497912e-05, + "loss": 1.4212, + "mean_token_accuracy": 0.6473642687002817, + "num_tokens": 454019843.0, + "step": 2698 + }, + { + "entropy": 1.6676256358623505, + "epoch": 0.2965038037955563, + "grad_norm": 0.5587428212165833, + "learning_rate": 1.934903943054943e-05, + "loss": 1.3297, + "mean_token_accuracy": 0.6603167007366816, + "num_tokens": 454209292.0, + "step": 2699 + }, + { + "entropy": 1.7065544823805492, + "epoch": 0.29661366070692924, + "grad_norm": 0.6772942543029785, + "learning_rate": 1.9348441439357607e-05, + "loss": 1.3898, + "mean_token_accuracy": 0.6574710955222448, + "num_tokens": 454350359.0, + "step": 2700 + }, + { + "entropy": 1.7034912804762523, + "epoch": 0.2967235176183022, + "grad_norm": 0.8810101747512817, + "learning_rate": 1.9347843183941376e-05, + "loss": 1.2346, + "mean_token_accuracy": 0.6790671199560165, + "num_tokens": 454487540.0, + "step": 2701 + }, + { + "entropy": 1.791587918996811, + "epoch": 0.2968333745296751, + "grad_norm": 0.707517147064209, + "learning_rate": 1.9347244664319674e-05, + "loss": 1.4353, + "mean_token_accuracy": 0.637357547879219, + "num_tokens": 454670144.0, + "step": 2702 + }, + { + "entropy": 1.65616970260938, + "epoch": 0.29694323144104806, + "grad_norm": 0.6979170441627502, + "learning_rate": 1.9346645880511435e-05, + "loss": 1.3765, + "mean_token_accuracy": 0.6787627389033636, + "num_tokens": 454805453.0, + "step": 2703 + }, + { + "entropy": 1.7858235935370128, + "epoch": 0.29705308835242095, + "grad_norm": 0.7254014015197754, + "learning_rate": 1.9346046832535616e-05, + "loss": 1.3475, + "mean_token_accuracy": 0.6682693660259247, + "num_tokens": 454926309.0, + "step": 2704 + }, + { + "entropy": 1.7087645729382832, + "epoch": 0.2971629452637939, + "grad_norm": 0.6836649775505066, + "learning_rate": 1.9345447520411176e-05, + "loss": 1.4221, + "mean_token_accuracy": 0.6326878815889359, + "num_tokens": 455124796.0, + "step": 2705 + }, + { + "entropy": 1.6126576364040375, + "epoch": 0.29727280217516683, + "grad_norm": 0.6172579526901245, + "learning_rate": 1.9344847944157082e-05, + "loss": 1.4618, + "mean_token_accuracy": 0.6497344275315603, + "num_tokens": 455349822.0, + "step": 2706 + }, + { + "entropy": 1.6935412486394246, + "epoch": 0.29738265908653977, + "grad_norm": 0.7592183351516724, + "learning_rate": 1.9344248103792312e-05, + "loss": 1.5196, + "mean_token_accuracy": 0.6431877315044403, + "num_tokens": 455536405.0, + "step": 2707 + }, + { + "entropy": 1.8123325010140736, + "epoch": 0.2974925159979127, + "grad_norm": 0.6768981218338013, + "learning_rate": 1.9343647999335852e-05, + "loss": 1.5576, + "mean_token_accuracy": 0.6386434634526571, + "num_tokens": 455720583.0, + "step": 2708 + }, + { + "entropy": 1.7398807009061177, + "epoch": 0.29760237290928565, + "grad_norm": 0.5949780941009521, + "learning_rate": 1.9343047630806686e-05, + "loss": 1.4819, + "mean_token_accuracy": 0.6428021887938181, + "num_tokens": 455956692.0, + "step": 2709 + }, + { + "entropy": 1.7306031584739685, + "epoch": 0.2977122298206586, + "grad_norm": 0.7073639035224915, + "learning_rate": 1.9342446998223828e-05, + "loss": 1.4921, + "mean_token_accuracy": 0.6601720154285431, + "num_tokens": 456096324.0, + "step": 2710 + }, + { + "entropy": 1.6887525916099548, + "epoch": 0.29782208673203153, + "grad_norm": 0.7725453972816467, + "learning_rate": 1.934184610160628e-05, + "loss": 1.2753, + "mean_token_accuracy": 0.6711432288090388, + "num_tokens": 456218859.0, + "step": 2711 + }, + { + "entropy": 1.6401757498582203, + "epoch": 0.2979319436434045, + "grad_norm": 0.7324026823043823, + "learning_rate": 1.934124494097306e-05, + "loss": 1.3085, + "mean_token_accuracy": 0.6673613637685776, + "num_tokens": 456346809.0, + "step": 2712 + }, + { + "entropy": 1.734503875176112, + "epoch": 0.2980418005547774, + "grad_norm": 0.7018632888793945, + "learning_rate": 1.9340643516343197e-05, + "loss": 1.3471, + "mean_token_accuracy": 0.6849260876576105, + "num_tokens": 456517652.0, + "step": 2713 + }, + { + "entropy": 1.6619239548842113, + "epoch": 0.29815165746615035, + "grad_norm": 0.7496260404586792, + "learning_rate": 1.9340041827735724e-05, + "loss": 1.3782, + "mean_token_accuracy": 0.6606259942054749, + "num_tokens": 456674429.0, + "step": 2714 + }, + { + "entropy": 1.7683907647927601, + "epoch": 0.2982615143775233, + "grad_norm": 0.7211847305297852, + "learning_rate": 1.9339439875169688e-05, + "loss": 1.3487, + "mean_token_accuracy": 0.6543639997641245, + "num_tokens": 456820642.0, + "step": 2715 + }, + { + "entropy": 1.6940323412418365, + "epoch": 0.29837137128889624, + "grad_norm": 0.8029798865318298, + "learning_rate": 1.933883765866414e-05, + "loss": 1.284, + "mean_token_accuracy": 0.6674282451470693, + "num_tokens": 456965638.0, + "step": 2716 + }, + { + "entropy": 1.7033016582330067, + "epoch": 0.2984812282002691, + "grad_norm": 0.6672521829605103, + "learning_rate": 1.933823517823813e-05, + "loss": 1.416, + "mean_token_accuracy": 0.6518148928880692, + "num_tokens": 457158175.0, + "step": 2717 + }, + { + "entropy": 1.7121039628982544, + "epoch": 0.29859108511164206, + "grad_norm": 0.9051281213760376, + "learning_rate": 1.933763243391074e-05, + "loss": 1.4112, + "mean_token_accuracy": 0.666137158870697, + "num_tokens": 457327515.0, + "step": 2718 + }, + { + "entropy": 1.699680785338084, + "epoch": 0.298700942023015, + "grad_norm": 0.6100730299949646, + "learning_rate": 1.933702942570104e-05, + "loss": 1.2842, + "mean_token_accuracy": 0.6708463281393051, + "num_tokens": 457486212.0, + "step": 2719 + }, + { + "entropy": 1.7419918080170949, + "epoch": 0.29881079893438794, + "grad_norm": 0.711141049861908, + "learning_rate": 1.9336426153628112e-05, + "loss": 1.4956, + "mean_token_accuracy": 0.6444249103466669, + "num_tokens": 457693330.0, + "step": 2720 + }, + { + "entropy": 1.6464999218781788, + "epoch": 0.2989206558457609, + "grad_norm": 0.8733800649642944, + "learning_rate": 1.9335822617711054e-05, + "loss": 1.2148, + "mean_token_accuracy": 0.6803951313098272, + "num_tokens": 457821397.0, + "step": 2721 + }, + { + "entropy": 1.6942930221557617, + "epoch": 0.2990305127571338, + "grad_norm": 0.7114554643630981, + "learning_rate": 1.9335218817968967e-05, + "loss": 1.3726, + "mean_token_accuracy": 0.664305662115415, + "num_tokens": 457983544.0, + "step": 2722 + }, + { + "entropy": 1.6836686829725902, + "epoch": 0.29914036966850677, + "grad_norm": 0.708476722240448, + "learning_rate": 1.9334614754420958e-05, + "loss": 1.2791, + "mean_token_accuracy": 0.6689166078964869, + "num_tokens": 458134039.0, + "step": 2723 + }, + { + "entropy": 1.7399452825387318, + "epoch": 0.2992502265798797, + "grad_norm": 0.7190913558006287, + "learning_rate": 1.9334010427086154e-05, + "loss": 1.3825, + "mean_token_accuracy": 0.6503856033086777, + "num_tokens": 458282778.0, + "step": 2724 + }, + { + "entropy": 1.752762794494629, + "epoch": 0.29936008349125265, + "grad_norm": 0.6258200407028198, + "learning_rate": 1.933340583598367e-05, + "loss": 1.4285, + "mean_token_accuracy": 0.6458547860383987, + "num_tokens": 458475221.0, + "step": 2725 + }, + { + "entropy": 1.732886830965678, + "epoch": 0.2994699404026256, + "grad_norm": 0.6532291769981384, + "learning_rate": 1.9332800981132648e-05, + "loss": 1.3873, + "mean_token_accuracy": 0.6579280296961466, + "num_tokens": 458642845.0, + "step": 2726 + }, + { + "entropy": 1.7705637713273366, + "epoch": 0.29957979731399853, + "grad_norm": 0.7122387290000916, + "learning_rate": 1.933219586255223e-05, + "loss": 1.574, + "mean_token_accuracy": 0.6305630256732305, + "num_tokens": 458841123.0, + "step": 2727 + }, + { + "entropy": 1.736388514439265, + "epoch": 0.29968965422537147, + "grad_norm": 0.6985000967979431, + "learning_rate": 1.9331590480261568e-05, + "loss": 1.3021, + "mean_token_accuracy": 0.6691931088765463, + "num_tokens": 458985028.0, + "step": 2728 + }, + { + "entropy": 1.7540164987246196, + "epoch": 0.2997995111367444, + "grad_norm": 0.7186359763145447, + "learning_rate": 1.933098483427982e-05, + "loss": 1.5224, + "mean_token_accuracy": 0.6380279958248138, + "num_tokens": 459144300.0, + "step": 2729 + }, + { + "entropy": 1.7715636988480885, + "epoch": 0.29990936804811735, + "grad_norm": 0.8147019147872925, + "learning_rate": 1.9330378924626156e-05, + "loss": 1.4505, + "mean_token_accuracy": 0.6575749566157659, + "num_tokens": 459280365.0, + "step": 2730 + }, + { + "entropy": 1.6673146188259125, + "epoch": 0.30001922495949024, + "grad_norm": 0.6811074018478394, + "learning_rate": 1.9329772751319755e-05, + "loss": 1.4948, + "mean_token_accuracy": 0.6632718841234843, + "num_tokens": 459456372.0, + "step": 2731 + }, + { + "entropy": 1.7491925756136577, + "epoch": 0.3001290818708632, + "grad_norm": 0.7324425578117371, + "learning_rate": 1.93291663143798e-05, + "loss": 1.4991, + "mean_token_accuracy": 0.652273048957189, + "num_tokens": 459617699.0, + "step": 2732 + }, + { + "entropy": 1.6765054762363434, + "epoch": 0.3002389387822361, + "grad_norm": 0.7338621616363525, + "learning_rate": 1.9328559613825483e-05, + "loss": 1.2981, + "mean_token_accuracy": 0.6762462556362152, + "num_tokens": 459785217.0, + "step": 2733 + }, + { + "entropy": 1.648837725321452, + "epoch": 0.30034879569360906, + "grad_norm": 0.6431924104690552, + "learning_rate": 1.9327952649676006e-05, + "loss": 1.2079, + "mean_token_accuracy": 0.676081563035647, + "num_tokens": 459913674.0, + "step": 2734 + }, + { + "entropy": 1.6991098026434581, + "epoch": 0.300458652604982, + "grad_norm": 0.6374147534370422, + "learning_rate": 1.932734542195058e-05, + "loss": 1.4266, + "mean_token_accuracy": 0.6573264350493749, + "num_tokens": 460110173.0, + "step": 2735 + }, + { + "entropy": 1.6661823689937592, + "epoch": 0.30056850951635494, + "grad_norm": 3.0219857692718506, + "learning_rate": 1.9326737930668425e-05, + "loss": 1.3622, + "mean_token_accuracy": 0.6620204945405325, + "num_tokens": 460326181.0, + "step": 2736 + }, + { + "entropy": 1.769929716984431, + "epoch": 0.3006783664277279, + "grad_norm": 0.822551429271698, + "learning_rate": 1.932613017584877e-05, + "loss": 1.4545, + "mean_token_accuracy": 0.6456111868222555, + "num_tokens": 460473005.0, + "step": 2737 + }, + { + "entropy": 1.7742596765359242, + "epoch": 0.3007882233391008, + "grad_norm": 0.6372090578079224, + "learning_rate": 1.9325522157510842e-05, + "loss": 1.5462, + "mean_token_accuracy": 0.6321751674016317, + "num_tokens": 460711663.0, + "step": 2738 + }, + { + "entropy": 1.7386666536331177, + "epoch": 0.30089808025047377, + "grad_norm": 0.7106104493141174, + "learning_rate": 1.9324913875673893e-05, + "loss": 1.48, + "mean_token_accuracy": 0.6593476285537084, + "num_tokens": 460896697.0, + "step": 2739 + }, + { + "entropy": 1.6937275826931, + "epoch": 0.3010079371618467, + "grad_norm": 0.654400646686554, + "learning_rate": 1.932430533035717e-05, + "loss": 1.5314, + "mean_token_accuracy": 0.627138485511144, + "num_tokens": 461179979.0, + "step": 2740 + }, + { + "entropy": 1.6967305839061737, + "epoch": 0.30111779407321965, + "grad_norm": 0.7618654370307922, + "learning_rate": 1.9323696521579933e-05, + "loss": 1.3714, + "mean_token_accuracy": 0.6593584269285202, + "num_tokens": 461381167.0, + "step": 2741 + }, + { + "entropy": 1.7463171482086182, + "epoch": 0.3012276509845926, + "grad_norm": 0.7259137034416199, + "learning_rate": 1.932308744936145e-05, + "loss": 1.3643, + "mean_token_accuracy": 0.6586255977551142, + "num_tokens": 461596991.0, + "step": 2742 + }, + { + "entropy": 1.7076434095700581, + "epoch": 0.30133750789596553, + "grad_norm": 0.6767429709434509, + "learning_rate": 1.9322478113721e-05, + "loss": 1.3569, + "mean_token_accuracy": 0.670208474000295, + "num_tokens": 461746827.0, + "step": 2743 + }, + { + "entropy": 1.6913744111855824, + "epoch": 0.30144736480733847, + "grad_norm": 0.7813226580619812, + "learning_rate": 1.9321868514677874e-05, + "loss": 1.3386, + "mean_token_accuracy": 0.6839630007743835, + "num_tokens": 461894088.0, + "step": 2744 + }, + { + "entropy": 1.7275190949440002, + "epoch": 0.30155722171871135, + "grad_norm": 0.7166203856468201, + "learning_rate": 1.9321258652251354e-05, + "loss": 1.2261, + "mean_token_accuracy": 0.6746133218208948, + "num_tokens": 462015371.0, + "step": 2745 + }, + { + "entropy": 1.7810499270757039, + "epoch": 0.3016670786300843, + "grad_norm": 0.6879790425300598, + "learning_rate": 1.932064852646075e-05, + "loss": 1.4334, + "mean_token_accuracy": 0.6484188586473465, + "num_tokens": 462235970.0, + "step": 2746 + }, + { + "entropy": 1.7113142510255177, + "epoch": 0.30177693554145724, + "grad_norm": 0.9017158150672913, + "learning_rate": 1.9320038137325364e-05, + "loss": 1.4883, + "mean_token_accuracy": 0.6399280428886414, + "num_tokens": 462409137.0, + "step": 2747 + }, + { + "entropy": 1.7389302551746368, + "epoch": 0.3018867924528302, + "grad_norm": 0.7917147874832153, + "learning_rate": 1.9319427484864526e-05, + "loss": 1.3114, + "mean_token_accuracy": 0.6665030618508657, + "num_tokens": 462590216.0, + "step": 2748 + }, + { + "entropy": 1.6455481847127278, + "epoch": 0.3019966493642031, + "grad_norm": 0.7206093072891235, + "learning_rate": 1.9318816569097557e-05, + "loss": 1.3726, + "mean_token_accuracy": 0.6634992212057114, + "num_tokens": 462764694.0, + "step": 2749 + }, + { + "entropy": 1.6946881413459778, + "epoch": 0.30210650627557606, + "grad_norm": 0.6624197959899902, + "learning_rate": 1.9318205390043786e-05, + "loss": 1.3401, + "mean_token_accuracy": 0.6626093486944834, + "num_tokens": 462918633.0, + "step": 2750 + }, + { + "entropy": 1.754339079062144, + "epoch": 0.302216363186949, + "grad_norm": 0.6414199471473694, + "learning_rate": 1.931759394772257e-05, + "loss": 1.3364, + "mean_token_accuracy": 0.6651070167620977, + "num_tokens": 463065938.0, + "step": 2751 + }, + { + "entropy": 1.7131946782271068, + "epoch": 0.30232622009832194, + "grad_norm": 0.7476382255554199, + "learning_rate": 1.931698224215325e-05, + "loss": 1.3628, + "mean_token_accuracy": 0.6677864193916321, + "num_tokens": 463202521.0, + "step": 2752 + }, + { + "entropy": 1.6731769144535065, + "epoch": 0.3024360770096949, + "grad_norm": 1.3017513751983643, + "learning_rate": 1.931637027335519e-05, + "loss": 1.1589, + "mean_token_accuracy": 0.6780026058355967, + "num_tokens": 463403149.0, + "step": 2753 + }, + { + "entropy": 1.6853074034055073, + "epoch": 0.3025459339210678, + "grad_norm": 0.5546422600746155, + "learning_rate": 1.9315758041347758e-05, + "loss": 1.3004, + "mean_token_accuracy": 0.6725151340166727, + "num_tokens": 463590458.0, + "step": 2754 + }, + { + "entropy": 1.6593196491400402, + "epoch": 0.30265579083244076, + "grad_norm": 0.699647843837738, + "learning_rate": 1.931514554615033e-05, + "loss": 1.31, + "mean_token_accuracy": 0.6586611072222391, + "num_tokens": 463747655.0, + "step": 2755 + }, + { + "entropy": 1.6894031167030334, + "epoch": 0.3027656477438137, + "grad_norm": 0.6589730978012085, + "learning_rate": 1.9314532787782295e-05, + "loss": 1.3531, + "mean_token_accuracy": 0.6681742072105408, + "num_tokens": 463913959.0, + "step": 2756 + }, + { + "entropy": 1.7593967119852703, + "epoch": 0.30287550465518664, + "grad_norm": 0.6436064839363098, + "learning_rate": 1.9313919766263043e-05, + "loss": 1.3402, + "mean_token_accuracy": 0.6659826586643854, + "num_tokens": 464060668.0, + "step": 2757 + }, + { + "entropy": 1.746421605348587, + "epoch": 0.30298536156655953, + "grad_norm": 0.7272017598152161, + "learning_rate": 1.9313306481611977e-05, + "loss": 1.3655, + "mean_token_accuracy": 0.661940743525823, + "num_tokens": 464204000.0, + "step": 2758 + }, + { + "entropy": 1.710692157347997, + "epoch": 0.30309521847793247, + "grad_norm": 0.6198656558990479, + "learning_rate": 1.9312692933848505e-05, + "loss": 1.3947, + "mean_token_accuracy": 0.6594121058781942, + "num_tokens": 464384758.0, + "step": 2759 + }, + { + "entropy": 1.7099157869815826, + "epoch": 0.3032050753893054, + "grad_norm": 0.5952613353729248, + "learning_rate": 1.931207912299205e-05, + "loss": 1.44, + "mean_token_accuracy": 0.6447249750296274, + "num_tokens": 464582558.0, + "step": 2760 + }, + { + "entropy": 1.6782328983147938, + "epoch": 0.30331493230067835, + "grad_norm": 0.7183821201324463, + "learning_rate": 1.9311465049062036e-05, + "loss": 1.2911, + "mean_token_accuracy": 0.6735262523094813, + "num_tokens": 464715667.0, + "step": 2761 + }, + { + "entropy": 1.6694311400254567, + "epoch": 0.3034247892120513, + "grad_norm": 0.8863995671272278, + "learning_rate": 1.9310850712077897e-05, + "loss": 1.1808, + "mean_token_accuracy": 0.6926176349322001, + "num_tokens": 464851884.0, + "step": 2762 + }, + { + "entropy": 1.6931216617425282, + "epoch": 0.30353464612342423, + "grad_norm": 0.6631143689155579, + "learning_rate": 1.9310236112059076e-05, + "loss": 1.5559, + "mean_token_accuracy": 0.6472532153129578, + "num_tokens": 465012783.0, + "step": 2763 + }, + { + "entropy": 1.7171483039855957, + "epoch": 0.3036445030347972, + "grad_norm": 0.8214719295501709, + "learning_rate": 1.9309621249025033e-05, + "loss": 1.4744, + "mean_token_accuracy": 0.6650175104538599, + "num_tokens": 465166374.0, + "step": 2764 + }, + { + "entropy": 1.6988566915194194, + "epoch": 0.3037543599461701, + "grad_norm": 0.7386065125465393, + "learning_rate": 1.930900612299522e-05, + "loss": 1.5162, + "mean_token_accuracy": 0.6555062482754389, + "num_tokens": 465343817.0, + "step": 2765 + }, + { + "entropy": 1.721668581167857, + "epoch": 0.30386421685754306, + "grad_norm": 0.6086675524711609, + "learning_rate": 1.93083907339891e-05, + "loss": 1.3695, + "mean_token_accuracy": 0.6555673032999039, + "num_tokens": 465495475.0, + "step": 2766 + }, + { + "entropy": 1.7061657309532166, + "epoch": 0.303974073768916, + "grad_norm": 0.9607113599777222, + "learning_rate": 1.930777508202617e-05, + "loss": 1.294, + "mean_token_accuracy": 0.6664957702159882, + "num_tokens": 465650381.0, + "step": 2767 + }, + { + "entropy": 1.6989998122056325, + "epoch": 0.30408393068028894, + "grad_norm": 0.7101160883903503, + "learning_rate": 1.9307159167125887e-05, + "loss": 1.3634, + "mean_token_accuracy": 0.6705865065256754, + "num_tokens": 465802895.0, + "step": 2768 + }, + { + "entropy": 1.7586182653903961, + "epoch": 0.3041937875916619, + "grad_norm": 0.7714918851852417, + "learning_rate": 1.9306542989307774e-05, + "loss": 1.5817, + "mean_token_accuracy": 0.6417257438103358, + "num_tokens": 465991579.0, + "step": 2769 + }, + { + "entropy": 1.6654569506645203, + "epoch": 0.3043036445030348, + "grad_norm": 0.7866225242614746, + "learning_rate": 1.930592654859131e-05, + "loss": 1.5116, + "mean_token_accuracy": 0.6589531550804774, + "num_tokens": 466160734.0, + "step": 2770 + }, + { + "entropy": 1.7186169922351837, + "epoch": 0.30441350141440776, + "grad_norm": 0.8639843463897705, + "learning_rate": 1.9305309844996014e-05, + "loss": 1.4303, + "mean_token_accuracy": 0.6495883216460546, + "num_tokens": 466326061.0, + "step": 2771 + }, + { + "entropy": 1.7337853809197743, + "epoch": 0.30452335832578065, + "grad_norm": 0.7233784794807434, + "learning_rate": 1.9304692878541407e-05, + "loss": 1.3016, + "mean_token_accuracy": 0.6698676447073618, + "num_tokens": 466463745.0, + "step": 2772 + }, + { + "entropy": 1.6232518255710602, + "epoch": 0.3046332152371536, + "grad_norm": 0.6475121378898621, + "learning_rate": 1.930407564924701e-05, + "loss": 1.4022, + "mean_token_accuracy": 0.6630886395772299, + "num_tokens": 466593450.0, + "step": 2773 + }, + { + "entropy": 1.6947415073712666, + "epoch": 0.30474307214852653, + "grad_norm": 0.6529332399368286, + "learning_rate": 1.930345815713236e-05, + "loss": 1.489, + "mean_token_accuracy": 0.6480614195267359, + "num_tokens": 466750956.0, + "step": 2774 + }, + { + "entropy": 1.7117355068524678, + "epoch": 0.30485292905989947, + "grad_norm": 0.7926428914070129, + "learning_rate": 1.9302840402217004e-05, + "loss": 1.5965, + "mean_token_accuracy": 0.638613685965538, + "num_tokens": 466927278.0, + "step": 2775 + }, + { + "entropy": 1.6899478038152058, + "epoch": 0.3049627859712724, + "grad_norm": 0.7616437673568726, + "learning_rate": 1.930222238452049e-05, + "loss": 1.4495, + "mean_token_accuracy": 0.6520648350318273, + "num_tokens": 467105233.0, + "step": 2776 + }, + { + "entropy": 1.6401771505673726, + "epoch": 0.30507264288264535, + "grad_norm": 0.6086208820343018, + "learning_rate": 1.9301604104062378e-05, + "loss": 1.383, + "mean_token_accuracy": 0.6533524294694265, + "num_tokens": 467265754.0, + "step": 2777 + }, + { + "entropy": 1.6210876603921254, + "epoch": 0.3051824997940183, + "grad_norm": 0.717979907989502, + "learning_rate": 1.9300985560862235e-05, + "loss": 1.3769, + "mean_token_accuracy": 0.6591470589240392, + "num_tokens": 467411535.0, + "step": 2778 + }, + { + "entropy": 1.7154032389322917, + "epoch": 0.30529235670539123, + "grad_norm": 0.6699005365371704, + "learning_rate": 1.9300366754939642e-05, + "loss": 1.4056, + "mean_token_accuracy": 0.639826089143753, + "num_tokens": 467624066.0, + "step": 2779 + }, + { + "entropy": 1.7107741832733154, + "epoch": 0.3054022136167642, + "grad_norm": 0.6464679837226868, + "learning_rate": 1.9299747686314178e-05, + "loss": 1.3999, + "mean_token_accuracy": 0.6531073401371638, + "num_tokens": 467762610.0, + "step": 2780 + }, + { + "entropy": 1.723695029815038, + "epoch": 0.3055120705281371, + "grad_norm": 0.645092785358429, + "learning_rate": 1.9299128355005443e-05, + "loss": 1.2986, + "mean_token_accuracy": 0.6608155220746994, + "num_tokens": 467932748.0, + "step": 2781 + }, + { + "entropy": 1.7858980596065521, + "epoch": 0.30562192743951005, + "grad_norm": 0.7578333020210266, + "learning_rate": 1.9298508761033035e-05, + "loss": 1.4476, + "mean_token_accuracy": 0.6441960881153742, + "num_tokens": 468085168.0, + "step": 2782 + }, + { + "entropy": 1.6949599981307983, + "epoch": 0.305731784350883, + "grad_norm": 0.7087303400039673, + "learning_rate": 1.929788890441656e-05, + "loss": 1.324, + "mean_token_accuracy": 0.6659413874149323, + "num_tokens": 468243090.0, + "step": 2783 + }, + { + "entropy": 1.7236262162526448, + "epoch": 0.30584164126225594, + "grad_norm": 0.652289628982544, + "learning_rate": 1.9297268785175647e-05, + "loss": 1.3338, + "mean_token_accuracy": 0.66497070590655, + "num_tokens": 468357807.0, + "step": 2784 + }, + { + "entropy": 1.7121999263763428, + "epoch": 0.3059514981736288, + "grad_norm": 0.7053149938583374, + "learning_rate": 1.9296648403329915e-05, + "loss": 1.5589, + "mean_token_accuracy": 0.6464631706476212, + "num_tokens": 468557805.0, + "step": 2785 + }, + { + "entropy": 1.7528938154379528, + "epoch": 0.30606135508500176, + "grad_norm": 0.8355051279067993, + "learning_rate": 1.9296027758898993e-05, + "loss": 1.5363, + "mean_token_accuracy": 0.6388404369354248, + "num_tokens": 468705510.0, + "step": 2786 + }, + { + "entropy": 1.6805029014746349, + "epoch": 0.3061712119963747, + "grad_norm": 0.7608030438423157, + "learning_rate": 1.9295406851902538e-05, + "loss": 1.4483, + "mean_token_accuracy": 0.6516820987065634, + "num_tokens": 468892051.0, + "step": 2787 + }, + { + "entropy": 1.6512704094250996, + "epoch": 0.30628106890774764, + "grad_norm": 0.789167046546936, + "learning_rate": 1.929478568236019e-05, + "loss": 1.2027, + "mean_token_accuracy": 0.6777097682158152, + "num_tokens": 469054572.0, + "step": 2788 + }, + { + "entropy": 1.6551719705263774, + "epoch": 0.3063909258191206, + "grad_norm": 0.6679402589797974, + "learning_rate": 1.9294164250291613e-05, + "loss": 1.4609, + "mean_token_accuracy": 0.6612751533587774, + "num_tokens": 469218937.0, + "step": 2789 + }, + { + "entropy": 1.6761693557103474, + "epoch": 0.3065007827304935, + "grad_norm": 0.6211634278297424, + "learning_rate": 1.9293542555716476e-05, + "loss": 1.3792, + "mean_token_accuracy": 0.6515985727310181, + "num_tokens": 469472537.0, + "step": 2790 + }, + { + "entropy": 1.685540109872818, + "epoch": 0.30661063964186647, + "grad_norm": 0.7555694580078125, + "learning_rate": 1.9292920598654455e-05, + "loss": 1.2874, + "mean_token_accuracy": 0.6632583638032278, + "num_tokens": 469610592.0, + "step": 2791 + }, + { + "entropy": 1.760539670785268, + "epoch": 0.3067204965532394, + "grad_norm": 0.6980460286140442, + "learning_rate": 1.9292298379125235e-05, + "loss": 1.3597, + "mean_token_accuracy": 0.6550954182942709, + "num_tokens": 469752029.0, + "step": 2792 + }, + { + "entropy": 1.7144795854886372, + "epoch": 0.30683035346461235, + "grad_norm": 0.6551751494407654, + "learning_rate": 1.9291675897148504e-05, + "loss": 1.3658, + "mean_token_accuracy": 0.6568573415279388, + "num_tokens": 469937147.0, + "step": 2793 + }, + { + "entropy": 1.7677671909332275, + "epoch": 0.3069402103759853, + "grad_norm": 0.7456969618797302, + "learning_rate": 1.9291053152743968e-05, + "loss": 1.5226, + "mean_token_accuracy": 0.6467147767543793, + "num_tokens": 470105109.0, + "step": 2794 + }, + { + "entropy": 1.7279229164123535, + "epoch": 0.30705006728735823, + "grad_norm": 0.7341436743736267, + "learning_rate": 1.929043014593134e-05, + "loss": 1.3933, + "mean_token_accuracy": 0.6459408948818842, + "num_tokens": 470253996.0, + "step": 2795 + }, + { + "entropy": 1.7426668008168538, + "epoch": 0.30715992419873117, + "grad_norm": 0.7237141132354736, + "learning_rate": 1.9289806876730328e-05, + "loss": 1.3304, + "mean_token_accuracy": 0.6563834051291147, + "num_tokens": 470407684.0, + "step": 2796 + }, + { + "entropy": 1.7966994444529216, + "epoch": 0.3072697811101041, + "grad_norm": 0.6734642386436462, + "learning_rate": 1.9289183345160666e-05, + "loss": 1.368, + "mean_token_accuracy": 0.6494917968908945, + "num_tokens": 470537389.0, + "step": 2797 + }, + { + "entropy": 1.7864558796087902, + "epoch": 0.30737963802147705, + "grad_norm": 0.7371609210968018, + "learning_rate": 1.9288559551242084e-05, + "loss": 1.4408, + "mean_token_accuracy": 0.651896004875501, + "num_tokens": 470702673.0, + "step": 2798 + }, + { + "entropy": 1.7101215819517772, + "epoch": 0.30748949493284994, + "grad_norm": 0.9700989127159119, + "learning_rate": 1.9287935494994333e-05, + "loss": 1.4321, + "mean_token_accuracy": 0.6406466712554296, + "num_tokens": 470848334.0, + "step": 2799 + }, + { + "entropy": 1.66282253464063, + "epoch": 0.3075993518442229, + "grad_norm": 0.6381438970565796, + "learning_rate": 1.9287311176437154e-05, + "loss": 1.3278, + "mean_token_accuracy": 0.6706396341323853, + "num_tokens": 470980854.0, + "step": 2800 + }, + { + "entropy": 1.7123548885186513, + "epoch": 0.3077092087555958, + "grad_norm": 0.6777034401893616, + "learning_rate": 1.928668659559031e-05, + "loss": 1.2528, + "mean_token_accuracy": 0.6732690334320068, + "num_tokens": 471106170.0, + "step": 2801 + }, + { + "entropy": 1.6131497025489807, + "epoch": 0.30781906566696876, + "grad_norm": 0.6792572736740112, + "learning_rate": 1.9286061752473575e-05, + "loss": 1.3078, + "mean_token_accuracy": 0.6727622449398041, + "num_tokens": 471244822.0, + "step": 2802 + }, + { + "entropy": 1.731354941924413, + "epoch": 0.3079289225783417, + "grad_norm": 0.6826562881469727, + "learning_rate": 1.9285436647106716e-05, + "loss": 1.4002, + "mean_token_accuracy": 0.6561285803715388, + "num_tokens": 471422667.0, + "step": 2803 + }, + { + "entropy": 1.6713635822137196, + "epoch": 0.30803877948971464, + "grad_norm": 0.592697262763977, + "learning_rate": 1.9284811279509518e-05, + "loss": 1.46, + "mean_token_accuracy": 0.649135539929072, + "num_tokens": 471606192.0, + "step": 2804 + }, + { + "entropy": 1.702138513326645, + "epoch": 0.3081486364010876, + "grad_norm": 0.8116854429244995, + "learning_rate": 1.928418564970178e-05, + "loss": 1.2933, + "mean_token_accuracy": 0.6709979226191839, + "num_tokens": 471749051.0, + "step": 2805 + }, + { + "entropy": 1.6859862705071766, + "epoch": 0.3082584933124605, + "grad_norm": 0.670845091342926, + "learning_rate": 1.9283559757703295e-05, + "loss": 1.2985, + "mean_token_accuracy": 0.6729146291812261, + "num_tokens": 471883696.0, + "step": 2806 + }, + { + "entropy": 1.7104649444421132, + "epoch": 0.30836835022383347, + "grad_norm": 0.7185712456703186, + "learning_rate": 1.928293360353388e-05, + "loss": 1.4908, + "mean_token_accuracy": 0.6591552595297495, + "num_tokens": 472068070.0, + "step": 2807 + }, + { + "entropy": 1.6668047904968262, + "epoch": 0.3084782071352064, + "grad_norm": 0.6307313442230225, + "learning_rate": 1.9282307187213346e-05, + "loss": 1.2885, + "mean_token_accuracy": 0.6734770586093267, + "num_tokens": 472204796.0, + "step": 2808 + }, + { + "entropy": 1.756018191576004, + "epoch": 0.30858806404657935, + "grad_norm": 0.6855459213256836, + "learning_rate": 1.928168050876152e-05, + "loss": 1.4125, + "mean_token_accuracy": 0.6466716329256693, + "num_tokens": 472354680.0, + "step": 2809 + }, + { + "entropy": 1.7116002043088276, + "epoch": 0.3086979209579523, + "grad_norm": 0.7074426412582397, + "learning_rate": 1.9281053568198245e-05, + "loss": 1.3565, + "mean_token_accuracy": 0.6588050077358881, + "num_tokens": 472525501.0, + "step": 2810 + }, + { + "entropy": 1.6833085417747498, + "epoch": 0.30880777786932523, + "grad_norm": 0.7049890160560608, + "learning_rate": 1.928042636554335e-05, + "loss": 1.3164, + "mean_token_accuracy": 0.6683538804451624, + "num_tokens": 472719716.0, + "step": 2811 + }, + { + "entropy": 1.6787909964720409, + "epoch": 0.3089176347806981, + "grad_norm": 0.703137218952179, + "learning_rate": 1.9279798900816696e-05, + "loss": 1.3727, + "mean_token_accuracy": 0.6661172757546107, + "num_tokens": 472881173.0, + "step": 2812 + }, + { + "entropy": 1.7261568506558735, + "epoch": 0.30902749169207105, + "grad_norm": 0.6603509783744812, + "learning_rate": 1.9279171174038132e-05, + "loss": 1.3456, + "mean_token_accuracy": 0.6602363437414169, + "num_tokens": 473034728.0, + "step": 2813 + }, + { + "entropy": 1.685040682554245, + "epoch": 0.309137348603444, + "grad_norm": 0.743989109992981, + "learning_rate": 1.9278543185227535e-05, + "loss": 1.3698, + "mean_token_accuracy": 0.6495349953571955, + "num_tokens": 473229426.0, + "step": 2814 + }, + { + "entropy": 1.7586186130841572, + "epoch": 0.30924720551481694, + "grad_norm": 0.7787970900535583, + "learning_rate": 1.9277914934404774e-05, + "loss": 1.4331, + "mean_token_accuracy": 0.6600687354803085, + "num_tokens": 473377396.0, + "step": 2815 + }, + { + "entropy": 1.6828500429789226, + "epoch": 0.3093570624261899, + "grad_norm": 0.6871913075447083, + "learning_rate": 1.927728642158974e-05, + "loss": 1.3833, + "mean_token_accuracy": 0.6608909120162328, + "num_tokens": 473539223.0, + "step": 2816 + }, + { + "entropy": 1.749051849047343, + "epoch": 0.3094669193375628, + "grad_norm": 0.7910235524177551, + "learning_rate": 1.9276657646802318e-05, + "loss": 1.4661, + "mean_token_accuracy": 0.6595475325981776, + "num_tokens": 473693265.0, + "step": 2817 + }, + { + "entropy": 1.6784795920054119, + "epoch": 0.30957677624893576, + "grad_norm": 0.5966885089874268, + "learning_rate": 1.9276028610062412e-05, + "loss": 1.3034, + "mean_token_accuracy": 0.6713368693987528, + "num_tokens": 473845058.0, + "step": 2818 + }, + { + "entropy": 1.662716676791509, + "epoch": 0.3096866331603087, + "grad_norm": 0.6432068347930908, + "learning_rate": 1.927539931138993e-05, + "loss": 1.3785, + "mean_token_accuracy": 0.6545740862687429, + "num_tokens": 474043963.0, + "step": 2819 + }, + { + "entropy": 1.7873293658097584, + "epoch": 0.30979649007168164, + "grad_norm": 0.733140766620636, + "learning_rate": 1.9274769750804786e-05, + "loss": 1.4616, + "mean_token_accuracy": 0.6570829351743063, + "num_tokens": 474264410.0, + "step": 2820 + }, + { + "entropy": 1.6941826542218525, + "epoch": 0.3099063469830546, + "grad_norm": 0.6925363540649414, + "learning_rate": 1.9274139928326913e-05, + "loss": 1.206, + "mean_token_accuracy": 0.6815744390090307, + "num_tokens": 474365234.0, + "step": 2821 + }, + { + "entropy": 1.690369079510371, + "epoch": 0.3100162038944275, + "grad_norm": 0.5778855681419373, + "learning_rate": 1.927350984397623e-05, + "loss": 1.3985, + "mean_token_accuracy": 0.655535156528155, + "num_tokens": 474574557.0, + "step": 2822 + }, + { + "entropy": 1.639299343029658, + "epoch": 0.31012606080580046, + "grad_norm": 0.6779983639717102, + "learning_rate": 1.92728794977727e-05, + "loss": 1.3839, + "mean_token_accuracy": 0.6478785822788874, + "num_tokens": 474785559.0, + "step": 2823 + }, + { + "entropy": 1.6963231166203816, + "epoch": 0.3102359177171734, + "grad_norm": 0.6037493944168091, + "learning_rate": 1.9272248889736255e-05, + "loss": 1.4553, + "mean_token_accuracy": 0.6422694027423859, + "num_tokens": 474970243.0, + "step": 2824 + }, + { + "entropy": 1.7773303886254628, + "epoch": 0.31034577462854634, + "grad_norm": 0.6794271469116211, + "learning_rate": 1.927161801988686e-05, + "loss": 1.454, + "mean_token_accuracy": 0.6485249102115631, + "num_tokens": 475167904.0, + "step": 2825 + }, + { + "entropy": 1.7010780572891235, + "epoch": 0.31045563153991923, + "grad_norm": 0.7082254886627197, + "learning_rate": 1.9270986888244486e-05, + "loss": 1.3759, + "mean_token_accuracy": 0.6544408549865087, + "num_tokens": 475314141.0, + "step": 2826 + }, + { + "entropy": 1.7761707603931427, + "epoch": 0.31056548845129217, + "grad_norm": 0.6975213289260864, + "learning_rate": 1.92703554948291e-05, + "loss": 1.5001, + "mean_token_accuracy": 0.6315238277117411, + "num_tokens": 475504348.0, + "step": 2827 + }, + { + "entropy": 1.7582517365614574, + "epoch": 0.3106753453626651, + "grad_norm": 0.7386556267738342, + "learning_rate": 1.926972383966069e-05, + "loss": 1.548, + "mean_token_accuracy": 0.6362548967202505, + "num_tokens": 475668498.0, + "step": 2828 + }, + { + "entropy": 1.7129935026168823, + "epoch": 0.31078520227403805, + "grad_norm": 0.6855206489562988, + "learning_rate": 1.9269091922759248e-05, + "loss": 1.4024, + "mean_token_accuracy": 0.6538207034269968, + "num_tokens": 475810430.0, + "step": 2829 + }, + { + "entropy": 1.6759546200434368, + "epoch": 0.310895059185411, + "grad_norm": 0.7332449555397034, + "learning_rate": 1.9268459744144775e-05, + "loss": 1.3902, + "mean_token_accuracy": 0.6686356763044993, + "num_tokens": 476027236.0, + "step": 2830 + }, + { + "entropy": 1.7742613156636555, + "epoch": 0.31100491609678393, + "grad_norm": 0.6197798252105713, + "learning_rate": 1.9267827303837277e-05, + "loss": 1.4344, + "mean_token_accuracy": 0.6507979234059652, + "num_tokens": 476190360.0, + "step": 2831 + }, + { + "entropy": 1.6519160469373066, + "epoch": 0.3111147730081569, + "grad_norm": 0.5655571818351746, + "learning_rate": 1.9267194601856765e-05, + "loss": 1.3515, + "mean_token_accuracy": 0.6520318339268366, + "num_tokens": 476376327.0, + "step": 2832 + }, + { + "entropy": 1.7403542399406433, + "epoch": 0.3112246299195298, + "grad_norm": 0.7627872228622437, + "learning_rate": 1.9266561638223272e-05, + "loss": 1.2572, + "mean_token_accuracy": 0.6759726454814275, + "num_tokens": 476486395.0, + "step": 2833 + }, + { + "entropy": 1.767777254184087, + "epoch": 0.31133448683090276, + "grad_norm": 0.7002077102661133, + "learning_rate": 1.926592841295683e-05, + "loss": 1.4234, + "mean_token_accuracy": 0.6400815695524216, + "num_tokens": 476665343.0, + "step": 2834 + }, + { + "entropy": 1.6697245140870411, + "epoch": 0.3114443437422757, + "grad_norm": 0.6324102282524109, + "learning_rate": 1.9265294926077476e-05, + "loss": 1.4785, + "mean_token_accuracy": 0.6443575223286947, + "num_tokens": 476858023.0, + "step": 2835 + }, + { + "entropy": 1.6648296117782593, + "epoch": 0.31155420065364864, + "grad_norm": 0.6571188569068909, + "learning_rate": 1.9264661177605264e-05, + "loss": 1.281, + "mean_token_accuracy": 0.6755544741948446, + "num_tokens": 477049028.0, + "step": 2836 + }, + { + "entropy": 1.7388223707675934, + "epoch": 0.3116640575650216, + "grad_norm": 0.7424740195274353, + "learning_rate": 1.926402716756025e-05, + "loss": 1.4497, + "mean_token_accuracy": 0.6438274731238683, + "num_tokens": 477173446.0, + "step": 2837 + }, + { + "entropy": 1.7497480809688568, + "epoch": 0.3117739144763945, + "grad_norm": 0.7240679860115051, + "learning_rate": 1.9263392895962497e-05, + "loss": 1.4083, + "mean_token_accuracy": 0.6496013253927231, + "num_tokens": 477314695.0, + "step": 2838 + }, + { + "entropy": 1.6946297883987427, + "epoch": 0.3118837713877674, + "grad_norm": 0.680553674697876, + "learning_rate": 1.9262758362832082e-05, + "loss": 1.272, + "mean_token_accuracy": 0.6792226930459341, + "num_tokens": 477478810.0, + "step": 2839 + }, + { + "entropy": 1.7206788162390392, + "epoch": 0.31199362829914035, + "grad_norm": 0.7550413012504578, + "learning_rate": 1.9262123568189094e-05, + "loss": 1.4304, + "mean_token_accuracy": 0.6598779608805975, + "num_tokens": 477598456.0, + "step": 2840 + }, + { + "entropy": 1.655482719341914, + "epoch": 0.3121034852105133, + "grad_norm": 0.6688495874404907, + "learning_rate": 1.9261488512053615e-05, + "loss": 1.2902, + "mean_token_accuracy": 0.667375867565473, + "num_tokens": 477733015.0, + "step": 2841 + }, + { + "entropy": 1.727291077375412, + "epoch": 0.31221334212188623, + "grad_norm": 0.7079230546951294, + "learning_rate": 1.9260853194445743e-05, + "loss": 1.4079, + "mean_token_accuracy": 0.6517117569843928, + "num_tokens": 477897769.0, + "step": 2842 + }, + { + "entropy": 1.7456689874331157, + "epoch": 0.31232319903325917, + "grad_norm": 0.6888191103935242, + "learning_rate": 1.9260217615385593e-05, + "loss": 1.5706, + "mean_token_accuracy": 0.6395171880722046, + "num_tokens": 478082329.0, + "step": 2843 + }, + { + "entropy": 1.7743544578552246, + "epoch": 0.3124330559446321, + "grad_norm": 0.7306809425354004, + "learning_rate": 1.9259581774893278e-05, + "loss": 1.3308, + "mean_token_accuracy": 0.660937691728274, + "num_tokens": 478250478.0, + "step": 2844 + }, + { + "entropy": 1.7033919493357341, + "epoch": 0.31254291285600505, + "grad_norm": 0.7696589827537537, + "learning_rate": 1.9258945672988926e-05, + "loss": 1.3718, + "mean_token_accuracy": 0.6629279802242914, + "num_tokens": 478402917.0, + "step": 2845 + }, + { + "entropy": 1.6651720503966014, + "epoch": 0.312652769767378, + "grad_norm": 0.7585199475288391, + "learning_rate": 1.925830930969266e-05, + "loss": 1.2988, + "mean_token_accuracy": 0.675694132844607, + "num_tokens": 478585743.0, + "step": 2846 + }, + { + "entropy": 1.7346419990062714, + "epoch": 0.31276262667875093, + "grad_norm": 0.7175132036209106, + "learning_rate": 1.9257672685024625e-05, + "loss": 1.5035, + "mean_token_accuracy": 0.6487277994553248, + "num_tokens": 478737605.0, + "step": 2847 + }, + { + "entropy": 1.714732418457667, + "epoch": 0.3128724835901239, + "grad_norm": 0.7112360596656799, + "learning_rate": 1.9257035799004974e-05, + "loss": 1.5867, + "mean_token_accuracy": 0.6398107608159384, + "num_tokens": 478931966.0, + "step": 2848 + }, + { + "entropy": 1.6614445745944977, + "epoch": 0.3129823405014968, + "grad_norm": 0.6705912351608276, + "learning_rate": 1.925639865165386e-05, + "loss": 1.2835, + "mean_token_accuracy": 0.67622738579909, + "num_tokens": 479049933.0, + "step": 2849 + }, + { + "entropy": 1.7204997936884563, + "epoch": 0.31309219741286975, + "grad_norm": 0.5869386196136475, + "learning_rate": 1.9255761242991445e-05, + "loss": 1.4058, + "mean_token_accuracy": 0.6542830715576807, + "num_tokens": 479252205.0, + "step": 2850 + }, + { + "entropy": 1.7119795382022858, + "epoch": 0.3132020543242427, + "grad_norm": 0.6954273581504822, + "learning_rate": 1.925512357303791e-05, + "loss": 1.3548, + "mean_token_accuracy": 0.6612391769886017, + "num_tokens": 479422251.0, + "step": 2851 + }, + { + "entropy": 1.6646559834480286, + "epoch": 0.31331191123561564, + "grad_norm": 0.8026860952377319, + "learning_rate": 1.9254485641813434e-05, + "loss": 1.4965, + "mean_token_accuracy": 0.6526677558819453, + "num_tokens": 479628173.0, + "step": 2852 + }, + { + "entropy": 1.7153493762016296, + "epoch": 0.3134217681469885, + "grad_norm": 0.6951051950454712, + "learning_rate": 1.9253847449338202e-05, + "loss": 1.3979, + "mean_token_accuracy": 0.6515944103399912, + "num_tokens": 479815276.0, + "step": 2853 + }, + { + "entropy": 1.6854738493760426, + "epoch": 0.31353162505836146, + "grad_norm": 0.6943417191505432, + "learning_rate": 1.9253208995632426e-05, + "loss": 1.3106, + "mean_token_accuracy": 0.6591322422027588, + "num_tokens": 480010470.0, + "step": 2854 + }, + { + "entropy": 1.7210654417673747, + "epoch": 0.3136414819697344, + "grad_norm": 0.7324227690696716, + "learning_rate": 1.9252570280716298e-05, + "loss": 1.2533, + "mean_token_accuracy": 0.6761431097984314, + "num_tokens": 480149477.0, + "step": 2855 + }, + { + "entropy": 1.7122070292631786, + "epoch": 0.31375133888110734, + "grad_norm": 0.723173975944519, + "learning_rate": 1.9251931304610042e-05, + "loss": 1.3591, + "mean_token_accuracy": 0.6570076793432236, + "num_tokens": 480358379.0, + "step": 2856 + }, + { + "entropy": 1.7481829424699147, + "epoch": 0.3138611957924803, + "grad_norm": 0.7296705842018127, + "learning_rate": 1.925129206733388e-05, + "loss": 1.352, + "mean_token_accuracy": 0.6633712897698084, + "num_tokens": 480463609.0, + "step": 2857 + }, + { + "entropy": 1.6913323799769084, + "epoch": 0.3139710527038532, + "grad_norm": 0.6928101181983948, + "learning_rate": 1.925065256890804e-05, + "loss": 1.4533, + "mean_token_accuracy": 0.6504810303449631, + "num_tokens": 480692043.0, + "step": 2858 + }, + { + "entropy": 1.6184549927711487, + "epoch": 0.31408090961522617, + "grad_norm": 0.6742601990699768, + "learning_rate": 1.9250012809352764e-05, + "loss": 1.2693, + "mean_token_accuracy": 0.6729711244503657, + "num_tokens": 480841939.0, + "step": 2859 + }, + { + "entropy": 1.763518790404002, + "epoch": 0.3141907665265991, + "grad_norm": 0.9082183241844177, + "learning_rate": 1.92493727886883e-05, + "loss": 1.4003, + "mean_token_accuracy": 0.6487570206324259, + "num_tokens": 480978037.0, + "step": 2860 + }, + { + "entropy": 1.6946383118629456, + "epoch": 0.31430062343797205, + "grad_norm": 0.613614559173584, + "learning_rate": 1.9248732506934902e-05, + "loss": 1.3008, + "mean_token_accuracy": 0.6719754189252853, + "num_tokens": 481144692.0, + "step": 2861 + }, + { + "entropy": 1.6866445640722911, + "epoch": 0.314410480349345, + "grad_norm": 0.6680022478103638, + "learning_rate": 1.924809196411284e-05, + "loss": 1.3679, + "mean_token_accuracy": 0.6613704959551493, + "num_tokens": 481335314.0, + "step": 2862 + }, + { + "entropy": 1.710518628358841, + "epoch": 0.31452033726071793, + "grad_norm": 0.5925086140632629, + "learning_rate": 1.9247451160242385e-05, + "loss": 1.4802, + "mean_token_accuracy": 0.6449962158997854, + "num_tokens": 481534394.0, + "step": 2863 + }, + { + "entropy": 1.7207396229108174, + "epoch": 0.31463019417209087, + "grad_norm": 0.7651747465133667, + "learning_rate": 1.9246810095343815e-05, + "loss": 1.5563, + "mean_token_accuracy": 0.6396962677439054, + "num_tokens": 481754072.0, + "step": 2864 + }, + { + "entropy": 1.6443149745464325, + "epoch": 0.3147400510834638, + "grad_norm": 0.6283020973205566, + "learning_rate": 1.9246168769437426e-05, + "loss": 1.2379, + "mean_token_accuracy": 0.6734432826439539, + "num_tokens": 481925605.0, + "step": 2865 + }, + { + "entropy": 1.7003800670305889, + "epoch": 0.3148499079948367, + "grad_norm": 0.6588708162307739, + "learning_rate": 1.9245527182543506e-05, + "loss": 1.4739, + "mean_token_accuracy": 0.6499484032392502, + "num_tokens": 482134082.0, + "step": 2866 + }, + { + "entropy": 1.6552692552407582, + "epoch": 0.31495976490620964, + "grad_norm": 0.7375697493553162, + "learning_rate": 1.9244885334682367e-05, + "loss": 1.2952, + "mean_token_accuracy": 0.6863390256961187, + "num_tokens": 482280461.0, + "step": 2867 + }, + { + "entropy": 1.682984471321106, + "epoch": 0.3150696218175826, + "grad_norm": 0.6565489172935486, + "learning_rate": 1.9244243225874328e-05, + "loss": 1.321, + "mean_token_accuracy": 0.6630292187134424, + "num_tokens": 482426569.0, + "step": 2868 + }, + { + "entropy": 1.7070794304211934, + "epoch": 0.3151794787289555, + "grad_norm": 0.6953569650650024, + "learning_rate": 1.92436008561397e-05, + "loss": 1.2777, + "mean_token_accuracy": 0.6620368659496307, + "num_tokens": 482561769.0, + "step": 2869 + }, + { + "entropy": 1.7198736766974132, + "epoch": 0.31528933564032846, + "grad_norm": 0.6899498701095581, + "learning_rate": 1.924295822549882e-05, + "loss": 1.3307, + "mean_token_accuracy": 0.6546566337347031, + "num_tokens": 482694606.0, + "step": 2870 + }, + { + "entropy": 1.7433649897575378, + "epoch": 0.3153991925517014, + "grad_norm": 0.624694287776947, + "learning_rate": 1.9242315333972028e-05, + "loss": 1.464, + "mean_token_accuracy": 0.6367517908414205, + "num_tokens": 482880755.0, + "step": 2871 + }, + { + "entropy": 1.6819024284680684, + "epoch": 0.31550904946307434, + "grad_norm": 1.0012410879135132, + "learning_rate": 1.924167218157967e-05, + "loss": 1.3411, + "mean_token_accuracy": 0.6579955021540324, + "num_tokens": 483037033.0, + "step": 2872 + }, + { + "entropy": 1.7648041447003682, + "epoch": 0.3156189063744473, + "grad_norm": 0.772260844707489, + "learning_rate": 1.9241028768342097e-05, + "loss": 1.4098, + "mean_token_accuracy": 0.6752532223860422, + "num_tokens": 483178611.0, + "step": 2873 + }, + { + "entropy": 1.741791735092799, + "epoch": 0.3157287632858202, + "grad_norm": 0.6674953103065491, + "learning_rate": 1.9240385094279682e-05, + "loss": 1.3883, + "mean_token_accuracy": 0.6550341794888178, + "num_tokens": 483365090.0, + "step": 2874 + }, + { + "entropy": 1.7017056147257488, + "epoch": 0.31583862019719317, + "grad_norm": 0.7384848594665527, + "learning_rate": 1.923974115941279e-05, + "loss": 1.4, + "mean_token_accuracy": 0.6497112860282263, + "num_tokens": 483590442.0, + "step": 2875 + }, + { + "entropy": 1.7605169018109639, + "epoch": 0.3159484771085661, + "grad_norm": 0.6867170929908752, + "learning_rate": 1.92390969637618e-05, + "loss": 1.4002, + "mean_token_accuracy": 0.6571317712465922, + "num_tokens": 483711913.0, + "step": 2876 + }, + { + "entropy": 1.7703985174496968, + "epoch": 0.31605833401993905, + "grad_norm": 0.7674762010574341, + "learning_rate": 1.9238452507347112e-05, + "loss": 1.3849, + "mean_token_accuracy": 0.659190704425176, + "num_tokens": 483868222.0, + "step": 2877 + }, + { + "entropy": 1.7050415376822154, + "epoch": 0.316168190931312, + "grad_norm": 0.69243985414505, + "learning_rate": 1.9237807790189108e-05, + "loss": 1.4312, + "mean_token_accuracy": 0.6475951820611954, + "num_tokens": 484031399.0, + "step": 2878 + }, + { + "entropy": 1.6624840299288433, + "epoch": 0.31627804784268493, + "grad_norm": 0.670750617980957, + "learning_rate": 1.9237162812308204e-05, + "loss": 1.3691, + "mean_token_accuracy": 0.657655676205953, + "num_tokens": 484182873.0, + "step": 2879 + }, + { + "entropy": 1.748474011818568, + "epoch": 0.3163879047540578, + "grad_norm": 0.7633766531944275, + "learning_rate": 1.9236517573724808e-05, + "loss": 1.4173, + "mean_token_accuracy": 0.6639850089947382, + "num_tokens": 484326382.0, + "step": 2880 + }, + { + "entropy": 1.7452166378498077, + "epoch": 0.31649776166543075, + "grad_norm": 0.6374519467353821, + "learning_rate": 1.923587207445934e-05, + "loss": 1.428, + "mean_token_accuracy": 0.643037294348081, + "num_tokens": 484478175.0, + "step": 2881 + }, + { + "entropy": 1.7703827420870464, + "epoch": 0.3166076185768037, + "grad_norm": 0.6338856816291809, + "learning_rate": 1.923522631453223e-05, + "loss": 1.4608, + "mean_token_accuracy": 0.6400000900030136, + "num_tokens": 484684598.0, + "step": 2882 + }, + { + "entropy": 1.6763095458348591, + "epoch": 0.31671747548817664, + "grad_norm": 0.7417557835578918, + "learning_rate": 1.9234580293963922e-05, + "loss": 1.377, + "mean_token_accuracy": 0.6634032080570856, + "num_tokens": 484851276.0, + "step": 2883 + }, + { + "entropy": 1.7742979725201924, + "epoch": 0.3168273323995496, + "grad_norm": 0.8382861018180847, + "learning_rate": 1.9233934012774855e-05, + "loss": 1.3599, + "mean_token_accuracy": 0.6617914984623591, + "num_tokens": 485027045.0, + "step": 2884 + }, + { + "entropy": 1.6628169218699138, + "epoch": 0.3169371893109225, + "grad_norm": 0.6727002263069153, + "learning_rate": 1.923328747098549e-05, + "loss": 1.2461, + "mean_token_accuracy": 0.6721131453911463, + "num_tokens": 485188142.0, + "step": 2885 + }, + { + "entropy": 1.6688977181911469, + "epoch": 0.31704704622229546, + "grad_norm": 0.6404067277908325, + "learning_rate": 1.9232640668616284e-05, + "loss": 1.4282, + "mean_token_accuracy": 0.6530584941307703, + "num_tokens": 485375508.0, + "step": 2886 + }, + { + "entropy": 1.717422644297282, + "epoch": 0.3171569031336684, + "grad_norm": 0.6284343004226685, + "learning_rate": 1.923199360568771e-05, + "loss": 1.3367, + "mean_token_accuracy": 0.6689218978087107, + "num_tokens": 485534089.0, + "step": 2887 + }, + { + "entropy": 1.6929986675580342, + "epoch": 0.31726676004504134, + "grad_norm": 0.6857246160507202, + "learning_rate": 1.923134628222025e-05, + "loss": 1.357, + "mean_token_accuracy": 0.663006509343783, + "num_tokens": 485680262.0, + "step": 2888 + }, + { + "entropy": 1.680146853129069, + "epoch": 0.3173766169564143, + "grad_norm": 0.6327770352363586, + "learning_rate": 1.923069869823439e-05, + "loss": 1.4815, + "mean_token_accuracy": 0.6378313849369684, + "num_tokens": 485910473.0, + "step": 2889 + }, + { + "entropy": 1.752420961856842, + "epoch": 0.3174864738677872, + "grad_norm": 0.7673040628433228, + "learning_rate": 1.9230050853750624e-05, + "loss": 1.4938, + "mean_token_accuracy": 0.6509816845258077, + "num_tokens": 486064919.0, + "step": 2890 + }, + { + "entropy": 1.7698513368765514, + "epoch": 0.31759633077916016, + "grad_norm": 0.9164318442344666, + "learning_rate": 1.9229402748789456e-05, + "loss": 1.2388, + "mean_token_accuracy": 0.6785450875759125, + "num_tokens": 486196513.0, + "step": 2891 + }, + { + "entropy": 1.7393431663513184, + "epoch": 0.3177061876905331, + "grad_norm": 0.6718737483024597, + "learning_rate": 1.92287543833714e-05, + "loss": 1.3484, + "mean_token_accuracy": 0.6540123621622721, + "num_tokens": 486378261.0, + "step": 2892 + }, + { + "entropy": 1.707463949918747, + "epoch": 0.31781604460190604, + "grad_norm": 0.8465375304222107, + "learning_rate": 1.9228105757516974e-05, + "loss": 1.3403, + "mean_token_accuracy": 0.6693908423185349, + "num_tokens": 486592874.0, + "step": 2893 + }, + { + "entropy": 1.6605522235234578, + "epoch": 0.31792590151327893, + "grad_norm": 0.6796486377716064, + "learning_rate": 1.9227456871246714e-05, + "loss": 1.3849, + "mean_token_accuracy": 0.6787869185209274, + "num_tokens": 486712525.0, + "step": 2894 + }, + { + "entropy": 1.7513254880905151, + "epoch": 0.31803575842465187, + "grad_norm": 0.7111853957176208, + "learning_rate": 1.9226807724581148e-05, + "loss": 1.4619, + "mean_token_accuracy": 0.6655588646729788, + "num_tokens": 486866331.0, + "step": 2895 + }, + { + "entropy": 1.7475620210170746, + "epoch": 0.3181456153360248, + "grad_norm": 0.737848699092865, + "learning_rate": 1.922615831754082e-05, + "loss": 1.3467, + "mean_token_accuracy": 0.6666428198417028, + "num_tokens": 487025593.0, + "step": 2896 + }, + { + "entropy": 1.6630546947320302, + "epoch": 0.31825547224739775, + "grad_norm": 0.8860530853271484, + "learning_rate": 1.9225508650146294e-05, + "loss": 1.5313, + "mean_token_accuracy": 0.6354395796855291, + "num_tokens": 487258804.0, + "step": 2897 + }, + { + "entropy": 1.753849446773529, + "epoch": 0.3183653291587707, + "grad_norm": 0.8779124617576599, + "learning_rate": 1.9224858722418122e-05, + "loss": 1.581, + "mean_token_accuracy": 0.6377290387948354, + "num_tokens": 487413807.0, + "step": 2898 + }, + { + "entropy": 1.6524033447106679, + "epoch": 0.31847518607014363, + "grad_norm": 0.7529869079589844, + "learning_rate": 1.922420853437688e-05, + "loss": 1.1943, + "mean_token_accuracy": 0.679450144370397, + "num_tokens": 487517690.0, + "step": 2899 + }, + { + "entropy": 1.6763293743133545, + "epoch": 0.3185850429815166, + "grad_norm": 0.656287670135498, + "learning_rate": 1.9223558086043147e-05, + "loss": 1.3652, + "mean_token_accuracy": 0.6669528832038244, + "num_tokens": 487677299.0, + "step": 2900 + }, + { + "entropy": 1.6773990790049236, + "epoch": 0.3186948998928895, + "grad_norm": 0.7613338828086853, + "learning_rate": 1.92229073774375e-05, + "loss": 1.5412, + "mean_token_accuracy": 0.6458795020977656, + "num_tokens": 487835814.0, + "step": 2901 + }, + { + "entropy": 1.736249138911565, + "epoch": 0.31880475680426246, + "grad_norm": 0.5995625853538513, + "learning_rate": 1.9222256408580545e-05, + "loss": 1.3598, + "mean_token_accuracy": 0.6582354704538981, + "num_tokens": 487995593.0, + "step": 2902 + }, + { + "entropy": 1.715472271045049, + "epoch": 0.3189146137156354, + "grad_norm": 0.6768810153007507, + "learning_rate": 1.9221605179492878e-05, + "loss": 1.3586, + "mean_token_accuracy": 0.6697869201501211, + "num_tokens": 488186812.0, + "step": 2903 + }, + { + "entropy": 1.7300209204355876, + "epoch": 0.31902447062700834, + "grad_norm": 0.7611496448516846, + "learning_rate": 1.922095369019511e-05, + "loss": 1.4155, + "mean_token_accuracy": 0.6667229980230331, + "num_tokens": 488359701.0, + "step": 2904 + }, + { + "entropy": 1.7256126403808594, + "epoch": 0.3191343275383813, + "grad_norm": 0.6969589591026306, + "learning_rate": 1.922030194070786e-05, + "loss": 1.4225, + "mean_token_accuracy": 0.6631851394971212, + "num_tokens": 488511590.0, + "step": 2905 + }, + { + "entropy": 1.678377350171407, + "epoch": 0.3192441844497542, + "grad_norm": 0.6946367025375366, + "learning_rate": 1.9219649931051764e-05, + "loss": 1.3259, + "mean_token_accuracy": 0.6693602552016577, + "num_tokens": 488646530.0, + "step": 2906 + }, + { + "entropy": 1.6788996458053589, + "epoch": 0.3193540413611271, + "grad_norm": 0.5399008989334106, + "learning_rate": 1.9218997661247446e-05, + "loss": 1.3327, + "mean_token_accuracy": 0.6513966371615728, + "num_tokens": 488871769.0, + "step": 2907 + }, + { + "entropy": 1.695472886164983, + "epoch": 0.31946389827250005, + "grad_norm": 0.7106685638427734, + "learning_rate": 1.921834513131556e-05, + "loss": 1.3448, + "mean_token_accuracy": 0.6767653375864029, + "num_tokens": 489036941.0, + "step": 2908 + }, + { + "entropy": 1.750376472870509, + "epoch": 0.319573755183873, + "grad_norm": 0.6660662889480591, + "learning_rate": 1.921769234127675e-05, + "loss": 1.3957, + "mean_token_accuracy": 0.6495022475719452, + "num_tokens": 489212402.0, + "step": 2909 + }, + { + "entropy": 1.6709170639514923, + "epoch": 0.31968361209524593, + "grad_norm": 0.6726402044296265, + "learning_rate": 1.9217039291151684e-05, + "loss": 1.2903, + "mean_token_accuracy": 0.6766814192136129, + "num_tokens": 489359701.0, + "step": 2910 + }, + { + "entropy": 1.772289474805196, + "epoch": 0.31979346900661887, + "grad_norm": 0.8132745027542114, + "learning_rate": 1.9216385980961027e-05, + "loss": 1.3114, + "mean_token_accuracy": 0.6673167099555334, + "num_tokens": 489509117.0, + "step": 2911 + }, + { + "entropy": 1.6465057233969371, + "epoch": 0.3199033259179918, + "grad_norm": 0.6829494833946228, + "learning_rate": 1.9215732410725453e-05, + "loss": 1.4098, + "mean_token_accuracy": 0.6528383443752924, + "num_tokens": 489687106.0, + "step": 2912 + }, + { + "entropy": 1.6819744805494945, + "epoch": 0.32001318282936475, + "grad_norm": 0.6408959031105042, + "learning_rate": 1.9215078580465653e-05, + "loss": 1.3749, + "mean_token_accuracy": 0.6629981398582458, + "num_tokens": 489879747.0, + "step": 2913 + }, + { + "entropy": 1.7291929125785828, + "epoch": 0.3201230397407377, + "grad_norm": 0.7077094912528992, + "learning_rate": 1.9214424490202316e-05, + "loss": 1.4732, + "mean_token_accuracy": 0.6318852504094442, + "num_tokens": 490037336.0, + "step": 2914 + }, + { + "entropy": 1.7284752825895946, + "epoch": 0.32023289665211063, + "grad_norm": 0.6826415657997131, + "learning_rate": 1.9213770139956145e-05, + "loss": 1.413, + "mean_token_accuracy": 0.6539119978745779, + "num_tokens": 490202718.0, + "step": 2915 + }, + { + "entropy": 1.6593516568342845, + "epoch": 0.3203427535634836, + "grad_norm": 0.6397992372512817, + "learning_rate": 1.921311552974785e-05, + "loss": 1.3014, + "mean_token_accuracy": 0.6723136901855469, + "num_tokens": 490349621.0, + "step": 2916 + }, + { + "entropy": 1.7860759397347767, + "epoch": 0.3204526104748565, + "grad_norm": 0.6551001071929932, + "learning_rate": 1.9212460659598153e-05, + "loss": 1.3407, + "mean_token_accuracy": 0.6589092761278152, + "num_tokens": 490457337.0, + "step": 2917 + }, + { + "entropy": 1.730087826649348, + "epoch": 0.32056246738622945, + "grad_norm": 0.7016686201095581, + "learning_rate": 1.9211805529527775e-05, + "loss": 1.484, + "mean_token_accuracy": 0.6437141746282578, + "num_tokens": 490669111.0, + "step": 2918 + }, + { + "entropy": 1.6922438045342763, + "epoch": 0.3206723242976024, + "grad_norm": 0.7173215746879578, + "learning_rate": 1.921115013955745e-05, + "loss": 1.4722, + "mean_token_accuracy": 0.6466464251279831, + "num_tokens": 490849927.0, + "step": 2919 + }, + { + "entropy": 1.7310430804888408, + "epoch": 0.32078218120897534, + "grad_norm": 0.7087364196777344, + "learning_rate": 1.9210494489707926e-05, + "loss": 1.3491, + "mean_token_accuracy": 0.66358715792497, + "num_tokens": 490961402.0, + "step": 2920 + }, + { + "entropy": 1.6876995464166005, + "epoch": 0.3208920381203482, + "grad_norm": 0.697143018245697, + "learning_rate": 1.9209838579999947e-05, + "loss": 1.3488, + "mean_token_accuracy": 0.6534036248922348, + "num_tokens": 491129298.0, + "step": 2921 + }, + { + "entropy": 1.680976579586665, + "epoch": 0.32100189503172116, + "grad_norm": 0.7552234530448914, + "learning_rate": 1.920918241045428e-05, + "loss": 1.5304, + "mean_token_accuracy": 0.6349671731392542, + "num_tokens": 491341947.0, + "step": 2922 + }, + { + "entropy": 1.6714160442352295, + "epoch": 0.3211117519430941, + "grad_norm": 0.6923167109489441, + "learning_rate": 1.920852598109169e-05, + "loss": 1.2721, + "mean_token_accuracy": 0.678156390786171, + "num_tokens": 491468492.0, + "step": 2923 + }, + { + "entropy": 1.7526885271072388, + "epoch": 0.32122160885446704, + "grad_norm": 0.6262015700340271, + "learning_rate": 1.920786929193295e-05, + "loss": 1.4856, + "mean_token_accuracy": 0.6357933630545934, + "num_tokens": 491655738.0, + "step": 2924 + }, + { + "entropy": 1.799785594145457, + "epoch": 0.32133146576584, + "grad_norm": 0.7483623623847961, + "learning_rate": 1.920721234299884e-05, + "loss": 1.4297, + "mean_token_accuracy": 0.645599807302157, + "num_tokens": 491883634.0, + "step": 2925 + }, + { + "entropy": 1.6690879464149475, + "epoch": 0.3214413226772129, + "grad_norm": 0.7060349583625793, + "learning_rate": 1.9206555134310166e-05, + "loss": 1.2896, + "mean_token_accuracy": 0.6781369696060816, + "num_tokens": 492049115.0, + "step": 2926 + }, + { + "entropy": 1.7402922709782918, + "epoch": 0.32155117958858587, + "grad_norm": 0.8359885215759277, + "learning_rate": 1.9205897665887718e-05, + "loss": 1.5249, + "mean_token_accuracy": 0.644447940091292, + "num_tokens": 492183166.0, + "step": 2927 + }, + { + "entropy": 1.7350502908229828, + "epoch": 0.3216610364999588, + "grad_norm": 0.7333374619483948, + "learning_rate": 1.9205239937752304e-05, + "loss": 1.3024, + "mean_token_accuracy": 0.6708205292622248, + "num_tokens": 492369207.0, + "step": 2928 + }, + { + "entropy": 1.665328135093053, + "epoch": 0.32177089341133175, + "grad_norm": 0.7489623427391052, + "learning_rate": 1.9204581949924744e-05, + "loss": 1.2896, + "mean_token_accuracy": 0.6738118877013525, + "num_tokens": 492533237.0, + "step": 2929 + }, + { + "entropy": 1.7022863527139027, + "epoch": 0.3218807503227047, + "grad_norm": 0.7714312076568604, + "learning_rate": 1.9203923702425863e-05, + "loss": 1.4599, + "mean_token_accuracy": 0.6606474220752716, + "num_tokens": 492733024.0, + "step": 2930 + }, + { + "entropy": 1.732410063346227, + "epoch": 0.32199060723407763, + "grad_norm": 0.7454637885093689, + "learning_rate": 1.9203265195276494e-05, + "loss": 1.2414, + "mean_token_accuracy": 0.6792856454849243, + "num_tokens": 492879920.0, + "step": 2931 + }, + { + "entropy": 1.6901346445083618, + "epoch": 0.32210046414545057, + "grad_norm": 0.6151790618896484, + "learning_rate": 1.9202606428497476e-05, + "loss": 1.3307, + "mean_token_accuracy": 0.6711449126402537, + "num_tokens": 493008404.0, + "step": 2932 + }, + { + "entropy": 1.7341221272945404, + "epoch": 0.3222103210568235, + "grad_norm": 0.6119834780693054, + "learning_rate": 1.9201947402109663e-05, + "loss": 1.5834, + "mean_token_accuracy": 0.6300620784362158, + "num_tokens": 493215482.0, + "step": 2933 + }, + { + "entropy": 1.6682228247324626, + "epoch": 0.3223201779681964, + "grad_norm": 0.6601076722145081, + "learning_rate": 1.920128811613391e-05, + "loss": 1.4085, + "mean_token_accuracy": 0.651889776190122, + "num_tokens": 493409342.0, + "step": 2934 + }, + { + "entropy": 1.701788494984309, + "epoch": 0.32243003487956934, + "grad_norm": 0.7248215079307556, + "learning_rate": 1.9200628570591084e-05, + "loss": 1.3979, + "mean_token_accuracy": 0.6579019526640574, + "num_tokens": 493560199.0, + "step": 2935 + }, + { + "entropy": 1.684028019507726, + "epoch": 0.3225398917909423, + "grad_norm": 0.6509939432144165, + "learning_rate": 1.919996876550206e-05, + "loss": 1.4448, + "mean_token_accuracy": 0.6362205495436987, + "num_tokens": 493753256.0, + "step": 2936 + }, + { + "entropy": 1.720295896132787, + "epoch": 0.3226497487023152, + "grad_norm": 0.7366635203361511, + "learning_rate": 1.919930870088772e-05, + "loss": 1.5125, + "mean_token_accuracy": 0.6424083262681961, + "num_tokens": 493932673.0, + "step": 2937 + }, + { + "entropy": 1.6438338458538055, + "epoch": 0.32275960561368816, + "grad_norm": 0.6553351879119873, + "learning_rate": 1.919864837676895e-05, + "loss": 1.2257, + "mean_token_accuracy": 0.6853679070870081, + "num_tokens": 494050760.0, + "step": 2938 + }, + { + "entropy": 1.7294786274433136, + "epoch": 0.3228694625250611, + "grad_norm": 0.655874490737915, + "learning_rate": 1.9197987793166655e-05, + "loss": 1.3517, + "mean_token_accuracy": 0.658056045571963, + "num_tokens": 494240349.0, + "step": 2939 + }, + { + "entropy": 1.7637418508529663, + "epoch": 0.32297931943643404, + "grad_norm": 0.7153424620628357, + "learning_rate": 1.9197326950101744e-05, + "loss": 1.4253, + "mean_token_accuracy": 0.655969480673472, + "num_tokens": 494379250.0, + "step": 2940 + }, + { + "entropy": 1.696879784266154, + "epoch": 0.323089176347807, + "grad_norm": 0.6477358937263489, + "learning_rate": 1.9196665847595126e-05, + "loss": 1.3708, + "mean_token_accuracy": 0.6583545009295145, + "num_tokens": 494535109.0, + "step": 2941 + }, + { + "entropy": 1.716434359550476, + "epoch": 0.3231990332591799, + "grad_norm": 0.6917335987091064, + "learning_rate": 1.9196004485667728e-05, + "loss": 1.5066, + "mean_token_accuracy": 0.6468228300412496, + "num_tokens": 494667235.0, + "step": 2942 + }, + { + "entropy": 1.7584334413210552, + "epoch": 0.32330889017055287, + "grad_norm": 0.6509451866149902, + "learning_rate": 1.9195342864340477e-05, + "loss": 1.4719, + "mean_token_accuracy": 0.6429315656423569, + "num_tokens": 494841807.0, + "step": 2943 + }, + { + "entropy": 1.7075772682825725, + "epoch": 0.3234187470819258, + "grad_norm": 0.682874858379364, + "learning_rate": 1.9194680983634323e-05, + "loss": 1.3857, + "mean_token_accuracy": 0.6495272219181061, + "num_tokens": 494983967.0, + "step": 2944 + }, + { + "entropy": 1.6379812856515248, + "epoch": 0.32352860399329875, + "grad_norm": 0.8594545722007751, + "learning_rate": 1.9194018843570208e-05, + "loss": 1.3222, + "mean_token_accuracy": 0.6915220071872076, + "num_tokens": 495111243.0, + "step": 2945 + }, + { + "entropy": 1.6541229287783306, + "epoch": 0.3236384609046717, + "grad_norm": 0.6572254300117493, + "learning_rate": 1.9193356444169086e-05, + "loss": 1.3144, + "mean_token_accuracy": 0.6642016619443893, + "num_tokens": 495250273.0, + "step": 2946 + }, + { + "entropy": 1.721500555674235, + "epoch": 0.32374831781604463, + "grad_norm": 0.643337070941925, + "learning_rate": 1.9192693785451925e-05, + "loss": 1.388, + "mean_token_accuracy": 0.6487238456805547, + "num_tokens": 495413516.0, + "step": 2947 + }, + { + "entropy": 1.6922288636366527, + "epoch": 0.3238581747274175, + "grad_norm": 0.6867654919624329, + "learning_rate": 1.91920308674397e-05, + "loss": 1.4196, + "mean_token_accuracy": 0.6719547808170319, + "num_tokens": 495576233.0, + "step": 2948 + }, + { + "entropy": 1.6915934085845947, + "epoch": 0.32396803163879045, + "grad_norm": 0.680091142654419, + "learning_rate": 1.919136769015339e-05, + "loss": 1.3689, + "mean_token_accuracy": 0.6606242706378301, + "num_tokens": 495789140.0, + "step": 2949 + }, + { + "entropy": 1.6307895680268605, + "epoch": 0.3240778885501634, + "grad_norm": 0.650629460811615, + "learning_rate": 1.919070425361398e-05, + "loss": 1.2485, + "mean_token_accuracy": 0.678497518102328, + "num_tokens": 495930360.0, + "step": 2950 + }, + { + "entropy": 1.7259167035420735, + "epoch": 0.32418774546153634, + "grad_norm": 0.8071044683456421, + "learning_rate": 1.9190040557842472e-05, + "loss": 1.4053, + "mean_token_accuracy": 0.6503327190876007, + "num_tokens": 496059841.0, + "step": 2951 + }, + { + "entropy": 1.6474729379018147, + "epoch": 0.3242976023729093, + "grad_norm": 0.6372878551483154, + "learning_rate": 1.918937660285987e-05, + "loss": 1.3471, + "mean_token_accuracy": 0.673143744468689, + "num_tokens": 496224948.0, + "step": 2952 + }, + { + "entropy": 1.6423666775226593, + "epoch": 0.3244074592842822, + "grad_norm": 0.6071237325668335, + "learning_rate": 1.918871238868719e-05, + "loss": 1.4078, + "mean_token_accuracy": 0.6574894885222117, + "num_tokens": 496408432.0, + "step": 2953 + }, + { + "entropy": 1.7058760623137157, + "epoch": 0.32451731619565516, + "grad_norm": 0.7525854706764221, + "learning_rate": 1.9188047915345455e-05, + "loss": 1.3487, + "mean_token_accuracy": 0.6681368251641592, + "num_tokens": 496548095.0, + "step": 2954 + }, + { + "entropy": 1.6724230746428173, + "epoch": 0.3246271731070281, + "grad_norm": 0.615999698638916, + "learning_rate": 1.9187383182855693e-05, + "loss": 1.5475, + "mean_token_accuracy": 0.65153868496418, + "num_tokens": 496777867.0, + "step": 2955 + }, + { + "entropy": 1.7171376744906108, + "epoch": 0.32473703001840104, + "grad_norm": 0.5861404538154602, + "learning_rate": 1.918671819123894e-05, + "loss": 1.2934, + "mean_token_accuracy": 0.6681883285442988, + "num_tokens": 496927140.0, + "step": 2956 + }, + { + "entropy": 1.7224361499150593, + "epoch": 0.324846886929774, + "grad_norm": 0.6110925674438477, + "learning_rate": 1.9186052940516245e-05, + "loss": 1.3695, + "mean_token_accuracy": 0.6483077257871628, + "num_tokens": 497121766.0, + "step": 2957 + }, + { + "entropy": 1.7283575534820557, + "epoch": 0.3249567438411469, + "grad_norm": 0.6756021976470947, + "learning_rate": 1.9185387430708663e-05, + "loss": 1.3304, + "mean_token_accuracy": 0.6640335768461227, + "num_tokens": 497257864.0, + "step": 2958 + }, + { + "entropy": 1.6929436028003693, + "epoch": 0.32506660075251986, + "grad_norm": 0.6733092069625854, + "learning_rate": 1.918472166183726e-05, + "loss": 1.5546, + "mean_token_accuracy": 0.6462220996618271, + "num_tokens": 497457514.0, + "step": 2959 + }, + { + "entropy": 1.7729289134343464, + "epoch": 0.3251764576638928, + "grad_norm": 0.7560225129127502, + "learning_rate": 1.9184055633923105e-05, + "loss": 1.3431, + "mean_token_accuracy": 0.6635515093803406, + "num_tokens": 497622838.0, + "step": 2960 + }, + { + "entropy": 1.7306861976782482, + "epoch": 0.3252863145752657, + "grad_norm": 0.6715700626373291, + "learning_rate": 1.9183389346987274e-05, + "loss": 1.3844, + "mean_token_accuracy": 0.6618200093507767, + "num_tokens": 497773384.0, + "step": 2961 + }, + { + "entropy": 1.664503941933314, + "epoch": 0.32539617148663863, + "grad_norm": 0.6396395564079285, + "learning_rate": 1.9182722801050858e-05, + "loss": 1.3481, + "mean_token_accuracy": 0.6635024050871531, + "num_tokens": 497959238.0, + "step": 2962 + }, + { + "entropy": 1.6557064652442932, + "epoch": 0.32550602839801157, + "grad_norm": 0.6370712518692017, + "learning_rate": 1.9182055996134955e-05, + "loss": 1.4172, + "mean_token_accuracy": 0.6532147924105326, + "num_tokens": 498113737.0, + "step": 2963 + }, + { + "entropy": 1.7849931518236797, + "epoch": 0.3256158853093845, + "grad_norm": 0.7333866357803345, + "learning_rate": 1.9181388932260663e-05, + "loss": 1.4514, + "mean_token_accuracy": 0.6394390016794205, + "num_tokens": 498298336.0, + "step": 2964 + }, + { + "entropy": 1.697540670633316, + "epoch": 0.32572574222075745, + "grad_norm": 0.6613360047340393, + "learning_rate": 1.91807216094491e-05, + "loss": 1.3528, + "mean_token_accuracy": 0.6626348445812861, + "num_tokens": 498469831.0, + "step": 2965 + }, + { + "entropy": 1.788981705904007, + "epoch": 0.3258355991321304, + "grad_norm": 0.7449756264686584, + "learning_rate": 1.9180054027721386e-05, + "loss": 1.3713, + "mean_token_accuracy": 0.6606344183286031, + "num_tokens": 498594651.0, + "step": 2966 + }, + { + "entropy": 1.749136467774709, + "epoch": 0.32594545604350333, + "grad_norm": 0.7143117785453796, + "learning_rate": 1.9179386187098648e-05, + "loss": 1.3748, + "mean_token_accuracy": 0.6651021838188171, + "num_tokens": 498725535.0, + "step": 2967 + }, + { + "entropy": 1.714682827393214, + "epoch": 0.3260553129548763, + "grad_norm": 0.7331691980361938, + "learning_rate": 1.917871808760202e-05, + "loss": 1.4736, + "mean_token_accuracy": 0.664309561252594, + "num_tokens": 498953694.0, + "step": 2968 + }, + { + "entropy": 1.630650371313095, + "epoch": 0.3261651698662492, + "grad_norm": 0.7111721038818359, + "learning_rate": 1.917804972925265e-05, + "loss": 1.3526, + "mean_token_accuracy": 0.6674729784329733, + "num_tokens": 499166088.0, + "step": 2969 + }, + { + "entropy": 1.695541262626648, + "epoch": 0.32627502677762216, + "grad_norm": 0.5924942493438721, + "learning_rate": 1.9177381112071693e-05, + "loss": 1.3686, + "mean_token_accuracy": 0.6531450500090917, + "num_tokens": 499373411.0, + "step": 2970 + }, + { + "entropy": 1.6883581181367238, + "epoch": 0.3263848836889951, + "grad_norm": 0.6998016834259033, + "learning_rate": 1.917671223608031e-05, + "loss": 1.3799, + "mean_token_accuracy": 0.6497906893491745, + "num_tokens": 499509178.0, + "step": 2971 + }, + { + "entropy": 1.7307297984759014, + "epoch": 0.32649474060036804, + "grad_norm": 0.6792302131652832, + "learning_rate": 1.9176043101299664e-05, + "loss": 1.4943, + "mean_token_accuracy": 0.6375556737184525, + "num_tokens": 499681437.0, + "step": 2972 + }, + { + "entropy": 1.7387321988741558, + "epoch": 0.326604597511741, + "grad_norm": 0.7723334431648254, + "learning_rate": 1.917537370775094e-05, + "loss": 1.5843, + "mean_token_accuracy": 0.6392184148232142, + "num_tokens": 499865905.0, + "step": 2973 + }, + { + "entropy": 1.7423201700051625, + "epoch": 0.3267144544231139, + "grad_norm": 0.721889853477478, + "learning_rate": 1.9174704055455327e-05, + "loss": 1.4046, + "mean_token_accuracy": 0.6562129308780035, + "num_tokens": 499992743.0, + "step": 2974 + }, + { + "entropy": 1.711453249057134, + "epoch": 0.3268243113344868, + "grad_norm": 0.6114857196807861, + "learning_rate": 1.917403414443401e-05, + "loss": 1.4313, + "mean_token_accuracy": 0.6468845208485922, + "num_tokens": 500185819.0, + "step": 2975 + }, + { + "entropy": 1.6875855922698975, + "epoch": 0.32693416824585975, + "grad_norm": 0.6063688397407532, + "learning_rate": 1.9173363974708196e-05, + "loss": 1.4764, + "mean_token_accuracy": 0.6517787824074427, + "num_tokens": 500420510.0, + "step": 2976 + }, + { + "entropy": 1.6635367274284363, + "epoch": 0.3270440251572327, + "grad_norm": 0.7195242047309875, + "learning_rate": 1.9172693546299094e-05, + "loss": 1.296, + "mean_token_accuracy": 0.6765902439753214, + "num_tokens": 500555525.0, + "step": 2977 + }, + { + "entropy": 1.717705875635147, + "epoch": 0.32715388206860563, + "grad_norm": 0.6923717856407166, + "learning_rate": 1.9172022859227927e-05, + "loss": 1.5837, + "mean_token_accuracy": 0.6424253235260645, + "num_tokens": 500746134.0, + "step": 2978 + }, + { + "entropy": 1.7490998009840648, + "epoch": 0.32726373897997857, + "grad_norm": 0.747678279876709, + "learning_rate": 1.9171351913515916e-05, + "loss": 1.4725, + "mean_token_accuracy": 0.6539937580625216, + "num_tokens": 500921039.0, + "step": 2979 + }, + { + "entropy": 1.6731195151805878, + "epoch": 0.3273735958913515, + "grad_norm": 0.8201509118080139, + "learning_rate": 1.91706807091843e-05, + "loss": 1.4356, + "mean_token_accuracy": 0.6559693316618601, + "num_tokens": 501079984.0, + "step": 2980 + }, + { + "entropy": 1.7330588301022847, + "epoch": 0.32748345280272445, + "grad_norm": 0.777803897857666, + "learning_rate": 1.9170009246254323e-05, + "loss": 1.3363, + "mean_token_accuracy": 0.6543682813644409, + "num_tokens": 501218988.0, + "step": 2981 + }, + { + "entropy": 1.7402281661828358, + "epoch": 0.3275933097140974, + "grad_norm": 0.7554537653923035, + "learning_rate": 1.9169337524747232e-05, + "loss": 1.3441, + "mean_token_accuracy": 0.6577061663071314, + "num_tokens": 501333041.0, + "step": 2982 + }, + { + "entropy": 1.794272631406784, + "epoch": 0.32770316662547033, + "grad_norm": 0.6783401966094971, + "learning_rate": 1.9168665544684292e-05, + "loss": 1.568, + "mean_token_accuracy": 0.6587167580922445, + "num_tokens": 501481324.0, + "step": 2983 + }, + { + "entropy": 1.822703331708908, + "epoch": 0.3278130235368433, + "grad_norm": 0.7269600629806519, + "learning_rate": 1.9167993306086768e-05, + "loss": 1.6219, + "mean_token_accuracy": 0.613858292500178, + "num_tokens": 501706864.0, + "step": 2984 + }, + { + "entropy": 1.7719651063283284, + "epoch": 0.3279228804482162, + "grad_norm": 0.6666971445083618, + "learning_rate": 1.9167320808975936e-05, + "loss": 1.4524, + "mean_token_accuracy": 0.6489970783392588, + "num_tokens": 501867433.0, + "step": 2985 + }, + { + "entropy": 1.6982284088929493, + "epoch": 0.32803273735958915, + "grad_norm": 0.655606210231781, + "learning_rate": 1.916664805337308e-05, + "loss": 1.4744, + "mean_token_accuracy": 0.6488128999869028, + "num_tokens": 502050616.0, + "step": 2986 + }, + { + "entropy": 1.676661233107249, + "epoch": 0.3281425942709621, + "grad_norm": 1.2920039892196655, + "learning_rate": 1.9165975039299497e-05, + "loss": 1.41, + "mean_token_accuracy": 0.6578782151142756, + "num_tokens": 502183884.0, + "step": 2987 + }, + { + "entropy": 1.7440513372421265, + "epoch": 0.328252451182335, + "grad_norm": 0.6458728909492493, + "learning_rate": 1.9165301766776478e-05, + "loss": 1.3387, + "mean_token_accuracy": 0.6633184005816778, + "num_tokens": 502312558.0, + "step": 2988 + }, + { + "entropy": 1.7916107575098674, + "epoch": 0.3283623080937079, + "grad_norm": 0.6091529130935669, + "learning_rate": 1.916462823582534e-05, + "loss": 1.4744, + "mean_token_accuracy": 0.6307684083779653, + "num_tokens": 502501943.0, + "step": 2989 + }, + { + "entropy": 1.7316680351893108, + "epoch": 0.32847216500508086, + "grad_norm": 0.8422166705131531, + "learning_rate": 1.9163954446467396e-05, + "loss": 1.4116, + "mean_token_accuracy": 0.6442118336757024, + "num_tokens": 502643511.0, + "step": 2990 + }, + { + "entropy": 1.7342032194137573, + "epoch": 0.3285820219164538, + "grad_norm": 0.7511305212974548, + "learning_rate": 1.9163280398723974e-05, + "loss": 1.497, + "mean_token_accuracy": 0.6555758366982142, + "num_tokens": 502797590.0, + "step": 2991 + }, + { + "entropy": 1.7133084932963054, + "epoch": 0.32869187882782674, + "grad_norm": 0.6199161410331726, + "learning_rate": 1.9162606092616407e-05, + "loss": 1.4054, + "mean_token_accuracy": 0.6550638278325399, + "num_tokens": 503013251.0, + "step": 2992 + }, + { + "entropy": 1.6971174776554108, + "epoch": 0.3288017357391997, + "grad_norm": 0.7281301617622375, + "learning_rate": 1.9161931528166034e-05, + "loss": 1.4092, + "mean_token_accuracy": 0.6583978980779648, + "num_tokens": 503184291.0, + "step": 2993 + }, + { + "entropy": 1.6845273971557617, + "epoch": 0.3289115926505726, + "grad_norm": 0.720726728439331, + "learning_rate": 1.9161256705394204e-05, + "loss": 1.3414, + "mean_token_accuracy": 0.6659517834583918, + "num_tokens": 503315433.0, + "step": 2994 + }, + { + "entropy": 1.729546884695689, + "epoch": 0.32902144956194557, + "grad_norm": 0.7851258516311646, + "learning_rate": 1.916058162432228e-05, + "loss": 1.2136, + "mean_token_accuracy": 0.6868036687374115, + "num_tokens": 503460120.0, + "step": 2995 + }, + { + "entropy": 1.6761589348316193, + "epoch": 0.3291313064733185, + "grad_norm": 0.742510974407196, + "learning_rate": 1.9159906284971627e-05, + "loss": 1.2834, + "mean_token_accuracy": 0.6747480084498724, + "num_tokens": 503610117.0, + "step": 2996 + }, + { + "entropy": 1.7127369443575542, + "epoch": 0.32924116338469145, + "grad_norm": 0.6928642392158508, + "learning_rate": 1.915923068736361e-05, + "loss": 1.3747, + "mean_token_accuracy": 0.6500995755195618, + "num_tokens": 503752043.0, + "step": 2997 + }, + { + "entropy": 1.7473743855953217, + "epoch": 0.3293510202960644, + "grad_norm": 0.7999443411827087, + "learning_rate": 1.915855483151962e-05, + "loss": 1.3799, + "mean_token_accuracy": 0.6510451088349024, + "num_tokens": 503902442.0, + "step": 2998 + }, + { + "entropy": 1.7298793097337086, + "epoch": 0.32946087720743733, + "grad_norm": 0.7435027956962585, + "learning_rate": 1.9157878717461048e-05, + "loss": 1.362, + "mean_token_accuracy": 0.6558040330807368, + "num_tokens": 504050985.0, + "step": 2999 + }, + { + "entropy": 1.752679854631424, + "epoch": 0.32957073411881027, + "grad_norm": 0.7258424758911133, + "learning_rate": 1.9157202345209293e-05, + "loss": 1.3282, + "mean_token_accuracy": 0.6720992128054301, + "num_tokens": 504192687.0, + "step": 3000 + }, + { + "entropy": 1.7436956961949666, + "epoch": 0.3296805910301832, + "grad_norm": 0.7615739703178406, + "learning_rate": 1.9156525714785758e-05, + "loss": 1.5075, + "mean_token_accuracy": 0.6405996978282928, + "num_tokens": 504369108.0, + "step": 3001 + }, + { + "entropy": 1.6998974084854126, + "epoch": 0.3297904479415561, + "grad_norm": 0.6445051431655884, + "learning_rate": 1.9155848826211856e-05, + "loss": 1.3417, + "mean_token_accuracy": 0.6661799550056458, + "num_tokens": 504538330.0, + "step": 3002 + }, + { + "entropy": 1.7470042010148366, + "epoch": 0.32990030485292904, + "grad_norm": 0.729354202747345, + "learning_rate": 1.915517167950902e-05, + "loss": 1.3582, + "mean_token_accuracy": 0.669076090057691, + "num_tokens": 504667747.0, + "step": 3003 + }, + { + "entropy": 1.7464979787667592, + "epoch": 0.330010161764302, + "grad_norm": 0.61955326795578, + "learning_rate": 1.9154494274698668e-05, + "loss": 1.6182, + "mean_token_accuracy": 0.6314500818649927, + "num_tokens": 504866089.0, + "step": 3004 + }, + { + "entropy": 1.7116199831167858, + "epoch": 0.3301200186756749, + "grad_norm": 0.675858736038208, + "learning_rate": 1.9153816611802252e-05, + "loss": 1.438, + "mean_token_accuracy": 0.652592346072197, + "num_tokens": 505020897.0, + "step": 3005 + }, + { + "entropy": 1.7077987988789876, + "epoch": 0.33022987558704786, + "grad_norm": 0.674115002155304, + "learning_rate": 1.9153138690841212e-05, + "loss": 1.3522, + "mean_token_accuracy": 0.6594054301579794, + "num_tokens": 505194598.0, + "step": 3006 + }, + { + "entropy": 1.7043022612730663, + "epoch": 0.3303397324984208, + "grad_norm": 0.8764580488204956, + "learning_rate": 1.9152460511837006e-05, + "loss": 1.4887, + "mean_token_accuracy": 0.6579779237508774, + "num_tokens": 505360115.0, + "step": 3007 + }, + { + "entropy": 1.6655643979708354, + "epoch": 0.33044958940979374, + "grad_norm": 0.7459789514541626, + "learning_rate": 1.9151782074811098e-05, + "loss": 1.2533, + "mean_token_accuracy": 0.6698919186989466, + "num_tokens": 505487058.0, + "step": 3008 + }, + { + "entropy": 1.6778320570786793, + "epoch": 0.3305594463211667, + "grad_norm": 0.6878098845481873, + "learning_rate": 1.9151103379784964e-05, + "loss": 1.2724, + "mean_token_accuracy": 0.6792107174793879, + "num_tokens": 505668993.0, + "step": 3009 + }, + { + "entropy": 1.7397213677565257, + "epoch": 0.3306693032325396, + "grad_norm": 0.6972684264183044, + "learning_rate": 1.915042442678008e-05, + "loss": 1.4157, + "mean_token_accuracy": 0.6579104761282603, + "num_tokens": 505806107.0, + "step": 3010 + }, + { + "entropy": 1.790239155292511, + "epoch": 0.33077916014391257, + "grad_norm": 0.6503934264183044, + "learning_rate": 1.914974521581793e-05, + "loss": 1.4499, + "mean_token_accuracy": 0.6496985306342443, + "num_tokens": 505975483.0, + "step": 3011 + }, + { + "entropy": 1.7129294872283936, + "epoch": 0.3308890170552855, + "grad_norm": 0.6534547805786133, + "learning_rate": 1.9149065746920023e-05, + "loss": 1.6048, + "mean_token_accuracy": 0.640040377775828, + "num_tokens": 506158617.0, + "step": 3012 + }, + { + "entropy": 1.7680010497570038, + "epoch": 0.33099887396665845, + "grad_norm": 0.6420508027076721, + "learning_rate": 1.9148386020107857e-05, + "loss": 1.422, + "mean_token_accuracy": 0.6478531509637833, + "num_tokens": 506352143.0, + "step": 3013 + }, + { + "entropy": 1.693307230869929, + "epoch": 0.3311087308780314, + "grad_norm": 0.6774839758872986, + "learning_rate": 1.914770603540294e-05, + "loss": 1.4863, + "mean_token_accuracy": 0.6670309404532114, + "num_tokens": 506573320.0, + "step": 3014 + }, + { + "entropy": 1.6939273178577423, + "epoch": 0.33121858778940433, + "grad_norm": 0.6691749691963196, + "learning_rate": 1.9147025792826803e-05, + "loss": 1.4095, + "mean_token_accuracy": 0.6504554947217306, + "num_tokens": 506733027.0, + "step": 3015 + }, + { + "entropy": 1.7147826254367828, + "epoch": 0.3313284447007772, + "grad_norm": 0.6580132246017456, + "learning_rate": 1.914634529240097e-05, + "loss": 1.3658, + "mean_token_accuracy": 0.6597993324200312, + "num_tokens": 506878473.0, + "step": 3016 + }, + { + "entropy": 1.6655152241388957, + "epoch": 0.33143830161215015, + "grad_norm": 0.6758162379264832, + "learning_rate": 1.914566453414698e-05, + "loss": 1.2672, + "mean_token_accuracy": 0.6674109697341919, + "num_tokens": 507007095.0, + "step": 3017 + }, + { + "entropy": 1.6784224609533946, + "epoch": 0.3315481585235231, + "grad_norm": 0.7197324633598328, + "learning_rate": 1.9144983518086378e-05, + "loss": 1.41, + "mean_token_accuracy": 0.6585833777983984, + "num_tokens": 507154336.0, + "step": 3018 + }, + { + "entropy": 1.6712388892968495, + "epoch": 0.33165801543489604, + "grad_norm": 0.7722300887107849, + "learning_rate": 1.9144302244240715e-05, + "loss": 1.4964, + "mean_token_accuracy": 0.6578239550193151, + "num_tokens": 507338056.0, + "step": 3019 + }, + { + "entropy": 1.6774245699246724, + "epoch": 0.331767872346269, + "grad_norm": 0.7282935976982117, + "learning_rate": 1.9143620712631555e-05, + "loss": 1.2622, + "mean_token_accuracy": 0.6662948727607727, + "num_tokens": 507455716.0, + "step": 3020 + }, + { + "entropy": 1.6499692300955455, + "epoch": 0.3318777292576419, + "grad_norm": 0.6513247489929199, + "learning_rate": 1.914293892328047e-05, + "loss": 1.4438, + "mean_token_accuracy": 0.6459585577249527, + "num_tokens": 507658074.0, + "step": 3021 + }, + { + "entropy": 1.7383529146512349, + "epoch": 0.33198758616901486, + "grad_norm": 0.629367470741272, + "learning_rate": 1.9142256876209046e-05, + "loss": 1.308, + "mean_token_accuracy": 0.666439284880956, + "num_tokens": 507828153.0, + "step": 3022 + }, + { + "entropy": 1.732998142639796, + "epoch": 0.3320974430803878, + "grad_norm": 0.890418529510498, + "learning_rate": 1.914157457143885e-05, + "loss": 1.2594, + "mean_token_accuracy": 0.6755412022272745, + "num_tokens": 507958961.0, + "step": 3023 + }, + { + "entropy": 1.7181305587291718, + "epoch": 0.33220729999176074, + "grad_norm": 0.6836814880371094, + "learning_rate": 1.914089200899149e-05, + "loss": 1.4408, + "mean_token_accuracy": 0.6476282527049383, + "num_tokens": 508133687.0, + "step": 3024 + }, + { + "entropy": 1.7462326188882191, + "epoch": 0.3323171569031337, + "grad_norm": 0.6367360949516296, + "learning_rate": 1.914020918888857e-05, + "loss": 1.4157, + "mean_token_accuracy": 0.6456411679585775, + "num_tokens": 508330078.0, + "step": 3025 + }, + { + "entropy": 1.611125926176707, + "epoch": 0.3324270138145066, + "grad_norm": 0.5883477330207825, + "learning_rate": 1.9139526111151695e-05, + "loss": 1.3606, + "mean_token_accuracy": 0.6693080514669418, + "num_tokens": 508519565.0, + "step": 3026 + }, + { + "entropy": 1.7090338071187336, + "epoch": 0.33253687072587956, + "grad_norm": 0.7724202275276184, + "learning_rate": 1.9138842775802483e-05, + "loss": 1.3513, + "mean_token_accuracy": 0.6597883005936941, + "num_tokens": 508705625.0, + "step": 3027 + }, + { + "entropy": 1.6161086161931355, + "epoch": 0.3326467276372525, + "grad_norm": 0.6500586867332458, + "learning_rate": 1.913815918286257e-05, + "loss": 1.2911, + "mean_token_accuracy": 0.6685569137334824, + "num_tokens": 508844283.0, + "step": 3028 + }, + { + "entropy": 1.7713151176770527, + "epoch": 0.3327565845486254, + "grad_norm": 0.8597967624664307, + "learning_rate": 1.9137475332353586e-05, + "loss": 1.5667, + "mean_token_accuracy": 0.6257463147242864, + "num_tokens": 509093149.0, + "step": 3029 + }, + { + "entropy": 1.6782112022240956, + "epoch": 0.33286644145999833, + "grad_norm": 0.6756134033203125, + "learning_rate": 1.913679122429717e-05, + "loss": 1.592, + "mean_token_accuracy": 0.6362019727627436, + "num_tokens": 509269750.0, + "step": 3030 + }, + { + "entropy": 1.7369333803653717, + "epoch": 0.33297629837137127, + "grad_norm": 0.5684821605682373, + "learning_rate": 1.9136106858714983e-05, + "loss": 1.4523, + "mean_token_accuracy": 0.6342654128869375, + "num_tokens": 509476830.0, + "step": 3031 + }, + { + "entropy": 1.7342596749464672, + "epoch": 0.3330861552827442, + "grad_norm": 0.6759097576141357, + "learning_rate": 1.9135422235628676e-05, + "loss": 1.1854, + "mean_token_accuracy": 0.6867292175690333, + "num_tokens": 509569400.0, + "step": 3032 + }, + { + "entropy": 1.671131859223048, + "epoch": 0.33319601219411715, + "grad_norm": 0.7441453337669373, + "learning_rate": 1.913473735505992e-05, + "loss": 1.5021, + "mean_token_accuracy": 0.6428874333699545, + "num_tokens": 509764136.0, + "step": 3033 + }, + { + "entropy": 1.7323594490687053, + "epoch": 0.3333058691054901, + "grad_norm": 0.6669164299964905, + "learning_rate": 1.91340522170304e-05, + "loss": 1.3045, + "mean_token_accuracy": 0.6643939961989721, + "num_tokens": 509890442.0, + "step": 3034 + }, + { + "entropy": 1.7155201435089111, + "epoch": 0.33341572601686303, + "grad_norm": 0.6839675307273865, + "learning_rate": 1.9133366821561788e-05, + "loss": 1.3481, + "mean_token_accuracy": 0.6636628260215124, + "num_tokens": 510052207.0, + "step": 3035 + }, + { + "entropy": 1.7188059786955516, + "epoch": 0.333525582928236, + "grad_norm": 0.7544474601745605, + "learning_rate": 1.9132681168675778e-05, + "loss": 1.4636, + "mean_token_accuracy": 0.6669894407192866, + "num_tokens": 510227328.0, + "step": 3036 + }, + { + "entropy": 1.6821933786074321, + "epoch": 0.3336354398396089, + "grad_norm": 0.6986051797866821, + "learning_rate": 1.9131995258394077e-05, + "loss": 1.3955, + "mean_token_accuracy": 0.6599144538243612, + "num_tokens": 510424924.0, + "step": 3037 + }, + { + "entropy": 1.7697338461875916, + "epoch": 0.33374529675098186, + "grad_norm": 0.6908696293830872, + "learning_rate": 1.913130909073839e-05, + "loss": 1.2912, + "mean_token_accuracy": 0.6725708792606989, + "num_tokens": 510524846.0, + "step": 3038 + }, + { + "entropy": 1.7425800959269206, + "epoch": 0.3338551536623548, + "grad_norm": 0.6675323247909546, + "learning_rate": 1.9130622665730434e-05, + "loss": 1.5795, + "mean_token_accuracy": 0.6429319083690643, + "num_tokens": 510715188.0, + "step": 3039 + }, + { + "entropy": 1.6848439772923787, + "epoch": 0.33396501057372774, + "grad_norm": 0.6058026552200317, + "learning_rate": 1.9129935983391933e-05, + "loss": 1.435, + "mean_token_accuracy": 0.6421783665815989, + "num_tokens": 510913269.0, + "step": 3040 + }, + { + "entropy": 1.718187967936198, + "epoch": 0.3340748674851007, + "grad_norm": 0.6798617839813232, + "learning_rate": 1.9129249043744627e-05, + "loss": 1.4205, + "mean_token_accuracy": 0.6531337300936381, + "num_tokens": 511088529.0, + "step": 3041 + }, + { + "entropy": 1.7231755753358204, + "epoch": 0.3341847243964736, + "grad_norm": 0.821942150592804, + "learning_rate": 1.9128561846810247e-05, + "loss": 1.4694, + "mean_token_accuracy": 0.6571696201960245, + "num_tokens": 511284719.0, + "step": 3042 + }, + { + "entropy": 1.722469339768092, + "epoch": 0.3342945813078465, + "grad_norm": 0.6946660876274109, + "learning_rate": 1.9127874392610548e-05, + "loss": 1.4006, + "mean_token_accuracy": 0.6625101615985235, + "num_tokens": 511411692.0, + "step": 3043 + }, + { + "entropy": 1.7435683111349742, + "epoch": 0.33440443821921945, + "grad_norm": 0.7410975098609924, + "learning_rate": 1.9127186681167288e-05, + "loss": 1.4159, + "mean_token_accuracy": 0.6528633783260981, + "num_tokens": 511551200.0, + "step": 3044 + }, + { + "entropy": 1.7684580485026042, + "epoch": 0.3345142951305924, + "grad_norm": 0.7469737529754639, + "learning_rate": 1.912649871250223e-05, + "loss": 1.4624, + "mean_token_accuracy": 0.6433129956324896, + "num_tokens": 511779579.0, + "step": 3045 + }, + { + "entropy": 1.7564850250879924, + "epoch": 0.33462415204196533, + "grad_norm": 0.6813525557518005, + "learning_rate": 1.9125810486637152e-05, + "loss": 1.3189, + "mean_token_accuracy": 0.6654183914264044, + "num_tokens": 511907802.0, + "step": 3046 + }, + { + "entropy": 1.739985744158427, + "epoch": 0.33473400895333827, + "grad_norm": 0.7431362271308899, + "learning_rate": 1.9125122003593833e-05, + "loss": 1.3795, + "mean_token_accuracy": 0.6528779665629069, + "num_tokens": 512047494.0, + "step": 3047 + }, + { + "entropy": 1.7247369190057118, + "epoch": 0.3348438658647112, + "grad_norm": 0.6911116242408752, + "learning_rate": 1.9124433263394063e-05, + "loss": 1.6181, + "mean_token_accuracy": 0.6445044080416361, + "num_tokens": 512268944.0, + "step": 3048 + }, + { + "entropy": 1.7096697489420574, + "epoch": 0.33495372277608415, + "grad_norm": 0.7822225689888, + "learning_rate": 1.9123744266059644e-05, + "loss": 1.3366, + "mean_token_accuracy": 0.6592263529698054, + "num_tokens": 512381190.0, + "step": 3049 + }, + { + "entropy": 1.7738013366858165, + "epoch": 0.3350635796874571, + "grad_norm": 0.9529100656509399, + "learning_rate": 1.9123055011612375e-05, + "loss": 1.4496, + "mean_token_accuracy": 0.6615629196166992, + "num_tokens": 512579620.0, + "step": 3050 + }, + { + "entropy": 1.7056050995985668, + "epoch": 0.33517343659883003, + "grad_norm": 0.7717422842979431, + "learning_rate": 1.912236550007408e-05, + "loss": 1.4488, + "mean_token_accuracy": 0.6731387178103129, + "num_tokens": 512725425.0, + "step": 3051 + }, + { + "entropy": 1.7472182114919026, + "epoch": 0.335283293510203, + "grad_norm": 0.687759518623352, + "learning_rate": 1.9121675731466572e-05, + "loss": 1.4012, + "mean_token_accuracy": 0.6478379468123118, + "num_tokens": 512904679.0, + "step": 3052 + }, + { + "entropy": 1.7227794031302135, + "epoch": 0.3353931504215759, + "grad_norm": 0.6436380743980408, + "learning_rate": 1.912098570581169e-05, + "loss": 1.3285, + "mean_token_accuracy": 0.6682673941055933, + "num_tokens": 513037137.0, + "step": 3053 + }, + { + "entropy": 1.7546689013640087, + "epoch": 0.33550300733294885, + "grad_norm": 0.7144925594329834, + "learning_rate": 1.912029542313127e-05, + "loss": 1.5938, + "mean_token_accuracy": 0.6321324606736501, + "num_tokens": 513241089.0, + "step": 3054 + }, + { + "entropy": 1.7048328717549641, + "epoch": 0.3356128642443218, + "grad_norm": 0.5735582709312439, + "learning_rate": 1.9119604883447155e-05, + "loss": 1.4212, + "mean_token_accuracy": 0.644182562828064, + "num_tokens": 513485301.0, + "step": 3055 + }, + { + "entropy": 1.710164765516917, + "epoch": 0.3357227211556947, + "grad_norm": 0.6873478293418884, + "learning_rate": 1.9118914086781208e-05, + "loss": 1.3381, + "mean_token_accuracy": 0.6514832923809687, + "num_tokens": 513653369.0, + "step": 3056 + }, + { + "entropy": 1.6649628281593323, + "epoch": 0.3358325780670676, + "grad_norm": 0.573425829410553, + "learning_rate": 1.911822303315529e-05, + "loss": 1.4072, + "mean_token_accuracy": 0.6597304493188858, + "num_tokens": 513852950.0, + "step": 3057 + }, + { + "entropy": 1.686522215604782, + "epoch": 0.33594243497844056, + "grad_norm": 0.62139493227005, + "learning_rate": 1.9117531722591267e-05, + "loss": 1.4055, + "mean_token_accuracy": 0.6543814688920975, + "num_tokens": 513995038.0, + "step": 3058 + }, + { + "entropy": 1.6593074301878612, + "epoch": 0.3360522918898135, + "grad_norm": 0.7323905229568481, + "learning_rate": 1.9116840155111024e-05, + "loss": 1.2915, + "mean_token_accuracy": 0.6618063052495321, + "num_tokens": 514138366.0, + "step": 3059 + }, + { + "entropy": 1.738914539416631, + "epoch": 0.33616214880118644, + "grad_norm": 0.6564517021179199, + "learning_rate": 1.911614833073645e-05, + "loss": 1.4708, + "mean_token_accuracy": 0.657853235801061, + "num_tokens": 514286217.0, + "step": 3060 + }, + { + "entropy": 1.7365315755208333, + "epoch": 0.3362720057125594, + "grad_norm": 0.7343533039093018, + "learning_rate": 1.9115456249489438e-05, + "loss": 1.368, + "mean_token_accuracy": 0.6527054756879807, + "num_tokens": 514488524.0, + "step": 3061 + }, + { + "entropy": 1.6972340444723766, + "epoch": 0.3363818626239323, + "grad_norm": 0.705672562122345, + "learning_rate": 1.911476391139189e-05, + "loss": 1.3701, + "mean_token_accuracy": 0.6614405562480291, + "num_tokens": 514625795.0, + "step": 3062 + }, + { + "entropy": 1.729969580968221, + "epoch": 0.33649171953530527, + "grad_norm": 0.7283251881599426, + "learning_rate": 1.9114071316465724e-05, + "loss": 1.3981, + "mean_token_accuracy": 0.6484886904557546, + "num_tokens": 514758846.0, + "step": 3063 + }, + { + "entropy": 1.7162734270095825, + "epoch": 0.3366015764466782, + "grad_norm": 0.7314303517341614, + "learning_rate": 1.9113378464732855e-05, + "loss": 1.3198, + "mean_token_accuracy": 0.6710415830214819, + "num_tokens": 514886650.0, + "step": 3064 + }, + { + "entropy": 1.7744275629520416, + "epoch": 0.33671143335805115, + "grad_norm": 0.6890711784362793, + "learning_rate": 1.9112685356215213e-05, + "loss": 1.5018, + "mean_token_accuracy": 0.6435980846484503, + "num_tokens": 515096800.0, + "step": 3065 + }, + { + "entropy": 1.7427007655302684, + "epoch": 0.3368212902694241, + "grad_norm": 0.8294128775596619, + "learning_rate": 1.9111991990934736e-05, + "loss": 1.2624, + "mean_token_accuracy": 0.668077364563942, + "num_tokens": 515220739.0, + "step": 3066 + }, + { + "entropy": 1.7589279413223267, + "epoch": 0.33693114718079703, + "grad_norm": 0.6529168486595154, + "learning_rate": 1.9111298368913368e-05, + "loss": 1.4271, + "mean_token_accuracy": 0.6581545720497767, + "num_tokens": 515409312.0, + "step": 3067 + }, + { + "entropy": 1.7398897409439087, + "epoch": 0.33704100409216997, + "grad_norm": 0.705767035484314, + "learning_rate": 1.9110604490173065e-05, + "loss": 1.4301, + "mean_token_accuracy": 0.6575960069894791, + "num_tokens": 515532475.0, + "step": 3068 + }, + { + "entropy": 1.6691156526406605, + "epoch": 0.3371508610035429, + "grad_norm": 0.6828332543373108, + "learning_rate": 1.9109910354735778e-05, + "loss": 1.2612, + "mean_token_accuracy": 0.6719839175542196, + "num_tokens": 515668637.0, + "step": 3069 + }, + { + "entropy": 1.7897346119085948, + "epoch": 0.3372607179149158, + "grad_norm": 0.7696541547775269, + "learning_rate": 1.910921596262349e-05, + "loss": 1.5465, + "mean_token_accuracy": 0.6406855061650276, + "num_tokens": 515800127.0, + "step": 3070 + }, + { + "entropy": 1.7390386561552684, + "epoch": 0.33737057482628874, + "grad_norm": 0.8148576617240906, + "learning_rate": 1.9108521313858164e-05, + "loss": 1.5137, + "mean_token_accuracy": 0.640427882472674, + "num_tokens": 516004144.0, + "step": 3071 + }, + { + "entropy": 1.645291765530904, + "epoch": 0.3374804317376617, + "grad_norm": 0.7999487519264221, + "learning_rate": 1.9107826408461796e-05, + "loss": 1.3468, + "mean_token_accuracy": 0.668777272105217, + "num_tokens": 516178099.0, + "step": 3072 + }, + { + "entropy": 1.7338153024514515, + "epoch": 0.3375902886490346, + "grad_norm": 0.6503975987434387, + "learning_rate": 1.9107131246456372e-05, + "loss": 1.4304, + "mean_token_accuracy": 0.6472113927205404, + "num_tokens": 516343512.0, + "step": 3073 + }, + { + "entropy": 1.7268471519152324, + "epoch": 0.33770014556040756, + "grad_norm": 0.8366050124168396, + "learning_rate": 1.9106435827863903e-05, + "loss": 1.372, + "mean_token_accuracy": 0.6583438813686371, + "num_tokens": 516489688.0, + "step": 3074 + }, + { + "entropy": 1.7441972096761067, + "epoch": 0.3378100024717805, + "grad_norm": 0.6372826099395752, + "learning_rate": 1.9105740152706388e-05, + "loss": 1.4328, + "mean_token_accuracy": 0.6417555212974548, + "num_tokens": 516658774.0, + "step": 3075 + }, + { + "entropy": 1.660355657339096, + "epoch": 0.33791985938315344, + "grad_norm": 0.6428292989730835, + "learning_rate": 1.9105044221005852e-05, + "loss": 1.3697, + "mean_token_accuracy": 0.6568067520856857, + "num_tokens": 516796670.0, + "step": 3076 + }, + { + "entropy": 1.69850026567777, + "epoch": 0.3380297162945264, + "grad_norm": 0.7151433825492859, + "learning_rate": 1.910434803278432e-05, + "loss": 1.3114, + "mean_token_accuracy": 0.6674358497063319, + "num_tokens": 516955225.0, + "step": 3077 + }, + { + "entropy": 1.6989915072917938, + "epoch": 0.3381395732058993, + "grad_norm": 0.7246211767196655, + "learning_rate": 1.9103651588063822e-05, + "loss": 1.3039, + "mean_token_accuracy": 0.6708632856607437, + "num_tokens": 517099008.0, + "step": 3078 + }, + { + "entropy": 1.7012667655944824, + "epoch": 0.33824943011727227, + "grad_norm": 0.8172794580459595, + "learning_rate": 1.9102954886866404e-05, + "loss": 1.3946, + "mean_token_accuracy": 0.6592543323834738, + "num_tokens": 517245097.0, + "step": 3079 + }, + { + "entropy": 1.7260870933532715, + "epoch": 0.3383592870286452, + "grad_norm": 0.8274314999580383, + "learning_rate": 1.9102257929214114e-05, + "loss": 1.4324, + "mean_token_accuracy": 0.6518008708953857, + "num_tokens": 517406042.0, + "step": 3080 + }, + { + "entropy": 1.7201440036296844, + "epoch": 0.33846914394001815, + "grad_norm": 0.7528727054595947, + "learning_rate": 1.9101560715129013e-05, + "loss": 1.4812, + "mean_token_accuracy": 0.6550854941209158, + "num_tokens": 517555565.0, + "step": 3081 + }, + { + "entropy": 1.6386935810248058, + "epoch": 0.3385790008513911, + "grad_norm": 0.6425924897193909, + "learning_rate": 1.9100863244633165e-05, + "loss": 1.3557, + "mean_token_accuracy": 0.6622224648793539, + "num_tokens": 517724677.0, + "step": 3082 + }, + { + "entropy": 1.7015309532483418, + "epoch": 0.338688857762764, + "grad_norm": 0.6084007620811462, + "learning_rate": 1.9100165517748647e-05, + "loss": 1.4147, + "mean_token_accuracy": 0.6466822971900305, + "num_tokens": 517907631.0, + "step": 3083 + }, + { + "entropy": 1.7300503849983215, + "epoch": 0.3387987146741369, + "grad_norm": 0.6377536058425903, + "learning_rate": 1.909946753449754e-05, + "loss": 1.5793, + "mean_token_accuracy": 0.641609787940979, + "num_tokens": 518073726.0, + "step": 3084 + }, + { + "entropy": 1.7079397439956665, + "epoch": 0.33890857158550985, + "grad_norm": 0.6480128169059753, + "learning_rate": 1.9098769294901933e-05, + "loss": 1.3289, + "mean_token_accuracy": 0.661668727795283, + "num_tokens": 518225623.0, + "step": 3085 + }, + { + "entropy": 1.7619627118110657, + "epoch": 0.3390184284968828, + "grad_norm": 0.8301718831062317, + "learning_rate": 1.909807079898393e-05, + "loss": 1.5109, + "mean_token_accuracy": 0.644140308101972, + "num_tokens": 518415989.0, + "step": 3086 + }, + { + "entropy": 1.7237402101357777, + "epoch": 0.33912828540825574, + "grad_norm": 0.7291299104690552, + "learning_rate": 1.9097372046765632e-05, + "loss": 1.3323, + "mean_token_accuracy": 0.6596636722485224, + "num_tokens": 518567833.0, + "step": 3087 + }, + { + "entropy": 1.7579568723837535, + "epoch": 0.3392381423196287, + "grad_norm": 0.9242023825645447, + "learning_rate": 1.909667303826916e-05, + "loss": 1.3778, + "mean_token_accuracy": 0.6666527688503265, + "num_tokens": 518740218.0, + "step": 3088 + }, + { + "entropy": 1.7019338707129161, + "epoch": 0.3393479992310016, + "grad_norm": 0.7077080607414246, + "learning_rate": 1.9095973773516634e-05, + "loss": 1.407, + "mean_token_accuracy": 0.6584417273600897, + "num_tokens": 518915130.0, + "step": 3089 + }, + { + "entropy": 1.8156996667385101, + "epoch": 0.33945785614237456, + "grad_norm": 0.7829475998878479, + "learning_rate": 1.9095274252530187e-05, + "loss": 1.42, + "mean_token_accuracy": 0.65616142253081, + "num_tokens": 519088777.0, + "step": 3090 + }, + { + "entropy": 1.7283440331617992, + "epoch": 0.3395677130537475, + "grad_norm": 0.7416720986366272, + "learning_rate": 1.9094574475331956e-05, + "loss": 1.2568, + "mean_token_accuracy": 0.6756617377201716, + "num_tokens": 519255742.0, + "step": 3091 + }, + { + "entropy": 1.6994885007540386, + "epoch": 0.33967756996512044, + "grad_norm": 0.6388126611709595, + "learning_rate": 1.9093874441944095e-05, + "loss": 1.3153, + "mean_token_accuracy": 0.6684769292672476, + "num_tokens": 519396522.0, + "step": 3092 + }, + { + "entropy": 1.7258747617403667, + "epoch": 0.3397874268764934, + "grad_norm": 0.7206714153289795, + "learning_rate": 1.909317415238875e-05, + "loss": 1.4058, + "mean_token_accuracy": 0.6619496643543243, + "num_tokens": 519590462.0, + "step": 3093 + }, + { + "entropy": 1.7497619986534119, + "epoch": 0.3398972837878663, + "grad_norm": 0.7609780430793762, + "learning_rate": 1.909247360668809e-05, + "loss": 1.3087, + "mean_token_accuracy": 0.6699084391196569, + "num_tokens": 519720103.0, + "step": 3094 + }, + { + "entropy": 1.6820762356122334, + "epoch": 0.34000714069923926, + "grad_norm": 0.8035679459571838, + "learning_rate": 1.9091772804864292e-05, + "loss": 1.4856, + "mean_token_accuracy": 0.6475498353441557, + "num_tokens": 519923412.0, + "step": 3095 + }, + { + "entropy": 1.6931442022323608, + "epoch": 0.3401169976106122, + "grad_norm": 0.6897434592247009, + "learning_rate": 1.9091071746939526e-05, + "loss": 1.3514, + "mean_token_accuracy": 0.6593603193759918, + "num_tokens": 520072967.0, + "step": 3096 + }, + { + "entropy": 1.7068798343340557, + "epoch": 0.3402268545219851, + "grad_norm": 0.6774353981018066, + "learning_rate": 1.909037043293599e-05, + "loss": 1.4233, + "mean_token_accuracy": 0.6536633421977361, + "num_tokens": 520255988.0, + "step": 3097 + }, + { + "entropy": 1.7121065855026245, + "epoch": 0.34033671143335803, + "grad_norm": 0.7132356762886047, + "learning_rate": 1.908966886287587e-05, + "loss": 1.3878, + "mean_token_accuracy": 0.6654232740402222, + "num_tokens": 520488918.0, + "step": 3098 + }, + { + "entropy": 1.6239934662977855, + "epoch": 0.34044656834473097, + "grad_norm": 0.8887202739715576, + "learning_rate": 1.908896703678138e-05, + "loss": 1.4214, + "mean_token_accuracy": 0.6726511965195338, + "num_tokens": 520660679.0, + "step": 3099 + }, + { + "entropy": 1.7160980502764385, + "epoch": 0.3405564252561039, + "grad_norm": 0.6955882906913757, + "learning_rate": 1.9088264954674724e-05, + "loss": 1.3324, + "mean_token_accuracy": 0.6596625298261642, + "num_tokens": 520838220.0, + "step": 3100 + }, + { + "entropy": 1.6648767292499542, + "epoch": 0.34066628216747685, + "grad_norm": 0.6133254170417786, + "learning_rate": 1.908756261657813e-05, + "loss": 1.4626, + "mean_token_accuracy": 0.6427382330099741, + "num_tokens": 521036582.0, + "step": 3101 + }, + { + "entropy": 1.7698853611946106, + "epoch": 0.3407761390788498, + "grad_norm": 0.7591292262077332, + "learning_rate": 1.9086860022513823e-05, + "loss": 1.3989, + "mean_token_accuracy": 0.646201545993487, + "num_tokens": 521195519.0, + "step": 3102 + }, + { + "entropy": 1.6912595828374226, + "epoch": 0.34088599599022273, + "grad_norm": 0.6244848966598511, + "learning_rate": 1.9086157172504036e-05, + "loss": 1.4302, + "mean_token_accuracy": 0.6519865940014521, + "num_tokens": 521348443.0, + "step": 3103 + }, + { + "entropy": 1.7508087356885274, + "epoch": 0.3409958529015957, + "grad_norm": 0.6853779554367065, + "learning_rate": 1.9085454066571023e-05, + "loss": 1.4347, + "mean_token_accuracy": 0.6526261965433756, + "num_tokens": 521505284.0, + "step": 3104 + }, + { + "entropy": 1.6787457764148712, + "epoch": 0.3411057098129686, + "grad_norm": 0.7456424236297607, + "learning_rate": 1.908475070473703e-05, + "loss": 1.3914, + "mean_token_accuracy": 0.6657826354106268, + "num_tokens": 521671735.0, + "step": 3105 + }, + { + "entropy": 1.7271059552828472, + "epoch": 0.34121556672434156, + "grad_norm": 0.7938264608383179, + "learning_rate": 1.9084047087024325e-05, + "loss": 1.4352, + "mean_token_accuracy": 0.657663548986117, + "num_tokens": 521839358.0, + "step": 3106 + }, + { + "entropy": 1.6707509557406108, + "epoch": 0.3413254236357145, + "grad_norm": 0.6704132556915283, + "learning_rate": 1.9083343213455167e-05, + "loss": 1.4081, + "mean_token_accuracy": 0.655000850558281, + "num_tokens": 522070055.0, + "step": 3107 + }, + { + "entropy": 1.6707326571146648, + "epoch": 0.34143528054708744, + "grad_norm": 0.7726478576660156, + "learning_rate": 1.908263908405184e-05, + "loss": 1.1888, + "mean_token_accuracy": 0.6913275470336279, + "num_tokens": 522186687.0, + "step": 3108 + }, + { + "entropy": 1.7188027401765187, + "epoch": 0.3415451374584604, + "grad_norm": 0.673206627368927, + "learning_rate": 1.908193469883663e-05, + "loss": 1.3749, + "mean_token_accuracy": 0.6531964292128881, + "num_tokens": 522337959.0, + "step": 3109 + }, + { + "entropy": 1.7751700381437938, + "epoch": 0.34165499436983326, + "grad_norm": 0.6284109950065613, + "learning_rate": 1.9081230057831827e-05, + "loss": 1.3603, + "mean_token_accuracy": 0.6515864779551824, + "num_tokens": 522509221.0, + "step": 3110 + }, + { + "entropy": 1.66496338446935, + "epoch": 0.3417648512812062, + "grad_norm": 0.7413091063499451, + "learning_rate": 1.9080525161059737e-05, + "loss": 1.3669, + "mean_token_accuracy": 0.6810042262077332, + "num_tokens": 522637505.0, + "step": 3111 + }, + { + "entropy": 1.7317347327868144, + "epoch": 0.34187470819257915, + "grad_norm": 0.7235397100448608, + "learning_rate": 1.907982000854266e-05, + "loss": 1.2779, + "mean_token_accuracy": 0.6744700570901235, + "num_tokens": 522731744.0, + "step": 3112 + }, + { + "entropy": 1.6716107825438182, + "epoch": 0.3419845651039521, + "grad_norm": 0.7283676266670227, + "learning_rate": 1.9079114600302926e-05, + "loss": 1.3479, + "mean_token_accuracy": 0.6658121645450592, + "num_tokens": 522907464.0, + "step": 3113 + }, + { + "entropy": 1.6335497895876567, + "epoch": 0.34209442201532503, + "grad_norm": 0.7072910070419312, + "learning_rate": 1.9078408936362857e-05, + "loss": 1.2444, + "mean_token_accuracy": 0.6787208517392477, + "num_tokens": 523057454.0, + "step": 3114 + }, + { + "entropy": 1.7214987377325695, + "epoch": 0.34220427892669797, + "grad_norm": 0.7019241452217102, + "learning_rate": 1.907770301674478e-05, + "loss": 1.3578, + "mean_token_accuracy": 0.6487798243761063, + "num_tokens": 523240620.0, + "step": 3115 + }, + { + "entropy": 1.646393616994222, + "epoch": 0.3423141358380709, + "grad_norm": 0.7203688621520996, + "learning_rate": 1.9076996841471047e-05, + "loss": 1.3311, + "mean_token_accuracy": 0.6660736699899038, + "num_tokens": 523391299.0, + "step": 3116 + }, + { + "entropy": 1.7005057732264202, + "epoch": 0.34242399274944385, + "grad_norm": 0.636923611164093, + "learning_rate": 1.9076290410564e-05, + "loss": 1.4847, + "mean_token_accuracy": 0.6475814878940582, + "num_tokens": 523564225.0, + "step": 3117 + }, + { + "entropy": 1.7076924443244934, + "epoch": 0.3425338496608168, + "grad_norm": 0.7187374234199524, + "learning_rate": 1.9075583724046004e-05, + "loss": 1.4605, + "mean_token_accuracy": 0.6543427258729935, + "num_tokens": 523745353.0, + "step": 3118 + }, + { + "entropy": 1.7326221863428752, + "epoch": 0.34264370657218973, + "grad_norm": 0.6638636589050293, + "learning_rate": 1.907487678193942e-05, + "loss": 1.432, + "mean_token_accuracy": 0.6483491808176041, + "num_tokens": 523914991.0, + "step": 3119 + }, + { + "entropy": 1.692298283179601, + "epoch": 0.3427535634835627, + "grad_norm": 0.7324991822242737, + "learning_rate": 1.9074169584266627e-05, + "loss": 1.4448, + "mean_token_accuracy": 0.6582270761330923, + "num_tokens": 524095746.0, + "step": 3120 + }, + { + "entropy": 1.733394632736842, + "epoch": 0.3428634203949356, + "grad_norm": 0.7477669715881348, + "learning_rate": 1.9073462131050002e-05, + "loss": 1.5347, + "mean_token_accuracy": 0.6465493490298589, + "num_tokens": 524332980.0, + "step": 3121 + }, + { + "entropy": 1.7090126971403758, + "epoch": 0.34297327730630855, + "grad_norm": 0.6135408878326416, + "learning_rate": 1.9072754422311937e-05, + "loss": 1.3248, + "mean_token_accuracy": 0.6581533948580424, + "num_tokens": 524465802.0, + "step": 3122 + }, + { + "entropy": 1.715635746717453, + "epoch": 0.3430831342176815, + "grad_norm": 0.5586002469062805, + "learning_rate": 1.9072046458074834e-05, + "loss": 1.5206, + "mean_token_accuracy": 0.6238716145356497, + "num_tokens": 524743379.0, + "step": 3123 + }, + { + "entropy": 1.7447759707768757, + "epoch": 0.3431929911290544, + "grad_norm": 0.8412876129150391, + "learning_rate": 1.90713382383611e-05, + "loss": 1.4483, + "mean_token_accuracy": 0.6630438417196274, + "num_tokens": 524892897.0, + "step": 3124 + }, + { + "entropy": 1.7658887306849163, + "epoch": 0.3433028480404273, + "grad_norm": 0.7582389116287231, + "learning_rate": 1.9070629763193148e-05, + "loss": 1.5115, + "mean_token_accuracy": 0.6470319529374441, + "num_tokens": 525073268.0, + "step": 3125 + }, + { + "entropy": 1.7027207911014557, + "epoch": 0.34341270495180026, + "grad_norm": 0.7003147006034851, + "learning_rate": 1.90699210325934e-05, + "loss": 1.3185, + "mean_token_accuracy": 0.6545501202344894, + "num_tokens": 525223000.0, + "step": 3126 + }, + { + "entropy": 1.6880733569463093, + "epoch": 0.3435225618631732, + "grad_norm": 0.8535897731781006, + "learning_rate": 1.9069212046584288e-05, + "loss": 1.2261, + "mean_token_accuracy": 0.6908506006002426, + "num_tokens": 525346110.0, + "step": 3127 + }, + { + "entropy": 1.6838291088740032, + "epoch": 0.34363241877454614, + "grad_norm": 0.7103913426399231, + "learning_rate": 1.9068502805188247e-05, + "loss": 1.4846, + "mean_token_accuracy": 0.6660540848970413, + "num_tokens": 525510089.0, + "step": 3128 + }, + { + "entropy": 1.6504852771759033, + "epoch": 0.3437422756859191, + "grad_norm": 0.6084645390510559, + "learning_rate": 1.9067793308427734e-05, + "loss": 1.3674, + "mean_token_accuracy": 0.6550189206997553, + "num_tokens": 525680601.0, + "step": 3129 + }, + { + "entropy": 1.66959352294604, + "epoch": 0.343852132597292, + "grad_norm": 0.9643108248710632, + "learning_rate": 1.906708355632519e-05, + "loss": 1.186, + "mean_token_accuracy": 0.684942439198494, + "num_tokens": 525819677.0, + "step": 3130 + }, + { + "entropy": 1.7390025953451793, + "epoch": 0.34396198950866497, + "grad_norm": 0.8256139755249023, + "learning_rate": 1.9066373548903097e-05, + "loss": 1.4807, + "mean_token_accuracy": 0.6392683138449987, + "num_tokens": 526016590.0, + "step": 3131 + }, + { + "entropy": 1.6989341179529827, + "epoch": 0.3440718464200379, + "grad_norm": 0.6582311987876892, + "learning_rate": 1.906566328618391e-05, + "loss": 1.3251, + "mean_token_accuracy": 0.6610842347145081, + "num_tokens": 526146630.0, + "step": 3132 + }, + { + "entropy": 1.6985157827536266, + "epoch": 0.34418170333141085, + "grad_norm": 0.7080286741256714, + "learning_rate": 1.9064952768190114e-05, + "loss": 1.2785, + "mean_token_accuracy": 0.6769105891386668, + "num_tokens": 526281535.0, + "step": 3133 + }, + { + "entropy": 1.7005742291609447, + "epoch": 0.3442915602427838, + "grad_norm": 0.741958737373352, + "learning_rate": 1.9064241994944197e-05, + "loss": 1.2767, + "mean_token_accuracy": 0.6701702376206716, + "num_tokens": 526426500.0, + "step": 3134 + }, + { + "entropy": 1.6816225151220958, + "epoch": 0.34440141715415673, + "grad_norm": 0.6872779130935669, + "learning_rate": 1.9063530966468655e-05, + "loss": 1.4314, + "mean_token_accuracy": 0.6407319158315659, + "num_tokens": 526647271.0, + "step": 3135 + }, + { + "entropy": 1.6521427631378174, + "epoch": 0.34451127406552967, + "grad_norm": 0.7281518578529358, + "learning_rate": 1.9062819682785993e-05, + "loss": 1.3395, + "mean_token_accuracy": 0.6678001085917155, + "num_tokens": 526775519.0, + "step": 3136 + }, + { + "entropy": 1.6584477225939434, + "epoch": 0.3446211309769026, + "grad_norm": 0.7105032205581665, + "learning_rate": 1.906210814391872e-05, + "loss": 1.5403, + "mean_token_accuracy": 0.6434331585963567, + "num_tokens": 526960781.0, + "step": 3137 + }, + { + "entropy": 1.7096496224403381, + "epoch": 0.3447309878882755, + "grad_norm": 1.1132054328918457, + "learning_rate": 1.9061396349889357e-05, + "loss": 1.3313, + "mean_token_accuracy": 0.6592603524525961, + "num_tokens": 527077059.0, + "step": 3138 + }, + { + "entropy": 1.7816942930221558, + "epoch": 0.34484084479964844, + "grad_norm": 0.623199462890625, + "learning_rate": 1.9060684300720435e-05, + "loss": 1.4786, + "mean_token_accuracy": 0.6451230843861898, + "num_tokens": 527242392.0, + "step": 3139 + }, + { + "entropy": 1.7357947031656902, + "epoch": 0.3449507017110214, + "grad_norm": 0.6942022442817688, + "learning_rate": 1.9059971996434483e-05, + "loss": 1.6755, + "mean_token_accuracy": 0.6327243894338608, + "num_tokens": 527421563.0, + "step": 3140 + }, + { + "entropy": 1.719354470570882, + "epoch": 0.3450605586223943, + "grad_norm": 0.6363817453384399, + "learning_rate": 1.9059259437054052e-05, + "loss": 1.2542, + "mean_token_accuracy": 0.6765570292870203, + "num_tokens": 527542290.0, + "step": 3141 + }, + { + "entropy": 1.6994706292947133, + "epoch": 0.34517041553376726, + "grad_norm": 0.6537553071975708, + "learning_rate": 1.9058546622601688e-05, + "loss": 1.522, + "mean_token_accuracy": 0.6443347980578741, + "num_tokens": 527721230.0, + "step": 3142 + }, + { + "entropy": 1.7906754612922668, + "epoch": 0.3452802724451402, + "grad_norm": 0.7728573679924011, + "learning_rate": 1.9057833553099957e-05, + "loss": 1.444, + "mean_token_accuracy": 0.6405621866385142, + "num_tokens": 527925672.0, + "step": 3143 + }, + { + "entropy": 1.635061929623286, + "epoch": 0.34539012935651314, + "grad_norm": 0.7225202918052673, + "learning_rate": 1.9057120228571426e-05, + "loss": 1.2604, + "mean_token_accuracy": 0.6803568998972574, + "num_tokens": 528045373.0, + "step": 3144 + }, + { + "entropy": 1.7567674815654755, + "epoch": 0.3454999862678861, + "grad_norm": 0.7282200455665588, + "learning_rate": 1.905640664903867e-05, + "loss": 1.6159, + "mean_token_accuracy": 0.6386793802181879, + "num_tokens": 528235110.0, + "step": 3145 + }, + { + "entropy": 1.6713021596272786, + "epoch": 0.345609843179259, + "grad_norm": 0.6087730526924133, + "learning_rate": 1.9055692814524273e-05, + "loss": 1.2903, + "mean_token_accuracy": 0.6718141039212545, + "num_tokens": 528415004.0, + "step": 3146 + }, + { + "entropy": 1.6759739617506664, + "epoch": 0.34571970009063197, + "grad_norm": 0.8467540144920349, + "learning_rate": 1.9054978725050827e-05, + "loss": 1.3079, + "mean_token_accuracy": 0.6630802005529404, + "num_tokens": 528553683.0, + "step": 3147 + }, + { + "entropy": 1.709736426671346, + "epoch": 0.3458295570020049, + "grad_norm": 0.6705769896507263, + "learning_rate": 1.9054264380640936e-05, + "loss": 1.3064, + "mean_token_accuracy": 0.6646785189708074, + "num_tokens": 528671808.0, + "step": 3148 + }, + { + "entropy": 1.747974932193756, + "epoch": 0.34593941391337785, + "grad_norm": 0.7119439244270325, + "learning_rate": 1.9053549781317208e-05, + "loss": 1.2835, + "mean_token_accuracy": 0.6721477657556534, + "num_tokens": 528778625.0, + "step": 3149 + }, + { + "entropy": 1.741389234860738, + "epoch": 0.3460492708247508, + "grad_norm": 0.6632856130599976, + "learning_rate": 1.9052834927102255e-05, + "loss": 1.3884, + "mean_token_accuracy": 0.6711229979991913, + "num_tokens": 528918469.0, + "step": 3150 + }, + { + "entropy": 1.7209580143292744, + "epoch": 0.3461591277361237, + "grad_norm": 0.6064859628677368, + "learning_rate": 1.905211981801871e-05, + "loss": 1.2897, + "mean_token_accuracy": 0.6677672813336054, + "num_tokens": 529046367.0, + "step": 3151 + }, + { + "entropy": 1.7029893298943837, + "epoch": 0.3462689846474966, + "grad_norm": 0.6776720285415649, + "learning_rate": 1.9051404454089196e-05, + "loss": 1.473, + "mean_token_accuracy": 0.6507180581490198, + "num_tokens": 529254785.0, + "step": 3152 + }, + { + "entropy": 1.699689010779063, + "epoch": 0.34637884155886955, + "grad_norm": 0.7268986701965332, + "learning_rate": 1.9050688835336358e-05, + "loss": 1.3269, + "mean_token_accuracy": 0.674399678905805, + "num_tokens": 529399490.0, + "step": 3153 + }, + { + "entropy": 1.7162836492061615, + "epoch": 0.3464886984702425, + "grad_norm": 0.7248696088790894, + "learning_rate": 1.904997296178285e-05, + "loss": 1.3693, + "mean_token_accuracy": 0.6586166570583979, + "num_tokens": 529527378.0, + "step": 3154 + }, + { + "entropy": 1.6862310767173767, + "epoch": 0.34659855538161544, + "grad_norm": 0.7161970138549805, + "learning_rate": 1.9049256833451327e-05, + "loss": 1.481, + "mean_token_accuracy": 0.6531454250216484, + "num_tokens": 529734726.0, + "step": 3155 + }, + { + "entropy": 1.6877350012461345, + "epoch": 0.3467084122929884, + "grad_norm": 0.6686804294586182, + "learning_rate": 1.904854045036445e-05, + "loss": 1.3981, + "mean_token_accuracy": 0.65309705833594, + "num_tokens": 529949884.0, + "step": 3156 + }, + { + "entropy": 1.7140614589055378, + "epoch": 0.3468182692043613, + "grad_norm": 0.6998611092567444, + "learning_rate": 1.9047823812544893e-05, + "loss": 1.2816, + "mean_token_accuracy": 0.6777733812729517, + "num_tokens": 530095707.0, + "step": 3157 + }, + { + "entropy": 1.7329360047976177, + "epoch": 0.34692812611573426, + "grad_norm": 0.7617079615592957, + "learning_rate": 1.904710692001534e-05, + "loss": 1.27, + "mean_token_accuracy": 0.6669184813896815, + "num_tokens": 530233076.0, + "step": 3158 + }, + { + "entropy": 1.7423981527487438, + "epoch": 0.3470379830271072, + "grad_norm": 0.7219134569168091, + "learning_rate": 1.904638977279848e-05, + "loss": 1.4189, + "mean_token_accuracy": 0.6537288725376129, + "num_tokens": 530361395.0, + "step": 3159 + }, + { + "entropy": 1.7932091653347015, + "epoch": 0.34714783993848014, + "grad_norm": 0.7211331129074097, + "learning_rate": 1.9045672370917008e-05, + "loss": 1.3775, + "mean_token_accuracy": 0.6503029266993204, + "num_tokens": 530476029.0, + "step": 3160 + }, + { + "entropy": 1.6707605421543121, + "epoch": 0.3472576968498531, + "grad_norm": 0.6406380534172058, + "learning_rate": 1.904495471439363e-05, + "loss": 1.2947, + "mean_token_accuracy": 0.6745659758647283, + "num_tokens": 530620082.0, + "step": 3161 + }, + { + "entropy": 1.635949860016505, + "epoch": 0.347367553761226, + "grad_norm": 0.5812481641769409, + "learning_rate": 1.9044236803251063e-05, + "loss": 1.318, + "mean_token_accuracy": 0.6674651255210241, + "num_tokens": 530800009.0, + "step": 3162 + }, + { + "entropy": 1.686038355032603, + "epoch": 0.34747741067259896, + "grad_norm": 0.65413898229599, + "learning_rate": 1.9043518637512027e-05, + "loss": 1.39, + "mean_token_accuracy": 0.6651994735002518, + "num_tokens": 530939319.0, + "step": 3163 + }, + { + "entropy": 1.7452342510223389, + "epoch": 0.3475872675839719, + "grad_norm": 0.711044192314148, + "learning_rate": 1.9042800217199248e-05, + "loss": 1.3807, + "mean_token_accuracy": 0.6522654493649801, + "num_tokens": 531069878.0, + "step": 3164 + }, + { + "entropy": 1.672851413488388, + "epoch": 0.3476971244953448, + "grad_norm": 0.5760392546653748, + "learning_rate": 1.9042081542335467e-05, + "loss": 1.4053, + "mean_token_accuracy": 0.6590311825275421, + "num_tokens": 531295489.0, + "step": 3165 + }, + { + "entropy": 1.7025360067685444, + "epoch": 0.34780698140671773, + "grad_norm": 0.6651199460029602, + "learning_rate": 1.9041362612943432e-05, + "loss": 1.4792, + "mean_token_accuracy": 0.6444245874881744, + "num_tokens": 531499724.0, + "step": 3166 + }, + { + "entropy": 1.7628304362297058, + "epoch": 0.34791683831809067, + "grad_norm": 0.6585642099380493, + "learning_rate": 1.9040643429045887e-05, + "loss": 1.4042, + "mean_token_accuracy": 0.6575342814127604, + "num_tokens": 531731480.0, + "step": 3167 + }, + { + "entropy": 1.740038514137268, + "epoch": 0.3480266952294636, + "grad_norm": 0.7570586800575256, + "learning_rate": 1.9039923990665605e-05, + "loss": 1.439, + "mean_token_accuracy": 0.6459324061870575, + "num_tokens": 531912616.0, + "step": 3168 + }, + { + "entropy": 1.7111516793568928, + "epoch": 0.34813655214083655, + "grad_norm": 0.6636490225791931, + "learning_rate": 1.903920429782535e-05, + "loss": 1.3603, + "mean_token_accuracy": 0.6567764480908712, + "num_tokens": 532036362.0, + "step": 3169 + }, + { + "entropy": 1.7178413569927216, + "epoch": 0.3482464090522095, + "grad_norm": 0.8525426387786865, + "learning_rate": 1.9038484350547903e-05, + "loss": 1.3025, + "mean_token_accuracy": 0.6610483030478159, + "num_tokens": 532188392.0, + "step": 3170 + }, + { + "entropy": 1.7293661733468373, + "epoch": 0.34835626596358243, + "grad_norm": 0.7002199292182922, + "learning_rate": 1.903776414885605e-05, + "loss": 1.4114, + "mean_token_accuracy": 0.6590321709712347, + "num_tokens": 532327314.0, + "step": 3171 + }, + { + "entropy": 1.691053032875061, + "epoch": 0.3484661228749554, + "grad_norm": 0.7545453310012817, + "learning_rate": 1.903704369277258e-05, + "loss": 1.336, + "mean_token_accuracy": 0.6653313388427099, + "num_tokens": 532475041.0, + "step": 3172 + }, + { + "entropy": 1.7258902490139008, + "epoch": 0.3485759797863283, + "grad_norm": 0.7080891132354736, + "learning_rate": 1.90363229823203e-05, + "loss": 1.3609, + "mean_token_accuracy": 0.6499852339426676, + "num_tokens": 532608570.0, + "step": 3173 + }, + { + "entropy": 1.7066673735777538, + "epoch": 0.34868583669770126, + "grad_norm": 0.7277325391769409, + "learning_rate": 1.9035602017522018e-05, + "loss": 1.3951, + "mean_token_accuracy": 0.6554910639921824, + "num_tokens": 532771155.0, + "step": 3174 + }, + { + "entropy": 1.6777300437291462, + "epoch": 0.3487956936090742, + "grad_norm": 0.6466101408004761, + "learning_rate": 1.9034880798400556e-05, + "loss": 1.5736, + "mean_token_accuracy": 0.6363010754187902, + "num_tokens": 532958303.0, + "step": 3175 + }, + { + "entropy": 1.7415178914864857, + "epoch": 0.34890555052044714, + "grad_norm": 0.7584067583084106, + "learning_rate": 1.9034159324978735e-05, + "loss": 1.2576, + "mean_token_accuracy": 0.6761174450318018, + "num_tokens": 533125729.0, + "step": 3176 + }, + { + "entropy": 1.7105094691117604, + "epoch": 0.3490154074318201, + "grad_norm": 0.7372577786445618, + "learning_rate": 1.9033437597279392e-05, + "loss": 1.3823, + "mean_token_accuracy": 0.6669119844834009, + "num_tokens": 533287732.0, + "step": 3177 + }, + { + "entropy": 1.6837959190209706, + "epoch": 0.34912526434319296, + "grad_norm": 0.8528002500534058, + "learning_rate": 1.903271561532537e-05, + "loss": 1.3608, + "mean_token_accuracy": 0.6598199556271235, + "num_tokens": 533488739.0, + "step": 3178 + }, + { + "entropy": 1.7475207646687825, + "epoch": 0.3492351212545659, + "grad_norm": 0.7588545083999634, + "learning_rate": 1.9031993379139517e-05, + "loss": 1.4702, + "mean_token_accuracy": 0.6513977944850922, + "num_tokens": 533632736.0, + "step": 3179 + }, + { + "entropy": 1.676239550113678, + "epoch": 0.34934497816593885, + "grad_norm": 0.6096740365028381, + "learning_rate": 1.903127088874469e-05, + "loss": 1.221, + "mean_token_accuracy": 0.6817640314499537, + "num_tokens": 533766233.0, + "step": 3180 + }, + { + "entropy": 1.734337071577708, + "epoch": 0.3494548350773118, + "grad_norm": 0.6904963850975037, + "learning_rate": 1.9030548144163766e-05, + "loss": 1.4203, + "mean_token_accuracy": 0.6599701891342798, + "num_tokens": 533970894.0, + "step": 3181 + }, + { + "entropy": 1.7419179677963257, + "epoch": 0.34956469198868473, + "grad_norm": 0.8853073120117188, + "learning_rate": 1.9029825145419606e-05, + "loss": 1.3835, + "mean_token_accuracy": 0.6622406442960104, + "num_tokens": 534094496.0, + "step": 3182 + }, + { + "entropy": 1.6786328554153442, + "epoch": 0.34967454890005767, + "grad_norm": 0.6363751292228699, + "learning_rate": 1.90291018925351e-05, + "loss": 1.4218, + "mean_token_accuracy": 0.661681205034256, + "num_tokens": 534283850.0, + "step": 3183 + }, + { + "entropy": 1.6927332083384197, + "epoch": 0.3497844058114306, + "grad_norm": 0.6172245144844055, + "learning_rate": 1.902837838553314e-05, + "loss": 1.3543, + "mean_token_accuracy": 0.6617726981639862, + "num_tokens": 534467200.0, + "step": 3184 + }, + { + "entropy": 1.7397380471229553, + "epoch": 0.34989426272280355, + "grad_norm": 0.7281948328018188, + "learning_rate": 1.9027654624436617e-05, + "loss": 1.3724, + "mean_token_accuracy": 0.6596235682566961, + "num_tokens": 534630747.0, + "step": 3185 + }, + { + "entropy": 1.7327102224032085, + "epoch": 0.3500041196341765, + "grad_norm": 0.6819478869438171, + "learning_rate": 1.9026930609268445e-05, + "loss": 1.3829, + "mean_token_accuracy": 0.6533329288164774, + "num_tokens": 534801851.0, + "step": 3186 + }, + { + "entropy": 1.630759596824646, + "epoch": 0.35011397654554943, + "grad_norm": 0.6898466944694519, + "learning_rate": 1.9026206340051535e-05, + "loss": 1.2503, + "mean_token_accuracy": 0.6898584812879562, + "num_tokens": 534974937.0, + "step": 3187 + }, + { + "entropy": 1.7490450243155162, + "epoch": 0.3502238334569224, + "grad_norm": 0.6831504106521606, + "learning_rate": 1.902548181680881e-05, + "loss": 1.3835, + "mean_token_accuracy": 0.6639950623114904, + "num_tokens": 535144941.0, + "step": 3188 + }, + { + "entropy": 1.7011112074057262, + "epoch": 0.3503336903682953, + "grad_norm": 0.6328549385070801, + "learning_rate": 1.902475703956321e-05, + "loss": 1.4859, + "mean_token_accuracy": 0.6441772828499476, + "num_tokens": 535306022.0, + "step": 3189 + }, + { + "entropy": 1.7117084761460621, + "epoch": 0.35044354727966825, + "grad_norm": 0.6839233040809631, + "learning_rate": 1.9024032008337654e-05, + "loss": 1.3128, + "mean_token_accuracy": 0.6650984783967336, + "num_tokens": 535457085.0, + "step": 3190 + }, + { + "entropy": 1.639816661675771, + "epoch": 0.3505534041910412, + "grad_norm": 0.6154528856277466, + "learning_rate": 1.9023306723155108e-05, + "loss": 1.4108, + "mean_token_accuracy": 0.645085021853447, + "num_tokens": 535644965.0, + "step": 3191 + }, + { + "entropy": 1.7268753548463185, + "epoch": 0.3506632611024141, + "grad_norm": 0.7392616271972656, + "learning_rate": 1.902258118403852e-05, + "loss": 1.5011, + "mean_token_accuracy": 0.6435932318369547, + "num_tokens": 535819854.0, + "step": 3192 + }, + { + "entropy": 1.7294295032819111, + "epoch": 0.350773118013787, + "grad_norm": 0.750583827495575, + "learning_rate": 1.9021855391010848e-05, + "loss": 1.4695, + "mean_token_accuracy": 0.6535586913426717, + "num_tokens": 535986320.0, + "step": 3193 + }, + { + "entropy": 1.6876760522524517, + "epoch": 0.35088297492515996, + "grad_norm": 0.733034610748291, + "learning_rate": 1.902112934409507e-05, + "loss": 1.5651, + "mean_token_accuracy": 0.6665263374646505, + "num_tokens": 536133774.0, + "step": 3194 + }, + { + "entropy": 1.724027395248413, + "epoch": 0.3509928318365329, + "grad_norm": 0.705089271068573, + "learning_rate": 1.9020403043314165e-05, + "loss": 1.3844, + "mean_token_accuracy": 0.6622153123219808, + "num_tokens": 536301152.0, + "step": 3195 + }, + { + "entropy": 1.6385993957519531, + "epoch": 0.35110268874790584, + "grad_norm": 0.6220270991325378, + "learning_rate": 1.9019676488691113e-05, + "loss": 1.3943, + "mean_token_accuracy": 0.6637493073940277, + "num_tokens": 536466359.0, + "step": 3196 + }, + { + "entropy": 1.647113859653473, + "epoch": 0.3512125456592788, + "grad_norm": 0.6425076723098755, + "learning_rate": 1.9018949680248913e-05, + "loss": 1.2825, + "mean_token_accuracy": 0.6719297617673874, + "num_tokens": 536602915.0, + "step": 3197 + }, + { + "entropy": 1.6742305755615234, + "epoch": 0.3513224025706517, + "grad_norm": 0.683866024017334, + "learning_rate": 1.9018222618010577e-05, + "loss": 1.3446, + "mean_token_accuracy": 0.6559995263814926, + "num_tokens": 536785708.0, + "step": 3198 + }, + { + "entropy": 1.6582418382167816, + "epoch": 0.35143225948202467, + "grad_norm": 0.5620256066322327, + "learning_rate": 1.90174953019991e-05, + "loss": 1.3776, + "mean_token_accuracy": 0.6558230916659037, + "num_tokens": 537010057.0, + "step": 3199 + }, + { + "entropy": 1.7322443127632141, + "epoch": 0.3515421163933976, + "grad_norm": 0.7158662676811218, + "learning_rate": 1.9016767732237517e-05, + "loss": 1.4791, + "mean_token_accuracy": 0.6457269241412481, + "num_tokens": 537170570.0, + "step": 3200 + }, + { + "entropy": 1.7214918732643127, + "epoch": 0.35165197330477055, + "grad_norm": 0.7073965072631836, + "learning_rate": 1.901603990874884e-05, + "loss": 1.3967, + "mean_token_accuracy": 0.6600227405627569, + "num_tokens": 537366594.0, + "step": 3201 + }, + { + "entropy": 1.6927407383918762, + "epoch": 0.3517618302161435, + "grad_norm": 0.6808587312698364, + "learning_rate": 1.9015311831556115e-05, + "loss": 1.326, + "mean_token_accuracy": 0.6713967969020208, + "num_tokens": 537506637.0, + "step": 3202 + }, + { + "entropy": 1.719231108824412, + "epoch": 0.35187168712751643, + "grad_norm": 0.8534165024757385, + "learning_rate": 1.9014583500682384e-05, + "loss": 1.3823, + "mean_token_accuracy": 0.6730857292811075, + "num_tokens": 537656682.0, + "step": 3203 + }, + { + "entropy": 1.684444894393285, + "epoch": 0.35198154403888937, + "grad_norm": 0.74547278881073, + "learning_rate": 1.90138549161507e-05, + "loss": 1.2741, + "mean_token_accuracy": 0.670419305562973, + "num_tokens": 537803818.0, + "step": 3204 + }, + { + "entropy": 1.7365160286426544, + "epoch": 0.35209140095026226, + "grad_norm": 0.8732142448425293, + "learning_rate": 1.901312607798411e-05, + "loss": 1.5498, + "mean_token_accuracy": 0.6439683735370636, + "num_tokens": 537990938.0, + "step": 3205 + }, + { + "entropy": 1.7507870495319366, + "epoch": 0.3522012578616352, + "grad_norm": 0.7832520008087158, + "learning_rate": 1.9012396986205695e-05, + "loss": 1.5008, + "mean_token_accuracy": 0.6420899679263433, + "num_tokens": 538123491.0, + "step": 3206 + }, + { + "entropy": 1.6451840698719025, + "epoch": 0.35231111477300814, + "grad_norm": 0.6481114029884338, + "learning_rate": 1.9011667640838527e-05, + "loss": 1.4059, + "mean_token_accuracy": 0.6524686167637507, + "num_tokens": 538325290.0, + "step": 3207 + }, + { + "entropy": 1.645903656880061, + "epoch": 0.3524209716843811, + "grad_norm": 0.6290945410728455, + "learning_rate": 1.901093804190569e-05, + "loss": 1.3211, + "mean_token_accuracy": 0.6677955438693365, + "num_tokens": 538458740.0, + "step": 3208 + }, + { + "entropy": 1.7137588659922283, + "epoch": 0.352530828595754, + "grad_norm": 0.6638056635856628, + "learning_rate": 1.901020818943027e-05, + "loss": 1.2841, + "mean_token_accuracy": 0.6764950404564539, + "num_tokens": 538596800.0, + "step": 3209 + }, + { + "entropy": 1.7618902027606964, + "epoch": 0.35264068550712696, + "grad_norm": 0.7724607586860657, + "learning_rate": 1.9009478083435372e-05, + "loss": 1.473, + "mean_token_accuracy": 0.6430220901966095, + "num_tokens": 538746035.0, + "step": 3210 + }, + { + "entropy": 1.6959488193194072, + "epoch": 0.3527505424184999, + "grad_norm": 0.7325376272201538, + "learning_rate": 1.90087477239441e-05, + "loss": 1.4151, + "mean_token_accuracy": 0.6453822106122971, + "num_tokens": 538922341.0, + "step": 3211 + }, + { + "entropy": 1.743853767712911, + "epoch": 0.35286039932987284, + "grad_norm": 0.7420296669006348, + "learning_rate": 1.9008017110979573e-05, + "loss": 1.4575, + "mean_token_accuracy": 0.6492985039949417, + "num_tokens": 539055514.0, + "step": 3212 + }, + { + "entropy": 1.6643742322921753, + "epoch": 0.3529702562412458, + "grad_norm": 0.616858959197998, + "learning_rate": 1.9007286244564912e-05, + "loss": 1.3077, + "mean_token_accuracy": 0.6636150479316711, + "num_tokens": 539195690.0, + "step": 3213 + }, + { + "entropy": 1.7134017944335938, + "epoch": 0.3530801131526187, + "grad_norm": 0.6087394952774048, + "learning_rate": 1.900655512472325e-05, + "loss": 1.4671, + "mean_token_accuracy": 0.642287035783132, + "num_tokens": 539452254.0, + "step": 3214 + }, + { + "entropy": 1.6220239003499348, + "epoch": 0.35318997006399167, + "grad_norm": 0.6543572545051575, + "learning_rate": 1.9005823751477727e-05, + "loss": 1.4195, + "mean_token_accuracy": 0.6753101100524267, + "num_tokens": 539651551.0, + "step": 3215 + }, + { + "entropy": 1.7377901673316956, + "epoch": 0.3532998269753646, + "grad_norm": 0.6597190499305725, + "learning_rate": 1.9005092124851488e-05, + "loss": 1.4769, + "mean_token_accuracy": 0.6341644277175268, + "num_tokens": 539839472.0, + "step": 3216 + }, + { + "entropy": 1.636192907889684, + "epoch": 0.35340968388673755, + "grad_norm": 0.7926854491233826, + "learning_rate": 1.9004360244867692e-05, + "loss": 1.4022, + "mean_token_accuracy": 0.6752565801143646, + "num_tokens": 539974405.0, + "step": 3217 + }, + { + "entropy": 1.7549065450827281, + "epoch": 0.3535195407981105, + "grad_norm": 0.733349621295929, + "learning_rate": 1.90036281115495e-05, + "loss": 1.309, + "mean_token_accuracy": 0.6662448445955912, + "num_tokens": 540115247.0, + "step": 3218 + }, + { + "entropy": 1.7415490448474884, + "epoch": 0.3536293977094834, + "grad_norm": 0.6571947336196899, + "learning_rate": 1.9002895724920084e-05, + "loss": 1.4145, + "mean_token_accuracy": 0.6497042328119278, + "num_tokens": 540260715.0, + "step": 3219 + }, + { + "entropy": 1.7305750052134197, + "epoch": 0.3537392546208563, + "grad_norm": 0.6744175553321838, + "learning_rate": 1.9002163085002627e-05, + "loss": 1.2965, + "mean_token_accuracy": 0.6661575684944788, + "num_tokens": 540413230.0, + "step": 3220 + }, + { + "entropy": 1.7879510422547658, + "epoch": 0.35384911153222925, + "grad_norm": 0.7192217707633972, + "learning_rate": 1.900143019182031e-05, + "loss": 1.5031, + "mean_token_accuracy": 0.6537016083796819, + "num_tokens": 540599645.0, + "step": 3221 + }, + { + "entropy": 1.7366569141546886, + "epoch": 0.3539589684436022, + "grad_norm": 0.728387713432312, + "learning_rate": 1.9000697045396335e-05, + "loss": 1.6104, + "mean_token_accuracy": 0.6425615598758062, + "num_tokens": 540770142.0, + "step": 3222 + }, + { + "entropy": 1.6701840062936146, + "epoch": 0.35406882535497514, + "grad_norm": 0.6737568974494934, + "learning_rate": 1.8999963645753907e-05, + "loss": 1.4392, + "mean_token_accuracy": 0.6645657767852148, + "num_tokens": 540924432.0, + "step": 3223 + }, + { + "entropy": 1.7021212875843048, + "epoch": 0.3541786822663481, + "grad_norm": 1.4555866718292236, + "learning_rate": 1.8999229992916234e-05, + "loss": 1.2265, + "mean_token_accuracy": 0.688769077261289, + "num_tokens": 541139968.0, + "step": 3224 + }, + { + "entropy": 1.6931440830230713, + "epoch": 0.354288539177721, + "grad_norm": 0.7598428726196289, + "learning_rate": 1.8998496086906536e-05, + "loss": 1.415, + "mean_token_accuracy": 0.6580548882484436, + "num_tokens": 541332213.0, + "step": 3225 + }, + { + "entropy": 1.7305771907170613, + "epoch": 0.35439839608909396, + "grad_norm": 0.6824182271957397, + "learning_rate": 1.8997761927748038e-05, + "loss": 1.3613, + "mean_token_accuracy": 0.6714488168557485, + "num_tokens": 541503362.0, + "step": 3226 + }, + { + "entropy": 1.7079228858153026, + "epoch": 0.3545082530004669, + "grad_norm": 0.663765013217926, + "learning_rate": 1.8997027515463982e-05, + "loss": 1.5137, + "mean_token_accuracy": 0.6328500509262085, + "num_tokens": 541703771.0, + "step": 3227 + }, + { + "entropy": 1.7136943340301514, + "epoch": 0.35461810991183984, + "grad_norm": 0.6330761313438416, + "learning_rate": 1.8996292850077605e-05, + "loss": 1.3637, + "mean_token_accuracy": 0.6594545394182205, + "num_tokens": 541901051.0, + "step": 3228 + }, + { + "entropy": 1.6656453013420105, + "epoch": 0.3547279668232128, + "grad_norm": 0.7125634551048279, + "learning_rate": 1.8995557931612162e-05, + "loss": 1.4978, + "mean_token_accuracy": 0.6360595971345901, + "num_tokens": 542105364.0, + "step": 3229 + }, + { + "entropy": 1.6696421404679616, + "epoch": 0.3548378237345857, + "grad_norm": 0.6307772994041443, + "learning_rate": 1.8994822760090917e-05, + "loss": 1.3209, + "mean_token_accuracy": 0.6642138212919235, + "num_tokens": 542278837.0, + "step": 3230 + }, + { + "entropy": 1.7220669488112132, + "epoch": 0.35494768064595866, + "grad_norm": 0.6235775947570801, + "learning_rate": 1.8994087335537136e-05, + "loss": 1.4231, + "mean_token_accuracy": 0.6583664764960607, + "num_tokens": 542477234.0, + "step": 3231 + }, + { + "entropy": 1.767273376385371, + "epoch": 0.35505753755733155, + "grad_norm": 0.75313800573349, + "learning_rate": 1.8993351657974088e-05, + "loss": 1.3379, + "mean_token_accuracy": 0.6546075393756231, + "num_tokens": 542579859.0, + "step": 3232 + }, + { + "entropy": 1.7123860716819763, + "epoch": 0.3551673944687045, + "grad_norm": 0.7081466317176819, + "learning_rate": 1.8992615727425064e-05, + "loss": 1.4038, + "mean_token_accuracy": 0.6437129030625025, + "num_tokens": 542748402.0, + "step": 3233 + }, + { + "entropy": 1.7314150631427765, + "epoch": 0.35527725138007743, + "grad_norm": 0.7206461429595947, + "learning_rate": 1.8991879543913353e-05, + "loss": 1.4612, + "mean_token_accuracy": 0.6518258800109228, + "num_tokens": 542947728.0, + "step": 3234 + }, + { + "entropy": 1.7437797288099925, + "epoch": 0.35538710829145037, + "grad_norm": 0.6989411115646362, + "learning_rate": 1.8991143107462256e-05, + "loss": 1.4143, + "mean_token_accuracy": 0.6447295347849528, + "num_tokens": 543108539.0, + "step": 3235 + }, + { + "entropy": 1.739637513955434, + "epoch": 0.3554969652028233, + "grad_norm": 0.8311676979064941, + "learning_rate": 1.8990406418095083e-05, + "loss": 1.3188, + "mean_token_accuracy": 0.6700087090333303, + "num_tokens": 543256908.0, + "step": 3236 + }, + { + "entropy": 1.6945122977097828, + "epoch": 0.35560682211419625, + "grad_norm": 0.7516961097717285, + "learning_rate": 1.8989669475835145e-05, + "loss": 1.3135, + "mean_token_accuracy": 0.6661293009916941, + "num_tokens": 543373158.0, + "step": 3237 + }, + { + "entropy": 1.7841602961222331, + "epoch": 0.3557166790255692, + "grad_norm": 0.8799614310264587, + "learning_rate": 1.898893228070577e-05, + "loss": 1.2702, + "mean_token_accuracy": 0.6760559976100922, + "num_tokens": 543513318.0, + "step": 3238 + }, + { + "entropy": 1.7264136672019958, + "epoch": 0.35582653593694213, + "grad_norm": 0.7641453742980957, + "learning_rate": 1.8988194832730283e-05, + "loss": 1.301, + "mean_token_accuracy": 0.6664343724648157, + "num_tokens": 543645245.0, + "step": 3239 + }, + { + "entropy": 1.6593830386797588, + "epoch": 0.3559363928483151, + "grad_norm": 0.7157340049743652, + "learning_rate": 1.8987457131932036e-05, + "loss": 1.4008, + "mean_token_accuracy": 0.6607535431782404, + "num_tokens": 543795740.0, + "step": 3240 + }, + { + "entropy": 1.7583003342151642, + "epoch": 0.356046249759688, + "grad_norm": 0.8725547194480896, + "learning_rate": 1.898671917833437e-05, + "loss": 1.5383, + "mean_token_accuracy": 0.6420815885066986, + "num_tokens": 543958970.0, + "step": 3241 + }, + { + "entropy": 1.6769887109597523, + "epoch": 0.35615610667106096, + "grad_norm": 0.6710975766181946, + "learning_rate": 1.8985980971960637e-05, + "loss": 1.5089, + "mean_token_accuracy": 0.6415905406077703, + "num_tokens": 544152972.0, + "step": 3242 + }, + { + "entropy": 1.6808498601118724, + "epoch": 0.3562659635824339, + "grad_norm": 0.6576784253120422, + "learning_rate": 1.8985242512834205e-05, + "loss": 1.4222, + "mean_token_accuracy": 0.6540278444687525, + "num_tokens": 544305414.0, + "step": 3243 + }, + { + "entropy": 1.7014685571193695, + "epoch": 0.35637582049380684, + "grad_norm": 0.67486572265625, + "learning_rate": 1.8984503800978444e-05, + "loss": 1.4781, + "mean_token_accuracy": 0.6487467388312022, + "num_tokens": 544497707.0, + "step": 3244 + }, + { + "entropy": 1.7330358525117238, + "epoch": 0.3564856774051798, + "grad_norm": 0.6918492317199707, + "learning_rate": 1.898376483641674e-05, + "loss": 1.3808, + "mean_token_accuracy": 0.6515394548575083, + "num_tokens": 544661597.0, + "step": 3245 + }, + { + "entropy": 1.7030868232250214, + "epoch": 0.35659553431655266, + "grad_norm": 0.6160433292388916, + "learning_rate": 1.898302561917247e-05, + "loss": 1.3579, + "mean_token_accuracy": 0.662481889128685, + "num_tokens": 544814617.0, + "step": 3246 + }, + { + "entropy": 1.7345664103825886, + "epoch": 0.3567053912279256, + "grad_norm": 0.7806865572929382, + "learning_rate": 1.8982286149269043e-05, + "loss": 1.505, + "mean_token_accuracy": 0.6493661950031916, + "num_tokens": 544950318.0, + "step": 3247 + }, + { + "entropy": 1.7066125174363453, + "epoch": 0.35681524813929855, + "grad_norm": 0.6025816202163696, + "learning_rate": 1.8981546426729856e-05, + "loss": 1.3322, + "mean_token_accuracy": 0.6618035733699799, + "num_tokens": 545107729.0, + "step": 3248 + }, + { + "entropy": 1.7195513546466827, + "epoch": 0.3569251050506715, + "grad_norm": 0.7217980027198792, + "learning_rate": 1.898080645157832e-05, + "loss": 1.4037, + "mean_token_accuracy": 0.6431277443965276, + "num_tokens": 545338954.0, + "step": 3249 + }, + { + "entropy": 1.6746398607889812, + "epoch": 0.35703496196204443, + "grad_norm": 0.5947571992874146, + "learning_rate": 1.8980066223837857e-05, + "loss": 1.3889, + "mean_token_accuracy": 0.6642505377531052, + "num_tokens": 545502181.0, + "step": 3250 + }, + { + "entropy": 1.782493571440379, + "epoch": 0.35714481887341737, + "grad_norm": 0.6762712001800537, + "learning_rate": 1.8979325743531892e-05, + "loss": 1.3322, + "mean_token_accuracy": 0.6563690652449926, + "num_tokens": 545647976.0, + "step": 3251 + }, + { + "entropy": 1.6596081058184307, + "epoch": 0.3572546757847903, + "grad_norm": 0.665545642375946, + "learning_rate": 1.897858501068386e-05, + "loss": 1.3157, + "mean_token_accuracy": 0.6745046228170395, + "num_tokens": 545789298.0, + "step": 3252 + }, + { + "entropy": 1.7242101629575093, + "epoch": 0.35736453269616325, + "grad_norm": 0.6829879879951477, + "learning_rate": 1.8977844025317212e-05, + "loss": 1.4886, + "mean_token_accuracy": 0.645786871512731, + "num_tokens": 546005021.0, + "step": 3253 + }, + { + "entropy": 1.6587198774019878, + "epoch": 0.3574743896075362, + "grad_norm": 0.645124614238739, + "learning_rate": 1.897710278745539e-05, + "loss": 1.4629, + "mean_token_accuracy": 0.6476227790117264, + "num_tokens": 546211606.0, + "step": 3254 + }, + { + "entropy": 1.77561150987943, + "epoch": 0.35758424651890913, + "grad_norm": 0.7814483642578125, + "learning_rate": 1.897636129712187e-05, + "loss": 1.5806, + "mean_token_accuracy": 0.6428438226381937, + "num_tokens": 546374799.0, + "step": 3255 + }, + { + "entropy": 1.6502399047215779, + "epoch": 0.3576941034302821, + "grad_norm": 0.6323907971382141, + "learning_rate": 1.8975619554340103e-05, + "loss": 1.3035, + "mean_token_accuracy": 0.6714171419541041, + "num_tokens": 546556026.0, + "step": 3256 + }, + { + "entropy": 1.622281789779663, + "epoch": 0.357803960341655, + "grad_norm": 0.6249427795410156, + "learning_rate": 1.8974877559133568e-05, + "loss": 1.4739, + "mean_token_accuracy": 0.6602053095897039, + "num_tokens": 546763855.0, + "step": 3257 + }, + { + "entropy": 1.7764336963494618, + "epoch": 0.35791381725302795, + "grad_norm": 0.7319939136505127, + "learning_rate": 1.8974135311525756e-05, + "loss": 1.3925, + "mean_token_accuracy": 0.6508530924717585, + "num_tokens": 546905288.0, + "step": 3258 + }, + { + "entropy": 1.6988587478796642, + "epoch": 0.35802367416440084, + "grad_norm": 0.5522320866584778, + "learning_rate": 1.897339281154015e-05, + "loss": 1.3956, + "mean_token_accuracy": 0.6408476581176122, + "num_tokens": 547116190.0, + "step": 3259 + }, + { + "entropy": 1.717311978340149, + "epoch": 0.3581335310757738, + "grad_norm": 0.6752801537513733, + "learning_rate": 1.897265005920026e-05, + "loss": 1.4233, + "mean_token_accuracy": 0.6433817644913992, + "num_tokens": 547287521.0, + "step": 3260 + }, + { + "entropy": 1.65370711684227, + "epoch": 0.3582433879871467, + "grad_norm": 0.6644560098648071, + "learning_rate": 1.8971907054529585e-05, + "loss": 1.5168, + "mean_token_accuracy": 0.6519752393166224, + "num_tokens": 547490966.0, + "step": 3261 + }, + { + "entropy": 1.7368830641110737, + "epoch": 0.35835324489851966, + "grad_norm": 0.6558582782745361, + "learning_rate": 1.8971163797551645e-05, + "loss": 1.4857, + "mean_token_accuracy": 0.6533776869376501, + "num_tokens": 547688075.0, + "step": 3262 + }, + { + "entropy": 1.6827348172664642, + "epoch": 0.3584631018098926, + "grad_norm": 0.6018016934394836, + "learning_rate": 1.8970420288289963e-05, + "loss": 1.4116, + "mean_token_accuracy": 0.6425057997306188, + "num_tokens": 547881243.0, + "step": 3263 + }, + { + "entropy": 1.7337345282236736, + "epoch": 0.35857295872126554, + "grad_norm": 0.6800892353057861, + "learning_rate": 1.8969676526768072e-05, + "loss": 1.4148, + "mean_token_accuracy": 0.6537298361460367, + "num_tokens": 548022572.0, + "step": 3264 + }, + { + "entropy": 1.7320611973603566, + "epoch": 0.3586828156326385, + "grad_norm": 0.6835919618606567, + "learning_rate": 1.8968932513009507e-05, + "loss": 1.4243, + "mean_token_accuracy": 0.65031631787618, + "num_tokens": 548206869.0, + "step": 3265 + }, + { + "entropy": 1.6919034918149312, + "epoch": 0.3587926725440114, + "grad_norm": 0.703696608543396, + "learning_rate": 1.8968188247037823e-05, + "loss": 1.411, + "mean_token_accuracy": 0.655804713567098, + "num_tokens": 548390855.0, + "step": 3266 + }, + { + "entropy": 1.7181176046530406, + "epoch": 0.35890252945538437, + "grad_norm": 0.7795320153236389, + "learning_rate": 1.8967443728876566e-05, + "loss": 1.2687, + "mean_token_accuracy": 0.6653892497221628, + "num_tokens": 548546869.0, + "step": 3267 + }, + { + "entropy": 1.7424982289473216, + "epoch": 0.3590123863667573, + "grad_norm": 0.7345746755599976, + "learning_rate": 1.896669895854931e-05, + "loss": 1.6389, + "mean_token_accuracy": 0.6303468098243078, + "num_tokens": 548770908.0, + "step": 3268 + }, + { + "entropy": 1.7184557716051738, + "epoch": 0.35912224327813025, + "grad_norm": 0.7089744806289673, + "learning_rate": 1.8965953936079616e-05, + "loss": 1.5394, + "mean_token_accuracy": 0.6407269140084585, + "num_tokens": 548980049.0, + "step": 3269 + }, + { + "entropy": 1.768985648949941, + "epoch": 0.3592321001895032, + "grad_norm": 0.7779526114463806, + "learning_rate": 1.8965208661491073e-05, + "loss": 1.452, + "mean_token_accuracy": 0.6549462129672369, + "num_tokens": 549138218.0, + "step": 3270 + }, + { + "entropy": 1.734433690706889, + "epoch": 0.35934195710087613, + "grad_norm": 0.67804354429245, + "learning_rate": 1.8964463134807265e-05, + "loss": 1.4667, + "mean_token_accuracy": 0.6576692014932632, + "num_tokens": 549297807.0, + "step": 3271 + }, + { + "entropy": 1.6583465834458668, + "epoch": 0.35945181401224907, + "grad_norm": 0.6600108742713928, + "learning_rate": 1.896371735605179e-05, + "loss": 1.442, + "mean_token_accuracy": 0.651568760474523, + "num_tokens": 549471832.0, + "step": 3272 + }, + { + "entropy": 1.7016997933387756, + "epoch": 0.35956167092362196, + "grad_norm": 0.5997833013534546, + "learning_rate": 1.8962971325248246e-05, + "loss": 1.5253, + "mean_token_accuracy": 0.6380040893952051, + "num_tokens": 549645821.0, + "step": 3273 + }, + { + "entropy": 1.728733738263448, + "epoch": 0.3596715278349949, + "grad_norm": 0.7934627532958984, + "learning_rate": 1.8962225042420248e-05, + "loss": 1.5075, + "mean_token_accuracy": 0.6416665812333425, + "num_tokens": 549809498.0, + "step": 3274 + }, + { + "entropy": 1.7281453907489777, + "epoch": 0.35978138474636784, + "grad_norm": 0.6657528877258301, + "learning_rate": 1.8961478507591417e-05, + "loss": 1.3891, + "mean_token_accuracy": 0.6498565276463827, + "num_tokens": 549995261.0, + "step": 3275 + }, + { + "entropy": 1.7271487216154735, + "epoch": 0.3598912416577408, + "grad_norm": 0.6277911067008972, + "learning_rate": 1.8960731720785378e-05, + "loss": 1.3812, + "mean_token_accuracy": 0.6533424854278564, + "num_tokens": 550156327.0, + "step": 3276 + }, + { + "entropy": 1.7464400331179302, + "epoch": 0.3600010985691137, + "grad_norm": 0.6984190940856934, + "learning_rate": 1.8959984682025767e-05, + "loss": 1.3108, + "mean_token_accuracy": 0.6629662662744522, + "num_tokens": 550289813.0, + "step": 3277 + }, + { + "entropy": 1.7461791435877483, + "epoch": 0.36011095548048666, + "grad_norm": 0.8859475255012512, + "learning_rate": 1.8959237391336226e-05, + "loss": 1.3565, + "mean_token_accuracy": 0.6585030903418859, + "num_tokens": 550433398.0, + "step": 3278 + }, + { + "entropy": 1.7174023687839508, + "epoch": 0.3602208123918596, + "grad_norm": 0.6496213674545288, + "learning_rate": 1.895848984874041e-05, + "loss": 1.3238, + "mean_token_accuracy": 0.6703794449567795, + "num_tokens": 550571694.0, + "step": 3279 + }, + { + "entropy": 1.6957313120365143, + "epoch": 0.36033066930323254, + "grad_norm": 0.7161815762519836, + "learning_rate": 1.8957742054261976e-05, + "loss": 1.4328, + "mean_token_accuracy": 0.6538204352060953, + "num_tokens": 550742742.0, + "step": 3280 + }, + { + "entropy": 1.7091784179210663, + "epoch": 0.3604405262146055, + "grad_norm": 0.6954407095909119, + "learning_rate": 1.8956994007924595e-05, + "loss": 1.4701, + "mean_token_accuracy": 0.6483365694681803, + "num_tokens": 550915669.0, + "step": 3281 + }, + { + "entropy": 1.759674459695816, + "epoch": 0.3605503831259784, + "grad_norm": 0.8163644671440125, + "learning_rate": 1.8956245709751932e-05, + "loss": 1.4494, + "mean_token_accuracy": 0.6482079128424326, + "num_tokens": 551072402.0, + "step": 3282 + }, + { + "entropy": 1.6610101958115895, + "epoch": 0.36066024003735137, + "grad_norm": 0.6455697417259216, + "learning_rate": 1.8955497159767683e-05, + "loss": 1.3841, + "mean_token_accuracy": 0.6573386738697687, + "num_tokens": 551243907.0, + "step": 3283 + }, + { + "entropy": 1.765950342019399, + "epoch": 0.3607700969487243, + "grad_norm": 0.6208654046058655, + "learning_rate": 1.8954748357995532e-05, + "loss": 1.472, + "mean_token_accuracy": 0.6405810018380483, + "num_tokens": 551437093.0, + "step": 3284 + }, + { + "entropy": 1.6940280695756276, + "epoch": 0.36087995386009725, + "grad_norm": 0.7773803472518921, + "learning_rate": 1.8953999304459182e-05, + "loss": 1.306, + "mean_token_accuracy": 0.670986607670784, + "num_tokens": 551578647.0, + "step": 3285 + }, + { + "entropy": 1.7210556169350941, + "epoch": 0.3609898107714702, + "grad_norm": 0.6685793399810791, + "learning_rate": 1.8953249999182336e-05, + "loss": 1.3721, + "mean_token_accuracy": 0.6593438486258189, + "num_tokens": 551716346.0, + "step": 3286 + }, + { + "entropy": 1.7289181451002757, + "epoch": 0.3610996676828431, + "grad_norm": 0.6698284149169922, + "learning_rate": 1.895250044218871e-05, + "loss": 1.4681, + "mean_token_accuracy": 0.6375825703144073, + "num_tokens": 551901315.0, + "step": 3287 + }, + { + "entropy": 1.7202289899190266, + "epoch": 0.361209524594216, + "grad_norm": 0.7790493369102478, + "learning_rate": 1.895175063350203e-05, + "loss": 1.4252, + "mean_token_accuracy": 0.6556966801484426, + "num_tokens": 552055231.0, + "step": 3288 + }, + { + "entropy": 1.638626625140508, + "epoch": 0.36131938150558895, + "grad_norm": 0.6459670662879944, + "learning_rate": 1.8951000573146028e-05, + "loss": 1.1587, + "mean_token_accuracy": 0.6860545178254446, + "num_tokens": 552154538.0, + "step": 3289 + }, + { + "entropy": 1.7047406236330669, + "epoch": 0.3614292384169619, + "grad_norm": 0.6823393702507019, + "learning_rate": 1.895025026114444e-05, + "loss": 1.3642, + "mean_token_accuracy": 0.6609081079562505, + "num_tokens": 552306200.0, + "step": 3290 + }, + { + "entropy": 1.7296662827332814, + "epoch": 0.36153909532833484, + "grad_norm": 0.6827804446220398, + "learning_rate": 1.8949499697521013e-05, + "loss": 1.5255, + "mean_token_accuracy": 0.6456420173247656, + "num_tokens": 552495063.0, + "step": 3291 + }, + { + "entropy": 1.7409346004327138, + "epoch": 0.3616489522397078, + "grad_norm": 0.6589847803115845, + "learning_rate": 1.89487488822995e-05, + "loss": 1.569, + "mean_token_accuracy": 0.6369834740956625, + "num_tokens": 552680325.0, + "step": 3292 + }, + { + "entropy": 1.7292551795641582, + "epoch": 0.3617588091510807, + "grad_norm": 0.683055579662323, + "learning_rate": 1.8947997815503668e-05, + "loss": 1.3601, + "mean_token_accuracy": 0.6582022855679194, + "num_tokens": 552842160.0, + "step": 3293 + }, + { + "entropy": 1.6185003022352855, + "epoch": 0.36186866606245366, + "grad_norm": 0.5870607495307922, + "learning_rate": 1.8947246497157287e-05, + "loss": 1.2843, + "mean_token_accuracy": 0.6818203230698904, + "num_tokens": 553002411.0, + "step": 3294 + }, + { + "entropy": 1.7373617390791576, + "epoch": 0.3619785229738266, + "grad_norm": 0.8702647089958191, + "learning_rate": 1.8946494927284134e-05, + "loss": 1.4134, + "mean_token_accuracy": 0.6574988017479578, + "num_tokens": 553134679.0, + "step": 3295 + }, + { + "entropy": 1.6635343730449677, + "epoch": 0.36208837988519954, + "grad_norm": 0.6780598163604736, + "learning_rate": 1.8945743105908004e-05, + "loss": 1.2698, + "mean_token_accuracy": 0.672525574763616, + "num_tokens": 553279210.0, + "step": 3296 + }, + { + "entropy": 1.7142191926638286, + "epoch": 0.3621982367965725, + "grad_norm": 0.6832349896430969, + "learning_rate": 1.894499103305268e-05, + "loss": 1.388, + "mean_token_accuracy": 0.6695650964975357, + "num_tokens": 553454135.0, + "step": 3297 + }, + { + "entropy": 1.7063461641470592, + "epoch": 0.3623080937079454, + "grad_norm": 0.6330212354660034, + "learning_rate": 1.894423870874197e-05, + "loss": 1.4071, + "mean_token_accuracy": 0.6505293697118759, + "num_tokens": 553628489.0, + "step": 3298 + }, + { + "entropy": 1.705621709426244, + "epoch": 0.36241795061931836, + "grad_norm": 0.732349693775177, + "learning_rate": 1.894348613299968e-05, + "loss": 1.3747, + "mean_token_accuracy": 0.6729957262674967, + "num_tokens": 553793405.0, + "step": 3299 + }, + { + "entropy": 1.7087404429912567, + "epoch": 0.36252780753069125, + "grad_norm": 0.6913777589797974, + "learning_rate": 1.8942733305849643e-05, + "loss": 1.4367, + "mean_token_accuracy": 0.6505131224791209, + "num_tokens": 554019155.0, + "step": 3300 + }, + { + "entropy": 1.723453958829244, + "epoch": 0.3626376644420642, + "grad_norm": 0.6385790705680847, + "learning_rate": 1.8941980227315672e-05, + "loss": 1.3893, + "mean_token_accuracy": 0.6500000605980555, + "num_tokens": 554154296.0, + "step": 3301 + }, + { + "entropy": 1.7123183111349742, + "epoch": 0.36274752135343713, + "grad_norm": 0.7006135582923889, + "learning_rate": 1.89412268974216e-05, + "loss": 1.368, + "mean_token_accuracy": 0.6440122773249944, + "num_tokens": 554283476.0, + "step": 3302 + }, + { + "entropy": 1.7936599254608154, + "epoch": 0.36285737826481007, + "grad_norm": 0.6872299909591675, + "learning_rate": 1.8940473316191282e-05, + "loss": 1.4587, + "mean_token_accuracy": 0.646611750125885, + "num_tokens": 554451283.0, + "step": 3303 + }, + { + "entropy": 1.69118133187294, + "epoch": 0.362967235176183, + "grad_norm": 0.6158702969551086, + "learning_rate": 1.893971948364856e-05, + "loss": 1.3139, + "mean_token_accuracy": 0.6713261753320694, + "num_tokens": 554591528.0, + "step": 3304 + }, + { + "entropy": 1.707411030928294, + "epoch": 0.36307709208755595, + "grad_norm": 0.7171065807342529, + "learning_rate": 1.8938965399817295e-05, + "loss": 1.4017, + "mean_token_accuracy": 0.6603502780199051, + "num_tokens": 554776939.0, + "step": 3305 + }, + { + "entropy": 1.6777910987536113, + "epoch": 0.3631869489989289, + "grad_norm": 0.7720420360565186, + "learning_rate": 1.8938211064721348e-05, + "loss": 1.3903, + "mean_token_accuracy": 0.67117311557134, + "num_tokens": 554967794.0, + "step": 3306 + }, + { + "entropy": 1.745880534251531, + "epoch": 0.36329680591030183, + "grad_norm": 0.7731246948242188, + "learning_rate": 1.89374564783846e-05, + "loss": 1.5085, + "mean_token_accuracy": 0.6443192313114802, + "num_tokens": 555120674.0, + "step": 3307 + }, + { + "entropy": 1.7017800112565358, + "epoch": 0.3634066628216748, + "grad_norm": 0.6351495981216431, + "learning_rate": 1.8936701640830932e-05, + "loss": 1.3872, + "mean_token_accuracy": 0.6458842406670252, + "num_tokens": 555372792.0, + "step": 3308 + }, + { + "entropy": 1.731082151333491, + "epoch": 0.3635165197330477, + "grad_norm": 0.6268585324287415, + "learning_rate": 1.8935946552084235e-05, + "loss": 1.3286, + "mean_token_accuracy": 0.6559985081354777, + "num_tokens": 555557204.0, + "step": 3309 + }, + { + "entropy": 1.7254813611507416, + "epoch": 0.36362637664442066, + "grad_norm": 0.7122596502304077, + "learning_rate": 1.8935191212168404e-05, + "loss": 1.3135, + "mean_token_accuracy": 0.6687471518913904, + "num_tokens": 555714696.0, + "step": 3310 + }, + { + "entropy": 1.7088161011536915, + "epoch": 0.3637362335557936, + "grad_norm": 0.6561578512191772, + "learning_rate": 1.8934435621107348e-05, + "loss": 1.4729, + "mean_token_accuracy": 0.6405731240908304, + "num_tokens": 555929585.0, + "step": 3311 + }, + { + "entropy": 1.6857821742693584, + "epoch": 0.36384609046716654, + "grad_norm": 0.6950667500495911, + "learning_rate": 1.8933679778924977e-05, + "loss": 1.4068, + "mean_token_accuracy": 0.6570146431525549, + "num_tokens": 556083537.0, + "step": 3312 + }, + { + "entropy": 1.6994845469792683, + "epoch": 0.3639559473785395, + "grad_norm": 0.6382138133049011, + "learning_rate": 1.8932923685645218e-05, + "loss": 1.5331, + "mean_token_accuracy": 0.6430691679318746, + "num_tokens": 556268816.0, + "step": 3313 + }, + { + "entropy": 1.7211009760697682, + "epoch": 0.36406580428991236, + "grad_norm": 0.753278911113739, + "learning_rate": 1.8932167341291998e-05, + "loss": 1.416, + "mean_token_accuracy": 0.6640914579232534, + "num_tokens": 556411187.0, + "step": 3314 + }, + { + "entropy": 1.7074837684631348, + "epoch": 0.3641756612012853, + "grad_norm": 0.7115225195884705, + "learning_rate": 1.893141074588926e-05, + "loss": 1.2538, + "mean_token_accuracy": 0.6794544955094656, + "num_tokens": 556560154.0, + "step": 3315 + }, + { + "entropy": 1.6474250952402751, + "epoch": 0.36428551811265825, + "grad_norm": 0.6773630976676941, + "learning_rate": 1.893065389946094e-05, + "loss": 1.4965, + "mean_token_accuracy": 0.6429052402575811, + "num_tokens": 556758287.0, + "step": 3316 + }, + { + "entropy": 1.7411263982454936, + "epoch": 0.3643953750240312, + "grad_norm": 0.7541442513465881, + "learning_rate": 1.8929896802031e-05, + "loss": 1.3983, + "mean_token_accuracy": 0.6654303272565206, + "num_tokens": 556906714.0, + "step": 3317 + }, + { + "entropy": 1.6828208565711975, + "epoch": 0.36450523193540413, + "grad_norm": 0.5869950652122498, + "learning_rate": 1.89291394536234e-05, + "loss": 1.5068, + "mean_token_accuracy": 0.638722355167071, + "num_tokens": 557114352.0, + "step": 3318 + }, + { + "entropy": 1.6942188839117687, + "epoch": 0.36461508884677707, + "grad_norm": 0.7280264496803284, + "learning_rate": 1.8928381854262107e-05, + "loss": 1.4158, + "mean_token_accuracy": 0.6560780803362528, + "num_tokens": 557283476.0, + "step": 3319 + }, + { + "entropy": 1.7298544545968373, + "epoch": 0.36472494575815, + "grad_norm": 0.6916755437850952, + "learning_rate": 1.8927624003971104e-05, + "loss": 1.4664, + "mean_token_accuracy": 0.6391513794660568, + "num_tokens": 557482545.0, + "step": 3320 + }, + { + "entropy": 1.6934645175933838, + "epoch": 0.36483480266952295, + "grad_norm": 0.6088461875915527, + "learning_rate": 1.892686590277437e-05, + "loss": 1.3194, + "mean_token_accuracy": 0.6680668840805689, + "num_tokens": 557629627.0, + "step": 3321 + }, + { + "entropy": 1.7159304022789001, + "epoch": 0.3649446595808959, + "grad_norm": 0.6701193451881409, + "learning_rate": 1.8926107550695907e-05, + "loss": 1.4102, + "mean_token_accuracy": 0.661454955736796, + "num_tokens": 557788278.0, + "step": 3322 + }, + { + "entropy": 1.6919244428475697, + "epoch": 0.36505451649226883, + "grad_norm": 0.606071949005127, + "learning_rate": 1.892534894775971e-05, + "loss": 1.3828, + "mean_token_accuracy": 0.6501694321632385, + "num_tokens": 557979234.0, + "step": 3323 + }, + { + "entropy": 1.7174591918786366, + "epoch": 0.3651643734036418, + "grad_norm": 0.6546566486358643, + "learning_rate": 1.892459009398979e-05, + "loss": 1.3609, + "mean_token_accuracy": 0.6671615242958069, + "num_tokens": 558163091.0, + "step": 3324 + }, + { + "entropy": 1.7426952123641968, + "epoch": 0.3652742303150147, + "grad_norm": 0.6608150005340576, + "learning_rate": 1.8923830989410165e-05, + "loss": 1.4249, + "mean_token_accuracy": 0.6464882989724478, + "num_tokens": 558294742.0, + "step": 3325 + }, + { + "entropy": 1.7673422197500865, + "epoch": 0.36538408722638765, + "grad_norm": 0.5727324485778809, + "learning_rate": 1.8923071634044855e-05, + "loss": 1.4988, + "mean_token_accuracy": 0.6281411349773407, + "num_tokens": 558479560.0, + "step": 3326 + }, + { + "entropy": 1.7090481917063396, + "epoch": 0.36549394413776054, + "grad_norm": 0.7117367386817932, + "learning_rate": 1.89223120279179e-05, + "loss": 1.3147, + "mean_token_accuracy": 0.6621511876583099, + "num_tokens": 558594005.0, + "step": 3327 + }, + { + "entropy": 1.6214572985967, + "epoch": 0.3656038010491335, + "grad_norm": 0.8425611853599548, + "learning_rate": 1.8921552171053344e-05, + "loss": 1.4351, + "mean_token_accuracy": 0.6586425652106603, + "num_tokens": 558773354.0, + "step": 3328 + }, + { + "entropy": 1.6668421924114227, + "epoch": 0.3657136579605064, + "grad_norm": 0.6190313100814819, + "learning_rate": 1.8920792063475228e-05, + "loss": 1.3001, + "mean_token_accuracy": 0.6651297012964884, + "num_tokens": 558940163.0, + "step": 3329 + }, + { + "entropy": 1.7648253838221233, + "epoch": 0.36582351487187936, + "grad_norm": 0.7122219204902649, + "learning_rate": 1.892003170520761e-05, + "loss": 1.3979, + "mean_token_accuracy": 0.6517335921525955, + "num_tokens": 559094548.0, + "step": 3330 + }, + { + "entropy": 1.738343745470047, + "epoch": 0.3659333717832523, + "grad_norm": 0.6885458827018738, + "learning_rate": 1.8919271096274562e-05, + "loss": 1.3878, + "mean_token_accuracy": 0.6585352619489034, + "num_tokens": 559293154.0, + "step": 3331 + }, + { + "entropy": 1.7013998627662659, + "epoch": 0.36604322869462524, + "grad_norm": 0.5736583471298218, + "learning_rate": 1.8918510236700148e-05, + "loss": 1.4884, + "mean_token_accuracy": 0.6527662177880605, + "num_tokens": 559486360.0, + "step": 3332 + }, + { + "entropy": 1.7290991048018138, + "epoch": 0.3661530856059982, + "grad_norm": 0.6130762100219727, + "learning_rate": 1.8917749126508454e-05, + "loss": 1.3987, + "mean_token_accuracy": 0.649641344944636, + "num_tokens": 559646780.0, + "step": 3333 + }, + { + "entropy": 1.665138175090154, + "epoch": 0.3662629425173711, + "grad_norm": 0.7527748346328735, + "learning_rate": 1.891698776572357e-05, + "loss": 1.4943, + "mean_token_accuracy": 0.6480821569760641, + "num_tokens": 559842312.0, + "step": 3334 + }, + { + "entropy": 1.7401958505312602, + "epoch": 0.36637279942874407, + "grad_norm": 0.7090706825256348, + "learning_rate": 1.891622615436959e-05, + "loss": 1.5787, + "mean_token_accuracy": 0.6238933056592941, + "num_tokens": 560034055.0, + "step": 3335 + }, + { + "entropy": 1.6688226958115895, + "epoch": 0.366482656340117, + "grad_norm": 0.6134145259857178, + "learning_rate": 1.891546429247062e-05, + "loss": 1.443, + "mean_token_accuracy": 0.6504640529553095, + "num_tokens": 560205068.0, + "step": 3336 + }, + { + "entropy": 1.5998762051264446, + "epoch": 0.36659251325148995, + "grad_norm": 0.5931162238121033, + "learning_rate": 1.891470218005077e-05, + "loss": 1.2993, + "mean_token_accuracy": 0.6780025462309519, + "num_tokens": 560354488.0, + "step": 3337 + }, + { + "entropy": 1.6810278395811717, + "epoch": 0.3667023701628629, + "grad_norm": 0.7367040514945984, + "learning_rate": 1.8913939817134167e-05, + "loss": 1.4098, + "mean_token_accuracy": 0.6589592695236206, + "num_tokens": 560507374.0, + "step": 3338 + }, + { + "entropy": 1.6817876795927684, + "epoch": 0.36681222707423583, + "grad_norm": 0.8555600047111511, + "learning_rate": 1.8913177203744927e-05, + "loss": 1.3993, + "mean_token_accuracy": 0.6524364600578944, + "num_tokens": 560662070.0, + "step": 3339 + }, + { + "entropy": 1.6980493466059368, + "epoch": 0.36692208398560877, + "grad_norm": 0.6242254972457886, + "learning_rate": 1.89124143399072e-05, + "loss": 1.3962, + "mean_token_accuracy": 0.6619236518939337, + "num_tokens": 560866103.0, + "step": 3340 + }, + { + "entropy": 1.676784485578537, + "epoch": 0.36703194089698166, + "grad_norm": 0.633709192276001, + "learning_rate": 1.891165122564512e-05, + "loss": 1.386, + "mean_token_accuracy": 0.6684582183758417, + "num_tokens": 561025085.0, + "step": 3341 + }, + { + "entropy": 1.6602611144383748, + "epoch": 0.3671417978083546, + "grad_norm": 0.5967673659324646, + "learning_rate": 1.891088786098285e-05, + "loss": 1.3113, + "mean_token_accuracy": 0.6616858939329783, + "num_tokens": 561194282.0, + "step": 3342 + }, + { + "entropy": 1.7069261968135834, + "epoch": 0.36725165471972754, + "grad_norm": 0.7268139719963074, + "learning_rate": 1.8910124245944544e-05, + "loss": 1.4762, + "mean_token_accuracy": 0.641165554523468, + "num_tokens": 561399006.0, + "step": 3343 + }, + { + "entropy": 1.640328695376714, + "epoch": 0.3673615116311005, + "grad_norm": 0.6956763863563538, + "learning_rate": 1.8909360380554366e-05, + "loss": 1.1811, + "mean_token_accuracy": 0.6891622543334961, + "num_tokens": 561511064.0, + "step": 3344 + }, + { + "entropy": 1.7172695597012837, + "epoch": 0.3674713685424734, + "grad_norm": 0.6288443207740784, + "learning_rate": 1.8908596264836496e-05, + "loss": 1.3749, + "mean_token_accuracy": 0.6524456491072973, + "num_tokens": 561693661.0, + "step": 3345 + }, + { + "entropy": 1.7315253218015034, + "epoch": 0.36758122545384636, + "grad_norm": 0.6713343858718872, + "learning_rate": 1.8907831898815118e-05, + "loss": 1.5466, + "mean_token_accuracy": 0.6562629292408625, + "num_tokens": 561882529.0, + "step": 3346 + }, + { + "entropy": 1.7233446737130482, + "epoch": 0.3676910823652193, + "grad_norm": 0.6376742124557495, + "learning_rate": 1.8907067282514426e-05, + "loss": 1.4229, + "mean_token_accuracy": 0.6561285456021627, + "num_tokens": 562071206.0, + "step": 3347 + }, + { + "entropy": 1.775407483180364, + "epoch": 0.36780093927659224, + "grad_norm": 0.7270674705505371, + "learning_rate": 1.8906302415958617e-05, + "loss": 1.4222, + "mean_token_accuracy": 0.6504105776548386, + "num_tokens": 562217092.0, + "step": 3348 + }, + { + "entropy": 1.6748477617899578, + "epoch": 0.3679107961879652, + "grad_norm": 0.7032795548439026, + "learning_rate": 1.89055372991719e-05, + "loss": 1.4193, + "mean_token_accuracy": 0.6519673566023508, + "num_tokens": 562405459.0, + "step": 3349 + }, + { + "entropy": 1.694932798544566, + "epoch": 0.3680206530993381, + "grad_norm": 0.5827463269233704, + "learning_rate": 1.8904771932178484e-05, + "loss": 1.3895, + "mean_token_accuracy": 0.6607838769753774, + "num_tokens": 562585278.0, + "step": 3350 + }, + { + "entropy": 1.7321241994698842, + "epoch": 0.36813051001071107, + "grad_norm": 0.7316332459449768, + "learning_rate": 1.8904006315002605e-05, + "loss": 1.4072, + "mean_token_accuracy": 0.6589037328958511, + "num_tokens": 562743792.0, + "step": 3351 + }, + { + "entropy": 1.7170052528381348, + "epoch": 0.368240366922084, + "grad_norm": 0.9120453596115112, + "learning_rate": 1.8903240447668485e-05, + "loss": 1.4018, + "mean_token_accuracy": 0.6695507715145746, + "num_tokens": 562855638.0, + "step": 3352 + }, + { + "entropy": 1.7425238887468975, + "epoch": 0.36835022383345695, + "grad_norm": 0.6906775832176208, + "learning_rate": 1.8902474330200368e-05, + "loss": 1.4636, + "mean_token_accuracy": 0.6504637797673544, + "num_tokens": 563050942.0, + "step": 3353 + }, + { + "entropy": 1.7821686168511708, + "epoch": 0.36846008074482983, + "grad_norm": 0.7374799847602844, + "learning_rate": 1.8901707962622497e-05, + "loss": 1.4039, + "mean_token_accuracy": 0.6607334365447363, + "num_tokens": 563186855.0, + "step": 3354 + }, + { + "entropy": 1.6392212013403575, + "epoch": 0.3685699376562028, + "grad_norm": 0.5854918360710144, + "learning_rate": 1.890094134495913e-05, + "loss": 1.4505, + "mean_token_accuracy": 0.6548234969377518, + "num_tokens": 563366408.0, + "step": 3355 + }, + { + "entropy": 1.7537512481212616, + "epoch": 0.3686797945675757, + "grad_norm": 0.6592661738395691, + "learning_rate": 1.890017447723453e-05, + "loss": 1.3427, + "mean_token_accuracy": 0.6637191027402878, + "num_tokens": 563500860.0, + "step": 3356 + }, + { + "entropy": 1.7331876854101818, + "epoch": 0.36878965147894865, + "grad_norm": 0.7768663763999939, + "learning_rate": 1.8899407359472966e-05, + "loss": 1.4831, + "mean_token_accuracy": 0.6461548010508219, + "num_tokens": 563724218.0, + "step": 3357 + }, + { + "entropy": 1.7273361086845398, + "epoch": 0.3688995083903216, + "grad_norm": 0.6514448523521423, + "learning_rate": 1.8898639991698723e-05, + "loss": 1.5468, + "mean_token_accuracy": 0.6472266266743342, + "num_tokens": 563920710.0, + "step": 3358 + }, + { + "entropy": 1.7227802574634552, + "epoch": 0.36900936530169454, + "grad_norm": 0.753607988357544, + "learning_rate": 1.889787237393608e-05, + "loss": 1.3811, + "mean_token_accuracy": 0.6565194974342982, + "num_tokens": 564108660.0, + "step": 3359 + }, + { + "entropy": 1.6839437087376912, + "epoch": 0.3691192222130675, + "grad_norm": 0.7818706631660461, + "learning_rate": 1.8897104506209336e-05, + "loss": 1.5772, + "mean_token_accuracy": 0.6447829628984133, + "num_tokens": 564274275.0, + "step": 3360 + }, + { + "entropy": 1.6813337802886963, + "epoch": 0.3692290791244404, + "grad_norm": 0.6889169812202454, + "learning_rate": 1.8896336388542794e-05, + "loss": 1.503, + "mean_token_accuracy": 0.6593608756860098, + "num_tokens": 564465967.0, + "step": 3361 + }, + { + "entropy": 1.6902830203374226, + "epoch": 0.36933893603581336, + "grad_norm": 0.6168293952941895, + "learning_rate": 1.889556802096076e-05, + "loss": 1.4396, + "mean_token_accuracy": 0.6531057059764862, + "num_tokens": 564676289.0, + "step": 3362 + }, + { + "entropy": 1.7228737076123555, + "epoch": 0.3694487929471863, + "grad_norm": 0.7138974666595459, + "learning_rate": 1.889479940348756e-05, + "loss": 1.3175, + "mean_token_accuracy": 0.6605416287978491, + "num_tokens": 564820337.0, + "step": 3363 + }, + { + "entropy": 1.6877204477787018, + "epoch": 0.36955864985855924, + "grad_norm": 0.8381320834159851, + "learning_rate": 1.8894030536147513e-05, + "loss": 1.5089, + "mean_token_accuracy": 0.658082976937294, + "num_tokens": 564968781.0, + "step": 3364 + }, + { + "entropy": 1.7128780285517375, + "epoch": 0.3696685067699322, + "grad_norm": 0.6636347770690918, + "learning_rate": 1.889326141896496e-05, + "loss": 1.4628, + "mean_token_accuracy": 0.6563326021035513, + "num_tokens": 565162237.0, + "step": 3365 + }, + { + "entropy": 1.6565657357374828, + "epoch": 0.3697783636813051, + "grad_norm": 0.7915597558021545, + "learning_rate": 1.889249205196424e-05, + "loss": 1.4628, + "mean_token_accuracy": 0.6564295887947083, + "num_tokens": 565318530.0, + "step": 3366 + }, + { + "entropy": 1.7151463528474171, + "epoch": 0.36988822059267806, + "grad_norm": 0.647465169429779, + "learning_rate": 1.8891722435169703e-05, + "loss": 1.4666, + "mean_token_accuracy": 0.656380852063497, + "num_tokens": 565493389.0, + "step": 3367 + }, + { + "entropy": 1.7164535621802013, + "epoch": 0.36999807750405095, + "grad_norm": 0.7960186004638672, + "learning_rate": 1.8890952568605704e-05, + "loss": 1.6024, + "mean_token_accuracy": 0.6565567404031754, + "num_tokens": 565649508.0, + "step": 3368 + }, + { + "entropy": 1.7425066431363423, + "epoch": 0.3701079344154239, + "grad_norm": 0.7645494937896729, + "learning_rate": 1.8890182452296612e-05, + "loss": 1.5191, + "mean_token_accuracy": 0.6238025277853012, + "num_tokens": 565856767.0, + "step": 3369 + }, + { + "entropy": 1.7707313100496929, + "epoch": 0.37021779132679683, + "grad_norm": 0.7492119669914246, + "learning_rate": 1.88894120862668e-05, + "loss": 1.4832, + "mean_token_accuracy": 0.649360736211141, + "num_tokens": 566027002.0, + "step": 3370 + }, + { + "entropy": 1.67772110303243, + "epoch": 0.37032764823816977, + "grad_norm": 0.6959682106971741, + "learning_rate": 1.8888641470540652e-05, + "loss": 1.4024, + "mean_token_accuracy": 0.6710788160562515, + "num_tokens": 566189471.0, + "step": 3371 + }, + { + "entropy": 1.7469698985417683, + "epoch": 0.3704375051495427, + "grad_norm": 0.6655913591384888, + "learning_rate": 1.8887870605142557e-05, + "loss": 1.3536, + "mean_token_accuracy": 0.6533726006746292, + "num_tokens": 566349741.0, + "step": 3372 + }, + { + "entropy": 1.6737736264864604, + "epoch": 0.37054736206091565, + "grad_norm": 0.7479509115219116, + "learning_rate": 1.8887099490096914e-05, + "loss": 1.2131, + "mean_token_accuracy": 0.6783681710561117, + "num_tokens": 566478311.0, + "step": 3373 + }, + { + "entropy": 1.6851101120313008, + "epoch": 0.3706572189722886, + "grad_norm": 0.6885315179824829, + "learning_rate": 1.8886328125428123e-05, + "loss": 1.3959, + "mean_token_accuracy": 0.6712721387545267, + "num_tokens": 566626451.0, + "step": 3374 + }, + { + "entropy": 1.6743651628494263, + "epoch": 0.37076707588366153, + "grad_norm": 0.6012775897979736, + "learning_rate": 1.88855565111606e-05, + "loss": 1.4005, + "mean_token_accuracy": 0.650342067082723, + "num_tokens": 566793333.0, + "step": 3375 + }, + { + "entropy": 1.6919374863306682, + "epoch": 0.3708769327950345, + "grad_norm": 0.6600878238677979, + "learning_rate": 1.888478464731877e-05, + "loss": 1.2489, + "mean_token_accuracy": 0.6797458678483963, + "num_tokens": 566944348.0, + "step": 3376 + }, + { + "entropy": 1.6906745433807373, + "epoch": 0.3709867897064074, + "grad_norm": 0.6562890410423279, + "learning_rate": 1.8884012533927056e-05, + "loss": 1.2859, + "mean_token_accuracy": 0.6720538040002187, + "num_tokens": 567083801.0, + "step": 3377 + }, + { + "entropy": 1.7041854957739513, + "epoch": 0.37109664661778036, + "grad_norm": 0.6055631637573242, + "learning_rate": 1.88832401710099e-05, + "loss": 1.4635, + "mean_token_accuracy": 0.652542233467102, + "num_tokens": 567280452.0, + "step": 3378 + }, + { + "entropy": 1.7743679384390514, + "epoch": 0.3712065035291533, + "grad_norm": 0.7774965763092041, + "learning_rate": 1.8882467558591744e-05, + "loss": 1.3845, + "mean_token_accuracy": 0.656991238395373, + "num_tokens": 567482039.0, + "step": 3379 + }, + { + "entropy": 1.6754270593325298, + "epoch": 0.37131636044052624, + "grad_norm": 0.8339126110076904, + "learning_rate": 1.8881694696697043e-05, + "loss": 1.4245, + "mean_token_accuracy": 0.6630517592032751, + "num_tokens": 567662177.0, + "step": 3380 + }, + { + "entropy": 1.7435904542605083, + "epoch": 0.3714262173518991, + "grad_norm": 0.591063916683197, + "learning_rate": 1.888092158535025e-05, + "loss": 1.4174, + "mean_token_accuracy": 0.6464981784423193, + "num_tokens": 567855805.0, + "step": 3381 + }, + { + "entropy": 1.6988468567530315, + "epoch": 0.37153607426327206, + "grad_norm": 0.6442577242851257, + "learning_rate": 1.8880148224575845e-05, + "loss": 1.2865, + "mean_token_accuracy": 0.6738250454266866, + "num_tokens": 567992063.0, + "step": 3382 + }, + { + "entropy": 1.7117958962917328, + "epoch": 0.371645931174645, + "grad_norm": 0.663324236869812, + "learning_rate": 1.8879374614398302e-05, + "loss": 1.4257, + "mean_token_accuracy": 0.6571160405874252, + "num_tokens": 568189030.0, + "step": 3383 + }, + { + "entropy": 1.7384320894877117, + "epoch": 0.37175578808601795, + "grad_norm": 0.6145911812782288, + "learning_rate": 1.8878600754842097e-05, + "loss": 1.4549, + "mean_token_accuracy": 0.6452573786179224, + "num_tokens": 568426788.0, + "step": 3384 + }, + { + "entropy": 1.746810535589854, + "epoch": 0.3718656449973909, + "grad_norm": 0.7367507815361023, + "learning_rate": 1.8877826645931735e-05, + "loss": 1.4924, + "mean_token_accuracy": 0.6439058085282644, + "num_tokens": 568575666.0, + "step": 3385 + }, + { + "entropy": 1.7746360798676808, + "epoch": 0.37197550190876383, + "grad_norm": 0.6903072595596313, + "learning_rate": 1.8877052287691703e-05, + "loss": 1.3686, + "mean_token_accuracy": 0.6597933818896612, + "num_tokens": 568727841.0, + "step": 3386 + }, + { + "entropy": 1.7943654855092366, + "epoch": 0.37208535882013677, + "grad_norm": 0.7477875351905823, + "learning_rate": 1.887627768014652e-05, + "loss": 1.5748, + "mean_token_accuracy": 0.6329491684834162, + "num_tokens": 568951489.0, + "step": 3387 + }, + { + "entropy": 1.6621526181697845, + "epoch": 0.3721952157315097, + "grad_norm": 0.694236159324646, + "learning_rate": 1.8875502823320695e-05, + "loss": 1.3952, + "mean_token_accuracy": 0.661028265953064, + "num_tokens": 569116054.0, + "step": 3388 + }, + { + "entropy": 1.72781902551651, + "epoch": 0.37230507264288265, + "grad_norm": 0.7133349180221558, + "learning_rate": 1.8874727717238756e-05, + "loss": 1.4526, + "mean_token_accuracy": 0.6458436846733093, + "num_tokens": 569290226.0, + "step": 3389 + }, + { + "entropy": 1.7413524389266968, + "epoch": 0.3724149295542556, + "grad_norm": 0.6716554164886475, + "learning_rate": 1.8873952361925233e-05, + "loss": 1.3317, + "mean_token_accuracy": 0.6584653854370117, + "num_tokens": 569445157.0, + "step": 3390 + }, + { + "entropy": 1.676645815372467, + "epoch": 0.37252478646562853, + "grad_norm": 0.6800124645233154, + "learning_rate": 1.8873176757404666e-05, + "loss": 1.5388, + "mean_token_accuracy": 0.6283597896496455, + "num_tokens": 569724434.0, + "step": 3391 + }, + { + "entropy": 1.6982823014259338, + "epoch": 0.3726346433770015, + "grad_norm": 0.6546007394790649, + "learning_rate": 1.8872400903701602e-05, + "loss": 1.3469, + "mean_token_accuracy": 0.6600322326024374, + "num_tokens": 569872986.0, + "step": 3392 + }, + { + "entropy": 1.716494898001353, + "epoch": 0.3727445002883744, + "grad_norm": 0.711789608001709, + "learning_rate": 1.8871624800840595e-05, + "loss": 1.3059, + "mean_token_accuracy": 0.664145290851593, + "num_tokens": 569984629.0, + "step": 3393 + }, + { + "entropy": 1.723552147547404, + "epoch": 0.37285435719974735, + "grad_norm": 0.7755283713340759, + "learning_rate": 1.887084844884621e-05, + "loss": 1.3659, + "mean_token_accuracy": 0.6677046219507853, + "num_tokens": 570215230.0, + "step": 3394 + }, + { + "entropy": 1.7178972562154133, + "epoch": 0.37296421411112024, + "grad_norm": 0.7582644820213318, + "learning_rate": 1.8870071847743023e-05, + "loss": 1.395, + "mean_token_accuracy": 0.65921584268411, + "num_tokens": 570419958.0, + "step": 3395 + }, + { + "entropy": 1.7200748125712078, + "epoch": 0.3730740710224932, + "grad_norm": 0.9002476930618286, + "learning_rate": 1.8869294997555604e-05, + "loss": 1.3866, + "mean_token_accuracy": 0.6646259625752767, + "num_tokens": 570546912.0, + "step": 3396 + }, + { + "entropy": 1.6706381837526958, + "epoch": 0.3731839279338661, + "grad_norm": 0.820124626159668, + "learning_rate": 1.8868517898308548e-05, + "loss": 1.4343, + "mean_token_accuracy": 0.6535915782054266, + "num_tokens": 570738384.0, + "step": 3397 + }, + { + "entropy": 1.700315882762273, + "epoch": 0.37329378484523906, + "grad_norm": 0.6420239210128784, + "learning_rate": 1.8867740550026443e-05, + "loss": 1.4069, + "mean_token_accuracy": 0.6586662083864212, + "num_tokens": 570966515.0, + "step": 3398 + }, + { + "entropy": 1.7011422216892242, + "epoch": 0.373403641756612, + "grad_norm": 0.774379312992096, + "learning_rate": 1.8866962952733898e-05, + "loss": 1.4374, + "mean_token_accuracy": 0.6431511243184408, + "num_tokens": 571157353.0, + "step": 3399 + }, + { + "entropy": 1.7771065930525463, + "epoch": 0.37351349866798494, + "grad_norm": 0.6903269290924072, + "learning_rate": 1.886618510645552e-05, + "loss": 1.2827, + "mean_token_accuracy": 0.6651173532009125, + "num_tokens": 571271399.0, + "step": 3400 + }, + { + "entropy": 1.7323172986507416, + "epoch": 0.3736233555793579, + "grad_norm": 0.5979676246643066, + "learning_rate": 1.8865407011215922e-05, + "loss": 1.3776, + "mean_token_accuracy": 0.6612506260474523, + "num_tokens": 571452904.0, + "step": 3401 + }, + { + "entropy": 1.7338752647240956, + "epoch": 0.3737332124907308, + "grad_norm": 0.7211189270019531, + "learning_rate": 1.8864628667039742e-05, + "loss": 1.3743, + "mean_token_accuracy": 0.658988431096077, + "num_tokens": 571594720.0, + "step": 3402 + }, + { + "entropy": 1.6841518382231395, + "epoch": 0.37384306940210377, + "grad_norm": 0.5563689470291138, + "learning_rate": 1.8863850073951608e-05, + "loss": 1.3286, + "mean_token_accuracy": 0.6501269191503525, + "num_tokens": 571787515.0, + "step": 3403 + }, + { + "entropy": 1.7327661911646526, + "epoch": 0.3739529263134767, + "grad_norm": 0.6619580984115601, + "learning_rate": 1.886307123197616e-05, + "loss": 1.5635, + "mean_token_accuracy": 0.6362739006678263, + "num_tokens": 571974947.0, + "step": 3404 + }, + { + "entropy": 1.7072784701983135, + "epoch": 0.37406278322484965, + "grad_norm": 0.6384603381156921, + "learning_rate": 1.8862292141138053e-05, + "loss": 1.3928, + "mean_token_accuracy": 0.6528366059064865, + "num_tokens": 572153036.0, + "step": 3405 + }, + { + "entropy": 1.7138684292634327, + "epoch": 0.3741726401362226, + "grad_norm": 0.6947295069694519, + "learning_rate": 1.8861512801461943e-05, + "loss": 1.3127, + "mean_token_accuracy": 0.6598065594832102, + "num_tokens": 572292952.0, + "step": 3406 + }, + { + "entropy": 1.7150700986385345, + "epoch": 0.37428249704759553, + "grad_norm": 0.711796760559082, + "learning_rate": 1.8860733212972497e-05, + "loss": 1.3518, + "mean_token_accuracy": 0.6606613347927729, + "num_tokens": 572445917.0, + "step": 3407 + }, + { + "entropy": 1.7485672036806743, + "epoch": 0.37439235395896847, + "grad_norm": 0.6744566559791565, + "learning_rate": 1.8859953375694383e-05, + "loss": 1.4157, + "mean_token_accuracy": 0.6547067513068517, + "num_tokens": 572594708.0, + "step": 3408 + }, + { + "entropy": 1.7242934902509053, + "epoch": 0.37450221087034136, + "grad_norm": 0.7786602973937988, + "learning_rate": 1.8859173289652288e-05, + "loss": 1.4001, + "mean_token_accuracy": 0.6599243432283401, + "num_tokens": 572783684.0, + "step": 3409 + }, + { + "entropy": 1.606506069501241, + "epoch": 0.3746120677817143, + "grad_norm": 0.6144716143608093, + "learning_rate": 1.88583929548709e-05, + "loss": 1.3857, + "mean_token_accuracy": 0.6582548320293427, + "num_tokens": 573015722.0, + "step": 3410 + }, + { + "entropy": 1.774994472662608, + "epoch": 0.37472192469308724, + "grad_norm": 0.6956934332847595, + "learning_rate": 1.8857612371374914e-05, + "loss": 1.4739, + "mean_token_accuracy": 0.6409755696853002, + "num_tokens": 573214915.0, + "step": 3411 + }, + { + "entropy": 1.7437300086021423, + "epoch": 0.3748317816044602, + "grad_norm": 0.7915884852409363, + "learning_rate": 1.885683153918904e-05, + "loss": 1.4826, + "mean_token_accuracy": 0.6374993075927099, + "num_tokens": 573455789.0, + "step": 3412 + }, + { + "entropy": 1.663759668668111, + "epoch": 0.3749416385158331, + "grad_norm": 0.7212685942649841, + "learning_rate": 1.8856050458337985e-05, + "loss": 1.3996, + "mean_token_accuracy": 0.6592791775862376, + "num_tokens": 573633232.0, + "step": 3413 + }, + { + "entropy": 1.7127246956030528, + "epoch": 0.37505149542720606, + "grad_norm": 0.5660611391067505, + "learning_rate": 1.885526912884648e-05, + "loss": 1.4225, + "mean_token_accuracy": 0.6511427859465281, + "num_tokens": 573834729.0, + "step": 3414 + }, + { + "entropy": 1.7536275585492451, + "epoch": 0.375161352338579, + "grad_norm": 0.6715665459632874, + "learning_rate": 1.885448755073924e-05, + "loss": 1.2663, + "mean_token_accuracy": 0.6831430196762085, + "num_tokens": 573966929.0, + "step": 3415 + }, + { + "entropy": 1.6836529274781544, + "epoch": 0.37527120924995194, + "grad_norm": 0.6396788358688354, + "learning_rate": 1.8853705724041008e-05, + "loss": 1.5899, + "mean_token_accuracy": 0.6472560266653696, + "num_tokens": 574171428.0, + "step": 3416 + }, + { + "entropy": 1.701483239730199, + "epoch": 0.3753810661613249, + "grad_norm": 0.6624711155891418, + "learning_rate": 1.8852923648776534e-05, + "loss": 1.3481, + "mean_token_accuracy": 0.6699345856904984, + "num_tokens": 574356626.0, + "step": 3417 + }, + { + "entropy": 1.7148587902386982, + "epoch": 0.3754909230726978, + "grad_norm": 0.6820365786552429, + "learning_rate": 1.885214132497056e-05, + "loss": 1.3628, + "mean_token_accuracy": 0.6586700628201166, + "num_tokens": 574479005.0, + "step": 3418 + }, + { + "entropy": 1.6891990701357524, + "epoch": 0.37560077998407077, + "grad_norm": 0.8026529550552368, + "learning_rate": 1.8851358752647855e-05, + "loss": 1.483, + "mean_token_accuracy": 0.6601535677909851, + "num_tokens": 574600042.0, + "step": 3419 + }, + { + "entropy": 1.6742752293745677, + "epoch": 0.3757106368954437, + "grad_norm": 0.7593013048171997, + "learning_rate": 1.885057593183318e-05, + "loss": 1.2939, + "mean_token_accuracy": 0.6685616920391718, + "num_tokens": 574733331.0, + "step": 3420 + }, + { + "entropy": 1.6507653097311656, + "epoch": 0.37582049380681665, + "grad_norm": 0.5844452977180481, + "learning_rate": 1.8849792862551318e-05, + "loss": 1.3512, + "mean_token_accuracy": 0.6571490665276846, + "num_tokens": 574935739.0, + "step": 3421 + }, + { + "entropy": 1.739424576361974, + "epoch": 0.37593035071818953, + "grad_norm": 0.6515421867370605, + "learning_rate": 1.8849009544827048e-05, + "loss": 1.5581, + "mean_token_accuracy": 0.6339648912350336, + "num_tokens": 575141640.0, + "step": 3422 + }, + { + "entropy": 1.7240214546521504, + "epoch": 0.3760402076295625, + "grad_norm": 0.8800962567329407, + "learning_rate": 1.8848225978685163e-05, + "loss": 1.4257, + "mean_token_accuracy": 0.6527638087670008, + "num_tokens": 575274148.0, + "step": 3423 + }, + { + "entropy": 1.728806068499883, + "epoch": 0.3761500645409354, + "grad_norm": 0.6778604388237, + "learning_rate": 1.884744216415046e-05, + "loss": 1.4071, + "mean_token_accuracy": 0.6551901549100876, + "num_tokens": 575411261.0, + "step": 3424 + }, + { + "entropy": 1.6978506247202556, + "epoch": 0.37625992145230835, + "grad_norm": 0.7038581967353821, + "learning_rate": 1.8846658101247748e-05, + "loss": 1.3712, + "mean_token_accuracy": 0.6615935812393824, + "num_tokens": 575587362.0, + "step": 3425 + }, + { + "entropy": 1.6618886888027191, + "epoch": 0.3763697783636813, + "grad_norm": 0.6956599354743958, + "learning_rate": 1.8845873790001848e-05, + "loss": 1.308, + "mean_token_accuracy": 0.6621120274066925, + "num_tokens": 575763635.0, + "step": 3426 + }, + { + "entropy": 1.7019036809603374, + "epoch": 0.37647963527505424, + "grad_norm": 0.7013808488845825, + "learning_rate": 1.8845089230437573e-05, + "loss": 1.3993, + "mean_token_accuracy": 0.6591120461622874, + "num_tokens": 575906834.0, + "step": 3427 + }, + { + "entropy": 1.6568001906077068, + "epoch": 0.3765894921864272, + "grad_norm": 0.654579758644104, + "learning_rate": 1.8844304422579756e-05, + "loss": 1.4497, + "mean_token_accuracy": 0.6660924007495245, + "num_tokens": 576101135.0, + "step": 3428 + }, + { + "entropy": 1.734635551770528, + "epoch": 0.3766993490978001, + "grad_norm": 0.7906395792961121, + "learning_rate": 1.884351936645325e-05, + "loss": 1.3475, + "mean_token_accuracy": 0.6772653212149938, + "num_tokens": 576231678.0, + "step": 3429 + }, + { + "entropy": 1.6919464965661366, + "epoch": 0.37680920600917306, + "grad_norm": 0.7029792666435242, + "learning_rate": 1.8842734062082878e-05, + "loss": 1.5751, + "mean_token_accuracy": 0.6377601400017738, + "num_tokens": 576394578.0, + "step": 3430 + }, + { + "entropy": 1.74019859234492, + "epoch": 0.376919062920546, + "grad_norm": 0.6424586772918701, + "learning_rate": 1.8841948509493517e-05, + "loss": 1.4304, + "mean_token_accuracy": 0.6536309023698171, + "num_tokens": 576599050.0, + "step": 3431 + }, + { + "entropy": 1.7083908418814342, + "epoch": 0.37702891983191894, + "grad_norm": 0.6925437450408936, + "learning_rate": 1.8841162708710015e-05, + "loss": 1.4701, + "mean_token_accuracy": 0.6461886862913767, + "num_tokens": 576788717.0, + "step": 3432 + }, + { + "entropy": 1.7433116436004639, + "epoch": 0.3771387767432919, + "grad_norm": 0.6714274287223816, + "learning_rate": 1.8840376659757247e-05, + "loss": 1.4304, + "mean_token_accuracy": 0.6540505588054657, + "num_tokens": 576958988.0, + "step": 3433 + }, + { + "entropy": 1.765092372894287, + "epoch": 0.3772486336546648, + "grad_norm": 0.7673705220222473, + "learning_rate": 1.8839590362660088e-05, + "loss": 1.5501, + "mean_token_accuracy": 0.6285836895306905, + "num_tokens": 577128512.0, + "step": 3434 + }, + { + "entropy": 1.7895864645640056, + "epoch": 0.37735849056603776, + "grad_norm": 0.6441873908042908, + "learning_rate": 1.8838803817443428e-05, + "loss": 1.5039, + "mean_token_accuracy": 0.6534687529007593, + "num_tokens": 577271620.0, + "step": 3435 + }, + { + "entropy": 1.6606577336788177, + "epoch": 0.37746834747741065, + "grad_norm": 0.6333370804786682, + "learning_rate": 1.8838017024132163e-05, + "loss": 1.4591, + "mean_token_accuracy": 0.6549960821866989, + "num_tokens": 577514456.0, + "step": 3436 + }, + { + "entropy": 1.6238328516483307, + "epoch": 0.3775782043887836, + "grad_norm": 0.6960654854774475, + "learning_rate": 1.883722998275119e-05, + "loss": 1.5595, + "mean_token_accuracy": 0.6362607727448145, + "num_tokens": 577746103.0, + "step": 3437 + }, + { + "entropy": 1.7373960614204407, + "epoch": 0.37768806130015653, + "grad_norm": 0.7334326505661011, + "learning_rate": 1.8836442693325415e-05, + "loss": 1.2604, + "mean_token_accuracy": 0.6687419414520264, + "num_tokens": 577906176.0, + "step": 3438 + }, + { + "entropy": 1.651681274175644, + "epoch": 0.37779791821152947, + "grad_norm": 0.6971881985664368, + "learning_rate": 1.8835655155879765e-05, + "loss": 1.3592, + "mean_token_accuracy": 0.6670136153697968, + "num_tokens": 578070962.0, + "step": 3439 + }, + { + "entropy": 1.7302599747975667, + "epoch": 0.3779077751229024, + "grad_norm": 0.6110210418701172, + "learning_rate": 1.8834867370439158e-05, + "loss": 1.3591, + "mean_token_accuracy": 0.6587302833795547, + "num_tokens": 578248089.0, + "step": 3440 + }, + { + "entropy": 1.7012074490388234, + "epoch": 0.37801763203427535, + "grad_norm": 0.5965494513511658, + "learning_rate": 1.883407933702853e-05, + "loss": 1.4663, + "mean_token_accuracy": 0.6482534607251486, + "num_tokens": 578431091.0, + "step": 3441 + }, + { + "entropy": 1.7487133642037709, + "epoch": 0.3781274889456483, + "grad_norm": 0.7280961275100708, + "learning_rate": 1.8833291055672823e-05, + "loss": 1.3637, + "mean_token_accuracy": 0.6615277727444967, + "num_tokens": 578597636.0, + "step": 3442 + }, + { + "entropy": 1.6576574742794037, + "epoch": 0.37823734585702123, + "grad_norm": 0.7443664073944092, + "learning_rate": 1.883250252639698e-05, + "loss": 1.508, + "mean_token_accuracy": 0.6484198073546091, + "num_tokens": 578781033.0, + "step": 3443 + }, + { + "entropy": 1.7675111095110576, + "epoch": 0.3783472027683942, + "grad_norm": 0.818128228187561, + "learning_rate": 1.883171374922596e-05, + "loss": 1.4656, + "mean_token_accuracy": 0.6324175000190735, + "num_tokens": 578991493.0, + "step": 3444 + }, + { + "entropy": 1.7102212806542714, + "epoch": 0.3784570596797671, + "grad_norm": 0.5803321599960327, + "learning_rate": 1.8830924724184735e-05, + "loss": 1.5241, + "mean_token_accuracy": 0.6512432843446732, + "num_tokens": 579180046.0, + "step": 3445 + }, + { + "entropy": 1.6907643973827362, + "epoch": 0.37856691659114006, + "grad_norm": 0.6002610921859741, + "learning_rate": 1.8830135451298267e-05, + "loss": 1.3832, + "mean_token_accuracy": 0.6592213263114294, + "num_tokens": 579369176.0, + "step": 3446 + }, + { + "entropy": 1.6708094378312428, + "epoch": 0.378676773502513, + "grad_norm": 0.7741876840591431, + "learning_rate": 1.882934593059154e-05, + "loss": 1.3733, + "mean_token_accuracy": 0.6698874334494272, + "num_tokens": 579516065.0, + "step": 3447 + }, + { + "entropy": 1.6863604684670765, + "epoch": 0.37878663041388594, + "grad_norm": 0.5927191972732544, + "learning_rate": 1.8828556162089544e-05, + "loss": 1.3393, + "mean_token_accuracy": 0.6622706055641174, + "num_tokens": 579667258.0, + "step": 3448 + }, + { + "entropy": 1.6480213006337483, + "epoch": 0.3788964873252588, + "grad_norm": 0.6203927397727966, + "learning_rate": 1.882776614581727e-05, + "loss": 1.3284, + "mean_token_accuracy": 0.6719114383061727, + "num_tokens": 579833662.0, + "step": 3449 + }, + { + "entropy": 1.6983853876590729, + "epoch": 0.37900634423663176, + "grad_norm": 0.7154219150543213, + "learning_rate": 1.882697588179973e-05, + "loss": 1.2398, + "mean_token_accuracy": 0.6767720828453699, + "num_tokens": 579961516.0, + "step": 3450 + }, + { + "entropy": 1.6762764751911163, + "epoch": 0.3791162011480047, + "grad_norm": 0.8314480781555176, + "learning_rate": 1.882618537006193e-05, + "loss": 1.331, + "mean_token_accuracy": 0.6671194086472193, + "num_tokens": 580068709.0, + "step": 3451 + }, + { + "entropy": 1.6201636294523876, + "epoch": 0.37922605805937765, + "grad_norm": 0.666167676448822, + "learning_rate": 1.8825394610628885e-05, + "loss": 1.2919, + "mean_token_accuracy": 0.6708478977282842, + "num_tokens": 580270556.0, + "step": 3452 + }, + { + "entropy": 1.6574621001879375, + "epoch": 0.3793359149707506, + "grad_norm": 0.6271048188209534, + "learning_rate": 1.882460360352563e-05, + "loss": 1.4808, + "mean_token_accuracy": 0.6487952421108881, + "num_tokens": 580449275.0, + "step": 3453 + }, + { + "entropy": 1.6838637391726177, + "epoch": 0.37944577188212353, + "grad_norm": 0.7442733645439148, + "learning_rate": 1.8823812348777194e-05, + "loss": 1.4904, + "mean_token_accuracy": 0.6588171670834223, + "num_tokens": 580589870.0, + "step": 3454 + }, + { + "entropy": 1.7033161123593648, + "epoch": 0.37955562879349647, + "grad_norm": 0.6353382468223572, + "learning_rate": 1.8823020846408624e-05, + "loss": 1.3264, + "mean_token_accuracy": 0.6653269082307816, + "num_tokens": 580750981.0, + "step": 3455 + }, + { + "entropy": 1.6577220559120178, + "epoch": 0.3796654857048694, + "grad_norm": 0.7376974821090698, + "learning_rate": 1.8822229096444974e-05, + "loss": 1.3135, + "mean_token_accuracy": 0.6758194168408712, + "num_tokens": 580903947.0, + "step": 3456 + }, + { + "entropy": 1.713914414246877, + "epoch": 0.37977534261624235, + "grad_norm": 0.6808127164840698, + "learning_rate": 1.882143709891129e-05, + "loss": 1.4825, + "mean_token_accuracy": 0.6571186631917953, + "num_tokens": 581099884.0, + "step": 3457 + }, + { + "entropy": 1.7346510489781697, + "epoch": 0.3798851995276153, + "grad_norm": 0.8213891983032227, + "learning_rate": 1.882064485383265e-05, + "loss": 1.656, + "mean_token_accuracy": 0.6358093395829201, + "num_tokens": 581266800.0, + "step": 3458 + }, + { + "entropy": 1.6873709658781688, + "epoch": 0.37999505643898823, + "grad_norm": 0.6150254607200623, + "learning_rate": 1.8819852361234122e-05, + "loss": 1.3555, + "mean_token_accuracy": 0.6584896544615427, + "num_tokens": 581444967.0, + "step": 3459 + }, + { + "entropy": 1.65604763229688, + "epoch": 0.3801049133503612, + "grad_norm": 0.6561197638511658, + "learning_rate": 1.8819059621140795e-05, + "loss": 1.2817, + "mean_token_accuracy": 0.6694964021444321, + "num_tokens": 581564226.0, + "step": 3460 + }, + { + "entropy": 1.727463076512019, + "epoch": 0.3802147702617341, + "grad_norm": 0.8681771159172058, + "learning_rate": 1.8818266633577754e-05, + "loss": 1.505, + "mean_token_accuracy": 0.633656973640124, + "num_tokens": 581760317.0, + "step": 3461 + }, + { + "entropy": 1.6862981617450714, + "epoch": 0.38032462717310705, + "grad_norm": 0.7154708504676819, + "learning_rate": 1.8817473398570093e-05, + "loss": 1.3987, + "mean_token_accuracy": 0.6559828768173853, + "num_tokens": 581924558.0, + "step": 3462 + }, + { + "entropy": 1.7164887289206188, + "epoch": 0.38043448408447994, + "grad_norm": 0.6370391249656677, + "learning_rate": 1.8816679916142926e-05, + "loss": 1.4157, + "mean_token_accuracy": 0.6538357237974802, + "num_tokens": 582107450.0, + "step": 3463 + }, + { + "entropy": 1.7019615570704143, + "epoch": 0.3805443409958529, + "grad_norm": 0.6402043700218201, + "learning_rate": 1.881588618632136e-05, + "loss": 1.3387, + "mean_token_accuracy": 0.6574279069900513, + "num_tokens": 582239856.0, + "step": 3464 + }, + { + "entropy": 1.7193419933319092, + "epoch": 0.3806541979072258, + "grad_norm": 0.6499764323234558, + "learning_rate": 1.8815092209130517e-05, + "loss": 1.3768, + "mean_token_accuracy": 0.6483793556690216, + "num_tokens": 582387148.0, + "step": 3465 + }, + { + "entropy": 1.7380633453528087, + "epoch": 0.38076405481859876, + "grad_norm": 0.637168824672699, + "learning_rate": 1.881429798459553e-05, + "loss": 1.549, + "mean_token_accuracy": 0.6460103044907252, + "num_tokens": 582581215.0, + "step": 3466 + }, + { + "entropy": 1.7223777274290721, + "epoch": 0.3808739117299717, + "grad_norm": 0.8232377171516418, + "learning_rate": 1.881350351274153e-05, + "loss": 1.4829, + "mean_token_accuracy": 0.6414266675710678, + "num_tokens": 582737460.0, + "step": 3467 + }, + { + "entropy": 1.715603917837143, + "epoch": 0.38098376864134464, + "grad_norm": 0.7367724776268005, + "learning_rate": 1.8812708793593665e-05, + "loss": 1.4147, + "mean_token_accuracy": 0.6520026822884878, + "num_tokens": 582887078.0, + "step": 3468 + }, + { + "entropy": 1.6945099035898845, + "epoch": 0.3810936255527176, + "grad_norm": 0.7187338471412659, + "learning_rate": 1.8811913827177086e-05, + "loss": 1.3723, + "mean_token_accuracy": 0.6605968276659647, + "num_tokens": 583095650.0, + "step": 3469 + }, + { + "entropy": 1.7937320371468861, + "epoch": 0.3812034824640905, + "grad_norm": 0.7050454020500183, + "learning_rate": 1.8811118613516958e-05, + "loss": 1.3214, + "mean_token_accuracy": 0.6556558360656103, + "num_tokens": 583233702.0, + "step": 3470 + }, + { + "entropy": 1.739583859841029, + "epoch": 0.38131333937546347, + "grad_norm": 4.8021721839904785, + "learning_rate": 1.8810323152638442e-05, + "loss": 1.2322, + "mean_token_accuracy": 0.657580296198527, + "num_tokens": 583395812.0, + "step": 3471 + }, + { + "entropy": 1.6864960491657257, + "epoch": 0.3814231962868364, + "grad_norm": 0.767795979976654, + "learning_rate": 1.8809527444566724e-05, + "loss": 1.4271, + "mean_token_accuracy": 0.6574305593967438, + "num_tokens": 583581686.0, + "step": 3472 + }, + { + "entropy": 1.7561549345652263, + "epoch": 0.38153305319820935, + "grad_norm": 0.7138762474060059, + "learning_rate": 1.8808731489326976e-05, + "loss": 1.4787, + "mean_token_accuracy": 0.6449342767397562, + "num_tokens": 583820312.0, + "step": 3473 + }, + { + "entropy": 1.7409043808778126, + "epoch": 0.3816429101095823, + "grad_norm": 0.6710432171821594, + "learning_rate": 1.8807935286944397e-05, + "loss": 1.5292, + "mean_token_accuracy": 0.6325879693031311, + "num_tokens": 584045229.0, + "step": 3474 + }, + { + "entropy": 1.7295817732810974, + "epoch": 0.38175276702095523, + "grad_norm": 0.7256639003753662, + "learning_rate": 1.880713883744418e-05, + "loss": 1.5294, + "mean_token_accuracy": 0.6477811336517334, + "num_tokens": 584199841.0, + "step": 3475 + }, + { + "entropy": 1.6922647754351299, + "epoch": 0.3818626239323281, + "grad_norm": 0.7137476801872253, + "learning_rate": 1.8806342140851545e-05, + "loss": 1.2719, + "mean_token_accuracy": 0.6746822595596313, + "num_tokens": 584302842.0, + "step": 3476 + }, + { + "entropy": 1.741408884525299, + "epoch": 0.38197248084370106, + "grad_norm": 0.6547417044639587, + "learning_rate": 1.880554519719169e-05, + "loss": 1.4459, + "mean_token_accuracy": 0.6518658250570297, + "num_tokens": 584533354.0, + "step": 3477 + }, + { + "entropy": 1.6792203883330028, + "epoch": 0.382082337755074, + "grad_norm": 0.6794640421867371, + "learning_rate": 1.8804748006489852e-05, + "loss": 1.4004, + "mean_token_accuracy": 0.6527031362056732, + "num_tokens": 584699604.0, + "step": 3478 + }, + { + "entropy": 1.7532505889733632, + "epoch": 0.38219219466644694, + "grad_norm": 0.7683124542236328, + "learning_rate": 1.880395056877126e-05, + "loss": 1.4457, + "mean_token_accuracy": 0.6515724509954453, + "num_tokens": 584869532.0, + "step": 3479 + }, + { + "entropy": 1.7253733774026234, + "epoch": 0.3823020515778199, + "grad_norm": 0.6482527256011963, + "learning_rate": 1.880315288406114e-05, + "loss": 1.5189, + "mean_token_accuracy": 0.6325220863024393, + "num_tokens": 585042675.0, + "step": 3480 + }, + { + "entropy": 1.7039113640785217, + "epoch": 0.3824119084891928, + "grad_norm": 0.6514295935630798, + "learning_rate": 1.8802354952384753e-05, + "loss": 1.4191, + "mean_token_accuracy": 0.6613591512044271, + "num_tokens": 585205823.0, + "step": 3481 + }, + { + "entropy": 1.6615497569243114, + "epoch": 0.38252176540056576, + "grad_norm": 0.7592434883117676, + "learning_rate": 1.8801556773767348e-05, + "loss": 1.2488, + "mean_token_accuracy": 0.6740445991357168, + "num_tokens": 585359134.0, + "step": 3482 + }, + { + "entropy": 1.730480541785558, + "epoch": 0.3826316223119387, + "grad_norm": 0.7494388818740845, + "learning_rate": 1.8800758348234184e-05, + "loss": 1.4356, + "mean_token_accuracy": 0.6558689872423807, + "num_tokens": 585562435.0, + "step": 3483 + }, + { + "entropy": 1.7431610922018688, + "epoch": 0.38274147922331164, + "grad_norm": 0.6161172986030579, + "learning_rate": 1.8799959675810537e-05, + "loss": 1.3557, + "mean_token_accuracy": 0.655944844086965, + "num_tokens": 585766828.0, + "step": 3484 + }, + { + "entropy": 1.7082973023255665, + "epoch": 0.3828513361346846, + "grad_norm": 1.0127819776535034, + "learning_rate": 1.8799160756521678e-05, + "loss": 1.2512, + "mean_token_accuracy": 0.6891203025976816, + "num_tokens": 585899197.0, + "step": 3485 + }, + { + "entropy": 1.7385966678460438, + "epoch": 0.3829611930460575, + "grad_norm": 0.7442635893821716, + "learning_rate": 1.8798361590392894e-05, + "loss": 1.4916, + "mean_token_accuracy": 0.6403134316205978, + "num_tokens": 586067153.0, + "step": 3486 + }, + { + "entropy": 1.72092600663503, + "epoch": 0.38307104995743047, + "grad_norm": 0.6867280006408691, + "learning_rate": 1.8797562177449483e-05, + "loss": 1.3125, + "mean_token_accuracy": 0.6737407147884369, + "num_tokens": 586200744.0, + "step": 3487 + }, + { + "entropy": 1.6985297699769337, + "epoch": 0.3831809068688034, + "grad_norm": 0.6545002460479736, + "learning_rate": 1.879676251771674e-05, + "loss": 1.5108, + "mean_token_accuracy": 0.632008487979571, + "num_tokens": 586382656.0, + "step": 3488 + }, + { + "entropy": 1.665940374135971, + "epoch": 0.38329076378017635, + "grad_norm": 0.7648383975028992, + "learning_rate": 1.879596261121998e-05, + "loss": 1.4246, + "mean_token_accuracy": 0.6555665085713068, + "num_tokens": 586614246.0, + "step": 3489 + }, + { + "entropy": 1.6751268605391185, + "epoch": 0.38340062069154923, + "grad_norm": 0.6534166932106018, + "learning_rate": 1.8795162457984516e-05, + "loss": 1.4129, + "mean_token_accuracy": 0.6518707672754923, + "num_tokens": 586793947.0, + "step": 3490 + }, + { + "entropy": 1.6860232551892598, + "epoch": 0.3835104776029222, + "grad_norm": 0.697482705116272, + "learning_rate": 1.8794362058035665e-05, + "loss": 1.1756, + "mean_token_accuracy": 0.6882057338953018, + "num_tokens": 586907081.0, + "step": 3491 + }, + { + "entropy": 1.6990710695584614, + "epoch": 0.3836203345142951, + "grad_norm": 0.559978187084198, + "learning_rate": 1.879356141139878e-05, + "loss": 1.5079, + "mean_token_accuracy": 0.6403456131617228, + "num_tokens": 587108420.0, + "step": 3492 + }, + { + "entropy": 1.7045027613639832, + "epoch": 0.38373019142566805, + "grad_norm": 0.6749347448348999, + "learning_rate": 1.879276051809918e-05, + "loss": 1.3578, + "mean_token_accuracy": 0.6670193572839102, + "num_tokens": 587270046.0, + "step": 3493 + }, + { + "entropy": 1.6886097590128581, + "epoch": 0.383840048337041, + "grad_norm": 0.7157772779464722, + "learning_rate": 1.879195937816222e-05, + "loss": 1.3459, + "mean_token_accuracy": 0.6764027178287506, + "num_tokens": 587451691.0, + "step": 3494 + }, + { + "entropy": 1.696417550245921, + "epoch": 0.38394990524841394, + "grad_norm": 0.7377708554267883, + "learning_rate": 1.8791157991613258e-05, + "loss": 1.506, + "mean_token_accuracy": 0.6467360059420267, + "num_tokens": 587615528.0, + "step": 3495 + }, + { + "entropy": 1.7199425995349884, + "epoch": 0.3840597621597869, + "grad_norm": 0.7708967328071594, + "learning_rate": 1.879035635847766e-05, + "loss": 1.4605, + "mean_token_accuracy": 0.6508774061997732, + "num_tokens": 587779213.0, + "step": 3496 + }, + { + "entropy": 1.719101478656133, + "epoch": 0.3841696190711598, + "grad_norm": 0.7188828587532043, + "learning_rate": 1.878955447878079e-05, + "loss": 1.6502, + "mean_token_accuracy": 0.6291324868798256, + "num_tokens": 587963491.0, + "step": 3497 + }, + { + "entropy": 1.6726809938748677, + "epoch": 0.38427947598253276, + "grad_norm": 0.8379467725753784, + "learning_rate": 1.8788752352548032e-05, + "loss": 1.4745, + "mean_token_accuracy": 0.6412243594725927, + "num_tokens": 588138029.0, + "step": 3498 + }, + { + "entropy": 1.7113063037395477, + "epoch": 0.3843893328939057, + "grad_norm": 0.6474940180778503, + "learning_rate": 1.8787949979804773e-05, + "loss": 1.4364, + "mean_token_accuracy": 0.6462200383345286, + "num_tokens": 588299515.0, + "step": 3499 + }, + { + "entropy": 1.7047918836275737, + "epoch": 0.38449918980527864, + "grad_norm": 0.8152151703834534, + "learning_rate": 1.8787147360576407e-05, + "loss": 1.421, + "mean_token_accuracy": 0.6530329436063766, + "num_tokens": 588460227.0, + "step": 3500 + }, + { + "entropy": 1.68595157066981, + "epoch": 0.3846090467166516, + "grad_norm": 0.6358811855316162, + "learning_rate": 1.8786344494888334e-05, + "loss": 1.3389, + "mean_token_accuracy": 0.6701284398635229, + "num_tokens": 588603997.0, + "step": 3501 + }, + { + "entropy": 1.6604685087998707, + "epoch": 0.3847189036280245, + "grad_norm": 0.6849839091300964, + "learning_rate": 1.8785541382765963e-05, + "loss": 1.2876, + "mean_token_accuracy": 0.6715737382570902, + "num_tokens": 588756310.0, + "step": 3502 + }, + { + "entropy": 1.7432369391123455, + "epoch": 0.3848287605393974, + "grad_norm": 0.7382224202156067, + "learning_rate": 1.8784738024234724e-05, + "loss": 1.335, + "mean_token_accuracy": 0.6696681876977285, + "num_tokens": 588952647.0, + "step": 3503 + }, + { + "entropy": 1.7226824462413788, + "epoch": 0.38493861745077035, + "grad_norm": 0.7350408434867859, + "learning_rate": 1.8783934419320026e-05, + "loss": 1.4502, + "mean_token_accuracy": 0.652747223774592, + "num_tokens": 589164790.0, + "step": 3504 + }, + { + "entropy": 1.7092638711134593, + "epoch": 0.3850484743621433, + "grad_norm": 0.7419540286064148, + "learning_rate": 1.8783130568047317e-05, + "loss": 1.2935, + "mean_token_accuracy": 0.6710209945837656, + "num_tokens": 589299732.0, + "step": 3505 + }, + { + "entropy": 1.7275305191675823, + "epoch": 0.38515833127351623, + "grad_norm": 0.6263718008995056, + "learning_rate": 1.878232647044203e-05, + "loss": 1.395, + "mean_token_accuracy": 0.6440123667319616, + "num_tokens": 589477221.0, + "step": 3506 + }, + { + "entropy": 1.691060076157252, + "epoch": 0.38526818818488917, + "grad_norm": 0.6086033582687378, + "learning_rate": 1.8781522126529615e-05, + "loss": 1.3346, + "mean_token_accuracy": 0.6638441930214564, + "num_tokens": 589632490.0, + "step": 3507 + }, + { + "entropy": 1.7170771658420563, + "epoch": 0.3853780450962621, + "grad_norm": 0.6135653853416443, + "learning_rate": 1.8780717536335534e-05, + "loss": 1.3926, + "mean_token_accuracy": 0.6520104904969534, + "num_tokens": 589837072.0, + "step": 3508 + }, + { + "entropy": 1.7135487794876099, + "epoch": 0.38548790200763505, + "grad_norm": 0.8644580841064453, + "learning_rate": 1.877991269988525e-05, + "loss": 1.5439, + "mean_token_accuracy": 0.6479515383640925, + "num_tokens": 590037145.0, + "step": 3509 + }, + { + "entropy": 1.656055251757304, + "epoch": 0.385597758919008, + "grad_norm": 0.6589810252189636, + "learning_rate": 1.8779107617204232e-05, + "loss": 1.3376, + "mean_token_accuracy": 0.6675926595926285, + "num_tokens": 590181728.0, + "step": 3510 + }, + { + "entropy": 1.6446336209774017, + "epoch": 0.38570761583038093, + "grad_norm": 0.7715820074081421, + "learning_rate": 1.8778302288317965e-05, + "loss": 1.42, + "mean_token_accuracy": 0.6649827063083649, + "num_tokens": 590345528.0, + "step": 3511 + }, + { + "entropy": 1.7644979854424794, + "epoch": 0.3858174727417539, + "grad_norm": 0.6795924305915833, + "learning_rate": 1.8777496713251937e-05, + "loss": 1.5448, + "mean_token_accuracy": 0.6330472528934479, + "num_tokens": 590543297.0, + "step": 3512 + }, + { + "entropy": 1.7632849017779033, + "epoch": 0.3859273296531268, + "grad_norm": 0.8066057562828064, + "learning_rate": 1.8776690892031642e-05, + "loss": 1.2179, + "mean_token_accuracy": 0.6771250069141388, + "num_tokens": 590649907.0, + "step": 3513 + }, + { + "entropy": 1.6934345563252766, + "epoch": 0.38603718656449976, + "grad_norm": 0.6281071901321411, + "learning_rate": 1.877588482468258e-05, + "loss": 1.3695, + "mean_token_accuracy": 0.6520146181186041, + "num_tokens": 590816034.0, + "step": 3514 + }, + { + "entropy": 1.711225817600886, + "epoch": 0.3861470434758727, + "grad_norm": 0.8094905614852905, + "learning_rate": 1.8775078511230275e-05, + "loss": 1.3598, + "mean_token_accuracy": 0.6714527507623037, + "num_tokens": 590967779.0, + "step": 3515 + }, + { + "entropy": 1.7611852586269379, + "epoch": 0.38625690038724564, + "grad_norm": 0.7497817873954773, + "learning_rate": 1.877427195170023e-05, + "loss": 1.4466, + "mean_token_accuracy": 0.6378799378871918, + "num_tokens": 591115206.0, + "step": 3516 + }, + { + "entropy": 1.7012092570463817, + "epoch": 0.3863667572986185, + "grad_norm": 0.7083910703659058, + "learning_rate": 1.8773465146117988e-05, + "loss": 1.3734, + "mean_token_accuracy": 0.6618959506352743, + "num_tokens": 591263829.0, + "step": 3517 + }, + { + "entropy": 1.697861025730769, + "epoch": 0.38647661420999146, + "grad_norm": 0.6678640842437744, + "learning_rate": 1.8772658094509072e-05, + "loss": 1.419, + "mean_token_accuracy": 0.6634480754534403, + "num_tokens": 591459207.0, + "step": 3518 + }, + { + "entropy": 1.7712201476097107, + "epoch": 0.3865864711213644, + "grad_norm": 0.7492165565490723, + "learning_rate": 1.8771850796899034e-05, + "loss": 1.2713, + "mean_token_accuracy": 0.6765512228012085, + "num_tokens": 591593988.0, + "step": 3519 + }, + { + "entropy": 1.6945832471052806, + "epoch": 0.38669632803273735, + "grad_norm": 0.7029894590377808, + "learning_rate": 1.877104325331342e-05, + "loss": 1.4178, + "mean_token_accuracy": 0.6622582574685415, + "num_tokens": 591758058.0, + "step": 3520 + }, + { + "entropy": 1.6844372848669689, + "epoch": 0.3868061849441103, + "grad_norm": 0.6502472758293152, + "learning_rate": 1.8770235463777784e-05, + "loss": 1.3107, + "mean_token_accuracy": 0.6579713672399521, + "num_tokens": 591902067.0, + "step": 3521 + }, + { + "entropy": 1.698314368724823, + "epoch": 0.38691604185548323, + "grad_norm": 0.8369100093841553, + "learning_rate": 1.87694274283177e-05, + "loss": 1.3315, + "mean_token_accuracy": 0.6667650043964386, + "num_tokens": 592026730.0, + "step": 3522 + }, + { + "entropy": 1.6990590989589691, + "epoch": 0.38702589876685617, + "grad_norm": 0.757598876953125, + "learning_rate": 1.8768619146958736e-05, + "loss": 1.3108, + "mean_token_accuracy": 0.6711003084977468, + "num_tokens": 592159351.0, + "step": 3523 + }, + { + "entropy": 1.6836991906166077, + "epoch": 0.3871357556782291, + "grad_norm": 0.6812123656272888, + "learning_rate": 1.8767810619726486e-05, + "loss": 1.5014, + "mean_token_accuracy": 0.6574613849322001, + "num_tokens": 592313020.0, + "step": 3524 + }, + { + "entropy": 1.6621138453483582, + "epoch": 0.38724561258960205, + "grad_norm": 0.7579445242881775, + "learning_rate": 1.8767001846646522e-05, + "loss": 1.3334, + "mean_token_accuracy": 0.6703773736953735, + "num_tokens": 592465715.0, + "step": 3525 + }, + { + "entropy": 1.7132685979207356, + "epoch": 0.387355469500975, + "grad_norm": 0.683297872543335, + "learning_rate": 1.876619282774445e-05, + "loss": 1.5687, + "mean_token_accuracy": 0.6472751895586649, + "num_tokens": 592714454.0, + "step": 3526 + }, + { + "entropy": 1.6917611062526703, + "epoch": 0.38746532641234793, + "grad_norm": 0.7978048920631409, + "learning_rate": 1.876538356304588e-05, + "loss": 1.4326, + "mean_token_accuracy": 0.6595585942268372, + "num_tokens": 592855814.0, + "step": 3527 + }, + { + "entropy": 1.7584986786047618, + "epoch": 0.3875751833237209, + "grad_norm": 0.6933776140213013, + "learning_rate": 1.876457405257641e-05, + "loss": 1.2829, + "mean_token_accuracy": 0.6754846076170603, + "num_tokens": 592996822.0, + "step": 3528 + }, + { + "entropy": 1.679332544406255, + "epoch": 0.3876850402350938, + "grad_norm": 0.6294096112251282, + "learning_rate": 1.8763764296361676e-05, + "loss": 1.2627, + "mean_token_accuracy": 0.6760277499755224, + "num_tokens": 593154964.0, + "step": 3529 + }, + { + "entropy": 1.704353282848994, + "epoch": 0.3877948971464667, + "grad_norm": 0.65788334608078, + "learning_rate": 1.8762954294427298e-05, + "loss": 1.41, + "mean_token_accuracy": 0.6482875148455302, + "num_tokens": 593295801.0, + "step": 3530 + }, + { + "entropy": 1.7058403293291728, + "epoch": 0.38790475405783964, + "grad_norm": 0.7336824536323547, + "learning_rate": 1.8762144046798917e-05, + "loss": 1.4683, + "mean_token_accuracy": 0.6580928464730581, + "num_tokens": 593505141.0, + "step": 3531 + }, + { + "entropy": 1.7358074982961018, + "epoch": 0.3880146109692126, + "grad_norm": 0.7603702545166016, + "learning_rate": 1.8761333553502173e-05, + "loss": 1.3445, + "mean_token_accuracy": 0.6589676340421041, + "num_tokens": 593635482.0, + "step": 3532 + }, + { + "entropy": 1.7584581673145294, + "epoch": 0.3881244678805855, + "grad_norm": 0.6100241541862488, + "learning_rate": 1.8760522814562723e-05, + "loss": 1.5353, + "mean_token_accuracy": 0.6196905672550201, + "num_tokens": 593889864.0, + "step": 3533 + }, + { + "entropy": 1.709738661845525, + "epoch": 0.38823432479195846, + "grad_norm": 0.6657153964042664, + "learning_rate": 1.875971183000622e-05, + "loss": 1.3076, + "mean_token_accuracy": 0.6685143858194351, + "num_tokens": 594063330.0, + "step": 3534 + }, + { + "entropy": 1.6752463181813557, + "epoch": 0.3883441817033314, + "grad_norm": 0.62481689453125, + "learning_rate": 1.8758900599858333e-05, + "loss": 1.299, + "mean_token_accuracy": 0.6609266599019369, + "num_tokens": 594265034.0, + "step": 3535 + }, + { + "entropy": 1.7667845884958904, + "epoch": 0.38845403861470434, + "grad_norm": 0.7150773406028748, + "learning_rate": 1.875808912414474e-05, + "loss": 1.4817, + "mean_token_accuracy": 0.6347835808992386, + "num_tokens": 594428055.0, + "step": 3536 + }, + { + "entropy": 1.74485116203626, + "epoch": 0.3885638955260773, + "grad_norm": 0.6251989006996155, + "learning_rate": 1.8757277402891118e-05, + "loss": 1.405, + "mean_token_accuracy": 0.6552664488554001, + "num_tokens": 594605592.0, + "step": 3537 + }, + { + "entropy": 1.7133037547270458, + "epoch": 0.3886737524374502, + "grad_norm": 0.695165753364563, + "learning_rate": 1.8756465436123167e-05, + "loss": 1.3105, + "mean_token_accuracy": 0.670314704378446, + "num_tokens": 594744857.0, + "step": 3538 + }, + { + "entropy": 1.7227947811285655, + "epoch": 0.38878360934882317, + "grad_norm": 0.7755094766616821, + "learning_rate": 1.875565322386658e-05, + "loss": 1.3068, + "mean_token_accuracy": 0.6800702015558878, + "num_tokens": 594938776.0, + "step": 3539 + }, + { + "entropy": 1.6865267256895702, + "epoch": 0.3888934662601961, + "grad_norm": 0.671947181224823, + "learning_rate": 1.875484076614706e-05, + "loss": 1.4058, + "mean_token_accuracy": 0.6469751199086508, + "num_tokens": 595122731.0, + "step": 3540 + }, + { + "entropy": 1.6733566025892894, + "epoch": 0.38900332317156905, + "grad_norm": 0.6642799377441406, + "learning_rate": 1.8754028062990327e-05, + "loss": 1.3554, + "mean_token_accuracy": 0.6744746913512548, + "num_tokens": 595306659.0, + "step": 3541 + }, + { + "entropy": 1.7339465618133545, + "epoch": 0.389113180082942, + "grad_norm": 0.7320308089256287, + "learning_rate": 1.8753215114422096e-05, + "loss": 1.3023, + "mean_token_accuracy": 0.6688550561666489, + "num_tokens": 595470855.0, + "step": 3542 + }, + { + "entropy": 1.7501880327860515, + "epoch": 0.38922303699431493, + "grad_norm": 0.8129941821098328, + "learning_rate": 1.8752401920468105e-05, + "loss": 1.4079, + "mean_token_accuracy": 0.656624640027682, + "num_tokens": 595645644.0, + "step": 3543 + }, + { + "entropy": 1.7234038313229878, + "epoch": 0.3893328939056878, + "grad_norm": 0.6668652892112732, + "learning_rate": 1.8751588481154083e-05, + "loss": 1.4884, + "mean_token_accuracy": 0.6411069482564926, + "num_tokens": 595842197.0, + "step": 3544 + }, + { + "entropy": 1.6663442055384319, + "epoch": 0.38944275081706076, + "grad_norm": 0.6142482757568359, + "learning_rate": 1.875077479650578e-05, + "loss": 1.4848, + "mean_token_accuracy": 0.6513569702704748, + "num_tokens": 596060521.0, + "step": 3545 + }, + { + "entropy": 1.6824649969736736, + "epoch": 0.3895526077284337, + "grad_norm": 0.6500999331474304, + "learning_rate": 1.8749960866548948e-05, + "loss": 1.3553, + "mean_token_accuracy": 0.6778768996397654, + "num_tokens": 596237180.0, + "step": 3546 + }, + { + "entropy": 1.6939981679121654, + "epoch": 0.38966246463980664, + "grad_norm": 0.6637330055236816, + "learning_rate": 1.8749146691309347e-05, + "loss": 1.4655, + "mean_token_accuracy": 0.6524067719777426, + "num_tokens": 596402651.0, + "step": 3547 + }, + { + "entropy": 1.7055251995722454, + "epoch": 0.3897723215511796, + "grad_norm": 0.7360928058624268, + "learning_rate": 1.8748332270812746e-05, + "loss": 1.3932, + "mean_token_accuracy": 0.6488986412684122, + "num_tokens": 596604743.0, + "step": 3548 + }, + { + "entropy": 1.7154695093631744, + "epoch": 0.3898821784625525, + "grad_norm": 0.7440617084503174, + "learning_rate": 1.8747517605084914e-05, + "loss": 1.3314, + "mean_token_accuracy": 0.6643383254607519, + "num_tokens": 596728567.0, + "step": 3549 + }, + { + "entropy": 1.6730522513389587, + "epoch": 0.38999203537392546, + "grad_norm": 0.6638359427452087, + "learning_rate": 1.8746702694151645e-05, + "loss": 1.3219, + "mean_token_accuracy": 0.6653886139392853, + "num_tokens": 596933860.0, + "step": 3550 + }, + { + "entropy": 1.6828788320223491, + "epoch": 0.3901018922852984, + "grad_norm": 0.6851414442062378, + "learning_rate": 1.8745887538038727e-05, + "loss": 1.3891, + "mean_token_accuracy": 0.6566885908444723, + "num_tokens": 597094207.0, + "step": 3551 + }, + { + "entropy": 1.7113615274429321, + "epoch": 0.39021174919667134, + "grad_norm": 0.7445501089096069, + "learning_rate": 1.874507213677196e-05, + "loss": 1.3147, + "mean_token_accuracy": 0.6675726721684138, + "num_tokens": 597232948.0, + "step": 3552 + }, + { + "entropy": 1.7263106803099315, + "epoch": 0.3903216061080443, + "grad_norm": 0.5610165596008301, + "learning_rate": 1.8744256490377147e-05, + "loss": 1.4161, + "mean_token_accuracy": 0.6461490740378698, + "num_tokens": 597413908.0, + "step": 3553 + }, + { + "entropy": 1.6911349991957347, + "epoch": 0.3904314630194172, + "grad_norm": 0.7104760408401489, + "learning_rate": 1.874344059888011e-05, + "loss": 1.433, + "mean_token_accuracy": 0.6454216440518697, + "num_tokens": 597659679.0, + "step": 3554 + }, + { + "entropy": 1.7260303298632305, + "epoch": 0.39054131993079017, + "grad_norm": 0.7172141671180725, + "learning_rate": 1.874262446230666e-05, + "loss": 1.295, + "mean_token_accuracy": 0.6699913293123245, + "num_tokens": 597778281.0, + "step": 3555 + }, + { + "entropy": 1.6977204084396362, + "epoch": 0.3906511768421631, + "grad_norm": 0.636026918888092, + "learning_rate": 1.8741808080682642e-05, + "loss": 1.3092, + "mean_token_accuracy": 0.6655734032392502, + "num_tokens": 597910822.0, + "step": 3556 + }, + { + "entropy": 1.7228349049886067, + "epoch": 0.39076103375353605, + "grad_norm": 0.7579364776611328, + "learning_rate": 1.8740991454033883e-05, + "loss": 1.45, + "mean_token_accuracy": 0.6555042515198389, + "num_tokens": 598051246.0, + "step": 3557 + }, + { + "entropy": 1.715238094329834, + "epoch": 0.39087089066490893, + "grad_norm": 0.7158708572387695, + "learning_rate": 1.8740174582386234e-05, + "loss": 1.3264, + "mean_token_accuracy": 0.6576440383990606, + "num_tokens": 598170261.0, + "step": 3558 + }, + { + "entropy": 1.7376553813616435, + "epoch": 0.3909807475762819, + "grad_norm": 0.8242320418357849, + "learning_rate": 1.8739357465765547e-05, + "loss": 1.3275, + "mean_token_accuracy": 0.6688285072644552, + "num_tokens": 598289904.0, + "step": 3559 + }, + { + "entropy": 1.6719888945420582, + "epoch": 0.3910906044876548, + "grad_norm": 0.603971004486084, + "learning_rate": 1.8738540104197683e-05, + "loss": 1.5734, + "mean_token_accuracy": 0.6278845717509588, + "num_tokens": 598516225.0, + "step": 3560 + }, + { + "entropy": 1.697850485642751, + "epoch": 0.39120046139902775, + "grad_norm": 0.61806720495224, + "learning_rate": 1.873772249770851e-05, + "loss": 1.5395, + "mean_token_accuracy": 0.6368564814329147, + "num_tokens": 598787097.0, + "step": 3561 + }, + { + "entropy": 1.6998209357261658, + "epoch": 0.3913103183104007, + "grad_norm": 0.6823562979698181, + "learning_rate": 1.873690464632391e-05, + "loss": 1.4176, + "mean_token_accuracy": 0.6531643867492676, + "num_tokens": 598950071.0, + "step": 3562 + }, + { + "entropy": 1.7084301312764485, + "epoch": 0.39142017522177364, + "grad_norm": 0.7508410811424255, + "learning_rate": 1.8736086550069766e-05, + "loss": 1.5139, + "mean_token_accuracy": 0.6545840700467428, + "num_tokens": 599121424.0, + "step": 3563 + }, + { + "entropy": 1.7422731916109722, + "epoch": 0.3915300321331466, + "grad_norm": 0.6909976601600647, + "learning_rate": 1.8735268208971965e-05, + "loss": 1.496, + "mean_token_accuracy": 0.639715259273847, + "num_tokens": 599284329.0, + "step": 3564 + }, + { + "entropy": 1.6988299985726674, + "epoch": 0.3916398890445195, + "grad_norm": 0.728016197681427, + "learning_rate": 1.873444962305641e-05, + "loss": 1.274, + "mean_token_accuracy": 0.6804704517126083, + "num_tokens": 599418243.0, + "step": 3565 + }, + { + "entropy": 1.7136310239632924, + "epoch": 0.39174974595589246, + "grad_norm": 0.623084545135498, + "learning_rate": 1.8733630792349014e-05, + "loss": 1.5038, + "mean_token_accuracy": 0.6375333170096079, + "num_tokens": 599602975.0, + "step": 3566 + }, + { + "entropy": 1.686454842487971, + "epoch": 0.3918596028672654, + "grad_norm": 0.6495208144187927, + "learning_rate": 1.8732811716875684e-05, + "loss": 1.4385, + "mean_token_accuracy": 0.6662272214889526, + "num_tokens": 599821930.0, + "step": 3567 + }, + { + "entropy": 1.7124264140923817, + "epoch": 0.39196945977863834, + "grad_norm": 0.7537272572517395, + "learning_rate": 1.873199239666235e-05, + "loss": 1.5257, + "mean_token_accuracy": 0.6516513874133428, + "num_tokens": 600017465.0, + "step": 3568 + }, + { + "entropy": 1.6850533187389374, + "epoch": 0.3920793166900113, + "grad_norm": 0.6643959879875183, + "learning_rate": 1.8731172831734937e-05, + "loss": 1.2957, + "mean_token_accuracy": 0.6703493893146515, + "num_tokens": 600164676.0, + "step": 3569 + }, + { + "entropy": 1.699459304412206, + "epoch": 0.3921891736013842, + "grad_norm": 0.6547852754592896, + "learning_rate": 1.8730353022119392e-05, + "loss": 1.4598, + "mean_token_accuracy": 0.652552917599678, + "num_tokens": 600314512.0, + "step": 3570 + }, + { + "entropy": 1.6793744961420696, + "epoch": 0.3922990305127571, + "grad_norm": 0.7872046828269958, + "learning_rate": 1.8729532967841657e-05, + "loss": 1.5209, + "mean_token_accuracy": 0.6407067527373632, + "num_tokens": 600560727.0, + "step": 3571 + }, + { + "entropy": 1.7016756534576416, + "epoch": 0.39240888742413005, + "grad_norm": 0.804166853427887, + "learning_rate": 1.8728712668927684e-05, + "loss": 1.5712, + "mean_token_accuracy": 0.6527331074078878, + "num_tokens": 600701171.0, + "step": 3572 + }, + { + "entropy": 1.6971095005671184, + "epoch": 0.392518744335503, + "grad_norm": 0.6559096574783325, + "learning_rate": 1.8727892125403437e-05, + "loss": 1.4343, + "mean_token_accuracy": 0.6504131704568863, + "num_tokens": 600853204.0, + "step": 3573 + }, + { + "entropy": 1.740959644317627, + "epoch": 0.39262860124687593, + "grad_norm": 0.7399430871009827, + "learning_rate": 1.8727071337294892e-05, + "loss": 1.404, + "mean_token_accuracy": 0.6460892607768377, + "num_tokens": 601018363.0, + "step": 3574 + }, + { + "entropy": 1.70658544699351, + "epoch": 0.39273845815824887, + "grad_norm": 0.6616029143333435, + "learning_rate": 1.8726250304628017e-05, + "loss": 1.4447, + "mean_token_accuracy": 0.6470039238532385, + "num_tokens": 601166522.0, + "step": 3575 + }, + { + "entropy": 1.7305179238319397, + "epoch": 0.3928483150696218, + "grad_norm": 0.693975031375885, + "learning_rate": 1.8725429027428802e-05, + "loss": 1.3161, + "mean_token_accuracy": 0.6667521148920059, + "num_tokens": 601345354.0, + "step": 3576 + }, + { + "entropy": 1.7331166168053944, + "epoch": 0.39295817198099475, + "grad_norm": 0.6473891139030457, + "learning_rate": 1.8724607505723236e-05, + "loss": 1.3952, + "mean_token_accuracy": 0.6563832859198252, + "num_tokens": 601489345.0, + "step": 3577 + }, + { + "entropy": 1.7098148167133331, + "epoch": 0.3930680288923677, + "grad_norm": 0.7081977725028992, + "learning_rate": 1.8723785739537328e-05, + "loss": 1.4453, + "mean_token_accuracy": 0.6467587898174921, + "num_tokens": 601633917.0, + "step": 3578 + }, + { + "entropy": 1.6743212342262268, + "epoch": 0.39317788580374063, + "grad_norm": 0.7473645210266113, + "learning_rate": 1.8722963728897078e-05, + "loss": 1.2851, + "mean_token_accuracy": 0.6733796795209249, + "num_tokens": 601771977.0, + "step": 3579 + }, + { + "entropy": 1.6344492137432098, + "epoch": 0.3932877427151136, + "grad_norm": 0.6567934155464172, + "learning_rate": 1.872214147382851e-05, + "loss": 1.2201, + "mean_token_accuracy": 0.6826841433842977, + "num_tokens": 601917258.0, + "step": 3580 + }, + { + "entropy": 1.7242592175801594, + "epoch": 0.3933975996264865, + "grad_norm": 0.7916681170463562, + "learning_rate": 1.872131897435764e-05, + "loss": 1.4052, + "mean_token_accuracy": 0.6629040241241455, + "num_tokens": 602070528.0, + "step": 3581 + }, + { + "entropy": 1.7825438876946766, + "epoch": 0.39350745653785946, + "grad_norm": 0.6252172589302063, + "learning_rate": 1.872049623051051e-05, + "loss": 1.5612, + "mean_token_accuracy": 0.6375692586104075, + "num_tokens": 602269702.0, + "step": 3582 + }, + { + "entropy": 1.7129474182923634, + "epoch": 0.3936173134492324, + "grad_norm": 0.6330097913742065, + "learning_rate": 1.871967324231315e-05, + "loss": 1.3636, + "mean_token_accuracy": 0.6603029817342758, + "num_tokens": 602439795.0, + "step": 3583 + }, + { + "entropy": 1.7473669946193695, + "epoch": 0.39372717036060534, + "grad_norm": 0.6051161885261536, + "learning_rate": 1.871885000979161e-05, + "loss": 1.4629, + "mean_token_accuracy": 0.6398325165112814, + "num_tokens": 602620971.0, + "step": 3584 + }, + { + "entropy": 1.6432409286499023, + "epoch": 0.3938370272719782, + "grad_norm": 0.7886459231376648, + "learning_rate": 1.8718026532971945e-05, + "loss": 1.4551, + "mean_token_accuracy": 0.6793592671553293, + "num_tokens": 602787338.0, + "step": 3585 + }, + { + "entropy": 1.696602314710617, + "epoch": 0.39394688418335116, + "grad_norm": 0.7194052338600159, + "learning_rate": 1.871720281188022e-05, + "loss": 1.3187, + "mean_token_accuracy": 0.6622123072544733, + "num_tokens": 602910036.0, + "step": 3586 + }, + { + "entropy": 1.6780929962793987, + "epoch": 0.3940567410947241, + "grad_norm": 1.4930285215377808, + "learning_rate": 1.87163788465425e-05, + "loss": 1.5072, + "mean_token_accuracy": 0.6442895332972208, + "num_tokens": 603118858.0, + "step": 3587 + }, + { + "entropy": 1.6930700143178303, + "epoch": 0.39416659800609705, + "grad_norm": 0.7970458269119263, + "learning_rate": 1.8715554636984868e-05, + "loss": 1.3497, + "mean_token_accuracy": 0.6615445464849472, + "num_tokens": 603300138.0, + "step": 3588 + }, + { + "entropy": 1.676286409298579, + "epoch": 0.39427645491747, + "grad_norm": 0.7440655827522278, + "learning_rate": 1.871473018323341e-05, + "loss": 1.5556, + "mean_token_accuracy": 0.6397054543097814, + "num_tokens": 603524712.0, + "step": 3589 + }, + { + "entropy": 1.677784413099289, + "epoch": 0.39438631182884293, + "grad_norm": 0.6356014609336853, + "learning_rate": 1.8713905485314216e-05, + "loss": 1.3834, + "mean_token_accuracy": 0.6559326549371084, + "num_tokens": 603680062.0, + "step": 3590 + }, + { + "entropy": 1.7333962221940358, + "epoch": 0.39449616874021587, + "grad_norm": 0.7091386914253235, + "learning_rate": 1.871308054325339e-05, + "loss": 1.3963, + "mean_token_accuracy": 0.6534087806940079, + "num_tokens": 603863458.0, + "step": 3591 + }, + { + "entropy": 1.6834155718485515, + "epoch": 0.3946060256515888, + "grad_norm": 0.6516834497451782, + "learning_rate": 1.871225535707704e-05, + "loss": 1.3878, + "mean_token_accuracy": 0.6710058401028315, + "num_tokens": 603994946.0, + "step": 3592 + }, + { + "entropy": 1.645541141430537, + "epoch": 0.39471588256296175, + "grad_norm": 0.6310259103775024, + "learning_rate": 1.8711429926811285e-05, + "loss": 1.2944, + "mean_token_accuracy": 0.6635814557472864, + "num_tokens": 604168355.0, + "step": 3593 + }, + { + "entropy": 1.7525591452916462, + "epoch": 0.3948257394743347, + "grad_norm": 0.7263670563697815, + "learning_rate": 1.8710604252482244e-05, + "loss": 1.2925, + "mean_token_accuracy": 0.667170450091362, + "num_tokens": 604272321.0, + "step": 3594 + }, + { + "entropy": 1.673738161722819, + "epoch": 0.39493559638570763, + "grad_norm": 0.6252807378768921, + "learning_rate": 1.8709778334116057e-05, + "loss": 1.3787, + "mean_token_accuracy": 0.6551141440868378, + "num_tokens": 604478407.0, + "step": 3595 + }, + { + "entropy": 1.7633110185464222, + "epoch": 0.3950454532970806, + "grad_norm": 0.6537090539932251, + "learning_rate": 1.8708952171738856e-05, + "loss": 1.5101, + "mean_token_accuracy": 0.6308721353610357, + "num_tokens": 604695750.0, + "step": 3596 + }, + { + "entropy": 1.7199938992659252, + "epoch": 0.3951553102084535, + "grad_norm": 0.7916152477264404, + "learning_rate": 1.87081257653768e-05, + "loss": 1.4183, + "mean_token_accuracy": 0.6466862559318542, + "num_tokens": 604842866.0, + "step": 3597 + }, + { + "entropy": 1.6832565764586132, + "epoch": 0.3952651671198264, + "grad_norm": 0.6747387051582336, + "learning_rate": 1.870729911505603e-05, + "loss": 1.2859, + "mean_token_accuracy": 0.6659845014413198, + "num_tokens": 604984494.0, + "step": 3598 + }, + { + "entropy": 1.7077111999193828, + "epoch": 0.39537502403119934, + "grad_norm": 0.6704530715942383, + "learning_rate": 1.8706472220802717e-05, + "loss": 1.4066, + "mean_token_accuracy": 0.6525115470091502, + "num_tokens": 605147587.0, + "step": 3599 + }, + { + "entropy": 1.6967969636122386, + "epoch": 0.3954848809425723, + "grad_norm": 0.7108339071273804, + "learning_rate": 1.8705645082643032e-05, + "loss": 1.3964, + "mean_token_accuracy": 0.6673119068145752, + "num_tokens": 605284305.0, + "step": 3600 + }, + { + "entropy": 1.6884620984395344, + "epoch": 0.3955947378539452, + "grad_norm": 0.6969875693321228, + "learning_rate": 1.8704817700603154e-05, + "loss": 1.406, + "mean_token_accuracy": 0.6541502624750137, + "num_tokens": 605428461.0, + "step": 3601 + }, + { + "entropy": 1.6805487771828969, + "epoch": 0.39570459476531816, + "grad_norm": 0.6379789710044861, + "learning_rate": 1.8703990074709263e-05, + "loss": 1.4033, + "mean_token_accuracy": 0.6565418342749277, + "num_tokens": 605654217.0, + "step": 3602 + }, + { + "entropy": 1.6334502398967743, + "epoch": 0.3958144516766911, + "grad_norm": 0.7022704482078552, + "learning_rate": 1.870316220498756e-05, + "loss": 1.3101, + "mean_token_accuracy": 0.6683827390273412, + "num_tokens": 605810204.0, + "step": 3603 + }, + { + "entropy": 1.6775756180286407, + "epoch": 0.39592430858806404, + "grad_norm": 0.6165929436683655, + "learning_rate": 1.8702334091464246e-05, + "loss": 1.4418, + "mean_token_accuracy": 0.6538349191347758, + "num_tokens": 605976026.0, + "step": 3604 + }, + { + "entropy": 1.6821360886096954, + "epoch": 0.396034165499437, + "grad_norm": 0.6474902033805847, + "learning_rate": 1.8701505734165527e-05, + "loss": 1.3874, + "mean_token_accuracy": 0.6578802863756815, + "num_tokens": 606145063.0, + "step": 3605 + }, + { + "entropy": 1.6627052525679271, + "epoch": 0.3961440224108099, + "grad_norm": 0.6694169044494629, + "learning_rate": 1.870067713311762e-05, + "loss": 1.4774, + "mean_token_accuracy": 0.6556628793478012, + "num_tokens": 606323836.0, + "step": 3606 + }, + { + "entropy": 1.7426823377609253, + "epoch": 0.39625387932218287, + "grad_norm": 0.7143035531044006, + "learning_rate": 1.8699848288346754e-05, + "loss": 1.5342, + "mean_token_accuracy": 0.6395404686530431, + "num_tokens": 606527328.0, + "step": 3607 + }, + { + "entropy": 1.6447638769944508, + "epoch": 0.3963637362335558, + "grad_norm": 0.6995284557342529, + "learning_rate": 1.869901919987916e-05, + "loss": 1.2323, + "mean_token_accuracy": 0.681049590309461, + "num_tokens": 606652449.0, + "step": 3608 + }, + { + "entropy": 1.7393419643243153, + "epoch": 0.39647359314492875, + "grad_norm": 0.7996697425842285, + "learning_rate": 1.8698189867741076e-05, + "loss": 1.312, + "mean_token_accuracy": 0.6633258064587911, + "num_tokens": 606786412.0, + "step": 3609 + }, + { + "entropy": 1.7173935075600941, + "epoch": 0.3965834500563017, + "grad_norm": 0.8304112553596497, + "learning_rate": 1.8697360291958754e-05, + "loss": 1.3365, + "mean_token_accuracy": 0.6675741821527481, + "num_tokens": 606941342.0, + "step": 3610 + }, + { + "entropy": 1.699825793504715, + "epoch": 0.39669330696767463, + "grad_norm": 0.6240324378013611, + "learning_rate": 1.8696530472558443e-05, + "loss": 1.5215, + "mean_token_accuracy": 0.641850084066391, + "num_tokens": 607137523.0, + "step": 3611 + }, + { + "entropy": 1.7180300255616505, + "epoch": 0.3968031638790475, + "grad_norm": 0.730658769607544, + "learning_rate": 1.8695700409566415e-05, + "loss": 1.4504, + "mean_token_accuracy": 0.6601553956667582, + "num_tokens": 607275218.0, + "step": 3612 + }, + { + "entropy": 1.7043689092000325, + "epoch": 0.39691302079042046, + "grad_norm": 0.7190737128257751, + "learning_rate": 1.8694870103008935e-05, + "loss": 1.3154, + "mean_token_accuracy": 0.6722518901030222, + "num_tokens": 607422791.0, + "step": 3613 + }, + { + "entropy": 1.710739016532898, + "epoch": 0.3970228777017934, + "grad_norm": 0.6662858128547668, + "learning_rate": 1.8694039552912284e-05, + "loss": 1.3454, + "mean_token_accuracy": 0.6655974884827932, + "num_tokens": 607574450.0, + "step": 3614 + }, + { + "entropy": 1.7331662873427074, + "epoch": 0.39713273461316634, + "grad_norm": 0.6620702743530273, + "learning_rate": 1.8693208759302747e-05, + "loss": 1.3848, + "mean_token_accuracy": 0.64388441046079, + "num_tokens": 607709925.0, + "step": 3615 + }, + { + "entropy": 1.6359326243400574, + "epoch": 0.3972425915245393, + "grad_norm": 0.6715786457061768, + "learning_rate": 1.869237772220662e-05, + "loss": 1.32, + "mean_token_accuracy": 0.6786454369624456, + "num_tokens": 607877472.0, + "step": 3616 + }, + { + "entropy": 1.742279092470805, + "epoch": 0.3973524484359122, + "grad_norm": 0.7671318054199219, + "learning_rate": 1.8691546441650207e-05, + "loss": 1.5367, + "mean_token_accuracy": 0.6638617217540741, + "num_tokens": 608006491.0, + "step": 3617 + }, + { + "entropy": 1.6939302285512288, + "epoch": 0.39746230534728516, + "grad_norm": 0.7670230269432068, + "learning_rate": 1.8690714917659814e-05, + "loss": 1.3973, + "mean_token_accuracy": 0.6581595738728842, + "num_tokens": 608179437.0, + "step": 3618 + }, + { + "entropy": 1.6857011218865712, + "epoch": 0.3975721622586581, + "grad_norm": 0.787647008895874, + "learning_rate": 1.8689883150261757e-05, + "loss": 1.3985, + "mean_token_accuracy": 0.6678627133369446, + "num_tokens": 608310733.0, + "step": 3619 + }, + { + "entropy": 1.623725155989329, + "epoch": 0.39768201917003104, + "grad_norm": 0.8164381980895996, + "learning_rate": 1.8689051139482365e-05, + "loss": 1.5179, + "mean_token_accuracy": 0.6581759800513586, + "num_tokens": 608498736.0, + "step": 3620 + }, + { + "entropy": 1.6751043697198231, + "epoch": 0.397791876081404, + "grad_norm": 0.652132511138916, + "learning_rate": 1.8688218885347965e-05, + "loss": 1.2893, + "mean_token_accuracy": 0.6741900146007538, + "num_tokens": 608661402.0, + "step": 3621 + }, + { + "entropy": 1.7041152914365132, + "epoch": 0.3979017329927769, + "grad_norm": 0.7432838082313538, + "learning_rate": 1.868738638788491e-05, + "loss": 1.4467, + "mean_token_accuracy": 0.6503884643316269, + "num_tokens": 608825180.0, + "step": 3622 + }, + { + "entropy": 1.737585683663686, + "epoch": 0.39801158990414987, + "grad_norm": 0.6796532869338989, + "learning_rate": 1.868655364711953e-05, + "loss": 1.4838, + "mean_token_accuracy": 0.6391591926415762, + "num_tokens": 609006138.0, + "step": 3623 + }, + { + "entropy": 1.7053319811820984, + "epoch": 0.3981214468155228, + "grad_norm": 0.7604497671127319, + "learning_rate": 1.86857206630782e-05, + "loss": 1.6561, + "mean_token_accuracy": 0.6299934685230255, + "num_tokens": 609212521.0, + "step": 3624 + }, + { + "entropy": 1.6852892835934956, + "epoch": 0.3982313037268957, + "grad_norm": 0.6800695061683655, + "learning_rate": 1.868488743578727e-05, + "loss": 1.2921, + "mean_token_accuracy": 0.6707666118939718, + "num_tokens": 609346073.0, + "step": 3625 + }, + { + "entropy": 1.667622039715449, + "epoch": 0.39834116063826863, + "grad_norm": 0.7599472403526306, + "learning_rate": 1.8684053965273113e-05, + "loss": 1.3797, + "mean_token_accuracy": 0.6550516585508982, + "num_tokens": 609528585.0, + "step": 3626 + }, + { + "entropy": 1.7041009267171223, + "epoch": 0.3984510175496416, + "grad_norm": 0.6519821882247925, + "learning_rate": 1.8683220251562116e-05, + "loss": 1.4673, + "mean_token_accuracy": 0.649181400736173, + "num_tokens": 609699624.0, + "step": 3627 + }, + { + "entropy": 1.7341491381327312, + "epoch": 0.3985608744610145, + "grad_norm": 0.6558710932731628, + "learning_rate": 1.8682386294680656e-05, + "loss": 1.4285, + "mean_token_accuracy": 0.6576865861813227, + "num_tokens": 609851288.0, + "step": 3628 + }, + { + "entropy": 1.6496534844239552, + "epoch": 0.39867073137238745, + "grad_norm": 0.7691988348960876, + "learning_rate": 1.8681552094655132e-05, + "loss": 1.4595, + "mean_token_accuracy": 0.6498380750417709, + "num_tokens": 610005266.0, + "step": 3629 + }, + { + "entropy": 1.7265863120555878, + "epoch": 0.3987805882837604, + "grad_norm": 0.683956503868103, + "learning_rate": 1.8680717651511948e-05, + "loss": 1.3603, + "mean_token_accuracy": 0.6562889615694681, + "num_tokens": 610131391.0, + "step": 3630 + }, + { + "entropy": 1.6970125834147136, + "epoch": 0.39889044519513334, + "grad_norm": 1.811918020248413, + "learning_rate": 1.8679882965277508e-05, + "loss": 1.1718, + "mean_token_accuracy": 0.6783278286457062, + "num_tokens": 610327332.0, + "step": 3631 + }, + { + "entropy": 1.7868920266628265, + "epoch": 0.3990003021065063, + "grad_norm": 0.8308510184288025, + "learning_rate": 1.8679048035978236e-05, + "loss": 1.4282, + "mean_token_accuracy": 0.6506891945997874, + "num_tokens": 610455251.0, + "step": 3632 + }, + { + "entropy": 1.6640632251898448, + "epoch": 0.3991101590178792, + "grad_norm": 0.6303699612617493, + "learning_rate": 1.8678212863640552e-05, + "loss": 1.4084, + "mean_token_accuracy": 0.6633955190579096, + "num_tokens": 610614206.0, + "step": 3633 + }, + { + "entropy": 1.6598373850186665, + "epoch": 0.39922001592925216, + "grad_norm": 0.5654789209365845, + "learning_rate": 1.8677377448290892e-05, + "loss": 1.2886, + "mean_token_accuracy": 0.6726368019978205, + "num_tokens": 610776600.0, + "step": 3634 + }, + { + "entropy": 1.771477371454239, + "epoch": 0.3993298728406251, + "grad_norm": 0.6862777471542358, + "learning_rate": 1.8676541789955692e-05, + "loss": 1.4854, + "mean_token_accuracy": 0.6504343748092651, + "num_tokens": 610950858.0, + "step": 3635 + }, + { + "entropy": 1.7534089088439941, + "epoch": 0.39943972975199804, + "grad_norm": 0.7172439098358154, + "learning_rate": 1.867570588866141e-05, + "loss": 1.5362, + "mean_token_accuracy": 0.6323390305042267, + "num_tokens": 611180246.0, + "step": 3636 + }, + { + "entropy": 1.6689714988072712, + "epoch": 0.399549586663371, + "grad_norm": 0.5847604870796204, + "learning_rate": 1.867486974443449e-05, + "loss": 1.394, + "mean_token_accuracy": 0.6602125515540441, + "num_tokens": 611354464.0, + "step": 3637 + }, + { + "entropy": 1.7680908838907878, + "epoch": 0.3996594435747439, + "grad_norm": 0.7248014807701111, + "learning_rate": 1.8674033357301402e-05, + "loss": 1.4446, + "mean_token_accuracy": 0.6542030622561773, + "num_tokens": 611522664.0, + "step": 3638 + }, + { + "entropy": 1.714485635360082, + "epoch": 0.3997693004861168, + "grad_norm": 0.6951132416725159, + "learning_rate": 1.8673196727288616e-05, + "loss": 1.2956, + "mean_token_accuracy": 0.6643576820691427, + "num_tokens": 611647329.0, + "step": 3639 + }, + { + "entropy": 1.6996821860472362, + "epoch": 0.39987915739748975, + "grad_norm": 0.7710621953010559, + "learning_rate": 1.8672359854422614e-05, + "loss": 1.4314, + "mean_token_accuracy": 0.6488352914651235, + "num_tokens": 611814510.0, + "step": 3640 + }, + { + "entropy": 1.680074393749237, + "epoch": 0.3999890143088627, + "grad_norm": 0.7552538514137268, + "learning_rate": 1.867152273872988e-05, + "loss": 1.3634, + "mean_token_accuracy": 0.662238617738088, + "num_tokens": 611955456.0, + "step": 3641 + }, + { + "entropy": 1.715712159872055, + "epoch": 0.40009887122023563, + "grad_norm": 0.612894594669342, + "learning_rate": 1.86706853802369e-05, + "loss": 1.387, + "mean_token_accuracy": 0.6484750012556711, + "num_tokens": 612145076.0, + "step": 3642 + }, + { + "entropy": 1.7193404138088226, + "epoch": 0.40020872813160857, + "grad_norm": 0.6383233070373535, + "learning_rate": 1.866984777897019e-05, + "loss": 1.3055, + "mean_token_accuracy": 0.6797666301329931, + "num_tokens": 612270224.0, + "step": 3643 + }, + { + "entropy": 1.6991442839304607, + "epoch": 0.4003185850429815, + "grad_norm": 0.6264839172363281, + "learning_rate": 1.8669009934956256e-05, + "loss": 1.376, + "mean_token_accuracy": 0.6612722476323446, + "num_tokens": 612439048.0, + "step": 3644 + }, + { + "entropy": 1.700903097788493, + "epoch": 0.40042844195435445, + "grad_norm": 0.7194094061851501, + "learning_rate": 1.866817184822161e-05, + "loss": 1.4956, + "mean_token_accuracy": 0.6383681247631708, + "num_tokens": 612610609.0, + "step": 3645 + }, + { + "entropy": 1.6729531685511272, + "epoch": 0.4005382988657274, + "grad_norm": 0.6744722127914429, + "learning_rate": 1.8667333518792786e-05, + "loss": 1.3917, + "mean_token_accuracy": 0.674171636501948, + "num_tokens": 612762506.0, + "step": 3646 + }, + { + "entropy": 1.7685239613056183, + "epoch": 0.40064815577710033, + "grad_norm": 0.7136297821998596, + "learning_rate": 1.8666494946696306e-05, + "loss": 1.5135, + "mean_token_accuracy": 0.6325026253859202, + "num_tokens": 612936758.0, + "step": 3647 + }, + { + "entropy": 1.702545295159022, + "epoch": 0.4007580126884733, + "grad_norm": 0.6659870147705078, + "learning_rate": 1.8665656131958717e-05, + "loss": 1.3334, + "mean_token_accuracy": 0.6652649194002151, + "num_tokens": 613057363.0, + "step": 3648 + }, + { + "entropy": 1.7028338809808095, + "epoch": 0.4008678695998462, + "grad_norm": 0.6186485290527344, + "learning_rate": 1.8664817074606565e-05, + "loss": 1.6413, + "mean_token_accuracy": 0.6314787616332372, + "num_tokens": 613307099.0, + "step": 3649 + }, + { + "entropy": 1.7399461170037587, + "epoch": 0.40097772651121916, + "grad_norm": 0.7118646502494812, + "learning_rate": 1.8663977774666403e-05, + "loss": 1.3848, + "mean_token_accuracy": 0.6634780565897623, + "num_tokens": 613478548.0, + "step": 3650 + }, + { + "entropy": 1.7087344527244568, + "epoch": 0.4010875834225921, + "grad_norm": 0.6944850087165833, + "learning_rate": 1.8663138232164804e-05, + "loss": 1.5035, + "mean_token_accuracy": 0.6449535042047501, + "num_tokens": 613651212.0, + "step": 3651 + }, + { + "entropy": 1.679858426253001, + "epoch": 0.401197440333965, + "grad_norm": 0.7054488062858582, + "learning_rate": 1.866229844712833e-05, + "loss": 1.3144, + "mean_token_accuracy": 0.6721477111180624, + "num_tokens": 613780246.0, + "step": 3652 + }, + { + "entropy": 1.715953419605891, + "epoch": 0.4013072972453379, + "grad_norm": 0.7255501747131348, + "learning_rate": 1.8661458419583563e-05, + "loss": 1.3845, + "mean_token_accuracy": 0.6507025410731634, + "num_tokens": 613943278.0, + "step": 3653 + }, + { + "entropy": 1.7305433750152588, + "epoch": 0.40141715415671086, + "grad_norm": 0.6626403331756592, + "learning_rate": 1.866061814955709e-05, + "loss": 1.5169, + "mean_token_accuracy": 0.6476994504531225, + "num_tokens": 614082100.0, + "step": 3654 + }, + { + "entropy": 1.7705755233764648, + "epoch": 0.4015270110680838, + "grad_norm": 0.8271293640136719, + "learning_rate": 1.8659777637075503e-05, + "loss": 1.6316, + "mean_token_accuracy": 0.6363982160886129, + "num_tokens": 614323862.0, + "step": 3655 + }, + { + "entropy": 1.7688746849695842, + "epoch": 0.40163686797945675, + "grad_norm": 0.6675280928611755, + "learning_rate": 1.8658936882165408e-05, + "loss": 1.4183, + "mean_token_accuracy": 0.6585695048173269, + "num_tokens": 614479973.0, + "step": 3656 + }, + { + "entropy": 1.6870961685975392, + "epoch": 0.4017467248908297, + "grad_norm": 0.7141885161399841, + "learning_rate": 1.8658095884853412e-05, + "loss": 1.369, + "mean_token_accuracy": 0.6499126503864924, + "num_tokens": 614624882.0, + "step": 3657 + }, + { + "entropy": 1.7352818648020427, + "epoch": 0.40185658180220263, + "grad_norm": 0.8047354221343994, + "learning_rate": 1.865725464516613e-05, + "loss": 1.3427, + "mean_token_accuracy": 0.6598058293263117, + "num_tokens": 614770100.0, + "step": 3658 + }, + { + "entropy": 1.702322781085968, + "epoch": 0.40196643871357557, + "grad_norm": 0.5614283680915833, + "learning_rate": 1.865641316313019e-05, + "loss": 1.3603, + "mean_token_accuracy": 0.6561979601780573, + "num_tokens": 614947375.0, + "step": 3659 + }, + { + "entropy": 1.68387637535731, + "epoch": 0.4020762956249485, + "grad_norm": 0.6392550468444824, + "learning_rate": 1.865557143877222e-05, + "loss": 1.3327, + "mean_token_accuracy": 0.6635664403438568, + "num_tokens": 615118931.0, + "step": 3660 + }, + { + "entropy": 1.8135263323783875, + "epoch": 0.40218615253632145, + "grad_norm": 0.586390495300293, + "learning_rate": 1.8654729472118867e-05, + "loss": 1.3753, + "mean_token_accuracy": 0.6475066045920054, + "num_tokens": 615330975.0, + "step": 3661 + }, + { + "entropy": 1.6571489373842876, + "epoch": 0.4022960094476944, + "grad_norm": 0.6854637861251831, + "learning_rate": 1.8653887263196775e-05, + "loss": 1.4143, + "mean_token_accuracy": 0.6672212878863016, + "num_tokens": 615511217.0, + "step": 3662 + }, + { + "entropy": 1.7072084446748097, + "epoch": 0.40240586635906733, + "grad_norm": 0.6002740859985352, + "learning_rate": 1.86530448120326e-05, + "loss": 1.3311, + "mean_token_accuracy": 0.6561245868603388, + "num_tokens": 615670287.0, + "step": 3663 + }, + { + "entropy": 1.709785560766856, + "epoch": 0.4025157232704403, + "grad_norm": 0.6385271549224854, + "learning_rate": 1.8652202118653005e-05, + "loss": 1.4953, + "mean_token_accuracy": 0.6401058932145437, + "num_tokens": 615857571.0, + "step": 3664 + }, + { + "entropy": 1.6435896158218384, + "epoch": 0.4026255801818132, + "grad_norm": 0.66823810338974, + "learning_rate": 1.8651359183084664e-05, + "loss": 1.3375, + "mean_token_accuracy": 0.6652998874584833, + "num_tokens": 616018039.0, + "step": 3665 + }, + { + "entropy": 1.6801452438036601, + "epoch": 0.4027354370931861, + "grad_norm": 0.5973647832870483, + "learning_rate": 1.8650516005354245e-05, + "loss": 1.44, + "mean_token_accuracy": 0.6624323775370916, + "num_tokens": 616182931.0, + "step": 3666 + }, + { + "entropy": 1.7209701438744862, + "epoch": 0.40284529400455904, + "grad_norm": 0.7251614332199097, + "learning_rate": 1.864967258548845e-05, + "loss": 1.3669, + "mean_token_accuracy": 0.6691777855157852, + "num_tokens": 616300469.0, + "step": 3667 + }, + { + "entropy": 1.7649596532185872, + "epoch": 0.402955150915932, + "grad_norm": 0.7085322737693787, + "learning_rate": 1.864882892351396e-05, + "loss": 1.4425, + "mean_token_accuracy": 0.6525527884562811, + "num_tokens": 616507234.0, + "step": 3668 + }, + { + "entropy": 1.7601352433363597, + "epoch": 0.4030650078273049, + "grad_norm": 0.8236812353134155, + "learning_rate": 1.8647985019457482e-05, + "loss": 1.3432, + "mean_token_accuracy": 0.6626549661159515, + "num_tokens": 616629147.0, + "step": 3669 + }, + { + "entropy": 1.6714808940887451, + "epoch": 0.40317486473867786, + "grad_norm": 0.7395771145820618, + "learning_rate": 1.8647140873345727e-05, + "loss": 1.3773, + "mean_token_accuracy": 0.6534648189942042, + "num_tokens": 616778411.0, + "step": 3670 + }, + { + "entropy": 1.6952777008215587, + "epoch": 0.4032847216500508, + "grad_norm": 0.6873879432678223, + "learning_rate": 1.864629648520541e-05, + "loss": 1.3186, + "mean_token_accuracy": 0.66050224006176, + "num_tokens": 616922969.0, + "step": 3671 + }, + { + "entropy": 1.6377867658933003, + "epoch": 0.40339457856142374, + "grad_norm": 0.6935714483261108, + "learning_rate": 1.8645451855063252e-05, + "loss": 1.3697, + "mean_token_accuracy": 0.662156730890274, + "num_tokens": 617068169.0, + "step": 3672 + }, + { + "entropy": 1.704243501027425, + "epoch": 0.4035044354727967, + "grad_norm": 1.5501242876052856, + "learning_rate": 1.8644606982945988e-05, + "loss": 1.2649, + "mean_token_accuracy": 0.6755559096733729, + "num_tokens": 617234636.0, + "step": 3673 + }, + { + "entropy": 1.7551777064800262, + "epoch": 0.4036142923841696, + "grad_norm": 0.7300415635108948, + "learning_rate": 1.8643761868880356e-05, + "loss": 1.531, + "mean_token_accuracy": 0.6428199609120687, + "num_tokens": 617415408.0, + "step": 3674 + }, + { + "entropy": 1.7334574957688649, + "epoch": 0.40372414929554257, + "grad_norm": 0.6666220426559448, + "learning_rate": 1.8642916512893108e-05, + "loss": 1.4268, + "mean_token_accuracy": 0.6364853282769521, + "num_tokens": 617605330.0, + "step": 3675 + }, + { + "entropy": 1.766120086113612, + "epoch": 0.4038340062069155, + "grad_norm": 0.8933466076850891, + "learning_rate": 1.8642070915010994e-05, + "loss": 1.5728, + "mean_token_accuracy": 0.6363476316134135, + "num_tokens": 617780089.0, + "step": 3676 + }, + { + "entropy": 1.730029950539271, + "epoch": 0.40394386311828845, + "grad_norm": 0.7603272795677185, + "learning_rate": 1.8641225075260784e-05, + "loss": 1.3936, + "mean_token_accuracy": 0.6469805290301641, + "num_tokens": 617927420.0, + "step": 3677 + }, + { + "entropy": 1.689517339070638, + "epoch": 0.4040537200296614, + "grad_norm": 0.6826215386390686, + "learning_rate": 1.864037899366924e-05, + "loss": 1.4607, + "mean_token_accuracy": 0.6500324904918671, + "num_tokens": 618092883.0, + "step": 3678 + }, + { + "entropy": 1.663270503282547, + "epoch": 0.40416357694103433, + "grad_norm": 0.7083759307861328, + "learning_rate": 1.8639532670263142e-05, + "loss": 1.3416, + "mean_token_accuracy": 0.663003941377004, + "num_tokens": 618253174.0, + "step": 3679 + }, + { + "entropy": 1.6893990735212963, + "epoch": 0.4042734338524072, + "grad_norm": 0.6897298693656921, + "learning_rate": 1.863868610506928e-05, + "loss": 1.3883, + "mean_token_accuracy": 0.6646070778369904, + "num_tokens": 618398292.0, + "step": 3680 + }, + { + "entropy": 1.7317078014214833, + "epoch": 0.40438329076378016, + "grad_norm": 0.7244861721992493, + "learning_rate": 1.8637839298114445e-05, + "loss": 1.3297, + "mean_token_accuracy": 0.6592196226119995, + "num_tokens": 618534629.0, + "step": 3681 + }, + { + "entropy": 1.7509274383385975, + "epoch": 0.4044931476751531, + "grad_norm": 0.7792545557022095, + "learning_rate": 1.8636992249425436e-05, + "loss": 1.2519, + "mean_token_accuracy": 0.6701816469430923, + "num_tokens": 618627524.0, + "step": 3682 + }, + { + "entropy": 1.659875859816869, + "epoch": 0.40460300458652604, + "grad_norm": 0.7609748244285583, + "learning_rate": 1.8636144959029063e-05, + "loss": 1.4746, + "mean_token_accuracy": 0.6501151074965795, + "num_tokens": 618774107.0, + "step": 3683 + }, + { + "entropy": 1.7485106388727825, + "epoch": 0.404712861497899, + "grad_norm": 0.8140855431556702, + "learning_rate": 1.8635297426952147e-05, + "loss": 1.4954, + "mean_token_accuracy": 0.6546749224265417, + "num_tokens": 618906293.0, + "step": 3684 + }, + { + "entropy": 1.6669574677944183, + "epoch": 0.4048227184092719, + "grad_norm": 0.6584160923957825, + "learning_rate": 1.8634449653221505e-05, + "loss": 1.3567, + "mean_token_accuracy": 0.6570019920667013, + "num_tokens": 619075570.0, + "step": 3685 + }, + { + "entropy": 1.6605021357536316, + "epoch": 0.40493257532064486, + "grad_norm": 0.5621921420097351, + "learning_rate": 1.863360163786397e-05, + "loss": 1.4514, + "mean_token_accuracy": 0.6414414793252945, + "num_tokens": 619360684.0, + "step": 3686 + }, + { + "entropy": 1.6636900802453358, + "epoch": 0.4050424322320178, + "grad_norm": 0.6727263331413269, + "learning_rate": 1.8632753380906387e-05, + "loss": 1.3516, + "mean_token_accuracy": 0.6557752440373102, + "num_tokens": 619555666.0, + "step": 3687 + }, + { + "entropy": 1.7114156087239583, + "epoch": 0.40515228914339074, + "grad_norm": 0.6485511064529419, + "learning_rate": 1.8631904882375595e-05, + "loss": 1.5021, + "mean_token_accuracy": 0.6425280173619589, + "num_tokens": 619733822.0, + "step": 3688 + }, + { + "entropy": 1.6890954673290253, + "epoch": 0.4052621460547637, + "grad_norm": 0.6665855646133423, + "learning_rate": 1.8631056142298457e-05, + "loss": 1.3798, + "mean_token_accuracy": 0.6521730422973633, + "num_tokens": 619927481.0, + "step": 3689 + }, + { + "entropy": 1.6435322761535645, + "epoch": 0.4053720029661366, + "grad_norm": 0.6213128566741943, + "learning_rate": 1.8630207160701827e-05, + "loss": 1.3392, + "mean_token_accuracy": 0.6614899933338165, + "num_tokens": 620096992.0, + "step": 3690 + }, + { + "entropy": 1.7578924596309662, + "epoch": 0.40548185987750957, + "grad_norm": 0.7708293795585632, + "learning_rate": 1.862935793761258e-05, + "loss": 1.1762, + "mean_token_accuracy": 0.6814229289690653, + "num_tokens": 620243412.0, + "step": 3691 + }, + { + "entropy": 1.7739320397377014, + "epoch": 0.4055917167888825, + "grad_norm": 0.8655069470405579, + "learning_rate": 1.8628508473057592e-05, + "loss": 1.5393, + "mean_token_accuracy": 0.648578479886055, + "num_tokens": 620390926.0, + "step": 3692 + }, + { + "entropy": 1.70758917927742, + "epoch": 0.4057015737002554, + "grad_norm": 1.9093140363693237, + "learning_rate": 1.862765876706375e-05, + "loss": 1.098, + "mean_token_accuracy": 0.6809806078672409, + "num_tokens": 620557421.0, + "step": 3693 + }, + { + "entropy": 1.6794420282046, + "epoch": 0.40581143061162833, + "grad_norm": 0.679584801197052, + "learning_rate": 1.862680881965794e-05, + "loss": 1.3353, + "mean_token_accuracy": 0.6540734767913818, + "num_tokens": 620718410.0, + "step": 3694 + }, + { + "entropy": 1.73444531361262, + "epoch": 0.4059212875230013, + "grad_norm": 0.6241161227226257, + "learning_rate": 1.8625958630867072e-05, + "loss": 1.3099, + "mean_token_accuracy": 0.6632533123095831, + "num_tokens": 620863809.0, + "step": 3695 + }, + { + "entropy": 1.7056554555892944, + "epoch": 0.4060311444343742, + "grad_norm": 0.7893586158752441, + "learning_rate": 1.862510820071805e-05, + "loss": 1.3113, + "mean_token_accuracy": 0.6655853390693665, + "num_tokens": 620989409.0, + "step": 3696 + }, + { + "entropy": 1.6801581581433613, + "epoch": 0.40614100134574715, + "grad_norm": 0.7889465689659119, + "learning_rate": 1.862425752923779e-05, + "loss": 1.4217, + "mean_token_accuracy": 0.6615896672010422, + "num_tokens": 621155720.0, + "step": 3697 + }, + { + "entropy": 1.7147592107454936, + "epoch": 0.4062508582571201, + "grad_norm": 0.7899035215377808, + "learning_rate": 1.8623406616453213e-05, + "loss": 1.271, + "mean_token_accuracy": 0.6728020161390305, + "num_tokens": 621271342.0, + "step": 3698 + }, + { + "entropy": 1.707566757996877, + "epoch": 0.40636071516849304, + "grad_norm": 0.7400942444801331, + "learning_rate": 1.862255546239125e-05, + "loss": 1.2851, + "mean_token_accuracy": 0.6638195067644119, + "num_tokens": 621403853.0, + "step": 3699 + }, + { + "entropy": 1.7490671475728352, + "epoch": 0.406470572079866, + "grad_norm": 0.6932427287101746, + "learning_rate": 1.8621704067078842e-05, + "loss": 1.3392, + "mean_token_accuracy": 0.6685472031434377, + "num_tokens": 621515651.0, + "step": 3700 + }, + { + "entropy": 1.7084580659866333, + "epoch": 0.4065804289912389, + "grad_norm": 0.6444189548492432, + "learning_rate": 1.8620852430542936e-05, + "loss": 1.3692, + "mean_token_accuracy": 0.6660451342662176, + "num_tokens": 621678160.0, + "step": 3701 + }, + { + "entropy": 1.734459678332011, + "epoch": 0.40669028590261186, + "grad_norm": 0.7342776656150818, + "learning_rate": 1.8620000552810488e-05, + "loss": 1.3653, + "mean_token_accuracy": 0.6498020191987356, + "num_tokens": 621814935.0, + "step": 3702 + }, + { + "entropy": 1.642607440551122, + "epoch": 0.4068001428139848, + "grad_norm": 0.6483259201049805, + "learning_rate": 1.861914843390845e-05, + "loss": 1.3316, + "mean_token_accuracy": 0.6714847981929779, + "num_tokens": 621967804.0, + "step": 3703 + }, + { + "entropy": 1.7405053277810414, + "epoch": 0.40690999972535774, + "grad_norm": 0.6480836868286133, + "learning_rate": 1.86182960738638e-05, + "loss": 1.459, + "mean_token_accuracy": 0.6403814305861791, + "num_tokens": 622156492.0, + "step": 3704 + }, + { + "entropy": 1.6267158389091492, + "epoch": 0.4070198566367307, + "grad_norm": 0.7061080932617188, + "learning_rate": 1.8617443472703514e-05, + "loss": 1.2146, + "mean_token_accuracy": 0.684728279709816, + "num_tokens": 622266394.0, + "step": 3705 + }, + { + "entropy": 1.7066716353098552, + "epoch": 0.4071297135481036, + "grad_norm": 0.8183558583259583, + "learning_rate": 1.861659063045457e-05, + "loss": 1.4803, + "mean_token_accuracy": 0.6585542360941569, + "num_tokens": 622446280.0, + "step": 3706 + }, + { + "entropy": 1.7405121624469757, + "epoch": 0.4072395704594765, + "grad_norm": 0.7175570726394653, + "learning_rate": 1.8615737547143968e-05, + "loss": 1.5347, + "mean_token_accuracy": 0.6411835898955663, + "num_tokens": 622610924.0, + "step": 3707 + }, + { + "entropy": 1.7092544833819072, + "epoch": 0.40734942737084945, + "grad_norm": 0.9672302007675171, + "learning_rate": 1.8614884222798705e-05, + "loss": 1.4165, + "mean_token_accuracy": 0.6539698441823324, + "num_tokens": 622775599.0, + "step": 3708 + }, + { + "entropy": 1.7085430522759755, + "epoch": 0.4074592842822224, + "grad_norm": 0.6571292281150818, + "learning_rate": 1.8614030657445785e-05, + "loss": 1.4122, + "mean_token_accuracy": 0.6727810104688009, + "num_tokens": 622962845.0, + "step": 3709 + }, + { + "entropy": 1.7726508279641469, + "epoch": 0.40756914119359533, + "grad_norm": 0.6792343854904175, + "learning_rate": 1.861317685111223e-05, + "loss": 1.5236, + "mean_token_accuracy": 0.6427791565656662, + "num_tokens": 623157911.0, + "step": 3710 + }, + { + "entropy": 1.7251368661721547, + "epoch": 0.40767899810496827, + "grad_norm": 0.7350800037384033, + "learning_rate": 1.8612322803825053e-05, + "loss": 1.2582, + "mean_token_accuracy": 0.678009644150734, + "num_tokens": 623285497.0, + "step": 3711 + }, + { + "entropy": 1.6766076783339183, + "epoch": 0.4077888550163412, + "grad_norm": 0.6149843335151672, + "learning_rate": 1.861146851561129e-05, + "loss": 1.51, + "mean_token_accuracy": 0.6478391562898954, + "num_tokens": 623448272.0, + "step": 3712 + }, + { + "entropy": 1.6991031368573506, + "epoch": 0.40789871192771415, + "grad_norm": 0.6349066495895386, + "learning_rate": 1.861061398649798e-05, + "loss": 1.2536, + "mean_token_accuracy": 0.6752463430166245, + "num_tokens": 623590450.0, + "step": 3713 + }, + { + "entropy": 1.7172236144542694, + "epoch": 0.4080085688390871, + "grad_norm": 0.6747751832008362, + "learning_rate": 1.860975921651217e-05, + "loss": 1.5217, + "mean_token_accuracy": 0.6311314900716146, + "num_tokens": 623802101.0, + "step": 3714 + }, + { + "entropy": 1.7139861385027568, + "epoch": 0.40811842575046003, + "grad_norm": 0.6247215270996094, + "learning_rate": 1.8608904205680906e-05, + "loss": 1.4003, + "mean_token_accuracy": 0.6497706820567449, + "num_tokens": 623950911.0, + "step": 3715 + }, + { + "entropy": 1.749284307161967, + "epoch": 0.408228282661833, + "grad_norm": 0.5963855981826782, + "learning_rate": 1.8608048954031254e-05, + "loss": 1.4282, + "mean_token_accuracy": 0.6429949502150217, + "num_tokens": 624154271.0, + "step": 3716 + }, + { + "entropy": 1.7446833749612172, + "epoch": 0.4083381395732059, + "grad_norm": 0.6486496925354004, + "learning_rate": 1.8607193461590277e-05, + "loss": 1.3916, + "mean_token_accuracy": 0.6541141470273336, + "num_tokens": 624326903.0, + "step": 3717 + }, + { + "entropy": 1.6626928846041362, + "epoch": 0.40844799648457886, + "grad_norm": 0.7017315030097961, + "learning_rate": 1.860633772838506e-05, + "loss": 1.3665, + "mean_token_accuracy": 0.6529461542765299, + "num_tokens": 624533835.0, + "step": 3718 + }, + { + "entropy": 1.6866288880507152, + "epoch": 0.4085578533959518, + "grad_norm": 0.7589619159698486, + "learning_rate": 1.860548175444268e-05, + "loss": 1.3975, + "mean_token_accuracy": 0.6569940547148386, + "num_tokens": 624695576.0, + "step": 3719 + }, + { + "entropy": 1.7539471586545308, + "epoch": 0.4086677103073247, + "grad_norm": 0.7970021963119507, + "learning_rate": 1.8604625539790228e-05, + "loss": 1.6079, + "mean_token_accuracy": 0.640375425418218, + "num_tokens": 624842347.0, + "step": 3720 + }, + { + "entropy": 1.6378857394059498, + "epoch": 0.4087775672186976, + "grad_norm": 0.7092182040214539, + "learning_rate": 1.8603769084454804e-05, + "loss": 1.4028, + "mean_token_accuracy": 0.6625880300998688, + "num_tokens": 625027646.0, + "step": 3721 + }, + { + "entropy": 1.618373692035675, + "epoch": 0.40888742413007056, + "grad_norm": 0.6855277419090271, + "learning_rate": 1.8602912388463517e-05, + "loss": 1.4195, + "mean_token_accuracy": 0.6505888452132543, + "num_tokens": 625206719.0, + "step": 3722 + }, + { + "entropy": 1.674377828836441, + "epoch": 0.4089972810414435, + "grad_norm": 0.6490065455436707, + "learning_rate": 1.8602055451843478e-05, + "loss": 1.2185, + "mean_token_accuracy": 0.6873969038327535, + "num_tokens": 625327376.0, + "step": 3723 + }, + { + "entropy": 1.664337585369746, + "epoch": 0.40910713795281645, + "grad_norm": 0.7654528021812439, + "learning_rate": 1.860119827462181e-05, + "loss": 1.4033, + "mean_token_accuracy": 0.6536544114351273, + "num_tokens": 625505992.0, + "step": 3724 + }, + { + "entropy": 1.7024443646272023, + "epoch": 0.4092169948641894, + "grad_norm": 0.6893821954727173, + "learning_rate": 1.860034085682564e-05, + "loss": 1.437, + "mean_token_accuracy": 0.6580548087755839, + "num_tokens": 625700752.0, + "step": 3725 + }, + { + "entropy": 1.6648207604885101, + "epoch": 0.40932685177556233, + "grad_norm": 0.7746477723121643, + "learning_rate": 1.859948319848211e-05, + "loss": 1.1891, + "mean_token_accuracy": 0.6822925706704458, + "num_tokens": 625814896.0, + "step": 3726 + }, + { + "entropy": 1.7079435586929321, + "epoch": 0.40943670868693527, + "grad_norm": 0.6155371069908142, + "learning_rate": 1.859862529961836e-05, + "loss": 1.3543, + "mean_token_accuracy": 0.668289711078008, + "num_tokens": 625962535.0, + "step": 3727 + }, + { + "entropy": 1.7189550697803497, + "epoch": 0.4095465655983082, + "grad_norm": 0.7430447936058044, + "learning_rate": 1.859776716026154e-05, + "loss": 1.3355, + "mean_token_accuracy": 0.6572021146615347, + "num_tokens": 626096526.0, + "step": 3728 + }, + { + "entropy": 1.6810458799203236, + "epoch": 0.40965642250968115, + "grad_norm": 1.085142731666565, + "learning_rate": 1.8596908780438814e-05, + "loss": 1.0983, + "mean_token_accuracy": 0.6744260291258494, + "num_tokens": 626249920.0, + "step": 3729 + }, + { + "entropy": 1.7445210615793865, + "epoch": 0.4097662794210541, + "grad_norm": 0.7419488430023193, + "learning_rate": 1.8596050160177352e-05, + "loss": 1.3508, + "mean_token_accuracy": 0.66064981619517, + "num_tokens": 626378039.0, + "step": 3730 + }, + { + "entropy": 1.760192632675171, + "epoch": 0.40987613633242703, + "grad_norm": 0.6193534135818481, + "learning_rate": 1.859519129950432e-05, + "loss": 1.3936, + "mean_token_accuracy": 0.6515404631694158, + "num_tokens": 626544591.0, + "step": 3731 + }, + { + "entropy": 1.7204260925451915, + "epoch": 0.4099859932438, + "grad_norm": 1.0320173501968384, + "learning_rate": 1.859433219844691e-05, + "loss": 1.5482, + "mean_token_accuracy": 0.6566920280456543, + "num_tokens": 626683443.0, + "step": 3732 + }, + { + "entropy": 1.7170347174008687, + "epoch": 0.4100958501551729, + "grad_norm": 0.747575044631958, + "learning_rate": 1.8593472857032308e-05, + "loss": 1.4343, + "mean_token_accuracy": 0.6497025390466055, + "num_tokens": 626814527.0, + "step": 3733 + }, + { + "entropy": 1.6637854278087616, + "epoch": 0.4102057070665458, + "grad_norm": 0.6838406324386597, + "learning_rate": 1.859261327528771e-05, + "loss": 1.3744, + "mean_token_accuracy": 0.6690096110105515, + "num_tokens": 626996166.0, + "step": 3734 + }, + { + "entropy": 1.6826592286427815, + "epoch": 0.41031556397791874, + "grad_norm": 0.6222158670425415, + "learning_rate": 1.8591753453240325e-05, + "loss": 1.4596, + "mean_token_accuracy": 0.6647855440775553, + "num_tokens": 627196669.0, + "step": 3735 + }, + { + "entropy": 1.678474356730779, + "epoch": 0.4104254208892917, + "grad_norm": 0.6561827659606934, + "learning_rate": 1.8590893390917363e-05, + "loss": 1.3236, + "mean_token_accuracy": 0.6612599492073059, + "num_tokens": 627341643.0, + "step": 3736 + }, + { + "entropy": 1.6994609435399373, + "epoch": 0.4105352778006646, + "grad_norm": 0.6723978519439697, + "learning_rate": 1.8590033088346045e-05, + "loss": 1.4024, + "mean_token_accuracy": 0.6625661303599676, + "num_tokens": 627504221.0, + "step": 3737 + }, + { + "entropy": 1.7354566156864166, + "epoch": 0.41064513471203756, + "grad_norm": 0.6468124985694885, + "learning_rate": 1.85891725455536e-05, + "loss": 1.3671, + "mean_token_accuracy": 0.6559626658757528, + "num_tokens": 627629476.0, + "step": 3738 + }, + { + "entropy": 1.7572604417800903, + "epoch": 0.4107549916234105, + "grad_norm": 0.6548281908035278, + "learning_rate": 1.8588311762567265e-05, + "loss": 1.4999, + "mean_token_accuracy": 0.649440790216128, + "num_tokens": 627838130.0, + "step": 3739 + }, + { + "entropy": 1.6867552896340687, + "epoch": 0.41086484853478344, + "grad_norm": 0.6936057806015015, + "learning_rate": 1.8587450739414282e-05, + "loss": 1.3552, + "mean_token_accuracy": 0.6642784823973974, + "num_tokens": 628019384.0, + "step": 3740 + }, + { + "entropy": 1.7941297590732574, + "epoch": 0.4109747054461564, + "grad_norm": 0.6662188768386841, + "learning_rate": 1.85865894761219e-05, + "loss": 1.4409, + "mean_token_accuracy": 0.6453611056009928, + "num_tokens": 628160091.0, + "step": 3741 + }, + { + "entropy": 1.677331139643987, + "epoch": 0.4110845623575293, + "grad_norm": 0.7274539470672607, + "learning_rate": 1.858572797271738e-05, + "loss": 1.2365, + "mean_token_accuracy": 0.6719293495019277, + "num_tokens": 628273375.0, + "step": 3742 + }, + { + "entropy": 1.7725926240285237, + "epoch": 0.41119441926890227, + "grad_norm": 0.7785374522209167, + "learning_rate": 1.8584866229227992e-05, + "loss": 1.2674, + "mean_token_accuracy": 0.6794732809066772, + "num_tokens": 628386434.0, + "step": 3743 + }, + { + "entropy": 1.7157474358876545, + "epoch": 0.4113042761802752, + "grad_norm": 0.6738847494125366, + "learning_rate": 1.8584004245681e-05, + "loss": 1.3154, + "mean_token_accuracy": 0.6582375268141428, + "num_tokens": 628503456.0, + "step": 3744 + }, + { + "entropy": 1.6891884605089824, + "epoch": 0.41141413309164815, + "grad_norm": 0.706418514251709, + "learning_rate": 1.8583142022103694e-05, + "loss": 1.5105, + "mean_token_accuracy": 0.6367523421843847, + "num_tokens": 628722141.0, + "step": 3745 + }, + { + "entropy": 1.771670550107956, + "epoch": 0.4115239900030211, + "grad_norm": 0.8060712814331055, + "learning_rate": 1.858227955852336e-05, + "loss": 1.3965, + "mean_token_accuracy": 0.6502161224683126, + "num_tokens": 628870665.0, + "step": 3746 + }, + { + "entropy": 1.6912760337193806, + "epoch": 0.411633846914394, + "grad_norm": 0.6966044902801514, + "learning_rate": 1.8581416854967293e-05, + "loss": 1.4553, + "mean_token_accuracy": 0.6377379248539606, + "num_tokens": 629086050.0, + "step": 3747 + }, + { + "entropy": 1.7380321621894836, + "epoch": 0.4117437038257669, + "grad_norm": 0.6073872447013855, + "learning_rate": 1.85805539114628e-05, + "loss": 1.4282, + "mean_token_accuracy": 0.6448562443256378, + "num_tokens": 629258009.0, + "step": 3748 + }, + { + "entropy": 1.6827170252799988, + "epoch": 0.41185356073713986, + "grad_norm": 0.6829259395599365, + "learning_rate": 1.8579690728037195e-05, + "loss": 1.3912, + "mean_token_accuracy": 0.6493991017341614, + "num_tokens": 629397477.0, + "step": 3749 + }, + { + "entropy": 1.72429492076238, + "epoch": 0.4119634176485128, + "grad_norm": 0.6729731559753418, + "learning_rate": 1.857882730471779e-05, + "loss": 1.3648, + "mean_token_accuracy": 0.6580958614746729, + "num_tokens": 629596068.0, + "step": 3750 + }, + { + "entropy": 1.7216622432072957, + "epoch": 0.41207327455988574, + "grad_norm": 0.7398632764816284, + "learning_rate": 1.8577963641531915e-05, + "loss": 1.2278, + "mean_token_accuracy": 0.6858376761277517, + "num_tokens": 629741637.0, + "step": 3751 + }, + { + "entropy": 1.7678700387477875, + "epoch": 0.4121831314712587, + "grad_norm": 1.0076205730438232, + "learning_rate": 1.857709973850691e-05, + "loss": 1.4064, + "mean_token_accuracy": 0.6636515309413274, + "num_tokens": 629853316.0, + "step": 3752 + }, + { + "entropy": 1.7365649839242299, + "epoch": 0.4122929883826316, + "grad_norm": 0.673788845539093, + "learning_rate": 1.8576235595670105e-05, + "loss": 1.2618, + "mean_token_accuracy": 0.6891842633485794, + "num_tokens": 629972831.0, + "step": 3753 + }, + { + "entropy": 1.7559735278288524, + "epoch": 0.41240284529400456, + "grad_norm": 0.9235732555389404, + "learning_rate": 1.8575371213048867e-05, + "loss": 1.4149, + "mean_token_accuracy": 0.6592928121487299, + "num_tokens": 630109076.0, + "step": 3754 + }, + { + "entropy": 1.7470454176266987, + "epoch": 0.4125127022053775, + "grad_norm": 0.6771215796470642, + "learning_rate": 1.8574506590670534e-05, + "loss": 1.4049, + "mean_token_accuracy": 0.6512129505475363, + "num_tokens": 630262653.0, + "step": 3755 + }, + { + "entropy": 1.6346890528996785, + "epoch": 0.41262255911675044, + "grad_norm": 0.6646851897239685, + "learning_rate": 1.8573641728562488e-05, + "loss": 1.5104, + "mean_token_accuracy": 0.64119320611159, + "num_tokens": 630444943.0, + "step": 3756 + }, + { + "entropy": 1.7178200781345367, + "epoch": 0.4127324160281234, + "grad_norm": 0.6681410670280457, + "learning_rate": 1.8572776626752092e-05, + "loss": 1.3195, + "mean_token_accuracy": 0.6758150657018026, + "num_tokens": 630608705.0, + "step": 3757 + }, + { + "entropy": 1.7018628120422363, + "epoch": 0.4128422729394963, + "grad_norm": 0.8846207857131958, + "learning_rate": 1.857191128526673e-05, + "loss": 1.5712, + "mean_token_accuracy": 0.6433374732732773, + "num_tokens": 630828140.0, + "step": 3758 + }, + { + "entropy": 1.7014219065507252, + "epoch": 0.41295212985086927, + "grad_norm": 0.6484429836273193, + "learning_rate": 1.857104570413378e-05, + "loss": 1.342, + "mean_token_accuracy": 0.692674994468689, + "num_tokens": 630979788.0, + "step": 3759 + }, + { + "entropy": 1.6751723786195118, + "epoch": 0.4130619867622422, + "grad_norm": 0.6921824812889099, + "learning_rate": 1.8570179883380652e-05, + "loss": 1.4416, + "mean_token_accuracy": 0.6488529096047083, + "num_tokens": 631180442.0, + "step": 3760 + }, + { + "entropy": 1.680382361014684, + "epoch": 0.4131718436736151, + "grad_norm": 0.7185025215148926, + "learning_rate": 1.8569313823034743e-05, + "loss": 1.4073, + "mean_token_accuracy": 0.6465843468904495, + "num_tokens": 631387922.0, + "step": 3761 + }, + { + "entropy": 1.6748826801776886, + "epoch": 0.41328170058498803, + "grad_norm": 0.592786967754364, + "learning_rate": 1.8568447523123457e-05, + "loss": 1.5418, + "mean_token_accuracy": 0.6322498073180517, + "num_tokens": 631598046.0, + "step": 3762 + }, + { + "entropy": 1.7133816381295521, + "epoch": 0.413391557496361, + "grad_norm": 0.7715355157852173, + "learning_rate": 1.856758098367422e-05, + "loss": 1.2644, + "mean_token_accuracy": 0.6847187926371893, + "num_tokens": 631744350.0, + "step": 3763 + }, + { + "entropy": 1.7190758188565571, + "epoch": 0.4135014144077339, + "grad_norm": 0.7023261189460754, + "learning_rate": 1.8566714204714454e-05, + "loss": 1.3741, + "mean_token_accuracy": 0.6586879988511404, + "num_tokens": 631894493.0, + "step": 3764 + }, + { + "entropy": 1.7163704931735992, + "epoch": 0.41361127131910685, + "grad_norm": 0.8502719402313232, + "learning_rate": 1.8565847186271594e-05, + "loss": 1.5034, + "mean_token_accuracy": 0.6552699059247971, + "num_tokens": 632056905.0, + "step": 3765 + }, + { + "entropy": 1.679179718097051, + "epoch": 0.4137211282304798, + "grad_norm": 0.7581773400306702, + "learning_rate": 1.8564979928373083e-05, + "loss": 1.2715, + "mean_token_accuracy": 0.6689160714546839, + "num_tokens": 632218501.0, + "step": 3766 + }, + { + "entropy": 1.6978221833705902, + "epoch": 0.41383098514185274, + "grad_norm": 0.7345089316368103, + "learning_rate": 1.856411243104636e-05, + "loss": 1.345, + "mean_token_accuracy": 0.6770609468221664, + "num_tokens": 632370942.0, + "step": 3767 + }, + { + "entropy": 1.727396120627721, + "epoch": 0.4139408420532257, + "grad_norm": 0.6835429072380066, + "learning_rate": 1.856324469431889e-05, + "loss": 1.2722, + "mean_token_accuracy": 0.6730792969465256, + "num_tokens": 632518943.0, + "step": 3768 + }, + { + "entropy": 1.7132271230220795, + "epoch": 0.4140506989645986, + "grad_norm": 0.7705096006393433, + "learning_rate": 1.8562376718218133e-05, + "loss": 1.3787, + "mean_token_accuracy": 0.6688175052404404, + "num_tokens": 632642148.0, + "step": 3769 + }, + { + "entropy": 1.6458029548327129, + "epoch": 0.41416055587597156, + "grad_norm": 0.5635362267494202, + "learning_rate": 1.856150850277156e-05, + "loss": 1.4653, + "mean_token_accuracy": 0.6459923932949702, + "num_tokens": 632871437.0, + "step": 3770 + }, + { + "entropy": 1.7273524105548859, + "epoch": 0.4142704127873445, + "grad_norm": 0.7517685890197754, + "learning_rate": 1.8560640048006652e-05, + "loss": 1.347, + "mean_token_accuracy": 0.6664700706799825, + "num_tokens": 633005488.0, + "step": 3771 + }, + { + "entropy": 1.6999563177426655, + "epoch": 0.41438026969871744, + "grad_norm": 0.6293025016784668, + "learning_rate": 1.8559771353950893e-05, + "loss": 1.385, + "mean_token_accuracy": 0.6580439954996109, + "num_tokens": 633211034.0, + "step": 3772 + }, + { + "entropy": 1.737761527299881, + "epoch": 0.4144901266100904, + "grad_norm": 0.7360339164733887, + "learning_rate": 1.8558902420631776e-05, + "loss": 1.4929, + "mean_token_accuracy": 0.6542394310235977, + "num_tokens": 633363034.0, + "step": 3773 + }, + { + "entropy": 1.7658338248729706, + "epoch": 0.41459998352146327, + "grad_norm": 0.6309468150138855, + "learning_rate": 1.85580332480768e-05, + "loss": 1.3943, + "mean_token_accuracy": 0.6525468230247498, + "num_tokens": 633528700.0, + "step": 3774 + }, + { + "entropy": 1.677381157875061, + "epoch": 0.4147098404328362, + "grad_norm": 0.8243128657341003, + "learning_rate": 1.8557163836313486e-05, + "loss": 1.4033, + "mean_token_accuracy": 0.660991777976354, + "num_tokens": 633681640.0, + "step": 3775 + }, + { + "entropy": 1.7099827925364177, + "epoch": 0.41481969734420915, + "grad_norm": 0.5814919471740723, + "learning_rate": 1.8556294185369336e-05, + "loss": 1.3706, + "mean_token_accuracy": 0.6490457753340403, + "num_tokens": 633876385.0, + "step": 3776 + }, + { + "entropy": 1.7081574300924938, + "epoch": 0.4149295542555821, + "grad_norm": 0.7149261236190796, + "learning_rate": 1.855542429527188e-05, + "loss": 1.3391, + "mean_token_accuracy": 0.6678998519976934, + "num_tokens": 634025843.0, + "step": 3777 + }, + { + "entropy": 1.743098219235738, + "epoch": 0.41503941116695503, + "grad_norm": 0.6656703352928162, + "learning_rate": 1.8554554166048654e-05, + "loss": 1.4263, + "mean_token_accuracy": 0.6521339118480682, + "num_tokens": 634164379.0, + "step": 3778 + }, + { + "entropy": 1.7285428146521251, + "epoch": 0.41514926807832797, + "grad_norm": 0.6746429800987244, + "learning_rate": 1.8553683797727188e-05, + "loss": 1.4432, + "mean_token_accuracy": 0.6566011756658554, + "num_tokens": 634307434.0, + "step": 3779 + }, + { + "entropy": 1.6698183516661327, + "epoch": 0.4152591249897009, + "grad_norm": 0.8133582472801208, + "learning_rate": 1.8552813190335034e-05, + "loss": 1.301, + "mean_token_accuracy": 0.6644681245088577, + "num_tokens": 634444248.0, + "step": 3780 + }, + { + "entropy": 1.699441949526469, + "epoch": 0.41536898190107385, + "grad_norm": 0.8208682537078857, + "learning_rate": 1.855194234389975e-05, + "loss": 1.2526, + "mean_token_accuracy": 0.6720673541227976, + "num_tokens": 634586931.0, + "step": 3781 + }, + { + "entropy": 1.7422561248143513, + "epoch": 0.4154788388124468, + "grad_norm": 0.6837664246559143, + "learning_rate": 1.8551071258448892e-05, + "loss": 1.6275, + "mean_token_accuracy": 0.6357202082872391, + "num_tokens": 634794197.0, + "step": 3782 + }, + { + "entropy": 1.6618089973926544, + "epoch": 0.41558869572381973, + "grad_norm": 0.7491135001182556, + "learning_rate": 1.855019993401003e-05, + "loss": 1.3817, + "mean_token_accuracy": 0.6542213608821233, + "num_tokens": 634995369.0, + "step": 3783 + }, + { + "entropy": 1.7044015129407246, + "epoch": 0.4156985526351927, + "grad_norm": 0.613198459148407, + "learning_rate": 1.854932837061074e-05, + "loss": 1.4134, + "mean_token_accuracy": 0.6356542706489563, + "num_tokens": 635184727.0, + "step": 3784 + }, + { + "entropy": 1.671468476454417, + "epoch": 0.4158084095465656, + "grad_norm": 0.6908861994743347, + "learning_rate": 1.8548456568278616e-05, + "loss": 1.4499, + "mean_token_accuracy": 0.6709446410338084, + "num_tokens": 635343732.0, + "step": 3785 + }, + { + "entropy": 1.723008821407954, + "epoch": 0.41591826645793856, + "grad_norm": 0.6988131403923035, + "learning_rate": 1.8547584527041235e-05, + "loss": 1.4046, + "mean_token_accuracy": 0.6638121704260508, + "num_tokens": 635507468.0, + "step": 3786 + }, + { + "entropy": 1.6880267560482025, + "epoch": 0.4160281233693115, + "grad_norm": 0.6676950454711914, + "learning_rate": 1.8546712246926207e-05, + "loss": 1.2988, + "mean_token_accuracy": 0.6631821393966675, + "num_tokens": 635649868.0, + "step": 3787 + }, + { + "entropy": 1.7301316161950429, + "epoch": 0.4161379802806844, + "grad_norm": 0.7677087783813477, + "learning_rate": 1.854583972796114e-05, + "loss": 1.5427, + "mean_token_accuracy": 0.652143269777298, + "num_tokens": 635819810.0, + "step": 3788 + }, + { + "entropy": 1.6811153590679169, + "epoch": 0.4162478371920573, + "grad_norm": 0.6712203621864319, + "learning_rate": 1.8544966970173645e-05, + "loss": 1.3512, + "mean_token_accuracy": 0.6549779176712036, + "num_tokens": 635984503.0, + "step": 3789 + }, + { + "entropy": 1.7444909314314525, + "epoch": 0.41635769410343026, + "grad_norm": 0.6587154865264893, + "learning_rate": 1.8544093973591343e-05, + "loss": 1.4814, + "mean_token_accuracy": 0.6409175097942352, + "num_tokens": 636134931.0, + "step": 3790 + }, + { + "entropy": 1.6592991352081299, + "epoch": 0.4164675510148032, + "grad_norm": 0.5977832078933716, + "learning_rate": 1.854322073824187e-05, + "loss": 1.4304, + "mean_token_accuracy": 0.6406304885943731, + "num_tokens": 636318464.0, + "step": 3791 + }, + { + "entropy": 1.6651886999607086, + "epoch": 0.41657740792617615, + "grad_norm": 0.6955407857894897, + "learning_rate": 1.8542347264152855e-05, + "loss": 1.3467, + "mean_token_accuracy": 0.6668408364057541, + "num_tokens": 636459849.0, + "step": 3792 + }, + { + "entropy": 1.7133266230424244, + "epoch": 0.4166872648375491, + "grad_norm": 0.7298943996429443, + "learning_rate": 1.854147355135195e-05, + "loss": 1.569, + "mean_token_accuracy": 0.6349954207738241, + "num_tokens": 636634478.0, + "step": 3793 + }, + { + "entropy": 1.6949812173843384, + "epoch": 0.41679712174892203, + "grad_norm": 0.9010490775108337, + "learning_rate": 1.8540599599866806e-05, + "loss": 1.3849, + "mean_token_accuracy": 0.6672961960236231, + "num_tokens": 636753414.0, + "step": 3794 + }, + { + "entropy": 1.700540026028951, + "epoch": 0.41690697866029497, + "grad_norm": 0.7625768184661865, + "learning_rate": 1.853972540972508e-05, + "loss": 1.1909, + "mean_token_accuracy": 0.6855234503746033, + "num_tokens": 636855409.0, + "step": 3795 + }, + { + "entropy": 1.790819029013316, + "epoch": 0.4170168355716679, + "grad_norm": 0.6526350975036621, + "learning_rate": 1.8538850980954446e-05, + "loss": 1.5692, + "mean_token_accuracy": 0.6258653302987417, + "num_tokens": 637050334.0, + "step": 3796 + }, + { + "entropy": 1.7093652784824371, + "epoch": 0.41712669248304085, + "grad_norm": 0.5973331332206726, + "learning_rate": 1.8537976313582573e-05, + "loss": 1.5218, + "mean_token_accuracy": 0.6466895639896393, + "num_tokens": 637235606.0, + "step": 3797 + }, + { + "entropy": 1.6295418043931325, + "epoch": 0.4172365493944138, + "grad_norm": 0.6998474597930908, + "learning_rate": 1.853710140763715e-05, + "loss": 1.4507, + "mean_token_accuracy": 0.6505574633677801, + "num_tokens": 637439394.0, + "step": 3798 + }, + { + "entropy": 1.7180461982885997, + "epoch": 0.41734640630578673, + "grad_norm": 0.6476940512657166, + "learning_rate": 1.8536226263145857e-05, + "loss": 1.6517, + "mean_token_accuracy": 0.6157064388195673, + "num_tokens": 637652827.0, + "step": 3799 + }, + { + "entropy": 1.7250114878018696, + "epoch": 0.4174562632171597, + "grad_norm": 0.7295282483100891, + "learning_rate": 1.8535350880136403e-05, + "loss": 1.4413, + "mean_token_accuracy": 0.6517095665136973, + "num_tokens": 637839760.0, + "step": 3800 + }, + { + "entropy": 1.7239097158114116, + "epoch": 0.41756612012853256, + "grad_norm": 0.6322441697120667, + "learning_rate": 1.8534475258636488e-05, + "loss": 1.4804, + "mean_token_accuracy": 0.6419643859068552, + "num_tokens": 638075885.0, + "step": 3801 + }, + { + "entropy": 1.6781774560610454, + "epoch": 0.4176759770399055, + "grad_norm": 0.6442082524299622, + "learning_rate": 1.8533599398673826e-05, + "loss": 1.5032, + "mean_token_accuracy": 0.6423897991577784, + "num_tokens": 638270735.0, + "step": 3802 + }, + { + "entropy": 1.713065505027771, + "epoch": 0.41778583395127844, + "grad_norm": 0.5872926712036133, + "learning_rate": 1.853272330027614e-05, + "loss": 1.4092, + "mean_token_accuracy": 0.645249143242836, + "num_tokens": 638459204.0, + "step": 3803 + }, + { + "entropy": 1.6904393831888835, + "epoch": 0.4178956908626514, + "grad_norm": 0.6932823061943054, + "learning_rate": 1.8531846963471155e-05, + "loss": 1.2647, + "mean_token_accuracy": 0.674810583392779, + "num_tokens": 638574639.0, + "step": 3804 + }, + { + "entropy": 1.7162601053714752, + "epoch": 0.4180055477740243, + "grad_norm": 0.716964840888977, + "learning_rate": 1.8530970388286605e-05, + "loss": 1.4352, + "mean_token_accuracy": 0.6317101766665777, + "num_tokens": 638765017.0, + "step": 3805 + }, + { + "entropy": 1.6731017033259075, + "epoch": 0.41811540468539726, + "grad_norm": 0.6435312628746033, + "learning_rate": 1.853009357475024e-05, + "loss": 1.3129, + "mean_token_accuracy": 0.6804841359456381, + "num_tokens": 638896095.0, + "step": 3806 + }, + { + "entropy": 1.7432553172111511, + "epoch": 0.4182252615967702, + "grad_norm": 0.668488621711731, + "learning_rate": 1.8529216522889802e-05, + "loss": 1.2866, + "mean_token_accuracy": 0.6711077938477198, + "num_tokens": 639039152.0, + "step": 3807 + }, + { + "entropy": 1.715178112188975, + "epoch": 0.41833511850814314, + "grad_norm": 0.8195774555206299, + "learning_rate": 1.852833923273306e-05, + "loss": 1.3363, + "mean_token_accuracy": 0.664946511387825, + "num_tokens": 639171512.0, + "step": 3808 + }, + { + "entropy": 1.6638068159421284, + "epoch": 0.4184449754195161, + "grad_norm": 0.6470533013343811, + "learning_rate": 1.852746170430777e-05, + "loss": 1.2879, + "mean_token_accuracy": 0.6690946668386459, + "num_tokens": 639301924.0, + "step": 3809 + }, + { + "entropy": 1.7037453750769298, + "epoch": 0.418554832330889, + "grad_norm": 0.6436436772346497, + "learning_rate": 1.8526583937641708e-05, + "loss": 1.3852, + "mean_token_accuracy": 0.6688676526149114, + "num_tokens": 639463966.0, + "step": 3810 + }, + { + "entropy": 1.7554031908512115, + "epoch": 0.41866468924226197, + "grad_norm": 0.736873984336853, + "learning_rate": 1.8525705932762658e-05, + "loss": 1.6376, + "mean_token_accuracy": 0.6553641508022944, + "num_tokens": 639647344.0, + "step": 3811 + }, + { + "entropy": 1.7499909301598866, + "epoch": 0.4187745461536349, + "grad_norm": 0.6630018949508667, + "learning_rate": 1.8524827689698403e-05, + "loss": 1.3634, + "mean_token_accuracy": 0.6541923681894938, + "num_tokens": 639786116.0, + "step": 3812 + }, + { + "entropy": 1.65069513519605, + "epoch": 0.41888440306500785, + "grad_norm": 0.5848079919815063, + "learning_rate": 1.8523949208476744e-05, + "loss": 1.3979, + "mean_token_accuracy": 0.6475146114826202, + "num_tokens": 640023361.0, + "step": 3813 + }, + { + "entropy": 1.7403970857461293, + "epoch": 0.4189942599763808, + "grad_norm": 0.7221378684043884, + "learning_rate": 1.8523070489125484e-05, + "loss": 1.3454, + "mean_token_accuracy": 0.6649908721446991, + "num_tokens": 640193104.0, + "step": 3814 + }, + { + "entropy": 1.700227975845337, + "epoch": 0.4191041168877537, + "grad_norm": 0.6060642004013062, + "learning_rate": 1.8522191531672433e-05, + "loss": 1.2714, + "mean_token_accuracy": 0.6907776196797689, + "num_tokens": 640348073.0, + "step": 3815 + }, + { + "entropy": 1.7109603087107341, + "epoch": 0.4192139737991266, + "grad_norm": 0.650009036064148, + "learning_rate": 1.8521312336145406e-05, + "loss": 1.4307, + "mean_token_accuracy": 0.6513733565807343, + "num_tokens": 640537136.0, + "step": 3816 + }, + { + "entropy": 1.7343104382356007, + "epoch": 0.41932383071049956, + "grad_norm": 0.6624605655670166, + "learning_rate": 1.8520432902572238e-05, + "loss": 1.5207, + "mean_token_accuracy": 0.6583341757456461, + "num_tokens": 640755737.0, + "step": 3817 + }, + { + "entropy": 1.6712620953718822, + "epoch": 0.4194336876218725, + "grad_norm": 0.8070269823074341, + "learning_rate": 1.8519553230980755e-05, + "loss": 1.5578, + "mean_token_accuracy": 0.6397651135921478, + "num_tokens": 640971967.0, + "step": 3818 + }, + { + "entropy": 1.6985514958699544, + "epoch": 0.41954354453324544, + "grad_norm": 0.7821574211120605, + "learning_rate": 1.85186733213988e-05, + "loss": 1.2669, + "mean_token_accuracy": 0.6710014641284943, + "num_tokens": 641114791.0, + "step": 3819 + }, + { + "entropy": 1.7274354596932728, + "epoch": 0.4196534014446184, + "grad_norm": 0.7574983835220337, + "learning_rate": 1.8517793173854222e-05, + "loss": 1.4655, + "mean_token_accuracy": 0.6479006856679916, + "num_tokens": 641240015.0, + "step": 3820 + }, + { + "entropy": 1.773825873931249, + "epoch": 0.4197632583559913, + "grad_norm": 0.8301964998245239, + "learning_rate": 1.851691278837488e-05, + "loss": 1.3614, + "mean_token_accuracy": 0.6692739625771841, + "num_tokens": 641334970.0, + "step": 3821 + }, + { + "entropy": 1.6614450514316559, + "epoch": 0.41987311526736426, + "grad_norm": 0.6518927216529846, + "learning_rate": 1.8516032164988633e-05, + "loss": 1.4603, + "mean_token_accuracy": 0.6561418920755386, + "num_tokens": 641505654.0, + "step": 3822 + }, + { + "entropy": 1.7007905542850494, + "epoch": 0.4199829721787372, + "grad_norm": 0.6501317024230957, + "learning_rate": 1.8515151303723356e-05, + "loss": 1.4902, + "mean_token_accuracy": 0.6453222384055456, + "num_tokens": 641729379.0, + "step": 3823 + }, + { + "entropy": 1.7722167372703552, + "epoch": 0.42009282909011014, + "grad_norm": 0.7290734648704529, + "learning_rate": 1.851427020460693e-05, + "loss": 1.3669, + "mean_token_accuracy": 0.6568130205074946, + "num_tokens": 641850231.0, + "step": 3824 + }, + { + "entropy": 1.6922112007935841, + "epoch": 0.4202026860014831, + "grad_norm": 0.6940089464187622, + "learning_rate": 1.851338886766723e-05, + "loss": 1.3394, + "mean_token_accuracy": 0.6545537859201431, + "num_tokens": 642018167.0, + "step": 3825 + }, + { + "entropy": 1.7041266858577728, + "epoch": 0.420312542912856, + "grad_norm": 0.6359767317771912, + "learning_rate": 1.8512507292932164e-05, + "loss": 1.3692, + "mean_token_accuracy": 0.6600731213887533, + "num_tokens": 642225951.0, + "step": 3826 + }, + { + "entropy": 1.7149433890978496, + "epoch": 0.42042239982422897, + "grad_norm": 0.7300947308540344, + "learning_rate": 1.8511625480429626e-05, + "loss": 1.3976, + "mean_token_accuracy": 0.6626198341449102, + "num_tokens": 642380453.0, + "step": 3827 + }, + { + "entropy": 1.6327539483706157, + "epoch": 0.4205322567356019, + "grad_norm": 0.9952742457389832, + "learning_rate": 1.851074343018753e-05, + "loss": 1.2784, + "mean_token_accuracy": 0.6821300486723582, + "num_tokens": 642539722.0, + "step": 3828 + }, + { + "entropy": 1.700599084297816, + "epoch": 0.4206421136469748, + "grad_norm": 0.7428448796272278, + "learning_rate": 1.8509861142233783e-05, + "loss": 1.4237, + "mean_token_accuracy": 0.6547687749067942, + "num_tokens": 642698818.0, + "step": 3829 + }, + { + "entropy": 1.7721853852272034, + "epoch": 0.42075197055834773, + "grad_norm": 0.6790313124656677, + "learning_rate": 1.8508978616596318e-05, + "loss": 1.3894, + "mean_token_accuracy": 0.6521950215101242, + "num_tokens": 642883257.0, + "step": 3830 + }, + { + "entropy": 1.6714214185873668, + "epoch": 0.4208618274697207, + "grad_norm": 0.6689066290855408, + "learning_rate": 1.8508095853303064e-05, + "loss": 1.2784, + "mean_token_accuracy": 0.6672136187553406, + "num_tokens": 643011025.0, + "step": 3831 + }, + { + "entropy": 1.7504200140635173, + "epoch": 0.4209716843810936, + "grad_norm": 0.6306473016738892, + "learning_rate": 1.8507212852381958e-05, + "loss": 1.3652, + "mean_token_accuracy": 0.6601114968458811, + "num_tokens": 643175908.0, + "step": 3832 + }, + { + "entropy": 1.6559196809927623, + "epoch": 0.42108154129246655, + "grad_norm": 0.6528786420822144, + "learning_rate": 1.8506329613860944e-05, + "loss": 1.3326, + "mean_token_accuracy": 0.664582168062528, + "num_tokens": 643328930.0, + "step": 3833 + }, + { + "entropy": 1.7539990444978077, + "epoch": 0.4211913982038395, + "grad_norm": 0.910399854183197, + "learning_rate": 1.8505446137767984e-05, + "loss": 1.4594, + "mean_token_accuracy": 0.6541984180609385, + "num_tokens": 643510121.0, + "step": 3834 + }, + { + "entropy": 1.7388847768306732, + "epoch": 0.42130125511521244, + "grad_norm": 0.6952354907989502, + "learning_rate": 1.8504562424131035e-05, + "loss": 1.5782, + "mean_token_accuracy": 0.6242658942937851, + "num_tokens": 643682378.0, + "step": 3835 + }, + { + "entropy": 1.7046812276045482, + "epoch": 0.4214111120265854, + "grad_norm": 0.7153732180595398, + "learning_rate": 1.8503678472978072e-05, + "loss": 1.5552, + "mean_token_accuracy": 0.6502560079097748, + "num_tokens": 643904701.0, + "step": 3836 + }, + { + "entropy": 1.7115220228830974, + "epoch": 0.4215209689379583, + "grad_norm": 0.7493833899497986, + "learning_rate": 1.8502794284337063e-05, + "loss": 1.3032, + "mean_token_accuracy": 0.6713364919026693, + "num_tokens": 644010789.0, + "step": 3837 + }, + { + "entropy": 1.7032727301120758, + "epoch": 0.42163082584933126, + "grad_norm": 0.7664533257484436, + "learning_rate": 1.8501909858235996e-05, + "loss": 1.2455, + "mean_token_accuracy": 0.6849518169959387, + "num_tokens": 644123734.0, + "step": 3838 + }, + { + "entropy": 1.7397787670294445, + "epoch": 0.4217406827607042, + "grad_norm": 0.5922208428382874, + "learning_rate": 1.850102519470286e-05, + "loss": 1.3723, + "mean_token_accuracy": 0.6501923749844233, + "num_tokens": 644283398.0, + "step": 3839 + }, + { + "entropy": 1.6781314810117085, + "epoch": 0.42185053967207714, + "grad_norm": 0.6052653193473816, + "learning_rate": 1.8500140293765655e-05, + "loss": 1.4668, + "mean_token_accuracy": 0.6537490636110306, + "num_tokens": 644454020.0, + "step": 3840 + }, + { + "entropy": 1.6567552785078685, + "epoch": 0.4219603965834501, + "grad_norm": 0.5396919846534729, + "learning_rate": 1.8499255155452397e-05, + "loss": 1.5281, + "mean_token_accuracy": 0.6358696967363358, + "num_tokens": 644708504.0, + "step": 3841 + }, + { + "entropy": 1.7547682126363118, + "epoch": 0.42207025349482297, + "grad_norm": 0.6936992406845093, + "learning_rate": 1.8498369779791085e-05, + "loss": 1.3401, + "mean_token_accuracy": 0.6526836852232615, + "num_tokens": 644878677.0, + "step": 3842 + }, + { + "entropy": 1.693197379509608, + "epoch": 0.4221801104061959, + "grad_norm": 0.7279648780822754, + "learning_rate": 1.8497484166809752e-05, + "loss": 1.3146, + "mean_token_accuracy": 0.6758704036474228, + "num_tokens": 645019940.0, + "step": 3843 + }, + { + "entropy": 1.69556125998497, + "epoch": 0.42228996731756885, + "grad_norm": 0.6791149377822876, + "learning_rate": 1.8496598316536425e-05, + "loss": 1.3299, + "mean_token_accuracy": 0.6537687480449677, + "num_tokens": 645159667.0, + "step": 3844 + }, + { + "entropy": 1.6956780850887299, + "epoch": 0.4223998242289418, + "grad_norm": 0.782292902469635, + "learning_rate": 1.8495712228999138e-05, + "loss": 1.3682, + "mean_token_accuracy": 0.6713423679272333, + "num_tokens": 645279821.0, + "step": 3845 + }, + { + "entropy": 1.728989193836848, + "epoch": 0.42250968114031473, + "grad_norm": 0.6862851977348328, + "learning_rate": 1.8494825904225933e-05, + "loss": 1.3393, + "mean_token_accuracy": 0.663465549548467, + "num_tokens": 645414111.0, + "step": 3846 + }, + { + "entropy": 1.7368880609671276, + "epoch": 0.42261953805168767, + "grad_norm": 1.014003872871399, + "learning_rate": 1.8493939342244868e-05, + "loss": 1.394, + "mean_token_accuracy": 0.6592635711034139, + "num_tokens": 645560546.0, + "step": 3847 + }, + { + "entropy": 1.675550679365794, + "epoch": 0.4227293949630606, + "grad_norm": 0.7420448660850525, + "learning_rate": 1.8493052543084e-05, + "loss": 1.2888, + "mean_token_accuracy": 0.6641269127527872, + "num_tokens": 645690733.0, + "step": 3848 + }, + { + "entropy": 1.6391872266928356, + "epoch": 0.42283925187443355, + "grad_norm": 0.7968411445617676, + "learning_rate": 1.84921655067714e-05, + "loss": 1.3562, + "mean_token_accuracy": 0.6737810671329498, + "num_tokens": 645852109.0, + "step": 3849 + }, + { + "entropy": 1.6818280915419261, + "epoch": 0.4229491087858065, + "grad_norm": 0.7735076546669006, + "learning_rate": 1.849127823333513e-05, + "loss": 1.4612, + "mean_token_accuracy": 0.6462418337663015, + "num_tokens": 646007934.0, + "step": 3850 + }, + { + "entropy": 1.7462623516718547, + "epoch": 0.42305896569717943, + "grad_norm": 0.7541219592094421, + "learning_rate": 1.849039072280328e-05, + "loss": 1.4529, + "mean_token_accuracy": 0.6520050664742788, + "num_tokens": 646170910.0, + "step": 3851 + }, + { + "entropy": 1.6965330342451732, + "epoch": 0.4231688226085524, + "grad_norm": 0.7076205611228943, + "learning_rate": 1.8489502975203945e-05, + "loss": 1.6429, + "mean_token_accuracy": 0.6315357536077499, + "num_tokens": 646372868.0, + "step": 3852 + }, + { + "entropy": 1.6979803641637166, + "epoch": 0.4232786795199253, + "grad_norm": 0.7357332706451416, + "learning_rate": 1.8488614990565214e-05, + "loss": 1.3529, + "mean_token_accuracy": 0.6634115974108378, + "num_tokens": 646543074.0, + "step": 3853 + }, + { + "entropy": 1.722920149564743, + "epoch": 0.42338853643129826, + "grad_norm": 0.8576663732528687, + "learning_rate": 1.8487726768915192e-05, + "loss": 1.5067, + "mean_token_accuracy": 0.6446111053228378, + "num_tokens": 646707309.0, + "step": 3854 + }, + { + "entropy": 1.7091480791568756, + "epoch": 0.4234983933426712, + "grad_norm": 0.6144663095474243, + "learning_rate": 1.848683831028199e-05, + "loss": 1.4944, + "mean_token_accuracy": 0.6384020894765854, + "num_tokens": 646886585.0, + "step": 3855 + }, + { + "entropy": 1.6747375428676605, + "epoch": 0.4236082502540441, + "grad_norm": 0.81898033618927, + "learning_rate": 1.8485949614693727e-05, + "loss": 1.1378, + "mean_token_accuracy": 0.6897122313578924, + "num_tokens": 647014518.0, + "step": 3856 + }, + { + "entropy": 1.7361929814020793, + "epoch": 0.423718107165417, + "grad_norm": 0.7163565158843994, + "learning_rate": 1.8485060682178537e-05, + "loss": 1.3322, + "mean_token_accuracy": 0.6657196134328842, + "num_tokens": 647213559.0, + "step": 3857 + }, + { + "entropy": 1.7410944600900014, + "epoch": 0.42382796407678996, + "grad_norm": 0.7472032904624939, + "learning_rate": 1.848417151276455e-05, + "loss": 1.3762, + "mean_token_accuracy": 0.6659711500008901, + "num_tokens": 647367678.0, + "step": 3858 + }, + { + "entropy": 1.681091417868932, + "epoch": 0.4239378209881629, + "grad_norm": 1.138753890991211, + "learning_rate": 1.8483282106479902e-05, + "loss": 1.4024, + "mean_token_accuracy": 0.6617006063461304, + "num_tokens": 647554358.0, + "step": 3859 + }, + { + "entropy": 1.7275851269563038, + "epoch": 0.42404767789953585, + "grad_norm": 0.6139320135116577, + "learning_rate": 1.848239246335275e-05, + "loss": 1.4733, + "mean_token_accuracy": 0.6429044504960378, + "num_tokens": 647728748.0, + "step": 3860 + }, + { + "entropy": 1.6915673911571503, + "epoch": 0.4241575348109088, + "grad_norm": 0.6831756234169006, + "learning_rate": 1.8481502583411247e-05, + "loss": 1.3334, + "mean_token_accuracy": 0.6636428534984589, + "num_tokens": 647878607.0, + "step": 3861 + }, + { + "entropy": 1.750975062449773, + "epoch": 0.42426739172228173, + "grad_norm": 0.7183418273925781, + "learning_rate": 1.848061246668356e-05, + "loss": 1.5977, + "mean_token_accuracy": 0.6374113808075587, + "num_tokens": 648024338.0, + "step": 3862 + }, + { + "entropy": 1.7088340322176616, + "epoch": 0.42437724863365467, + "grad_norm": 0.7100759744644165, + "learning_rate": 1.847972211319786e-05, + "loss": 1.3605, + "mean_token_accuracy": 0.6605943193038305, + "num_tokens": 648196447.0, + "step": 3863 + }, + { + "entropy": 1.6845079759756725, + "epoch": 0.4244871055450276, + "grad_norm": 0.687178909778595, + "learning_rate": 1.8478831522982324e-05, + "loss": 1.416, + "mean_token_accuracy": 0.6500076999266943, + "num_tokens": 648383053.0, + "step": 3864 + }, + { + "entropy": 1.7246264616648357, + "epoch": 0.42459696245640055, + "grad_norm": 0.7078330516815186, + "learning_rate": 1.847794069606514e-05, + "loss": 1.5448, + "mean_token_accuracy": 0.6318613439798355, + "num_tokens": 648584553.0, + "step": 3865 + }, + { + "entropy": 1.7536778251330059, + "epoch": 0.4247068193677735, + "grad_norm": 0.7490545511245728, + "learning_rate": 1.8477049632474508e-05, + "loss": 1.4708, + "mean_token_accuracy": 0.6561292608579, + "num_tokens": 648745917.0, + "step": 3866 + }, + { + "entropy": 1.6898792386054993, + "epoch": 0.42481667627914643, + "grad_norm": 0.7261310815811157, + "learning_rate": 1.8476158332238617e-05, + "loss": 1.3375, + "mean_token_accuracy": 0.6690275917450587, + "num_tokens": 648890450.0, + "step": 3867 + }, + { + "entropy": 1.710007667541504, + "epoch": 0.4249265331905194, + "grad_norm": 0.7561746835708618, + "learning_rate": 1.8475266795385685e-05, + "loss": 1.3708, + "mean_token_accuracy": 0.6579129894574484, + "num_tokens": 649015823.0, + "step": 3868 + }, + { + "entropy": 1.719689855972926, + "epoch": 0.42503639010189226, + "grad_norm": 0.7121495604515076, + "learning_rate": 1.8474375021943932e-05, + "loss": 1.2898, + "mean_token_accuracy": 0.6680507610241572, + "num_tokens": 649190115.0, + "step": 3869 + }, + { + "entropy": 1.7542536358038585, + "epoch": 0.4251462470132652, + "grad_norm": 0.7546373605728149, + "learning_rate": 1.8473483011941574e-05, + "loss": 1.3253, + "mean_token_accuracy": 0.6552920093139013, + "num_tokens": 649304401.0, + "step": 3870 + }, + { + "entropy": 1.716896414756775, + "epoch": 0.42525610392463814, + "grad_norm": 0.6879488825798035, + "learning_rate": 1.8472590765406845e-05, + "loss": 1.4352, + "mean_token_accuracy": 0.6727662235498428, + "num_tokens": 649459866.0, + "step": 3871 + }, + { + "entropy": 1.6816608607769012, + "epoch": 0.4253659608360111, + "grad_norm": 0.8443351984024048, + "learning_rate": 1.847169828236799e-05, + "loss": 1.3892, + "mean_token_accuracy": 0.6620204697052637, + "num_tokens": 649626901.0, + "step": 3872 + }, + { + "entropy": 1.740911195675532, + "epoch": 0.425475817747384, + "grad_norm": 0.7434075474739075, + "learning_rate": 1.8470805562853244e-05, + "loss": 1.5953, + "mean_token_accuracy": 0.6165072818597158, + "num_tokens": 649809343.0, + "step": 3873 + }, + { + "entropy": 1.7113256255785625, + "epoch": 0.42558567465875696, + "grad_norm": 0.8285733461380005, + "learning_rate": 1.846991260689087e-05, + "loss": 1.3399, + "mean_token_accuracy": 0.665142834186554, + "num_tokens": 649991640.0, + "step": 3874 + }, + { + "entropy": 1.6987220545609791, + "epoch": 0.4256955315701299, + "grad_norm": 0.6770405173301697, + "learning_rate": 1.8469019414509136e-05, + "loss": 1.4514, + "mean_token_accuracy": 0.6441337664922079, + "num_tokens": 650164246.0, + "step": 3875 + }, + { + "entropy": 1.7160173257191975, + "epoch": 0.42580538848150284, + "grad_norm": 0.7959917187690735, + "learning_rate": 1.8468125985736295e-05, + "loss": 1.5868, + "mean_token_accuracy": 0.6459453006585439, + "num_tokens": 650305891.0, + "step": 3876 + }, + { + "entropy": 1.6745306452115376, + "epoch": 0.4259152453928758, + "grad_norm": 0.6555696725845337, + "learning_rate": 1.8467232320600638e-05, + "loss": 1.4623, + "mean_token_accuracy": 0.6556547085444132, + "num_tokens": 650507562.0, + "step": 3877 + }, + { + "entropy": 1.7178467512130737, + "epoch": 0.4260251023042487, + "grad_norm": 0.6489704847335815, + "learning_rate": 1.846633841913044e-05, + "loss": 1.3224, + "mean_token_accuracy": 0.6665374338626862, + "num_tokens": 650667834.0, + "step": 3878 + }, + { + "entropy": 1.6714246372381847, + "epoch": 0.42613495921562167, + "grad_norm": 0.7068523168563843, + "learning_rate": 1.8465444281353992e-05, + "loss": 1.4546, + "mean_token_accuracy": 0.6516213566064835, + "num_tokens": 650840788.0, + "step": 3879 + }, + { + "entropy": 1.7186284760634105, + "epoch": 0.4262448161269946, + "grad_norm": 0.7457088828086853, + "learning_rate": 1.84645499072996e-05, + "loss": 1.2781, + "mean_token_accuracy": 0.672358974814415, + "num_tokens": 650941361.0, + "step": 3880 + }, + { + "entropy": 1.6859534084796906, + "epoch": 0.42635467303836755, + "grad_norm": 0.679847240447998, + "learning_rate": 1.8463655296995567e-05, + "loss": 1.3869, + "mean_token_accuracy": 0.6544978270928065, + "num_tokens": 651105965.0, + "step": 3881 + }, + { + "entropy": 1.7387334704399109, + "epoch": 0.4264645299497405, + "grad_norm": 0.8297735452651978, + "learning_rate": 1.8462760450470207e-05, + "loss": 1.4147, + "mean_token_accuracy": 0.6685324857632319, + "num_tokens": 651242560.0, + "step": 3882 + }, + { + "entropy": 1.808528443177541, + "epoch": 0.4265743868611134, + "grad_norm": 0.7231261730194092, + "learning_rate": 1.846186536775184e-05, + "loss": 1.5466, + "mean_token_accuracy": 0.6377104272445043, + "num_tokens": 651430287.0, + "step": 3883 + }, + { + "entropy": 1.7021079659461975, + "epoch": 0.4266842437724863, + "grad_norm": 0.7810244560241699, + "learning_rate": 1.84609700488688e-05, + "loss": 1.2778, + "mean_token_accuracy": 0.6832303404808044, + "num_tokens": 651559583.0, + "step": 3884 + }, + { + "entropy": 1.689590334892273, + "epoch": 0.42679410068385926, + "grad_norm": 0.6838991045951843, + "learning_rate": 1.8460074493849416e-05, + "loss": 1.2951, + "mean_token_accuracy": 0.6713648786147436, + "num_tokens": 651691973.0, + "step": 3885 + }, + { + "entropy": 1.714994877576828, + "epoch": 0.4269039575952322, + "grad_norm": 0.6699170470237732, + "learning_rate": 1.8459178702722037e-05, + "loss": 1.2707, + "mean_token_accuracy": 0.6692277739445368, + "num_tokens": 651816035.0, + "step": 3886 + }, + { + "entropy": 1.729461799065272, + "epoch": 0.42701381450660514, + "grad_norm": 0.6075051426887512, + "learning_rate": 1.8458282675515016e-05, + "loss": 1.3528, + "mean_token_accuracy": 0.6616584012905756, + "num_tokens": 652024333.0, + "step": 3887 + }, + { + "entropy": 1.6672471364339192, + "epoch": 0.4271236714179781, + "grad_norm": 0.6613723039627075, + "learning_rate": 1.8457386412256704e-05, + "loss": 1.3707, + "mean_token_accuracy": 0.6597268283367157, + "num_tokens": 652168775.0, + "step": 3888 + }, + { + "entropy": 1.662774880727132, + "epoch": 0.427233528329351, + "grad_norm": 0.7188613414764404, + "learning_rate": 1.8456489912975477e-05, + "loss": 1.4895, + "mean_token_accuracy": 0.6598201990127563, + "num_tokens": 652318630.0, + "step": 3889 + }, + { + "entropy": 1.7089114785194397, + "epoch": 0.42734338524072396, + "grad_norm": 0.7041028141975403, + "learning_rate": 1.8455593177699704e-05, + "loss": 1.4792, + "mean_token_accuracy": 0.6515317956606547, + "num_tokens": 652480939.0, + "step": 3890 + }, + { + "entropy": 1.706920713186264, + "epoch": 0.4274532421520969, + "grad_norm": 0.7197327613830566, + "learning_rate": 1.845469620645776e-05, + "loss": 1.441, + "mean_token_accuracy": 0.6517567286888758, + "num_tokens": 652646579.0, + "step": 3891 + }, + { + "entropy": 1.7612548073132832, + "epoch": 0.42756309906346984, + "grad_norm": 0.78521329164505, + "learning_rate": 1.8453798999278047e-05, + "loss": 1.5184, + "mean_token_accuracy": 0.6430316617091497, + "num_tokens": 652805441.0, + "step": 3892 + }, + { + "entropy": 1.711044450600942, + "epoch": 0.4276729559748428, + "grad_norm": 0.624224066734314, + "learning_rate": 1.8452901556188952e-05, + "loss": 1.4991, + "mean_token_accuracy": 0.6483441591262817, + "num_tokens": 652968733.0, + "step": 3893 + }, + { + "entropy": 1.7002309560775757, + "epoch": 0.4277828128862157, + "grad_norm": 0.6248944997787476, + "learning_rate": 1.845200387721888e-05, + "loss": 1.4019, + "mean_token_accuracy": 0.6617890248696009, + "num_tokens": 653141796.0, + "step": 3894 + }, + { + "entropy": 1.705695738395055, + "epoch": 0.42789266979758867, + "grad_norm": 0.6925023198127747, + "learning_rate": 1.8451105962396247e-05, + "loss": 1.3145, + "mean_token_accuracy": 0.6632231523593267, + "num_tokens": 653294696.0, + "step": 3895 + }, + { + "entropy": 1.751973956823349, + "epoch": 0.42800252670896155, + "grad_norm": 0.6819112300872803, + "learning_rate": 1.845020781174947e-05, + "loss": 1.3375, + "mean_token_accuracy": 0.6607561757167181, + "num_tokens": 653435104.0, + "step": 3896 + }, + { + "entropy": 1.686410774787267, + "epoch": 0.4281123836203345, + "grad_norm": 0.6627749800682068, + "learning_rate": 1.8449309425306963e-05, + "loss": 1.2681, + "mean_token_accuracy": 0.6718742549419403, + "num_tokens": 653540901.0, + "step": 3897 + }, + { + "entropy": 1.7340122958024342, + "epoch": 0.42822224053170743, + "grad_norm": 0.7688063979148865, + "learning_rate": 1.8448410803097177e-05, + "loss": 1.3647, + "mean_token_accuracy": 0.6578405052423477, + "num_tokens": 653723766.0, + "step": 3898 + }, + { + "entropy": 1.71702042222023, + "epoch": 0.4283320974430804, + "grad_norm": 0.7477086186408997, + "learning_rate": 1.8447511945148544e-05, + "loss": 1.3483, + "mean_token_accuracy": 0.65413269897302, + "num_tokens": 653885687.0, + "step": 3899 + }, + { + "entropy": 1.7188432812690735, + "epoch": 0.4284419543544533, + "grad_norm": 0.7725921273231506, + "learning_rate": 1.8446612851489513e-05, + "loss": 1.3839, + "mean_token_accuracy": 0.6639880041281382, + "num_tokens": 654016294.0, + "step": 3900 + }, + { + "entropy": 1.6381129622459412, + "epoch": 0.42855181126582625, + "grad_norm": 0.6253584027290344, + "learning_rate": 1.844571352214854e-05, + "loss": 1.3036, + "mean_token_accuracy": 0.6645344942808151, + "num_tokens": 654174210.0, + "step": 3901 + }, + { + "entropy": 1.7396024366219838, + "epoch": 0.4286616681771992, + "grad_norm": 0.6562190055847168, + "learning_rate": 1.8444813957154094e-05, + "loss": 1.4815, + "mean_token_accuracy": 0.6449996630350748, + "num_tokens": 654341703.0, + "step": 3902 + }, + { + "entropy": 1.7696809967358906, + "epoch": 0.42877152508857214, + "grad_norm": 0.7089744806289673, + "learning_rate": 1.8443914156534636e-05, + "loss": 1.338, + "mean_token_accuracy": 0.664988378683726, + "num_tokens": 654454220.0, + "step": 3903 + }, + { + "entropy": 1.7581712106863658, + "epoch": 0.4288813819999451, + "grad_norm": 0.7849205732345581, + "learning_rate": 1.8443014120318653e-05, + "loss": 1.2782, + "mean_token_accuracy": 0.6888795097668966, + "num_tokens": 654565966.0, + "step": 3904 + }, + { + "entropy": 1.6995848814646404, + "epoch": 0.428991238911318, + "grad_norm": 0.7430975437164307, + "learning_rate": 1.844211384853462e-05, + "loss": 1.5033, + "mean_token_accuracy": 0.6458527992169062, + "num_tokens": 654706153.0, + "step": 3905 + }, + { + "entropy": 1.7555510600407918, + "epoch": 0.42910109582269096, + "grad_norm": 0.6673111319541931, + "learning_rate": 1.8441213341211042e-05, + "loss": 1.3924, + "mean_token_accuracy": 0.6652990728616714, + "num_tokens": 654923563.0, + "step": 3906 + }, + { + "entropy": 1.6870744427045186, + "epoch": 0.4292109527340639, + "grad_norm": 0.6765327453613281, + "learning_rate": 1.8440312598376417e-05, + "loss": 1.4085, + "mean_token_accuracy": 0.6474004884560903, + "num_tokens": 655082747.0, + "step": 3907 + }, + { + "entropy": 1.8040563662846882, + "epoch": 0.42932080964543684, + "grad_norm": 0.7210556268692017, + "learning_rate": 1.843941162005925e-05, + "loss": 1.4306, + "mean_token_accuracy": 0.648560548822085, + "num_tokens": 655255550.0, + "step": 3908 + }, + { + "entropy": 1.730605661869049, + "epoch": 0.4294306665568098, + "grad_norm": 0.7293521165847778, + "learning_rate": 1.8438510406288054e-05, + "loss": 1.4941, + "mean_token_accuracy": 0.6423317690690359, + "num_tokens": 655410501.0, + "step": 3909 + }, + { + "entropy": 1.6763539016246796, + "epoch": 0.42954052346818267, + "grad_norm": 0.5762035250663757, + "learning_rate": 1.8437608957091356e-05, + "loss": 1.4347, + "mean_token_accuracy": 0.6584045539299647, + "num_tokens": 655612794.0, + "step": 3910 + }, + { + "entropy": 1.6941528419653575, + "epoch": 0.4296503803795556, + "grad_norm": 0.7608525156974792, + "learning_rate": 1.8436707272497687e-05, + "loss": 1.4372, + "mean_token_accuracy": 0.6504789739847183, + "num_tokens": 655775219.0, + "step": 3911 + }, + { + "entropy": 1.6654709080855052, + "epoch": 0.42976023729092855, + "grad_norm": 0.6557295918464661, + "learning_rate": 1.8435805352535588e-05, + "loss": 1.3395, + "mean_token_accuracy": 0.6681905190149943, + "num_tokens": 655919108.0, + "step": 3912 + }, + { + "entropy": 1.6901763478914897, + "epoch": 0.4298700942023015, + "grad_norm": 0.8068464994430542, + "learning_rate": 1.8434903197233594e-05, + "loss": 1.2511, + "mean_token_accuracy": 0.6827605366706848, + "num_tokens": 656042827.0, + "step": 3913 + }, + { + "entropy": 1.7511335114638011, + "epoch": 0.42997995111367443, + "grad_norm": 0.7122650146484375, + "learning_rate": 1.843400080662027e-05, + "loss": 1.4982, + "mean_token_accuracy": 0.6384455660978953, + "num_tokens": 656224813.0, + "step": 3914 + }, + { + "entropy": 1.7839158376057942, + "epoch": 0.43008980802504737, + "grad_norm": 0.581892192363739, + "learning_rate": 1.8433098180724165e-05, + "loss": 1.4154, + "mean_token_accuracy": 0.6490354090929031, + "num_tokens": 656425344.0, + "step": 3915 + }, + { + "entropy": 1.7053211430708568, + "epoch": 0.4301996649364203, + "grad_norm": 0.8363472819328308, + "learning_rate": 1.8432195319573855e-05, + "loss": 1.399, + "mean_token_accuracy": 0.6475622256596884, + "num_tokens": 656602663.0, + "step": 3916 + }, + { + "entropy": 1.711295525232951, + "epoch": 0.43030952184779325, + "grad_norm": 0.8222143054008484, + "learning_rate": 1.843129222319791e-05, + "loss": 1.5267, + "mean_token_accuracy": 0.6486119305094084, + "num_tokens": 656744412.0, + "step": 3917 + }, + { + "entropy": 1.703042854865392, + "epoch": 0.4304193787591662, + "grad_norm": 0.612709641456604, + "learning_rate": 1.8430388891624915e-05, + "loss": 1.505, + "mean_token_accuracy": 0.6417644868294398, + "num_tokens": 656950229.0, + "step": 3918 + }, + { + "entropy": 1.6889925201733906, + "epoch": 0.43052923567053913, + "grad_norm": 0.711890459060669, + "learning_rate": 1.8429485324883464e-05, + "loss": 1.4388, + "mean_token_accuracy": 0.6587661306063334, + "num_tokens": 657105618.0, + "step": 3919 + }, + { + "entropy": 1.688792844613393, + "epoch": 0.4306390925819121, + "grad_norm": 0.6642475128173828, + "learning_rate": 1.8428581523002146e-05, + "loss": 1.3509, + "mean_token_accuracy": 0.6492424011230469, + "num_tokens": 657267221.0, + "step": 3920 + }, + { + "entropy": 1.6864960094292958, + "epoch": 0.430748949493285, + "grad_norm": 0.6970334053039551, + "learning_rate": 1.842767748600957e-05, + "loss": 1.5943, + "mean_token_accuracy": 0.6339181611935297, + "num_tokens": 657467480.0, + "step": 3921 + }, + { + "entropy": 1.7039388716220856, + "epoch": 0.43085880640465796, + "grad_norm": 0.6595334410667419, + "learning_rate": 1.842677321393435e-05, + "loss": 1.4437, + "mean_token_accuracy": 0.6527088582515717, + "num_tokens": 657643535.0, + "step": 3922 + }, + { + "entropy": 1.7325763603051503, + "epoch": 0.43096866331603084, + "grad_norm": 0.7706548571586609, + "learning_rate": 1.8425868706805103e-05, + "loss": 1.5503, + "mean_token_accuracy": 0.6559620996316274, + "num_tokens": 657831508.0, + "step": 3923 + }, + { + "entropy": 1.6889927089214325, + "epoch": 0.4310785202274038, + "grad_norm": 0.8752990365028381, + "learning_rate": 1.842496396465046e-05, + "loss": 1.4015, + "mean_token_accuracy": 0.6509895920753479, + "num_tokens": 658002447.0, + "step": 3924 + }, + { + "entropy": 1.6617620189984639, + "epoch": 0.4311883771387767, + "grad_norm": 1.0145397186279297, + "learning_rate": 1.842405898749905e-05, + "loss": 1.3561, + "mean_token_accuracy": 0.6678018321593603, + "num_tokens": 658178958.0, + "step": 3925 + }, + { + "entropy": 1.7041081885496776, + "epoch": 0.43129823405014966, + "grad_norm": 0.8633971214294434, + "learning_rate": 1.842315377537952e-05, + "loss": 1.2597, + "mean_token_accuracy": 0.6791882663965225, + "num_tokens": 658312769.0, + "step": 3926 + }, + { + "entropy": 1.6727077662944794, + "epoch": 0.4314080909615226, + "grad_norm": 0.7873262166976929, + "learning_rate": 1.842224832832052e-05, + "loss": 1.3808, + "mean_token_accuracy": 0.6544581105311712, + "num_tokens": 658453178.0, + "step": 3927 + }, + { + "entropy": 1.7058905164400737, + "epoch": 0.43151794787289555, + "grad_norm": 0.6460711359977722, + "learning_rate": 1.8421342646350704e-05, + "loss": 1.36, + "mean_token_accuracy": 0.6649837543567022, + "num_tokens": 658667304.0, + "step": 3928 + }, + { + "entropy": 1.6966310739517212, + "epoch": 0.4316278047842685, + "grad_norm": 0.675844669342041, + "learning_rate": 1.8420436729498736e-05, + "loss": 1.4495, + "mean_token_accuracy": 0.6551729142665863, + "num_tokens": 658824408.0, + "step": 3929 + }, + { + "entropy": 1.7804734110832214, + "epoch": 0.43173766169564143, + "grad_norm": 0.5847294926643372, + "learning_rate": 1.841953057779329e-05, + "loss": 1.4555, + "mean_token_accuracy": 0.6308817764123281, + "num_tokens": 659070341.0, + "step": 3930 + }, + { + "entropy": 1.775219549735387, + "epoch": 0.43184751860701437, + "grad_norm": 0.6830585598945618, + "learning_rate": 1.8418624191263047e-05, + "loss": 1.552, + "mean_token_accuracy": 0.6412175844113032, + "num_tokens": 659256021.0, + "step": 3931 + }, + { + "entropy": 1.711199273665746, + "epoch": 0.4319573755183873, + "grad_norm": 0.7030532956123352, + "learning_rate": 1.8417717569936688e-05, + "loss": 1.4169, + "mean_token_accuracy": 0.654847651720047, + "num_tokens": 659433299.0, + "step": 3932 + }, + { + "entropy": 1.7225093841552734, + "epoch": 0.43206723242976025, + "grad_norm": 0.7642560601234436, + "learning_rate": 1.841681071384291e-05, + "loss": 1.5811, + "mean_token_accuracy": 0.6175984516739845, + "num_tokens": 659652811.0, + "step": 3933 + }, + { + "entropy": 1.6770412425200145, + "epoch": 0.4321770893411332, + "grad_norm": 0.634323000907898, + "learning_rate": 1.8415903623010415e-05, + "loss": 1.5018, + "mean_token_accuracy": 0.6280664106210073, + "num_tokens": 659865805.0, + "step": 3934 + }, + { + "entropy": 1.7322843472162883, + "epoch": 0.43228694625250613, + "grad_norm": 0.9006372094154358, + "learning_rate": 1.8414996297467917e-05, + "loss": 1.3899, + "mean_token_accuracy": 0.6665350049734116, + "num_tokens": 660052306.0, + "step": 3935 + }, + { + "entropy": 1.7359480261802673, + "epoch": 0.4323968031638791, + "grad_norm": 0.6146532297134399, + "learning_rate": 1.841408873724412e-05, + "loss": 1.5926, + "mean_token_accuracy": 0.6407150129477183, + "num_tokens": 660243568.0, + "step": 3936 + }, + { + "entropy": 1.7125855485598247, + "epoch": 0.43250666007525196, + "grad_norm": 0.718245267868042, + "learning_rate": 1.841318094236776e-05, + "loss": 1.4883, + "mean_token_accuracy": 0.6494996398687363, + "num_tokens": 660530282.0, + "step": 3937 + }, + { + "entropy": 1.749464641014735, + "epoch": 0.4326165169866249, + "grad_norm": 0.7096315622329712, + "learning_rate": 1.8412272912867563e-05, + "loss": 1.3765, + "mean_token_accuracy": 0.6703631083170573, + "num_tokens": 660667046.0, + "step": 3938 + }, + { + "entropy": 1.7773485879103343, + "epoch": 0.43272637389799784, + "grad_norm": 0.9372892379760742, + "learning_rate": 1.8411364648772268e-05, + "loss": 1.6595, + "mean_token_accuracy": 0.6321366230646769, + "num_tokens": 660833411.0, + "step": 3939 + }, + { + "entropy": 1.7520179847876232, + "epoch": 0.4328362308093708, + "grad_norm": 0.6285591721534729, + "learning_rate": 1.841045615011062e-05, + "loss": 1.3919, + "mean_token_accuracy": 0.6557580778996149, + "num_tokens": 661011775.0, + "step": 3940 + }, + { + "entropy": 1.736931214729945, + "epoch": 0.4329460877207437, + "grad_norm": 0.7110692262649536, + "learning_rate": 1.8409547416911378e-05, + "loss": 1.4228, + "mean_token_accuracy": 0.6697671810785929, + "num_tokens": 661139457.0, + "step": 3941 + }, + { + "entropy": 1.7010645965735118, + "epoch": 0.43305594463211666, + "grad_norm": 0.6413223743438721, + "learning_rate": 1.8408638449203296e-05, + "loss": 1.3613, + "mean_token_accuracy": 0.6522543032964071, + "num_tokens": 661303901.0, + "step": 3942 + }, + { + "entropy": 1.7435734967390697, + "epoch": 0.4331658015434896, + "grad_norm": 0.7069315910339355, + "learning_rate": 1.8407729247015146e-05, + "loss": 1.6427, + "mean_token_accuracy": 0.619743749499321, + "num_tokens": 661521048.0, + "step": 3943 + }, + { + "entropy": 1.6970693071683247, + "epoch": 0.43327565845486254, + "grad_norm": 0.7432729601860046, + "learning_rate": 1.8406819810375706e-05, + "loss": 1.4804, + "mean_token_accuracy": 0.6531406243642172, + "num_tokens": 661687623.0, + "step": 3944 + }, + { + "entropy": 1.770354559024175, + "epoch": 0.4333855153662355, + "grad_norm": 0.6008591651916504, + "learning_rate": 1.840591013931375e-05, + "loss": 1.3533, + "mean_token_accuracy": 0.655795618891716, + "num_tokens": 661876251.0, + "step": 3945 + }, + { + "entropy": 1.770843635002772, + "epoch": 0.4334953722776084, + "grad_norm": 0.7439899444580078, + "learning_rate": 1.8405000233858083e-05, + "loss": 1.4859, + "mean_token_accuracy": 0.6457716375589371, + "num_tokens": 662002220.0, + "step": 3946 + }, + { + "entropy": 1.7036712368329365, + "epoch": 0.43360522918898137, + "grad_norm": 0.6265702247619629, + "learning_rate": 1.8404090094037488e-05, + "loss": 1.536, + "mean_token_accuracy": 0.6389161348342896, + "num_tokens": 662199953.0, + "step": 3947 + }, + { + "entropy": 1.6811943848927815, + "epoch": 0.4337150861003543, + "grad_norm": 0.6257259249687195, + "learning_rate": 1.8403179719880782e-05, + "loss": 1.3708, + "mean_token_accuracy": 0.6628421694040298, + "num_tokens": 662393764.0, + "step": 3948 + }, + { + "entropy": 1.6686974863211315, + "epoch": 0.43382494301172725, + "grad_norm": 0.6457135081291199, + "learning_rate": 1.8402269111416776e-05, + "loss": 1.5583, + "mean_token_accuracy": 0.6230059290925661, + "num_tokens": 662600456.0, + "step": 3949 + }, + { + "entropy": 1.7240578730901082, + "epoch": 0.4339347999231002, + "grad_norm": 0.6716305017471313, + "learning_rate": 1.8401358268674282e-05, + "loss": 1.5154, + "mean_token_accuracy": 0.6316413730382919, + "num_tokens": 662798175.0, + "step": 3950 + }, + { + "entropy": 1.711452802022298, + "epoch": 0.4340446568344731, + "grad_norm": 0.7926602363586426, + "learning_rate": 1.840044719168214e-05, + "loss": 1.4417, + "mean_token_accuracy": 0.6603543410698572, + "num_tokens": 662943808.0, + "step": 3951 + }, + { + "entropy": 1.692206621170044, + "epoch": 0.434154513745846, + "grad_norm": 0.7351948022842407, + "learning_rate": 1.8399535880469174e-05, + "loss": 1.3929, + "mean_token_accuracy": 0.6664231171210607, + "num_tokens": 663096189.0, + "step": 3952 + }, + { + "entropy": 1.6513939301172893, + "epoch": 0.43426437065721896, + "grad_norm": 0.9081413149833679, + "learning_rate": 1.8398624335064234e-05, + "loss": 1.5123, + "mean_token_accuracy": 0.6428524355093638, + "num_tokens": 663262484.0, + "step": 3953 + }, + { + "entropy": 1.6838249266147614, + "epoch": 0.4343742275685919, + "grad_norm": 0.6735399961471558, + "learning_rate": 1.839771255549617e-05, + "loss": 1.3138, + "mean_token_accuracy": 0.6686491419871649, + "num_tokens": 663412036.0, + "step": 3954 + }, + { + "entropy": 1.705833335717519, + "epoch": 0.43448408447996484, + "grad_norm": 0.6810485124588013, + "learning_rate": 1.8396800541793837e-05, + "loss": 1.3139, + "mean_token_accuracy": 0.6651433457930883, + "num_tokens": 663526186.0, + "step": 3955 + }, + { + "entropy": 1.729738712310791, + "epoch": 0.4345939413913378, + "grad_norm": 0.7022128701210022, + "learning_rate": 1.8395888293986096e-05, + "loss": 1.3078, + "mean_token_accuracy": 0.6703850577274958, + "num_tokens": 663670981.0, + "step": 3956 + }, + { + "entropy": 1.701034019390742, + "epoch": 0.4347037983027107, + "grad_norm": 0.6978190541267395, + "learning_rate": 1.8394975812101824e-05, + "loss": 1.4422, + "mean_token_accuracy": 0.6494102279345194, + "num_tokens": 663868779.0, + "step": 3957 + }, + { + "entropy": 1.7295649846394856, + "epoch": 0.43481365521408366, + "grad_norm": 0.6927241683006287, + "learning_rate": 1.8394063096169904e-05, + "loss": 1.3859, + "mean_token_accuracy": 0.6548526287078857, + "num_tokens": 664041829.0, + "step": 3958 + }, + { + "entropy": 1.7171918253103893, + "epoch": 0.4349235121254566, + "grad_norm": 0.745561420917511, + "learning_rate": 1.8393150146219214e-05, + "loss": 1.4569, + "mean_token_accuracy": 0.6500047942002615, + "num_tokens": 664221934.0, + "step": 3959 + }, + { + "entropy": 1.6659683287143707, + "epoch": 0.43503336903682954, + "grad_norm": 0.6515949368476868, + "learning_rate": 1.8392236962278656e-05, + "loss": 1.3887, + "mean_token_accuracy": 0.6540011912584305, + "num_tokens": 664382086.0, + "step": 3960 + }, + { + "entropy": 1.6935789982477825, + "epoch": 0.4351432259482025, + "grad_norm": 0.6703075766563416, + "learning_rate": 1.839132354437713e-05, + "loss": 1.4385, + "mean_token_accuracy": 0.6582238326470057, + "num_tokens": 664527955.0, + "step": 3961 + }, + { + "entropy": 1.7064592937628429, + "epoch": 0.4352530828595754, + "grad_norm": 0.6275252103805542, + "learning_rate": 1.839040989254354e-05, + "loss": 1.346, + "mean_token_accuracy": 0.6551967660586039, + "num_tokens": 664697710.0, + "step": 3962 + }, + { + "entropy": 1.7683529754479725, + "epoch": 0.43536293977094837, + "grad_norm": 0.7211574912071228, + "learning_rate": 1.838949600680681e-05, + "loss": 1.374, + "mean_token_accuracy": 0.6625560075044632, + "num_tokens": 664850356.0, + "step": 3963 + }, + { + "entropy": 1.7406555513540904, + "epoch": 0.43547279668232125, + "grad_norm": 0.6910867094993591, + "learning_rate": 1.838858188719586e-05, + "loss": 1.2189, + "mean_token_accuracy": 0.6768646488587061, + "num_tokens": 664945901.0, + "step": 3964 + }, + { + "entropy": 1.746801644563675, + "epoch": 0.4355826535936942, + "grad_norm": 0.7130513787269592, + "learning_rate": 1.8387667533739627e-05, + "loss": 1.3185, + "mean_token_accuracy": 0.6610147307316462, + "num_tokens": 665097327.0, + "step": 3965 + }, + { + "entropy": 1.7367882827917736, + "epoch": 0.43569251050506713, + "grad_norm": 0.7409621477127075, + "learning_rate": 1.8386752946467043e-05, + "loss": 1.3773, + "mean_token_accuracy": 0.6539178440968195, + "num_tokens": 665328976.0, + "step": 3966 + }, + { + "entropy": 1.6604821781317394, + "epoch": 0.4358023674164401, + "grad_norm": 0.6217142343521118, + "learning_rate": 1.8385838125407053e-05, + "loss": 1.3066, + "mean_token_accuracy": 0.6670824587345123, + "num_tokens": 665484819.0, + "step": 3967 + }, + { + "entropy": 1.637445976336797, + "epoch": 0.435912224327813, + "grad_norm": 0.6176252365112305, + "learning_rate": 1.838492307058862e-05, + "loss": 1.409, + "mean_token_accuracy": 0.6582680841286978, + "num_tokens": 665638564.0, + "step": 3968 + }, + { + "entropy": 1.7330170969168346, + "epoch": 0.43602208123918595, + "grad_norm": 0.8453408479690552, + "learning_rate": 1.8384007782040693e-05, + "loss": 1.3667, + "mean_token_accuracy": 0.6572331438461939, + "num_tokens": 665789481.0, + "step": 3969 + }, + { + "entropy": 1.7294095953305562, + "epoch": 0.4361319381505589, + "grad_norm": 0.7358590960502625, + "learning_rate": 1.8383092259792254e-05, + "loss": 1.3653, + "mean_token_accuracy": 0.6577903230985006, + "num_tokens": 665932917.0, + "step": 3970 + }, + { + "entropy": 1.735178271929423, + "epoch": 0.43624179506193184, + "grad_norm": 0.6520756483078003, + "learning_rate": 1.8382176503872266e-05, + "loss": 1.3795, + "mean_token_accuracy": 0.6685875505208969, + "num_tokens": 666070750.0, + "step": 3971 + }, + { + "entropy": 1.7012029190858204, + "epoch": 0.4363516519733048, + "grad_norm": 0.640774130821228, + "learning_rate": 1.8381260514309722e-05, + "loss": 1.6331, + "mean_token_accuracy": 0.6277043521404266, + "num_tokens": 666299387.0, + "step": 3972 + }, + { + "entropy": 1.7609250446160634, + "epoch": 0.4364615088846777, + "grad_norm": 0.6770412921905518, + "learning_rate": 1.838034429113361e-05, + "loss": 1.3659, + "mean_token_accuracy": 0.6573913991451263, + "num_tokens": 666416424.0, + "step": 3973 + }, + { + "entropy": 1.7060511807600658, + "epoch": 0.43657136579605066, + "grad_norm": 0.7520270943641663, + "learning_rate": 1.837942783437292e-05, + "loss": 1.527, + "mean_token_accuracy": 0.670257126291593, + "num_tokens": 666584190.0, + "step": 3974 + }, + { + "entropy": 1.722565899292628, + "epoch": 0.4366812227074236, + "grad_norm": 0.6892067193984985, + "learning_rate": 1.8378511144056673e-05, + "loss": 1.4379, + "mean_token_accuracy": 0.6610093315442404, + "num_tokens": 666777502.0, + "step": 3975 + }, + { + "entropy": 1.7519932488600414, + "epoch": 0.43679107961879654, + "grad_norm": 0.701661229133606, + "learning_rate": 1.8377594220213867e-05, + "loss": 1.5047, + "mean_token_accuracy": 0.6375877310832342, + "num_tokens": 666962588.0, + "step": 3976 + }, + { + "entropy": 1.7207889755566914, + "epoch": 0.4369009365301695, + "grad_norm": 0.705245316028595, + "learning_rate": 1.837667706287353e-05, + "loss": 1.4913, + "mean_token_accuracy": 0.6380893290042877, + "num_tokens": 667138508.0, + "step": 3977 + }, + { + "entropy": 1.7350668410460155, + "epoch": 0.43701079344154237, + "grad_norm": 0.6259267330169678, + "learning_rate": 1.837575967206469e-05, + "loss": 1.3351, + "mean_token_accuracy": 0.6648585498332977, + "num_tokens": 667306171.0, + "step": 3978 + }, + { + "entropy": 1.6554445624351501, + "epoch": 0.4371206503529153, + "grad_norm": 0.7273040413856506, + "learning_rate": 1.837484204781638e-05, + "loss": 1.5635, + "mean_token_accuracy": 0.639080340663592, + "num_tokens": 667481940.0, + "step": 3979 + }, + { + "entropy": 1.6817783216635387, + "epoch": 0.43723050726428825, + "grad_norm": 0.6706110835075378, + "learning_rate": 1.837392419015764e-05, + "loss": 1.2797, + "mean_token_accuracy": 0.6793971409400305, + "num_tokens": 667638137.0, + "step": 3980 + }, + { + "entropy": 1.6714449326197307, + "epoch": 0.4373403641756612, + "grad_norm": 0.6944328546524048, + "learning_rate": 1.837300609911752e-05, + "loss": 1.4153, + "mean_token_accuracy": 0.6490057408809662, + "num_tokens": 667801298.0, + "step": 3981 + }, + { + "entropy": 1.7301402886708577, + "epoch": 0.43745022108703413, + "grad_norm": 0.6663272380828857, + "learning_rate": 1.8372087774725086e-05, + "loss": 1.3182, + "mean_token_accuracy": 0.6672502309083939, + "num_tokens": 667935582.0, + "step": 3982 + }, + { + "entropy": 1.6425547401110332, + "epoch": 0.43756007799840707, + "grad_norm": 0.6449568271636963, + "learning_rate": 1.837116921700939e-05, + "loss": 1.3805, + "mean_token_accuracy": 0.6671392023563385, + "num_tokens": 668108676.0, + "step": 3983 + }, + { + "entropy": 1.7166595160961151, + "epoch": 0.43766993490978, + "grad_norm": 0.7882794737815857, + "learning_rate": 1.8370250425999513e-05, + "loss": 1.5201, + "mean_token_accuracy": 0.6302592655022939, + "num_tokens": 668335947.0, + "step": 3984 + }, + { + "entropy": 1.6976383328437805, + "epoch": 0.43777979182115295, + "grad_norm": 0.7272980809211731, + "learning_rate": 1.836933140172453e-05, + "loss": 1.3725, + "mean_token_accuracy": 0.6644467264413834, + "num_tokens": 668487476.0, + "step": 3985 + }, + { + "entropy": 1.708019107580185, + "epoch": 0.4378896487325259, + "grad_norm": 0.6897531151771545, + "learning_rate": 1.8368412144213527e-05, + "loss": 1.3867, + "mean_token_accuracy": 0.6588212251663208, + "num_tokens": 668685652.0, + "step": 3986 + }, + { + "entropy": 1.7161248624324799, + "epoch": 0.43799950564389883, + "grad_norm": 0.6213241815567017, + "learning_rate": 1.8367492653495603e-05, + "loss": 1.3569, + "mean_token_accuracy": 0.6537472208340963, + "num_tokens": 668831434.0, + "step": 3987 + }, + { + "entropy": 1.6518361568450928, + "epoch": 0.4381093625552718, + "grad_norm": 0.6153174042701721, + "learning_rate": 1.8366572929599853e-05, + "loss": 1.418, + "mean_token_accuracy": 0.6647394746541977, + "num_tokens": 669014444.0, + "step": 3988 + }, + { + "entropy": 1.764929711818695, + "epoch": 0.4382192194666447, + "grad_norm": 0.789162278175354, + "learning_rate": 1.8365652972555395e-05, + "loss": 1.214, + "mean_token_accuracy": 0.6773978173732758, + "num_tokens": 669117485.0, + "step": 3989 + }, + { + "entropy": 1.732424944639206, + "epoch": 0.43832907637801766, + "grad_norm": 0.7520033121109009, + "learning_rate": 1.836473278239133e-05, + "loss": 1.3209, + "mean_token_accuracy": 0.6744516342878342, + "num_tokens": 669273942.0, + "step": 3990 + }, + { + "entropy": 1.7452230354150136, + "epoch": 0.43843893328939054, + "grad_norm": 1.1524012088775635, + "learning_rate": 1.83638123591368e-05, + "loss": 1.1884, + "mean_token_accuracy": 0.662539561589559, + "num_tokens": 669456919.0, + "step": 3991 + }, + { + "entropy": 1.7065231601397197, + "epoch": 0.4385487902007635, + "grad_norm": 0.8054308891296387, + "learning_rate": 1.8362891702820928e-05, + "loss": 1.4485, + "mean_token_accuracy": 0.6653054704268774, + "num_tokens": 669595872.0, + "step": 3992 + }, + { + "entropy": 1.6997679869333904, + "epoch": 0.4386586471121364, + "grad_norm": 0.6387143731117249, + "learning_rate": 1.8361970813472847e-05, + "loss": 1.4909, + "mean_token_accuracy": 0.6452397058407465, + "num_tokens": 669834698.0, + "step": 3993 + }, + { + "entropy": 1.6552885274092357, + "epoch": 0.43876850402350936, + "grad_norm": 0.659257173538208, + "learning_rate": 1.8361049691121703e-05, + "loss": 1.3508, + "mean_token_accuracy": 0.6857950339714686, + "num_tokens": 670012914.0, + "step": 3994 + }, + { + "entropy": 1.7258447209994, + "epoch": 0.4388783609348823, + "grad_norm": 0.6504175066947937, + "learning_rate": 1.836012833579666e-05, + "loss": 1.3421, + "mean_token_accuracy": 0.6709899504979452, + "num_tokens": 670167144.0, + "step": 3995 + }, + { + "entropy": 1.6821411649386089, + "epoch": 0.43898821784625525, + "grad_norm": 0.7438737750053406, + "learning_rate": 1.835920674752687e-05, + "loss": 1.4678, + "mean_token_accuracy": 0.6426598926385244, + "num_tokens": 670364528.0, + "step": 3996 + }, + { + "entropy": 1.6863566239674885, + "epoch": 0.4390980747576282, + "grad_norm": 0.6272345185279846, + "learning_rate": 1.8358284926341502e-05, + "loss": 1.3699, + "mean_token_accuracy": 0.6651715586582819, + "num_tokens": 670559676.0, + "step": 3997 + }, + { + "entropy": 1.7127248346805573, + "epoch": 0.43920793166900113, + "grad_norm": 0.7947668433189392, + "learning_rate": 1.835736287226973e-05, + "loss": 1.5277, + "mean_token_accuracy": 0.6522035598754883, + "num_tokens": 670751739.0, + "step": 3998 + }, + { + "entropy": 1.6697716514269512, + "epoch": 0.43931778858037407, + "grad_norm": 0.7031921744346619, + "learning_rate": 1.835644058534074e-05, + "loss": 1.4262, + "mean_token_accuracy": 0.668352390329043, + "num_tokens": 670926584.0, + "step": 3999 + }, + { + "entropy": 1.7474531829357147, + "epoch": 0.439427645491747, + "grad_norm": 0.8411499261856079, + "learning_rate": 1.8355518065583725e-05, + "loss": 1.3682, + "mean_token_accuracy": 0.6587245215972265, + "num_tokens": 671068877.0, + "step": 4000 + }, + { + "entropy": 1.7535376648108165, + "epoch": 0.43953750240311995, + "grad_norm": 0.6565456986427307, + "learning_rate": 1.835459531302787e-05, + "loss": 1.4578, + "mean_token_accuracy": 0.636372705300649, + "num_tokens": 671261086.0, + "step": 4001 + }, + { + "entropy": 1.7117730776468914, + "epoch": 0.4396473593144929, + "grad_norm": 0.7486881613731384, + "learning_rate": 1.835367232770239e-05, + "loss": 1.4796, + "mean_token_accuracy": 0.6628977358341217, + "num_tokens": 671432376.0, + "step": 4002 + }, + { + "entropy": 1.6681561470031738, + "epoch": 0.43975721622586583, + "grad_norm": 0.7439094185829163, + "learning_rate": 1.8352749109636498e-05, + "loss": 1.5121, + "mean_token_accuracy": 0.6407729138930639, + "num_tokens": 671633326.0, + "step": 4003 + }, + { + "entropy": 1.726431320110957, + "epoch": 0.4398670731372388, + "grad_norm": 0.6899568438529968, + "learning_rate": 1.8351825658859405e-05, + "loss": 1.3619, + "mean_token_accuracy": 0.6614230573177338, + "num_tokens": 671759712.0, + "step": 4004 + }, + { + "entropy": 1.7370514472325642, + "epoch": 0.43997693004861166, + "grad_norm": 0.6520119905471802, + "learning_rate": 1.8350901975400347e-05, + "loss": 1.3351, + "mean_token_accuracy": 0.6576328774293264, + "num_tokens": 671896888.0, + "step": 4005 + }, + { + "entropy": 1.6817982296148937, + "epoch": 0.4400867869599846, + "grad_norm": 0.6815257668495178, + "learning_rate": 1.834997805928855e-05, + "loss": 1.3744, + "mean_token_accuracy": 0.6502506881952286, + "num_tokens": 672042565.0, + "step": 4006 + }, + { + "entropy": 1.7248376111189525, + "epoch": 0.44019664387135754, + "grad_norm": 0.5518606901168823, + "learning_rate": 1.8349053910553264e-05, + "loss": 1.4485, + "mean_token_accuracy": 0.6401482870181402, + "num_tokens": 672283150.0, + "step": 4007 + }, + { + "entropy": 1.7700274089972179, + "epoch": 0.4403065007827305, + "grad_norm": 0.85428386926651, + "learning_rate": 1.834812952922373e-05, + "loss": 1.4366, + "mean_token_accuracy": 0.64705158273379, + "num_tokens": 672418954.0, + "step": 4008 + }, + { + "entropy": 1.574070413907369, + "epoch": 0.4404163576941034, + "grad_norm": 0.7439473867416382, + "learning_rate": 1.8347204915329207e-05, + "loss": 1.2231, + "mean_token_accuracy": 0.6847544858853022, + "num_tokens": 672559063.0, + "step": 4009 + }, + { + "entropy": 1.7344237864017487, + "epoch": 0.44052621460547636, + "grad_norm": 0.6516211628913879, + "learning_rate": 1.834628006889896e-05, + "loss": 1.3989, + "mean_token_accuracy": 0.6455797751744589, + "num_tokens": 672704524.0, + "step": 4010 + }, + { + "entropy": 1.7220177451769512, + "epoch": 0.4406360715168493, + "grad_norm": 0.6377677321434021, + "learning_rate": 1.8345354989962262e-05, + "loss": 1.4016, + "mean_token_accuracy": 0.6649409184853236, + "num_tokens": 672869556.0, + "step": 4011 + }, + { + "entropy": 1.7253966728846233, + "epoch": 0.44074592842822224, + "grad_norm": 0.7619969844818115, + "learning_rate": 1.834442967854838e-05, + "loss": 1.3548, + "mean_token_accuracy": 0.6682330717643102, + "num_tokens": 673023121.0, + "step": 4012 + }, + { + "entropy": 1.6592314541339874, + "epoch": 0.4408557853395952, + "grad_norm": 0.8172975778579712, + "learning_rate": 1.834350413468662e-05, + "loss": 1.3785, + "mean_token_accuracy": 0.6685704290866852, + "num_tokens": 673202298.0, + "step": 4013 + }, + { + "entropy": 1.6940802733103435, + "epoch": 0.4409656422509681, + "grad_norm": 0.8062915802001953, + "learning_rate": 1.8342578358406253e-05, + "loss": 1.446, + "mean_token_accuracy": 0.6594598790009817, + "num_tokens": 673377211.0, + "step": 4014 + }, + { + "entropy": 1.7303107678890228, + "epoch": 0.44107549916234107, + "grad_norm": 0.7482844591140747, + "learning_rate": 1.8341652349736593e-05, + "loss": 1.4081, + "mean_token_accuracy": 0.6585040787855784, + "num_tokens": 673529505.0, + "step": 4015 + }, + { + "entropy": 1.7489991386731465, + "epoch": 0.441185356073714, + "grad_norm": 0.7208905220031738, + "learning_rate": 1.8340726108706948e-05, + "loss": 1.4499, + "mean_token_accuracy": 0.6458181291818619, + "num_tokens": 673705981.0, + "step": 4016 + }, + { + "entropy": 1.708936224381129, + "epoch": 0.44129521298508695, + "grad_norm": 0.6100684404373169, + "learning_rate": 1.8339799635346624e-05, + "loss": 1.4808, + "mean_token_accuracy": 0.6417776246865591, + "num_tokens": 673975367.0, + "step": 4017 + }, + { + "entropy": 1.6850675543149312, + "epoch": 0.44140506989645983, + "grad_norm": 0.5926774740219116, + "learning_rate": 1.8338872929684953e-05, + "loss": 1.4223, + "mean_token_accuracy": 0.651220291852951, + "num_tokens": 674168873.0, + "step": 4018 + }, + { + "entropy": 1.709786633650462, + "epoch": 0.4415149268078328, + "grad_norm": 0.8183510899543762, + "learning_rate": 1.833794599175126e-05, + "loss": 1.3911, + "mean_token_accuracy": 0.6542472541332245, + "num_tokens": 674343048.0, + "step": 4019 + }, + { + "entropy": 1.7094794114430745, + "epoch": 0.4416247837192057, + "grad_norm": 0.7141227126121521, + "learning_rate": 1.833701882157488e-05, + "loss": 1.43, + "mean_token_accuracy": 0.6631141652663549, + "num_tokens": 674505930.0, + "step": 4020 + }, + { + "entropy": 1.7598174810409546, + "epoch": 0.44173464063057866, + "grad_norm": 0.6930931210517883, + "learning_rate": 1.833609141918516e-05, + "loss": 1.4248, + "mean_token_accuracy": 0.6471636444330215, + "num_tokens": 674681600.0, + "step": 4021 + }, + { + "entropy": 1.7052448689937592, + "epoch": 0.4418444975419516, + "grad_norm": 0.748052716255188, + "learning_rate": 1.833516378461146e-05, + "loss": 1.4185, + "mean_token_accuracy": 0.6527615735928217, + "num_tokens": 674852988.0, + "step": 4022 + }, + { + "entropy": 1.735913723707199, + "epoch": 0.44195435445332454, + "grad_norm": 0.8284699320793152, + "learning_rate": 1.8334235917883124e-05, + "loss": 1.5755, + "mean_token_accuracy": 0.6278869633873304, + "num_tokens": 675030743.0, + "step": 4023 + }, + { + "entropy": 1.718154142300288, + "epoch": 0.4420642113646975, + "grad_norm": 0.8008006811141968, + "learning_rate": 1.833330781902953e-05, + "loss": 1.258, + "mean_token_accuracy": 0.6701284448305765, + "num_tokens": 675149913.0, + "step": 4024 + }, + { + "entropy": 1.6997297902901967, + "epoch": 0.4421740682760704, + "grad_norm": 0.9283497929573059, + "learning_rate": 1.8332379488080046e-05, + "loss": 1.1681, + "mean_token_accuracy": 0.6950256576140722, + "num_tokens": 675290297.0, + "step": 4025 + }, + { + "entropy": 1.7370579838752747, + "epoch": 0.44228392518744336, + "grad_norm": 0.7015495896339417, + "learning_rate": 1.8331450925064057e-05, + "loss": 1.3353, + "mean_token_accuracy": 0.6572895298401514, + "num_tokens": 675441895.0, + "step": 4026 + }, + { + "entropy": 1.638562301794688, + "epoch": 0.4423937820988163, + "grad_norm": 0.6312068700790405, + "learning_rate": 1.833052213001095e-05, + "loss": 1.271, + "mean_token_accuracy": 0.6829714129368464, + "num_tokens": 675571939.0, + "step": 4027 + }, + { + "entropy": 1.763380487759908, + "epoch": 0.44250363901018924, + "grad_norm": 0.7643721699714661, + "learning_rate": 1.8329593102950115e-05, + "loss": 1.4964, + "mean_token_accuracy": 0.6557734707991282, + "num_tokens": 675760546.0, + "step": 4028 + }, + { + "entropy": 1.675294816493988, + "epoch": 0.4426134959215622, + "grad_norm": 0.7117913365364075, + "learning_rate": 1.832866384391097e-05, + "loss": 1.3794, + "mean_token_accuracy": 0.6646661460399628, + "num_tokens": 675909939.0, + "step": 4029 + }, + { + "entropy": 1.7222477793693542, + "epoch": 0.4427233528329351, + "grad_norm": 0.609602153301239, + "learning_rate": 1.8327734352922912e-05, + "loss": 1.385, + "mean_token_accuracy": 0.6593077381451925, + "num_tokens": 676074765.0, + "step": 4030 + }, + { + "entropy": 1.7006352543830872, + "epoch": 0.44283320974430807, + "grad_norm": 0.6742071509361267, + "learning_rate": 1.8326804630015364e-05, + "loss": 1.4513, + "mean_token_accuracy": 0.6537482092777888, + "num_tokens": 676233903.0, + "step": 4031 + }, + { + "entropy": 1.672971785068512, + "epoch": 0.44294306665568095, + "grad_norm": 0.7731028199195862, + "learning_rate": 1.8325874675217747e-05, + "loss": 1.293, + "mean_token_accuracy": 0.6704902996619543, + "num_tokens": 676397595.0, + "step": 4032 + }, + { + "entropy": 1.7384453018506367, + "epoch": 0.4430529235670539, + "grad_norm": 0.9834579825401306, + "learning_rate": 1.8324944488559505e-05, + "loss": 1.5189, + "mean_token_accuracy": 0.6412886679172516, + "num_tokens": 676566969.0, + "step": 4033 + }, + { + "entropy": 1.7224073906739552, + "epoch": 0.44316278047842683, + "grad_norm": 0.6622791290283203, + "learning_rate": 1.8324014070070063e-05, + "loss": 1.563, + "mean_token_accuracy": 0.6388835261265436, + "num_tokens": 676803944.0, + "step": 4034 + }, + { + "entropy": 1.7606834868590038, + "epoch": 0.4432726373897998, + "grad_norm": 0.6880962252616882, + "learning_rate": 1.832308341977888e-05, + "loss": 1.3521, + "mean_token_accuracy": 0.6607374300559362, + "num_tokens": 676954542.0, + "step": 4035 + }, + { + "entropy": 1.6744478940963745, + "epoch": 0.4433824943011727, + "grad_norm": 0.7972778081893921, + "learning_rate": 1.8322152537715408e-05, + "loss": 1.4395, + "mean_token_accuracy": 0.6546510507663091, + "num_tokens": 677134397.0, + "step": 4036 + }, + { + "entropy": 1.6546966234842937, + "epoch": 0.44349235121254565, + "grad_norm": 0.7038325667381287, + "learning_rate": 1.8321221423909105e-05, + "loss": 1.2629, + "mean_token_accuracy": 0.6734778136014938, + "num_tokens": 677303311.0, + "step": 4037 + }, + { + "entropy": 1.744826744000117, + "epoch": 0.4436022081239186, + "grad_norm": 0.739396333694458, + "learning_rate": 1.8320290078389448e-05, + "loss": 1.5118, + "mean_token_accuracy": 0.65053657690684, + "num_tokens": 677507834.0, + "step": 4038 + }, + { + "entropy": 1.6409766773382823, + "epoch": 0.44371206503529154, + "grad_norm": 0.7920038104057312, + "learning_rate": 1.8319358501185903e-05, + "loss": 1.5389, + "mean_token_accuracy": 0.6472673763831457, + "num_tokens": 677656509.0, + "step": 4039 + }, + { + "entropy": 1.7347593108812969, + "epoch": 0.4438219219466645, + "grad_norm": 0.7497395873069763, + "learning_rate": 1.8318426692327958e-05, + "loss": 1.5772, + "mean_token_accuracy": 0.6470949848492941, + "num_tokens": 677792553.0, + "step": 4040 + }, + { + "entropy": 1.7003744939963024, + "epoch": 0.4439317788580374, + "grad_norm": 0.6454471945762634, + "learning_rate": 1.8317494651845113e-05, + "loss": 1.3954, + "mean_token_accuracy": 0.6603303998708725, + "num_tokens": 677975581.0, + "step": 4041 + }, + { + "entropy": 1.724554717540741, + "epoch": 0.44404163576941036, + "grad_norm": 0.7715175747871399, + "learning_rate": 1.8316562379766855e-05, + "loss": 1.6706, + "mean_token_accuracy": 0.6182306359211603, + "num_tokens": 678156142.0, + "step": 4042 + }, + { + "entropy": 1.7756297886371613, + "epoch": 0.4441514926807833, + "grad_norm": 0.7351700663566589, + "learning_rate": 1.83156298761227e-05, + "loss": 1.5132, + "mean_token_accuracy": 0.6380604902903239, + "num_tokens": 678314615.0, + "step": 4043 + }, + { + "entropy": 1.6692375938097637, + "epoch": 0.44426134959215624, + "grad_norm": 0.7419458627700806, + "learning_rate": 1.831469714094215e-05, + "loss": 1.3315, + "mean_token_accuracy": 0.6649828652540842, + "num_tokens": 678494083.0, + "step": 4044 + }, + { + "entropy": 1.7391583820184071, + "epoch": 0.4443712065035291, + "grad_norm": 0.6613411903381348, + "learning_rate": 1.831376417425473e-05, + "loss": 1.4028, + "mean_token_accuracy": 0.6531075437863668, + "num_tokens": 678688690.0, + "step": 4045 + }, + { + "entropy": 1.7318035662174225, + "epoch": 0.44448106341490207, + "grad_norm": 0.6976780295372009, + "learning_rate": 1.831283097608997e-05, + "loss": 1.429, + "mean_token_accuracy": 0.6665694663921992, + "num_tokens": 678819678.0, + "step": 4046 + }, + { + "entropy": 1.6295676430066426, + "epoch": 0.444590920326275, + "grad_norm": 0.6185345649719238, + "learning_rate": 1.8311897546477412e-05, + "loss": 1.3514, + "mean_token_accuracy": 0.6613521029551824, + "num_tokens": 679001480.0, + "step": 4047 + }, + { + "entropy": 1.7244457403818767, + "epoch": 0.44470077723764795, + "grad_norm": 0.7285200953483582, + "learning_rate": 1.831096388544659e-05, + "loss": 1.5472, + "mean_token_accuracy": 0.6468717704216639, + "num_tokens": 679229552.0, + "step": 4048 + }, + { + "entropy": 1.6660625040531158, + "epoch": 0.4448106341490209, + "grad_norm": 0.7275331020355225, + "learning_rate": 1.831002999302705e-05, + "loss": 1.3838, + "mean_token_accuracy": 0.6649215320746104, + "num_tokens": 679408172.0, + "step": 4049 + }, + { + "entropy": 1.6905015210310619, + "epoch": 0.44492049106039383, + "grad_norm": 0.7201270461082458, + "learning_rate": 1.8309095869248355e-05, + "loss": 1.3025, + "mean_token_accuracy": 0.671828548113505, + "num_tokens": 679548700.0, + "step": 4050 + }, + { + "entropy": 1.6812767088413239, + "epoch": 0.44503034797176677, + "grad_norm": 0.666533887386322, + "learning_rate": 1.8308161514140073e-05, + "loss": 1.2311, + "mean_token_accuracy": 0.6841448297103246, + "num_tokens": 679682542.0, + "step": 4051 + }, + { + "entropy": 1.6990590194861095, + "epoch": 0.4451402048831397, + "grad_norm": 0.7694803476333618, + "learning_rate": 1.8307226927731773e-05, + "loss": 1.508, + "mean_token_accuracy": 0.653964231411616, + "num_tokens": 679890013.0, + "step": 4052 + }, + { + "entropy": 1.6936483283837636, + "epoch": 0.44525006179451265, + "grad_norm": 0.7189439535140991, + "learning_rate": 1.830629211005303e-05, + "loss": 1.3196, + "mean_token_accuracy": 0.6643483489751816, + "num_tokens": 680032399.0, + "step": 4053 + }, + { + "entropy": 1.7283145984013875, + "epoch": 0.4453599187058856, + "grad_norm": 0.6931325793266296, + "learning_rate": 1.8305357061133432e-05, + "loss": 1.2627, + "mean_token_accuracy": 0.6824038575092951, + "num_tokens": 680186468.0, + "step": 4054 + }, + { + "entropy": 1.7289324204126995, + "epoch": 0.44546977561725853, + "grad_norm": 0.7332465052604675, + "learning_rate": 1.830442178100258e-05, + "loss": 1.3448, + "mean_token_accuracy": 0.6571111728747686, + "num_tokens": 680332193.0, + "step": 4055 + }, + { + "entropy": 1.7295528848965962, + "epoch": 0.4455796325286315, + "grad_norm": 0.6440022587776184, + "learning_rate": 1.830348626969007e-05, + "loss": 1.3409, + "mean_token_accuracy": 0.6647644688685735, + "num_tokens": 680522384.0, + "step": 4056 + }, + { + "entropy": 1.6799738903840382, + "epoch": 0.4456894894400044, + "grad_norm": 0.6439666152000427, + "learning_rate": 1.8302550527225507e-05, + "loss": 1.4989, + "mean_token_accuracy": 0.6489834437767664, + "num_tokens": 680717915.0, + "step": 4057 + }, + { + "entropy": 1.7007416983445485, + "epoch": 0.44579934635137736, + "grad_norm": 0.5994968414306641, + "learning_rate": 1.830161455363851e-05, + "loss": 1.3362, + "mean_token_accuracy": 0.6613183865944544, + "num_tokens": 680932364.0, + "step": 4058 + }, + { + "entropy": 1.7307079831759136, + "epoch": 0.44590920326275024, + "grad_norm": 0.772515058517456, + "learning_rate": 1.8300678348958708e-05, + "loss": 1.5584, + "mean_token_accuracy": 0.6598212644457817, + "num_tokens": 681086770.0, + "step": 4059 + }, + { + "entropy": 1.7124249339103699, + "epoch": 0.4460190601741232, + "grad_norm": 0.6902133822441101, + "learning_rate": 1.829974191321572e-05, + "loss": 1.3396, + "mean_token_accuracy": 0.6813353697458903, + "num_tokens": 681208149.0, + "step": 4060 + }, + { + "entropy": 1.6109780669212341, + "epoch": 0.4461289170854961, + "grad_norm": 0.7003684639930725, + "learning_rate": 1.8298805246439197e-05, + "loss": 1.3003, + "mean_token_accuracy": 0.6696944236755371, + "num_tokens": 681407580.0, + "step": 4061 + }, + { + "entropy": 1.68620361884435, + "epoch": 0.44623877399686906, + "grad_norm": 0.8141494393348694, + "learning_rate": 1.829786834865877e-05, + "loss": 1.3527, + "mean_token_accuracy": 0.6631912092367808, + "num_tokens": 681612026.0, + "step": 4062 + }, + { + "entropy": 1.6939865350723267, + "epoch": 0.446348630908242, + "grad_norm": 0.75359046459198, + "learning_rate": 1.82969312199041e-05, + "loss": 1.34, + "mean_token_accuracy": 0.6642651607592901, + "num_tokens": 681769299.0, + "step": 4063 + }, + { + "entropy": 1.7100327412287395, + "epoch": 0.44645848781961495, + "grad_norm": 0.5752301812171936, + "learning_rate": 1.8295993860204845e-05, + "loss": 1.5232, + "mean_token_accuracy": 0.6351617823044459, + "num_tokens": 682005797.0, + "step": 4064 + }, + { + "entropy": 1.6280939678351085, + "epoch": 0.4465683447309879, + "grad_norm": 0.8501309156417847, + "learning_rate": 1.8295056269590675e-05, + "loss": 1.3887, + "mean_token_accuracy": 0.6680960903565089, + "num_tokens": 682239395.0, + "step": 4065 + }, + { + "entropy": 1.6829663415749867, + "epoch": 0.44667820164236083, + "grad_norm": 0.7033583521842957, + "learning_rate": 1.8294118448091255e-05, + "loss": 1.3061, + "mean_token_accuracy": 0.6647171477476755, + "num_tokens": 682383725.0, + "step": 4066 + }, + { + "entropy": 1.7133256395657857, + "epoch": 0.44678805855373377, + "grad_norm": 0.630029559135437, + "learning_rate": 1.8293180395736278e-05, + "loss": 1.5028, + "mean_token_accuracy": 0.6593478719393412, + "num_tokens": 682577842.0, + "step": 4067 + }, + { + "entropy": 1.6594026386737823, + "epoch": 0.4468979154651067, + "grad_norm": 0.7004885077476501, + "learning_rate": 1.8292242112555428e-05, + "loss": 1.4813, + "mean_token_accuracy": 0.6722660760084788, + "num_tokens": 682739696.0, + "step": 4068 + }, + { + "entropy": 1.6464967628320057, + "epoch": 0.44700777237647965, + "grad_norm": 0.6789165139198303, + "learning_rate": 1.82913035985784e-05, + "loss": 1.3377, + "mean_token_accuracy": 0.6721230993668238, + "num_tokens": 682906514.0, + "step": 4069 + }, + { + "entropy": 1.6850634415944417, + "epoch": 0.4471176292878526, + "grad_norm": 0.6883268356323242, + "learning_rate": 1.8290364853834898e-05, + "loss": 1.4961, + "mean_token_accuracy": 0.6561469584703445, + "num_tokens": 683089692.0, + "step": 4070 + }, + { + "entropy": 1.722246805826823, + "epoch": 0.44722748619922553, + "grad_norm": 0.7800368070602417, + "learning_rate": 1.8289425878354633e-05, + "loss": 1.495, + "mean_token_accuracy": 0.6425358355045319, + "num_tokens": 683271437.0, + "step": 4071 + }, + { + "entropy": 1.6442756354808807, + "epoch": 0.4473373431105985, + "grad_norm": 0.5757925510406494, + "learning_rate": 1.8288486672167327e-05, + "loss": 1.4154, + "mean_token_accuracy": 0.6556447048981985, + "num_tokens": 683521616.0, + "step": 4072 + }, + { + "entropy": 1.7113324999809265, + "epoch": 0.44744720002197136, + "grad_norm": 0.6120255589485168, + "learning_rate": 1.82875472353027e-05, + "loss": 1.3365, + "mean_token_accuracy": 0.6572584211826324, + "num_tokens": 683677858.0, + "step": 4073 + }, + { + "entropy": 1.756430298089981, + "epoch": 0.4475570569333443, + "grad_norm": 0.9076440334320068, + "learning_rate": 1.8286607567790485e-05, + "loss": 1.3104, + "mean_token_accuracy": 0.6704193005959193, + "num_tokens": 683788448.0, + "step": 4074 + }, + { + "entropy": 1.6399111052354176, + "epoch": 0.44766691384471724, + "grad_norm": 0.6672569513320923, + "learning_rate": 1.8285667669660426e-05, + "loss": 1.3935, + "mean_token_accuracy": 0.6672036250432333, + "num_tokens": 683930395.0, + "step": 4075 + }, + { + "entropy": 1.6177968084812164, + "epoch": 0.4477767707560902, + "grad_norm": 0.5370674729347229, + "learning_rate": 1.8284727540942266e-05, + "loss": 1.3163, + "mean_token_accuracy": 0.6624527275562286, + "num_tokens": 684129876.0, + "step": 4076 + }, + { + "entropy": 1.7107476492722828, + "epoch": 0.4478866276674631, + "grad_norm": 0.6975926756858826, + "learning_rate": 1.8283787181665766e-05, + "loss": 1.4658, + "mean_token_accuracy": 0.6649006853501002, + "num_tokens": 684294327.0, + "step": 4077 + }, + { + "entropy": 1.7037384510040283, + "epoch": 0.44799648457883606, + "grad_norm": 0.7005517482757568, + "learning_rate": 1.828284659186068e-05, + "loss": 1.2692, + "mean_token_accuracy": 0.6762935618559519, + "num_tokens": 684409485.0, + "step": 4078 + }, + { + "entropy": 1.7332588632901509, + "epoch": 0.448106341490209, + "grad_norm": 0.7170990109443665, + "learning_rate": 1.828190577155678e-05, + "loss": 1.3511, + "mean_token_accuracy": 0.655609572927157, + "num_tokens": 684573662.0, + "step": 4079 + }, + { + "entropy": 1.656719873348872, + "epoch": 0.44821619840158194, + "grad_norm": 0.7201644778251648, + "learning_rate": 1.8280964720783847e-05, + "loss": 1.4354, + "mean_token_accuracy": 0.6612338771422704, + "num_tokens": 684745344.0, + "step": 4080 + }, + { + "entropy": 1.6805977523326874, + "epoch": 0.4483260553129549, + "grad_norm": 0.6354820728302002, + "learning_rate": 1.8280023439571662e-05, + "loss": 1.5477, + "mean_token_accuracy": 0.6504618128140768, + "num_tokens": 684920877.0, + "step": 4081 + }, + { + "entropy": 1.6839341123898823, + "epoch": 0.4484359122243278, + "grad_norm": 0.6117289662361145, + "learning_rate": 1.8279081927950012e-05, + "loss": 1.3716, + "mean_token_accuracy": 0.6628950238227844, + "num_tokens": 685094960.0, + "step": 4082 + }, + { + "entropy": 1.6351311802864075, + "epoch": 0.44854576913570077, + "grad_norm": 0.8039343357086182, + "learning_rate": 1.8278140185948706e-05, + "loss": 1.4611, + "mean_token_accuracy": 0.6522148499886194, + "num_tokens": 685258463.0, + "step": 4083 + }, + { + "entropy": 1.7667676905790966, + "epoch": 0.4486556260470737, + "grad_norm": 0.9782058000564575, + "learning_rate": 1.8277198213597535e-05, + "loss": 1.3842, + "mean_token_accuracy": 0.655416414141655, + "num_tokens": 685386946.0, + "step": 4084 + }, + { + "entropy": 1.6758897999922435, + "epoch": 0.44876548295844665, + "grad_norm": 0.6297981142997742, + "learning_rate": 1.8276256010926325e-05, + "loss": 1.2551, + "mean_token_accuracy": 0.6806664168834686, + "num_tokens": 685520402.0, + "step": 4085 + }, + { + "entropy": 1.729985237121582, + "epoch": 0.44887533986981953, + "grad_norm": 0.7002821564674377, + "learning_rate": 1.8275313577964885e-05, + "loss": 1.2529, + "mean_token_accuracy": 0.6838051875432333, + "num_tokens": 685648637.0, + "step": 4086 + }, + { + "entropy": 1.7272369960943859, + "epoch": 0.4489851967811925, + "grad_norm": 0.5778807401657104, + "learning_rate": 1.8274370914743054e-05, + "loss": 1.3942, + "mean_token_accuracy": 0.6614800641934077, + "num_tokens": 685844253.0, + "step": 4087 + }, + { + "entropy": 1.717577338218689, + "epoch": 0.4490950536925654, + "grad_norm": 0.8240344524383545, + "learning_rate": 1.8273428021290658e-05, + "loss": 1.5095, + "mean_token_accuracy": 0.6495123704274496, + "num_tokens": 686028134.0, + "step": 4088 + }, + { + "entropy": 1.7241438726584117, + "epoch": 0.44920491060393836, + "grad_norm": 0.6883000731468201, + "learning_rate": 1.8272484897637546e-05, + "loss": 1.3816, + "mean_token_accuracy": 0.6627028236786524, + "num_tokens": 686191080.0, + "step": 4089 + }, + { + "entropy": 1.6412847638130188, + "epoch": 0.4493147675153113, + "grad_norm": 0.6836743950843811, + "learning_rate": 1.827154154381356e-05, + "loss": 1.5132, + "mean_token_accuracy": 0.653800850113233, + "num_tokens": 686350524.0, + "step": 4090 + }, + { + "entropy": 1.6573506991068523, + "epoch": 0.44942462442668424, + "grad_norm": 0.6218001246452332, + "learning_rate": 1.8270597959848563e-05, + "loss": 1.3456, + "mean_token_accuracy": 0.6675258924563726, + "num_tokens": 686518644.0, + "step": 4091 + }, + { + "entropy": 1.709700067838033, + "epoch": 0.4495344813380572, + "grad_norm": 0.6943197846412659, + "learning_rate": 1.826965414577242e-05, + "loss": 1.2814, + "mean_token_accuracy": 0.6749483694632848, + "num_tokens": 686666877.0, + "step": 4092 + }, + { + "entropy": 1.6798059542973836, + "epoch": 0.4496443382494301, + "grad_norm": 0.8047642707824707, + "learning_rate": 1.8268710101614996e-05, + "loss": 1.4749, + "mean_token_accuracy": 0.6598973522583643, + "num_tokens": 686831962.0, + "step": 4093 + }, + { + "entropy": 1.720873127381007, + "epoch": 0.44975419516080306, + "grad_norm": 0.7131839394569397, + "learning_rate": 1.8267765827406173e-05, + "loss": 1.3472, + "mean_token_accuracy": 0.6661649147669474, + "num_tokens": 686961662.0, + "step": 4094 + }, + { + "entropy": 1.7046670416990917, + "epoch": 0.449864052072176, + "grad_norm": 0.6924872398376465, + "learning_rate": 1.8266821323175833e-05, + "loss": 1.3393, + "mean_token_accuracy": 0.6566531558831533, + "num_tokens": 687125536.0, + "step": 4095 + }, + { + "entropy": 1.6233412722746532, + "epoch": 0.44997390898354894, + "grad_norm": 0.6866830587387085, + "learning_rate": 1.826587658895388e-05, + "loss": 1.3427, + "mean_token_accuracy": 0.6586346874634424, + "num_tokens": 687304023.0, + "step": 4096 + }, + { + "entropy": 1.659420023361842, + "epoch": 0.4500837658949219, + "grad_norm": 0.7031760811805725, + "learning_rate": 1.8264931624770198e-05, + "loss": 1.4316, + "mean_token_accuracy": 0.6656246980031332, + "num_tokens": 687457564.0, + "step": 4097 + }, + { + "entropy": 1.7746712168057759, + "epoch": 0.4501936228062948, + "grad_norm": 0.6770200729370117, + "learning_rate": 1.8263986430654713e-05, + "loss": 1.4262, + "mean_token_accuracy": 0.654616062839826, + "num_tokens": 687663560.0, + "step": 4098 + }, + { + "entropy": 1.6433165371418, + "epoch": 0.45030347971766777, + "grad_norm": 0.6647341251373291, + "learning_rate": 1.8263041006637326e-05, + "loss": 1.3283, + "mean_token_accuracy": 0.6662708769241968, + "num_tokens": 687804204.0, + "step": 4099 + }, + { + "entropy": 1.6864981253941853, + "epoch": 0.45041333662904065, + "grad_norm": 0.8306163549423218, + "learning_rate": 1.8262095352747964e-05, + "loss": 1.4174, + "mean_token_accuracy": 0.673849806189537, + "num_tokens": 687948844.0, + "step": 4100 + }, + { + "entropy": 1.6836271584033966, + "epoch": 0.4505231935404136, + "grad_norm": 0.712296187877655, + "learning_rate": 1.8261149469016554e-05, + "loss": 1.435, + "mean_token_accuracy": 0.6426206976175308, + "num_tokens": 688141646.0, + "step": 4101 + }, + { + "entropy": 1.702450027068456, + "epoch": 0.45063305045178653, + "grad_norm": 0.6855894923210144, + "learning_rate": 1.826020335547304e-05, + "loss": 1.4706, + "mean_token_accuracy": 0.6506734440724055, + "num_tokens": 688304546.0, + "step": 4102 + }, + { + "entropy": 1.7510944306850433, + "epoch": 0.4507429073631595, + "grad_norm": 0.7090582251548767, + "learning_rate": 1.825925701214736e-05, + "loss": 1.5314, + "mean_token_accuracy": 0.6348803093036016, + "num_tokens": 688491140.0, + "step": 4103 + }, + { + "entropy": 1.6313609679539998, + "epoch": 0.4508527642745324, + "grad_norm": 0.6523467898368835, + "learning_rate": 1.8258310439069464e-05, + "loss": 1.2844, + "mean_token_accuracy": 0.6678755730390549, + "num_tokens": 688670616.0, + "step": 4104 + }, + { + "entropy": 1.6910007297992706, + "epoch": 0.45096262118590535, + "grad_norm": 0.7028467655181885, + "learning_rate": 1.8257363636269315e-05, + "loss": 1.6276, + "mean_token_accuracy": 0.622983917593956, + "num_tokens": 688883798.0, + "step": 4105 + }, + { + "entropy": 1.735917756954829, + "epoch": 0.4510724780972783, + "grad_norm": 0.6216316819190979, + "learning_rate": 1.825641660377688e-05, + "loss": 1.39, + "mean_token_accuracy": 0.643417959411939, + "num_tokens": 689099236.0, + "step": 4106 + }, + { + "entropy": 1.7116466561953227, + "epoch": 0.45118233500865124, + "grad_norm": 0.639937698841095, + "learning_rate": 1.8255469341622127e-05, + "loss": 1.2285, + "mean_token_accuracy": 0.6806729783614477, + "num_tokens": 689222746.0, + "step": 4107 + }, + { + "entropy": 1.7362925708293915, + "epoch": 0.4512921919200242, + "grad_norm": 0.6595767140388489, + "learning_rate": 1.8254521849835038e-05, + "loss": 1.4364, + "mean_token_accuracy": 0.6543554663658142, + "num_tokens": 689427218.0, + "step": 4108 + }, + { + "entropy": 1.6418420473734539, + "epoch": 0.4514020488313971, + "grad_norm": 0.7125058770179749, + "learning_rate": 1.82535741284456e-05, + "loss": 1.4369, + "mean_token_accuracy": 0.6766296525796255, + "num_tokens": 689584306.0, + "step": 4109 + }, + { + "entropy": 1.709335704644521, + "epoch": 0.45151190574277006, + "grad_norm": 0.6635110974311829, + "learning_rate": 1.825262617748381e-05, + "loss": 1.3638, + "mean_token_accuracy": 0.668940449754397, + "num_tokens": 689719332.0, + "step": 4110 + }, + { + "entropy": 1.6440646350383759, + "epoch": 0.451621762654143, + "grad_norm": 0.6006796956062317, + "learning_rate": 1.8251677996979674e-05, + "loss": 1.3163, + "mean_token_accuracy": 0.671658530831337, + "num_tokens": 689892119.0, + "step": 4111 + }, + { + "entropy": 1.6859375437100728, + "epoch": 0.45173161956551594, + "grad_norm": 0.8752461075782776, + "learning_rate": 1.825072958696319e-05, + "loss": 1.4466, + "mean_token_accuracy": 0.6580664763847986, + "num_tokens": 690084462.0, + "step": 4112 + }, + { + "entropy": 1.6572500467300415, + "epoch": 0.4518414764768888, + "grad_norm": 0.7441008687019348, + "learning_rate": 1.8249780947464388e-05, + "loss": 1.2938, + "mean_token_accuracy": 0.6691722124814987, + "num_tokens": 690248438.0, + "step": 4113 + }, + { + "entropy": 1.738774597644806, + "epoch": 0.45195133338826177, + "grad_norm": 0.6293894648551941, + "learning_rate": 1.8248832078513284e-05, + "loss": 1.5194, + "mean_token_accuracy": 0.6342577387889227, + "num_tokens": 690441913.0, + "step": 4114 + }, + { + "entropy": 1.7527674436569214, + "epoch": 0.4520611902996347, + "grad_norm": 1.1343533992767334, + "learning_rate": 1.824788298013991e-05, + "loss": 1.3726, + "mean_token_accuracy": 0.6623023301362991, + "num_tokens": 690558299.0, + "step": 4115 + }, + { + "entropy": 1.6640233397483826, + "epoch": 0.45217104721100765, + "grad_norm": 0.6992958784103394, + "learning_rate": 1.8246933652374307e-05, + "loss": 1.2844, + "mean_token_accuracy": 0.6788427929083506, + "num_tokens": 690739947.0, + "step": 4116 + }, + { + "entropy": 1.7419516444206238, + "epoch": 0.4522809041223806, + "grad_norm": 0.5939506888389587, + "learning_rate": 1.8245984095246518e-05, + "loss": 1.4716, + "mean_token_accuracy": 0.6305443296829859, + "num_tokens": 690997943.0, + "step": 4117 + }, + { + "entropy": 1.748884916305542, + "epoch": 0.45239076103375353, + "grad_norm": 1.0354598760604858, + "learning_rate": 1.8245034308786598e-05, + "loss": 1.4619, + "mean_token_accuracy": 0.6558974186579386, + "num_tokens": 691153064.0, + "step": 4118 + }, + { + "entropy": 1.6696610649426777, + "epoch": 0.45250061794512647, + "grad_norm": 0.8129194974899292, + "learning_rate": 1.8244084293024607e-05, + "loss": 1.3371, + "mean_token_accuracy": 0.6734591573476791, + "num_tokens": 691283368.0, + "step": 4119 + }, + { + "entropy": 1.7136721312999725, + "epoch": 0.4526104748564994, + "grad_norm": 0.7549412250518799, + "learning_rate": 1.8243134047990615e-05, + "loss": 1.5517, + "mean_token_accuracy": 0.6566175570090612, + "num_tokens": 691452676.0, + "step": 4120 + }, + { + "entropy": 1.6813490390777588, + "epoch": 0.45272033176787235, + "grad_norm": 0.7577283978462219, + "learning_rate": 1.824218357371469e-05, + "loss": 1.2974, + "mean_token_accuracy": 0.6722496549288431, + "num_tokens": 691597131.0, + "step": 4121 + }, + { + "entropy": 1.713281015555064, + "epoch": 0.4528301886792453, + "grad_norm": 0.7372899651527405, + "learning_rate": 1.824123287022692e-05, + "loss": 1.4402, + "mean_token_accuracy": 0.6518440991640091, + "num_tokens": 691743841.0, + "step": 4122 + }, + { + "entropy": 1.6881952385107677, + "epoch": 0.45294004559061823, + "grad_norm": 0.5627362728118896, + "learning_rate": 1.824028193755739e-05, + "loss": 1.4554, + "mean_token_accuracy": 0.6475944221019745, + "num_tokens": 691960827.0, + "step": 4123 + }, + { + "entropy": 1.7053532501061757, + "epoch": 0.4530499025019912, + "grad_norm": 0.7508504986763, + "learning_rate": 1.8239330775736208e-05, + "loss": 1.4518, + "mean_token_accuracy": 0.660346490641435, + "num_tokens": 692117432.0, + "step": 4124 + }, + { + "entropy": 1.7332323094209034, + "epoch": 0.4531597594133641, + "grad_norm": 0.7479596734046936, + "learning_rate": 1.823837938479346e-05, + "loss": 1.3183, + "mean_token_accuracy": 0.6600077897310257, + "num_tokens": 692233378.0, + "step": 4125 + }, + { + "entropy": 1.7280583083629608, + "epoch": 0.45326961632473706, + "grad_norm": 0.7284284234046936, + "learning_rate": 1.8237427764759268e-05, + "loss": 1.2877, + "mean_token_accuracy": 0.6728298515081406, + "num_tokens": 692352078.0, + "step": 4126 + }, + { + "entropy": 1.733269860347112, + "epoch": 0.45337947323610994, + "grad_norm": 0.6703977584838867, + "learning_rate": 1.823647591566375e-05, + "loss": 1.318, + "mean_token_accuracy": 0.6564703285694122, + "num_tokens": 692524229.0, + "step": 4127 + }, + { + "entropy": 1.7730626364549, + "epoch": 0.4534893301474829, + "grad_norm": 0.720513105392456, + "learning_rate": 1.823552383753703e-05, + "loss": 1.5289, + "mean_token_accuracy": 0.6443274269501368, + "num_tokens": 692654697.0, + "step": 4128 + }, + { + "entropy": 1.6843830545743306, + "epoch": 0.4535991870588558, + "grad_norm": 0.6629505157470703, + "learning_rate": 1.823457153040924e-05, + "loss": 1.4347, + "mean_token_accuracy": 0.6500236590703329, + "num_tokens": 692822773.0, + "step": 4129 + }, + { + "entropy": 1.6885010202725728, + "epoch": 0.45370904397022876, + "grad_norm": 0.7359088659286499, + "learning_rate": 1.823361899431052e-05, + "loss": 1.1937, + "mean_token_accuracy": 0.6897448152303696, + "num_tokens": 692937863.0, + "step": 4130 + }, + { + "entropy": 1.6834450860818226, + "epoch": 0.4538189008816017, + "grad_norm": 0.6505681276321411, + "learning_rate": 1.8232666229271022e-05, + "loss": 1.4981, + "mean_token_accuracy": 0.6411355634530386, + "num_tokens": 693128486.0, + "step": 4131 + }, + { + "entropy": 1.67915278673172, + "epoch": 0.45392875779297465, + "grad_norm": 0.6337352991104126, + "learning_rate": 1.8231713235320897e-05, + "loss": 1.4664, + "mean_token_accuracy": 0.6389060864845911, + "num_tokens": 693290525.0, + "step": 4132 + }, + { + "entropy": 1.7464499572912853, + "epoch": 0.4540386147043476, + "grad_norm": 0.6891757249832153, + "learning_rate": 1.8230760012490303e-05, + "loss": 1.4274, + "mean_token_accuracy": 0.6466375986735026, + "num_tokens": 693428652.0, + "step": 4133 + }, + { + "entropy": 1.6625401278336842, + "epoch": 0.45414847161572053, + "grad_norm": 0.6760391592979431, + "learning_rate": 1.8229806560809414e-05, + "loss": 1.2088, + "mean_token_accuracy": 0.6958808700243632, + "num_tokens": 693542006.0, + "step": 4134 + }, + { + "entropy": 1.6763292451699574, + "epoch": 0.45425832852709347, + "grad_norm": 0.6257210373878479, + "learning_rate": 1.8228852880308406e-05, + "loss": 1.3335, + "mean_token_accuracy": 0.655666912595431, + "num_tokens": 693691484.0, + "step": 4135 + }, + { + "entropy": 1.6853256324927013, + "epoch": 0.4543681854384664, + "grad_norm": 0.7661891579627991, + "learning_rate": 1.8227898971017463e-05, + "loss": 1.3239, + "mean_token_accuracy": 0.6566215753555298, + "num_tokens": 693824382.0, + "step": 4136 + }, + { + "entropy": 1.7048971951007843, + "epoch": 0.45447804234983935, + "grad_norm": 0.5919292569160461, + "learning_rate": 1.822694483296677e-05, + "loss": 1.4844, + "mean_token_accuracy": 0.6445303509632746, + "num_tokens": 694012674.0, + "step": 4137 + }, + { + "entropy": 1.7726793487866719, + "epoch": 0.4545878992612123, + "grad_norm": 0.8488749265670776, + "learning_rate": 1.8225990466186535e-05, + "loss": 1.3119, + "mean_token_accuracy": 0.6763729850451151, + "num_tokens": 694131834.0, + "step": 4138 + }, + { + "entropy": 1.6792495449384053, + "epoch": 0.45469775617258523, + "grad_norm": 0.6812113523483276, + "learning_rate": 1.8225035870706954e-05, + "loss": 1.3836, + "mean_token_accuracy": 0.6785061955451965, + "num_tokens": 694265200.0, + "step": 4139 + }, + { + "entropy": 1.7176821033159893, + "epoch": 0.4548076130839581, + "grad_norm": 0.6587532162666321, + "learning_rate": 1.8224081046558245e-05, + "loss": 1.2966, + "mean_token_accuracy": 0.6683917393287023, + "num_tokens": 694386638.0, + "step": 4140 + }, + { + "entropy": 1.720909317334493, + "epoch": 0.45491746999533106, + "grad_norm": 0.6936853528022766, + "learning_rate": 1.8223125993770628e-05, + "loss": 1.2505, + "mean_token_accuracy": 0.6704763223727545, + "num_tokens": 694537910.0, + "step": 4141 + }, + { + "entropy": 1.616556574900945, + "epoch": 0.455027326906704, + "grad_norm": 0.6663881540298462, + "learning_rate": 1.8222170712374324e-05, + "loss": 1.4531, + "mean_token_accuracy": 0.6384978145360947, + "num_tokens": 694748185.0, + "step": 4142 + }, + { + "entropy": 1.7091066340605419, + "epoch": 0.45513718381807694, + "grad_norm": 0.6039077043533325, + "learning_rate": 1.8221215202399575e-05, + "loss": 1.4285, + "mean_token_accuracy": 0.6513800273338953, + "num_tokens": 694907770.0, + "step": 4143 + }, + { + "entropy": 1.7064509391784668, + "epoch": 0.4552470407294499, + "grad_norm": 0.6601234674453735, + "learning_rate": 1.8220259463876618e-05, + "loss": 1.4402, + "mean_token_accuracy": 0.6421432644128799, + "num_tokens": 695075241.0, + "step": 4144 + }, + { + "entropy": 1.7203948597113292, + "epoch": 0.4553568976408228, + "grad_norm": 1.0562800168991089, + "learning_rate": 1.8219303496835698e-05, + "loss": 1.3034, + "mean_token_accuracy": 0.6741875658432642, + "num_tokens": 695192463.0, + "step": 4145 + }, + { + "entropy": 1.7525762518246968, + "epoch": 0.45546675455219576, + "grad_norm": 0.6298994421958923, + "learning_rate": 1.8218347301307082e-05, + "loss": 1.4266, + "mean_token_accuracy": 0.6470825970172882, + "num_tokens": 695375939.0, + "step": 4146 + }, + { + "entropy": 1.7427566250165303, + "epoch": 0.4555766114635687, + "grad_norm": 0.5814052224159241, + "learning_rate": 1.8217390877321025e-05, + "loss": 1.3958, + "mean_token_accuracy": 0.6584447820981344, + "num_tokens": 695578543.0, + "step": 4147 + }, + { + "entropy": 1.7967002391815186, + "epoch": 0.45568646837494164, + "grad_norm": 0.8577557802200317, + "learning_rate": 1.8216434224907797e-05, + "loss": 1.36, + "mean_token_accuracy": 0.6580723375082016, + "num_tokens": 695749141.0, + "step": 4148 + }, + { + "entropy": 1.7214231391747792, + "epoch": 0.4557963252863146, + "grad_norm": 0.6104874610900879, + "learning_rate": 1.8215477344097678e-05, + "loss": 1.4512, + "mean_token_accuracy": 0.6513601044813792, + "num_tokens": 695926344.0, + "step": 4149 + }, + { + "entropy": 1.693449040253957, + "epoch": 0.4559061821976875, + "grad_norm": 0.5731639862060547, + "learning_rate": 1.821452023492095e-05, + "loss": 1.2681, + "mean_token_accuracy": 0.6703798075517019, + "num_tokens": 696083860.0, + "step": 4150 + }, + { + "entropy": 1.7001817524433136, + "epoch": 0.45601603910906047, + "grad_norm": 0.6833996176719666, + "learning_rate": 1.8213562897407915e-05, + "loss": 1.2219, + "mean_token_accuracy": 0.6803639133771261, + "num_tokens": 696195681.0, + "step": 4151 + }, + { + "entropy": 1.718630462884903, + "epoch": 0.4561258960204334, + "grad_norm": 0.6625598073005676, + "learning_rate": 1.8212605331588858e-05, + "loss": 1.4388, + "mean_token_accuracy": 0.6539454261461893, + "num_tokens": 696368212.0, + "step": 4152 + }, + { + "entropy": 1.7498544255892436, + "epoch": 0.45623575293180635, + "grad_norm": 0.6957936882972717, + "learning_rate": 1.8211647537494093e-05, + "loss": 1.2725, + "mean_token_accuracy": 0.6697985430558523, + "num_tokens": 696502071.0, + "step": 4153 + }, + { + "entropy": 1.6797556082407634, + "epoch": 0.45634560984317923, + "grad_norm": 0.8322230577468872, + "learning_rate": 1.8210689515153934e-05, + "loss": 1.4798, + "mean_token_accuracy": 0.6528271933396658, + "num_tokens": 696684855.0, + "step": 4154 + }, + { + "entropy": 1.7090383271376293, + "epoch": 0.4564554667545522, + "grad_norm": 0.6606098413467407, + "learning_rate": 1.82097312645987e-05, + "loss": 1.2863, + "mean_token_accuracy": 0.6675911794106165, + "num_tokens": 696875468.0, + "step": 4155 + }, + { + "entropy": 1.6464302639166515, + "epoch": 0.4565653236659251, + "grad_norm": 0.7480058670043945, + "learning_rate": 1.8208772785858724e-05, + "loss": 1.3633, + "mean_token_accuracy": 0.6650909036397934, + "num_tokens": 697063206.0, + "step": 4156 + }, + { + "entropy": 1.6582618256409962, + "epoch": 0.45667518057729806, + "grad_norm": 0.5773199796676636, + "learning_rate": 1.8207814078964335e-05, + "loss": 1.4054, + "mean_token_accuracy": 0.6530261288086573, + "num_tokens": 697347872.0, + "step": 4157 + }, + { + "entropy": 1.7507151464621227, + "epoch": 0.456785037488671, + "grad_norm": 0.6207919716835022, + "learning_rate": 1.820685514394588e-05, + "loss": 1.4224, + "mean_token_accuracy": 0.6475935826698939, + "num_tokens": 697519220.0, + "step": 4158 + }, + { + "entropy": 1.7405341863632202, + "epoch": 0.45689489440004394, + "grad_norm": 0.7598936557769775, + "learning_rate": 1.8205895980833708e-05, + "loss": 1.4921, + "mean_token_accuracy": 0.6516546607017517, + "num_tokens": 697663160.0, + "step": 4159 + }, + { + "entropy": 1.7247182031472523, + "epoch": 0.4570047513114169, + "grad_norm": 0.7689334750175476, + "learning_rate": 1.8204936589658172e-05, + "loss": 1.3283, + "mean_token_accuracy": 0.6829536060492197, + "num_tokens": 697796527.0, + "step": 4160 + }, + { + "entropy": 1.6589301824569702, + "epoch": 0.4571146082227898, + "grad_norm": 0.6805022358894348, + "learning_rate": 1.820397697044964e-05, + "loss": 1.3969, + "mean_token_accuracy": 0.6660082787275314, + "num_tokens": 698022883.0, + "step": 4161 + }, + { + "entropy": 1.7611735065778096, + "epoch": 0.45722446513416276, + "grad_norm": 0.774599015712738, + "learning_rate": 1.8203017123238484e-05, + "loss": 1.4106, + "mean_token_accuracy": 0.6451850136121114, + "num_tokens": 698259147.0, + "step": 4162 + }, + { + "entropy": 1.7202747464179993, + "epoch": 0.4573343220455357, + "grad_norm": 0.6693452000617981, + "learning_rate": 1.820205704805508e-05, + "loss": 1.3082, + "mean_token_accuracy": 0.6705300956964493, + "num_tokens": 698409150.0, + "step": 4163 + }, + { + "entropy": 1.6769547561804454, + "epoch": 0.45744417895690864, + "grad_norm": 0.5999146699905396, + "learning_rate": 1.820109674492982e-05, + "loss": 1.4154, + "mean_token_accuracy": 0.6518602818250656, + "num_tokens": 698583103.0, + "step": 4164 + }, + { + "entropy": 1.7415876587231953, + "epoch": 0.4575540358682816, + "grad_norm": 0.6980689167976379, + "learning_rate": 1.820013621389309e-05, + "loss": 1.4015, + "mean_token_accuracy": 0.6452522526184717, + "num_tokens": 698776410.0, + "step": 4165 + }, + { + "entropy": 1.7366366783777873, + "epoch": 0.4576638927796545, + "grad_norm": 0.6637096405029297, + "learning_rate": 1.8199175454975293e-05, + "loss": 1.3677, + "mean_token_accuracy": 0.6576673090457916, + "num_tokens": 698964243.0, + "step": 4166 + }, + { + "entropy": 1.7057179510593414, + "epoch": 0.4577737496910274, + "grad_norm": 1.1949498653411865, + "learning_rate": 1.8198214468206836e-05, + "loss": 1.3636, + "mean_token_accuracy": 0.6599676311016083, + "num_tokens": 699149054.0, + "step": 4167 + }, + { + "entropy": 1.6758360266685486, + "epoch": 0.45788360660240035, + "grad_norm": 0.6972344517707825, + "learning_rate": 1.819725325361814e-05, + "loss": 1.2711, + "mean_token_accuracy": 0.6723342637221018, + "num_tokens": 699279792.0, + "step": 4168 + }, + { + "entropy": 1.7160559793313344, + "epoch": 0.4579934635137733, + "grad_norm": 0.6513280868530273, + "learning_rate": 1.8196291811239614e-05, + "loss": 1.6112, + "mean_token_accuracy": 0.6390035400787989, + "num_tokens": 699483735.0, + "step": 4169 + }, + { + "entropy": 1.6684886713822682, + "epoch": 0.45810332042514623, + "grad_norm": 0.6418635249137878, + "learning_rate": 1.81953301411017e-05, + "loss": 1.4819, + "mean_token_accuracy": 0.6575515071551005, + "num_tokens": 699650354.0, + "step": 4170 + }, + { + "entropy": 1.7136259178320568, + "epoch": 0.4582131773365192, + "grad_norm": 0.8453693389892578, + "learning_rate": 1.819436824323483e-05, + "loss": 1.3379, + "mean_token_accuracy": 0.6703378160794576, + "num_tokens": 699778940.0, + "step": 4171 + }, + { + "entropy": 1.688475747903188, + "epoch": 0.4583230342478921, + "grad_norm": 0.6598884463310242, + "learning_rate": 1.8193406117669442e-05, + "loss": 1.3524, + "mean_token_accuracy": 0.654664620757103, + "num_tokens": 699941197.0, + "step": 4172 + }, + { + "entropy": 1.703117161989212, + "epoch": 0.45843289115926505, + "grad_norm": 0.6558519601821899, + "learning_rate": 1.8192443764435996e-05, + "loss": 1.3037, + "mean_token_accuracy": 0.6665119081735611, + "num_tokens": 700098267.0, + "step": 4173 + }, + { + "entropy": 1.699433147907257, + "epoch": 0.458542748070638, + "grad_norm": 0.7802864909172058, + "learning_rate": 1.8191481183564947e-05, + "loss": 1.5111, + "mean_token_accuracy": 0.6549594352642695, + "num_tokens": 700262148.0, + "step": 4174 + }, + { + "entropy": 1.7704039216041565, + "epoch": 0.45865260498201094, + "grad_norm": 0.8732252717018127, + "learning_rate": 1.8190518375086756e-05, + "loss": 1.4362, + "mean_token_accuracy": 0.6573774913946787, + "num_tokens": 700446932.0, + "step": 4175 + }, + { + "entropy": 1.6488378445307414, + "epoch": 0.4587624618933839, + "grad_norm": 0.702743649482727, + "learning_rate": 1.81895553390319e-05, + "loss": 1.3467, + "mean_token_accuracy": 0.6733838816483816, + "num_tokens": 700593036.0, + "step": 4176 + }, + { + "entropy": 1.73904745777448, + "epoch": 0.4588723188047568, + "grad_norm": 0.6903389096260071, + "learning_rate": 1.8188592075430854e-05, + "loss": 1.5451, + "mean_token_accuracy": 0.6281401266654333, + "num_tokens": 700807095.0, + "step": 4177 + }, + { + "entropy": 1.7598382830619812, + "epoch": 0.45898217571612976, + "grad_norm": 0.7444609999656677, + "learning_rate": 1.8187628584314113e-05, + "loss": 1.4259, + "mean_token_accuracy": 0.6544857770204544, + "num_tokens": 701000480.0, + "step": 4178 + }, + { + "entropy": 1.7172856330871582, + "epoch": 0.4590920326275027, + "grad_norm": 0.7468621134757996, + "learning_rate": 1.8186664865712163e-05, + "loss": 1.4648, + "mean_token_accuracy": 0.6561927249034246, + "num_tokens": 701148536.0, + "step": 4179 + }, + { + "entropy": 1.7054723699887593, + "epoch": 0.45920188953887564, + "grad_norm": 0.7430470585823059, + "learning_rate": 1.818570091965551e-05, + "loss": 1.3712, + "mean_token_accuracy": 0.662121370434761, + "num_tokens": 701303541.0, + "step": 4180 + }, + { + "entropy": 1.7247373064359028, + "epoch": 0.4593117464502485, + "grad_norm": 0.6628074645996094, + "learning_rate": 1.8184736746174658e-05, + "loss": 1.4219, + "mean_token_accuracy": 0.6571998844544092, + "num_tokens": 701471247.0, + "step": 4181 + }, + { + "entropy": 1.6875219146410625, + "epoch": 0.45942160336162147, + "grad_norm": 0.7151501178741455, + "learning_rate": 1.818377234530013e-05, + "loss": 1.3622, + "mean_token_accuracy": 0.6559572865565618, + "num_tokens": 701645326.0, + "step": 4182 + }, + { + "entropy": 1.7306556105613708, + "epoch": 0.4595314602729944, + "grad_norm": 0.6144996881484985, + "learning_rate": 1.818280771706244e-05, + "loss": 1.4024, + "mean_token_accuracy": 0.6534335116545359, + "num_tokens": 701805577.0, + "step": 4183 + }, + { + "entropy": 1.7050406138102214, + "epoch": 0.45964131718436735, + "grad_norm": 0.6486047506332397, + "learning_rate": 1.8181842861492126e-05, + "loss": 1.3204, + "mean_token_accuracy": 0.6658004969358444, + "num_tokens": 701938888.0, + "step": 4184 + }, + { + "entropy": 1.6983507772286732, + "epoch": 0.4597511740957403, + "grad_norm": 0.6919155716896057, + "learning_rate": 1.818087777861972e-05, + "loss": 1.4086, + "mean_token_accuracy": 0.6503052040934563, + "num_tokens": 702099604.0, + "step": 4185 + }, + { + "entropy": 1.6433574159940083, + "epoch": 0.45986103100711323, + "grad_norm": 0.8726625442504883, + "learning_rate": 1.8179912468475768e-05, + "loss": 1.2663, + "mean_token_accuracy": 0.6762971927722295, + "num_tokens": 702232628.0, + "step": 4186 + }, + { + "entropy": 1.6802456776301067, + "epoch": 0.45997088791848617, + "grad_norm": 0.8037099242210388, + "learning_rate": 1.8178946931090822e-05, + "loss": 1.3511, + "mean_token_accuracy": 0.6654014339049658, + "num_tokens": 702373194.0, + "step": 4187 + }, + { + "entropy": 1.6643012166023254, + "epoch": 0.4600807448298591, + "grad_norm": 0.794750452041626, + "learning_rate": 1.817798116649544e-05, + "loss": 1.3445, + "mean_token_accuracy": 0.6783427347739538, + "num_tokens": 702492194.0, + "step": 4188 + }, + { + "entropy": 1.7596985697746277, + "epoch": 0.46019060174123205, + "grad_norm": 0.6531470417976379, + "learning_rate": 1.8177015174720186e-05, + "loss": 1.5094, + "mean_token_accuracy": 0.642010380824407, + "num_tokens": 702706325.0, + "step": 4189 + }, + { + "entropy": 1.7269844611485798, + "epoch": 0.460300458652605, + "grad_norm": 0.7435563206672668, + "learning_rate": 1.817604895579564e-05, + "loss": 1.4141, + "mean_token_accuracy": 0.6406523485978445, + "num_tokens": 702871807.0, + "step": 4190 + }, + { + "entropy": 1.7121829390525818, + "epoch": 0.46041031556397793, + "grad_norm": 0.6904016137123108, + "learning_rate": 1.817508250975238e-05, + "loss": 1.4689, + "mean_token_accuracy": 0.6557297557592392, + "num_tokens": 703015510.0, + "step": 4191 + }, + { + "entropy": 1.6617660621802013, + "epoch": 0.4605201724753509, + "grad_norm": 0.5894798636436462, + "learning_rate": 1.8174115836620985e-05, + "loss": 1.385, + "mean_token_accuracy": 0.6519947598377863, + "num_tokens": 703225072.0, + "step": 4192 + }, + { + "entropy": 1.717599133650462, + "epoch": 0.4606300293867238, + "grad_norm": 0.6478140354156494, + "learning_rate": 1.8173148936432062e-05, + "loss": 1.4113, + "mean_token_accuracy": 0.6411587198575338, + "num_tokens": 703450060.0, + "step": 4193 + }, + { + "entropy": 1.69918089111646, + "epoch": 0.4607398862980967, + "grad_norm": 0.6551547050476074, + "learning_rate": 1.8172181809216206e-05, + "loss": 1.4039, + "mean_token_accuracy": 0.6578977555036545, + "num_tokens": 703686905.0, + "step": 4194 + }, + { + "entropy": 1.7954679628213246, + "epoch": 0.46084974320946964, + "grad_norm": 0.7491250038146973, + "learning_rate": 1.8171214455004024e-05, + "loss": 1.4899, + "mean_token_accuracy": 0.6477667888005575, + "num_tokens": 703859777.0, + "step": 4195 + }, + { + "entropy": 1.6918367048104603, + "epoch": 0.4609596001208426, + "grad_norm": 0.6992884874343872, + "learning_rate": 1.817024687382614e-05, + "loss": 1.3772, + "mean_token_accuracy": 0.6759544163942337, + "num_tokens": 703980670.0, + "step": 4196 + }, + { + "entropy": 1.6738916039466858, + "epoch": 0.4610694570322155, + "grad_norm": 0.9104806184768677, + "learning_rate": 1.8169279065713173e-05, + "loss": 1.3039, + "mean_token_accuracy": 0.6699705421924591, + "num_tokens": 704127133.0, + "step": 4197 + }, + { + "entropy": 1.731379359960556, + "epoch": 0.46117931394358846, + "grad_norm": 0.6715532541275024, + "learning_rate": 1.8168311030695753e-05, + "loss": 1.4613, + "mean_token_accuracy": 0.6445593535900116, + "num_tokens": 704309900.0, + "step": 4198 + }, + { + "entropy": 1.6891814172267914, + "epoch": 0.4612891708549614, + "grad_norm": 0.5850751399993896, + "learning_rate": 1.8167342768804518e-05, + "loss": 1.4346, + "mean_token_accuracy": 0.6435820659001669, + "num_tokens": 704530111.0, + "step": 4199 + }, + { + "entropy": 1.6919045945008595, + "epoch": 0.46139902776633435, + "grad_norm": 0.7237669825553894, + "learning_rate": 1.8166374280070118e-05, + "loss": 1.5484, + "mean_token_accuracy": 0.627090315024058, + "num_tokens": 704732824.0, + "step": 4200 + }, + { + "entropy": 1.7176773647467296, + "epoch": 0.4615088846777073, + "grad_norm": 0.7041877508163452, + "learning_rate": 1.81654055645232e-05, + "loss": 1.5581, + "mean_token_accuracy": 0.6256896555423737, + "num_tokens": 704992928.0, + "step": 4201 + }, + { + "entropy": 1.6486956278483074, + "epoch": 0.46161874158908023, + "grad_norm": 0.7475844621658325, + "learning_rate": 1.8164436622194425e-05, + "loss": 1.4561, + "mean_token_accuracy": 0.6535168488820394, + "num_tokens": 705157798.0, + "step": 4202 + }, + { + "entropy": 1.7114735345045726, + "epoch": 0.46172859850045317, + "grad_norm": 0.657800555229187, + "learning_rate": 1.8163467453114454e-05, + "loss": 1.2966, + "mean_token_accuracy": 0.6700218071540197, + "num_tokens": 705312195.0, + "step": 4203 + }, + { + "entropy": 1.7229937215646107, + "epoch": 0.4618384554118261, + "grad_norm": 0.6623897552490234, + "learning_rate": 1.816249805731397e-05, + "loss": 1.393, + "mean_token_accuracy": 0.6550327589114507, + "num_tokens": 705484259.0, + "step": 4204 + }, + { + "entropy": 1.7269688149293263, + "epoch": 0.46194831232319905, + "grad_norm": 0.6905580163002014, + "learning_rate": 1.816152843482365e-05, + "loss": 1.4828, + "mean_token_accuracy": 0.6393428792556127, + "num_tokens": 705693058.0, + "step": 4205 + }, + { + "entropy": 1.6766071319580078, + "epoch": 0.462058169234572, + "grad_norm": 0.6602928638458252, + "learning_rate": 1.816055858567418e-05, + "loss": 1.3148, + "mean_token_accuracy": 0.6683008025089899, + "num_tokens": 705870511.0, + "step": 4206 + }, + { + "entropy": 1.6405730545520782, + "epoch": 0.46216802614594493, + "grad_norm": 0.7517810463905334, + "learning_rate": 1.8159588509896262e-05, + "loss": 1.2918, + "mean_token_accuracy": 0.6697532882293066, + "num_tokens": 706068879.0, + "step": 4207 + }, + { + "entropy": 1.7424190441767375, + "epoch": 0.4622778830573178, + "grad_norm": 0.7065527439117432, + "learning_rate": 1.815861820752059e-05, + "loss": 1.3579, + "mean_token_accuracy": 0.6661938230196635, + "num_tokens": 706280061.0, + "step": 4208 + }, + { + "entropy": 1.648894727230072, + "epoch": 0.46238773996869076, + "grad_norm": 0.5504491925239563, + "learning_rate": 1.815764767857788e-05, + "loss": 1.3697, + "mean_token_accuracy": 0.6563327610492706, + "num_tokens": 706477220.0, + "step": 4209 + }, + { + "entropy": 1.6771936317284901, + "epoch": 0.4624975968800637, + "grad_norm": 0.6242183446884155, + "learning_rate": 1.8156676923098847e-05, + "loss": 1.435, + "mean_token_accuracy": 0.6519751648108164, + "num_tokens": 706659831.0, + "step": 4210 + }, + { + "entropy": 1.6414129038651784, + "epoch": 0.46260745379143664, + "grad_norm": 0.6485480070114136, + "learning_rate": 1.815570594111421e-05, + "loss": 1.2985, + "mean_token_accuracy": 0.6742209245761236, + "num_tokens": 706803926.0, + "step": 4211 + }, + { + "entropy": 1.7177407443523407, + "epoch": 0.4627173107028096, + "grad_norm": 0.6429049968719482, + "learning_rate": 1.8154734732654708e-05, + "loss": 1.4829, + "mean_token_accuracy": 0.6438324997822443, + "num_tokens": 707004084.0, + "step": 4212 + }, + { + "entropy": 1.7026935319105785, + "epoch": 0.4628271676141825, + "grad_norm": 0.5772531628608704, + "learning_rate": 1.8153763297751072e-05, + "loss": 1.3852, + "mean_token_accuracy": 0.6560295174519221, + "num_tokens": 707183877.0, + "step": 4213 + }, + { + "entropy": 1.7273212869962056, + "epoch": 0.46293702452555546, + "grad_norm": 0.6105433702468872, + "learning_rate": 1.8152791636434057e-05, + "loss": 1.596, + "mean_token_accuracy": 0.6389935463666916, + "num_tokens": 707371269.0, + "step": 4214 + }, + { + "entropy": 1.7505205670992534, + "epoch": 0.4630468814369284, + "grad_norm": 0.6398033499717712, + "learning_rate": 1.8151819748734404e-05, + "loss": 1.4717, + "mean_token_accuracy": 0.6463633726040522, + "num_tokens": 707525669.0, + "step": 4215 + }, + { + "entropy": 1.7202585935592651, + "epoch": 0.46315673834830134, + "grad_norm": 0.6473333835601807, + "learning_rate": 1.8150847634682883e-05, + "loss": 1.4063, + "mean_token_accuracy": 0.643185963233312, + "num_tokens": 707735780.0, + "step": 4216 + }, + { + "entropy": 1.7204928199450176, + "epoch": 0.4632665952596743, + "grad_norm": 0.7724722027778625, + "learning_rate": 1.8149875294310253e-05, + "loss": 1.4856, + "mean_token_accuracy": 0.6558432877063751, + "num_tokens": 707897505.0, + "step": 4217 + }, + { + "entropy": 1.7396677335103352, + "epoch": 0.4633764521710472, + "grad_norm": 0.7432756423950195, + "learning_rate": 1.8148902727647293e-05, + "loss": 1.3591, + "mean_token_accuracy": 0.651182030638059, + "num_tokens": 708040117.0, + "step": 4218 + }, + { + "entropy": 1.716854214668274, + "epoch": 0.46348630908242017, + "grad_norm": 0.7438036799430847, + "learning_rate": 1.8147929934724783e-05, + "loss": 1.4251, + "mean_token_accuracy": 0.6668249318997065, + "num_tokens": 708197909.0, + "step": 4219 + }, + { + "entropy": 1.770167201757431, + "epoch": 0.4635961659937931, + "grad_norm": 0.7183382511138916, + "learning_rate": 1.8146956915573512e-05, + "loss": 1.5752, + "mean_token_accuracy": 0.6293011605739594, + "num_tokens": 708386792.0, + "step": 4220 + }, + { + "entropy": 1.8206903139750164, + "epoch": 0.46370602290516605, + "grad_norm": 0.8068307638168335, + "learning_rate": 1.8145983670224278e-05, + "loss": 1.3832, + "mean_token_accuracy": 0.6531219184398651, + "num_tokens": 708535932.0, + "step": 4221 + }, + { + "entropy": 1.7339389224847157, + "epoch": 0.46381587981653893, + "grad_norm": 0.5660984516143799, + "learning_rate": 1.8145010198707875e-05, + "loss": 1.4435, + "mean_token_accuracy": 0.6397424240907034, + "num_tokens": 708781487.0, + "step": 4222 + }, + { + "entropy": 1.7772690852483113, + "epoch": 0.4639257367279119, + "grad_norm": 0.7798457145690918, + "learning_rate": 1.8144036501055123e-05, + "loss": 1.5243, + "mean_token_accuracy": 0.6494849175214767, + "num_tokens": 708920912.0, + "step": 4223 + }, + { + "entropy": 1.7303331792354584, + "epoch": 0.4640355936392848, + "grad_norm": 0.6171626448631287, + "learning_rate": 1.8143062577296835e-05, + "loss": 1.4185, + "mean_token_accuracy": 0.6400276025136312, + "num_tokens": 709104156.0, + "step": 4224 + }, + { + "entropy": 1.6459304491678874, + "epoch": 0.46414545055065776, + "grad_norm": 0.6023601293563843, + "learning_rate": 1.814208842746383e-05, + "loss": 1.3254, + "mean_token_accuracy": 0.6633083323637644, + "num_tokens": 709240703.0, + "step": 4225 + }, + { + "entropy": 1.6952888270219166, + "epoch": 0.4642553074620307, + "grad_norm": 0.7373741865158081, + "learning_rate": 1.814111405158695e-05, + "loss": 1.4096, + "mean_token_accuracy": 0.641968791683515, + "num_tokens": 709451932.0, + "step": 4226 + }, + { + "entropy": 1.745063195625941, + "epoch": 0.46436516437340364, + "grad_norm": 0.7500935792922974, + "learning_rate": 1.8140139449697028e-05, + "loss": 1.4981, + "mean_token_accuracy": 0.65280049542586, + "num_tokens": 709691349.0, + "step": 4227 + }, + { + "entropy": 1.8040038843949635, + "epoch": 0.4644750212847766, + "grad_norm": 0.8563746213912964, + "learning_rate": 1.8139164621824907e-05, + "loss": 1.4651, + "mean_token_accuracy": 0.6291163365046183, + "num_tokens": 709840840.0, + "step": 4228 + }, + { + "entropy": 1.665471722682317, + "epoch": 0.4645848781961495, + "grad_norm": 0.6224023103713989, + "learning_rate": 1.8138189568001445e-05, + "loss": 1.3097, + "mean_token_accuracy": 0.6575134992599487, + "num_tokens": 709985259.0, + "step": 4229 + }, + { + "entropy": 1.605366716782252, + "epoch": 0.46469473510752246, + "grad_norm": 0.6934778690338135, + "learning_rate": 1.8137214288257497e-05, + "loss": 1.352, + "mean_token_accuracy": 0.6759183506170908, + "num_tokens": 710148873.0, + "step": 4230 + }, + { + "entropy": 1.7328666150569916, + "epoch": 0.4648045920188954, + "grad_norm": 0.6883534789085388, + "learning_rate": 1.8136238782623937e-05, + "loss": 1.4765, + "mean_token_accuracy": 0.6576277663310369, + "num_tokens": 710287781.0, + "step": 4231 + }, + { + "entropy": 1.7534802854061127, + "epoch": 0.46491444893026834, + "grad_norm": 0.7432095408439636, + "learning_rate": 1.813526305113163e-05, + "loss": 1.386, + "mean_token_accuracy": 0.6680583655834198, + "num_tokens": 710462445.0, + "step": 4232 + }, + { + "entropy": 1.7207884788513184, + "epoch": 0.4650243058416413, + "grad_norm": 0.6437093615531921, + "learning_rate": 1.813428709381147e-05, + "loss": 1.2856, + "mean_token_accuracy": 0.6679480870564779, + "num_tokens": 710589866.0, + "step": 4233 + }, + { + "entropy": 1.667872816324234, + "epoch": 0.4651341627530142, + "grad_norm": 0.6963672637939453, + "learning_rate": 1.813331091069433e-05, + "loss": 1.3454, + "mean_token_accuracy": 0.6745727012554804, + "num_tokens": 710746216.0, + "step": 4234 + }, + { + "entropy": 1.7533026337623596, + "epoch": 0.4652440196643871, + "grad_norm": 0.9982355833053589, + "learning_rate": 1.813233450181112e-05, + "loss": 1.5738, + "mean_token_accuracy": 0.6681816776593527, + "num_tokens": 710904614.0, + "step": 4235 + }, + { + "entropy": 1.6505067149798076, + "epoch": 0.46535387657576005, + "grad_norm": 0.9331510066986084, + "learning_rate": 1.8131357867192738e-05, + "loss": 1.0794, + "mean_token_accuracy": 0.6999476154645284, + "num_tokens": 711051143.0, + "step": 4236 + }, + { + "entropy": 1.690159449974696, + "epoch": 0.465463733487133, + "grad_norm": 0.7899004817008972, + "learning_rate": 1.8130381006870087e-05, + "loss": 1.544, + "mean_token_accuracy": 0.6387489885091782, + "num_tokens": 711244954.0, + "step": 4237 + }, + { + "entropy": 1.726913332939148, + "epoch": 0.46557359039850593, + "grad_norm": 0.6733956933021545, + "learning_rate": 1.8129403920874093e-05, + "loss": 1.4999, + "mean_token_accuracy": 0.6343253751595815, + "num_tokens": 711452273.0, + "step": 4238 + }, + { + "entropy": 1.7664103507995605, + "epoch": 0.4656834473098789, + "grad_norm": 0.7927049398422241, + "learning_rate": 1.8128426609235673e-05, + "loss": 1.3575, + "mean_token_accuracy": 0.6616076578696569, + "num_tokens": 711585439.0, + "step": 4239 + }, + { + "entropy": 1.729611297448476, + "epoch": 0.4657933042212518, + "grad_norm": 0.7896009683609009, + "learning_rate": 1.812744907198577e-05, + "loss": 1.3908, + "mean_token_accuracy": 0.6635908087094625, + "num_tokens": 711773155.0, + "step": 4240 + }, + { + "entropy": 1.6715228458245595, + "epoch": 0.46590316113262475, + "grad_norm": 0.7059125304222107, + "learning_rate": 1.8126471309155314e-05, + "loss": 1.4095, + "mean_token_accuracy": 0.6561342726151148, + "num_tokens": 711914804.0, + "step": 4241 + }, + { + "entropy": 1.7312528987725575, + "epoch": 0.4660130180439977, + "grad_norm": 0.6025689244270325, + "learning_rate": 1.812549332077525e-05, + "loss": 1.546, + "mean_token_accuracy": 0.6508001486460367, + "num_tokens": 712129813.0, + "step": 4242 + }, + { + "entropy": 1.7039500772953033, + "epoch": 0.46612287495537064, + "grad_norm": 0.8072606921195984, + "learning_rate": 1.8124515106876534e-05, + "loss": 1.5661, + "mean_token_accuracy": 0.6641567001740137, + "num_tokens": 712276945.0, + "step": 4243 + }, + { + "entropy": 1.716274122397105, + "epoch": 0.4662327318667436, + "grad_norm": 0.7391960620880127, + "learning_rate": 1.8123536667490127e-05, + "loss": 1.2449, + "mean_token_accuracy": 0.6780173579851786, + "num_tokens": 712433226.0, + "step": 4244 + }, + { + "entropy": 1.7017336984475453, + "epoch": 0.4663425887781165, + "grad_norm": 0.6935653686523438, + "learning_rate": 1.812255800264699e-05, + "loss": 1.3873, + "mean_token_accuracy": 0.6610340823729833, + "num_tokens": 712584310.0, + "step": 4245 + }, + { + "entropy": 1.6905235250790913, + "epoch": 0.46645244568948946, + "grad_norm": 0.6185707449913025, + "learning_rate": 1.8121579112378106e-05, + "loss": 1.5186, + "mean_token_accuracy": 0.6343037039041519, + "num_tokens": 712824746.0, + "step": 4246 + }, + { + "entropy": 1.7747002641359966, + "epoch": 0.4665623026008624, + "grad_norm": 0.7528815269470215, + "learning_rate": 1.812059999671445e-05, + "loss": 1.5891, + "mean_token_accuracy": 0.6364632795254389, + "num_tokens": 713007981.0, + "step": 4247 + }, + { + "entropy": 1.7169700662295024, + "epoch": 0.46667215951223534, + "grad_norm": 0.6410589218139648, + "learning_rate": 1.811962065568702e-05, + "loss": 1.3224, + "mean_token_accuracy": 0.6646380325158437, + "num_tokens": 713151048.0, + "step": 4248 + }, + { + "entropy": 1.7213744620482128, + "epoch": 0.4667820164236082, + "grad_norm": 0.6539469957351685, + "learning_rate": 1.8118641089326795e-05, + "loss": 1.3819, + "mean_token_accuracy": 0.6464784244696299, + "num_tokens": 713338388.0, + "step": 4249 + }, + { + "entropy": 1.69509752591451, + "epoch": 0.46689187333498117, + "grad_norm": 0.6454856395721436, + "learning_rate": 1.811766129766479e-05, + "loss": 1.3495, + "mean_token_accuracy": 0.663901224732399, + "num_tokens": 713520267.0, + "step": 4250 + }, + { + "entropy": 1.7052615284919739, + "epoch": 0.4670017302463541, + "grad_norm": 0.6770211458206177, + "learning_rate": 1.811668128073201e-05, + "loss": 1.5541, + "mean_token_accuracy": 0.6339400360981623, + "num_tokens": 713708802.0, + "step": 4251 + }, + { + "entropy": 1.6545226871967316, + "epoch": 0.46711158715772705, + "grad_norm": 0.7631199359893799, + "learning_rate": 1.811570103855948e-05, + "loss": 1.1884, + "mean_token_accuracy": 0.6902492394049963, + "num_tokens": 713859057.0, + "step": 4252 + }, + { + "entropy": 1.755904217561086, + "epoch": 0.4672214440691, + "grad_norm": 0.7086718082427979, + "learning_rate": 1.8114720571178215e-05, + "loss": 1.3183, + "mean_token_accuracy": 0.6702389965454737, + "num_tokens": 713975952.0, + "step": 4253 + }, + { + "entropy": 1.7080712616443634, + "epoch": 0.46733130098047293, + "grad_norm": 0.7158058881759644, + "learning_rate": 1.811373987861925e-05, + "loss": 1.5163, + "mean_token_accuracy": 0.649703840414683, + "num_tokens": 714152247.0, + "step": 4254 + }, + { + "entropy": 1.6785069803396861, + "epoch": 0.46744115789184587, + "grad_norm": 0.8712213039398193, + "learning_rate": 1.8112758960913622e-05, + "loss": 1.4157, + "mean_token_accuracy": 0.6431390146414439, + "num_tokens": 714309228.0, + "step": 4255 + }, + { + "entropy": 1.6953572432200115, + "epoch": 0.4675510148032188, + "grad_norm": 0.7182803750038147, + "learning_rate": 1.811177781809238e-05, + "loss": 1.4312, + "mean_token_accuracy": 0.6548483719428381, + "num_tokens": 714477884.0, + "step": 4256 + }, + { + "entropy": 1.6632357239723206, + "epoch": 0.46766087171459175, + "grad_norm": 0.6293473839759827, + "learning_rate": 1.8110796450186575e-05, + "loss": 1.4182, + "mean_token_accuracy": 0.6520185619592667, + "num_tokens": 714659406.0, + "step": 4257 + }, + { + "entropy": 1.7589415311813354, + "epoch": 0.4677707286259647, + "grad_norm": 0.6381205916404724, + "learning_rate": 1.810981485722727e-05, + "loss": 1.3121, + "mean_token_accuracy": 0.6649649838606516, + "num_tokens": 714782279.0, + "step": 4258 + }, + { + "entropy": 1.7126728395620983, + "epoch": 0.46788058553733763, + "grad_norm": 0.7846829295158386, + "learning_rate": 1.8108833039245522e-05, + "loss": 1.3273, + "mean_token_accuracy": 0.6554552515347799, + "num_tokens": 714981056.0, + "step": 4259 + }, + { + "entropy": 1.702657401561737, + "epoch": 0.4679904424487106, + "grad_norm": 0.9136094450950623, + "learning_rate": 1.8107850996272414e-05, + "loss": 1.5338, + "mean_token_accuracy": 0.654092272122701, + "num_tokens": 715155260.0, + "step": 4260 + }, + { + "entropy": 1.6792535583178203, + "epoch": 0.4681002993600835, + "grad_norm": 0.6011461019515991, + "learning_rate": 1.8106868728339024e-05, + "loss": 1.3942, + "mean_token_accuracy": 0.6538469940423965, + "num_tokens": 715359586.0, + "step": 4261 + }, + { + "entropy": 1.7443041900793712, + "epoch": 0.4682101562714564, + "grad_norm": 0.8767224550247192, + "learning_rate": 1.810588623547644e-05, + "loss": 1.5, + "mean_token_accuracy": 0.650155504544576, + "num_tokens": 715519166.0, + "step": 4262 + }, + { + "entropy": 1.7525466084480286, + "epoch": 0.46832001318282934, + "grad_norm": 0.707300066947937, + "learning_rate": 1.8104903517715765e-05, + "loss": 1.4655, + "mean_token_accuracy": 0.6447683721780777, + "num_tokens": 715712649.0, + "step": 4263 + }, + { + "entropy": 1.756022532780965, + "epoch": 0.4684298700942023, + "grad_norm": 0.6850044131278992, + "learning_rate": 1.8103920575088092e-05, + "loss": 1.3964, + "mean_token_accuracy": 0.6608478824297587, + "num_tokens": 715836727.0, + "step": 4264 + }, + { + "entropy": 1.7747258146603901, + "epoch": 0.4685397270055752, + "grad_norm": 0.8568280935287476, + "learning_rate": 1.810293740762453e-05, + "loss": 1.3805, + "mean_token_accuracy": 0.6580488632122675, + "num_tokens": 715980752.0, + "step": 4265 + }, + { + "entropy": 1.7230225205421448, + "epoch": 0.46864958391694816, + "grad_norm": 0.7651719450950623, + "learning_rate": 1.8101954015356204e-05, + "loss": 1.3571, + "mean_token_accuracy": 0.6602768997351328, + "num_tokens": 716150567.0, + "step": 4266 + }, + { + "entropy": 1.6723913550376892, + "epoch": 0.4687594408283211, + "grad_norm": 0.9501248002052307, + "learning_rate": 1.810097039831423e-05, + "loss": 1.3772, + "mean_token_accuracy": 0.6587880253791809, + "num_tokens": 716307037.0, + "step": 4267 + }, + { + "entropy": 1.7002276877562206, + "epoch": 0.46886929773969405, + "grad_norm": 0.6281111836433411, + "learning_rate": 1.8099986556529748e-05, + "loss": 1.5066, + "mean_token_accuracy": 0.630008652806282, + "num_tokens": 716473812.0, + "step": 4268 + }, + { + "entropy": 1.7168980836868286, + "epoch": 0.468979154651067, + "grad_norm": 0.6951456665992737, + "learning_rate": 1.8099002490033886e-05, + "loss": 1.3071, + "mean_token_accuracy": 0.6706042140722275, + "num_tokens": 716625498.0, + "step": 4269 + }, + { + "entropy": 1.6649049123128254, + "epoch": 0.46908901156243993, + "grad_norm": 0.783587634563446, + "learning_rate": 1.8098018198857797e-05, + "loss": 1.5879, + "mean_token_accuracy": 0.6355616301298141, + "num_tokens": 716806833.0, + "step": 4270 + }, + { + "entropy": 1.6906762719154358, + "epoch": 0.46919886847381287, + "grad_norm": 0.7415374517440796, + "learning_rate": 1.8097033683032627e-05, + "loss": 1.2876, + "mean_token_accuracy": 0.6685866812864939, + "num_tokens": 716934086.0, + "step": 4271 + }, + { + "entropy": 1.6387253900369008, + "epoch": 0.4693087253851858, + "grad_norm": 0.7111901640892029, + "learning_rate": 1.8096048942589545e-05, + "loss": 1.4057, + "mean_token_accuracy": 0.6655648052692413, + "num_tokens": 717123686.0, + "step": 4272 + }, + { + "entropy": 1.7091521223386128, + "epoch": 0.46941858229655875, + "grad_norm": 0.8598929047584534, + "learning_rate": 1.8095063977559706e-05, + "loss": 1.4529, + "mean_token_accuracy": 0.6519492069880167, + "num_tokens": 717281760.0, + "step": 4273 + }, + { + "entropy": 1.7045681079228718, + "epoch": 0.4695284392079317, + "grad_norm": 0.6577821969985962, + "learning_rate": 1.809407878797429e-05, + "loss": 1.4014, + "mean_token_accuracy": 0.6432823886473974, + "num_tokens": 717548049.0, + "step": 4274 + }, + { + "entropy": 1.7074416081110637, + "epoch": 0.46963829611930463, + "grad_norm": 0.7430242300033569, + "learning_rate": 1.809309337386448e-05, + "loss": 1.216, + "mean_token_accuracy": 0.6894436677296957, + "num_tokens": 717666119.0, + "step": 4275 + }, + { + "entropy": 1.7699640194574993, + "epoch": 0.4697481530306775, + "grad_norm": 0.635347306728363, + "learning_rate": 1.8092107735261456e-05, + "loss": 1.3919, + "mean_token_accuracy": 0.6452954411506653, + "num_tokens": 717850472.0, + "step": 4276 + }, + { + "entropy": 1.6869786580403645, + "epoch": 0.46985800994205046, + "grad_norm": 0.6386687755584717, + "learning_rate": 1.8091121872196424e-05, + "loss": 1.396, + "mean_token_accuracy": 0.6727607051531473, + "num_tokens": 718040207.0, + "step": 4277 + }, + { + "entropy": 1.741450657447179, + "epoch": 0.4699678668534234, + "grad_norm": 0.6524372696876526, + "learning_rate": 1.8090135784700573e-05, + "loss": 1.4028, + "mean_token_accuracy": 0.6533336142698923, + "num_tokens": 718201275.0, + "step": 4278 + }, + { + "entropy": 1.712234725554784, + "epoch": 0.47007772376479634, + "grad_norm": 0.6172965168952942, + "learning_rate": 1.8089149472805124e-05, + "loss": 1.3392, + "mean_token_accuracy": 0.6610304166873296, + "num_tokens": 718387782.0, + "step": 4279 + }, + { + "entropy": 1.7065201203028362, + "epoch": 0.4701875806761693, + "grad_norm": 1.2049860954284668, + "learning_rate": 1.808816293654129e-05, + "loss": 1.3301, + "mean_token_accuracy": 0.6645681808392206, + "num_tokens": 718584128.0, + "step": 4280 + }, + { + "entropy": 1.7023885548114777, + "epoch": 0.4702974375875422, + "grad_norm": 0.7760981917381287, + "learning_rate": 1.808717617594029e-05, + "loss": 1.459, + "mean_token_accuracy": 0.6542429427305857, + "num_tokens": 718796876.0, + "step": 4281 + }, + { + "entropy": 1.6860670546690624, + "epoch": 0.47040729449891516, + "grad_norm": 0.5433915257453918, + "learning_rate": 1.808618919103336e-05, + "loss": 1.3945, + "mean_token_accuracy": 0.6503080328305563, + "num_tokens": 719018527.0, + "step": 4282 + }, + { + "entropy": 1.6572819451491039, + "epoch": 0.4705171514102881, + "grad_norm": 0.6897891759872437, + "learning_rate": 1.8085201981851736e-05, + "loss": 1.2418, + "mean_token_accuracy": 0.6722082744042078, + "num_tokens": 719147052.0, + "step": 4283 + }, + { + "entropy": 1.6799413760503132, + "epoch": 0.47062700832166104, + "grad_norm": 0.6752045154571533, + "learning_rate": 1.8084214548426654e-05, + "loss": 1.4649, + "mean_token_accuracy": 0.6453558802604675, + "num_tokens": 719329449.0, + "step": 4284 + }, + { + "entropy": 1.722192605336507, + "epoch": 0.470736865233034, + "grad_norm": 0.6820024847984314, + "learning_rate": 1.808322689078938e-05, + "loss": 1.4336, + "mean_token_accuracy": 0.6577473928531011, + "num_tokens": 719487213.0, + "step": 4285 + }, + { + "entropy": 1.74460373322169, + "epoch": 0.4708467221444069, + "grad_norm": 0.6988834142684937, + "learning_rate": 1.808223900897117e-05, + "loss": 1.4352, + "mean_token_accuracy": 0.6543379972378413, + "num_tokens": 719694255.0, + "step": 4286 + }, + { + "entropy": 1.7346366445223491, + "epoch": 0.47095657905577987, + "grad_norm": 0.583162248134613, + "learning_rate": 1.808125090300328e-05, + "loss": 1.5286, + "mean_token_accuracy": 0.641033207376798, + "num_tokens": 719893163.0, + "step": 4287 + }, + { + "entropy": 1.7112852732340496, + "epoch": 0.4710664359671528, + "grad_norm": 0.7566839456558228, + "learning_rate": 1.8080262572916995e-05, + "loss": 1.4578, + "mean_token_accuracy": 0.6502961864074072, + "num_tokens": 720100241.0, + "step": 4288 + }, + { + "entropy": 1.7262468834718068, + "epoch": 0.4711762928785257, + "grad_norm": 0.7071588039398193, + "learning_rate": 1.8079274018743586e-05, + "loss": 1.4254, + "mean_token_accuracy": 0.6600110133488973, + "num_tokens": 720246867.0, + "step": 4289 + }, + { + "entropy": 1.7163510620594025, + "epoch": 0.47128614978989863, + "grad_norm": 0.6831440925598145, + "learning_rate": 1.8078285240514346e-05, + "loss": 1.4533, + "mean_token_accuracy": 0.6454131652911504, + "num_tokens": 720458244.0, + "step": 4290 + }, + { + "entropy": 1.728024274110794, + "epoch": 0.4713960067012716, + "grad_norm": 0.7538992762565613, + "learning_rate": 1.8077296238260566e-05, + "loss": 1.411, + "mean_token_accuracy": 0.6426265041033427, + "num_tokens": 720649523.0, + "step": 4291 + }, + { + "entropy": 1.7086349626382191, + "epoch": 0.4715058636126445, + "grad_norm": 0.7316638231277466, + "learning_rate": 1.807630701201355e-05, + "loss": 1.5146, + "mean_token_accuracy": 0.6412151654561361, + "num_tokens": 720851857.0, + "step": 4292 + }, + { + "entropy": 1.7693568468093872, + "epoch": 0.47161572052401746, + "grad_norm": 0.632696807384491, + "learning_rate": 1.8075317561804607e-05, + "loss": 1.4909, + "mean_token_accuracy": 0.6514314661423365, + "num_tokens": 721060765.0, + "step": 4293 + }, + { + "entropy": 1.6974764168262482, + "epoch": 0.4717255774353904, + "grad_norm": 0.6965688467025757, + "learning_rate": 1.8074327887665055e-05, + "loss": 1.2898, + "mean_token_accuracy": 0.6642700731754303, + "num_tokens": 721194660.0, + "step": 4294 + }, + { + "entropy": 1.6968635221322377, + "epoch": 0.47183543434676334, + "grad_norm": 0.6898081302642822, + "learning_rate": 1.8073337989626204e-05, + "loss": 1.3322, + "mean_token_accuracy": 0.6627618571122488, + "num_tokens": 721383484.0, + "step": 4295 + }, + { + "entropy": 1.6964893241723378, + "epoch": 0.4719452912581363, + "grad_norm": 0.8017412424087524, + "learning_rate": 1.80723478677194e-05, + "loss": 1.5026, + "mean_token_accuracy": 0.6530030220746994, + "num_tokens": 721549670.0, + "step": 4296 + }, + { + "entropy": 1.6955971519152324, + "epoch": 0.4720551481695092, + "grad_norm": 0.6565828919410706, + "learning_rate": 1.8071357521975973e-05, + "loss": 1.3502, + "mean_token_accuracy": 0.65217158695062, + "num_tokens": 721745653.0, + "step": 4297 + }, + { + "entropy": 1.7536290884017944, + "epoch": 0.47216500508088216, + "grad_norm": 0.8173093795776367, + "learning_rate": 1.8070366952427264e-05, + "loss": 1.2962, + "mean_token_accuracy": 0.6676636189222336, + "num_tokens": 721858073.0, + "step": 4298 + }, + { + "entropy": 1.7875964442888896, + "epoch": 0.4722748619922551, + "grad_norm": 0.7748461961746216, + "learning_rate": 1.8069376159104627e-05, + "loss": 1.3616, + "mean_token_accuracy": 0.6534435500701269, + "num_tokens": 721959036.0, + "step": 4299 + }, + { + "entropy": 1.6941121816635132, + "epoch": 0.47238471890362804, + "grad_norm": 0.6175353527069092, + "learning_rate": 1.8068385142039423e-05, + "loss": 1.422, + "mean_token_accuracy": 0.6544578274091085, + "num_tokens": 722165019.0, + "step": 4300 + }, + { + "entropy": 1.7405302226543427, + "epoch": 0.472494575815001, + "grad_norm": 0.7650004625320435, + "learning_rate": 1.8067393901263012e-05, + "loss": 1.4173, + "mean_token_accuracy": 0.6511756877104441, + "num_tokens": 722308976.0, + "step": 4301 + }, + { + "entropy": 1.676590492328008, + "epoch": 0.4726044327263739, + "grad_norm": 0.6478630900382996, + "learning_rate": 1.806640243680677e-05, + "loss": 1.4806, + "mean_token_accuracy": 0.6535717646280924, + "num_tokens": 722469917.0, + "step": 4302 + }, + { + "entropy": 1.7782863477865856, + "epoch": 0.4727142896377468, + "grad_norm": 0.719274640083313, + "learning_rate": 1.8065410748702074e-05, + "loss": 1.2816, + "mean_token_accuracy": 0.65965636074543, + "num_tokens": 722610621.0, + "step": 4303 + }, + { + "entropy": 1.710064172744751, + "epoch": 0.47282414654911975, + "grad_norm": 0.7524754405021667, + "learning_rate": 1.8064418836980308e-05, + "loss": 1.309, + "mean_token_accuracy": 0.6599444597959518, + "num_tokens": 722755392.0, + "step": 4304 + }, + { + "entropy": 1.7238652805487316, + "epoch": 0.4729340034604927, + "grad_norm": 0.7304325699806213, + "learning_rate": 1.8063426701672873e-05, + "loss": 1.4739, + "mean_token_accuracy": 0.6304669479529063, + "num_tokens": 722957312.0, + "step": 4305 + }, + { + "entropy": 1.7300621767838795, + "epoch": 0.47304386037186563, + "grad_norm": 0.6810824275016785, + "learning_rate": 1.8062434342811162e-05, + "loss": 1.2365, + "mean_token_accuracy": 0.676527221997579, + "num_tokens": 723083250.0, + "step": 4306 + }, + { + "entropy": 1.6840552985668182, + "epoch": 0.4731537172832386, + "grad_norm": 0.6959050297737122, + "learning_rate": 1.806144176042659e-05, + "loss": 1.2747, + "mean_token_accuracy": 0.6719006498654684, + "num_tokens": 723200410.0, + "step": 4307 + }, + { + "entropy": 1.674494077761968, + "epoch": 0.4732635741946115, + "grad_norm": 0.821858823299408, + "learning_rate": 1.806044895455057e-05, + "loss": 1.5042, + "mean_token_accuracy": 0.6550217668215433, + "num_tokens": 723415334.0, + "step": 4308 + }, + { + "entropy": 1.7024798194567363, + "epoch": 0.47337343110598445, + "grad_norm": 0.6039428114891052, + "learning_rate": 1.805945592521452e-05, + "loss": 1.529, + "mean_token_accuracy": 0.6412127415339152, + "num_tokens": 723682772.0, + "step": 4309 + }, + { + "entropy": 1.6901743511358898, + "epoch": 0.4734832880173574, + "grad_norm": 0.6753347516059875, + "learning_rate": 1.805846267244987e-05, + "loss": 1.4604, + "mean_token_accuracy": 0.6477407167355219, + "num_tokens": 723833718.0, + "step": 4310 + }, + { + "entropy": 1.7576901117960613, + "epoch": 0.47359314492873034, + "grad_norm": 0.6248276233673096, + "learning_rate": 1.805746919628806e-05, + "loss": 1.4668, + "mean_token_accuracy": 0.6350358178218206, + "num_tokens": 724044003.0, + "step": 4311 + }, + { + "entropy": 1.8473595082759857, + "epoch": 0.4737030018401033, + "grad_norm": 0.74982750415802, + "learning_rate": 1.805647549676053e-05, + "loss": 1.3729, + "mean_token_accuracy": 0.6502262949943542, + "num_tokens": 724178065.0, + "step": 4312 + }, + { + "entropy": 1.701841801404953, + "epoch": 0.4738128587514762, + "grad_norm": 0.6254753470420837, + "learning_rate": 1.805548157389873e-05, + "loss": 1.4934, + "mean_token_accuracy": 0.6539788742860159, + "num_tokens": 724368669.0, + "step": 4313 + }, + { + "entropy": 1.706893652677536, + "epoch": 0.47392271566284916, + "grad_norm": 0.7036319971084595, + "learning_rate": 1.8054487427734114e-05, + "loss": 1.3255, + "mean_token_accuracy": 0.6660927186409632, + "num_tokens": 724504066.0, + "step": 4314 + }, + { + "entropy": 1.7539520064989726, + "epoch": 0.4740325725742221, + "grad_norm": 0.7536458373069763, + "learning_rate": 1.805349305829815e-05, + "loss": 1.4542, + "mean_token_accuracy": 0.655565415819486, + "num_tokens": 724665861.0, + "step": 4315 + }, + { + "entropy": 1.6917597552140553, + "epoch": 0.474142429485595, + "grad_norm": 0.7118553519248962, + "learning_rate": 1.8052498465622314e-05, + "loss": 1.3756, + "mean_token_accuracy": 0.6556372493505478, + "num_tokens": 724831820.0, + "step": 4316 + }, + { + "entropy": 1.6829906304677327, + "epoch": 0.4742522863969679, + "grad_norm": 0.6873073577880859, + "learning_rate": 1.8051503649738072e-05, + "loss": 1.2962, + "mean_token_accuracy": 0.6665286769469579, + "num_tokens": 724954659.0, + "step": 4317 + }, + { + "entropy": 1.7101400991280873, + "epoch": 0.47436214330834087, + "grad_norm": 0.6849009990692139, + "learning_rate": 1.8050508610676922e-05, + "loss": 1.3413, + "mean_token_accuracy": 0.6543218890825907, + "num_tokens": 725144933.0, + "step": 4318 + }, + { + "entropy": 1.7625108063220978, + "epoch": 0.4744720002197138, + "grad_norm": 0.6787395477294922, + "learning_rate": 1.804951334847035e-05, + "loss": 1.4429, + "mean_token_accuracy": 0.6538337916135788, + "num_tokens": 725328948.0, + "step": 4319 + }, + { + "entropy": 1.7518400251865387, + "epoch": 0.47458185713108675, + "grad_norm": 0.7725258469581604, + "learning_rate": 1.804851786314986e-05, + "loss": 1.4116, + "mean_token_accuracy": 0.6575403213500977, + "num_tokens": 725488681.0, + "step": 4320 + }, + { + "entropy": 1.7236202557881672, + "epoch": 0.4746917140424597, + "grad_norm": 0.6077833771705627, + "learning_rate": 1.8047522154746953e-05, + "loss": 1.5031, + "mean_token_accuracy": 0.6396900862455368, + "num_tokens": 725679245.0, + "step": 4321 + }, + { + "entropy": 1.6955150763193767, + "epoch": 0.47480157095383263, + "grad_norm": 0.628399670124054, + "learning_rate": 1.8046526223293147e-05, + "loss": 1.4053, + "mean_token_accuracy": 0.6565594325462977, + "num_tokens": 725865088.0, + "step": 4322 + }, + { + "entropy": 1.6883783340454102, + "epoch": 0.47491142786520557, + "grad_norm": 0.7310377359390259, + "learning_rate": 1.804553006881996e-05, + "loss": 1.479, + "mean_token_accuracy": 0.6422811150550842, + "num_tokens": 726050927.0, + "step": 4323 + }, + { + "entropy": 1.709149529536565, + "epoch": 0.4750212847765785, + "grad_norm": 0.683785080909729, + "learning_rate": 1.8044533691358924e-05, + "loss": 1.3372, + "mean_token_accuracy": 0.655320425828298, + "num_tokens": 726212417.0, + "step": 4324 + }, + { + "entropy": 1.694841782251994, + "epoch": 0.47513114168795145, + "grad_norm": 0.6588259339332581, + "learning_rate": 1.8043537090941566e-05, + "loss": 1.447, + "mean_token_accuracy": 0.6471510380506516, + "num_tokens": 726413076.0, + "step": 4325 + }, + { + "entropy": 1.632349779208501, + "epoch": 0.4752409985993244, + "grad_norm": 0.7610387802124023, + "learning_rate": 1.8042540267599434e-05, + "loss": 1.2092, + "mean_token_accuracy": 0.6765281210343043, + "num_tokens": 726564146.0, + "step": 4326 + }, + { + "entropy": 1.6315424640973408, + "epoch": 0.47535085551069733, + "grad_norm": 0.660910427570343, + "learning_rate": 1.804154322136408e-05, + "loss": 1.4702, + "mean_token_accuracy": 0.652957613269488, + "num_tokens": 726780422.0, + "step": 4327 + }, + { + "entropy": 1.766084998846054, + "epoch": 0.4754607124220703, + "grad_norm": 0.7246162295341492, + "learning_rate": 1.8040545952267054e-05, + "loss": 1.3591, + "mean_token_accuracy": 0.6496909161408743, + "num_tokens": 726905268.0, + "step": 4328 + }, + { + "entropy": 1.6654809912045796, + "epoch": 0.4755705693334432, + "grad_norm": 0.5605107545852661, + "learning_rate": 1.803954846033992e-05, + "loss": 1.4205, + "mean_token_accuracy": 0.6426665484905243, + "num_tokens": 727121679.0, + "step": 4329 + }, + { + "entropy": 1.651865929365158, + "epoch": 0.4756804262448161, + "grad_norm": 0.7960909605026245, + "learning_rate": 1.803855074561425e-05, + "loss": 1.4079, + "mean_token_accuracy": 0.6638611356417338, + "num_tokens": 727277692.0, + "step": 4330 + }, + { + "entropy": 1.6862823764483135, + "epoch": 0.47579028315618904, + "grad_norm": 0.7021883726119995, + "learning_rate": 1.8037552808121623e-05, + "loss": 1.3779, + "mean_token_accuracy": 0.6561082353194555, + "num_tokens": 727419140.0, + "step": 4331 + }, + { + "entropy": 1.6958340108394623, + "epoch": 0.475900140067562, + "grad_norm": 0.5929916501045227, + "learning_rate": 1.8036554647893614e-05, + "loss": 1.4561, + "mean_token_accuracy": 0.6269871642192205, + "num_tokens": 727672721.0, + "step": 4332 + }, + { + "entropy": 1.667265961567561, + "epoch": 0.4760099969789349, + "grad_norm": 0.7513339519500732, + "learning_rate": 1.8035556264961827e-05, + "loss": 1.4436, + "mean_token_accuracy": 0.6567800442377726, + "num_tokens": 727874717.0, + "step": 4333 + }, + { + "entropy": 1.768438736597697, + "epoch": 0.47611985389030786, + "grad_norm": 0.893416702747345, + "learning_rate": 1.8034557659357854e-05, + "loss": 1.3738, + "mean_token_accuracy": 0.6622404058774313, + "num_tokens": 728038303.0, + "step": 4334 + }, + { + "entropy": 1.6817525227864583, + "epoch": 0.4762297108016808, + "grad_norm": 0.6666687726974487, + "learning_rate": 1.8033558831113296e-05, + "loss": 1.4322, + "mean_token_accuracy": 0.6520895212888718, + "num_tokens": 728211654.0, + "step": 4335 + }, + { + "entropy": 1.7423338790734608, + "epoch": 0.47633956771305375, + "grad_norm": 0.7516040205955505, + "learning_rate": 1.8032559780259777e-05, + "loss": 1.4885, + "mean_token_accuracy": 0.6466931303342184, + "num_tokens": 728382033.0, + "step": 4336 + }, + { + "entropy": 1.714792827765147, + "epoch": 0.4764494246244267, + "grad_norm": 0.6818469762802124, + "learning_rate": 1.803156050682891e-05, + "loss": 1.4962, + "mean_token_accuracy": 0.6401876310507456, + "num_tokens": 728568806.0, + "step": 4337 + }, + { + "entropy": 1.6942576467990875, + "epoch": 0.47655928153579963, + "grad_norm": 0.7585091590881348, + "learning_rate": 1.8030561010852318e-05, + "loss": 1.417, + "mean_token_accuracy": 0.6545371363560358, + "num_tokens": 728696501.0, + "step": 4338 + }, + { + "entropy": 1.6838724116484325, + "epoch": 0.47666913844717257, + "grad_norm": 0.6353939771652222, + "learning_rate": 1.8029561292361636e-05, + "loss": 1.3464, + "mean_token_accuracy": 0.6659842431545258, + "num_tokens": 728827471.0, + "step": 4339 + }, + { + "entropy": 1.736908346414566, + "epoch": 0.4767789953585455, + "grad_norm": 0.6974917650222778, + "learning_rate": 1.802856135138851e-05, + "loss": 1.3136, + "mean_token_accuracy": 0.6713606069485346, + "num_tokens": 728948450.0, + "step": 4340 + }, + { + "entropy": 1.7290816803773243, + "epoch": 0.47688885226991845, + "grad_norm": 0.6494654417037964, + "learning_rate": 1.8027561187964583e-05, + "loss": 1.3931, + "mean_token_accuracy": 0.6582505901654562, + "num_tokens": 729130424.0, + "step": 4341 + }, + { + "entropy": 1.7498342792193096, + "epoch": 0.4769987091812914, + "grad_norm": 0.7365612983703613, + "learning_rate": 1.8026560802121514e-05, + "loss": 1.4257, + "mean_token_accuracy": 0.656251793106397, + "num_tokens": 729273974.0, + "step": 4342 + }, + { + "entropy": 1.6831127107143402, + "epoch": 0.47710856609266433, + "grad_norm": 0.6563628315925598, + "learning_rate": 1.8025560193890957e-05, + "loss": 1.2273, + "mean_token_accuracy": 0.6860732932885488, + "num_tokens": 729412026.0, + "step": 4343 + }, + { + "entropy": 1.7000919779141743, + "epoch": 0.4772184230040372, + "grad_norm": 0.6804105639457703, + "learning_rate": 1.802455936330459e-05, + "loss": 1.4399, + "mean_token_accuracy": 0.6513183464606603, + "num_tokens": 729599255.0, + "step": 4344 + }, + { + "entropy": 1.6972023944060008, + "epoch": 0.47732827991541016, + "grad_norm": 0.7301200032234192, + "learning_rate": 1.8023558310394085e-05, + "loss": 1.4833, + "mean_token_accuracy": 0.6496182779471079, + "num_tokens": 729748405.0, + "step": 4345 + }, + { + "entropy": 1.6879161496957142, + "epoch": 0.4774381368267831, + "grad_norm": 0.7359111309051514, + "learning_rate": 1.802255703519112e-05, + "loss": 1.3136, + "mean_token_accuracy": 0.6658755342165629, + "num_tokens": 729884748.0, + "step": 4346 + }, + { + "entropy": 1.6763985455036163, + "epoch": 0.47754799373815604, + "grad_norm": 0.6414416432380676, + "learning_rate": 1.802155553772739e-05, + "loss": 1.4129, + "mean_token_accuracy": 0.6613647441069285, + "num_tokens": 730055896.0, + "step": 4347 + }, + { + "entropy": 1.670947680870692, + "epoch": 0.477657850649529, + "grad_norm": 0.6289905905723572, + "learning_rate": 1.8020553818034598e-05, + "loss": 1.3287, + "mean_token_accuracy": 0.6603012681007385, + "num_tokens": 730249265.0, + "step": 4348 + }, + { + "entropy": 1.7056255042552948, + "epoch": 0.4777677075609019, + "grad_norm": 0.6382940411567688, + "learning_rate": 1.801955187614443e-05, + "loss": 1.288, + "mean_token_accuracy": 0.6747541030248007, + "num_tokens": 730391443.0, + "step": 4349 + }, + { + "entropy": 1.6851585308710735, + "epoch": 0.47787756447227486, + "grad_norm": 0.735909640789032, + "learning_rate": 1.8018549712088616e-05, + "loss": 1.4657, + "mean_token_accuracy": 0.6491023351748785, + "num_tokens": 730555554.0, + "step": 4350 + }, + { + "entropy": 1.7253076136112213, + "epoch": 0.4779874213836478, + "grad_norm": 0.6947519183158875, + "learning_rate": 1.8017547325898867e-05, + "loss": 1.5152, + "mean_token_accuracy": 0.6418954481681188, + "num_tokens": 730748007.0, + "step": 4351 + }, + { + "entropy": 1.7013379335403442, + "epoch": 0.47809727829502074, + "grad_norm": 0.7011768221855164, + "learning_rate": 1.8016544717606902e-05, + "loss": 1.4242, + "mean_token_accuracy": 0.6465904712677002, + "num_tokens": 730952979.0, + "step": 4352 + }, + { + "entropy": 1.6715512077013652, + "epoch": 0.4782071352063937, + "grad_norm": 0.7021841406822205, + "learning_rate": 1.8015541887244464e-05, + "loss": 1.5118, + "mean_token_accuracy": 0.6352577755848566, + "num_tokens": 731173855.0, + "step": 4353 + }, + { + "entropy": 1.6683913866678874, + "epoch": 0.4783169921177666, + "grad_norm": 0.8116162419319153, + "learning_rate": 1.801453883484328e-05, + "loss": 1.5141, + "mean_token_accuracy": 0.6527464812000593, + "num_tokens": 731356672.0, + "step": 4354 + }, + { + "entropy": 1.7470175723234813, + "epoch": 0.47842684902913957, + "grad_norm": 0.72096848487854, + "learning_rate": 1.801353556043511e-05, + "loss": 1.3687, + "mean_token_accuracy": 0.653298462430636, + "num_tokens": 731479511.0, + "step": 4355 + }, + { + "entropy": 1.6899640957514446, + "epoch": 0.4785367059405125, + "grad_norm": 0.7514692544937134, + "learning_rate": 1.8012532064051695e-05, + "loss": 1.4348, + "mean_token_accuracy": 0.6547542959451675, + "num_tokens": 731618070.0, + "step": 4356 + }, + { + "entropy": 1.7060795327027638, + "epoch": 0.4786465628518854, + "grad_norm": 0.6599856019020081, + "learning_rate": 1.8011528345724804e-05, + "loss": 1.4117, + "mean_token_accuracy": 0.6612872232993444, + "num_tokens": 731758018.0, + "step": 4357 + }, + { + "entropy": 1.7181631028652191, + "epoch": 0.47875641976325833, + "grad_norm": 0.8530781865119934, + "learning_rate": 1.8010524405486197e-05, + "loss": 1.6284, + "mean_token_accuracy": 0.6449073478579521, + "num_tokens": 731909530.0, + "step": 4358 + }, + { + "entropy": 1.7219158411026, + "epoch": 0.4788662766746313, + "grad_norm": 0.6865781545639038, + "learning_rate": 1.8009520243367652e-05, + "loss": 1.4611, + "mean_token_accuracy": 0.6500067412853241, + "num_tokens": 732098400.0, + "step": 4359 + }, + { + "entropy": 1.6570659577846527, + "epoch": 0.4789761335860042, + "grad_norm": 0.6869319677352905, + "learning_rate": 1.800851585940095e-05, + "loss": 1.4586, + "mean_token_accuracy": 0.6624219765265783, + "num_tokens": 732254430.0, + "step": 4360 + }, + { + "entropy": 1.719583551088969, + "epoch": 0.47908599049737716, + "grad_norm": 0.6726648807525635, + "learning_rate": 1.800751125361788e-05, + "loss": 1.5502, + "mean_token_accuracy": 0.643532986442248, + "num_tokens": 732476478.0, + "step": 4361 + }, + { + "entropy": 1.714612752199173, + "epoch": 0.4791958474087501, + "grad_norm": 0.654414713382721, + "learning_rate": 1.8006506426050235e-05, + "loss": 1.4203, + "mean_token_accuracy": 0.6522560815016428, + "num_tokens": 732620031.0, + "step": 4362 + }, + { + "entropy": 1.7117332716782887, + "epoch": 0.47930570432012304, + "grad_norm": 0.7520214319229126, + "learning_rate": 1.8005501376729818e-05, + "loss": 1.4046, + "mean_token_accuracy": 0.6597649057706197, + "num_tokens": 732780049.0, + "step": 4363 + }, + { + "entropy": 1.845967213312785, + "epoch": 0.479415561231496, + "grad_norm": 0.6956945061683655, + "learning_rate": 1.800449610568844e-05, + "loss": 1.525, + "mean_token_accuracy": 0.6418049583832423, + "num_tokens": 732964821.0, + "step": 4364 + }, + { + "entropy": 1.7225728332996368, + "epoch": 0.4795254181428689, + "grad_norm": 1.2998478412628174, + "learning_rate": 1.800349061295792e-05, + "loss": 1.3252, + "mean_token_accuracy": 0.6638927906751633, + "num_tokens": 733081631.0, + "step": 4365 + }, + { + "entropy": 1.6589208642641704, + "epoch": 0.47963527505424186, + "grad_norm": 0.5991522073745728, + "learning_rate": 1.8002484898570073e-05, + "loss": 1.4725, + "mean_token_accuracy": 0.6591239819924036, + "num_tokens": 733257026.0, + "step": 4366 + }, + { + "entropy": 1.6971391240755718, + "epoch": 0.4797451319656148, + "grad_norm": 0.6521365642547607, + "learning_rate": 1.800147896255674e-05, + "loss": 1.3539, + "mean_token_accuracy": 0.6691495180130005, + "num_tokens": 733446044.0, + "step": 4367 + }, + { + "entropy": 1.7168017029762268, + "epoch": 0.47985498887698774, + "grad_norm": 0.6156492233276367, + "learning_rate": 1.800047280494975e-05, + "loss": 1.4011, + "mean_token_accuracy": 0.6625112245480219, + "num_tokens": 733656891.0, + "step": 4368 + }, + { + "entropy": 1.7018579840660095, + "epoch": 0.4799648457883607, + "grad_norm": 0.7289679050445557, + "learning_rate": 1.7999466425780948e-05, + "loss": 1.1951, + "mean_token_accuracy": 0.6875071277221044, + "num_tokens": 733770636.0, + "step": 4369 + }, + { + "entropy": 1.71112060546875, + "epoch": 0.4800747026997336, + "grad_norm": 0.7808387279510498, + "learning_rate": 1.7998459825082192e-05, + "loss": 1.2365, + "mean_token_accuracy": 0.6787021855513254, + "num_tokens": 733883779.0, + "step": 4370 + }, + { + "entropy": 1.778799831867218, + "epoch": 0.4801845596111065, + "grad_norm": 0.736013650894165, + "learning_rate": 1.799745300288533e-05, + "loss": 1.5431, + "mean_token_accuracy": 0.6357332865397135, + "num_tokens": 734086295.0, + "step": 4371 + }, + { + "entropy": 1.648956149816513, + "epoch": 0.48029441652247945, + "grad_norm": 0.5882608294487, + "learning_rate": 1.7996445959222237e-05, + "loss": 1.4369, + "mean_token_accuracy": 0.6544050325949987, + "num_tokens": 734312987.0, + "step": 4372 + }, + { + "entropy": 1.6493046085039775, + "epoch": 0.4804042734338524, + "grad_norm": 0.7231053709983826, + "learning_rate": 1.7995438694124782e-05, + "loss": 1.2697, + "mean_token_accuracy": 0.6721268246571223, + "num_tokens": 734444526.0, + "step": 4373 + }, + { + "entropy": 1.7278722524642944, + "epoch": 0.48051413034522533, + "grad_norm": 0.7068110108375549, + "learning_rate": 1.7994431207624845e-05, + "loss": 1.4124, + "mean_token_accuracy": 0.654108315706253, + "num_tokens": 734630234.0, + "step": 4374 + }, + { + "entropy": 1.7103682061036427, + "epoch": 0.4806239872565983, + "grad_norm": 0.605778157711029, + "learning_rate": 1.7993423499754314e-05, + "loss": 1.372, + "mean_token_accuracy": 0.6507877210776011, + "num_tokens": 734822527.0, + "step": 4375 + }, + { + "entropy": 1.7161585489908855, + "epoch": 0.4807338441679712, + "grad_norm": 0.7217233777046204, + "learning_rate": 1.7992415570545078e-05, + "loss": 1.2415, + "mean_token_accuracy": 0.6822354594866434, + "num_tokens": 734943030.0, + "step": 4376 + }, + { + "entropy": 1.6399229069550831, + "epoch": 0.48084370107934415, + "grad_norm": 0.5977116823196411, + "learning_rate": 1.799140742002904e-05, + "loss": 1.3927, + "mean_token_accuracy": 0.6513981918493906, + "num_tokens": 735178947.0, + "step": 4377 + }, + { + "entropy": 1.6955311596393585, + "epoch": 0.4809535579907171, + "grad_norm": 0.8311605453491211, + "learning_rate": 1.7990399048238107e-05, + "loss": 1.3563, + "mean_token_accuracy": 0.6646893272797266, + "num_tokens": 735359480.0, + "step": 4378 + }, + { + "entropy": 1.7144683202107747, + "epoch": 0.48106341490209004, + "grad_norm": 0.6836698055267334, + "learning_rate": 1.7989390455204193e-05, + "loss": 1.3922, + "mean_token_accuracy": 0.6572039127349854, + "num_tokens": 735565677.0, + "step": 4379 + }, + { + "entropy": 1.7366363008817036, + "epoch": 0.481173271813463, + "grad_norm": 0.8185579180717468, + "learning_rate": 1.7988381640959223e-05, + "loss": 1.4593, + "mean_token_accuracy": 0.6663338194290797, + "num_tokens": 735749818.0, + "step": 4380 + }, + { + "entropy": 1.712560087442398, + "epoch": 0.4812831287248359, + "grad_norm": 0.7879271507263184, + "learning_rate": 1.7987372605535123e-05, + "loss": 1.5386, + "mean_token_accuracy": 0.655731255809466, + "num_tokens": 735916786.0, + "step": 4381 + }, + { + "entropy": 1.7204462985197704, + "epoch": 0.48139298563620886, + "grad_norm": 0.8967468738555908, + "learning_rate": 1.798636334896383e-05, + "loss": 1.3121, + "mean_token_accuracy": 0.665294274687767, + "num_tokens": 736095507.0, + "step": 4382 + }, + { + "entropy": 1.6719180544217427, + "epoch": 0.4815028425475818, + "grad_norm": 0.7155001163482666, + "learning_rate": 1.7985353871277284e-05, + "loss": 1.4705, + "mean_token_accuracy": 0.6568170140186945, + "num_tokens": 736227693.0, + "step": 4383 + }, + { + "entropy": 1.6862878203392029, + "epoch": 0.4816126994589547, + "grad_norm": 0.7051180601119995, + "learning_rate": 1.798434417250743e-05, + "loss": 1.4039, + "mean_token_accuracy": 0.658642495671908, + "num_tokens": 736398354.0, + "step": 4384 + }, + { + "entropy": 1.7053897380828857, + "epoch": 0.4817225563703276, + "grad_norm": 0.6290838122367859, + "learning_rate": 1.7983334252686236e-05, + "loss": 1.3671, + "mean_token_accuracy": 0.6522834599018097, + "num_tokens": 736528732.0, + "step": 4385 + }, + { + "entropy": 1.7519411742687225, + "epoch": 0.48183241328170057, + "grad_norm": 0.7291600704193115, + "learning_rate": 1.798232411184566e-05, + "loss": 1.3714, + "mean_token_accuracy": 0.6766092479228973, + "num_tokens": 736658095.0, + "step": 4386 + }, + { + "entropy": 1.6966538329919179, + "epoch": 0.4819422701930735, + "grad_norm": 0.8086454272270203, + "learning_rate": 1.7981313750017672e-05, + "loss": 1.405, + "mean_token_accuracy": 0.6665053268273672, + "num_tokens": 736795590.0, + "step": 4387 + }, + { + "entropy": 1.7400443057219188, + "epoch": 0.48205212710444645, + "grad_norm": 0.6428614854812622, + "learning_rate": 1.798030316723425e-05, + "loss": 1.4453, + "mean_token_accuracy": 0.6424688597520193, + "num_tokens": 736984070.0, + "step": 4388 + }, + { + "entropy": 1.6672666768232982, + "epoch": 0.4821619840158194, + "grad_norm": 0.7134124040603638, + "learning_rate": 1.7979292363527375e-05, + "loss": 1.469, + "mean_token_accuracy": 0.6481978793938955, + "num_tokens": 737202061.0, + "step": 4389 + }, + { + "entropy": 1.7022729615370433, + "epoch": 0.48227184092719233, + "grad_norm": 0.6730693578720093, + "learning_rate": 1.7978281338929048e-05, + "loss": 1.3695, + "mean_token_accuracy": 0.655126636226972, + "num_tokens": 737403611.0, + "step": 4390 + }, + { + "entropy": 1.6466123759746552, + "epoch": 0.48238169783856527, + "grad_norm": 0.7256969809532166, + "learning_rate": 1.7977270093471254e-05, + "loss": 1.4186, + "mean_token_accuracy": 0.6552423536777496, + "num_tokens": 737593766.0, + "step": 4391 + }, + { + "entropy": 1.7008427878220875, + "epoch": 0.4824915547499382, + "grad_norm": 0.9718957543373108, + "learning_rate": 1.7976258627186008e-05, + "loss": 1.2792, + "mean_token_accuracy": 0.6838826090097427, + "num_tokens": 737759579.0, + "step": 4392 + }, + { + "entropy": 1.7387726704279582, + "epoch": 0.48260141166131115, + "grad_norm": 0.8315878510475159, + "learning_rate": 1.797524694010532e-05, + "loss": 1.4845, + "mean_token_accuracy": 0.658362532655398, + "num_tokens": 737911150.0, + "step": 4393 + }, + { + "entropy": 1.7291166086991627, + "epoch": 0.4827112685726841, + "grad_norm": 0.6660434603691101, + "learning_rate": 1.797423503226121e-05, + "loss": 1.3869, + "mean_token_accuracy": 0.642819325129191, + "num_tokens": 738088880.0, + "step": 4394 + }, + { + "entropy": 1.772065391143163, + "epoch": 0.48282112548405703, + "grad_norm": 0.6660803556442261, + "learning_rate": 1.7973222903685702e-05, + "loss": 1.359, + "mean_token_accuracy": 0.6438145389159521, + "num_tokens": 738234766.0, + "step": 4395 + }, + { + "entropy": 1.731687754392624, + "epoch": 0.48293098239543, + "grad_norm": 0.6683153510093689, + "learning_rate": 1.7972210554410834e-05, + "loss": 1.2882, + "mean_token_accuracy": 0.6693693796793619, + "num_tokens": 738386427.0, + "step": 4396 + }, + { + "entropy": 1.710336794455846, + "epoch": 0.4830408393068029, + "grad_norm": 0.6479591727256775, + "learning_rate": 1.797119798446864e-05, + "loss": 1.4377, + "mean_token_accuracy": 0.6659507850805918, + "num_tokens": 738579020.0, + "step": 4397 + }, + { + "entropy": 1.7027187943458557, + "epoch": 0.4831506962181758, + "grad_norm": 0.6037660837173462, + "learning_rate": 1.7970185193891176e-05, + "loss": 1.439, + "mean_token_accuracy": 0.6502730449040731, + "num_tokens": 738777017.0, + "step": 4398 + }, + { + "entropy": 1.7039030492305756, + "epoch": 0.48326055312954874, + "grad_norm": 0.673244059085846, + "learning_rate": 1.796917218271049e-05, + "loss": 1.3637, + "mean_token_accuracy": 0.6623422205448151, + "num_tokens": 738963981.0, + "step": 4399 + }, + { + "entropy": 1.753426233927409, + "epoch": 0.4833704100409217, + "grad_norm": 0.706851065158844, + "learning_rate": 1.7968158950958642e-05, + "loss": 1.4578, + "mean_token_accuracy": 0.6659545401732127, + "num_tokens": 739139126.0, + "step": 4400 + }, + { + "entropy": 1.6917196214199066, + "epoch": 0.4834802669522946, + "grad_norm": 0.7611788511276245, + "learning_rate": 1.7967145498667706e-05, + "loss": 1.3534, + "mean_token_accuracy": 0.6647525678078333, + "num_tokens": 739309024.0, + "step": 4401 + }, + { + "entropy": 1.7070514857769012, + "epoch": 0.48359012386366756, + "grad_norm": 1.02863347530365, + "learning_rate": 1.7966131825869753e-05, + "loss": 1.5179, + "mean_token_accuracy": 0.6395848045746485, + "num_tokens": 739513763.0, + "step": 4402 + }, + { + "entropy": 1.6623660226662953, + "epoch": 0.4836999807750405, + "grad_norm": 0.628810703754425, + "learning_rate": 1.7965117932596866e-05, + "loss": 1.2627, + "mean_token_accuracy": 0.6694687008857727, + "num_tokens": 739651066.0, + "step": 4403 + }, + { + "entropy": 1.68434273203214, + "epoch": 0.48380983768641345, + "grad_norm": 0.7587992548942566, + "learning_rate": 1.7964103818881138e-05, + "loss": 1.3369, + "mean_token_accuracy": 0.6658577223618826, + "num_tokens": 739786817.0, + "step": 4404 + }, + { + "entropy": 1.664050579071045, + "epoch": 0.4839196945977864, + "grad_norm": 0.6519520878791809, + "learning_rate": 1.7963089484754663e-05, + "loss": 1.3324, + "mean_token_accuracy": 0.6710617194573084, + "num_tokens": 739941289.0, + "step": 4405 + }, + { + "entropy": 1.716511865456899, + "epoch": 0.48402955150915933, + "grad_norm": 0.659646213054657, + "learning_rate": 1.7962074930249537e-05, + "loss": 1.5393, + "mean_token_accuracy": 0.6423588742812475, + "num_tokens": 740142557.0, + "step": 4406 + }, + { + "entropy": 1.682478795448939, + "epoch": 0.48413940842053227, + "grad_norm": 0.5458212494850159, + "learning_rate": 1.796106015539788e-05, + "loss": 1.4259, + "mean_token_accuracy": 0.637840062379837, + "num_tokens": 740392429.0, + "step": 4407 + }, + { + "entropy": 1.7216579516728718, + "epoch": 0.4842492653319052, + "grad_norm": 0.66231369972229, + "learning_rate": 1.796004516023181e-05, + "loss": 1.4048, + "mean_token_accuracy": 0.6517205735047659, + "num_tokens": 740528700.0, + "step": 4408 + }, + { + "entropy": 1.7604460815588634, + "epoch": 0.48435912224327815, + "grad_norm": 0.6421491503715515, + "learning_rate": 1.795902994478344e-05, + "loss": 1.3674, + "mean_token_accuracy": 0.6596113989750544, + "num_tokens": 740723647.0, + "step": 4409 + }, + { + "entropy": 1.6752793689568837, + "epoch": 0.4844689791546511, + "grad_norm": 0.6714054346084595, + "learning_rate": 1.7958014509084912e-05, + "loss": 1.4342, + "mean_token_accuracy": 0.6613290458917618, + "num_tokens": 740917843.0, + "step": 4410 + }, + { + "entropy": 1.6832947333653767, + "epoch": 0.484578836066024, + "grad_norm": 0.7269577980041504, + "learning_rate": 1.795699885316836e-05, + "loss": 1.3856, + "mean_token_accuracy": 0.6611769000689188, + "num_tokens": 741057925.0, + "step": 4411 + }, + { + "entropy": 1.7017952899138133, + "epoch": 0.4846886929773969, + "grad_norm": 0.6696016788482666, + "learning_rate": 1.7955982977065928e-05, + "loss": 1.3691, + "mean_token_accuracy": 0.6629203210274378, + "num_tokens": 741243418.0, + "step": 4412 + }, + { + "entropy": 1.745782047510147, + "epoch": 0.48479854988876986, + "grad_norm": 1.2233498096466064, + "learning_rate": 1.7954966880809772e-05, + "loss": 1.5333, + "mean_token_accuracy": 0.6656019041935602, + "num_tokens": 741355751.0, + "step": 4413 + }, + { + "entropy": 1.6869251827398937, + "epoch": 0.4849084068001428, + "grad_norm": 0.7494601607322693, + "learning_rate": 1.7953950564432044e-05, + "loss": 1.4045, + "mean_token_accuracy": 0.6549041916926702, + "num_tokens": 741529089.0, + "step": 4414 + }, + { + "entropy": 1.6820165514945984, + "epoch": 0.48501826371151574, + "grad_norm": 0.7255376577377319, + "learning_rate": 1.7952934027964917e-05, + "loss": 1.2338, + "mean_token_accuracy": 0.675830195347468, + "num_tokens": 741679225.0, + "step": 4415 + }, + { + "entropy": 1.7140280703703563, + "epoch": 0.4851281206228887, + "grad_norm": 0.6368587613105774, + "learning_rate": 1.795191727144056e-05, + "loss": 1.3813, + "mean_token_accuracy": 0.6670277963081995, + "num_tokens": 741825088.0, + "step": 4416 + }, + { + "entropy": 1.6830491324265797, + "epoch": 0.4852379775342616, + "grad_norm": 0.6319524645805359, + "learning_rate": 1.7950900294891154e-05, + "loss": 1.3061, + "mean_token_accuracy": 0.6556812673807144, + "num_tokens": 742051866.0, + "step": 4417 + }, + { + "entropy": 1.7843901813030243, + "epoch": 0.48534783444563456, + "grad_norm": 0.6758350729942322, + "learning_rate": 1.794988309834889e-05, + "loss": 1.4969, + "mean_token_accuracy": 0.637373631199201, + "num_tokens": 742230190.0, + "step": 4418 + }, + { + "entropy": 1.6947405536969502, + "epoch": 0.4854576913570075, + "grad_norm": 0.5639503598213196, + "learning_rate": 1.7948865681845952e-05, + "loss": 1.3941, + "mean_token_accuracy": 0.647869884967804, + "num_tokens": 742475673.0, + "step": 4419 + }, + { + "entropy": 1.7732278009255726, + "epoch": 0.48556754826838044, + "grad_norm": 0.7282741665840149, + "learning_rate": 1.7947848045414548e-05, + "loss": 1.3657, + "mean_token_accuracy": 0.6630784372488657, + "num_tokens": 742612272.0, + "step": 4420 + }, + { + "entropy": 1.6553172965844472, + "epoch": 0.4856774051797534, + "grad_norm": 0.629048228263855, + "learning_rate": 1.7946830189086883e-05, + "loss": 1.5002, + "mean_token_accuracy": 0.6393060237169266, + "num_tokens": 742798477.0, + "step": 4421 + }, + { + "entropy": 1.7043809791405995, + "epoch": 0.4857872620911263, + "grad_norm": 0.7646244168281555, + "learning_rate": 1.7945812112895177e-05, + "loss": 1.2612, + "mean_token_accuracy": 0.6775466799736023, + "num_tokens": 742897713.0, + "step": 4422 + }, + { + "entropy": 1.7202276587486267, + "epoch": 0.48589711900249927, + "grad_norm": 0.7730578780174255, + "learning_rate": 1.794479381687164e-05, + "loss": 1.4446, + "mean_token_accuracy": 0.6524418741464615, + "num_tokens": 743076435.0, + "step": 4423 + }, + { + "entropy": 1.7322371204694111, + "epoch": 0.4860069759138722, + "grad_norm": 0.781139612197876, + "learning_rate": 1.7943775301048517e-05, + "loss": 1.4211, + "mean_token_accuracy": 0.6420817424853643, + "num_tokens": 743201684.0, + "step": 4424 + }, + { + "entropy": 1.6999266147613525, + "epoch": 0.4861168328252451, + "grad_norm": 0.7373039126396179, + "learning_rate": 1.7942756565458027e-05, + "loss": 1.4017, + "mean_token_accuracy": 0.6548285136620203, + "num_tokens": 743370658.0, + "step": 4425 + }, + { + "entropy": 1.6692801713943481, + "epoch": 0.48622668973661803, + "grad_norm": 0.7501516342163086, + "learning_rate": 1.7941737610132424e-05, + "loss": 1.1894, + "mean_token_accuracy": 0.6914908140897751, + "num_tokens": 743481925.0, + "step": 4426 + }, + { + "entropy": 1.6509476502736409, + "epoch": 0.486336546647991, + "grad_norm": 0.6218562126159668, + "learning_rate": 1.7940718435103954e-05, + "loss": 1.3426, + "mean_token_accuracy": 0.6639609535535177, + "num_tokens": 743628666.0, + "step": 4427 + }, + { + "entropy": 1.747880756855011, + "epoch": 0.4864464035593639, + "grad_norm": 0.6217747926712036, + "learning_rate": 1.7939699040404875e-05, + "loss": 1.4158, + "mean_token_accuracy": 0.6518111626307169, + "num_tokens": 743787470.0, + "step": 4428 + }, + { + "entropy": 1.756358911593755, + "epoch": 0.48655626047073686, + "grad_norm": 0.7568204402923584, + "learning_rate": 1.7938679426067446e-05, + "loss": 1.5833, + "mean_token_accuracy": 0.6492966512838999, + "num_tokens": 743928141.0, + "step": 4429 + }, + { + "entropy": 1.673755685488383, + "epoch": 0.4866661173821098, + "grad_norm": 0.7150989174842834, + "learning_rate": 1.7937659592123935e-05, + "loss": 1.3197, + "mean_token_accuracy": 0.6532700707515081, + "num_tokens": 744042694.0, + "step": 4430 + }, + { + "entropy": 1.7202748954296112, + "epoch": 0.48677597429348274, + "grad_norm": 0.636058509349823, + "learning_rate": 1.7936639538606632e-05, + "loss": 1.4411, + "mean_token_accuracy": 0.6541072924931844, + "num_tokens": 744207583.0, + "step": 4431 + }, + { + "entropy": 1.6693733135859172, + "epoch": 0.4868858312048557, + "grad_norm": 0.7030401229858398, + "learning_rate": 1.793561926554781e-05, + "loss": 1.2652, + "mean_token_accuracy": 0.673239087065061, + "num_tokens": 744323763.0, + "step": 4432 + }, + { + "entropy": 1.720289280017217, + "epoch": 0.4869956881162286, + "grad_norm": 0.7169565558433533, + "learning_rate": 1.7934598772979764e-05, + "loss": 1.375, + "mean_token_accuracy": 0.6588334242502848, + "num_tokens": 744476040.0, + "step": 4433 + }, + { + "entropy": 1.723343511422475, + "epoch": 0.48710554502760156, + "grad_norm": 0.6646847724914551, + "learning_rate": 1.7933578060934788e-05, + "loss": 1.405, + "mean_token_accuracy": 0.645707756280899, + "num_tokens": 744626251.0, + "step": 4434 + }, + { + "entropy": 1.7332356572151184, + "epoch": 0.4872154019389745, + "grad_norm": 0.7763069272041321, + "learning_rate": 1.7932557129445195e-05, + "loss": 1.3344, + "mean_token_accuracy": 0.6678619384765625, + "num_tokens": 744754753.0, + "step": 4435 + }, + { + "entropy": 1.6924299697081249, + "epoch": 0.48732525885034744, + "grad_norm": 0.7134109735488892, + "learning_rate": 1.7931535978543295e-05, + "loss": 1.3427, + "mean_token_accuracy": 0.6666603734095892, + "num_tokens": 744889940.0, + "step": 4436 + }, + { + "entropy": 1.6620939671993256, + "epoch": 0.4874351157617204, + "grad_norm": 0.5839754939079285, + "learning_rate": 1.79305146082614e-05, + "loss": 1.3568, + "mean_token_accuracy": 0.6570223172505697, + "num_tokens": 745065068.0, + "step": 4437 + }, + { + "entropy": 1.693219780921936, + "epoch": 0.48754497267309327, + "grad_norm": 0.7458364963531494, + "learning_rate": 1.792949301863184e-05, + "loss": 1.3768, + "mean_token_accuracy": 0.6642978092034658, + "num_tokens": 745247547.0, + "step": 4438 + }, + { + "entropy": 1.724270612001419, + "epoch": 0.4876548295844662, + "grad_norm": 0.7193183898925781, + "learning_rate": 1.792847120968695e-05, + "loss": 1.3431, + "mean_token_accuracy": 0.6732023855050405, + "num_tokens": 745370035.0, + "step": 4439 + }, + { + "entropy": 1.695033311843872, + "epoch": 0.48776468649583915, + "grad_norm": 0.6847495436668396, + "learning_rate": 1.792744918145907e-05, + "loss": 1.4098, + "mean_token_accuracy": 0.6749068647623062, + "num_tokens": 745555069.0, + "step": 4440 + }, + { + "entropy": 1.6993265946706135, + "epoch": 0.4878745434072121, + "grad_norm": 0.7159141302108765, + "learning_rate": 1.7926426933980543e-05, + "loss": 1.5518, + "mean_token_accuracy": 0.6463808168967565, + "num_tokens": 745735255.0, + "step": 4441 + }, + { + "entropy": 1.7386601070563, + "epoch": 0.48798440031858503, + "grad_norm": 0.7798255085945129, + "learning_rate": 1.7925404467283727e-05, + "loss": 1.5025, + "mean_token_accuracy": 0.655009463429451, + "num_tokens": 745875552.0, + "step": 4442 + }, + { + "entropy": 1.7042417724927266, + "epoch": 0.488094257229958, + "grad_norm": 0.6437844634056091, + "learning_rate": 1.7924381781400978e-05, + "loss": 1.4309, + "mean_token_accuracy": 0.6641700814167658, + "num_tokens": 746048420.0, + "step": 4443 + }, + { + "entropy": 1.7275199890136719, + "epoch": 0.4882041141413309, + "grad_norm": 0.7542555928230286, + "learning_rate": 1.7923358876364665e-05, + "loss": 1.5449, + "mean_token_accuracy": 0.6298695877194405, + "num_tokens": 746276936.0, + "step": 4444 + }, + { + "entropy": 1.7163825233777363, + "epoch": 0.48831397105270385, + "grad_norm": 0.6992437243461609, + "learning_rate": 1.7922335752207164e-05, + "loss": 1.3827, + "mean_token_accuracy": 0.6514418671528498, + "num_tokens": 746413460.0, + "step": 4445 + }, + { + "entropy": 1.75390621026357, + "epoch": 0.4884238279640768, + "grad_norm": 0.7331777811050415, + "learning_rate": 1.792131240896086e-05, + "loss": 1.3556, + "mean_token_accuracy": 0.6733482579390208, + "num_tokens": 746529153.0, + "step": 4446 + }, + { + "entropy": 1.6997243563334148, + "epoch": 0.48853368487544974, + "grad_norm": 0.763002336025238, + "learning_rate": 1.792028884665813e-05, + "loss": 1.4874, + "mean_token_accuracy": 0.6523915976285934, + "num_tokens": 746715032.0, + "step": 4447 + }, + { + "entropy": 1.6606244643529255, + "epoch": 0.4886435417868227, + "grad_norm": 0.5833660364151001, + "learning_rate": 1.791926506533138e-05, + "loss": 1.3891, + "mean_token_accuracy": 0.6538758873939514, + "num_tokens": 746927795.0, + "step": 4448 + }, + { + "entropy": 1.7194021840890248, + "epoch": 0.4887533986981956, + "grad_norm": 0.7860300540924072, + "learning_rate": 1.791824106501301e-05, + "loss": 1.5285, + "mean_token_accuracy": 0.6494088371594747, + "num_tokens": 747073603.0, + "step": 4449 + }, + { + "entropy": 1.7067668239275615, + "epoch": 0.48886325560956856, + "grad_norm": 0.6611225605010986, + "learning_rate": 1.7917216845735427e-05, + "loss": 1.4851, + "mean_token_accuracy": 0.6474853207667669, + "num_tokens": 747276179.0, + "step": 4450 + }, + { + "entropy": 1.6850255330403645, + "epoch": 0.4889731125209415, + "grad_norm": 0.8468097448348999, + "learning_rate": 1.7916192407531045e-05, + "loss": 1.3876, + "mean_token_accuracy": 0.6508926798899969, + "num_tokens": 747433017.0, + "step": 4451 + }, + { + "entropy": 1.7228013674418132, + "epoch": 0.4890829694323144, + "grad_norm": 0.6649202704429626, + "learning_rate": 1.7915167750432293e-05, + "loss": 1.4806, + "mean_token_accuracy": 0.641407698392868, + "num_tokens": 747595215.0, + "step": 4452 + }, + { + "entropy": 1.7136965990066528, + "epoch": 0.4891928263436873, + "grad_norm": 0.6682122945785522, + "learning_rate": 1.7914142874471597e-05, + "loss": 1.5594, + "mean_token_accuracy": 0.642529770731926, + "num_tokens": 747794900.0, + "step": 4453 + }, + { + "entropy": 1.7261795302232106, + "epoch": 0.48930268325506027, + "grad_norm": 0.634833037853241, + "learning_rate": 1.7913117779681396e-05, + "loss": 1.4382, + "mean_token_accuracy": 0.6446546812852224, + "num_tokens": 748022714.0, + "step": 4454 + }, + { + "entropy": 1.6885162591934204, + "epoch": 0.4894125401664332, + "grad_norm": 0.7759763598442078, + "learning_rate": 1.791209246609413e-05, + "loss": 1.2402, + "mean_token_accuracy": 0.6778834859530131, + "num_tokens": 748177827.0, + "step": 4455 + }, + { + "entropy": 1.6844123204549153, + "epoch": 0.48952239707780615, + "grad_norm": 0.7225537896156311, + "learning_rate": 1.791106693374225e-05, + "loss": 1.485, + "mean_token_accuracy": 0.6536487142244974, + "num_tokens": 748361870.0, + "step": 4456 + }, + { + "entropy": 1.6790739993254344, + "epoch": 0.4896322539891791, + "grad_norm": 0.728463351726532, + "learning_rate": 1.791004118265822e-05, + "loss": 1.5057, + "mean_token_accuracy": 0.6347111016511917, + "num_tokens": 748650354.0, + "step": 4457 + }, + { + "entropy": 1.695796012878418, + "epoch": 0.48974211090055203, + "grad_norm": 0.6192063689231873, + "learning_rate": 1.79090152128745e-05, + "loss": 1.4496, + "mean_token_accuracy": 0.6385799000660578, + "num_tokens": 748830552.0, + "step": 4458 + }, + { + "entropy": 1.7009925544261932, + "epoch": 0.48985196781192497, + "grad_norm": 0.6495517492294312, + "learning_rate": 1.7907989024423557e-05, + "loss": 1.2719, + "mean_token_accuracy": 0.6756831457217535, + "num_tokens": 748964018.0, + "step": 4459 + }, + { + "entropy": 1.7054378390312195, + "epoch": 0.4899618247232979, + "grad_norm": 0.6384367346763611, + "learning_rate": 1.790696261733788e-05, + "loss": 1.4366, + "mean_token_accuracy": 0.6499034762382507, + "num_tokens": 749180374.0, + "step": 4460 + }, + { + "entropy": 1.728455811738968, + "epoch": 0.49007168163467085, + "grad_norm": 0.6712200045585632, + "learning_rate": 1.790593599164994e-05, + "loss": 1.3554, + "mean_token_accuracy": 0.658992608388265, + "num_tokens": 749318103.0, + "step": 4461 + }, + { + "entropy": 1.7136681576569874, + "epoch": 0.4901815385460438, + "grad_norm": 0.6368706226348877, + "learning_rate": 1.7904909147392247e-05, + "loss": 1.3982, + "mean_token_accuracy": 0.6673699120680491, + "num_tokens": 749529503.0, + "step": 4462 + }, + { + "entropy": 1.6937820812066395, + "epoch": 0.49029139545741673, + "grad_norm": 0.6878102421760559, + "learning_rate": 1.7903882084597287e-05, + "loss": 1.4045, + "mean_token_accuracy": 0.6586268643538157, + "num_tokens": 749703151.0, + "step": 4463 + }, + { + "entropy": 1.7233433425426483, + "epoch": 0.4904012523687897, + "grad_norm": 0.7706781625747681, + "learning_rate": 1.7902854803297575e-05, + "loss": 1.45, + "mean_token_accuracy": 0.6549730747938156, + "num_tokens": 749851059.0, + "step": 4464 + }, + { + "entropy": 1.701189527908961, + "epoch": 0.49051110928016256, + "grad_norm": 0.7401324510574341, + "learning_rate": 1.7901827303525613e-05, + "loss": 1.3076, + "mean_token_accuracy": 0.6692301680644354, + "num_tokens": 749986623.0, + "step": 4465 + }, + { + "entropy": 1.7433607876300812, + "epoch": 0.4906209661915355, + "grad_norm": 0.8140077590942383, + "learning_rate": 1.790079958531393e-05, + "loss": 1.4647, + "mean_token_accuracy": 0.6411833713452021, + "num_tokens": 750134250.0, + "step": 4466 + }, + { + "entropy": 1.778177946805954, + "epoch": 0.49073082310290844, + "grad_norm": 0.6582893133163452, + "learning_rate": 1.7899771648695048e-05, + "loss": 1.3474, + "mean_token_accuracy": 0.656017060081164, + "num_tokens": 750276063.0, + "step": 4467 + }, + { + "entropy": 1.713607092698415, + "epoch": 0.4908406800142814, + "grad_norm": 0.7441067695617676, + "learning_rate": 1.78987434937015e-05, + "loss": 1.4452, + "mean_token_accuracy": 0.6515307128429413, + "num_tokens": 750403743.0, + "step": 4468 + }, + { + "entropy": 1.602983335653941, + "epoch": 0.4909505369256543, + "grad_norm": 0.59381502866745, + "learning_rate": 1.7897715120365836e-05, + "loss": 1.2882, + "mean_token_accuracy": 0.6720236440499624, + "num_tokens": 750542516.0, + "step": 4469 + }, + { + "entropy": 1.6924793124198914, + "epoch": 0.49106039383702726, + "grad_norm": 0.977676272392273, + "learning_rate": 1.789668652872059e-05, + "loss": 1.3781, + "mean_token_accuracy": 0.6760386377573013, + "num_tokens": 750699705.0, + "step": 4470 + }, + { + "entropy": 1.7474435865879059, + "epoch": 0.4911702507484002, + "grad_norm": 0.6933272480964661, + "learning_rate": 1.7895657718798327e-05, + "loss": 1.2821, + "mean_token_accuracy": 0.6641086836655935, + "num_tokens": 750805798.0, + "step": 4471 + }, + { + "entropy": 1.7326118846734364, + "epoch": 0.49128010765977315, + "grad_norm": 0.6269978880882263, + "learning_rate": 1.7894628690631603e-05, + "loss": 1.3857, + "mean_token_accuracy": 0.6626192231973013, + "num_tokens": 750959285.0, + "step": 4472 + }, + { + "entropy": 1.6883742014567058, + "epoch": 0.4913899645711461, + "grad_norm": 0.7878180146217346, + "learning_rate": 1.7893599444252987e-05, + "loss": 1.4636, + "mean_token_accuracy": 0.6607634474833807, + "num_tokens": 751144530.0, + "step": 4473 + }, + { + "entropy": 1.6989558438460033, + "epoch": 0.49149982148251903, + "grad_norm": 0.6729726195335388, + "learning_rate": 1.789256997969506e-05, + "loss": 1.4828, + "mean_token_accuracy": 0.6353020220994949, + "num_tokens": 751357135.0, + "step": 4474 + }, + { + "entropy": 1.7190554738044739, + "epoch": 0.49160967839389197, + "grad_norm": 0.6713739037513733, + "learning_rate": 1.789154029699039e-05, + "loss": 1.419, + "mean_token_accuracy": 0.654554526011149, + "num_tokens": 751494654.0, + "step": 4475 + }, + { + "entropy": 1.7132868468761444, + "epoch": 0.4917195353052649, + "grad_norm": 0.6637577414512634, + "learning_rate": 1.7890510396171582e-05, + "loss": 1.2547, + "mean_token_accuracy": 0.6833054423332214, + "num_tokens": 751625039.0, + "step": 4476 + }, + { + "entropy": 1.6934907635052998, + "epoch": 0.49182939221663785, + "grad_norm": 0.7464581727981567, + "learning_rate": 1.788948027727122e-05, + "loss": 1.3886, + "mean_token_accuracy": 0.6680949032306671, + "num_tokens": 751844906.0, + "step": 4477 + }, + { + "entropy": 1.7225241959095001, + "epoch": 0.4919392491280108, + "grad_norm": 0.6509267091751099, + "learning_rate": 1.7888449940321917e-05, + "loss": 1.3403, + "mean_token_accuracy": 0.6601742456356684, + "num_tokens": 751975242.0, + "step": 4478 + }, + { + "entropy": 1.6977383097012837, + "epoch": 0.4920491060393837, + "grad_norm": 0.6957575678825378, + "learning_rate": 1.7887419385356273e-05, + "loss": 1.3555, + "mean_token_accuracy": 0.6612063000599543, + "num_tokens": 752113545.0, + "step": 4479 + }, + { + "entropy": 1.7447110712528229, + "epoch": 0.4921589629507566, + "grad_norm": 0.7940369248390198, + "learning_rate": 1.788638861240691e-05, + "loss": 1.5439, + "mean_token_accuracy": 0.640469511349996, + "num_tokens": 752291398.0, + "step": 4480 + }, + { + "entropy": 1.679489274819692, + "epoch": 0.49226881986212956, + "grad_norm": 0.7236880660057068, + "learning_rate": 1.7885357621506453e-05, + "loss": 1.2744, + "mean_token_accuracy": 0.6717104216416677, + "num_tokens": 752428595.0, + "step": 4481 + }, + { + "entropy": 1.721798598766327, + "epoch": 0.4923786767735025, + "grad_norm": 0.7670096755027771, + "learning_rate": 1.788432641268753e-05, + "loss": 1.293, + "mean_token_accuracy": 0.6676533122857412, + "num_tokens": 752569748.0, + "step": 4482 + }, + { + "entropy": 1.6857047577699025, + "epoch": 0.49248853368487544, + "grad_norm": 0.6117812395095825, + "learning_rate": 1.7883294985982772e-05, + "loss": 1.3615, + "mean_token_accuracy": 0.6617532074451447, + "num_tokens": 752779194.0, + "step": 4483 + }, + { + "entropy": 1.7368919948736827, + "epoch": 0.4925983905962484, + "grad_norm": 0.6931723356246948, + "learning_rate": 1.7882263341424838e-05, + "loss": 1.3838, + "mean_token_accuracy": 0.6611840128898621, + "num_tokens": 752931824.0, + "step": 4484 + }, + { + "entropy": 1.7122445404529572, + "epoch": 0.4927082475076213, + "grad_norm": 0.5953887104988098, + "learning_rate": 1.7881231479046364e-05, + "loss": 1.3601, + "mean_token_accuracy": 0.6550240367650986, + "num_tokens": 753122621.0, + "step": 4485 + }, + { + "entropy": 1.6985189219315846, + "epoch": 0.49281810441899426, + "grad_norm": 0.6507508158683777, + "learning_rate": 1.7880199398880018e-05, + "loss": 1.4812, + "mean_token_accuracy": 0.6505904843409857, + "num_tokens": 753270777.0, + "step": 4486 + }, + { + "entropy": 1.7590989172458649, + "epoch": 0.4929279613303672, + "grad_norm": 0.6733105182647705, + "learning_rate": 1.787916710095846e-05, + "loss": 1.49, + "mean_token_accuracy": 0.6299261252085367, + "num_tokens": 753456920.0, + "step": 4487 + }, + { + "entropy": 1.6458527743816376, + "epoch": 0.49303781824174014, + "grad_norm": 0.7507854700088501, + "learning_rate": 1.7878134585314363e-05, + "loss": 1.4018, + "mean_token_accuracy": 0.6749738603830338, + "num_tokens": 753613788.0, + "step": 4488 + }, + { + "entropy": 1.6525565882523854, + "epoch": 0.4931476751531131, + "grad_norm": 0.5717189311981201, + "learning_rate": 1.7877101851980404e-05, + "loss": 1.4242, + "mean_token_accuracy": 0.6573587109645208, + "num_tokens": 753800915.0, + "step": 4489 + }, + { + "entropy": 1.692369411389033, + "epoch": 0.493257532064486, + "grad_norm": 0.7131385803222656, + "learning_rate": 1.7876068900989274e-05, + "loss": 1.2987, + "mean_token_accuracy": 0.6656559258699417, + "num_tokens": 753944837.0, + "step": 4490 + }, + { + "entropy": 1.6467917760213215, + "epoch": 0.49336738897585897, + "grad_norm": 0.6811399459838867, + "learning_rate": 1.7875035732373658e-05, + "loss": 1.301, + "mean_token_accuracy": 0.671561042467753, + "num_tokens": 754086477.0, + "step": 4491 + }, + { + "entropy": 1.781138926744461, + "epoch": 0.4934772458872319, + "grad_norm": 0.6936325430870056, + "learning_rate": 1.7874002346166263e-05, + "loss": 1.4253, + "mean_token_accuracy": 0.650108148654302, + "num_tokens": 754275449.0, + "step": 4492 + }, + { + "entropy": 1.7359409630298615, + "epoch": 0.4935871027986048, + "grad_norm": 0.7305500507354736, + "learning_rate": 1.7872968742399786e-05, + "loss": 1.4184, + "mean_token_accuracy": 0.6582159698009491, + "num_tokens": 754439599.0, + "step": 4493 + }, + { + "entropy": 1.6950959861278534, + "epoch": 0.49369695970997773, + "grad_norm": 0.5642681121826172, + "learning_rate": 1.787193492110695e-05, + "loss": 1.6434, + "mean_token_accuracy": 0.633182168006897, + "num_tokens": 754640289.0, + "step": 4494 + }, + { + "entropy": 1.706775536139806, + "epoch": 0.4938068166213507, + "grad_norm": 0.6591259241104126, + "learning_rate": 1.7870900882320467e-05, + "loss": 1.46, + "mean_token_accuracy": 0.6489850531021754, + "num_tokens": 754801543.0, + "step": 4495 + }, + { + "entropy": 1.724029650290807, + "epoch": 0.4939166735327236, + "grad_norm": 0.7175586819648743, + "learning_rate": 1.786986662607307e-05, + "loss": 1.3789, + "mean_token_accuracy": 0.6690233945846558, + "num_tokens": 754964581.0, + "step": 4496 + }, + { + "entropy": 1.6978013416131337, + "epoch": 0.49402653044409656, + "grad_norm": 0.5896649956703186, + "learning_rate": 1.786883215239749e-05, + "loss": 1.3869, + "mean_token_accuracy": 0.6515233715375265, + "num_tokens": 755162191.0, + "step": 4497 + }, + { + "entropy": 1.7095728317896526, + "epoch": 0.4941363873554695, + "grad_norm": 0.671416163444519, + "learning_rate": 1.7867797461326466e-05, + "loss": 1.4268, + "mean_token_accuracy": 0.6535071631272634, + "num_tokens": 755335730.0, + "step": 4498 + }, + { + "entropy": 1.708453506231308, + "epoch": 0.49424624426684244, + "grad_norm": 0.7201808094978333, + "learning_rate": 1.786676255289275e-05, + "loss": 1.4291, + "mean_token_accuracy": 0.662381132443746, + "num_tokens": 755479598.0, + "step": 4499 + }, + { + "entropy": 1.6969383358955383, + "epoch": 0.4943561011782154, + "grad_norm": 0.6979060769081116, + "learning_rate": 1.7865727427129087e-05, + "loss": 1.3646, + "mean_token_accuracy": 0.6659565716981888, + "num_tokens": 755646726.0, + "step": 4500 + }, + { + "entropy": 1.7146336535612743, + "epoch": 0.4944659580895883, + "grad_norm": 0.6201717853546143, + "learning_rate": 1.786469208406825e-05, + "loss": 1.2754, + "mean_token_accuracy": 0.6736994286378225, + "num_tokens": 755793537.0, + "step": 4501 + }, + { + "entropy": 1.6687723497549694, + "epoch": 0.49457581500096126, + "grad_norm": 0.6654130816459656, + "learning_rate": 1.7863656523743002e-05, + "loss": 1.3844, + "mean_token_accuracy": 0.6587973336378733, + "num_tokens": 755989928.0, + "step": 4502 + }, + { + "entropy": 1.6637324293454487, + "epoch": 0.4946856719123342, + "grad_norm": 0.8360695242881775, + "learning_rate": 1.7862620746186115e-05, + "loss": 1.3913, + "mean_token_accuracy": 0.6531608452399572, + "num_tokens": 756136563.0, + "step": 4503 + }, + { + "entropy": 1.7622207999229431, + "epoch": 0.49479552882370714, + "grad_norm": 0.6121542453765869, + "learning_rate": 1.7861584751430373e-05, + "loss": 1.5139, + "mean_token_accuracy": 0.6296129673719406, + "num_tokens": 756322688.0, + "step": 4504 + }, + { + "entropy": 1.6936173935731251, + "epoch": 0.4949053857350801, + "grad_norm": 0.7181201577186584, + "learning_rate": 1.786054853950857e-05, + "loss": 1.4316, + "mean_token_accuracy": 0.6570375859737396, + "num_tokens": 756472507.0, + "step": 4505 + }, + { + "entropy": 1.7213451365629833, + "epoch": 0.49501524264645297, + "grad_norm": 0.7521805763244629, + "learning_rate": 1.7859512110453493e-05, + "loss": 1.3723, + "mean_token_accuracy": 0.6491851558287939, + "num_tokens": 756630765.0, + "step": 4506 + }, + { + "entropy": 1.7246767083803813, + "epoch": 0.4951250995578259, + "grad_norm": 0.6681076288223267, + "learning_rate": 1.7858475464297952e-05, + "loss": 1.5371, + "mean_token_accuracy": 0.6519037286440531, + "num_tokens": 756811520.0, + "step": 4507 + }, + { + "entropy": 1.724827955166499, + "epoch": 0.49523495646919885, + "grad_norm": 0.7627193331718445, + "learning_rate": 1.785743860107475e-05, + "loss": 1.3533, + "mean_token_accuracy": 0.6570751518011093, + "num_tokens": 756947022.0, + "step": 4508 + }, + { + "entropy": 1.7522861162821453, + "epoch": 0.4953448133805718, + "grad_norm": 0.7629287242889404, + "learning_rate": 1.7856401520816707e-05, + "loss": 1.4731, + "mean_token_accuracy": 0.6540786474943161, + "num_tokens": 757085562.0, + "step": 4509 + }, + { + "entropy": 1.7017335096995037, + "epoch": 0.49545467029194473, + "grad_norm": 0.6842020153999329, + "learning_rate": 1.7855364223556647e-05, + "loss": 1.4837, + "mean_token_accuracy": 0.6524115850528082, + "num_tokens": 757267752.0, + "step": 4510 + }, + { + "entropy": 1.754370888074239, + "epoch": 0.4955645272033177, + "grad_norm": 0.7372490763664246, + "learning_rate": 1.78543267093274e-05, + "loss": 1.3641, + "mean_token_accuracy": 0.6588715563217798, + "num_tokens": 757385616.0, + "step": 4511 + }, + { + "entropy": 1.7240610619386036, + "epoch": 0.4956743841146906, + "grad_norm": 0.7373293042182922, + "learning_rate": 1.7853288978161797e-05, + "loss": 1.4162, + "mean_token_accuracy": 0.6595638593037924, + "num_tokens": 757553063.0, + "step": 4512 + }, + { + "entropy": 1.6941333214441936, + "epoch": 0.49578424102606355, + "grad_norm": 0.6690158843994141, + "learning_rate": 1.7852251030092686e-05, + "loss": 1.5101, + "mean_token_accuracy": 0.6566884964704514, + "num_tokens": 757694092.0, + "step": 4513 + }, + { + "entropy": 1.7551434238751729, + "epoch": 0.4958940979374365, + "grad_norm": 0.5994437336921692, + "learning_rate": 1.785121286515292e-05, + "loss": 1.438, + "mean_token_accuracy": 0.6497039496898651, + "num_tokens": 757875258.0, + "step": 4514 + }, + { + "entropy": 1.7067551612854004, + "epoch": 0.49600395484880944, + "grad_norm": 0.7220733761787415, + "learning_rate": 1.7850174483375353e-05, + "loss": 1.2979, + "mean_token_accuracy": 0.6739718317985535, + "num_tokens": 758004198.0, + "step": 4515 + }, + { + "entropy": 1.6703088978926341, + "epoch": 0.4961138117601824, + "grad_norm": 0.6553620100021362, + "learning_rate": 1.7849135884792853e-05, + "loss": 1.4163, + "mean_token_accuracy": 0.6570161531368891, + "num_tokens": 758196716.0, + "step": 4516 + }, + { + "entropy": 1.7132277488708496, + "epoch": 0.4962236686715553, + "grad_norm": 0.690665066242218, + "learning_rate": 1.784809706943829e-05, + "loss": 1.4286, + "mean_token_accuracy": 0.6602544039487839, + "num_tokens": 758382983.0, + "step": 4517 + }, + { + "entropy": 1.7164062758286793, + "epoch": 0.49633352558292826, + "grad_norm": 0.6790395975112915, + "learning_rate": 1.784705803734453e-05, + "loss": 1.4582, + "mean_token_accuracy": 0.6742985943953196, + "num_tokens": 758572656.0, + "step": 4518 + }, + { + "entropy": 1.67890664935112, + "epoch": 0.4964433824943012, + "grad_norm": 1.2717398405075073, + "learning_rate": 1.784601878854448e-05, + "loss": 1.4087, + "mean_token_accuracy": 0.660420835018158, + "num_tokens": 758793304.0, + "step": 4519 + }, + { + "entropy": 1.7838424642880757, + "epoch": 0.4965532394056741, + "grad_norm": 0.6524944305419922, + "learning_rate": 1.7844979323071016e-05, + "loss": 1.4004, + "mean_token_accuracy": 0.6474776168664297, + "num_tokens": 758916623.0, + "step": 4520 + }, + { + "entropy": 1.6456107993920643, + "epoch": 0.496663096317047, + "grad_norm": 0.7245497703552246, + "learning_rate": 1.7843939640957038e-05, + "loss": 1.4214, + "mean_token_accuracy": 0.6555730899175009, + "num_tokens": 759097278.0, + "step": 4521 + }, + { + "entropy": 1.7413156827290852, + "epoch": 0.49677295322841997, + "grad_norm": 0.7097477316856384, + "learning_rate": 1.7842899742235458e-05, + "loss": 1.3122, + "mean_token_accuracy": 0.664093608657519, + "num_tokens": 759245582.0, + "step": 4522 + }, + { + "entropy": 1.6793685257434845, + "epoch": 0.4968828101397929, + "grad_norm": 0.717451810836792, + "learning_rate": 1.7841859626939185e-05, + "loss": 1.4663, + "mean_token_accuracy": 0.6448543965816498, + "num_tokens": 759450714.0, + "step": 4523 + }, + { + "entropy": 1.7351475755373638, + "epoch": 0.49699266705116585, + "grad_norm": 0.7763004899024963, + "learning_rate": 1.784081929510113e-05, + "loss": 1.4956, + "mean_token_accuracy": 0.6306874205668768, + "num_tokens": 759665939.0, + "step": 4524 + }, + { + "entropy": 1.6486627856890361, + "epoch": 0.4971025239625388, + "grad_norm": 0.6308198571205139, + "learning_rate": 1.783977874675424e-05, + "loss": 1.3943, + "mean_token_accuracy": 0.6578763922055563, + "num_tokens": 759846721.0, + "step": 4525 + }, + { + "entropy": 1.702409029006958, + "epoch": 0.49721238087391173, + "grad_norm": 0.7921629548072815, + "learning_rate": 1.7838737981931425e-05, + "loss": 1.4551, + "mean_token_accuracy": 0.6458623309930166, + "num_tokens": 760104447.0, + "step": 4526 + }, + { + "entropy": 1.685009628534317, + "epoch": 0.49732223778528467, + "grad_norm": 0.6673221588134766, + "learning_rate": 1.7837697000665638e-05, + "loss": 1.3185, + "mean_token_accuracy": 0.6742521325747172, + "num_tokens": 760293720.0, + "step": 4527 + }, + { + "entropy": 1.7403136988480885, + "epoch": 0.4974320946966576, + "grad_norm": 0.6920185685157776, + "learning_rate": 1.783665580298982e-05, + "loss": 1.2648, + "mean_token_accuracy": 0.6715550472338995, + "num_tokens": 760387875.0, + "step": 4528 + }, + { + "entropy": 1.6765425205230713, + "epoch": 0.49754195160803055, + "grad_norm": 0.6350904107093811, + "learning_rate": 1.7835614388936927e-05, + "loss": 1.4115, + "mean_token_accuracy": 0.6627595176299413, + "num_tokens": 760546642.0, + "step": 4529 + }, + { + "entropy": 1.7340703904628754, + "epoch": 0.4976518085194035, + "grad_norm": 0.7262073755264282, + "learning_rate": 1.7834572758539922e-05, + "loss": 1.2949, + "mean_token_accuracy": 0.6761989891529083, + "num_tokens": 760670647.0, + "step": 4530 + }, + { + "entropy": 1.733684519926707, + "epoch": 0.49776166543077643, + "grad_norm": 0.6469461917877197, + "learning_rate": 1.7833530911831767e-05, + "loss": 1.4896, + "mean_token_accuracy": 0.6465209424495697, + "num_tokens": 760823361.0, + "step": 4531 + }, + { + "entropy": 1.7105149626731873, + "epoch": 0.4978715223421494, + "grad_norm": 0.7094139456748962, + "learning_rate": 1.7832488848845438e-05, + "loss": 1.3516, + "mean_token_accuracy": 0.6633161505063375, + "num_tokens": 760986995.0, + "step": 4532 + }, + { + "entropy": 1.7280435959498088, + "epoch": 0.49798137925352226, + "grad_norm": 0.5853270888328552, + "learning_rate": 1.7831446569613915e-05, + "loss": 1.4263, + "mean_token_accuracy": 0.6446743756532669, + "num_tokens": 761178147.0, + "step": 4533 + }, + { + "entropy": 1.6787182490030925, + "epoch": 0.4980912361648952, + "grad_norm": 0.6534310579299927, + "learning_rate": 1.7830404074170188e-05, + "loss": 1.4564, + "mean_token_accuracy": 0.6509255568186442, + "num_tokens": 761373044.0, + "step": 4534 + }, + { + "entropy": 1.7558597127596538, + "epoch": 0.49820109307626814, + "grad_norm": 0.6207196116447449, + "learning_rate": 1.7829361362547248e-05, + "loss": 1.5756, + "mean_token_accuracy": 0.628658264875412, + "num_tokens": 761576148.0, + "step": 4535 + }, + { + "entropy": 1.6844909886519115, + "epoch": 0.4983109499876411, + "grad_norm": 0.7019922733306885, + "learning_rate": 1.7828318434778098e-05, + "loss": 1.1964, + "mean_token_accuracy": 0.679168184598287, + "num_tokens": 761706398.0, + "step": 4536 + }, + { + "entropy": 1.6976350645224254, + "epoch": 0.498420806899014, + "grad_norm": 0.7925885915756226, + "learning_rate": 1.7827275290895745e-05, + "loss": 1.4483, + "mean_token_accuracy": 0.6481008778015772, + "num_tokens": 761941759.0, + "step": 4537 + }, + { + "entropy": 1.7484288016955059, + "epoch": 0.49853066381038696, + "grad_norm": 0.8223802447319031, + "learning_rate": 1.782623193093321e-05, + "loss": 1.4108, + "mean_token_accuracy": 0.6608029355605444, + "num_tokens": 762094461.0, + "step": 4538 + }, + { + "entropy": 1.6935599744319916, + "epoch": 0.4986405207217599, + "grad_norm": 0.6052656769752502, + "learning_rate": 1.782518835492351e-05, + "loss": 1.299, + "mean_token_accuracy": 0.6712607592344284, + "num_tokens": 762245616.0, + "step": 4539 + }, + { + "entropy": 1.768530507882436, + "epoch": 0.49875037763313285, + "grad_norm": 0.6017442941665649, + "learning_rate": 1.782414456289967e-05, + "loss": 1.5665, + "mean_token_accuracy": 0.6245706081390381, + "num_tokens": 762425328.0, + "step": 4540 + }, + { + "entropy": 1.743304302295049, + "epoch": 0.4988602345445058, + "grad_norm": 0.7569601535797119, + "learning_rate": 1.782310055489473e-05, + "loss": 1.6859, + "mean_token_accuracy": 0.6368949313958486, + "num_tokens": 762582323.0, + "step": 4541 + }, + { + "entropy": 1.7565892438093822, + "epoch": 0.49897009145587873, + "grad_norm": 0.7533148527145386, + "learning_rate": 1.782205633094174e-05, + "loss": 1.5002, + "mean_token_accuracy": 0.6462213893731436, + "num_tokens": 762732218.0, + "step": 4542 + }, + { + "entropy": 1.7235056459903717, + "epoch": 0.49907994836725167, + "grad_norm": 0.7903603315353394, + "learning_rate": 1.7821011891073732e-05, + "loss": 1.5051, + "mean_token_accuracy": 0.6527835627396902, + "num_tokens": 762921443.0, + "step": 4543 + }, + { + "entropy": 1.6993493934472401, + "epoch": 0.4991898052786246, + "grad_norm": 0.8247820138931274, + "learning_rate": 1.7819967235323773e-05, + "loss": 1.2088, + "mean_token_accuracy": 0.6772788912057877, + "num_tokens": 763015220.0, + "step": 4544 + }, + { + "entropy": 1.7162687877813976, + "epoch": 0.49929966218999755, + "grad_norm": 0.8071532249450684, + "learning_rate": 1.7818922363724926e-05, + "loss": 1.3451, + "mean_token_accuracy": 0.6634253213802973, + "num_tokens": 763167117.0, + "step": 4545 + }, + { + "entropy": 1.6914705137411754, + "epoch": 0.4994095191013705, + "grad_norm": 0.787642240524292, + "learning_rate": 1.7817877276310257e-05, + "loss": 1.4834, + "mean_token_accuracy": 0.6583209584156672, + "num_tokens": 763346825.0, + "step": 4546 + }, + { + "entropy": 1.6890590290228527, + "epoch": 0.4995193760127434, + "grad_norm": 0.6730331778526306, + "learning_rate": 1.781683197311285e-05, + "loss": 1.488, + "mean_token_accuracy": 0.6388949304819107, + "num_tokens": 763552338.0, + "step": 4547 + }, + { + "entropy": 1.752114752928416, + "epoch": 0.4996292329241163, + "grad_norm": 0.7115664482116699, + "learning_rate": 1.7815786454165776e-05, + "loss": 1.4952, + "mean_token_accuracy": 0.6519523759682974, + "num_tokens": 763708637.0, + "step": 4548 + }, + { + "entropy": 1.7708389262358348, + "epoch": 0.49973908983548926, + "grad_norm": 0.8620888590812683, + "learning_rate": 1.7814740719502135e-05, + "loss": 1.434, + "mean_token_accuracy": 0.6429966588815054, + "num_tokens": 763848626.0, + "step": 4549 + }, + { + "entropy": 1.7015598714351654, + "epoch": 0.4998489467468622, + "grad_norm": 0.6294053196907043, + "learning_rate": 1.7813694769155022e-05, + "loss": 1.4717, + "mean_token_accuracy": 0.6426868637402853, + "num_tokens": 764021913.0, + "step": 4550 + }, + { + "entropy": 1.6892776091893513, + "epoch": 0.49995880365823514, + "grad_norm": 0.6385858058929443, + "learning_rate": 1.781264860315754e-05, + "loss": 1.2928, + "mean_token_accuracy": 0.6771899660428365, + "num_tokens": 764190314.0, + "step": 4551 + }, + { + "entropy": 1.7139320472876232, + "epoch": 0.5000686605696081, + "grad_norm": 0.6113952994346619, + "learning_rate": 1.7811602221542795e-05, + "loss": 1.3597, + "mean_token_accuracy": 0.658218597372373, + "num_tokens": 764343722.0, + "step": 4552 + }, + { + "entropy": 1.698420782883962, + "epoch": 0.5001785174809811, + "grad_norm": 0.7712813019752502, + "learning_rate": 1.781055562434391e-05, + "loss": 1.5078, + "mean_token_accuracy": 0.6449939161539078, + "num_tokens": 764543119.0, + "step": 4553 + }, + { + "entropy": 1.6849484543005626, + "epoch": 0.500288374392354, + "grad_norm": 0.8930643200874329, + "learning_rate": 1.7809508811594015e-05, + "loss": 1.2912, + "mean_token_accuracy": 0.6756371855735779, + "num_tokens": 764718997.0, + "step": 4554 + }, + { + "entropy": 1.669805000225703, + "epoch": 0.5003982313037268, + "grad_norm": 0.6647879481315613, + "learning_rate": 1.7808461783326228e-05, + "loss": 1.3159, + "mean_token_accuracy": 0.6644556125005087, + "num_tokens": 764865095.0, + "step": 4555 + }, + { + "entropy": 1.6393884023030598, + "epoch": 0.5005080882150998, + "grad_norm": 0.6310734152793884, + "learning_rate": 1.7807414539573696e-05, + "loss": 1.3232, + "mean_token_accuracy": 0.6677381098270416, + "num_tokens": 765049052.0, + "step": 4556 + }, + { + "entropy": 1.6223057607809703, + "epoch": 0.5006179451264727, + "grad_norm": 0.9187425971031189, + "learning_rate": 1.780636708036956e-05, + "loss": 1.4747, + "mean_token_accuracy": 0.6508963604768118, + "num_tokens": 765223488.0, + "step": 4557 + }, + { + "entropy": 1.6926488975683849, + "epoch": 0.5007278020378457, + "grad_norm": 0.7379801869392395, + "learning_rate": 1.780531940574697e-05, + "loss": 1.3283, + "mean_token_accuracy": 0.6766380667686462, + "num_tokens": 765383670.0, + "step": 4558 + }, + { + "entropy": 1.74741593003273, + "epoch": 0.5008376589492186, + "grad_norm": 0.6469904780387878, + "learning_rate": 1.7804271515739096e-05, + "loss": 1.3902, + "mean_token_accuracy": 0.653177946805954, + "num_tokens": 765523127.0, + "step": 4559 + }, + { + "entropy": 1.701665033896764, + "epoch": 0.5009475158605916, + "grad_norm": 0.6925680041313171, + "learning_rate": 1.780322341037909e-05, + "loss": 1.4267, + "mean_token_accuracy": 0.6503228594859441, + "num_tokens": 765700770.0, + "step": 4560 + }, + { + "entropy": 1.7469732860724132, + "epoch": 0.5010573727719645, + "grad_norm": 0.7270328998565674, + "learning_rate": 1.780217508970013e-05, + "loss": 1.584, + "mean_token_accuracy": 0.6263004789749781, + "num_tokens": 765875041.0, + "step": 4561 + }, + { + "entropy": 1.6818542679150899, + "epoch": 0.5011672296833375, + "grad_norm": 0.7486089468002319, + "learning_rate": 1.780112655373539e-05, + "loss": 1.4211, + "mean_token_accuracy": 0.6588012526432673, + "num_tokens": 766026284.0, + "step": 4562 + }, + { + "entropy": 1.7353224456310272, + "epoch": 0.5012770865947104, + "grad_norm": 0.6791040301322937, + "learning_rate": 1.7800077802518062e-05, + "loss": 1.4015, + "mean_token_accuracy": 0.6505940506855646, + "num_tokens": 766178332.0, + "step": 4563 + }, + { + "entropy": 1.6911177138487499, + "epoch": 0.5013869435060834, + "grad_norm": 0.6527238488197327, + "learning_rate": 1.7799028836081333e-05, + "loss": 1.321, + "mean_token_accuracy": 0.6581740925709406, + "num_tokens": 766368332.0, + "step": 4564 + }, + { + "entropy": 1.7469729781150818, + "epoch": 0.5014968004174563, + "grad_norm": 0.6890334486961365, + "learning_rate": 1.7797979654458408e-05, + "loss": 1.415, + "mean_token_accuracy": 0.6579999874035517, + "num_tokens": 766550333.0, + "step": 4565 + }, + { + "entropy": 1.7444235583146412, + "epoch": 0.5016066573288293, + "grad_norm": 0.6815066933631897, + "learning_rate": 1.7796930257682487e-05, + "loss": 1.3465, + "mean_token_accuracy": 0.6652761151393255, + "num_tokens": 766711842.0, + "step": 4566 + }, + { + "entropy": 1.745679686466853, + "epoch": 0.5017165142402021, + "grad_norm": 0.6902961730957031, + "learning_rate": 1.7795880645786788e-05, + "loss": 1.2986, + "mean_token_accuracy": 0.6666453431049982, + "num_tokens": 766838368.0, + "step": 4567 + }, + { + "entropy": 1.7069720129172008, + "epoch": 0.501826371151575, + "grad_norm": 0.6801702380180359, + "learning_rate": 1.779483081880453e-05, + "loss": 1.495, + "mean_token_accuracy": 0.6321636736392975, + "num_tokens": 767011476.0, + "step": 4568 + }, + { + "entropy": 1.6371342142422993, + "epoch": 0.501936228062948, + "grad_norm": 0.6808404326438904, + "learning_rate": 1.779378077676894e-05, + "loss": 1.5138, + "mean_token_accuracy": 0.6529216965039571, + "num_tokens": 767228694.0, + "step": 4569 + }, + { + "entropy": 1.624429355065028, + "epoch": 0.5020460849743209, + "grad_norm": 0.7168002724647522, + "learning_rate": 1.7792730519713245e-05, + "loss": 1.3463, + "mean_token_accuracy": 0.6598889629046122, + "num_tokens": 767386847.0, + "step": 4570 + }, + { + "entropy": 1.6681628028551738, + "epoch": 0.5021559418856939, + "grad_norm": 0.7332079410552979, + "learning_rate": 1.7791680047670696e-05, + "loss": 1.5026, + "mean_token_accuracy": 0.6463381548722585, + "num_tokens": 767562231.0, + "step": 4571 + }, + { + "entropy": 1.7176694869995117, + "epoch": 0.5022657987970668, + "grad_norm": 0.6463919878005981, + "learning_rate": 1.7790629360674528e-05, + "loss": 1.336, + "mean_token_accuracy": 0.6605811516443888, + "num_tokens": 767694393.0, + "step": 4572 + }, + { + "entropy": 1.7141607999801636, + "epoch": 0.5023756557084398, + "grad_norm": 0.6464807391166687, + "learning_rate": 1.7789578458758004e-05, + "loss": 1.4295, + "mean_token_accuracy": 0.6666828741629919, + "num_tokens": 767842803.0, + "step": 4573 + }, + { + "entropy": 1.6756874024868011, + "epoch": 0.5024855126198127, + "grad_norm": 0.6780061721801758, + "learning_rate": 1.7788527341954378e-05, + "loss": 1.3292, + "mean_token_accuracy": 0.6682515839735667, + "num_tokens": 767995295.0, + "step": 4574 + }, + { + "entropy": 1.7120186189810436, + "epoch": 0.5025953695311857, + "grad_norm": 0.6508592963218689, + "learning_rate": 1.7787476010296922e-05, + "loss": 1.5077, + "mean_token_accuracy": 0.6511927644411722, + "num_tokens": 768165030.0, + "step": 4575 + }, + { + "entropy": 1.7303796609242756, + "epoch": 0.5027052264425586, + "grad_norm": 0.7258087992668152, + "learning_rate": 1.778642446381891e-05, + "loss": 1.3302, + "mean_token_accuracy": 0.664924239118894, + "num_tokens": 768276477.0, + "step": 4576 + }, + { + "entropy": 1.735611488421758, + "epoch": 0.5028150833539315, + "grad_norm": 0.6726675629615784, + "learning_rate": 1.7785372702553624e-05, + "loss": 1.5562, + "mean_token_accuracy": 0.6415247122446696, + "num_tokens": 768471537.0, + "step": 4577 + }, + { + "entropy": 1.6898790796597798, + "epoch": 0.5029249402653044, + "grad_norm": 0.6226996779441833, + "learning_rate": 1.7784320726534345e-05, + "loss": 1.3483, + "mean_token_accuracy": 0.6662185192108154, + "num_tokens": 768638518.0, + "step": 4578 + }, + { + "entropy": 1.6402369737625122, + "epoch": 0.5030347971766774, + "grad_norm": 0.5572636127471924, + "learning_rate": 1.7783268535794376e-05, + "loss": 1.4847, + "mean_token_accuracy": 0.6341389069954554, + "num_tokens": 768861174.0, + "step": 4579 + }, + { + "entropy": 1.6863535543282826, + "epoch": 0.5031446540880503, + "grad_norm": 0.634425938129425, + "learning_rate": 1.778221613036701e-05, + "loss": 1.4483, + "mean_token_accuracy": 0.6415148476759592, + "num_tokens": 769041187.0, + "step": 4580 + }, + { + "entropy": 1.706571986277898, + "epoch": 0.5032545109994232, + "grad_norm": 0.659997284412384, + "learning_rate": 1.7781163510285564e-05, + "loss": 1.2811, + "mean_token_accuracy": 0.6710793773333231, + "num_tokens": 769218716.0, + "step": 4581 + }, + { + "entropy": 1.6553989350795746, + "epoch": 0.5033643679107962, + "grad_norm": 0.7408038377761841, + "learning_rate": 1.7780110675583345e-05, + "loss": 1.4047, + "mean_token_accuracy": 0.6643812855084738, + "num_tokens": 769399208.0, + "step": 4582 + }, + { + "entropy": 1.7550034324328105, + "epoch": 0.5034742248221691, + "grad_norm": 0.8029376268386841, + "learning_rate": 1.777905762629368e-05, + "loss": 1.4544, + "mean_token_accuracy": 0.6457452674706777, + "num_tokens": 769538565.0, + "step": 4583 + }, + { + "entropy": 1.701194703578949, + "epoch": 0.5035840817335421, + "grad_norm": 0.6260977387428284, + "learning_rate": 1.7778004362449897e-05, + "loss": 1.5601, + "mean_token_accuracy": 0.6248839298884074, + "num_tokens": 769785297.0, + "step": 4584 + }, + { + "entropy": 1.6856548488140106, + "epoch": 0.503693938644915, + "grad_norm": 0.8055636882781982, + "learning_rate": 1.7776950884085325e-05, + "loss": 1.4075, + "mean_token_accuracy": 0.6510484715302786, + "num_tokens": 769974441.0, + "step": 4585 + }, + { + "entropy": 1.696067899465561, + "epoch": 0.503803795556288, + "grad_norm": 0.6942035555839539, + "learning_rate": 1.777589719123332e-05, + "loss": 1.398, + "mean_token_accuracy": 0.6512449930111567, + "num_tokens": 770158357.0, + "step": 4586 + }, + { + "entropy": 1.7241803507010143, + "epoch": 0.5039136524676608, + "grad_norm": 0.7550591230392456, + "learning_rate": 1.7774843283927215e-05, + "loss": 1.5366, + "mean_token_accuracy": 0.6322035938501358, + "num_tokens": 770356874.0, + "step": 4587 + }, + { + "entropy": 1.7421184480190277, + "epoch": 0.5040235093790338, + "grad_norm": 0.6920149326324463, + "learning_rate": 1.7773789162200378e-05, + "loss": 1.4192, + "mean_token_accuracy": 0.6601304560899734, + "num_tokens": 770542678.0, + "step": 4588 + }, + { + "entropy": 1.7092435161272685, + "epoch": 0.5041333662904067, + "grad_norm": 0.6304470896720886, + "learning_rate": 1.777273482608616e-05, + "loss": 1.5378, + "mean_token_accuracy": 0.6353190094232559, + "num_tokens": 770728579.0, + "step": 4589 + }, + { + "entropy": 1.6159135500590007, + "epoch": 0.5042432232017797, + "grad_norm": 0.680613100528717, + "learning_rate": 1.777168027561794e-05, + "loss": 1.2946, + "mean_token_accuracy": 0.6721889674663544, + "num_tokens": 770911912.0, + "step": 4590 + }, + { + "entropy": 1.7247054477532704, + "epoch": 0.5043530801131526, + "grad_norm": 0.7321951389312744, + "learning_rate": 1.777062551082909e-05, + "loss": 1.4008, + "mean_token_accuracy": 0.6659555484851202, + "num_tokens": 771043582.0, + "step": 4591 + }, + { + "entropy": 1.641870786746343, + "epoch": 0.5044629370245256, + "grad_norm": 0.658308744430542, + "learning_rate": 1.7769570531752995e-05, + "loss": 1.2654, + "mean_token_accuracy": 0.688631405433019, + "num_tokens": 771194901.0, + "step": 4592 + }, + { + "entropy": 1.7228473524252574, + "epoch": 0.5045727939358985, + "grad_norm": 0.7475135326385498, + "learning_rate": 1.7768515338423044e-05, + "loss": 1.3101, + "mean_token_accuracy": 0.6785301913817724, + "num_tokens": 771320509.0, + "step": 4593 + }, + { + "entropy": 1.7426639099915822, + "epoch": 0.5046826508472714, + "grad_norm": 0.7170320749282837, + "learning_rate": 1.776745993087263e-05, + "loss": 1.6534, + "mean_token_accuracy": 0.6144102613131205, + "num_tokens": 771526316.0, + "step": 4594 + }, + { + "entropy": 1.6618753671646118, + "epoch": 0.5047925077586444, + "grad_norm": 0.7222857475280762, + "learning_rate": 1.776640430913516e-05, + "loss": 1.4575, + "mean_token_accuracy": 0.6568075368801752, + "num_tokens": 771684867.0, + "step": 4595 + }, + { + "entropy": 1.7489372690518696, + "epoch": 0.5049023646700173, + "grad_norm": 0.8467748761177063, + "learning_rate": 1.7765348473244042e-05, + "loss": 1.4878, + "mean_token_accuracy": 0.6579713523387909, + "num_tokens": 771824096.0, + "step": 4596 + }, + { + "entropy": 1.7483091453711193, + "epoch": 0.5050122215813903, + "grad_norm": 0.6738401651382446, + "learning_rate": 1.7764292423232694e-05, + "loss": 1.4071, + "mean_token_accuracy": 0.6498169700304667, + "num_tokens": 771969053.0, + "step": 4597 + }, + { + "entropy": 1.6820040146509807, + "epoch": 0.5051220784927631, + "grad_norm": 0.7400838136672974, + "learning_rate": 1.7763236159134538e-05, + "loss": 1.3708, + "mean_token_accuracy": 0.6718896230061849, + "num_tokens": 772120605.0, + "step": 4598 + }, + { + "entropy": 1.694365570942561, + "epoch": 0.5052319354041361, + "grad_norm": 0.6368020176887512, + "learning_rate": 1.7762179680983007e-05, + "loss": 1.4105, + "mean_token_accuracy": 0.6443684299786886, + "num_tokens": 772319397.0, + "step": 4599 + }, + { + "entropy": 1.7446305255095165, + "epoch": 0.505341792315509, + "grad_norm": 0.7226773500442505, + "learning_rate": 1.7761122988811533e-05, + "loss": 1.4982, + "mean_token_accuracy": 0.6516734858353933, + "num_tokens": 772512021.0, + "step": 4600 + }, + { + "entropy": 1.6962950527668, + "epoch": 0.505451649226882, + "grad_norm": 0.6551104784011841, + "learning_rate": 1.7760066082653566e-05, + "loss": 1.4639, + "mean_token_accuracy": 0.6513085216283798, + "num_tokens": 772707795.0, + "step": 4601 + }, + { + "entropy": 1.7322811285654705, + "epoch": 0.5055615061382549, + "grad_norm": 0.7607578635215759, + "learning_rate": 1.775900896254255e-05, + "loss": 1.4611, + "mean_token_accuracy": 0.653966099023819, + "num_tokens": 772888030.0, + "step": 4602 + }, + { + "entropy": 1.7565113206704457, + "epoch": 0.5056713630496279, + "grad_norm": 0.780271053314209, + "learning_rate": 1.7757951628511953e-05, + "loss": 1.3276, + "mean_token_accuracy": 0.6591301510731379, + "num_tokens": 773074743.0, + "step": 4603 + }, + { + "entropy": 1.7075756589571636, + "epoch": 0.5057812199610008, + "grad_norm": 0.8063814640045166, + "learning_rate": 1.7756894080595225e-05, + "loss": 1.2822, + "mean_token_accuracy": 0.672131285071373, + "num_tokens": 773213092.0, + "step": 4604 + }, + { + "entropy": 1.7614882191022236, + "epoch": 0.5058910768723738, + "grad_norm": 0.7280179858207703, + "learning_rate": 1.7755836318825846e-05, + "loss": 1.5566, + "mean_token_accuracy": 0.6349164942900339, + "num_tokens": 773418841.0, + "step": 4605 + }, + { + "entropy": 1.7605247398217518, + "epoch": 0.5060009337837467, + "grad_norm": 0.6884891390800476, + "learning_rate": 1.7754778343237294e-05, + "loss": 1.5146, + "mean_token_accuracy": 0.6434789101282755, + "num_tokens": 773579477.0, + "step": 4606 + }, + { + "entropy": 1.7038259605566661, + "epoch": 0.5061107906951197, + "grad_norm": 0.6832484602928162, + "learning_rate": 1.775372015386305e-05, + "loss": 1.2357, + "mean_token_accuracy": 0.6903716921806335, + "num_tokens": 773730733.0, + "step": 4607 + }, + { + "entropy": 1.6778667668501537, + "epoch": 0.5062206476064925, + "grad_norm": 0.7103332281112671, + "learning_rate": 1.7752661750736608e-05, + "loss": 1.3739, + "mean_token_accuracy": 0.6615954885880152, + "num_tokens": 773881901.0, + "step": 4608 + }, + { + "entropy": 1.747862070798874, + "epoch": 0.5063305045178654, + "grad_norm": 0.6749265789985657, + "learning_rate": 1.7751603133891463e-05, + "loss": 1.5966, + "mean_token_accuracy": 0.622282862663269, + "num_tokens": 774103203.0, + "step": 4609 + }, + { + "entropy": 1.7335962454477947, + "epoch": 0.5064403614292384, + "grad_norm": 0.7897728085517883, + "learning_rate": 1.775054430336112e-05, + "loss": 1.4646, + "mean_token_accuracy": 0.6427331467469534, + "num_tokens": 774268686.0, + "step": 4610 + }, + { + "entropy": 1.7124591569105785, + "epoch": 0.5065502183406113, + "grad_norm": 0.6754243969917297, + "learning_rate": 1.774948525917909e-05, + "loss": 1.316, + "mean_token_accuracy": 0.6721870998541514, + "num_tokens": 774395456.0, + "step": 4611 + }, + { + "entropy": 1.6793800294399261, + "epoch": 0.5066600752519843, + "grad_norm": 0.6710286736488342, + "learning_rate": 1.7748426001378897e-05, + "loss": 1.2881, + "mean_token_accuracy": 0.6775974581638972, + "num_tokens": 774532094.0, + "step": 4612 + }, + { + "entropy": 1.6808149913946788, + "epoch": 0.5067699321633572, + "grad_norm": 0.6567005515098572, + "learning_rate": 1.774736652999406e-05, + "loss": 1.3745, + "mean_token_accuracy": 0.6517610251903534, + "num_tokens": 774685619.0, + "step": 4613 + }, + { + "entropy": 1.751223752895991, + "epoch": 0.5068797890747302, + "grad_norm": 0.6677860021591187, + "learning_rate": 1.7746306845058113e-05, + "loss": 1.4449, + "mean_token_accuracy": 0.6329387575387955, + "num_tokens": 774885515.0, + "step": 4614 + }, + { + "entropy": 1.674304574728012, + "epoch": 0.5069896459861031, + "grad_norm": 0.779396653175354, + "learning_rate": 1.7745246946604594e-05, + "loss": 1.1997, + "mean_token_accuracy": 0.6830140401919683, + "num_tokens": 774995865.0, + "step": 4615 + }, + { + "entropy": 1.689636766910553, + "epoch": 0.5070995028974761, + "grad_norm": 0.872032642364502, + "learning_rate": 1.774418683466705e-05, + "loss": 1.5374, + "mean_token_accuracy": 0.6345034042994181, + "num_tokens": 775247345.0, + "step": 4616 + }, + { + "entropy": 1.6756864488124847, + "epoch": 0.507209359808849, + "grad_norm": 0.7993032932281494, + "learning_rate": 1.7743126509279028e-05, + "loss": 1.2167, + "mean_token_accuracy": 0.6787678301334381, + "num_tokens": 775362640.0, + "step": 4617 + }, + { + "entropy": 1.7568728228410084, + "epoch": 0.507319216720222, + "grad_norm": 0.6227509379386902, + "learning_rate": 1.7742065970474096e-05, + "loss": 1.3976, + "mean_token_accuracy": 0.6522872199614843, + "num_tokens": 775565203.0, + "step": 4618 + }, + { + "entropy": 1.706842044989268, + "epoch": 0.5074290736315948, + "grad_norm": 0.7736787796020508, + "learning_rate": 1.774100521828581e-05, + "loss": 1.5008, + "mean_token_accuracy": 0.6404502143462499, + "num_tokens": 775781749.0, + "step": 4619 + }, + { + "entropy": 1.6234171589215596, + "epoch": 0.5075389305429678, + "grad_norm": 0.7655417919158936, + "learning_rate": 1.773994425274775e-05, + "loss": 1.2916, + "mean_token_accuracy": 0.6704551080862681, + "num_tokens": 775925323.0, + "step": 4620 + }, + { + "entropy": 1.7151438395182292, + "epoch": 0.5076487874543407, + "grad_norm": 0.7209942936897278, + "learning_rate": 1.7738883073893488e-05, + "loss": 1.2973, + "mean_token_accuracy": 0.6704816569884618, + "num_tokens": 776032961.0, + "step": 4621 + }, + { + "entropy": 1.7023293673992157, + "epoch": 0.5077586443657136, + "grad_norm": 0.7943500280380249, + "learning_rate": 1.7737821681756615e-05, + "loss": 1.505, + "mean_token_accuracy": 0.655063678820928, + "num_tokens": 776200041.0, + "step": 4622 + }, + { + "entropy": 1.72823366522789, + "epoch": 0.5078685012770866, + "grad_norm": 0.7248928546905518, + "learning_rate": 1.773676007637072e-05, + "loss": 1.4479, + "mean_token_accuracy": 0.6552989184856415, + "num_tokens": 776358646.0, + "step": 4623 + }, + { + "entropy": 1.6740071376164753, + "epoch": 0.5079783581884595, + "grad_norm": 0.6074709296226501, + "learning_rate": 1.7735698257769407e-05, + "loss": 1.3812, + "mean_token_accuracy": 0.6505249341328939, + "num_tokens": 776571031.0, + "step": 4624 + }, + { + "entropy": 1.6604996422926586, + "epoch": 0.5080882150998325, + "grad_norm": 0.7322157025337219, + "learning_rate": 1.7734636225986276e-05, + "loss": 1.3079, + "mean_token_accuracy": 0.6671365002791086, + "num_tokens": 776722724.0, + "step": 4625 + }, + { + "entropy": 1.6912861963113148, + "epoch": 0.5081980720112054, + "grad_norm": 0.6166011691093445, + "learning_rate": 1.7733573981054947e-05, + "loss": 1.4829, + "mean_token_accuracy": 0.6378809263308843, + "num_tokens": 776971710.0, + "step": 4626 + }, + { + "entropy": 1.707468460003535, + "epoch": 0.5083079289225784, + "grad_norm": 0.6257423162460327, + "learning_rate": 1.773251152300903e-05, + "loss": 1.4643, + "mean_token_accuracy": 0.6333738813797632, + "num_tokens": 777202200.0, + "step": 4627 + }, + { + "entropy": 1.6994255880514781, + "epoch": 0.5084177858339513, + "grad_norm": 0.5768167972564697, + "learning_rate": 1.7731448851882162e-05, + "loss": 1.432, + "mean_token_accuracy": 0.6558799743652344, + "num_tokens": 777407416.0, + "step": 4628 + }, + { + "entropy": 1.7263440589110057, + "epoch": 0.5085276427453242, + "grad_norm": 0.809219241142273, + "learning_rate": 1.7730385967707974e-05, + "loss": 1.4374, + "mean_token_accuracy": 0.6372658809026083, + "num_tokens": 777583936.0, + "step": 4629 + }, + { + "entropy": 1.7062489092350006, + "epoch": 0.5086374996566971, + "grad_norm": 0.7165677547454834, + "learning_rate": 1.7729322870520097e-05, + "loss": 1.3335, + "mean_token_accuracy": 0.6688279807567596, + "num_tokens": 777709835.0, + "step": 4630 + }, + { + "entropy": 1.7441291213035583, + "epoch": 0.5087473565680701, + "grad_norm": 0.7295101881027222, + "learning_rate": 1.7728259560352185e-05, + "loss": 1.5241, + "mean_token_accuracy": 0.6376437743504842, + "num_tokens": 777845178.0, + "step": 4631 + }, + { + "entropy": 1.7269198099772136, + "epoch": 0.508857213479443, + "grad_norm": 0.858462929725647, + "learning_rate": 1.772719603723789e-05, + "loss": 1.329, + "mean_token_accuracy": 0.6706622143586477, + "num_tokens": 777971047.0, + "step": 4632 + }, + { + "entropy": 1.730207492907842, + "epoch": 0.508967070390816, + "grad_norm": 0.9941986203193665, + "learning_rate": 1.7726132301210873e-05, + "loss": 1.4901, + "mean_token_accuracy": 0.6505331347386042, + "num_tokens": 778156638.0, + "step": 4633 + }, + { + "entropy": 1.6990565558274586, + "epoch": 0.5090769273021889, + "grad_norm": 0.7046946287155151, + "learning_rate": 1.7725068352304797e-05, + "loss": 1.359, + "mean_token_accuracy": 0.6794573764006296, + "num_tokens": 778290929.0, + "step": 4634 + }, + { + "entropy": 1.649871587753296, + "epoch": 0.5091867842135618, + "grad_norm": 0.703970193862915, + "learning_rate": 1.772400419055334e-05, + "loss": 1.4921, + "mean_token_accuracy": 0.6583941678206126, + "num_tokens": 778452940.0, + "step": 4635 + }, + { + "entropy": 1.6483552952607472, + "epoch": 0.5092966411249348, + "grad_norm": 0.74805748462677, + "learning_rate": 1.7722939815990182e-05, + "loss": 1.3589, + "mean_token_accuracy": 0.6700867811838785, + "num_tokens": 778598364.0, + "step": 4636 + }, + { + "entropy": 1.6974958876768749, + "epoch": 0.5094064980363077, + "grad_norm": 0.735685408115387, + "learning_rate": 1.7721875228649004e-05, + "loss": 1.3238, + "mean_token_accuracy": 0.6773294111092886, + "num_tokens": 778717920.0, + "step": 4637 + }, + { + "entropy": 1.6946069101492565, + "epoch": 0.5095163549476807, + "grad_norm": 0.6127060651779175, + "learning_rate": 1.7720810428563505e-05, + "loss": 1.44, + "mean_token_accuracy": 0.639764870206515, + "num_tokens": 778896607.0, + "step": 4638 + }, + { + "entropy": 1.733677049477895, + "epoch": 0.5096262118590535, + "grad_norm": 0.6817284822463989, + "learning_rate": 1.7719745415767388e-05, + "loss": 1.3651, + "mean_token_accuracy": 0.6538938681284586, + "num_tokens": 779050088.0, + "step": 4639 + }, + { + "entropy": 1.6518004635969799, + "epoch": 0.5097360687704265, + "grad_norm": 0.7022289633750916, + "learning_rate": 1.7718680190294353e-05, + "loss": 1.4326, + "mean_token_accuracy": 0.6522137075662613, + "num_tokens": 779254586.0, + "step": 4640 + }, + { + "entropy": 1.7444305717945099, + "epoch": 0.5098459256817994, + "grad_norm": 0.6511925458908081, + "learning_rate": 1.7717614752178118e-05, + "loss": 1.4276, + "mean_token_accuracy": 0.6486201186974844, + "num_tokens": 779421316.0, + "step": 4641 + }, + { + "entropy": 1.6676433086395264, + "epoch": 0.5099557825931724, + "grad_norm": 0.7357949018478394, + "learning_rate": 1.7716549101452402e-05, + "loss": 1.3955, + "mean_token_accuracy": 0.6542571783065796, + "num_tokens": 779594236.0, + "step": 4642 + }, + { + "entropy": 1.7073079347610474, + "epoch": 0.5100656395045453, + "grad_norm": 0.5698668360710144, + "learning_rate": 1.7715483238150937e-05, + "loss": 1.4319, + "mean_token_accuracy": 0.6443966527779897, + "num_tokens": 779786398.0, + "step": 4643 + }, + { + "entropy": 1.7269446750481923, + "epoch": 0.5101754964159183, + "grad_norm": 0.7325721979141235, + "learning_rate": 1.771441716230745e-05, + "loss": 1.3253, + "mean_token_accuracy": 0.6597764392693838, + "num_tokens": 779919839.0, + "step": 4644 + }, + { + "entropy": 1.671245684226354, + "epoch": 0.5102853533272912, + "grad_norm": 0.5406301021575928, + "learning_rate": 1.7713350873955688e-05, + "loss": 1.587, + "mean_token_accuracy": 0.6348774433135986, + "num_tokens": 780150472.0, + "step": 4645 + }, + { + "entropy": 1.6824797888596852, + "epoch": 0.5103952102386642, + "grad_norm": 0.6999531388282776, + "learning_rate": 1.7712284373129397e-05, + "loss": 1.352, + "mean_token_accuracy": 0.6638096670309702, + "num_tokens": 780343092.0, + "step": 4646 + }, + { + "entropy": 1.7021546860535939, + "epoch": 0.5105050671500371, + "grad_norm": 0.6942962408065796, + "learning_rate": 1.771121765986233e-05, + "loss": 1.4937, + "mean_token_accuracy": 0.6427315473556519, + "num_tokens": 780531400.0, + "step": 4647 + }, + { + "entropy": 1.7465800046920776, + "epoch": 0.51061492406141, + "grad_norm": 0.7457360625267029, + "learning_rate": 1.7710150734188242e-05, + "loss": 1.4176, + "mean_token_accuracy": 0.6418644239505132, + "num_tokens": 780693513.0, + "step": 4648 + }, + { + "entropy": 1.7026427487532299, + "epoch": 0.510724780972783, + "grad_norm": 0.8374441862106323, + "learning_rate": 1.7709083596140914e-05, + "loss": 1.4585, + "mean_token_accuracy": 0.6526228909691175, + "num_tokens": 780839738.0, + "step": 4649 + }, + { + "entropy": 1.6698659559090931, + "epoch": 0.5108346378841558, + "grad_norm": 0.7197142839431763, + "learning_rate": 1.770801624575411e-05, + "loss": 1.4273, + "mean_token_accuracy": 0.6493804206450781, + "num_tokens": 781048603.0, + "step": 4650 + }, + { + "entropy": 1.6799350480238597, + "epoch": 0.5109444947955288, + "grad_norm": 0.7679303288459778, + "learning_rate": 1.7706948683061612e-05, + "loss": 1.3085, + "mean_token_accuracy": 0.6658252626657486, + "num_tokens": 781188858.0, + "step": 4651 + }, + { + "entropy": 1.6943186322848003, + "epoch": 0.5110543517069017, + "grad_norm": 0.6792766451835632, + "learning_rate": 1.7705880908097214e-05, + "loss": 1.3816, + "mean_token_accuracy": 0.6598533739646276, + "num_tokens": 781320802.0, + "step": 4652 + }, + { + "entropy": 1.7791239122549694, + "epoch": 0.5111642086182747, + "grad_norm": 0.706912100315094, + "learning_rate": 1.7704812920894708e-05, + "loss": 1.3435, + "mean_token_accuracy": 0.658470019698143, + "num_tokens": 781474836.0, + "step": 4653 + }, + { + "entropy": 1.7638680239518483, + "epoch": 0.5112740655296476, + "grad_norm": 0.6044894456863403, + "learning_rate": 1.770374472148789e-05, + "loss": 1.5903, + "mean_token_accuracy": 0.6270778377850851, + "num_tokens": 781724073.0, + "step": 4654 + }, + { + "entropy": 1.7358222007751465, + "epoch": 0.5113839224410206, + "grad_norm": 0.6887006759643555, + "learning_rate": 1.770267630991058e-05, + "loss": 1.4788, + "mean_token_accuracy": 0.6502551734447479, + "num_tokens": 781917542.0, + "step": 4655 + }, + { + "entropy": 1.6472338835398357, + "epoch": 0.5114937793523935, + "grad_norm": 0.5894920229911804, + "learning_rate": 1.770160768619658e-05, + "loss": 1.4373, + "mean_token_accuracy": 0.6528671483198801, + "num_tokens": 782156130.0, + "step": 4656 + }, + { + "entropy": 1.6651289065678914, + "epoch": 0.5116036362637665, + "grad_norm": 0.6169312000274658, + "learning_rate": 1.7700538850379715e-05, + "loss": 1.4626, + "mean_token_accuracy": 0.6592658758163452, + "num_tokens": 782327215.0, + "step": 4657 + }, + { + "entropy": 1.7671948075294495, + "epoch": 0.5117134931751394, + "grad_norm": 0.7001243233680725, + "learning_rate": 1.7699469802493818e-05, + "loss": 1.3362, + "mean_token_accuracy": 0.6645391583442688, + "num_tokens": 782488955.0, + "step": 4658 + }, + { + "entropy": 1.6598977148532867, + "epoch": 0.5118233500865124, + "grad_norm": 0.6964993476867676, + "learning_rate": 1.7698400542572717e-05, + "loss": 1.3115, + "mean_token_accuracy": 0.6722627530495325, + "num_tokens": 782611825.0, + "step": 4659 + }, + { + "entropy": 1.727137674887975, + "epoch": 0.5119332069978852, + "grad_norm": 0.7142112851142883, + "learning_rate": 1.769733107065026e-05, + "loss": 1.231, + "mean_token_accuracy": 0.6792268306016922, + "num_tokens": 782755942.0, + "step": 4660 + }, + { + "entropy": 1.7172054847081502, + "epoch": 0.5120430639092582, + "grad_norm": 0.6606463193893433, + "learning_rate": 1.769626138676029e-05, + "loss": 1.4273, + "mean_token_accuracy": 0.6487047125895818, + "num_tokens": 782910156.0, + "step": 4661 + }, + { + "entropy": 1.687313159306844, + "epoch": 0.5121529208206311, + "grad_norm": 0.5523031949996948, + "learning_rate": 1.7695191490936666e-05, + "loss": 1.4415, + "mean_token_accuracy": 0.649812196691831, + "num_tokens": 783153434.0, + "step": 4662 + }, + { + "entropy": 1.6816494663556416, + "epoch": 0.512262777732004, + "grad_norm": 0.7296652793884277, + "learning_rate": 1.769412138321325e-05, + "loss": 1.3972, + "mean_token_accuracy": 0.6516829133033752, + "num_tokens": 783299870.0, + "step": 4663 + }, + { + "entropy": 1.6838775873184204, + "epoch": 0.512372634643377, + "grad_norm": 0.6139092445373535, + "learning_rate": 1.769305106362391e-05, + "loss": 1.4849, + "mean_token_accuracy": 0.6372493157784144, + "num_tokens": 783502449.0, + "step": 4664 + }, + { + "entropy": 1.6079521874586742, + "epoch": 0.5124824915547499, + "grad_norm": 0.7152103781700134, + "learning_rate": 1.7691980532202515e-05, + "loss": 1.2326, + "mean_token_accuracy": 0.6814493189255396, + "num_tokens": 783649568.0, + "step": 4665 + }, + { + "entropy": 1.7108632425467174, + "epoch": 0.5125923484661229, + "grad_norm": 0.6728825569152832, + "learning_rate": 1.7690909788982955e-05, + "loss": 1.5291, + "mean_token_accuracy": 0.640701100230217, + "num_tokens": 783821716.0, + "step": 4666 + }, + { + "entropy": 1.7542012830575306, + "epoch": 0.5127022053774958, + "grad_norm": 0.7167527675628662, + "learning_rate": 1.7689838833999114e-05, + "loss": 1.4436, + "mean_token_accuracy": 0.6508532166481018, + "num_tokens": 783965529.0, + "step": 4667 + }, + { + "entropy": 1.7339794039726257, + "epoch": 0.5128120622888688, + "grad_norm": 0.7113919854164124, + "learning_rate": 1.768876766728489e-05, + "loss": 1.4262, + "mean_token_accuracy": 0.6611630270878474, + "num_tokens": 784156298.0, + "step": 4668 + }, + { + "entropy": 1.7534903983275096, + "epoch": 0.5129219192002417, + "grad_norm": 0.7554349899291992, + "learning_rate": 1.7687696288874182e-05, + "loss": 1.5153, + "mean_token_accuracy": 0.6601629306872686, + "num_tokens": 784364848.0, + "step": 4669 + }, + { + "entropy": 1.6630991399288177, + "epoch": 0.5130317761116147, + "grad_norm": 0.7319965958595276, + "learning_rate": 1.7686624698800897e-05, + "loss": 1.432, + "mean_token_accuracy": 0.6549634039402008, + "num_tokens": 784550039.0, + "step": 4670 + }, + { + "entropy": 1.721706211566925, + "epoch": 0.5131416330229875, + "grad_norm": 0.6252912878990173, + "learning_rate": 1.7685552897098955e-05, + "loss": 1.3013, + "mean_token_accuracy": 0.674846296509107, + "num_tokens": 784672000.0, + "step": 4671 + }, + { + "entropy": 1.6824569404125214, + "epoch": 0.5132514899343605, + "grad_norm": 0.6189214587211609, + "learning_rate": 1.768448088380228e-05, + "loss": 1.3107, + "mean_token_accuracy": 0.6635235399007797, + "num_tokens": 784879713.0, + "step": 4672 + }, + { + "entropy": 1.7048685650030773, + "epoch": 0.5133613468457334, + "grad_norm": 0.5870686769485474, + "learning_rate": 1.7683408658944795e-05, + "loss": 1.4357, + "mean_token_accuracy": 0.6394040137529373, + "num_tokens": 785094291.0, + "step": 4673 + }, + { + "entropy": 1.7368608117103577, + "epoch": 0.5134712037571064, + "grad_norm": 0.7474855184555054, + "learning_rate": 1.7682336222560438e-05, + "loss": 1.4872, + "mean_token_accuracy": 0.6371827771266302, + "num_tokens": 785282608.0, + "step": 4674 + }, + { + "entropy": 1.718473623196284, + "epoch": 0.5135810606684793, + "grad_norm": 0.5720936059951782, + "learning_rate": 1.768126357468315e-05, + "loss": 1.4686, + "mean_token_accuracy": 0.6401006182034811, + "num_tokens": 785508116.0, + "step": 4675 + }, + { + "entropy": 1.712001125017802, + "epoch": 0.5136909175798522, + "grad_norm": 0.8043569922447205, + "learning_rate": 1.7680190715346876e-05, + "loss": 1.502, + "mean_token_accuracy": 0.6602791597445806, + "num_tokens": 785672334.0, + "step": 4676 + }, + { + "entropy": 1.7085146109263103, + "epoch": 0.5138007744912252, + "grad_norm": 0.855053722858429, + "learning_rate": 1.7679117644585583e-05, + "loss": 1.5407, + "mean_token_accuracy": 0.6532570545872053, + "num_tokens": 785838017.0, + "step": 4677 + }, + { + "entropy": 1.6705704132715862, + "epoch": 0.5139106314025981, + "grad_norm": 0.659695029258728, + "learning_rate": 1.7678044362433224e-05, + "loss": 1.2934, + "mean_token_accuracy": 0.6710864454507828, + "num_tokens": 786031568.0, + "step": 4678 + }, + { + "entropy": 1.6983959476153057, + "epoch": 0.5140204883139711, + "grad_norm": 0.6366593241691589, + "learning_rate": 1.767697086892377e-05, + "loss": 1.4668, + "mean_token_accuracy": 0.6328398436307907, + "num_tokens": 786232555.0, + "step": 4679 + }, + { + "entropy": 1.7187410493691762, + "epoch": 0.514130345225344, + "grad_norm": 0.7009495496749878, + "learning_rate": 1.7675897164091197e-05, + "loss": 1.365, + "mean_token_accuracy": 0.6644991288582484, + "num_tokens": 786446202.0, + "step": 4680 + }, + { + "entropy": 1.6954729159673054, + "epoch": 0.514240202136717, + "grad_norm": 0.6604549288749695, + "learning_rate": 1.7674823247969487e-05, + "loss": 1.4347, + "mean_token_accuracy": 0.6466411848862966, + "num_tokens": 786658386.0, + "step": 4681 + }, + { + "entropy": 1.6737544735272725, + "epoch": 0.5143500590480898, + "grad_norm": 0.7725059986114502, + "learning_rate": 1.7673749120592627e-05, + "loss": 1.2401, + "mean_token_accuracy": 0.679726297656695, + "num_tokens": 786790667.0, + "step": 4682 + }, + { + "entropy": 1.7507797380288441, + "epoch": 0.5144599159594628, + "grad_norm": 0.7108690142631531, + "learning_rate": 1.7672674781994617e-05, + "loss": 1.4978, + "mean_token_accuracy": 0.6364447275797526, + "num_tokens": 786961389.0, + "step": 4683 + }, + { + "entropy": 1.708246519168218, + "epoch": 0.5145697728708357, + "grad_norm": 0.6990996599197388, + "learning_rate": 1.7671600232209456e-05, + "loss": 1.3944, + "mean_token_accuracy": 0.660065030058225, + "num_tokens": 787090202.0, + "step": 4684 + }, + { + "entropy": 1.725678304831187, + "epoch": 0.5146796297822087, + "grad_norm": 0.6911423206329346, + "learning_rate": 1.7670525471271152e-05, + "loss": 1.5114, + "mean_token_accuracy": 0.6476559440294901, + "num_tokens": 787250677.0, + "step": 4685 + }, + { + "entropy": 1.673937330643336, + "epoch": 0.5147894866935816, + "grad_norm": 0.5852583050727844, + "learning_rate": 1.7669450499213725e-05, + "loss": 1.395, + "mean_token_accuracy": 0.6683137913544973, + "num_tokens": 787421903.0, + "step": 4686 + }, + { + "entropy": 1.6368590195973713, + "epoch": 0.5148993436049546, + "grad_norm": 0.5973182320594788, + "learning_rate": 1.7668375316071195e-05, + "loss": 1.3752, + "mean_token_accuracy": 0.6662927816311518, + "num_tokens": 787611043.0, + "step": 4687 + }, + { + "entropy": 1.7234513560930889, + "epoch": 0.5150092005163275, + "grad_norm": 0.6610357165336609, + "learning_rate": 1.7667299921877588e-05, + "loss": 1.4776, + "mean_token_accuracy": 0.6609990646441778, + "num_tokens": 787769655.0, + "step": 4688 + }, + { + "entropy": 1.7650385200977325, + "epoch": 0.5151190574277004, + "grad_norm": 0.6733184456825256, + "learning_rate": 1.766622431666695e-05, + "loss": 1.5617, + "mean_token_accuracy": 0.6246377180020014, + "num_tokens": 788020214.0, + "step": 4689 + }, + { + "entropy": 1.7412991126378377, + "epoch": 0.5152289143390734, + "grad_norm": 0.7549744248390198, + "learning_rate": 1.766514850047331e-05, + "loss": 1.4773, + "mean_token_accuracy": 0.6487500021855036, + "num_tokens": 788192824.0, + "step": 4690 + }, + { + "entropy": 1.73355237642924, + "epoch": 0.5153387712504462, + "grad_norm": 0.6612739562988281, + "learning_rate": 1.7664072473330724e-05, + "loss": 1.5076, + "mean_token_accuracy": 0.6572160919507345, + "num_tokens": 788373703.0, + "step": 4691 + }, + { + "entropy": 1.7703491747379303, + "epoch": 0.5154486281618192, + "grad_norm": 0.780145525932312, + "learning_rate": 1.766299623527325e-05, + "loss": 1.3967, + "mean_token_accuracy": 0.6519978841145834, + "num_tokens": 788511272.0, + "step": 4692 + }, + { + "entropy": 1.7383658389250438, + "epoch": 0.5155584850731921, + "grad_norm": 0.6199609041213989, + "learning_rate": 1.7661919786334945e-05, + "loss": 1.3815, + "mean_token_accuracy": 0.6616942385832468, + "num_tokens": 788658018.0, + "step": 4693 + }, + { + "entropy": 1.6493331988652546, + "epoch": 0.5156683419845651, + "grad_norm": 0.672944962978363, + "learning_rate": 1.766084312654988e-05, + "loss": 1.3136, + "mean_token_accuracy": 0.6737960278987885, + "num_tokens": 788789174.0, + "step": 4694 + }, + { + "entropy": 1.7152611513932545, + "epoch": 0.515778198895938, + "grad_norm": 0.5982018709182739, + "learning_rate": 1.7659766255952134e-05, + "loss": 1.3505, + "mean_token_accuracy": 0.6602517565091451, + "num_tokens": 788936963.0, + "step": 4695 + }, + { + "entropy": 1.651655336221059, + "epoch": 0.515888055807311, + "grad_norm": 0.6493039131164551, + "learning_rate": 1.7658689174575785e-05, + "loss": 1.2292, + "mean_token_accuracy": 0.673387145002683, + "num_tokens": 789087968.0, + "step": 4696 + }, + { + "entropy": 1.7118739585081737, + "epoch": 0.5159979127186839, + "grad_norm": 0.634550929069519, + "learning_rate": 1.7657611882454925e-05, + "loss": 1.2787, + "mean_token_accuracy": 0.6688071837027868, + "num_tokens": 789233586.0, + "step": 4697 + }, + { + "entropy": 1.7723990778128307, + "epoch": 0.5161077696300569, + "grad_norm": 0.7632473707199097, + "learning_rate": 1.7656534379623652e-05, + "loss": 1.5881, + "mean_token_accuracy": 0.635076088209947, + "num_tokens": 789379455.0, + "step": 4698 + }, + { + "entropy": 1.7294781108697255, + "epoch": 0.5162176265414298, + "grad_norm": 0.6788251996040344, + "learning_rate": 1.765545666611606e-05, + "loss": 1.344, + "mean_token_accuracy": 0.660454789797465, + "num_tokens": 789515444.0, + "step": 4699 + }, + { + "entropy": 1.7380349238713582, + "epoch": 0.5163274834528028, + "grad_norm": 0.6425085663795471, + "learning_rate": 1.7654378741966264e-05, + "loss": 1.5548, + "mean_token_accuracy": 0.627402106920878, + "num_tokens": 789708134.0, + "step": 4700 + }, + { + "entropy": 1.7113960087299347, + "epoch": 0.5164373403641757, + "grad_norm": 0.7413278818130493, + "learning_rate": 1.7653300607208385e-05, + "loss": 1.3543, + "mean_token_accuracy": 0.6587297916412354, + "num_tokens": 789853557.0, + "step": 4701 + }, + { + "entropy": 1.7466691235701244, + "epoch": 0.5165471972755487, + "grad_norm": 0.6979295015335083, + "learning_rate": 1.7652222261876536e-05, + "loss": 1.4306, + "mean_token_accuracy": 0.6643926600615183, + "num_tokens": 790024665.0, + "step": 4702 + }, + { + "entropy": 1.7214144865671794, + "epoch": 0.5166570541869215, + "grad_norm": 0.9388607740402222, + "learning_rate": 1.7651143706004847e-05, + "loss": 1.3205, + "mean_token_accuracy": 0.6683923502763113, + "num_tokens": 790147552.0, + "step": 4703 + }, + { + "entropy": 1.734945813814799, + "epoch": 0.5167669110982944, + "grad_norm": 0.7507519721984863, + "learning_rate": 1.765006493962746e-05, + "loss": 1.3504, + "mean_token_accuracy": 0.6659105022748312, + "num_tokens": 790333823.0, + "step": 4704 + }, + { + "entropy": 1.6679200232028961, + "epoch": 0.5168767680096674, + "grad_norm": 0.6735995411872864, + "learning_rate": 1.7648985962778514e-05, + "loss": 1.3773, + "mean_token_accuracy": 0.6531980137030283, + "num_tokens": 790525621.0, + "step": 4705 + }, + { + "entropy": 1.7311872939268749, + "epoch": 0.5169866249210403, + "grad_norm": 0.6073651909828186, + "learning_rate": 1.764790677549216e-05, + "loss": 1.4879, + "mean_token_accuracy": 0.6515211214621862, + "num_tokens": 790711418.0, + "step": 4706 + }, + { + "entropy": 1.6866132219632466, + "epoch": 0.5170964818324133, + "grad_norm": 0.5769153237342834, + "learning_rate": 1.764682737780255e-05, + "loss": 1.4447, + "mean_token_accuracy": 0.6449514329433441, + "num_tokens": 790912827.0, + "step": 4707 + }, + { + "entropy": 1.701870898405711, + "epoch": 0.5172063387437862, + "grad_norm": 0.6132122874259949, + "learning_rate": 1.7645747769743852e-05, + "loss": 1.4732, + "mean_token_accuracy": 0.6592078804969788, + "num_tokens": 791053561.0, + "step": 4708 + }, + { + "entropy": 1.6868782341480255, + "epoch": 0.5173161956551592, + "grad_norm": 0.7153650522232056, + "learning_rate": 1.764466795135023e-05, + "loss": 1.4347, + "mean_token_accuracy": 0.6551420340935389, + "num_tokens": 791206764.0, + "step": 4709 + }, + { + "entropy": 1.6517098446687062, + "epoch": 0.5174260525665321, + "grad_norm": 0.6770890355110168, + "learning_rate": 1.7643587922655855e-05, + "loss": 1.4078, + "mean_token_accuracy": 0.6538704832394918, + "num_tokens": 791388634.0, + "step": 4710 + }, + { + "entropy": 1.6130631566047668, + "epoch": 0.5175359094779051, + "grad_norm": 0.6374915838241577, + "learning_rate": 1.7642507683694924e-05, + "loss": 1.3895, + "mean_token_accuracy": 0.6648249477148056, + "num_tokens": 791576408.0, + "step": 4711 + }, + { + "entropy": 1.6589768826961517, + "epoch": 0.517645766389278, + "grad_norm": 0.5480639338493347, + "learning_rate": 1.7641427234501614e-05, + "loss": 1.3822, + "mean_token_accuracy": 0.6536247779925665, + "num_tokens": 791848186.0, + "step": 4712 + }, + { + "entropy": 1.7161799172560375, + "epoch": 0.517755623300651, + "grad_norm": 0.6144800782203674, + "learning_rate": 1.7640346575110127e-05, + "loss": 1.4613, + "mean_token_accuracy": 0.6567795326312383, + "num_tokens": 792008443.0, + "step": 4713 + }, + { + "entropy": 1.7252692977587383, + "epoch": 0.5178654802120238, + "grad_norm": 0.6382774114608765, + "learning_rate": 1.7639265705554664e-05, + "loss": 1.4068, + "mean_token_accuracy": 0.6472566872835159, + "num_tokens": 792193171.0, + "step": 4714 + }, + { + "entropy": 1.6851609845956166, + "epoch": 0.5179753371233968, + "grad_norm": 0.5510247945785522, + "learning_rate": 1.763818462586943e-05, + "loss": 1.4839, + "mean_token_accuracy": 0.6341405063867569, + "num_tokens": 792463312.0, + "step": 4715 + }, + { + "entropy": 1.712292383114497, + "epoch": 0.5180851940347697, + "grad_norm": 0.666677713394165, + "learning_rate": 1.7637103336088642e-05, + "loss": 1.4422, + "mean_token_accuracy": 0.6612060219049454, + "num_tokens": 792584786.0, + "step": 4716 + }, + { + "entropy": 1.6544869939486186, + "epoch": 0.5181950509461426, + "grad_norm": 0.7078261375427246, + "learning_rate": 1.7636021836246527e-05, + "loss": 1.3223, + "mean_token_accuracy": 0.6622943033774694, + "num_tokens": 792722437.0, + "step": 4717 + }, + { + "entropy": 1.7123637199401855, + "epoch": 0.5183049078575156, + "grad_norm": 0.6480149030685425, + "learning_rate": 1.7634940126377315e-05, + "loss": 1.3032, + "mean_token_accuracy": 0.6672448466221491, + "num_tokens": 792887386.0, + "step": 4718 + }, + { + "entropy": 1.7484397490819295, + "epoch": 0.5184147647688885, + "grad_norm": 0.782447099685669, + "learning_rate": 1.7633858206515234e-05, + "loss": 1.3804, + "mean_token_accuracy": 0.6604089935620626, + "num_tokens": 793053286.0, + "step": 4719 + }, + { + "entropy": 1.6887016395727794, + "epoch": 0.5185246216802615, + "grad_norm": 0.7643845081329346, + "learning_rate": 1.763277607669453e-05, + "loss": 1.277, + "mean_token_accuracy": 0.6705976724624634, + "num_tokens": 793189872.0, + "step": 4720 + }, + { + "entropy": 1.689270446697871, + "epoch": 0.5186344785916344, + "grad_norm": 0.6115739941596985, + "learning_rate": 1.7631693736949452e-05, + "loss": 1.3857, + "mean_token_accuracy": 0.6669471363226572, + "num_tokens": 793367014.0, + "step": 4721 + }, + { + "entropy": 1.6571108798185985, + "epoch": 0.5187443355030074, + "grad_norm": 0.6382631659507751, + "learning_rate": 1.7630611187314255e-05, + "loss": 1.3474, + "mean_token_accuracy": 0.6653915196657181, + "num_tokens": 793566430.0, + "step": 4722 + }, + { + "entropy": 1.7158975899219513, + "epoch": 0.5188541924143802, + "grad_norm": 0.6732120513916016, + "learning_rate": 1.7629528427823204e-05, + "loss": 1.4011, + "mean_token_accuracy": 0.6585634350776672, + "num_tokens": 793742006.0, + "step": 4723 + }, + { + "entropy": 1.7122439642747243, + "epoch": 0.5189640493257532, + "grad_norm": 0.672660231590271, + "learning_rate": 1.7628445458510564e-05, + "loss": 1.347, + "mean_token_accuracy": 0.6541797667741776, + "num_tokens": 793896843.0, + "step": 4724 + }, + { + "entropy": 1.6989329655965169, + "epoch": 0.5190739062371261, + "grad_norm": 0.6647095084190369, + "learning_rate": 1.7627362279410612e-05, + "loss": 1.3124, + "mean_token_accuracy": 0.6610298504432043, + "num_tokens": 794009999.0, + "step": 4725 + }, + { + "entropy": 1.6795764764149983, + "epoch": 0.5191837631484991, + "grad_norm": 0.6302659511566162, + "learning_rate": 1.7626278890557634e-05, + "loss": 1.3697, + "mean_token_accuracy": 0.6671308130025864, + "num_tokens": 794196147.0, + "step": 4726 + }, + { + "entropy": 1.6657175024350483, + "epoch": 0.519293620059872, + "grad_norm": 0.6287113428115845, + "learning_rate": 1.762519529198591e-05, + "loss": 1.3875, + "mean_token_accuracy": 0.6727512627840042, + "num_tokens": 794431033.0, + "step": 4727 + }, + { + "entropy": 1.678794761498769, + "epoch": 0.519403476971245, + "grad_norm": 0.7044976949691772, + "learning_rate": 1.762411148372974e-05, + "loss": 1.3174, + "mean_token_accuracy": 0.654426708817482, + "num_tokens": 794584379.0, + "step": 4728 + }, + { + "entropy": 1.6541787485281627, + "epoch": 0.5195133338826179, + "grad_norm": 0.5799862146377563, + "learning_rate": 1.762302746582343e-05, + "loss": 1.3385, + "mean_token_accuracy": 0.6738729576269785, + "num_tokens": 794760900.0, + "step": 4729 + }, + { + "entropy": 1.7091464201609294, + "epoch": 0.5196231907939908, + "grad_norm": 0.664982259273529, + "learning_rate": 1.762194323830128e-05, + "loss": 1.3998, + "mean_token_accuracy": 0.6595932294925054, + "num_tokens": 794928750.0, + "step": 4730 + }, + { + "entropy": 1.6746362348397572, + "epoch": 0.5197330477053638, + "grad_norm": 0.5980947613716125, + "learning_rate": 1.7620858801197617e-05, + "loss": 1.3872, + "mean_token_accuracy": 0.6583947539329529, + "num_tokens": 795101737.0, + "step": 4731 + }, + { + "entropy": 1.713165243466695, + "epoch": 0.5198429046167367, + "grad_norm": 0.7423360347747803, + "learning_rate": 1.761977415454675e-05, + "loss": 1.3698, + "mean_token_accuracy": 0.649875541528066, + "num_tokens": 795282151.0, + "step": 4732 + }, + { + "entropy": 1.7285096148649852, + "epoch": 0.5199527615281097, + "grad_norm": 0.7760996222496033, + "learning_rate": 1.761868929838302e-05, + "loss": 1.2879, + "mean_token_accuracy": 0.6752708901961645, + "num_tokens": 795420946.0, + "step": 4733 + }, + { + "entropy": 1.730550895134608, + "epoch": 0.5200626184394825, + "grad_norm": 0.6987181901931763, + "learning_rate": 1.761760423274075e-05, + "loss": 1.36, + "mean_token_accuracy": 0.6588483999172846, + "num_tokens": 795599685.0, + "step": 4734 + }, + { + "entropy": 1.6970980167388916, + "epoch": 0.5201724753508555, + "grad_norm": 0.6524776220321655, + "learning_rate": 1.761651895765429e-05, + "loss": 1.4112, + "mean_token_accuracy": 0.6538165758053461, + "num_tokens": 795780156.0, + "step": 4735 + }, + { + "entropy": 1.6695838073889415, + "epoch": 0.5202823322622284, + "grad_norm": 0.7201446890830994, + "learning_rate": 1.7615433473157993e-05, + "loss": 1.4756, + "mean_token_accuracy": 0.6536561946074168, + "num_tokens": 795962820.0, + "step": 4736 + }, + { + "entropy": 1.7082595427831013, + "epoch": 0.5203921891736014, + "grad_norm": 0.6136282682418823, + "learning_rate": 1.76143477792862e-05, + "loss": 1.3876, + "mean_token_accuracy": 0.6512712786595026, + "num_tokens": 796138721.0, + "step": 4737 + }, + { + "entropy": 1.6985305150349934, + "epoch": 0.5205020460849743, + "grad_norm": 0.7493578791618347, + "learning_rate": 1.7613261876073285e-05, + "loss": 1.3506, + "mean_token_accuracy": 0.6512027581532797, + "num_tokens": 796276157.0, + "step": 4738 + }, + { + "entropy": 1.7088461021582286, + "epoch": 0.5206119029963473, + "grad_norm": 0.7467851042747498, + "learning_rate": 1.7612175763553607e-05, + "loss": 1.4494, + "mean_token_accuracy": 0.6403765877087911, + "num_tokens": 796476359.0, + "step": 4739 + }, + { + "entropy": 1.6997049450874329, + "epoch": 0.5207217599077202, + "grad_norm": 0.7162910103797913, + "learning_rate": 1.7611089441761548e-05, + "loss": 1.3843, + "mean_token_accuracy": 0.647352010011673, + "num_tokens": 796648058.0, + "step": 4740 + }, + { + "entropy": 1.7611575225989025, + "epoch": 0.5208316168190932, + "grad_norm": 0.7801529765129089, + "learning_rate": 1.7610002910731486e-05, + "loss": 1.4835, + "mean_token_accuracy": 0.6569034606218338, + "num_tokens": 796791732.0, + "step": 4741 + }, + { + "entropy": 1.7292684316635132, + "epoch": 0.5209414737304661, + "grad_norm": 0.6974871754646301, + "learning_rate": 1.7608916170497812e-05, + "loss": 1.4167, + "mean_token_accuracy": 0.6439520965019861, + "num_tokens": 796942236.0, + "step": 4742 + }, + { + "entropy": 1.7139594753583272, + "epoch": 0.521051330641839, + "grad_norm": 0.7737529277801514, + "learning_rate": 1.7607829221094922e-05, + "loss": 1.5007, + "mean_token_accuracy": 0.6438900580008825, + "num_tokens": 797114208.0, + "step": 4743 + }, + { + "entropy": 1.653033008178075, + "epoch": 0.521161187553212, + "grad_norm": 0.7125444412231445, + "learning_rate": 1.760674206255721e-05, + "loss": 1.3416, + "mean_token_accuracy": 0.6721793164809545, + "num_tokens": 797292358.0, + "step": 4744 + }, + { + "entropy": 1.7628530263900757, + "epoch": 0.5212710444645848, + "grad_norm": 0.7119945883750916, + "learning_rate": 1.760565469491909e-05, + "loss": 1.2987, + "mean_token_accuracy": 0.6636313299338022, + "num_tokens": 797431327.0, + "step": 4745 + }, + { + "entropy": 1.748626043399175, + "epoch": 0.5213809013759578, + "grad_norm": 0.7918199300765991, + "learning_rate": 1.7604567118214975e-05, + "loss": 1.4559, + "mean_token_accuracy": 0.6555203547080358, + "num_tokens": 797555640.0, + "step": 4746 + }, + { + "entropy": 1.6953304906686146, + "epoch": 0.5214907582873307, + "grad_norm": 0.9008361101150513, + "learning_rate": 1.7603479332479284e-05, + "loss": 1.3674, + "mean_token_accuracy": 0.6686960806449255, + "num_tokens": 797751389.0, + "step": 4747 + }, + { + "entropy": 1.6819122731685638, + "epoch": 0.5216006151987037, + "grad_norm": 0.6910074949264526, + "learning_rate": 1.7602391337746458e-05, + "loss": 1.2831, + "mean_token_accuracy": 0.6620205889145533, + "num_tokens": 797870103.0, + "step": 4748 + }, + { + "entropy": 1.7419504225254059, + "epoch": 0.5217104721100766, + "grad_norm": 0.6273844242095947, + "learning_rate": 1.760130313405091e-05, + "loss": 1.3587, + "mean_token_accuracy": 0.6500416547060013, + "num_tokens": 798027954.0, + "step": 4749 + }, + { + "entropy": 1.7702117661635082, + "epoch": 0.5218203290214496, + "grad_norm": 0.8262366652488708, + "learning_rate": 1.76002147214271e-05, + "loss": 1.52, + "mean_token_accuracy": 0.6454491962989172, + "num_tokens": 798183172.0, + "step": 4750 + }, + { + "entropy": 1.6769113938013713, + "epoch": 0.5219301859328225, + "grad_norm": 0.6600481271743774, + "learning_rate": 1.7599126099909464e-05, + "loss": 1.6274, + "mean_token_accuracy": 0.6358497887849808, + "num_tokens": 798411244.0, + "step": 4751 + }, + { + "entropy": 1.712651213010152, + "epoch": 0.5220400428441955, + "grad_norm": 0.8367064595222473, + "learning_rate": 1.759803726953246e-05, + "loss": 1.3546, + "mean_token_accuracy": 0.6678621719280878, + "num_tokens": 798620785.0, + "step": 4752 + }, + { + "entropy": 1.7246687213579814, + "epoch": 0.5221498997555684, + "grad_norm": 0.7160963416099548, + "learning_rate": 1.759694823033055e-05, + "loss": 1.4249, + "mean_token_accuracy": 0.6564734677473704, + "num_tokens": 798805644.0, + "step": 4753 + }, + { + "entropy": 1.7259255250295003, + "epoch": 0.5222597566669414, + "grad_norm": 0.7170692682266235, + "learning_rate": 1.7595858982338204e-05, + "loss": 1.3938, + "mean_token_accuracy": 0.6538248707850774, + "num_tokens": 798956926.0, + "step": 4754 + }, + { + "entropy": 1.7542922695477803, + "epoch": 0.5223696135783142, + "grad_norm": 0.6757575869560242, + "learning_rate": 1.759476952558989e-05, + "loss": 1.5494, + "mean_token_accuracy": 0.6471764942010244, + "num_tokens": 799127230.0, + "step": 4755 + }, + { + "entropy": 1.6838472684224446, + "epoch": 0.5224794704896872, + "grad_norm": 0.8514281511306763, + "learning_rate": 1.7593679860120097e-05, + "loss": 1.3779, + "mean_token_accuracy": 0.6497372736533483, + "num_tokens": 799315432.0, + "step": 4756 + }, + { + "entropy": 1.6587632795174916, + "epoch": 0.5225893274010601, + "grad_norm": 0.6651864647865295, + "learning_rate": 1.7592589985963303e-05, + "loss": 1.4102, + "mean_token_accuracy": 0.6655519803365072, + "num_tokens": 799512409.0, + "step": 4757 + }, + { + "entropy": 1.7543583710988362, + "epoch": 0.522699184312433, + "grad_norm": 0.798579216003418, + "learning_rate": 1.759149990315401e-05, + "loss": 1.368, + "mean_token_accuracy": 0.6612423459688822, + "num_tokens": 799651240.0, + "step": 4758 + }, + { + "entropy": 1.6546010772387187, + "epoch": 0.522809041223806, + "grad_norm": 0.7455418109893799, + "learning_rate": 1.759040961172671e-05, + "loss": 1.3556, + "mean_token_accuracy": 0.67067651450634, + "num_tokens": 799845563.0, + "step": 4759 + }, + { + "entropy": 1.7300900121529896, + "epoch": 0.5229188981351789, + "grad_norm": 0.7077800631523132, + "learning_rate": 1.758931911171592e-05, + "loss": 1.4215, + "mean_token_accuracy": 0.6541052609682083, + "num_tokens": 800017346.0, + "step": 4760 + }, + { + "entropy": 1.7226456105709076, + "epoch": 0.5230287550465519, + "grad_norm": 0.5930922627449036, + "learning_rate": 1.758822840315615e-05, + "loss": 1.4411, + "mean_token_accuracy": 0.6517085035641988, + "num_tokens": 800192442.0, + "step": 4761 + }, + { + "entropy": 1.7274209260940552, + "epoch": 0.5231386119579248, + "grad_norm": 0.6994463205337524, + "learning_rate": 1.7587137486081916e-05, + "loss": 1.4818, + "mean_token_accuracy": 0.6374549319346746, + "num_tokens": 800395066.0, + "step": 4762 + }, + { + "entropy": 1.7726040482521057, + "epoch": 0.5232484688692978, + "grad_norm": 0.8898850083351135, + "learning_rate": 1.7586046360527753e-05, + "loss": 1.5346, + "mean_token_accuracy": 0.6310683737198511, + "num_tokens": 800573953.0, + "step": 4763 + }, + { + "entropy": 1.7151733040809631, + "epoch": 0.5233583257806707, + "grad_norm": 0.756864070892334, + "learning_rate": 1.758495502652819e-05, + "loss": 1.3788, + "mean_token_accuracy": 0.658622587720553, + "num_tokens": 800712722.0, + "step": 4764 + }, + { + "entropy": 1.747101644674937, + "epoch": 0.5234681826920436, + "grad_norm": 0.7063867449760437, + "learning_rate": 1.7583863484117766e-05, + "loss": 1.3973, + "mean_token_accuracy": 0.652265245715777, + "num_tokens": 800860313.0, + "step": 4765 + }, + { + "entropy": 1.7522001167138417, + "epoch": 0.5235780396034165, + "grad_norm": 0.5486934781074524, + "learning_rate": 1.7582771733331027e-05, + "loss": 1.4967, + "mean_token_accuracy": 0.6310158222913742, + "num_tokens": 801086371.0, + "step": 4766 + }, + { + "entropy": 1.716380735238393, + "epoch": 0.5236878965147895, + "grad_norm": 0.6977860927581787, + "learning_rate": 1.7581679774202534e-05, + "loss": 1.43, + "mean_token_accuracy": 0.652380645275116, + "num_tokens": 801271543.0, + "step": 4767 + }, + { + "entropy": 1.7060090104738872, + "epoch": 0.5237977534261624, + "grad_norm": 0.6787402033805847, + "learning_rate": 1.7580587606766838e-05, + "loss": 1.543, + "mean_token_accuracy": 0.6341428657372793, + "num_tokens": 801457586.0, + "step": 4768 + }, + { + "entropy": 1.682382086912791, + "epoch": 0.5239076103375354, + "grad_norm": 0.7111299633979797, + "learning_rate": 1.757949523105851e-05, + "loss": 1.244, + "mean_token_accuracy": 0.6785935560862223, + "num_tokens": 801599366.0, + "step": 4769 + }, + { + "entropy": 1.7559874653816223, + "epoch": 0.5240174672489083, + "grad_norm": 0.6998342275619507, + "learning_rate": 1.7578402647112124e-05, + "loss": 1.4426, + "mean_token_accuracy": 0.6572673618793488, + "num_tokens": 801751007.0, + "step": 4770 + }, + { + "entropy": 1.7509790360927582, + "epoch": 0.5241273241602812, + "grad_norm": 1.142534613609314, + "learning_rate": 1.7577309854962256e-05, + "loss": 1.1564, + "mean_token_accuracy": 0.6708057522773743, + "num_tokens": 801943593.0, + "step": 4771 + }, + { + "entropy": 1.6971227129300435, + "epoch": 0.5242371810716542, + "grad_norm": 0.6250020861625671, + "learning_rate": 1.75762168546435e-05, + "loss": 1.5585, + "mean_token_accuracy": 0.6346791485945383, + "num_tokens": 802173777.0, + "step": 4772 + }, + { + "entropy": 1.695990224679311, + "epoch": 0.5243470379830271, + "grad_norm": 0.7434117794036865, + "learning_rate": 1.757512364619044e-05, + "loss": 1.311, + "mean_token_accuracy": 0.6738946636517843, + "num_tokens": 802347486.0, + "step": 4773 + }, + { + "entropy": 1.6820709705352783, + "epoch": 0.5244568948944001, + "grad_norm": 0.6679350137710571, + "learning_rate": 1.757403022963768e-05, + "loss": 1.3477, + "mean_token_accuracy": 0.6635241111119589, + "num_tokens": 802506390.0, + "step": 4774 + }, + { + "entropy": 1.6601552367210388, + "epoch": 0.524566751805773, + "grad_norm": 0.7635940313339233, + "learning_rate": 1.757293660501983e-05, + "loss": 1.4112, + "mean_token_accuracy": 0.6675299257040024, + "num_tokens": 802666473.0, + "step": 4775 + }, + { + "entropy": 1.651892900466919, + "epoch": 0.5246766087171459, + "grad_norm": 0.6960279941558838, + "learning_rate": 1.757184277237149e-05, + "loss": 1.3256, + "mean_token_accuracy": 0.6727364957332611, + "num_tokens": 802810524.0, + "step": 4776 + }, + { + "entropy": 1.704396516084671, + "epoch": 0.5247864656285188, + "grad_norm": 0.5887051820755005, + "learning_rate": 1.7570748731727293e-05, + "loss": 1.3548, + "mean_token_accuracy": 0.6513074586788813, + "num_tokens": 803037321.0, + "step": 4777 + }, + { + "entropy": 1.7361102004845936, + "epoch": 0.5248963225398918, + "grad_norm": 0.8553687930107117, + "learning_rate": 1.7569654483121857e-05, + "loss": 1.4555, + "mean_token_accuracy": 0.652128721276919, + "num_tokens": 803197712.0, + "step": 4778 + }, + { + "entropy": 1.7074143290519714, + "epoch": 0.5250061794512647, + "grad_norm": 0.6320570111274719, + "learning_rate": 1.7568560026589818e-05, + "loss": 1.3462, + "mean_token_accuracy": 0.6641202121973038, + "num_tokens": 803369072.0, + "step": 4779 + }, + { + "entropy": 1.7333478927612305, + "epoch": 0.5251160363626377, + "grad_norm": 0.776484489440918, + "learning_rate": 1.7567465362165818e-05, + "loss": 1.6121, + "mean_token_accuracy": 0.62413057188193, + "num_tokens": 803557142.0, + "step": 4780 + }, + { + "entropy": 1.7478283047676086, + "epoch": 0.5252258932740106, + "grad_norm": 0.6890655755996704, + "learning_rate": 1.756637048988449e-05, + "loss": 1.5187, + "mean_token_accuracy": 0.6530665705601374, + "num_tokens": 803710112.0, + "step": 4781 + }, + { + "entropy": 1.6944151123364766, + "epoch": 0.5253357501853836, + "grad_norm": 0.8545740842819214, + "learning_rate": 1.7565275409780504e-05, + "loss": 1.5678, + "mean_token_accuracy": 0.6560395757357279, + "num_tokens": 803873255.0, + "step": 4782 + }, + { + "entropy": 1.754347950220108, + "epoch": 0.5254456070967565, + "grad_norm": 0.717082679271698, + "learning_rate": 1.7564180121888504e-05, + "loss": 1.4302, + "mean_token_accuracy": 0.6466685732205709, + "num_tokens": 804002011.0, + "step": 4783 + }, + { + "entropy": 1.6676461199919383, + "epoch": 0.5255554640081294, + "grad_norm": 0.7122258543968201, + "learning_rate": 1.756308462624316e-05, + "loss": 1.2871, + "mean_token_accuracy": 0.681659941871961, + "num_tokens": 804132887.0, + "step": 4784 + }, + { + "entropy": 1.6842567523320515, + "epoch": 0.5256653209195024, + "grad_norm": 0.7058034539222717, + "learning_rate": 1.7561988922879147e-05, + "loss": 1.2925, + "mean_token_accuracy": 0.667713056008021, + "num_tokens": 804271045.0, + "step": 4785 + }, + { + "entropy": 1.6645110448201497, + "epoch": 0.5257751778308752, + "grad_norm": 0.6072272062301636, + "learning_rate": 1.756089301183114e-05, + "loss": 1.413, + "mean_token_accuracy": 0.6601279675960541, + "num_tokens": 804484901.0, + "step": 4786 + }, + { + "entropy": 1.6867012182871501, + "epoch": 0.5258850347422482, + "grad_norm": 0.734171986579895, + "learning_rate": 1.755979689313383e-05, + "loss": 1.4347, + "mean_token_accuracy": 0.6705892930428187, + "num_tokens": 804631499.0, + "step": 4787 + }, + { + "entropy": 1.7348099152247112, + "epoch": 0.5259948916536211, + "grad_norm": 0.6457310914993286, + "learning_rate": 1.75587005668219e-05, + "loss": 1.3843, + "mean_token_accuracy": 0.6592828581730524, + "num_tokens": 804800057.0, + "step": 4788 + }, + { + "entropy": 1.7324989934762318, + "epoch": 0.5261047485649941, + "grad_norm": 0.7020387053489685, + "learning_rate": 1.7557604032930056e-05, + "loss": 1.3377, + "mean_token_accuracy": 0.6654490580161413, + "num_tokens": 804932809.0, + "step": 4789 + }, + { + "entropy": 1.7356117367744446, + "epoch": 0.526214605476367, + "grad_norm": 0.8487410545349121, + "learning_rate": 1.7556507291493e-05, + "loss": 1.5411, + "mean_token_accuracy": 0.638851081331571, + "num_tokens": 805093928.0, + "step": 4790 + }, + { + "entropy": 1.66506223877271, + "epoch": 0.52632446238774, + "grad_norm": 0.6197313070297241, + "learning_rate": 1.755541034254544e-05, + "loss": 1.4225, + "mean_token_accuracy": 0.6506545394659042, + "num_tokens": 805275885.0, + "step": 4791 + }, + { + "entropy": 1.6665216783682506, + "epoch": 0.5264343192991129, + "grad_norm": 0.7226223945617676, + "learning_rate": 1.7554313186122095e-05, + "loss": 1.3719, + "mean_token_accuracy": 0.659637118379275, + "num_tokens": 805455227.0, + "step": 4792 + }, + { + "entropy": 1.7102013031641643, + "epoch": 0.5265441762104859, + "grad_norm": 0.7095229625701904, + "learning_rate": 1.7553215822257692e-05, + "loss": 1.4586, + "mean_token_accuracy": 0.6529026329517365, + "num_tokens": 805613449.0, + "step": 4793 + }, + { + "entropy": 1.6817038357257843, + "epoch": 0.5266540331218588, + "grad_norm": 0.6859667301177979, + "learning_rate": 1.7552118250986962e-05, + "loss": 1.3303, + "mean_token_accuracy": 0.6643107682466507, + "num_tokens": 805728020.0, + "step": 4794 + }, + { + "entropy": 1.66180619597435, + "epoch": 0.5267638900332318, + "grad_norm": 0.6296705007553101, + "learning_rate": 1.7551020472344643e-05, + "loss": 1.3457, + "mean_token_accuracy": 0.6573961029450098, + "num_tokens": 805862681.0, + "step": 4795 + }, + { + "entropy": 1.699836363395055, + "epoch": 0.5268737469446046, + "grad_norm": 0.6737419962882996, + "learning_rate": 1.7549922486365478e-05, + "loss": 1.537, + "mean_token_accuracy": 0.6582231894135475, + "num_tokens": 806037799.0, + "step": 4796 + }, + { + "entropy": 1.7254907389481862, + "epoch": 0.5269836038559775, + "grad_norm": 0.7271363735198975, + "learning_rate": 1.7548824293084214e-05, + "loss": 1.4136, + "mean_token_accuracy": 0.6628665079673132, + "num_tokens": 806221209.0, + "step": 4797 + }, + { + "entropy": 1.6994514266649883, + "epoch": 0.5270934607673505, + "grad_norm": 0.7013587951660156, + "learning_rate": 1.7547725892535615e-05, + "loss": 1.3026, + "mean_token_accuracy": 0.6708128750324249, + "num_tokens": 806383779.0, + "step": 4798 + }, + { + "entropy": 1.681582232316335, + "epoch": 0.5272033176787234, + "grad_norm": 0.6710511445999146, + "learning_rate": 1.754662728475444e-05, + "loss": 1.3914, + "mean_token_accuracy": 0.6664837151765823, + "num_tokens": 806556636.0, + "step": 4799 + }, + { + "entropy": 1.7152946889400482, + "epoch": 0.5273131745900964, + "grad_norm": 0.6160458326339722, + "learning_rate": 1.7545528469775467e-05, + "loss": 1.3605, + "mean_token_accuracy": 0.6646227290232977, + "num_tokens": 806699683.0, + "step": 4800 + }, + { + "entropy": 1.6824420094490051, + "epoch": 0.5274230315014693, + "grad_norm": 0.7939539551734924, + "learning_rate": 1.7544429447633464e-05, + "loss": 1.3189, + "mean_token_accuracy": 0.6681206673383713, + "num_tokens": 806867720.0, + "step": 4801 + }, + { + "entropy": 1.6991462310155232, + "epoch": 0.5275328884128423, + "grad_norm": 0.6454995274543762, + "learning_rate": 1.7543330218363214e-05, + "loss": 1.4584, + "mean_token_accuracy": 0.6686960806449255, + "num_tokens": 807035383.0, + "step": 4802 + }, + { + "entropy": 1.7097290853659313, + "epoch": 0.5276427453242152, + "grad_norm": 0.6477057933807373, + "learning_rate": 1.7542230781999518e-05, + "loss": 1.2847, + "mean_token_accuracy": 0.671577995022138, + "num_tokens": 807190570.0, + "step": 4803 + }, + { + "entropy": 1.7435030043125153, + "epoch": 0.5277526022355882, + "grad_norm": 0.72170490026474, + "learning_rate": 1.754113113857716e-05, + "loss": 1.4119, + "mean_token_accuracy": 0.6534734964370728, + "num_tokens": 807344375.0, + "step": 4804 + }, + { + "entropy": 1.6681690216064453, + "epoch": 0.5278624591469611, + "grad_norm": 0.6512613296508789, + "learning_rate": 1.754003128813095e-05, + "loss": 1.3037, + "mean_token_accuracy": 0.6649158795674642, + "num_tokens": 807482406.0, + "step": 4805 + }, + { + "entropy": 1.7332804004351299, + "epoch": 0.5279723160583341, + "grad_norm": 0.6607586741447449, + "learning_rate": 1.75389312306957e-05, + "loss": 1.5653, + "mean_token_accuracy": 0.6444868743419647, + "num_tokens": 807672269.0, + "step": 4806 + }, + { + "entropy": 1.722442050774892, + "epoch": 0.5280821729697069, + "grad_norm": 0.7250016927719116, + "learning_rate": 1.7537830966306224e-05, + "loss": 1.3925, + "mean_token_accuracy": 0.6732650498549143, + "num_tokens": 807822790.0, + "step": 4807 + }, + { + "entropy": 1.7573895851771038, + "epoch": 0.5281920298810799, + "grad_norm": 0.7589662671089172, + "learning_rate": 1.753673049499734e-05, + "loss": 1.3706, + "mean_token_accuracy": 0.6612446457147598, + "num_tokens": 807990370.0, + "step": 4808 + }, + { + "entropy": 1.6575620273749034, + "epoch": 0.5283018867924528, + "grad_norm": 0.5413112044334412, + "learning_rate": 1.753562981680388e-05, + "loss": 1.3177, + "mean_token_accuracy": 0.649698426326116, + "num_tokens": 808176218.0, + "step": 4809 + }, + { + "entropy": 1.728501945734024, + "epoch": 0.5284117437038258, + "grad_norm": 0.6827234029769897, + "learning_rate": 1.7534528931760683e-05, + "loss": 1.3945, + "mean_token_accuracy": 0.6664670258760452, + "num_tokens": 808328962.0, + "step": 4810 + }, + { + "entropy": 1.6973057091236115, + "epoch": 0.5285216006151987, + "grad_norm": 0.6527639627456665, + "learning_rate": 1.753342783990259e-05, + "loss": 1.3681, + "mean_token_accuracy": 0.6585773775974909, + "num_tokens": 808462565.0, + "step": 4811 + }, + { + "entropy": 1.7292284766832988, + "epoch": 0.5286314575265716, + "grad_norm": 0.8234806656837463, + "learning_rate": 1.7532326541264454e-05, + "loss": 1.5218, + "mean_token_accuracy": 0.6624507009983063, + "num_tokens": 808601980.0, + "step": 4812 + }, + { + "entropy": 1.6868124802907307, + "epoch": 0.5287413144379446, + "grad_norm": 0.5748288631439209, + "learning_rate": 1.753122503588112e-05, + "loss": 1.4779, + "mean_token_accuracy": 0.6543243726094564, + "num_tokens": 808866293.0, + "step": 4813 + }, + { + "entropy": 1.7639289100964863, + "epoch": 0.5288511713493175, + "grad_norm": 0.7285853624343872, + "learning_rate": 1.753012332378746e-05, + "loss": 1.4627, + "mean_token_accuracy": 0.6443410267432531, + "num_tokens": 809048139.0, + "step": 4814 + }, + { + "entropy": 1.7024679978688557, + "epoch": 0.5289610282606905, + "grad_norm": 0.7826334238052368, + "learning_rate": 1.752902140501834e-05, + "loss": 1.4016, + "mean_token_accuracy": 0.6619109660387039, + "num_tokens": 809214733.0, + "step": 4815 + }, + { + "entropy": 1.630326509475708, + "epoch": 0.5290708851720634, + "grad_norm": 0.7884056568145752, + "learning_rate": 1.7527919279608633e-05, + "loss": 1.3239, + "mean_token_accuracy": 0.6573766022920609, + "num_tokens": 809357746.0, + "step": 4816 + }, + { + "entropy": 1.7573048671086628, + "epoch": 0.5291807420834364, + "grad_norm": 0.8989459276199341, + "learning_rate": 1.7526816947593224e-05, + "loss": 1.4258, + "mean_token_accuracy": 0.6477284530798594, + "num_tokens": 809518204.0, + "step": 4817 + }, + { + "entropy": 1.635603408018748, + "epoch": 0.5292905989948092, + "grad_norm": 0.6966229677200317, + "learning_rate": 1.7525714409006998e-05, + "loss": 1.3025, + "mean_token_accuracy": 0.6680015424887339, + "num_tokens": 809662973.0, + "step": 4818 + }, + { + "entropy": 1.6825711230436962, + "epoch": 0.5294004559061822, + "grad_norm": 0.6718734502792358, + "learning_rate": 1.7524611663884852e-05, + "loss": 1.4107, + "mean_token_accuracy": 0.6541168093681335, + "num_tokens": 809835552.0, + "step": 4819 + }, + { + "entropy": 1.707334001859029, + "epoch": 0.5295103128175551, + "grad_norm": 0.687263548374176, + "learning_rate": 1.7523508712261685e-05, + "loss": 1.3849, + "mean_token_accuracy": 0.6574449588855108, + "num_tokens": 810024901.0, + "step": 4820 + }, + { + "entropy": 1.713168462117513, + "epoch": 0.5296201697289281, + "grad_norm": 0.6556559801101685, + "learning_rate": 1.752240555417241e-05, + "loss": 1.425, + "mean_token_accuracy": 0.6502045691013336, + "num_tokens": 810199634.0, + "step": 4821 + }, + { + "entropy": 1.7425429324309032, + "epoch": 0.529730026640301, + "grad_norm": 0.6769330501556396, + "learning_rate": 1.7521302189651937e-05, + "loss": 1.332, + "mean_token_accuracy": 0.6587035904328028, + "num_tokens": 810362916.0, + "step": 4822 + }, + { + "entropy": 1.6665611068407695, + "epoch": 0.529839883551674, + "grad_norm": 0.8234541416168213, + "learning_rate": 1.752019861873519e-05, + "loss": 1.3859, + "mean_token_accuracy": 0.6658773571252823, + "num_tokens": 810514991.0, + "step": 4823 + }, + { + "entropy": 1.6548964281876881, + "epoch": 0.5299497404630469, + "grad_norm": 0.6913493275642395, + "learning_rate": 1.7519094841457092e-05, + "loss": 1.4466, + "mean_token_accuracy": 0.6507799476385117, + "num_tokens": 810675171.0, + "step": 4824 + }, + { + "entropy": 1.73293998837471, + "epoch": 0.5300595973744198, + "grad_norm": 0.7102120518684387, + "learning_rate": 1.751799085785258e-05, + "loss": 1.4008, + "mean_token_accuracy": 0.661358987291654, + "num_tokens": 810802681.0, + "step": 4825 + }, + { + "entropy": 1.7738582690556843, + "epoch": 0.5301694542857928, + "grad_norm": 0.6953791379928589, + "learning_rate": 1.7516886667956596e-05, + "loss": 1.4149, + "mean_token_accuracy": 0.6516300787528356, + "num_tokens": 811021221.0, + "step": 4826 + }, + { + "entropy": 1.7118199865023296, + "epoch": 0.5302793111971656, + "grad_norm": 0.6161823272705078, + "learning_rate": 1.7515782271804084e-05, + "loss": 1.4182, + "mean_token_accuracy": 0.6468035380045573, + "num_tokens": 811208787.0, + "step": 4827 + }, + { + "entropy": 1.7987407644589741, + "epoch": 0.5303891681085386, + "grad_norm": 0.8192143440246582, + "learning_rate": 1.7514677669430003e-05, + "loss": 1.3789, + "mean_token_accuracy": 0.6606917083263397, + "num_tokens": 811338291.0, + "step": 4828 + }, + { + "entropy": 1.6597908238569896, + "epoch": 0.5304990250199115, + "grad_norm": 0.833269476890564, + "learning_rate": 1.7513572860869306e-05, + "loss": 1.1746, + "mean_token_accuracy": 0.6920550564924876, + "num_tokens": 811441985.0, + "step": 4829 + }, + { + "entropy": 1.7635074357191722, + "epoch": 0.5306088819312845, + "grad_norm": 0.7363488078117371, + "learning_rate": 1.751246784615696e-05, + "loss": 1.4692, + "mean_token_accuracy": 0.6375128527482351, + "num_tokens": 811634058.0, + "step": 4830 + }, + { + "entropy": 1.7353008687496185, + "epoch": 0.5307187388426574, + "grad_norm": 0.7418941259384155, + "learning_rate": 1.7511362625327947e-05, + "loss": 1.4245, + "mean_token_accuracy": 0.6573583434025446, + "num_tokens": 811760632.0, + "step": 4831 + }, + { + "entropy": 1.7234211166699727, + "epoch": 0.5308285957540304, + "grad_norm": 0.6825436353683472, + "learning_rate": 1.751025719841724e-05, + "loss": 1.3955, + "mean_token_accuracy": 0.6559055695931116, + "num_tokens": 811922986.0, + "step": 4832 + }, + { + "entropy": 1.7144565085570018, + "epoch": 0.5309384526654033, + "grad_norm": 0.6612206101417542, + "learning_rate": 1.7509151565459823e-05, + "loss": 1.5228, + "mean_token_accuracy": 0.6367465257644653, + "num_tokens": 812112649.0, + "step": 4833 + }, + { + "entropy": 1.7186235984166462, + "epoch": 0.5310483095767763, + "grad_norm": 0.6164059638977051, + "learning_rate": 1.7508045726490695e-05, + "loss": 1.3859, + "mean_token_accuracy": 0.655776783823967, + "num_tokens": 812281703.0, + "step": 4834 + }, + { + "entropy": 1.7127485771973927, + "epoch": 0.5311581664881492, + "grad_norm": 0.8158262372016907, + "learning_rate": 1.750693968154485e-05, + "loss": 1.4447, + "mean_token_accuracy": 0.6493441561857859, + "num_tokens": 812448324.0, + "step": 4835 + }, + { + "entropy": 1.696526567141215, + "epoch": 0.5312680233995222, + "grad_norm": 0.7266950011253357, + "learning_rate": 1.7505833430657298e-05, + "loss": 1.3453, + "mean_token_accuracy": 0.6664767960707346, + "num_tokens": 812601651.0, + "step": 4836 + }, + { + "entropy": 1.7216827968756359, + "epoch": 0.5313778803108951, + "grad_norm": 0.7393192052841187, + "learning_rate": 1.7504726973863053e-05, + "loss": 1.4682, + "mean_token_accuracy": 0.6517121444145838, + "num_tokens": 812800925.0, + "step": 4837 + }, + { + "entropy": 1.7090055843194325, + "epoch": 0.5314877372222679, + "grad_norm": 0.7309879064559937, + "learning_rate": 1.7503620311197124e-05, + "loss": 1.3463, + "mean_token_accuracy": 0.6595309128363928, + "num_tokens": 812928476.0, + "step": 4838 + }, + { + "entropy": 1.6507777372996013, + "epoch": 0.5315975941336409, + "grad_norm": 0.5989612340927124, + "learning_rate": 1.7502513442694546e-05, + "loss": 1.3661, + "mean_token_accuracy": 0.6650850723187128, + "num_tokens": 813101231.0, + "step": 4839 + }, + { + "entropy": 1.780005156993866, + "epoch": 0.5317074510450138, + "grad_norm": 0.8584796786308289, + "learning_rate": 1.7501406368390344e-05, + "loss": 1.6108, + "mean_token_accuracy": 0.6370650803049406, + "num_tokens": 813269613.0, + "step": 4840 + }, + { + "entropy": 1.6397046148777008, + "epoch": 0.5318173079563868, + "grad_norm": 0.7476561665534973, + "learning_rate": 1.7500299088319566e-05, + "loss": 1.4002, + "mean_token_accuracy": 0.6559847990671793, + "num_tokens": 813481357.0, + "step": 4841 + }, + { + "entropy": 1.7163714965184529, + "epoch": 0.5319271648677597, + "grad_norm": 0.7873140573501587, + "learning_rate": 1.7499191602517245e-05, + "loss": 1.4229, + "mean_token_accuracy": 0.6485390017429987, + "num_tokens": 813693879.0, + "step": 4842 + }, + { + "entropy": 1.6578922768433888, + "epoch": 0.5320370217791327, + "grad_norm": 0.7017808556556702, + "learning_rate": 1.749808391101844e-05, + "loss": 1.2177, + "mean_token_accuracy": 0.6817097862561544, + "num_tokens": 813842919.0, + "step": 4843 + }, + { + "entropy": 1.6423076788584392, + "epoch": 0.5321468786905056, + "grad_norm": 0.685593843460083, + "learning_rate": 1.7496976013858207e-05, + "loss": 1.2957, + "mean_token_accuracy": 0.6747591296831766, + "num_tokens": 813995390.0, + "step": 4844 + }, + { + "entropy": 1.7144256333510082, + "epoch": 0.5322567356018786, + "grad_norm": 0.7866110801696777, + "learning_rate": 1.749586791107162e-05, + "loss": 1.3694, + "mean_token_accuracy": 0.6647010346253713, + "num_tokens": 814119386.0, + "step": 4845 + }, + { + "entropy": 1.7141908307870228, + "epoch": 0.5323665925132515, + "grad_norm": 0.6247113943099976, + "learning_rate": 1.749475960269373e-05, + "loss": 1.4549, + "mean_token_accuracy": 0.647007574637731, + "num_tokens": 814297783.0, + "step": 4846 + }, + { + "entropy": 1.7318035264809926, + "epoch": 0.5324764494246245, + "grad_norm": 0.8848351240158081, + "learning_rate": 1.7493651088759628e-05, + "loss": 1.5185, + "mean_token_accuracy": 0.6626861343781153, + "num_tokens": 814495884.0, + "step": 4847 + }, + { + "entropy": 1.7968285183111827, + "epoch": 0.5325863063359974, + "grad_norm": 0.774684727191925, + "learning_rate": 1.7492542369304394e-05, + "loss": 1.4586, + "mean_token_accuracy": 0.6544144451618195, + "num_tokens": 814629185.0, + "step": 4848 + }, + { + "entropy": 1.788926084836324, + "epoch": 0.5326961632473703, + "grad_norm": 0.7674810886383057, + "learning_rate": 1.749143344436312e-05, + "loss": 1.4702, + "mean_token_accuracy": 0.6507733265558878, + "num_tokens": 814789810.0, + "step": 4849 + }, + { + "entropy": 1.7028450568517048, + "epoch": 0.5328060201587432, + "grad_norm": 0.8451623320579529, + "learning_rate": 1.7490324313970905e-05, + "loss": 1.3885, + "mean_token_accuracy": 0.6632057080666224, + "num_tokens": 814936262.0, + "step": 4850 + }, + { + "entropy": 1.7443354924519856, + "epoch": 0.5329158770701162, + "grad_norm": 0.7909703254699707, + "learning_rate": 1.748921497816285e-05, + "loss": 1.4668, + "mean_token_accuracy": 0.6472253054380417, + "num_tokens": 815079833.0, + "step": 4851 + }, + { + "entropy": 1.7053726116816204, + "epoch": 0.5330257339814891, + "grad_norm": 0.7654147148132324, + "learning_rate": 1.7488105436974062e-05, + "loss": 1.5746, + "mean_token_accuracy": 0.6418164720137914, + "num_tokens": 815251335.0, + "step": 4852 + }, + { + "entropy": 1.7527342240015666, + "epoch": 0.533135590892862, + "grad_norm": 0.6501696705818176, + "learning_rate": 1.7486995690439666e-05, + "loss": 1.5457, + "mean_token_accuracy": 0.6359410037597021, + "num_tokens": 815454559.0, + "step": 4853 + }, + { + "entropy": 1.719238390525182, + "epoch": 0.533245447804235, + "grad_norm": 2.1015734672546387, + "learning_rate": 1.7485885738594773e-05, + "loss": 1.2862, + "mean_token_accuracy": 0.663156678279241, + "num_tokens": 815642372.0, + "step": 4854 + }, + { + "entropy": 1.7204244335492451, + "epoch": 0.5333553047156079, + "grad_norm": 0.8685169816017151, + "learning_rate": 1.748477558147452e-05, + "loss": 1.3431, + "mean_token_accuracy": 0.6598953902721405, + "num_tokens": 815782219.0, + "step": 4855 + }, + { + "entropy": 1.7100018362204235, + "epoch": 0.5334651616269809, + "grad_norm": 1.181349515914917, + "learning_rate": 1.7483665219114045e-05, + "loss": 1.1781, + "mean_token_accuracy": 0.6599796116352081, + "num_tokens": 815931852.0, + "step": 4856 + }, + { + "entropy": 1.6601063509782155, + "epoch": 0.5335750185383538, + "grad_norm": 0.7519736289978027, + "learning_rate": 1.7482554651548485e-05, + "loss": 1.4403, + "mean_token_accuracy": 0.6680620610713959, + "num_tokens": 816123952.0, + "step": 4857 + }, + { + "entropy": 1.69512935479482, + "epoch": 0.5336848754497268, + "grad_norm": 0.747626781463623, + "learning_rate": 1.7481443878812996e-05, + "loss": 1.3925, + "mean_token_accuracy": 0.6561353007952372, + "num_tokens": 816256226.0, + "step": 4858 + }, + { + "entropy": 1.726113458474477, + "epoch": 0.5337947323610996, + "grad_norm": 0.671970784664154, + "learning_rate": 1.7480332900942722e-05, + "loss": 1.3526, + "mean_token_accuracy": 0.6597483803828558, + "num_tokens": 816387859.0, + "step": 4859 + }, + { + "entropy": 1.6943202714125316, + "epoch": 0.5339045892724726, + "grad_norm": 0.7271941304206848, + "learning_rate": 1.747922171797284e-05, + "loss": 1.2837, + "mean_token_accuracy": 0.6677757650613785, + "num_tokens": 816532923.0, + "step": 4860 + }, + { + "entropy": 1.7412850956122081, + "epoch": 0.5340144461838455, + "grad_norm": 0.7533559203147888, + "learning_rate": 1.74781103299385e-05, + "loss": 1.4504, + "mean_token_accuracy": 0.664714311559995, + "num_tokens": 816655245.0, + "step": 4861 + }, + { + "entropy": 1.7081489165623982, + "epoch": 0.5341243030952185, + "grad_norm": 0.6002654433250427, + "learning_rate": 1.7476998736874896e-05, + "loss": 1.42, + "mean_token_accuracy": 0.6530174712340037, + "num_tokens": 816838841.0, + "step": 4862 + }, + { + "entropy": 1.7771287858486176, + "epoch": 0.5342341600065914, + "grad_norm": 0.9644994139671326, + "learning_rate": 1.74758869388172e-05, + "loss": 1.2972, + "mean_token_accuracy": 0.6680352141459783, + "num_tokens": 817007159.0, + "step": 4863 + }, + { + "entropy": 1.7283466557661693, + "epoch": 0.5343440169179644, + "grad_norm": 0.7296448945999146, + "learning_rate": 1.7474774935800594e-05, + "loss": 1.474, + "mean_token_accuracy": 0.6520171463489532, + "num_tokens": 817134109.0, + "step": 4864 + }, + { + "entropy": 1.6883817911148071, + "epoch": 0.5344538738293373, + "grad_norm": 0.6540996432304382, + "learning_rate": 1.7473662727860285e-05, + "loss": 1.2972, + "mean_token_accuracy": 0.6820466021696726, + "num_tokens": 817272978.0, + "step": 4865 + }, + { + "entropy": 1.7053470313549042, + "epoch": 0.5345637307407102, + "grad_norm": 0.5956873297691345, + "learning_rate": 1.747255031503146e-05, + "loss": 1.3342, + "mean_token_accuracy": 0.6530234664678574, + "num_tokens": 817430466.0, + "step": 4866 + }, + { + "entropy": 1.725131352742513, + "epoch": 0.5346735876520832, + "grad_norm": 0.6326978206634521, + "learning_rate": 1.7471437697349342e-05, + "loss": 1.4458, + "mean_token_accuracy": 0.6490624397993088, + "num_tokens": 817587619.0, + "step": 4867 + }, + { + "entropy": 1.7081331610679626, + "epoch": 0.5347834445634561, + "grad_norm": 0.7116954922676086, + "learning_rate": 1.7470324874849133e-05, + "loss": 1.5435, + "mean_token_accuracy": 0.6464557300011317, + "num_tokens": 817794229.0, + "step": 4868 + }, + { + "entropy": 1.7534505824247997, + "epoch": 0.534893301474829, + "grad_norm": 0.7030560970306396, + "learning_rate": 1.7469211847566062e-05, + "loss": 1.3342, + "mean_token_accuracy": 0.6696716099977493, + "num_tokens": 817920264.0, + "step": 4869 + }, + { + "entropy": 1.7026815017064412, + "epoch": 0.5350031583862019, + "grad_norm": 0.6979672908782959, + "learning_rate": 1.7468098615535347e-05, + "loss": 1.5933, + "mean_token_accuracy": 0.635742649435997, + "num_tokens": 818085089.0, + "step": 4870 + }, + { + "entropy": 1.6528734962145488, + "epoch": 0.5351130152975749, + "grad_norm": 0.7067160606384277, + "learning_rate": 1.7466985178792222e-05, + "loss": 1.2836, + "mean_token_accuracy": 0.6861815551916758, + "num_tokens": 818278281.0, + "step": 4871 + }, + { + "entropy": 1.7629452645778656, + "epoch": 0.5352228722089478, + "grad_norm": 0.709534227848053, + "learning_rate": 1.7465871537371938e-05, + "loss": 1.3458, + "mean_token_accuracy": 0.669313962260882, + "num_tokens": 818428687.0, + "step": 4872 + }, + { + "entropy": 1.7045076290766399, + "epoch": 0.5353327291203208, + "grad_norm": 0.7088239789009094, + "learning_rate": 1.746475769130973e-05, + "loss": 1.3678, + "mean_token_accuracy": 0.6577753275632858, + "num_tokens": 818577943.0, + "step": 4873 + }, + { + "entropy": 1.7641015152136486, + "epoch": 0.5354425860316937, + "grad_norm": 0.7473663687705994, + "learning_rate": 1.746364364064085e-05, + "loss": 1.3615, + "mean_token_accuracy": 0.6545535524686178, + "num_tokens": 818704762.0, + "step": 4874 + }, + { + "entropy": 1.73165625333786, + "epoch": 0.5355524429430667, + "grad_norm": 0.6683910489082336, + "learning_rate": 1.7462529385400567e-05, + "loss": 1.2848, + "mean_token_accuracy": 0.6737756431102753, + "num_tokens": 818836769.0, + "step": 4875 + }, + { + "entropy": 1.727175106604894, + "epoch": 0.5356622998544396, + "grad_norm": 0.7146100997924805, + "learning_rate": 1.7461414925624144e-05, + "loss": 1.2557, + "mean_token_accuracy": 0.6819527049859365, + "num_tokens": 819013897.0, + "step": 4876 + }, + { + "entropy": 1.6949097514152527, + "epoch": 0.5357721567658126, + "grad_norm": 0.7949661016464233, + "learning_rate": 1.7460300261346842e-05, + "loss": 1.4601, + "mean_token_accuracy": 0.6500726789236069, + "num_tokens": 819200845.0, + "step": 4877 + }, + { + "entropy": 1.7253201305866241, + "epoch": 0.5358820136771855, + "grad_norm": 0.6605319976806641, + "learning_rate": 1.745918539260395e-05, + "loss": 1.5041, + "mean_token_accuracy": 0.6472121477127075, + "num_tokens": 819388584.0, + "step": 4878 + }, + { + "entropy": 1.7595091660817463, + "epoch": 0.5359918705885584, + "grad_norm": 0.9308416247367859, + "learning_rate": 1.7458070319430754e-05, + "loss": 1.5802, + "mean_token_accuracy": 0.6386895179748535, + "num_tokens": 819596639.0, + "step": 4879 + }, + { + "entropy": 1.6913528839747112, + "epoch": 0.5361017274999313, + "grad_norm": 0.7783805727958679, + "learning_rate": 1.7456955041862543e-05, + "loss": 1.2217, + "mean_token_accuracy": 0.6871163348356882, + "num_tokens": 819708937.0, + "step": 4880 + }, + { + "entropy": 1.708279420932134, + "epoch": 0.5362115844113042, + "grad_norm": 0.6577259302139282, + "learning_rate": 1.745583955993461e-05, + "loss": 1.3714, + "mean_token_accuracy": 0.6517359912395477, + "num_tokens": 819915470.0, + "step": 4881 + }, + { + "entropy": 1.703675111134847, + "epoch": 0.5363214413226772, + "grad_norm": 0.5665971636772156, + "learning_rate": 1.7454723873682268e-05, + "loss": 1.4554, + "mean_token_accuracy": 0.6373900771141052, + "num_tokens": 820172132.0, + "step": 4882 + }, + { + "entropy": 1.7301917870839436, + "epoch": 0.5364312982340501, + "grad_norm": 0.5813365578651428, + "learning_rate": 1.7453607983140823e-05, + "loss": 1.5125, + "mean_token_accuracy": 0.6375814924637476, + "num_tokens": 820379791.0, + "step": 4883 + }, + { + "entropy": 1.7420111298561096, + "epoch": 0.5365411551454231, + "grad_norm": 0.6530336141586304, + "learning_rate": 1.745249188834559e-05, + "loss": 1.456, + "mean_token_accuracy": 0.6411418666442236, + "num_tokens": 820565833.0, + "step": 4884 + }, + { + "entropy": 1.801695555448532, + "epoch": 0.536651012056796, + "grad_norm": 0.7685750722885132, + "learning_rate": 1.74513755893319e-05, + "loss": 1.5374, + "mean_token_accuracy": 0.634370227654775, + "num_tokens": 820688906.0, + "step": 4885 + }, + { + "entropy": 1.751669466495514, + "epoch": 0.536760868968169, + "grad_norm": 0.7663411498069763, + "learning_rate": 1.7450259086135078e-05, + "loss": 1.4194, + "mean_token_accuracy": 0.6512367278337479, + "num_tokens": 820836128.0, + "step": 4886 + }, + { + "entropy": 1.7377658585707347, + "epoch": 0.5368707258795419, + "grad_norm": 0.7554279565811157, + "learning_rate": 1.744914237879046e-05, + "loss": 1.198, + "mean_token_accuracy": 0.6807886908451716, + "num_tokens": 820951980.0, + "step": 4887 + }, + { + "entropy": 1.782357394695282, + "epoch": 0.5369805827909149, + "grad_norm": 0.672864556312561, + "learning_rate": 1.74480254673334e-05, + "loss": 1.3533, + "mean_token_accuracy": 0.65904101729393, + "num_tokens": 821101205.0, + "step": 4888 + }, + { + "entropy": 1.742653727531433, + "epoch": 0.5370904397022878, + "grad_norm": 0.6784942150115967, + "learning_rate": 1.7446908351799233e-05, + "loss": 1.2928, + "mean_token_accuracy": 0.6659032354752222, + "num_tokens": 821266573.0, + "step": 4889 + }, + { + "entropy": 1.679027299086253, + "epoch": 0.5372002966136608, + "grad_norm": 0.6892912983894348, + "learning_rate": 1.7445791032223322e-05, + "loss": 1.3674, + "mean_token_accuracy": 0.6666253606478373, + "num_tokens": 821437038.0, + "step": 4890 + }, + { + "entropy": 1.7067703604698181, + "epoch": 0.5373101535250336, + "grad_norm": 0.6747919917106628, + "learning_rate": 1.744467350864103e-05, + "loss": 1.2386, + "mean_token_accuracy": 0.6692210485537847, + "num_tokens": 821571344.0, + "step": 4891 + }, + { + "entropy": 1.7276716828346252, + "epoch": 0.5374200104364065, + "grad_norm": 0.9028007388114929, + "learning_rate": 1.7443555781087726e-05, + "loss": 1.5016, + "mean_token_accuracy": 0.6459170381228129, + "num_tokens": 821732320.0, + "step": 4892 + }, + { + "entropy": 1.6987963616847992, + "epoch": 0.5375298673477795, + "grad_norm": 0.7619015574455261, + "learning_rate": 1.7442437849598785e-05, + "loss": 1.3314, + "mean_token_accuracy": 0.6694723268349966, + "num_tokens": 821925047.0, + "step": 4893 + }, + { + "entropy": 1.74809134999911, + "epoch": 0.5376397242591524, + "grad_norm": 0.6891080141067505, + "learning_rate": 1.744131971420959e-05, + "loss": 1.6016, + "mean_token_accuracy": 0.633436476190885, + "num_tokens": 822098417.0, + "step": 4894 + }, + { + "entropy": 1.7045816977818806, + "epoch": 0.5377495811705254, + "grad_norm": 0.603262186050415, + "learning_rate": 1.7440201374955528e-05, + "loss": 1.5185, + "mean_token_accuracy": 0.6483029425144196, + "num_tokens": 822284175.0, + "step": 4895 + }, + { + "entropy": 1.7040814061959584, + "epoch": 0.5378594380818983, + "grad_norm": 0.6909913420677185, + "learning_rate": 1.7439082831871997e-05, + "loss": 1.3295, + "mean_token_accuracy": 0.6752724895874659, + "num_tokens": 822404812.0, + "step": 4896 + }, + { + "entropy": 1.6904057959715526, + "epoch": 0.5379692949932713, + "grad_norm": 0.6549181342124939, + "learning_rate": 1.743796408499439e-05, + "loss": 1.3216, + "mean_token_accuracy": 0.6638427078723907, + "num_tokens": 822590204.0, + "step": 4897 + }, + { + "entropy": 1.6527547438939412, + "epoch": 0.5380791519046442, + "grad_norm": 0.7167640924453735, + "learning_rate": 1.7436845134358123e-05, + "loss": 1.3751, + "mean_token_accuracy": 0.6618984391291937, + "num_tokens": 822782283.0, + "step": 4898 + }, + { + "entropy": 1.6621145009994507, + "epoch": 0.5381890088160172, + "grad_norm": 0.7170586585998535, + "learning_rate": 1.743572597999861e-05, + "loss": 1.3813, + "mean_token_accuracy": 0.657960906624794, + "num_tokens": 822958457.0, + "step": 4899 + }, + { + "entropy": 1.6679618457953136, + "epoch": 0.53829886572739, + "grad_norm": 0.6240289807319641, + "learning_rate": 1.743460662195127e-05, + "loss": 1.3381, + "mean_token_accuracy": 0.6678373962640762, + "num_tokens": 823116527.0, + "step": 4900 + }, + { + "entropy": 1.7151422599951427, + "epoch": 0.538408722638763, + "grad_norm": 0.6621232628822327, + "learning_rate": 1.7433487060251527e-05, + "loss": 1.5341, + "mean_token_accuracy": 0.6566172788540522, + "num_tokens": 823296593.0, + "step": 4901 + }, + { + "entropy": 1.6715179483095806, + "epoch": 0.5385185795501359, + "grad_norm": 0.663271427154541, + "learning_rate": 1.743236729493482e-05, + "loss": 1.3851, + "mean_token_accuracy": 0.6610744049151739, + "num_tokens": 823449082.0, + "step": 4902 + }, + { + "entropy": 1.7014525334040325, + "epoch": 0.5386284364615089, + "grad_norm": 0.6124004125595093, + "learning_rate": 1.7431247326036583e-05, + "loss": 1.4697, + "mean_token_accuracy": 0.6529516379038492, + "num_tokens": 823615779.0, + "step": 4903 + }, + { + "entropy": 1.6754214763641357, + "epoch": 0.5387382933728818, + "grad_norm": 0.7338191270828247, + "learning_rate": 1.7430127153592272e-05, + "loss": 1.3501, + "mean_token_accuracy": 0.6698144127925237, + "num_tokens": 823744244.0, + "step": 4904 + }, + { + "entropy": 1.7061218818028767, + "epoch": 0.5388481502842548, + "grad_norm": 0.6320319771766663, + "learning_rate": 1.742900677763733e-05, + "loss": 1.333, + "mean_token_accuracy": 0.66542187333107, + "num_tokens": 823896705.0, + "step": 4905 + }, + { + "entropy": 1.70430921514829, + "epoch": 0.5389580071956277, + "grad_norm": 0.7169397473335266, + "learning_rate": 1.742788619820722e-05, + "loss": 1.4334, + "mean_token_accuracy": 0.6706308672825495, + "num_tokens": 824071474.0, + "step": 4906 + }, + { + "entropy": 1.7034543951352437, + "epoch": 0.5390678641070006, + "grad_norm": 0.6652551293373108, + "learning_rate": 1.7426765415337406e-05, + "loss": 1.5371, + "mean_token_accuracy": 0.6554579238096873, + "num_tokens": 824216176.0, + "step": 4907 + }, + { + "entropy": 1.739420880873998, + "epoch": 0.5391777210183736, + "grad_norm": 0.9010681509971619, + "learning_rate": 1.7425644429063372e-05, + "loss": 1.5553, + "mean_token_accuracy": 0.6332442959149679, + "num_tokens": 824346870.0, + "step": 4908 + }, + { + "entropy": 1.7290275891621907, + "epoch": 0.5392875779297465, + "grad_norm": 0.6607105135917664, + "learning_rate": 1.742452323942058e-05, + "loss": 1.4219, + "mean_token_accuracy": 0.6464213828245798, + "num_tokens": 824523488.0, + "step": 4909 + }, + { + "entropy": 1.7586493094762166, + "epoch": 0.5393974348411195, + "grad_norm": 0.7161458730697632, + "learning_rate": 1.742340184644452e-05, + "loss": 1.3448, + "mean_token_accuracy": 0.6701912134885788, + "num_tokens": 824669157.0, + "step": 4910 + }, + { + "entropy": 1.679528295993805, + "epoch": 0.5395072917524923, + "grad_norm": 0.6938197612762451, + "learning_rate": 1.7422280250170693e-05, + "loss": 1.3921, + "mean_token_accuracy": 0.6523070633411407, + "num_tokens": 824861531.0, + "step": 4911 + }, + { + "entropy": 1.6967064638932545, + "epoch": 0.5396171486638653, + "grad_norm": 0.6822254657745361, + "learning_rate": 1.7421158450634586e-05, + "loss": 1.448, + "mean_token_accuracy": 0.6576181898514429, + "num_tokens": 825039311.0, + "step": 4912 + }, + { + "entropy": 1.6692471305529277, + "epoch": 0.5397270055752382, + "grad_norm": 0.7633799910545349, + "learning_rate": 1.742003644787171e-05, + "loss": 1.3524, + "mean_token_accuracy": 0.6830050398906072, + "num_tokens": 825165157.0, + "step": 4913 + }, + { + "entropy": 1.684964507818222, + "epoch": 0.5398368624866112, + "grad_norm": 0.5935234427452087, + "learning_rate": 1.7418914241917572e-05, + "loss": 1.3683, + "mean_token_accuracy": 0.6735412726799647, + "num_tokens": 825335892.0, + "step": 4914 + }, + { + "entropy": 1.7406889696915944, + "epoch": 0.5399467193979841, + "grad_norm": 0.7186749577522278, + "learning_rate": 1.741779183280769e-05, + "loss": 1.594, + "mean_token_accuracy": 0.6328186293443044, + "num_tokens": 825567908.0, + "step": 4915 + }, + { + "entropy": 1.6296118994553883, + "epoch": 0.5400565763093571, + "grad_norm": 0.7475072741508484, + "learning_rate": 1.741666922057759e-05, + "loss": 1.3133, + "mean_token_accuracy": 0.6770787388086319, + "num_tokens": 825723520.0, + "step": 4916 + }, + { + "entropy": 1.6742859582106273, + "epoch": 0.54016643322073, + "grad_norm": 0.7436842918395996, + "learning_rate": 1.7415546405262797e-05, + "loss": 1.3603, + "mean_token_accuracy": 0.6628256142139435, + "num_tokens": 825860111.0, + "step": 4917 + }, + { + "entropy": 1.6383836766084034, + "epoch": 0.540276290132103, + "grad_norm": 0.6304495930671692, + "learning_rate": 1.7414423386898857e-05, + "loss": 1.3379, + "mean_token_accuracy": 0.664565180738767, + "num_tokens": 826041019.0, + "step": 4918 + }, + { + "entropy": 1.6786586443583171, + "epoch": 0.5403861470434759, + "grad_norm": 0.6277565956115723, + "learning_rate": 1.74133001655213e-05, + "loss": 1.4681, + "mean_token_accuracy": 0.6444205145041147, + "num_tokens": 826262462.0, + "step": 4919 + }, + { + "entropy": 1.6958061456680298, + "epoch": 0.5404960039548488, + "grad_norm": 0.8898590803146362, + "learning_rate": 1.7412176741165687e-05, + "loss": 1.4818, + "mean_token_accuracy": 0.6551410953203837, + "num_tokens": 826420243.0, + "step": 4920 + }, + { + "entropy": 1.7218329807122548, + "epoch": 0.5406058608662218, + "grad_norm": 0.6511401534080505, + "learning_rate": 1.741105311386757e-05, + "loss": 1.3571, + "mean_token_accuracy": 0.6615760376056036, + "num_tokens": 826616257.0, + "step": 4921 + }, + { + "entropy": 1.7913587391376495, + "epoch": 0.5407157177775946, + "grad_norm": 0.6388662457466125, + "learning_rate": 1.740992928366251e-05, + "loss": 1.3981, + "mean_token_accuracy": 0.6544657200574875, + "num_tokens": 826805321.0, + "step": 4922 + }, + { + "entropy": 1.6517898738384247, + "epoch": 0.5408255746889676, + "grad_norm": 0.6503629684448242, + "learning_rate": 1.7408805250586077e-05, + "loss": 1.4909, + "mean_token_accuracy": 0.6562631527582804, + "num_tokens": 826976397.0, + "step": 4923 + }, + { + "entropy": 1.7049082815647125, + "epoch": 0.5409354316003405, + "grad_norm": 0.7112172842025757, + "learning_rate": 1.7407681014673844e-05, + "loss": 1.3882, + "mean_token_accuracy": 0.6503375222285589, + "num_tokens": 827134334.0, + "step": 4924 + }, + { + "entropy": 1.721593697865804, + "epoch": 0.5410452885117135, + "grad_norm": 0.6750729084014893, + "learning_rate": 1.7406556575961394e-05, + "loss": 1.3612, + "mean_token_accuracy": 0.6642037878433863, + "num_tokens": 827268866.0, + "step": 4925 + }, + { + "entropy": 1.743778149286906, + "epoch": 0.5411551454230864, + "grad_norm": 0.8075199723243713, + "learning_rate": 1.7405431934484318e-05, + "loss": 1.5953, + "mean_token_accuracy": 0.6419996519883474, + "num_tokens": 827456250.0, + "step": 4926 + }, + { + "entropy": 1.701025813817978, + "epoch": 0.5412650023344594, + "grad_norm": 0.6938318014144897, + "learning_rate": 1.7404307090278206e-05, + "loss": 1.2758, + "mean_token_accuracy": 0.6731408586104711, + "num_tokens": 827569239.0, + "step": 4927 + }, + { + "entropy": 1.6912222007910411, + "epoch": 0.5413748592458323, + "grad_norm": 0.7330572009086609, + "learning_rate": 1.7403182043378662e-05, + "loss": 1.3019, + "mean_token_accuracy": 0.6731463919083277, + "num_tokens": 827733160.0, + "step": 4928 + }, + { + "entropy": 1.6323689023653667, + "epoch": 0.5414847161572053, + "grad_norm": 0.6897679567337036, + "learning_rate": 1.740205679382129e-05, + "loss": 1.3017, + "mean_token_accuracy": 0.6699829796950022, + "num_tokens": 827857219.0, + "step": 4929 + }, + { + "entropy": 1.6964257657527924, + "epoch": 0.5415945730685782, + "grad_norm": 0.6199988722801208, + "learning_rate": 1.7400931341641706e-05, + "loss": 1.4628, + "mean_token_accuracy": 0.6376723100741705, + "num_tokens": 828046638.0, + "step": 4930 + }, + { + "entropy": 1.6805628935496013, + "epoch": 0.5417044299799512, + "grad_norm": 0.7456372976303101, + "learning_rate": 1.7399805686875527e-05, + "loss": 1.2353, + "mean_token_accuracy": 0.6760278890530268, + "num_tokens": 828160151.0, + "step": 4931 + }, + { + "entropy": 1.7248408893744152, + "epoch": 0.541814286891324, + "grad_norm": 0.69236159324646, + "learning_rate": 1.7398679829558386e-05, + "loss": 1.4688, + "mean_token_accuracy": 0.6398780643939972, + "num_tokens": 828331278.0, + "step": 4932 + }, + { + "entropy": 1.6821700930595398, + "epoch": 0.5419241438026969, + "grad_norm": 0.7130243182182312, + "learning_rate": 1.739755376972591e-05, + "loss": 1.4732, + "mean_token_accuracy": 0.6641266047954559, + "num_tokens": 828474991.0, + "step": 4933 + }, + { + "entropy": 1.6392736335595448, + "epoch": 0.5420340007140699, + "grad_norm": 0.7200448513031006, + "learning_rate": 1.7396427507413737e-05, + "loss": 1.3565, + "mean_token_accuracy": 0.6657882034778595, + "num_tokens": 828615538.0, + "step": 4934 + }, + { + "entropy": 1.6556443572044373, + "epoch": 0.5421438576254428, + "grad_norm": 0.7323468327522278, + "learning_rate": 1.739530104265752e-05, + "loss": 1.1859, + "mean_token_accuracy": 0.6867117136716843, + "num_tokens": 828766143.0, + "step": 4935 + }, + { + "entropy": 1.7083774209022522, + "epoch": 0.5422537145368158, + "grad_norm": 0.77711021900177, + "learning_rate": 1.7394174375492906e-05, + "loss": 1.2845, + "mean_token_accuracy": 0.6667287697394689, + "num_tokens": 828880028.0, + "step": 4936 + }, + { + "entropy": 1.6991042792797089, + "epoch": 0.5423635714481887, + "grad_norm": 0.694174587726593, + "learning_rate": 1.739304750595555e-05, + "loss": 1.2199, + "mean_token_accuracy": 0.6751233587662379, + "num_tokens": 828983386.0, + "step": 4937 + }, + { + "entropy": 1.7104702393213909, + "epoch": 0.5424734283595617, + "grad_norm": 0.6545060873031616, + "learning_rate": 1.7391920434081126e-05, + "loss": 1.4173, + "mean_token_accuracy": 0.6476683566967646, + "num_tokens": 829168137.0, + "step": 4938 + }, + { + "entropy": 1.693225493033727, + "epoch": 0.5425832852709346, + "grad_norm": 0.7685295939445496, + "learning_rate": 1.73907931599053e-05, + "loss": 1.2846, + "mean_token_accuracy": 0.6751909504334132, + "num_tokens": 829405939.0, + "step": 4939 + }, + { + "entropy": 1.7319706281026204, + "epoch": 0.5426931421823076, + "grad_norm": 0.8708465695381165, + "learning_rate": 1.7389665683463748e-05, + "loss": 1.558, + "mean_token_accuracy": 0.6340043346087137, + "num_tokens": 829636248.0, + "step": 4940 + }, + { + "entropy": 1.7094059487183888, + "epoch": 0.5428029990936805, + "grad_norm": 0.9573303461074829, + "learning_rate": 1.738853800479216e-05, + "loss": 1.4207, + "mean_token_accuracy": 0.6643887509902319, + "num_tokens": 829784445.0, + "step": 4941 + }, + { + "entropy": 1.748300055662791, + "epoch": 0.5429128560050535, + "grad_norm": 0.8324919939041138, + "learning_rate": 1.738741012392622e-05, + "loss": 1.3749, + "mean_token_accuracy": 0.6539400120576223, + "num_tokens": 829953714.0, + "step": 4942 + }, + { + "entropy": 1.699168602625529, + "epoch": 0.5430227129164263, + "grad_norm": 0.6905611157417297, + "learning_rate": 1.7386282040901626e-05, + "loss": 1.487, + "mean_token_accuracy": 0.6424149125814438, + "num_tokens": 830112166.0, + "step": 4943 + }, + { + "entropy": 1.683763285477956, + "epoch": 0.5431325698277993, + "grad_norm": 0.8066513538360596, + "learning_rate": 1.7385153755754087e-05, + "loss": 1.3186, + "mean_token_accuracy": 0.6723660826683044, + "num_tokens": 830229332.0, + "step": 4944 + }, + { + "entropy": 1.7084158559640248, + "epoch": 0.5432424267391722, + "grad_norm": 0.6088327765464783, + "learning_rate": 1.7384025268519307e-05, + "loss": 1.5253, + "mean_token_accuracy": 0.62581634024779, + "num_tokens": 830507166.0, + "step": 4945 + }, + { + "entropy": 1.7765875260035198, + "epoch": 0.5433522836505452, + "grad_norm": 0.8466908931732178, + "learning_rate": 1.7382896579233003e-05, + "loss": 1.5159, + "mean_token_accuracy": 0.6420450657606125, + "num_tokens": 830700696.0, + "step": 4946 + }, + { + "entropy": 1.7295196950435638, + "epoch": 0.5434621405619181, + "grad_norm": 0.7594314813613892, + "learning_rate": 1.7381767687930903e-05, + "loss": 1.3716, + "mean_token_accuracy": 0.6728833566109339, + "num_tokens": 830855018.0, + "step": 4947 + }, + { + "entropy": 1.6678162415822346, + "epoch": 0.543571997473291, + "grad_norm": 0.5881645083427429, + "learning_rate": 1.7380638594648728e-05, + "loss": 1.4235, + "mean_token_accuracy": 0.6409558157126108, + "num_tokens": 831035506.0, + "step": 4948 + }, + { + "entropy": 1.744312971830368, + "epoch": 0.543681854384664, + "grad_norm": 0.6560190916061401, + "learning_rate": 1.7379509299422216e-05, + "loss": 1.3315, + "mean_token_accuracy": 0.6596087117989858, + "num_tokens": 831167690.0, + "step": 4949 + }, + { + "entropy": 1.7742221355438232, + "epoch": 0.5437917112960369, + "grad_norm": 0.7557669281959534, + "learning_rate": 1.7378379802287113e-05, + "loss": 1.3269, + "mean_token_accuracy": 0.6664220293362936, + "num_tokens": 831302737.0, + "step": 4950 + }, + { + "entropy": 1.6314336359500885, + "epoch": 0.5439015682074099, + "grad_norm": 2.5049948692321777, + "learning_rate": 1.737725010327916e-05, + "loss": 1.4156, + "mean_token_accuracy": 0.6630344639221827, + "num_tokens": 831506832.0, + "step": 4951 + }, + { + "entropy": 1.6986377537250519, + "epoch": 0.5440114251187828, + "grad_norm": 0.6551359295845032, + "learning_rate": 1.737612020243411e-05, + "loss": 1.3055, + "mean_token_accuracy": 0.659782146414121, + "num_tokens": 831643698.0, + "step": 4952 + }, + { + "entropy": 1.6850547790527344, + "epoch": 0.5441212820301558, + "grad_norm": 0.6227862238883972, + "learning_rate": 1.7374990099787737e-05, + "loss": 1.3241, + "mean_token_accuracy": 0.6679957658052444, + "num_tokens": 831809561.0, + "step": 4953 + }, + { + "entropy": 1.7365097105503082, + "epoch": 0.5442311389415286, + "grad_norm": 0.7013810873031616, + "learning_rate": 1.7373859795375797e-05, + "loss": 1.5199, + "mean_token_accuracy": 0.6493206669886907, + "num_tokens": 831954609.0, + "step": 4954 + }, + { + "entropy": 1.6703599095344543, + "epoch": 0.5443409958529016, + "grad_norm": 0.782053530216217, + "learning_rate": 1.7372729289234064e-05, + "loss": 1.2796, + "mean_token_accuracy": 0.6680347323417664, + "num_tokens": 832063290.0, + "step": 4955 + }, + { + "entropy": 1.7207246720790863, + "epoch": 0.5444508527642745, + "grad_norm": 0.5964549779891968, + "learning_rate": 1.7371598581398325e-05, + "loss": 1.3947, + "mean_token_accuracy": 0.6506600578625997, + "num_tokens": 832266378.0, + "step": 4956 + }, + { + "entropy": 1.690430869658788, + "epoch": 0.5445607096756475, + "grad_norm": 0.690424382686615, + "learning_rate": 1.737046767190436e-05, + "loss": 1.3475, + "mean_token_accuracy": 0.6578344404697418, + "num_tokens": 832434529.0, + "step": 4957 + }, + { + "entropy": 1.7267669141292572, + "epoch": 0.5446705665870204, + "grad_norm": 0.7527792453765869, + "learning_rate": 1.7369336560787966e-05, + "loss": 1.3702, + "mean_token_accuracy": 0.6688602467377981, + "num_tokens": 832555033.0, + "step": 4958 + }, + { + "entropy": 1.6294583181540172, + "epoch": 0.5447804234983934, + "grad_norm": 0.8316755890846252, + "learning_rate": 1.736820524808494e-05, + "loss": 1.3852, + "mean_token_accuracy": 0.6587762931982676, + "num_tokens": 832758932.0, + "step": 4959 + }, + { + "entropy": 1.7353256543477376, + "epoch": 0.5448902804097663, + "grad_norm": 0.6854872107505798, + "learning_rate": 1.7367073733831085e-05, + "loss": 1.4275, + "mean_token_accuracy": 0.6549768050511678, + "num_tokens": 832967800.0, + "step": 4960 + }, + { + "entropy": 1.666805108388265, + "epoch": 0.5450001373211392, + "grad_norm": 0.6198726892471313, + "learning_rate": 1.7365942018062216e-05, + "loss": 1.567, + "mean_token_accuracy": 0.6273992757002512, + "num_tokens": 833157652.0, + "step": 4961 + }, + { + "entropy": 1.7501900096734364, + "epoch": 0.5451099942325122, + "grad_norm": 0.701242208480835, + "learning_rate": 1.736481010081415e-05, + "loss": 1.3403, + "mean_token_accuracy": 0.6606934070587158, + "num_tokens": 833284661.0, + "step": 4962 + }, + { + "entropy": 1.7042124271392822, + "epoch": 0.545219851143885, + "grad_norm": 0.7061309814453125, + "learning_rate": 1.7363677982122713e-05, + "loss": 1.4459, + "mean_token_accuracy": 0.6599131673574448, + "num_tokens": 833426934.0, + "step": 4963 + }, + { + "entropy": 1.670636922121048, + "epoch": 0.545329708055258, + "grad_norm": 0.6328370571136475, + "learning_rate": 1.7362545662023735e-05, + "loss": 1.3507, + "mean_token_accuracy": 0.6591466615597407, + "num_tokens": 833573679.0, + "step": 4964 + }, + { + "entropy": 1.6602637072404225, + "epoch": 0.5454395649666309, + "grad_norm": 0.6920685172080994, + "learning_rate": 1.7361413140553058e-05, + "loss": 1.2574, + "mean_token_accuracy": 0.6768335302670797, + "num_tokens": 833695555.0, + "step": 4965 + }, + { + "entropy": 1.740365246931712, + "epoch": 0.5455494218780039, + "grad_norm": 0.7330303192138672, + "learning_rate": 1.7360280417746515e-05, + "loss": 1.3766, + "mean_token_accuracy": 0.6667214135328928, + "num_tokens": 833814908.0, + "step": 4966 + }, + { + "entropy": 1.6844541629155476, + "epoch": 0.5456592787893768, + "grad_norm": 0.6612024307250977, + "learning_rate": 1.7359147493639966e-05, + "loss": 1.3186, + "mean_token_accuracy": 0.662054200967153, + "num_tokens": 833944687.0, + "step": 4967 + }, + { + "entropy": 1.706677258014679, + "epoch": 0.5457691357007498, + "grad_norm": 0.7018133401870728, + "learning_rate": 1.7358014368269265e-05, + "loss": 1.4785, + "mean_token_accuracy": 0.6449083437522253, + "num_tokens": 834118400.0, + "step": 4968 + }, + { + "entropy": 1.6965554157892864, + "epoch": 0.5458789926121227, + "grad_norm": 0.6533994078636169, + "learning_rate": 1.735688104167027e-05, + "loss": 1.4246, + "mean_token_accuracy": 0.648577556014061, + "num_tokens": 834269641.0, + "step": 4969 + }, + { + "entropy": 1.6984079976876576, + "epoch": 0.5459888495234957, + "grad_norm": 0.8152681589126587, + "learning_rate": 1.735574751387886e-05, + "loss": 1.2825, + "mean_token_accuracy": 0.6712081631024679, + "num_tokens": 834417936.0, + "step": 4970 + }, + { + "entropy": 1.630674570798874, + "epoch": 0.5460987064348686, + "grad_norm": 0.6994221806526184, + "learning_rate": 1.7354613784930904e-05, + "loss": 1.4286, + "mean_token_accuracy": 0.6537977854410807, + "num_tokens": 834607646.0, + "step": 4971 + }, + { + "entropy": 1.7504248122374217, + "epoch": 0.5462085633462416, + "grad_norm": 0.8393651247024536, + "learning_rate": 1.7353479854862285e-05, + "loss": 1.4681, + "mean_token_accuracy": 0.6475894500811895, + "num_tokens": 834760505.0, + "step": 4972 + }, + { + "entropy": 1.6946745415528615, + "epoch": 0.5463184202576145, + "grad_norm": 0.6594904661178589, + "learning_rate": 1.735234572370889e-05, + "loss": 1.4675, + "mean_token_accuracy": 0.6494403878847758, + "num_tokens": 834951630.0, + "step": 4973 + }, + { + "entropy": 1.677021066347758, + "epoch": 0.5464282771689873, + "grad_norm": 0.6898400187492371, + "learning_rate": 1.735121139150662e-05, + "loss": 1.4039, + "mean_token_accuracy": 0.6618401060501734, + "num_tokens": 835132695.0, + "step": 4974 + }, + { + "entropy": 1.6862121025721233, + "epoch": 0.5465381340803603, + "grad_norm": 0.7192554473876953, + "learning_rate": 1.7350076858291363e-05, + "loss": 1.452, + "mean_token_accuracy": 0.6425764660040537, + "num_tokens": 835332699.0, + "step": 4975 + }, + { + "entropy": 1.7146854003270466, + "epoch": 0.5466479909917332, + "grad_norm": 0.6000041365623474, + "learning_rate": 1.734894212409904e-05, + "loss": 1.38, + "mean_token_accuracy": 0.6442343592643738, + "num_tokens": 835491957.0, + "step": 4976 + }, + { + "entropy": 1.7366932928562164, + "epoch": 0.5467578479031062, + "grad_norm": 1.8402376174926758, + "learning_rate": 1.734780718896556e-05, + "loss": 1.1827, + "mean_token_accuracy": 0.6654492169618607, + "num_tokens": 835675120.0, + "step": 4977 + }, + { + "entropy": 1.7179711163043976, + "epoch": 0.5468677048144791, + "grad_norm": 0.7776644825935364, + "learning_rate": 1.7346672052926838e-05, + "loss": 1.3917, + "mean_token_accuracy": 0.6652675569057465, + "num_tokens": 835855578.0, + "step": 4978 + }, + { + "entropy": 1.7254438002904255, + "epoch": 0.5469775617258521, + "grad_norm": 0.7210192084312439, + "learning_rate": 1.734553671601881e-05, + "loss": 1.4128, + "mean_token_accuracy": 0.6519081691900889, + "num_tokens": 836044866.0, + "step": 4979 + }, + { + "entropy": 1.7057076493899028, + "epoch": 0.547087418637225, + "grad_norm": 0.7255688309669495, + "learning_rate": 1.7344401178277405e-05, + "loss": 1.3313, + "mean_token_accuracy": 0.6629131535689036, + "num_tokens": 836172342.0, + "step": 4980 + }, + { + "entropy": 1.764757553736369, + "epoch": 0.547197275548598, + "grad_norm": 0.9099174737930298, + "learning_rate": 1.734326543973856e-05, + "loss": 1.556, + "mean_token_accuracy": 0.6452460636695226, + "num_tokens": 836337102.0, + "step": 4981 + }, + { + "entropy": 1.7524670163790386, + "epoch": 0.5473071324599709, + "grad_norm": 0.8093464374542236, + "learning_rate": 1.734212950043822e-05, + "loss": 1.4775, + "mean_token_accuracy": 0.6518103977044424, + "num_tokens": 836471055.0, + "step": 4982 + }, + { + "entropy": 1.7805779079596202, + "epoch": 0.5474169893713439, + "grad_norm": 0.7849537134170532, + "learning_rate": 1.7340993360412343e-05, + "loss": 1.4111, + "mean_token_accuracy": 0.6559980014959971, + "num_tokens": 836607555.0, + "step": 4983 + }, + { + "entropy": 1.7531798581282299, + "epoch": 0.5475268462827168, + "grad_norm": 0.7503484487533569, + "learning_rate": 1.733985701969688e-05, + "loss": 1.5242, + "mean_token_accuracy": 0.64345849553744, + "num_tokens": 836777172.0, + "step": 4984 + }, + { + "entropy": 1.6708122690518696, + "epoch": 0.5476367031940897, + "grad_norm": 0.7763621807098389, + "learning_rate": 1.73387204783278e-05, + "loss": 1.3054, + "mean_token_accuracy": 0.6630136370658875, + "num_tokens": 836904265.0, + "step": 4985 + }, + { + "entropy": 1.6611828605333965, + "epoch": 0.5477465601054626, + "grad_norm": 0.7332755327224731, + "learning_rate": 1.7337583736341077e-05, + "loss": 1.4492, + "mean_token_accuracy": 0.6425531009833018, + "num_tokens": 837153610.0, + "step": 4986 + }, + { + "entropy": 1.6737177173296611, + "epoch": 0.5478564170168355, + "grad_norm": 0.7409669756889343, + "learning_rate": 1.7336446793772682e-05, + "loss": 1.3822, + "mean_token_accuracy": 0.6608980546394984, + "num_tokens": 837313064.0, + "step": 4987 + }, + { + "entropy": 1.687143345673879, + "epoch": 0.5479662739282085, + "grad_norm": 0.8636589646339417, + "learning_rate": 1.73353096506586e-05, + "loss": 1.2404, + "mean_token_accuracy": 0.6764566948016485, + "num_tokens": 837451066.0, + "step": 4988 + }, + { + "entropy": 1.649695744117101, + "epoch": 0.5480761308395814, + "grad_norm": 0.7815621495246887, + "learning_rate": 1.733417230703482e-05, + "loss": 1.3502, + "mean_token_accuracy": 0.658309539159139, + "num_tokens": 837614506.0, + "step": 4989 + }, + { + "entropy": 1.7112309634685516, + "epoch": 0.5481859877509544, + "grad_norm": 0.6810916066169739, + "learning_rate": 1.7333034762937346e-05, + "loss": 1.31, + "mean_token_accuracy": 0.6706115355094274, + "num_tokens": 837773388.0, + "step": 4990 + }, + { + "entropy": 1.719188928604126, + "epoch": 0.5482958446623273, + "grad_norm": 0.6780290007591248, + "learning_rate": 1.7331897018402175e-05, + "loss": 1.3669, + "mean_token_accuracy": 0.6579122791687647, + "num_tokens": 837943358.0, + "step": 4991 + }, + { + "entropy": 1.731730043888092, + "epoch": 0.5484057015737003, + "grad_norm": 0.6477614045143127, + "learning_rate": 1.7330759073465317e-05, + "loss": 1.4375, + "mean_token_accuracy": 0.6566148449977239, + "num_tokens": 838148527.0, + "step": 4992 + }, + { + "entropy": 1.7366572121779125, + "epoch": 0.5485155584850732, + "grad_norm": 0.7073691487312317, + "learning_rate": 1.7329620928162785e-05, + "loss": 1.4436, + "mean_token_accuracy": 0.6603938837846121, + "num_tokens": 838351432.0, + "step": 4993 + }, + { + "entropy": 1.710929661989212, + "epoch": 0.5486254153964462, + "grad_norm": 0.6895067691802979, + "learning_rate": 1.7328482582530598e-05, + "loss": 1.3865, + "mean_token_accuracy": 0.660644123951594, + "num_tokens": 838525802.0, + "step": 4994 + }, + { + "entropy": 1.7042444845040639, + "epoch": 0.548735272307819, + "grad_norm": 0.767922580242157, + "learning_rate": 1.7327344036604796e-05, + "loss": 1.3309, + "mean_token_accuracy": 0.6714604794979095, + "num_tokens": 838674351.0, + "step": 4995 + }, + { + "entropy": 1.6385613679885864, + "epoch": 0.548845129219192, + "grad_norm": 0.7014147639274597, + "learning_rate": 1.7326205290421405e-05, + "loss": 1.315, + "mean_token_accuracy": 0.6713072061538696, + "num_tokens": 838818241.0, + "step": 4996 + }, + { + "entropy": 1.731259047985077, + "epoch": 0.5489549861305649, + "grad_norm": 0.7416298389434814, + "learning_rate": 1.7325066344016467e-05, + "loss": 1.3877, + "mean_token_accuracy": 0.6583247681458791, + "num_tokens": 838993948.0, + "step": 4997 + }, + { + "entropy": 1.6808960835138957, + "epoch": 0.5490648430419379, + "grad_norm": 0.6791642308235168, + "learning_rate": 1.732392719742603e-05, + "loss": 1.425, + "mean_token_accuracy": 0.6509095182021459, + "num_tokens": 839149363.0, + "step": 4998 + }, + { + "entropy": 1.6175450483957927, + "epoch": 0.5491746999533108, + "grad_norm": 0.653907060623169, + "learning_rate": 1.7322787850686143e-05, + "loss": 1.3148, + "mean_token_accuracy": 0.6778159439563751, + "num_tokens": 839305980.0, + "step": 4999 + }, + { + "entropy": 1.6603448390960693, + "epoch": 0.5492845568646838, + "grad_norm": 0.6586391925811768, + "learning_rate": 1.732164830383287e-05, + "loss": 1.576, + "mean_token_accuracy": 0.6371851215759913, + "num_tokens": 839486072.0, + "step": 5000 + }, + { + "entropy": 1.6676330765088399, + "epoch": 0.5493944137760567, + "grad_norm": 0.7712739109992981, + "learning_rate": 1.732050855690228e-05, + "loss": 1.4334, + "mean_token_accuracy": 0.654510036110878, + "num_tokens": 839662761.0, + "step": 5001 + }, + { + "entropy": 1.78163543343544, + "epoch": 0.5495042706874296, + "grad_norm": 4.66588020324707, + "learning_rate": 1.7319368609930442e-05, + "loss": 1.4213, + "mean_token_accuracy": 0.654133602976799, + "num_tokens": 839814998.0, + "step": 5002 + }, + { + "entropy": 1.672917405764262, + "epoch": 0.5496141275988026, + "grad_norm": 0.6977851390838623, + "learning_rate": 1.7318228462953436e-05, + "loss": 1.3021, + "mean_token_accuracy": 0.6800949474175771, + "num_tokens": 839946153.0, + "step": 5003 + }, + { + "entropy": 1.7138892312844594, + "epoch": 0.5497239845101755, + "grad_norm": 0.6364522576332092, + "learning_rate": 1.7317088116007347e-05, + "loss": 1.2984, + "mean_token_accuracy": 0.6754194498062134, + "num_tokens": 840104761.0, + "step": 5004 + }, + { + "entropy": 1.7337297697861989, + "epoch": 0.5498338414215485, + "grad_norm": 0.6527485251426697, + "learning_rate": 1.731594756912826e-05, + "loss": 1.4891, + "mean_token_accuracy": 0.6498339672883352, + "num_tokens": 840271652.0, + "step": 5005 + }, + { + "entropy": 1.698456237713496, + "epoch": 0.5499436983329213, + "grad_norm": 0.7499955892562866, + "learning_rate": 1.7314806822352283e-05, + "loss": 1.2699, + "mean_token_accuracy": 0.6654329647620519, + "num_tokens": 840397079.0, + "step": 5006 + }, + { + "entropy": 1.6331544518470764, + "epoch": 0.5500535552442943, + "grad_norm": 0.6753321290016174, + "learning_rate": 1.7313665875715513e-05, + "loss": 1.2348, + "mean_token_accuracy": 0.6808192729949951, + "num_tokens": 840503621.0, + "step": 5007 + }, + { + "entropy": 1.7092650135358174, + "epoch": 0.5501634121556672, + "grad_norm": 0.7636615037918091, + "learning_rate": 1.7312524729254066e-05, + "loss": 1.5449, + "mean_token_accuracy": 0.6385166347026825, + "num_tokens": 840684309.0, + "step": 5008 + }, + { + "entropy": 1.702101041873296, + "epoch": 0.5502732690670402, + "grad_norm": 0.7472963333129883, + "learning_rate": 1.7311383383004052e-05, + "loss": 1.4984, + "mean_token_accuracy": 0.6420136094093323, + "num_tokens": 840943695.0, + "step": 5009 + }, + { + "entropy": 1.710230439901352, + "epoch": 0.5503831259784131, + "grad_norm": 0.7253463864326477, + "learning_rate": 1.73102418370016e-05, + "loss": 1.4516, + "mean_token_accuracy": 0.6447204500436783, + "num_tokens": 841164320.0, + "step": 5010 + }, + { + "entropy": 1.7154650886853535, + "epoch": 0.5504929828897861, + "grad_norm": 2.171661376953125, + "learning_rate": 1.7309100091282837e-05, + "loss": 1.4222, + "mean_token_accuracy": 0.6575359304745992, + "num_tokens": 841288359.0, + "step": 5011 + }, + { + "entropy": 1.7397755086421967, + "epoch": 0.550602839801159, + "grad_norm": 0.7591625452041626, + "learning_rate": 1.7307958145883898e-05, + "loss": 1.402, + "mean_token_accuracy": 0.65840412179629, + "num_tokens": 841437589.0, + "step": 5012 + }, + { + "entropy": 1.6358317236105602, + "epoch": 0.550712696712532, + "grad_norm": 0.617072582244873, + "learning_rate": 1.730681600084093e-05, + "loss": 1.3611, + "mean_token_accuracy": 0.6660978297392527, + "num_tokens": 841609979.0, + "step": 5013 + }, + { + "entropy": 1.6907884379227955, + "epoch": 0.5508225536239049, + "grad_norm": 0.6868788599967957, + "learning_rate": 1.7305673656190074e-05, + "loss": 1.4394, + "mean_token_accuracy": 0.6435293157895406, + "num_tokens": 841771066.0, + "step": 5014 + }, + { + "entropy": 1.7315944532553356, + "epoch": 0.5509324105352778, + "grad_norm": 0.6620607376098633, + "learning_rate": 1.730453111196749e-05, + "loss": 1.4044, + "mean_token_accuracy": 0.6534575472275416, + "num_tokens": 841932098.0, + "step": 5015 + }, + { + "entropy": 1.6816769540309906, + "epoch": 0.5510422674466507, + "grad_norm": 0.8678973913192749, + "learning_rate": 1.7303388368209337e-05, + "loss": 1.2427, + "mean_token_accuracy": 0.6835728486378988, + "num_tokens": 842056049.0, + "step": 5016 + }, + { + "entropy": 1.6632795631885529, + "epoch": 0.5511521243580236, + "grad_norm": 0.6091153025627136, + "learning_rate": 1.7302245424951783e-05, + "loss": 1.4623, + "mean_token_accuracy": 0.6500665346781412, + "num_tokens": 842275188.0, + "step": 5017 + }, + { + "entropy": 1.6769650379816692, + "epoch": 0.5512619812693966, + "grad_norm": 0.5689995288848877, + "learning_rate": 1.7301102282231e-05, + "loss": 1.4242, + "mean_token_accuracy": 0.649879202246666, + "num_tokens": 842487986.0, + "step": 5018 + }, + { + "entropy": 1.8732970456282299, + "epoch": 0.5513718381807695, + "grad_norm": 0.833006739616394, + "learning_rate": 1.7299958940083168e-05, + "loss": 1.5442, + "mean_token_accuracy": 0.6393305758635203, + "num_tokens": 842638311.0, + "step": 5019 + }, + { + "entropy": 1.7400578459103901, + "epoch": 0.5514816950921425, + "grad_norm": 0.6837904453277588, + "learning_rate": 1.7298815398544474e-05, + "loss": 1.3732, + "mean_token_accuracy": 0.6732922891775767, + "num_tokens": 842788496.0, + "step": 5020 + }, + { + "entropy": 1.735822359720866, + "epoch": 0.5515915520035154, + "grad_norm": 0.6274124979972839, + "learning_rate": 1.729767165765111e-05, + "loss": 1.3365, + "mean_token_accuracy": 0.673115094502767, + "num_tokens": 842980742.0, + "step": 5021 + }, + { + "entropy": 1.7332566777865093, + "epoch": 0.5517014089148884, + "grad_norm": 0.6682114601135254, + "learning_rate": 1.7296527717439285e-05, + "loss": 1.3757, + "mean_token_accuracy": 0.6619204978148142, + "num_tokens": 843135183.0, + "step": 5022 + }, + { + "entropy": 1.6546105941136677, + "epoch": 0.5518112658262613, + "grad_norm": 0.64713054895401, + "learning_rate": 1.7295383577945183e-05, + "loss": 1.4332, + "mean_token_accuracy": 0.6506260534127554, + "num_tokens": 843357807.0, + "step": 5023 + }, + { + "entropy": 1.6766623953978221, + "epoch": 0.5519211227376343, + "grad_norm": 0.6391339898109436, + "learning_rate": 1.7294239239205036e-05, + "loss": 1.4106, + "mean_token_accuracy": 0.6604893952608109, + "num_tokens": 843511591.0, + "step": 5024 + }, + { + "entropy": 1.7070300082365673, + "epoch": 0.5520309796490072, + "grad_norm": 0.6645405888557434, + "learning_rate": 1.7293094701255052e-05, + "loss": 1.4886, + "mean_token_accuracy": 0.6455263296763102, + "num_tokens": 843686858.0, + "step": 5025 + }, + { + "entropy": 1.7414699892203014, + "epoch": 0.5521408365603802, + "grad_norm": 0.7096126079559326, + "learning_rate": 1.7291949964131454e-05, + "loss": 1.5641, + "mean_token_accuracy": 0.6493834306796392, + "num_tokens": 843852202.0, + "step": 5026 + }, + { + "entropy": 1.7158599992593129, + "epoch": 0.552250693471753, + "grad_norm": 0.5695939660072327, + "learning_rate": 1.7290805027870475e-05, + "loss": 1.4462, + "mean_token_accuracy": 0.6488019227981567, + "num_tokens": 844059832.0, + "step": 5027 + }, + { + "entropy": 1.6897384027640026, + "epoch": 0.5523605503831259, + "grad_norm": 0.6646971702575684, + "learning_rate": 1.7289659892508353e-05, + "loss": 1.4142, + "mean_token_accuracy": 0.6486612806717554, + "num_tokens": 844275183.0, + "step": 5028 + }, + { + "entropy": 1.7032889624436696, + "epoch": 0.5524704072944989, + "grad_norm": 0.7060292959213257, + "learning_rate": 1.728851455808133e-05, + "loss": 1.2169, + "mean_token_accuracy": 0.6796109775702158, + "num_tokens": 844383058.0, + "step": 5029 + }, + { + "entropy": 1.7134819130102794, + "epoch": 0.5525802642058718, + "grad_norm": 0.7380111217498779, + "learning_rate": 1.7287369024625652e-05, + "loss": 1.5068, + "mean_token_accuracy": 0.6495644648869833, + "num_tokens": 844531120.0, + "step": 5030 + }, + { + "entropy": 1.6543967723846436, + "epoch": 0.5526901211172448, + "grad_norm": 0.6035121083259583, + "learning_rate": 1.728622329217758e-05, + "loss": 1.4167, + "mean_token_accuracy": 0.6516775141159693, + "num_tokens": 844707979.0, + "step": 5031 + }, + { + "entropy": 1.7068756620089214, + "epoch": 0.5527999780286177, + "grad_norm": 0.8700978755950928, + "learning_rate": 1.7285077360773374e-05, + "loss": 1.3426, + "mean_token_accuracy": 0.6678166339794794, + "num_tokens": 844852167.0, + "step": 5032 + }, + { + "entropy": 1.7389213939507802, + "epoch": 0.5529098349399907, + "grad_norm": 0.6411224603652954, + "learning_rate": 1.7283931230449297e-05, + "loss": 1.4793, + "mean_token_accuracy": 0.6375206708908081, + "num_tokens": 845043801.0, + "step": 5033 + }, + { + "entropy": 1.657431811094284, + "epoch": 0.5530196918513636, + "grad_norm": 0.6940959692001343, + "learning_rate": 1.7282784901241632e-05, + "loss": 1.2937, + "mean_token_accuracy": 0.6709872682889303, + "num_tokens": 845167525.0, + "step": 5034 + }, + { + "entropy": 1.7134600778420765, + "epoch": 0.5531295487627366, + "grad_norm": 0.7191624641418457, + "learning_rate": 1.7281638373186655e-05, + "loss": 1.4491, + "mean_token_accuracy": 0.6502556055784225, + "num_tokens": 845367554.0, + "step": 5035 + }, + { + "entropy": 1.6771671672662098, + "epoch": 0.5532394056741095, + "grad_norm": 0.6489148139953613, + "learning_rate": 1.7280491646320654e-05, + "loss": 1.3884, + "mean_token_accuracy": 0.6516650716463724, + "num_tokens": 845533357.0, + "step": 5036 + }, + { + "entropy": 1.716915915409724, + "epoch": 0.5533492625854824, + "grad_norm": 0.688566267490387, + "learning_rate": 1.7279344720679924e-05, + "loss": 1.4762, + "mean_token_accuracy": 0.6542300681273142, + "num_tokens": 845713492.0, + "step": 5037 + }, + { + "entropy": 1.6464048027992249, + "epoch": 0.5534591194968553, + "grad_norm": 0.6653256416320801, + "learning_rate": 1.727819759630076e-05, + "loss": 1.4109, + "mean_token_accuracy": 0.662084236741066, + "num_tokens": 845908415.0, + "step": 5038 + }, + { + "entropy": 1.663993815581004, + "epoch": 0.5535689764082283, + "grad_norm": 0.651810884475708, + "learning_rate": 1.7277050273219477e-05, + "loss": 1.4258, + "mean_token_accuracy": 0.6560534288485845, + "num_tokens": 846110806.0, + "step": 5039 + }, + { + "entropy": 1.7180224458376567, + "epoch": 0.5536788333196012, + "grad_norm": 0.6927412748336792, + "learning_rate": 1.7275902751472375e-05, + "loss": 1.34, + "mean_token_accuracy": 0.6658701201279958, + "num_tokens": 846291645.0, + "step": 5040 + }, + { + "entropy": 1.7129448254903157, + "epoch": 0.5537886902309741, + "grad_norm": 0.8613117337226868, + "learning_rate": 1.7274755031095782e-05, + "loss": 1.5887, + "mean_token_accuracy": 0.6217222909132639, + "num_tokens": 846451007.0, + "step": 5041 + }, + { + "entropy": 1.7207149465878804, + "epoch": 0.5538985471423471, + "grad_norm": 0.7448726892471313, + "learning_rate": 1.727360711212602e-05, + "loss": 1.3178, + "mean_token_accuracy": 0.6566254794597626, + "num_tokens": 846581772.0, + "step": 5042 + }, + { + "entropy": 1.67302605509758, + "epoch": 0.55400840405372, + "grad_norm": 0.7350447177886963, + "learning_rate": 1.727245899459942e-05, + "loss": 1.5016, + "mean_token_accuracy": 0.6568493197361628, + "num_tokens": 846760830.0, + "step": 5043 + }, + { + "entropy": 1.675746778647105, + "epoch": 0.554118260965093, + "grad_norm": 0.799920916557312, + "learning_rate": 1.7271310678552316e-05, + "loss": 1.3671, + "mean_token_accuracy": 0.661032055815061, + "num_tokens": 846907065.0, + "step": 5044 + }, + { + "entropy": 1.7543572783470154, + "epoch": 0.5542281178764659, + "grad_norm": 0.7802977561950684, + "learning_rate": 1.7270162164021058e-05, + "loss": 1.3284, + "mean_token_accuracy": 0.6583187431097031, + "num_tokens": 847073867.0, + "step": 5045 + }, + { + "entropy": 1.6700835426648457, + "epoch": 0.5543379747878389, + "grad_norm": 0.6491420269012451, + "learning_rate": 1.726901345104199e-05, + "loss": 1.4677, + "mean_token_accuracy": 0.6603880474964777, + "num_tokens": 847231745.0, + "step": 5046 + }, + { + "entropy": 1.7336505154768627, + "epoch": 0.5544478316992117, + "grad_norm": 0.6150977611541748, + "learning_rate": 1.7267864539651476e-05, + "loss": 1.4324, + "mean_token_accuracy": 0.6436419288317362, + "num_tokens": 847459685.0, + "step": 5047 + }, + { + "entropy": 1.6739269097646077, + "epoch": 0.5545576886105847, + "grad_norm": 0.6661935448646545, + "learning_rate": 1.726671542988587e-05, + "loss": 1.3945, + "mean_token_accuracy": 0.6670573254426321, + "num_tokens": 847611418.0, + "step": 5048 + }, + { + "entropy": 1.6247097651163738, + "epoch": 0.5546675455219576, + "grad_norm": 0.6128849983215332, + "learning_rate": 1.7265566121781545e-05, + "loss": 1.3081, + "mean_token_accuracy": 0.6608265737692515, + "num_tokens": 847802471.0, + "step": 5049 + }, + { + "entropy": 1.7174389859040577, + "epoch": 0.5547774024333306, + "grad_norm": 0.7090808153152466, + "learning_rate": 1.7264416615374875e-05, + "loss": 1.4592, + "mean_token_accuracy": 0.640943189462026, + "num_tokens": 847976449.0, + "step": 5050 + }, + { + "entropy": 1.7501269181569417, + "epoch": 0.5548872593447035, + "grad_norm": 0.7363408207893372, + "learning_rate": 1.7263266910702247e-05, + "loss": 1.3542, + "mean_token_accuracy": 0.6537466496229172, + "num_tokens": 848079456.0, + "step": 5051 + }, + { + "entropy": 1.6778157651424408, + "epoch": 0.5549971162560765, + "grad_norm": 0.600395917892456, + "learning_rate": 1.7262117007800033e-05, + "loss": 1.4896, + "mean_token_accuracy": 0.6465341796477636, + "num_tokens": 848310668.0, + "step": 5052 + }, + { + "entropy": 1.7230773468812306, + "epoch": 0.5551069731674494, + "grad_norm": 0.6006616950035095, + "learning_rate": 1.726096690670465e-05, + "loss": 1.3888, + "mean_token_accuracy": 0.6528857400019964, + "num_tokens": 848468063.0, + "step": 5053 + }, + { + "entropy": 1.7066966394583385, + "epoch": 0.5552168300788224, + "grad_norm": 0.69357830286026, + "learning_rate": 1.7259816607452477e-05, + "loss": 1.3105, + "mean_token_accuracy": 0.6618163386980692, + "num_tokens": 848617851.0, + "step": 5054 + }, + { + "entropy": 1.754871626694997, + "epoch": 0.5553266869901953, + "grad_norm": 0.7361278533935547, + "learning_rate": 1.7258666110079933e-05, + "loss": 1.4104, + "mean_token_accuracy": 0.6681383550167084, + "num_tokens": 848741009.0, + "step": 5055 + }, + { + "entropy": 1.7040483752886455, + "epoch": 0.5554365439015682, + "grad_norm": 0.6013309359550476, + "learning_rate": 1.7257515414623427e-05, + "loss": 1.3651, + "mean_token_accuracy": 0.6659112522999445, + "num_tokens": 848888733.0, + "step": 5056 + }, + { + "entropy": 1.6922965149084728, + "epoch": 0.5555464008129412, + "grad_norm": 0.7231853008270264, + "learning_rate": 1.7256364521119377e-05, + "loss": 1.4536, + "mean_token_accuracy": 0.6473261117935181, + "num_tokens": 849101982.0, + "step": 5057 + }, + { + "entropy": 1.6854231754938762, + "epoch": 0.555656257724314, + "grad_norm": 0.6253160238265991, + "learning_rate": 1.7255213429604204e-05, + "loss": 1.3522, + "mean_token_accuracy": 0.6646686444679896, + "num_tokens": 849240891.0, + "step": 5058 + }, + { + "entropy": 1.675849974155426, + "epoch": 0.555766114635687, + "grad_norm": 0.7990770936012268, + "learning_rate": 1.725406214011435e-05, + "loss": 1.3278, + "mean_token_accuracy": 0.6695520381132761, + "num_tokens": 849394620.0, + "step": 5059 + }, + { + "entropy": 1.7733904123306274, + "epoch": 0.5558759715470599, + "grad_norm": 0.6922385096549988, + "learning_rate": 1.7252910652686248e-05, + "loss": 1.459, + "mean_token_accuracy": 0.6450273891290029, + "num_tokens": 849547571.0, + "step": 5060 + }, + { + "entropy": 1.7009385426839192, + "epoch": 0.5559858284584329, + "grad_norm": 0.7180578112602234, + "learning_rate": 1.725175896735634e-05, + "loss": 1.4134, + "mean_token_accuracy": 0.6490115920702616, + "num_tokens": 849694441.0, + "step": 5061 + }, + { + "entropy": 1.6811227997144063, + "epoch": 0.5560956853698058, + "grad_norm": 0.8859359622001648, + "learning_rate": 1.7250607084161078e-05, + "loss": 1.467, + "mean_token_accuracy": 0.636049841841062, + "num_tokens": 849891733.0, + "step": 5062 + }, + { + "entropy": 1.700390100479126, + "epoch": 0.5562055422811788, + "grad_norm": 0.6678200364112854, + "learning_rate": 1.724945500313692e-05, + "loss": 1.3254, + "mean_token_accuracy": 0.6696620285511017, + "num_tokens": 850015651.0, + "step": 5063 + }, + { + "entropy": 1.6472013394037883, + "epoch": 0.5563153991925517, + "grad_norm": 0.6910000443458557, + "learning_rate": 1.7248302724320324e-05, + "loss": 1.4257, + "mean_token_accuracy": 0.6657865395148596, + "num_tokens": 850187479.0, + "step": 5064 + }, + { + "entropy": 1.6618298788865407, + "epoch": 0.5564252561039247, + "grad_norm": 0.6528242826461792, + "learning_rate": 1.7247150247747765e-05, + "loss": 1.3154, + "mean_token_accuracy": 0.6672501713037491, + "num_tokens": 850354624.0, + "step": 5065 + }, + { + "entropy": 1.718745857477188, + "epoch": 0.5565351130152976, + "grad_norm": 0.7377060651779175, + "learning_rate": 1.724599757345571e-05, + "loss": 1.5872, + "mean_token_accuracy": 0.6395946790774664, + "num_tokens": 850531179.0, + "step": 5066 + }, + { + "entropy": 1.6819844444592793, + "epoch": 0.5566449699266706, + "grad_norm": 0.6145383715629578, + "learning_rate": 1.7244844701480654e-05, + "loss": 1.3237, + "mean_token_accuracy": 0.6742121378580729, + "num_tokens": 850668747.0, + "step": 5067 + }, + { + "entropy": 1.6457071900367737, + "epoch": 0.5567548268380434, + "grad_norm": 0.6610442399978638, + "learning_rate": 1.7243691631859075e-05, + "loss": 1.3572, + "mean_token_accuracy": 0.669839675227801, + "num_tokens": 850875419.0, + "step": 5068 + }, + { + "entropy": 1.6283689141273499, + "epoch": 0.5568646837494163, + "grad_norm": 0.6071202754974365, + "learning_rate": 1.7242538364627467e-05, + "loss": 1.2843, + "mean_token_accuracy": 0.6711755692958832, + "num_tokens": 850996689.0, + "step": 5069 + }, + { + "entropy": 1.7722647686799367, + "epoch": 0.5569745406607893, + "grad_norm": 0.6648354530334473, + "learning_rate": 1.7241384899822334e-05, + "loss": 1.4833, + "mean_token_accuracy": 0.6351625472307205, + "num_tokens": 851177217.0, + "step": 5070 + }, + { + "entropy": 1.7500501374403636, + "epoch": 0.5570843975721622, + "grad_norm": 0.6843627691268921, + "learning_rate": 1.724023123748018e-05, + "loss": 1.3283, + "mean_token_accuracy": 0.6687018970648447, + "num_tokens": 851312948.0, + "step": 5071 + }, + { + "entropy": 1.730874131123225, + "epoch": 0.5571942544835352, + "grad_norm": 0.752149224281311, + "learning_rate": 1.723907737763752e-05, + "loss": 1.2589, + "mean_token_accuracy": 0.6734604885180792, + "num_tokens": 851434762.0, + "step": 5072 + }, + { + "entropy": 1.7467269003391266, + "epoch": 0.5573041113949081, + "grad_norm": 0.8023228049278259, + "learning_rate": 1.7237923320330875e-05, + "loss": 1.3556, + "mean_token_accuracy": 0.6621319899956385, + "num_tokens": 851601105.0, + "step": 5073 + }, + { + "entropy": 1.684466113646825, + "epoch": 0.5574139683062811, + "grad_norm": 0.8053759932518005, + "learning_rate": 1.7236769065596765e-05, + "loss": 1.3469, + "mean_token_accuracy": 0.6632759322722753, + "num_tokens": 851761746.0, + "step": 5074 + }, + { + "entropy": 1.7283134460449219, + "epoch": 0.557523825217654, + "grad_norm": 0.7793658971786499, + "learning_rate": 1.7235614613471726e-05, + "loss": 1.2974, + "mean_token_accuracy": 0.6666330446799597, + "num_tokens": 851894994.0, + "step": 5075 + }, + { + "entropy": 1.6891380151112874, + "epoch": 0.557633682129027, + "grad_norm": 0.6810115575790405, + "learning_rate": 1.723445996399229e-05, + "loss": 1.4283, + "mean_token_accuracy": 0.6489190608263016, + "num_tokens": 852019579.0, + "step": 5076 + }, + { + "entropy": 1.720542977253596, + "epoch": 0.5577435390403999, + "grad_norm": 0.7062191963195801, + "learning_rate": 1.723330511719501e-05, + "loss": 1.3266, + "mean_token_accuracy": 0.6564305424690247, + "num_tokens": 852170454.0, + "step": 5077 + }, + { + "entropy": 1.7275803287823994, + "epoch": 0.5578533959517729, + "grad_norm": 0.6995865702629089, + "learning_rate": 1.7232150073116434e-05, + "loss": 1.31, + "mean_token_accuracy": 0.6622872352600098, + "num_tokens": 852276883.0, + "step": 5078 + }, + { + "entropy": 1.735807627439499, + "epoch": 0.5579632528631457, + "grad_norm": 0.7578923106193542, + "learning_rate": 1.7230994831793112e-05, + "loss": 1.5665, + "mean_token_accuracy": 0.6405892074108124, + "num_tokens": 852553346.0, + "step": 5079 + }, + { + "entropy": 1.7293440699577332, + "epoch": 0.5580731097745187, + "grad_norm": 0.6468039751052856, + "learning_rate": 1.722983939326161e-05, + "loss": 1.3335, + "mean_token_accuracy": 0.6611098150412241, + "num_tokens": 852716230.0, + "step": 5080 + }, + { + "entropy": 1.6878082553545635, + "epoch": 0.5581829666858916, + "grad_norm": 0.6970882415771484, + "learning_rate": 1.7228683757558506e-05, + "loss": 1.4551, + "mean_token_accuracy": 0.6598817507425944, + "num_tokens": 852879187.0, + "step": 5081 + }, + { + "entropy": 1.6392480432987213, + "epoch": 0.5582928235972645, + "grad_norm": 0.6725665926933289, + "learning_rate": 1.722752792472036e-05, + "loss": 1.2592, + "mean_token_accuracy": 0.6828918804725012, + "num_tokens": 853012620.0, + "step": 5082 + }, + { + "entropy": 1.6959756811459858, + "epoch": 0.5584026805086375, + "grad_norm": 0.7385476231575012, + "learning_rate": 1.7226371894783768e-05, + "loss": 1.231, + "mean_token_accuracy": 0.6803264965613683, + "num_tokens": 853167262.0, + "step": 5083 + }, + { + "entropy": 1.6905551254749298, + "epoch": 0.5585125374200104, + "grad_norm": 0.6331995725631714, + "learning_rate": 1.7225215667785305e-05, + "loss": 1.3169, + "mean_token_accuracy": 0.6700136860211691, + "num_tokens": 853326108.0, + "step": 5084 + }, + { + "entropy": 1.7254391411940257, + "epoch": 0.5586223943313834, + "grad_norm": 0.7044715285301208, + "learning_rate": 1.7224059243761572e-05, + "loss": 1.4471, + "mean_token_accuracy": 0.6458353300889333, + "num_tokens": 853488209.0, + "step": 5085 + }, + { + "entropy": 1.7297268311182659, + "epoch": 0.5587322512427563, + "grad_norm": 0.7728154063224792, + "learning_rate": 1.7222902622749173e-05, + "loss": 1.281, + "mean_token_accuracy": 0.6573230673869451, + "num_tokens": 853610205.0, + "step": 5086 + }, + { + "entropy": 1.735485980908076, + "epoch": 0.5588421081541293, + "grad_norm": 0.837343692779541, + "learning_rate": 1.7221745804784707e-05, + "loss": 1.3709, + "mean_token_accuracy": 0.6560803353786469, + "num_tokens": 853751392.0, + "step": 5087 + }, + { + "entropy": 1.6720333397388458, + "epoch": 0.5589519650655022, + "grad_norm": 0.6489691138267517, + "learning_rate": 1.722058878990479e-05, + "loss": 1.4012, + "mean_token_accuracy": 0.6520512402057648, + "num_tokens": 853991504.0, + "step": 5088 + }, + { + "entropy": 1.694252997636795, + "epoch": 0.5590618219768752, + "grad_norm": 0.8448305130004883, + "learning_rate": 1.721943157814604e-05, + "loss": 1.4325, + "mean_token_accuracy": 0.6456566154956818, + "num_tokens": 854165712.0, + "step": 5089 + }, + { + "entropy": 1.750052273273468, + "epoch": 0.559171678888248, + "grad_norm": 0.6790991425514221, + "learning_rate": 1.7218274169545082e-05, + "loss": 1.4332, + "mean_token_accuracy": 0.6550088077783585, + "num_tokens": 854287722.0, + "step": 5090 + }, + { + "entropy": 1.7848297357559204, + "epoch": 0.559281535799621, + "grad_norm": 0.6880961060523987, + "learning_rate": 1.721711656413855e-05, + "loss": 1.3807, + "mean_token_accuracy": 0.655860627690951, + "num_tokens": 854402682.0, + "step": 5091 + }, + { + "entropy": 1.7680182953675587, + "epoch": 0.5593913927109939, + "grad_norm": 0.7541852593421936, + "learning_rate": 1.7215958761963085e-05, + "loss": 1.4438, + "mean_token_accuracy": 0.659633050362269, + "num_tokens": 854582653.0, + "step": 5092 + }, + { + "entropy": 1.743993620077769, + "epoch": 0.5595012496223669, + "grad_norm": 0.7386744618415833, + "learning_rate": 1.7214800763055323e-05, + "loss": 1.3355, + "mean_token_accuracy": 0.655690461397171, + "num_tokens": 854733417.0, + "step": 5093 + }, + { + "entropy": 1.7236657241980236, + "epoch": 0.5596111065337398, + "grad_norm": 0.6876170039176941, + "learning_rate": 1.7213642567451917e-05, + "loss": 1.5466, + "mean_token_accuracy": 0.6431126991907755, + "num_tokens": 854876880.0, + "step": 5094 + }, + { + "entropy": 1.6323457062244415, + "epoch": 0.5597209634451128, + "grad_norm": 0.6821046471595764, + "learning_rate": 1.7212484175189522e-05, + "loss": 1.5167, + "mean_token_accuracy": 0.6490619430939356, + "num_tokens": 855089649.0, + "step": 5095 + }, + { + "entropy": 1.6829971273740132, + "epoch": 0.5598308203564857, + "grad_norm": 0.698646605014801, + "learning_rate": 1.7211325586304802e-05, + "loss": 1.4024, + "mean_token_accuracy": 0.6564787675937017, + "num_tokens": 855301819.0, + "step": 5096 + }, + { + "entropy": 1.6667365928490956, + "epoch": 0.5599406772678586, + "grad_norm": 0.6255282759666443, + "learning_rate": 1.721016680083443e-05, + "loss": 1.4059, + "mean_token_accuracy": 0.6555192569891611, + "num_tokens": 855474924.0, + "step": 5097 + }, + { + "entropy": 1.7034937342007954, + "epoch": 0.5600505341792316, + "grad_norm": 0.6177133321762085, + "learning_rate": 1.7209007818815074e-05, + "loss": 1.4076, + "mean_token_accuracy": 0.6450115591287613, + "num_tokens": 855642413.0, + "step": 5098 + }, + { + "entropy": 1.6558322707811992, + "epoch": 0.5601603910906044, + "grad_norm": 0.7308685779571533, + "learning_rate": 1.720784864028342e-05, + "loss": 1.4158, + "mean_token_accuracy": 0.6546875933806101, + "num_tokens": 855823504.0, + "step": 5099 + }, + { + "entropy": 1.7279444734255474, + "epoch": 0.5602702480019774, + "grad_norm": 0.6921755075454712, + "learning_rate": 1.720668926527615e-05, + "loss": 1.3391, + "mean_token_accuracy": 0.6614320774873098, + "num_tokens": 855945864.0, + "step": 5100 + }, + { + "entropy": 1.678322970867157, + "epoch": 0.5603801049133503, + "grad_norm": 0.57282555103302, + "learning_rate": 1.7205529693829965e-05, + "loss": 1.4324, + "mean_token_accuracy": 0.6524877349535624, + "num_tokens": 856171543.0, + "step": 5101 + }, + { + "entropy": 1.712640792131424, + "epoch": 0.5604899618247233, + "grad_norm": 0.6682943105697632, + "learning_rate": 1.720436992598156e-05, + "loss": 1.4408, + "mean_token_accuracy": 0.6717189103364944, + "num_tokens": 856322857.0, + "step": 5102 + }, + { + "entropy": 1.7849741280078888, + "epoch": 0.5605998187360962, + "grad_norm": 0.8261640667915344, + "learning_rate": 1.7203209961767646e-05, + "loss": 1.4446, + "mean_token_accuracy": 0.6582596053679785, + "num_tokens": 856486604.0, + "step": 5103 + }, + { + "entropy": 1.7254948616027832, + "epoch": 0.5607096756474692, + "grad_norm": 0.6827483177185059, + "learning_rate": 1.720204980122493e-05, + "loss": 1.6162, + "mean_token_accuracy": 0.6409385999043783, + "num_tokens": 856730292.0, + "step": 5104 + }, + { + "entropy": 1.7715973059336345, + "epoch": 0.5608195325588421, + "grad_norm": 0.8116368055343628, + "learning_rate": 1.720088944439013e-05, + "loss": 1.3775, + "mean_token_accuracy": 0.6552746097246805, + "num_tokens": 856885143.0, + "step": 5105 + }, + { + "entropy": 1.7679732938607533, + "epoch": 0.5609293894702151, + "grad_norm": 0.6571024060249329, + "learning_rate": 1.7199728891299974e-05, + "loss": 1.3537, + "mean_token_accuracy": 0.6583919723828634, + "num_tokens": 857019977.0, + "step": 5106 + }, + { + "entropy": 1.6773741841316223, + "epoch": 0.561039246381588, + "grad_norm": 0.5981674790382385, + "learning_rate": 1.7198568141991193e-05, + "loss": 1.4617, + "mean_token_accuracy": 0.6360589961210886, + "num_tokens": 857227441.0, + "step": 5107 + }, + { + "entropy": 1.6567221482594807, + "epoch": 0.561149103292961, + "grad_norm": 0.7425564527511597, + "learning_rate": 1.7197407196500525e-05, + "loss": 1.3022, + "mean_token_accuracy": 0.6754785428444544, + "num_tokens": 857391082.0, + "step": 5108 + }, + { + "entropy": 1.7228084901968639, + "epoch": 0.5612589602043339, + "grad_norm": 0.6551631093025208, + "learning_rate": 1.7196246054864708e-05, + "loss": 1.593, + "mean_token_accuracy": 0.6327783366044363, + "num_tokens": 857596607.0, + "step": 5109 + }, + { + "entropy": 1.7758533358573914, + "epoch": 0.5613688171157067, + "grad_norm": 0.6851291656494141, + "learning_rate": 1.71950847171205e-05, + "loss": 1.4648, + "mean_token_accuracy": 0.6309924423694611, + "num_tokens": 857772597.0, + "step": 5110 + }, + { + "entropy": 1.76680189371109, + "epoch": 0.5614786740270797, + "grad_norm": 0.7714706063270569, + "learning_rate": 1.719392318330465e-05, + "loss": 1.3437, + "mean_token_accuracy": 0.6618274201949438, + "num_tokens": 857905051.0, + "step": 5111 + }, + { + "entropy": 1.645568698644638, + "epoch": 0.5615885309384526, + "grad_norm": 0.589038074016571, + "learning_rate": 1.7192761453453924e-05, + "loss": 1.3992, + "mean_token_accuracy": 0.6517779429753622, + "num_tokens": 858079797.0, + "step": 5112 + }, + { + "entropy": 1.7456736266613007, + "epoch": 0.5616983878498256, + "grad_norm": 0.7510016560554504, + "learning_rate": 1.719159952760509e-05, + "loss": 1.3539, + "mean_token_accuracy": 0.6672643373409907, + "num_tokens": 858237023.0, + "step": 5113 + }, + { + "entropy": 1.6943009197711945, + "epoch": 0.5618082447611985, + "grad_norm": 0.646049976348877, + "learning_rate": 1.7190437405794917e-05, + "loss": 1.3997, + "mean_token_accuracy": 0.6529371738433838, + "num_tokens": 858452256.0, + "step": 5114 + }, + { + "entropy": 1.7592908143997192, + "epoch": 0.5619181016725715, + "grad_norm": 0.6745445728302002, + "learning_rate": 1.718927508806019e-05, + "loss": 1.3, + "mean_token_accuracy": 0.6665283391873041, + "num_tokens": 858604338.0, + "step": 5115 + }, + { + "entropy": 1.74533345301946, + "epoch": 0.5620279585839444, + "grad_norm": 0.7311209440231323, + "learning_rate": 1.7188112574437696e-05, + "loss": 1.3826, + "mean_token_accuracy": 0.6617701351642609, + "num_tokens": 858738333.0, + "step": 5116 + }, + { + "entropy": 1.6821747322877247, + "epoch": 0.5621378154953174, + "grad_norm": 0.6801566481590271, + "learning_rate": 1.7186949864964225e-05, + "loss": 1.4588, + "mean_token_accuracy": 0.6743607322374979, + "num_tokens": 858926628.0, + "step": 5117 + }, + { + "entropy": 1.6699562072753906, + "epoch": 0.5622476724066903, + "grad_norm": 0.7358706593513489, + "learning_rate": 1.718578695967658e-05, + "loss": 1.3404, + "mean_token_accuracy": 0.6704433461030325, + "num_tokens": 859072049.0, + "step": 5118 + }, + { + "entropy": 1.6408792237440746, + "epoch": 0.5623575293180633, + "grad_norm": 0.717157781124115, + "learning_rate": 1.718462385861157e-05, + "loss": 1.2962, + "mean_token_accuracy": 0.6739445279041926, + "num_tokens": 859230044.0, + "step": 5119 + }, + { + "entropy": 1.7437108953793843, + "epoch": 0.5624673862294362, + "grad_norm": 1.0900483131408691, + "learning_rate": 1.7183460561806e-05, + "loss": 1.5626, + "mean_token_accuracy": 0.6493689517180125, + "num_tokens": 859372507.0, + "step": 5120 + }, + { + "entropy": 1.6718702812989552, + "epoch": 0.5625772431408091, + "grad_norm": 0.7479756474494934, + "learning_rate": 1.718229706929669e-05, + "loss": 1.2736, + "mean_token_accuracy": 0.6695879648129145, + "num_tokens": 859517643.0, + "step": 5121 + }, + { + "entropy": 1.6360397239526112, + "epoch": 0.562687100052182, + "grad_norm": 0.7399976849555969, + "learning_rate": 1.718113338112046e-05, + "loss": 1.379, + "mean_token_accuracy": 0.6537938465674719, + "num_tokens": 859682232.0, + "step": 5122 + }, + { + "entropy": 1.6917523245016735, + "epoch": 0.5627969569635549, + "grad_norm": 0.7607491612434387, + "learning_rate": 1.7179969497314145e-05, + "loss": 1.4321, + "mean_token_accuracy": 0.6621414522329966, + "num_tokens": 859806144.0, + "step": 5123 + }, + { + "entropy": 1.7270215352376301, + "epoch": 0.5629068138749279, + "grad_norm": 0.663026750087738, + "learning_rate": 1.7178805417914576e-05, + "loss": 1.3397, + "mean_token_accuracy": 0.666211391488711, + "num_tokens": 859949673.0, + "step": 5124 + }, + { + "entropy": 1.6595263083775837, + "epoch": 0.5630166707863008, + "grad_norm": 0.6144124865531921, + "learning_rate": 1.7177641142958604e-05, + "loss": 1.2972, + "mean_token_accuracy": 0.6727373351653417, + "num_tokens": 860090516.0, + "step": 5125 + }, + { + "entropy": 1.7171042064825695, + "epoch": 0.5631265276976738, + "grad_norm": 0.7884184122085571, + "learning_rate": 1.7176476672483077e-05, + "loss": 1.4505, + "mean_token_accuracy": 0.6440162112315496, + "num_tokens": 860271153.0, + "step": 5126 + }, + { + "entropy": 1.6738516787687938, + "epoch": 0.5632363846090467, + "grad_norm": 0.67924964427948, + "learning_rate": 1.717531200652484e-05, + "loss": 1.3207, + "mean_token_accuracy": 0.6723797023296356, + "num_tokens": 860404625.0, + "step": 5127 + }, + { + "entropy": 1.7053898572921753, + "epoch": 0.5633462415204197, + "grad_norm": 0.6389914155006409, + "learning_rate": 1.7174147145120766e-05, + "loss": 1.3741, + "mean_token_accuracy": 0.6581660558780035, + "num_tokens": 860625943.0, + "step": 5128 + }, + { + "entropy": 1.6907204886277516, + "epoch": 0.5634560984317926, + "grad_norm": 0.7094506621360779, + "learning_rate": 1.7172982088307715e-05, + "loss": 1.3378, + "mean_token_accuracy": 0.6622174034516016, + "num_tokens": 860803832.0, + "step": 5129 + }, + { + "entropy": 1.6670372982819874, + "epoch": 0.5635659553431656, + "grad_norm": 0.8901845216751099, + "learning_rate": 1.717181683612256e-05, + "loss": 1.4248, + "mean_token_accuracy": 0.6624594082434972, + "num_tokens": 860978869.0, + "step": 5130 + }, + { + "entropy": 1.667426864306132, + "epoch": 0.5636758122545384, + "grad_norm": 0.7643829584121704, + "learning_rate": 1.717065138860219e-05, + "loss": 1.2617, + "mean_token_accuracy": 0.6825516323248545, + "num_tokens": 861113828.0, + "step": 5131 + }, + { + "entropy": 1.790122111638387, + "epoch": 0.5637856691659114, + "grad_norm": 0.8605037331581116, + "learning_rate": 1.7169485745783475e-05, + "loss": 1.433, + "mean_token_accuracy": 0.6626231670379639, + "num_tokens": 861276229.0, + "step": 5132 + }, + { + "entropy": 1.7115785876909893, + "epoch": 0.5638955260772843, + "grad_norm": 0.7499393820762634, + "learning_rate": 1.716831990770332e-05, + "loss": 1.3592, + "mean_token_accuracy": 0.6678246607383093, + "num_tokens": 861403520.0, + "step": 5133 + }, + { + "entropy": 1.7232838968435924, + "epoch": 0.5640053829886573, + "grad_norm": 0.6720132827758789, + "learning_rate": 1.7167153874398622e-05, + "loss": 1.4723, + "mean_token_accuracy": 0.6535212695598602, + "num_tokens": 861577011.0, + "step": 5134 + }, + { + "entropy": 1.7426664630572002, + "epoch": 0.5641152399000302, + "grad_norm": 0.7186594605445862, + "learning_rate": 1.716598764590628e-05, + "loss": 1.4079, + "mean_token_accuracy": 0.6688724607229233, + "num_tokens": 861725353.0, + "step": 5135 + }, + { + "entropy": 1.6872510810693104, + "epoch": 0.5642250968114031, + "grad_norm": 0.7637690305709839, + "learning_rate": 1.7164821222263207e-05, + "loss": 1.186, + "mean_token_accuracy": 0.6860415786504745, + "num_tokens": 861893900.0, + "step": 5136 + }, + { + "entropy": 1.7718837360541027, + "epoch": 0.5643349537227761, + "grad_norm": 0.7247793674468994, + "learning_rate": 1.7163654603506327e-05, + "loss": 1.6064, + "mean_token_accuracy": 0.6367716689904531, + "num_tokens": 862109571.0, + "step": 5137 + }, + { + "entropy": 1.7636443078517914, + "epoch": 0.564444810634149, + "grad_norm": 0.7421050071716309, + "learning_rate": 1.716248778967255e-05, + "loss": 1.4571, + "mean_token_accuracy": 0.6570049126942953, + "num_tokens": 862250376.0, + "step": 5138 + }, + { + "entropy": 1.7156391243139903, + "epoch": 0.564554667545522, + "grad_norm": 0.7368531227111816, + "learning_rate": 1.7161320780798812e-05, + "loss": 1.4297, + "mean_token_accuracy": 0.6491710195938746, + "num_tokens": 862402788.0, + "step": 5139 + }, + { + "entropy": 1.7167876561482747, + "epoch": 0.5646645244568949, + "grad_norm": 0.6028063297271729, + "learning_rate": 1.716015357692205e-05, + "loss": 1.4445, + "mean_token_accuracy": 0.6677204618851343, + "num_tokens": 862608011.0, + "step": 5140 + }, + { + "entropy": 1.6989375948905945, + "epoch": 0.5647743813682679, + "grad_norm": 0.8523213267326355, + "learning_rate": 1.71589861780792e-05, + "loss": 1.3444, + "mean_token_accuracy": 0.6717568387587866, + "num_tokens": 862778996.0, + "step": 5141 + }, + { + "entropy": 1.6621976296106975, + "epoch": 0.5648842382796407, + "grad_norm": 0.6297332048416138, + "learning_rate": 1.715781858430721e-05, + "loss": 1.4134, + "mean_token_accuracy": 0.6557877908150355, + "num_tokens": 862939813.0, + "step": 5142 + }, + { + "entropy": 1.6616567373275757, + "epoch": 0.5649940951910137, + "grad_norm": 0.6319537162780762, + "learning_rate": 1.7156650795643043e-05, + "loss": 1.3247, + "mean_token_accuracy": 0.6654583762089411, + "num_tokens": 863113346.0, + "step": 5143 + }, + { + "entropy": 1.7592324515183766, + "epoch": 0.5651039521023866, + "grad_norm": 0.6727480888366699, + "learning_rate": 1.715548281212365e-05, + "loss": 1.4165, + "mean_token_accuracy": 0.6434768736362457, + "num_tokens": 863291829.0, + "step": 5144 + }, + { + "entropy": 1.7069471180438995, + "epoch": 0.5652138090137596, + "grad_norm": 0.6831556558609009, + "learning_rate": 1.7154314633785997e-05, + "loss": 1.4489, + "mean_token_accuracy": 0.6467359215021133, + "num_tokens": 863514790.0, + "step": 5145 + }, + { + "entropy": 1.6704001724720001, + "epoch": 0.5653236659251325, + "grad_norm": 0.6945511102676392, + "learning_rate": 1.7153146260667064e-05, + "loss": 1.2975, + "mean_token_accuracy": 0.6630304008722305, + "num_tokens": 863686919.0, + "step": 5146 + }, + { + "entropy": 1.6741840541362762, + "epoch": 0.5654335228365055, + "grad_norm": 0.7052369713783264, + "learning_rate": 1.7151977692803824e-05, + "loss": 1.3397, + "mean_token_accuracy": 0.6724216043949127, + "num_tokens": 863810676.0, + "step": 5147 + }, + { + "entropy": 1.6889431178569794, + "epoch": 0.5655433797478784, + "grad_norm": 0.7486838698387146, + "learning_rate": 1.715080893023326e-05, + "loss": 1.3683, + "mean_token_accuracy": 0.654551774263382, + "num_tokens": 863981409.0, + "step": 5148 + }, + { + "entropy": 1.7260994116465251, + "epoch": 0.5656532366592514, + "grad_norm": 0.776213526725769, + "learning_rate": 1.7149639972992363e-05, + "loss": 1.3205, + "mean_token_accuracy": 0.6820149670044581, + "num_tokens": 864179470.0, + "step": 5149 + }, + { + "entropy": 1.614399919907252, + "epoch": 0.5657630935706243, + "grad_norm": 0.7526430487632751, + "learning_rate": 1.7148470821118135e-05, + "loss": 1.118, + "mean_token_accuracy": 0.6992814292510351, + "num_tokens": 864298750.0, + "step": 5150 + }, + { + "entropy": 1.708221822977066, + "epoch": 0.5658729504819972, + "grad_norm": 0.7361465692520142, + "learning_rate": 1.7147301474647577e-05, + "loss": 1.2939, + "mean_token_accuracy": 0.6688221096992493, + "num_tokens": 864423445.0, + "step": 5151 + }, + { + "entropy": 1.673103392124176, + "epoch": 0.5659828073933701, + "grad_norm": 0.7637960314750671, + "learning_rate": 1.7146131933617695e-05, + "loss": 1.4075, + "mean_token_accuracy": 0.6531636367241541, + "num_tokens": 864600118.0, + "step": 5152 + }, + { + "entropy": 1.6958302358786266, + "epoch": 0.566092664304743, + "grad_norm": 0.6391355395317078, + "learning_rate": 1.7144962198065507e-05, + "loss": 1.323, + "mean_token_accuracy": 0.6684149752060572, + "num_tokens": 864750929.0, + "step": 5153 + }, + { + "entropy": 1.6995809276898701, + "epoch": 0.566202521216116, + "grad_norm": 0.7472272515296936, + "learning_rate": 1.7143792268028036e-05, + "loss": 1.3566, + "mean_token_accuracy": 0.6624608635902405, + "num_tokens": 864923982.0, + "step": 5154 + }, + { + "entropy": 1.708970695734024, + "epoch": 0.5663123781274889, + "grad_norm": 0.7742936611175537, + "learning_rate": 1.7142622143542307e-05, + "loss": 1.3687, + "mean_token_accuracy": 0.657701775431633, + "num_tokens": 865069429.0, + "step": 5155 + }, + { + "entropy": 1.6366569598515828, + "epoch": 0.5664222350388619, + "grad_norm": 0.6137021780014038, + "learning_rate": 1.7141451824645356e-05, + "loss": 1.3238, + "mean_token_accuracy": 0.6683499167362849, + "num_tokens": 865216437.0, + "step": 5156 + }, + { + "entropy": 1.698869526386261, + "epoch": 0.5665320919502348, + "grad_norm": 0.7175676822662354, + "learning_rate": 1.714028131137422e-05, + "loss": 1.4583, + "mean_token_accuracy": 0.6476651877164841, + "num_tokens": 865468974.0, + "step": 5157 + }, + { + "entropy": 1.7646108369032543, + "epoch": 0.5666419488616078, + "grad_norm": 0.6280926465988159, + "learning_rate": 1.713911060376595e-05, + "loss": 1.3422, + "mean_token_accuracy": 0.6493685891230901, + "num_tokens": 865631225.0, + "step": 5158 + }, + { + "entropy": 1.701049913962682, + "epoch": 0.5667518057729807, + "grad_norm": 0.764488935470581, + "learning_rate": 1.7137939701857593e-05, + "loss": 1.4099, + "mean_token_accuracy": 0.671028807759285, + "num_tokens": 865778041.0, + "step": 5159 + }, + { + "entropy": 1.7382206519444783, + "epoch": 0.5668616626843537, + "grad_norm": 0.8662286996841431, + "learning_rate": 1.713676860568621e-05, + "loss": 1.5012, + "mean_token_accuracy": 0.6548273215691248, + "num_tokens": 865997091.0, + "step": 5160 + }, + { + "entropy": 1.7540892759958904, + "epoch": 0.5669715195957266, + "grad_norm": 0.759167492389679, + "learning_rate": 1.7135597315288873e-05, + "loss": 1.2949, + "mean_token_accuracy": 0.6608439882596334, + "num_tokens": 866148237.0, + "step": 5161 + }, + { + "entropy": 1.7543078362941742, + "epoch": 0.5670813765070996, + "grad_norm": 0.6145092844963074, + "learning_rate": 1.7134425830702638e-05, + "loss": 1.4108, + "mean_token_accuracy": 0.6426503856976827, + "num_tokens": 866347643.0, + "step": 5162 + }, + { + "entropy": 1.7160977522532146, + "epoch": 0.5671912334184724, + "grad_norm": 0.6957924365997314, + "learning_rate": 1.7133254151964594e-05, + "loss": 1.4157, + "mean_token_accuracy": 0.6433221797148386, + "num_tokens": 866546498.0, + "step": 5163 + }, + { + "entropy": 1.7012445231278737, + "epoch": 0.5673010903298453, + "grad_norm": 0.6757133603096008, + "learning_rate": 1.7132082279111816e-05, + "loss": 1.3596, + "mean_token_accuracy": 0.6617651581764221, + "num_tokens": 866681108.0, + "step": 5164 + }, + { + "entropy": 1.6479481756687164, + "epoch": 0.5674109472412183, + "grad_norm": 0.6679365634918213, + "learning_rate": 1.71309102121814e-05, + "loss": 1.2512, + "mean_token_accuracy": 0.6751443793376287, + "num_tokens": 866783889.0, + "step": 5165 + }, + { + "entropy": 1.6679150362809498, + "epoch": 0.5675208041525912, + "grad_norm": 0.6778741478919983, + "learning_rate": 1.712973795121044e-05, + "loss": 1.4702, + "mean_token_accuracy": 0.6485533167918524, + "num_tokens": 866960522.0, + "step": 5166 + }, + { + "entropy": 1.7731144726276398, + "epoch": 0.5676306610639642, + "grad_norm": 0.8737553358078003, + "learning_rate": 1.712856549623603e-05, + "loss": 1.5449, + "mean_token_accuracy": 0.646806518236796, + "num_tokens": 867142270.0, + "step": 5167 + }, + { + "entropy": 1.7592595716317494, + "epoch": 0.5677405179753371, + "grad_norm": 0.6949407458305359, + "learning_rate": 1.7127392847295286e-05, + "loss": 1.4931, + "mean_token_accuracy": 0.6439671516418457, + "num_tokens": 867304938.0, + "step": 5168 + }, + { + "entropy": 1.6744161943594615, + "epoch": 0.5678503748867101, + "grad_norm": 0.7399938702583313, + "learning_rate": 1.7126220004425324e-05, + "loss": 1.3912, + "mean_token_accuracy": 0.6706068366765976, + "num_tokens": 867469987.0, + "step": 5169 + }, + { + "entropy": 1.6979783276716869, + "epoch": 0.567960231798083, + "grad_norm": 0.7100719809532166, + "learning_rate": 1.7125046967663255e-05, + "loss": 1.3621, + "mean_token_accuracy": 0.6856355915466944, + "num_tokens": 867633028.0, + "step": 5170 + }, + { + "entropy": 1.6883835395177205, + "epoch": 0.568070088709456, + "grad_norm": 0.6703440546989441, + "learning_rate": 1.7123873737046207e-05, + "loss": 1.4014, + "mean_token_accuracy": 0.6576645423968633, + "num_tokens": 867826775.0, + "step": 5171 + }, + { + "entropy": 1.6956737736860912, + "epoch": 0.5681799456208289, + "grad_norm": 0.5624609589576721, + "learning_rate": 1.7122700312611324e-05, + "loss": 1.435, + "mean_token_accuracy": 0.6521128962437311, + "num_tokens": 868003547.0, + "step": 5172 + }, + { + "entropy": 1.7059629559516907, + "epoch": 0.5682898025322018, + "grad_norm": 0.6375492215156555, + "learning_rate": 1.7121526694395726e-05, + "loss": 1.4911, + "mean_token_accuracy": 0.6470068991184235, + "num_tokens": 868185524.0, + "step": 5173 + }, + { + "entropy": 1.7254150609175365, + "epoch": 0.5683996594435747, + "grad_norm": 0.7593937516212463, + "learning_rate": 1.712035288243657e-05, + "loss": 1.4746, + "mean_token_accuracy": 0.6589858829975128, + "num_tokens": 868345630.0, + "step": 5174 + }, + { + "entropy": 1.7823002735773723, + "epoch": 0.5685095163549477, + "grad_norm": 0.6700795888900757, + "learning_rate": 1.7119178876771004e-05, + "loss": 1.4431, + "mean_token_accuracy": 0.6534734417994817, + "num_tokens": 868487484.0, + "step": 5175 + }, + { + "entropy": 1.7280798256397247, + "epoch": 0.5686193732663206, + "grad_norm": 0.6447996497154236, + "learning_rate": 1.711800467743618e-05, + "loss": 1.43, + "mean_token_accuracy": 0.649931788444519, + "num_tokens": 868665888.0, + "step": 5176 + }, + { + "entropy": 1.6930799186229706, + "epoch": 0.5687292301776935, + "grad_norm": 0.6100133061408997, + "learning_rate": 1.711683028446927e-05, + "loss": 1.4271, + "mean_token_accuracy": 0.6684134354194006, + "num_tokens": 868854573.0, + "step": 5177 + }, + { + "entropy": 1.6725496153036754, + "epoch": 0.5688390870890665, + "grad_norm": 0.6703057885169983, + "learning_rate": 1.7115655697907437e-05, + "loss": 1.3392, + "mean_token_accuracy": 0.6644681443770727, + "num_tokens": 869022423.0, + "step": 5178 + }, + { + "entropy": 1.6850597560405731, + "epoch": 0.5689489440004394, + "grad_norm": 0.7215490937232971, + "learning_rate": 1.7114480917787854e-05, + "loss": 1.3246, + "mean_token_accuracy": 0.675572469830513, + "num_tokens": 869164793.0, + "step": 5179 + }, + { + "entropy": 1.7287063002586365, + "epoch": 0.5690588009118124, + "grad_norm": 0.7284601330757141, + "learning_rate": 1.7113305944147705e-05, + "loss": 1.3834, + "mean_token_accuracy": 0.6688296049833298, + "num_tokens": 869319413.0, + "step": 5180 + }, + { + "entropy": 1.7300913234551747, + "epoch": 0.5691686578231853, + "grad_norm": 0.8334223031997681, + "learning_rate": 1.711213077702418e-05, + "loss": 1.4058, + "mean_token_accuracy": 0.6505585461854935, + "num_tokens": 869499491.0, + "step": 5181 + }, + { + "entropy": 1.6942098836104076, + "epoch": 0.5692785147345583, + "grad_norm": 0.5407121181488037, + "learning_rate": 1.711095541645447e-05, + "loss": 1.4164, + "mean_token_accuracy": 0.6361008981863657, + "num_tokens": 869772386.0, + "step": 5182 + }, + { + "entropy": 1.768281082312266, + "epoch": 0.5693883716459311, + "grad_norm": 0.7168862819671631, + "learning_rate": 1.7109779862475773e-05, + "loss": 1.3441, + "mean_token_accuracy": 0.6586054215828577, + "num_tokens": 869944947.0, + "step": 5183 + }, + { + "entropy": 1.5970544119675953, + "epoch": 0.5694982285573041, + "grad_norm": 0.6595725417137146, + "learning_rate": 1.7108604115125298e-05, + "loss": 1.3401, + "mean_token_accuracy": 0.6841567407051722, + "num_tokens": 870131433.0, + "step": 5184 + }, + { + "entropy": 1.7234369615713756, + "epoch": 0.569608085468677, + "grad_norm": 0.687660276889801, + "learning_rate": 1.7107428174440254e-05, + "loss": 1.2686, + "mean_token_accuracy": 0.6720225811004639, + "num_tokens": 870280419.0, + "step": 5185 + }, + { + "entropy": 1.667283058166504, + "epoch": 0.56971794238005, + "grad_norm": 0.6603217124938965, + "learning_rate": 1.710625204045786e-05, + "loss": 1.4857, + "mean_token_accuracy": 0.6498481879631678, + "num_tokens": 870435215.0, + "step": 5186 + }, + { + "entropy": 1.6752402385075886, + "epoch": 0.5698277992914229, + "grad_norm": 0.6785051226615906, + "learning_rate": 1.7105075713215343e-05, + "loss": 1.4667, + "mean_token_accuracy": 0.6571814368168513, + "num_tokens": 870635341.0, + "step": 5187 + }, + { + "entropy": 1.761198987563451, + "epoch": 0.5699376562027959, + "grad_norm": 0.8082736134529114, + "learning_rate": 1.710389919274993e-05, + "loss": 1.4905, + "mean_token_accuracy": 0.6487453877925873, + "num_tokens": 870824878.0, + "step": 5188 + }, + { + "entropy": 1.6660157044728596, + "epoch": 0.5700475131141688, + "grad_norm": 0.6341331601142883, + "learning_rate": 1.7102722479098855e-05, + "loss": 1.5113, + "mean_token_accuracy": 0.6298314034938812, + "num_tokens": 871034276.0, + "step": 5189 + }, + { + "entropy": 1.6461931069691975, + "epoch": 0.5701573700255417, + "grad_norm": 0.6263504028320312, + "learning_rate": 1.7101545572299368e-05, + "loss": 1.3666, + "mean_token_accuracy": 0.6588836163282394, + "num_tokens": 871199755.0, + "step": 5190 + }, + { + "entropy": 1.7178466320037842, + "epoch": 0.5702672269369147, + "grad_norm": 0.7136973142623901, + "learning_rate": 1.710036847238871e-05, + "loss": 1.3612, + "mean_token_accuracy": 0.6669484178225199, + "num_tokens": 871378421.0, + "step": 5191 + }, + { + "entropy": 1.7081640462080638, + "epoch": 0.5703770838482876, + "grad_norm": 0.6606138348579407, + "learning_rate": 1.709919117940414e-05, + "loss": 1.3281, + "mean_token_accuracy": 0.6701463560263315, + "num_tokens": 871577120.0, + "step": 5192 + }, + { + "entropy": 1.7162544826666515, + "epoch": 0.5704869407596606, + "grad_norm": 0.6978714466094971, + "learning_rate": 1.709801369338292e-05, + "loss": 1.1938, + "mean_token_accuracy": 0.6883192261060079, + "num_tokens": 871681735.0, + "step": 5193 + }, + { + "entropy": 1.7487797538439434, + "epoch": 0.5705967976710334, + "grad_norm": 0.6483967900276184, + "learning_rate": 1.709683601436231e-05, + "loss": 1.4388, + "mean_token_accuracy": 0.6695059786240259, + "num_tokens": 871823015.0, + "step": 5194 + }, + { + "entropy": 1.6930171847343445, + "epoch": 0.5707066545824064, + "grad_norm": 0.6266820430755615, + "learning_rate": 1.709565814237959e-05, + "loss": 1.4659, + "mean_token_accuracy": 0.6464860786994299, + "num_tokens": 871993586.0, + "step": 5195 + }, + { + "entropy": 1.6980145176251729, + "epoch": 0.5708165114937793, + "grad_norm": 0.7094162106513977, + "learning_rate": 1.7094480077472035e-05, + "loss": 1.3551, + "mean_token_accuracy": 0.6648579289515814, + "num_tokens": 872131292.0, + "step": 5196 + }, + { + "entropy": 1.694219281276067, + "epoch": 0.5709263684051523, + "grad_norm": 0.7052621841430664, + "learning_rate": 1.7093301819676935e-05, + "loss": 1.3833, + "mean_token_accuracy": 0.6764810482660929, + "num_tokens": 872314113.0, + "step": 5197 + }, + { + "entropy": 1.7130983074506123, + "epoch": 0.5710362253165252, + "grad_norm": 0.6613668203353882, + "learning_rate": 1.7092123369031575e-05, + "loss": 1.4329, + "mean_token_accuracy": 0.6563561856746674, + "num_tokens": 872505265.0, + "step": 5198 + }, + { + "entropy": 1.7371846238772075, + "epoch": 0.5711460822278982, + "grad_norm": 0.6612892746925354, + "learning_rate": 1.7090944725573254e-05, + "loss": 1.4961, + "mean_token_accuracy": 0.6479866852362951, + "num_tokens": 872658236.0, + "step": 5199 + }, + { + "entropy": 1.6984902322292328, + "epoch": 0.5712559391392711, + "grad_norm": 0.7463762164115906, + "learning_rate": 1.708976588933928e-05, + "loss": 1.5082, + "mean_token_accuracy": 0.6461683760086695, + "num_tokens": 872821473.0, + "step": 5200 + }, + { + "entropy": 1.696275144815445, + "epoch": 0.5713657960506441, + "grad_norm": 0.7056490778923035, + "learning_rate": 1.708858686036696e-05, + "loss": 1.4607, + "mean_token_accuracy": 0.6508284409840902, + "num_tokens": 872984515.0, + "step": 5201 + }, + { + "entropy": 1.626963605483373, + "epoch": 0.571475652962017, + "grad_norm": 0.7029443383216858, + "learning_rate": 1.7087407638693607e-05, + "loss": 1.3804, + "mean_token_accuracy": 0.6720445652802786, + "num_tokens": 873183647.0, + "step": 5202 + }, + { + "entropy": 1.7299024264017742, + "epoch": 0.57158550987339, + "grad_norm": 0.586719810962677, + "learning_rate": 1.7086228224356543e-05, + "loss": 1.3943, + "mean_token_accuracy": 0.6482478181521097, + "num_tokens": 873348548.0, + "step": 5203 + }, + { + "entropy": 1.71010688940684, + "epoch": 0.5716953667847628, + "grad_norm": 0.7238386869430542, + "learning_rate": 1.7085048617393104e-05, + "loss": 1.365, + "mean_token_accuracy": 0.6639541685581207, + "num_tokens": 873549913.0, + "step": 5204 + }, + { + "entropy": 1.6785256763299305, + "epoch": 0.5718052236961357, + "grad_norm": 0.6056387424468994, + "learning_rate": 1.7083868817840617e-05, + "loss": 1.3858, + "mean_token_accuracy": 0.6524512271086375, + "num_tokens": 873733597.0, + "step": 5205 + }, + { + "entropy": 1.6776171326637268, + "epoch": 0.5719150806075087, + "grad_norm": 0.7004038095474243, + "learning_rate": 1.7082688825736424e-05, + "loss": 1.256, + "mean_token_accuracy": 0.6724594185749689, + "num_tokens": 873854790.0, + "step": 5206 + }, + { + "entropy": 1.6850430766741435, + "epoch": 0.5720249375188816, + "grad_norm": 0.6626861691474915, + "learning_rate": 1.7081508641117866e-05, + "loss": 1.3037, + "mean_token_accuracy": 0.6690275172392527, + "num_tokens": 874031885.0, + "step": 5207 + }, + { + "entropy": 1.68301260471344, + "epoch": 0.5721347944302546, + "grad_norm": 1.1614986658096313, + "learning_rate": 1.7080328264022307e-05, + "loss": 1.5599, + "mean_token_accuracy": 0.6682318995396296, + "num_tokens": 874203630.0, + "step": 5208 + }, + { + "entropy": 1.791142870982488, + "epoch": 0.5722446513416275, + "grad_norm": 0.736549973487854, + "learning_rate": 1.7079147694487093e-05, + "loss": 1.3728, + "mean_token_accuracy": 0.6522929718097051, + "num_tokens": 874342919.0, + "step": 5209 + }, + { + "entropy": 1.6703368723392487, + "epoch": 0.5723545082530005, + "grad_norm": 0.6671043634414673, + "learning_rate": 1.7077966932549595e-05, + "loss": 1.401, + "mean_token_accuracy": 0.6502569168806076, + "num_tokens": 874581234.0, + "step": 5210 + }, + { + "entropy": 1.7057646413644154, + "epoch": 0.5724643651643734, + "grad_norm": 0.652962863445282, + "learning_rate": 1.707678597824718e-05, + "loss": 1.4256, + "mean_token_accuracy": 0.648739273349444, + "num_tokens": 874765878.0, + "step": 5211 + }, + { + "entropy": 1.754818469285965, + "epoch": 0.5725742220757464, + "grad_norm": 0.6058507561683655, + "learning_rate": 1.707560483161723e-05, + "loss": 1.5231, + "mean_token_accuracy": 0.6460634718338648, + "num_tokens": 874977004.0, + "step": 5212 + }, + { + "entropy": 1.7265916963418324, + "epoch": 0.5726840789871193, + "grad_norm": 0.6572061777114868, + "learning_rate": 1.7074423492697127e-05, + "loss": 1.5681, + "mean_token_accuracy": 0.6427283038695654, + "num_tokens": 875176239.0, + "step": 5213 + }, + { + "entropy": 1.6362906793753307, + "epoch": 0.5727939358984923, + "grad_norm": 0.7420666217803955, + "learning_rate": 1.7073241961524253e-05, + "loss": 1.455, + "mean_token_accuracy": 0.6566400279601415, + "num_tokens": 875350911.0, + "step": 5214 + }, + { + "entropy": 1.7721679508686066, + "epoch": 0.5729037928098651, + "grad_norm": 0.6848008632659912, + "learning_rate": 1.707206023813601e-05, + "loss": 1.4325, + "mean_token_accuracy": 0.6503975490729014, + "num_tokens": 875533134.0, + "step": 5215 + }, + { + "entropy": 1.7172233561674755, + "epoch": 0.5730136497212381, + "grad_norm": 0.7289333343505859, + "learning_rate": 1.7070878322569797e-05, + "loss": 1.4098, + "mean_token_accuracy": 0.6530760476986567, + "num_tokens": 875694665.0, + "step": 5216 + }, + { + "entropy": 1.7005958954493205, + "epoch": 0.573123506632611, + "grad_norm": 0.7095157504081726, + "learning_rate": 1.706969621486302e-05, + "loss": 1.4573, + "mean_token_accuracy": 0.6517948259909948, + "num_tokens": 875849820.0, + "step": 5217 + }, + { + "entropy": 1.7994357645511627, + "epoch": 0.5732333635439839, + "grad_norm": 0.638963520526886, + "learning_rate": 1.706851391505309e-05, + "loss": 1.5509, + "mean_token_accuracy": 0.6139777153730392, + "num_tokens": 876084718.0, + "step": 5218 + }, + { + "entropy": 1.7117552955945332, + "epoch": 0.5733432204553569, + "grad_norm": 0.6520084738731384, + "learning_rate": 1.7067331423177433e-05, + "loss": 1.4279, + "mean_token_accuracy": 0.649882584810257, + "num_tokens": 876279619.0, + "step": 5219 + }, + { + "entropy": 1.6188062528769176, + "epoch": 0.5734530773667298, + "grad_norm": 0.6222274303436279, + "learning_rate": 1.706614873927347e-05, + "loss": 1.2127, + "mean_token_accuracy": 0.6836750755707423, + "num_tokens": 876402605.0, + "step": 5220 + }, + { + "entropy": 1.7084797322750092, + "epoch": 0.5735629342781028, + "grad_norm": 0.7118093967437744, + "learning_rate": 1.7064965863378634e-05, + "loss": 1.4423, + "mean_token_accuracy": 0.6552711973587672, + "num_tokens": 876580269.0, + "step": 5221 + }, + { + "entropy": 1.6655645966529846, + "epoch": 0.5736727911894757, + "grad_norm": 0.6326475143432617, + "learning_rate": 1.7063782795530357e-05, + "loss": 1.3397, + "mean_token_accuracy": 0.6672480752070745, + "num_tokens": 876767038.0, + "step": 5222 + }, + { + "entropy": 1.7708354194959004, + "epoch": 0.5737826481008487, + "grad_norm": 0.7146939635276794, + "learning_rate": 1.7062599535766092e-05, + "loss": 1.4182, + "mean_token_accuracy": 0.6546067396799723, + "num_tokens": 876905753.0, + "step": 5223 + }, + { + "entropy": 1.6421829263369243, + "epoch": 0.5738925050122216, + "grad_norm": 0.7064340710639954, + "learning_rate": 1.706141608412328e-05, + "loss": 1.5359, + "mean_token_accuracy": 0.6482568581899008, + "num_tokens": 877073242.0, + "step": 5224 + }, + { + "entropy": 1.7002604206403096, + "epoch": 0.5740023619235946, + "grad_norm": 0.7547617554664612, + "learning_rate": 1.706023244063938e-05, + "loss": 1.4337, + "mean_token_accuracy": 0.6474874764680862, + "num_tokens": 877218384.0, + "step": 5225 + }, + { + "entropy": 1.792248547077179, + "epoch": 0.5741122188349674, + "grad_norm": 0.7492245435714722, + "learning_rate": 1.7059048605351857e-05, + "loss": 1.5136, + "mean_token_accuracy": 0.6328116208314896, + "num_tokens": 877364436.0, + "step": 5226 + }, + { + "entropy": 1.7168182233969371, + "epoch": 0.5742220757463404, + "grad_norm": 0.5984399914741516, + "learning_rate": 1.7057864578298175e-05, + "loss": 1.3634, + "mean_token_accuracy": 0.6506437808275223, + "num_tokens": 877520851.0, + "step": 5227 + }, + { + "entropy": 1.6336182951927185, + "epoch": 0.5743319326577133, + "grad_norm": 0.6459986567497253, + "learning_rate": 1.7056680359515807e-05, + "loss": 1.4619, + "mean_token_accuracy": 0.658488447467486, + "num_tokens": 877714817.0, + "step": 5228 + }, + { + "entropy": 1.755718320608139, + "epoch": 0.5744417895690863, + "grad_norm": 0.691868782043457, + "learning_rate": 1.7055495949042236e-05, + "loss": 1.4811, + "mean_token_accuracy": 0.6512856880823771, + "num_tokens": 877896899.0, + "step": 5229 + }, + { + "entropy": 1.7979524632294972, + "epoch": 0.5745516464804592, + "grad_norm": 0.6676319241523743, + "learning_rate": 1.7054311346914948e-05, + "loss": 1.3411, + "mean_token_accuracy": 0.6650152256091436, + "num_tokens": 878029570.0, + "step": 5230 + }, + { + "entropy": 1.7446431517601013, + "epoch": 0.5746615033918321, + "grad_norm": 0.810004711151123, + "learning_rate": 1.705312655317143e-05, + "loss": 1.5819, + "mean_token_accuracy": 0.6404779901107153, + "num_tokens": 878204154.0, + "step": 5231 + }, + { + "entropy": 1.6578894356886547, + "epoch": 0.5747713603032051, + "grad_norm": 0.6192182898521423, + "learning_rate": 1.7051941567849188e-05, + "loss": 1.4529, + "mean_token_accuracy": 0.6516250371932983, + "num_tokens": 878372472.0, + "step": 5232 + }, + { + "entropy": 1.694698413213094, + "epoch": 0.574881217214578, + "grad_norm": 0.7715234160423279, + "learning_rate": 1.7050756390985722e-05, + "loss": 1.3658, + "mean_token_accuracy": 0.6625189731518427, + "num_tokens": 878498037.0, + "step": 5233 + }, + { + "entropy": 1.6993319789568584, + "epoch": 0.574991074125951, + "grad_norm": 0.8013604283332825, + "learning_rate": 1.7049571022618542e-05, + "loss": 1.255, + "mean_token_accuracy": 0.6749153534571329, + "num_tokens": 878628141.0, + "step": 5234 + }, + { + "entropy": 1.7042160034179688, + "epoch": 0.5751009310373238, + "grad_norm": 0.6807699203491211, + "learning_rate": 1.7048385462785165e-05, + "loss": 1.3983, + "mean_token_accuracy": 0.6614261368910471, + "num_tokens": 878808460.0, + "step": 5235 + }, + { + "entropy": 1.7180274625619252, + "epoch": 0.5752107879486968, + "grad_norm": 0.7537125945091248, + "learning_rate": 1.7047199711523114e-05, + "loss": 1.3341, + "mean_token_accuracy": 0.659697949886322, + "num_tokens": 878953257.0, + "step": 5236 + }, + { + "entropy": 1.7314948936303456, + "epoch": 0.5753206448600697, + "grad_norm": 0.6685804128646851, + "learning_rate": 1.7046013768869917e-05, + "loss": 1.3889, + "mean_token_accuracy": 0.6570734431346258, + "num_tokens": 879172278.0, + "step": 5237 + }, + { + "entropy": 1.6946994264920552, + "epoch": 0.5754305017714427, + "grad_norm": 0.691840648651123, + "learning_rate": 1.7044827634863114e-05, + "loss": 1.4046, + "mean_token_accuracy": 0.6577261487642924, + "num_tokens": 879297522.0, + "step": 5238 + }, + { + "entropy": 1.6562682489554088, + "epoch": 0.5755403586828156, + "grad_norm": 0.6487468481063843, + "learning_rate": 1.704364130954023e-05, + "loss": 1.3072, + "mean_token_accuracy": 0.6770086338122686, + "num_tokens": 879456037.0, + "step": 5239 + }, + { + "entropy": 1.7011775175730388, + "epoch": 0.5756502155941886, + "grad_norm": 0.7802998423576355, + "learning_rate": 1.7042454792938827e-05, + "loss": 1.3878, + "mean_token_accuracy": 0.656680092215538, + "num_tokens": 879654239.0, + "step": 5240 + }, + { + "entropy": 1.788252015908559, + "epoch": 0.5757600725055615, + "grad_norm": 0.8169171214103699, + "learning_rate": 1.704126808509645e-05, + "loss": 1.631, + "mean_token_accuracy": 0.627186248699824, + "num_tokens": 879815488.0, + "step": 5241 + }, + { + "entropy": 1.724605659643809, + "epoch": 0.5758699294169345, + "grad_norm": 0.624187707901001, + "learning_rate": 1.7040081186050666e-05, + "loss": 1.4769, + "mean_token_accuracy": 0.6461042215426763, + "num_tokens": 879993713.0, + "step": 5242 + }, + { + "entropy": 1.7722249925136566, + "epoch": 0.5759797863283074, + "grad_norm": 0.7050033211708069, + "learning_rate": 1.703889409583903e-05, + "loss": 1.5217, + "mean_token_accuracy": 0.6346140801906586, + "num_tokens": 880182622.0, + "step": 5243 + }, + { + "entropy": 1.7021553913752239, + "epoch": 0.5760896432396804, + "grad_norm": 0.6425755023956299, + "learning_rate": 1.703770681449912e-05, + "loss": 1.4137, + "mean_token_accuracy": 0.6676051765680313, + "num_tokens": 880327373.0, + "step": 5244 + }, + { + "entropy": 1.7722231149673462, + "epoch": 0.5761995001510533, + "grad_norm": 0.7244229316711426, + "learning_rate": 1.7036519342068507e-05, + "loss": 1.4358, + "mean_token_accuracy": 0.6465661724408468, + "num_tokens": 880481323.0, + "step": 5245 + }, + { + "entropy": 1.7024872402350109, + "epoch": 0.5763093570624261, + "grad_norm": 0.909506618976593, + "learning_rate": 1.7035331678584776e-05, + "loss": 1.4059, + "mean_token_accuracy": 0.6582985470692316, + "num_tokens": 880643104.0, + "step": 5246 + }, + { + "entropy": 1.688640296459198, + "epoch": 0.5764192139737991, + "grad_norm": 0.8793448805809021, + "learning_rate": 1.703414382408552e-05, + "loss": 1.5182, + "mean_token_accuracy": 0.6507097780704498, + "num_tokens": 880824149.0, + "step": 5247 + }, + { + "entropy": 1.7103163798650105, + "epoch": 0.576529070885172, + "grad_norm": 0.6955628395080566, + "learning_rate": 1.703295577860833e-05, + "loss": 1.4729, + "mean_token_accuracy": 0.6492475817600886, + "num_tokens": 881005364.0, + "step": 5248 + }, + { + "entropy": 1.7185686628023784, + "epoch": 0.576638927796545, + "grad_norm": 0.6877673268318176, + "learning_rate": 1.703176754219081e-05, + "loss": 1.36, + "mean_token_accuracy": 0.6541955421368281, + "num_tokens": 881203802.0, + "step": 5249 + }, + { + "entropy": 1.6697140634059906, + "epoch": 0.5767487847079179, + "grad_norm": 0.6812824010848999, + "learning_rate": 1.703057911487056e-05, + "loss": 1.3598, + "mean_token_accuracy": 0.6645796249310175, + "num_tokens": 881348954.0, + "step": 5250 + }, + { + "entropy": 1.7107038895289104, + "epoch": 0.5768586416192909, + "grad_norm": 0.676230788230896, + "learning_rate": 1.70293904966852e-05, + "loss": 1.4258, + "mean_token_accuracy": 0.6471339017152786, + "num_tokens": 881511658.0, + "step": 5251 + }, + { + "entropy": 1.693604399760564, + "epoch": 0.5769684985306638, + "grad_norm": 0.6729671955108643, + "learning_rate": 1.702820168767235e-05, + "loss": 1.3252, + "mean_token_accuracy": 0.6672259618838629, + "num_tokens": 881661578.0, + "step": 5252 + }, + { + "entropy": 1.7389824489752452, + "epoch": 0.5770783554420368, + "grad_norm": 0.7641178369522095, + "learning_rate": 1.7027012687869637e-05, + "loss": 1.32, + "mean_token_accuracy": 0.6692226231098175, + "num_tokens": 881774550.0, + "step": 5253 + }, + { + "entropy": 1.7137807210286458, + "epoch": 0.5771882123534097, + "grad_norm": 0.7289935350418091, + "learning_rate": 1.7025823497314682e-05, + "loss": 1.3686, + "mean_token_accuracy": 0.6578000535567602, + "num_tokens": 881928763.0, + "step": 5254 + }, + { + "entropy": 1.6480054656664531, + "epoch": 0.5772980692647827, + "grad_norm": 0.8076556921005249, + "learning_rate": 1.7024634116045133e-05, + "loss": 1.3671, + "mean_token_accuracy": 0.669852688908577, + "num_tokens": 882123111.0, + "step": 5255 + }, + { + "entropy": 1.6956392228603363, + "epoch": 0.5774079261761556, + "grad_norm": 0.7042696475982666, + "learning_rate": 1.7023444544098624e-05, + "loss": 1.3784, + "mean_token_accuracy": 0.6528898576895396, + "num_tokens": 882287513.0, + "step": 5256 + }, + { + "entropy": 1.6527445713678997, + "epoch": 0.5775177830875285, + "grad_norm": 0.5778684616088867, + "learning_rate": 1.702225478151281e-05, + "loss": 1.403, + "mean_token_accuracy": 0.660379151503245, + "num_tokens": 882511864.0, + "step": 5257 + }, + { + "entropy": 1.7085080047448475, + "epoch": 0.5776276399989014, + "grad_norm": 0.8756363391876221, + "learning_rate": 1.7021064828325347e-05, + "loss": 1.437, + "mean_token_accuracy": 0.6682372838258743, + "num_tokens": 882708567.0, + "step": 5258 + }, + { + "entropy": 1.7350213130315144, + "epoch": 0.5777374969102743, + "grad_norm": 0.7906478047370911, + "learning_rate": 1.7019874684573897e-05, + "loss": 1.4177, + "mean_token_accuracy": 0.6588336328665415, + "num_tokens": 882865802.0, + "step": 5259 + }, + { + "entropy": 1.7045749227205913, + "epoch": 0.5778473538216473, + "grad_norm": 0.6288008689880371, + "learning_rate": 1.7018684350296123e-05, + "loss": 1.3069, + "mean_token_accuracy": 0.6615212808052698, + "num_tokens": 882992111.0, + "step": 5260 + }, + { + "entropy": 1.7101500928401947, + "epoch": 0.5779572107330202, + "grad_norm": 0.6722090840339661, + "learning_rate": 1.7017493825529703e-05, + "loss": 1.3803, + "mean_token_accuracy": 0.6603521555662155, + "num_tokens": 883145631.0, + "step": 5261 + }, + { + "entropy": 1.7001839975516002, + "epoch": 0.5780670676443932, + "grad_norm": 0.6493961811065674, + "learning_rate": 1.7016303110312316e-05, + "loss": 1.3758, + "mean_token_accuracy": 0.6508415341377258, + "num_tokens": 883353534.0, + "step": 5262 + }, + { + "entropy": 1.7026556134223938, + "epoch": 0.5781769245557661, + "grad_norm": 0.5992609858512878, + "learning_rate": 1.7015112204681644e-05, + "loss": 1.4741, + "mean_token_accuracy": 0.6399723639090856, + "num_tokens": 883605305.0, + "step": 5263 + }, + { + "entropy": 1.7291531364123027, + "epoch": 0.5782867814671391, + "grad_norm": 0.8137235641479492, + "learning_rate": 1.7013921108675385e-05, + "loss": 1.4096, + "mean_token_accuracy": 0.6661647657553355, + "num_tokens": 883813260.0, + "step": 5264 + }, + { + "entropy": 1.672690361738205, + "epoch": 0.578396638378512, + "grad_norm": 0.5643144845962524, + "learning_rate": 1.701272982233123e-05, + "loss": 1.3007, + "mean_token_accuracy": 0.6663571248451868, + "num_tokens": 883975166.0, + "step": 5265 + }, + { + "entropy": 1.7014889319737752, + "epoch": 0.578506495289885, + "grad_norm": 0.8740093111991882, + "learning_rate": 1.7011538345686887e-05, + "loss": 1.3405, + "mean_token_accuracy": 0.6774832854668299, + "num_tokens": 884141235.0, + "step": 5266 + }, + { + "entropy": 1.709878146648407, + "epoch": 0.5786163522012578, + "grad_norm": 0.8314535021781921, + "learning_rate": 1.7010346678780062e-05, + "loss": 1.4373, + "mean_token_accuracy": 0.6721083223819733, + "num_tokens": 884279533.0, + "step": 5267 + }, + { + "entropy": 1.7554621994495392, + "epoch": 0.5787262091126308, + "grad_norm": 0.6525247097015381, + "learning_rate": 1.7009154821648478e-05, + "loss": 1.418, + "mean_token_accuracy": 0.6456648210684458, + "num_tokens": 884459831.0, + "step": 5268 + }, + { + "entropy": 1.6662676533063252, + "epoch": 0.5788360660240037, + "grad_norm": 0.765891969203949, + "learning_rate": 1.7007962774329846e-05, + "loss": 1.4096, + "mean_token_accuracy": 0.650817280014356, + "num_tokens": 884612746.0, + "step": 5269 + }, + { + "entropy": 1.6593515574932098, + "epoch": 0.5789459229353767, + "grad_norm": 0.5619024038314819, + "learning_rate": 1.7006770536861902e-05, + "loss": 1.444, + "mean_token_accuracy": 0.6437307397524515, + "num_tokens": 884817763.0, + "step": 5270 + }, + { + "entropy": 1.7293136517206829, + "epoch": 0.5790557798467496, + "grad_norm": 0.7005648612976074, + "learning_rate": 1.7005578109282377e-05, + "loss": 1.4625, + "mean_token_accuracy": 0.6423446436723074, + "num_tokens": 885024512.0, + "step": 5271 + }, + { + "entropy": 1.7019110818703969, + "epoch": 0.5791656367581225, + "grad_norm": 0.6571354269981384, + "learning_rate": 1.700438549162901e-05, + "loss": 1.3648, + "mean_token_accuracy": 0.6714861591657003, + "num_tokens": 885150107.0, + "step": 5272 + }, + { + "entropy": 1.683734953403473, + "epoch": 0.5792754936694955, + "grad_norm": 0.6158427596092224, + "learning_rate": 1.7003192683939547e-05, + "loss": 1.6153, + "mean_token_accuracy": 0.6282309715946516, + "num_tokens": 885329600.0, + "step": 5273 + }, + { + "entropy": 1.7012614409128826, + "epoch": 0.5793853505808684, + "grad_norm": 0.6291089057922363, + "learning_rate": 1.7001999686251743e-05, + "loss": 1.2531, + "mean_token_accuracy": 0.6708623866240183, + "num_tokens": 885477430.0, + "step": 5274 + }, + { + "entropy": 1.7729269862174988, + "epoch": 0.5794952074922414, + "grad_norm": 0.6474009156227112, + "learning_rate": 1.7000806498603354e-05, + "loss": 1.4934, + "mean_token_accuracy": 0.6346175720294317, + "num_tokens": 885718460.0, + "step": 5275 + }, + { + "entropy": 1.646110604206721, + "epoch": 0.5796050644036143, + "grad_norm": 1.444273829460144, + "learning_rate": 1.6999613121032143e-05, + "loss": 1.169, + "mean_token_accuracy": 0.6865449994802475, + "num_tokens": 885930047.0, + "step": 5276 + }, + { + "entropy": 1.733050415913264, + "epoch": 0.5797149213149873, + "grad_norm": 1.0887868404388428, + "learning_rate": 1.6998419553575877e-05, + "loss": 1.5272, + "mean_token_accuracy": 0.6553502380847931, + "num_tokens": 886082958.0, + "step": 5277 + }, + { + "entropy": 1.7067347665627797, + "epoch": 0.5798247782263601, + "grad_norm": 0.7280968427658081, + "learning_rate": 1.6997225796272342e-05, + "loss": 1.3507, + "mean_token_accuracy": 0.6697677026192347, + "num_tokens": 886202974.0, + "step": 5278 + }, + { + "entropy": 1.6995122532049816, + "epoch": 0.5799346351377331, + "grad_norm": 0.6713830232620239, + "learning_rate": 1.6996031849159304e-05, + "loss": 1.3817, + "mean_token_accuracy": 0.6518794447183609, + "num_tokens": 886396571.0, + "step": 5279 + }, + { + "entropy": 1.7079652845859528, + "epoch": 0.580044492049106, + "grad_norm": 0.7544903755187988, + "learning_rate": 1.6994837712274566e-05, + "loss": 1.3505, + "mean_token_accuracy": 0.663679818312327, + "num_tokens": 886534124.0, + "step": 5280 + }, + { + "entropy": 1.718712459007899, + "epoch": 0.580154348960479, + "grad_norm": 0.6247344613075256, + "learning_rate": 1.6993643385655914e-05, + "loss": 1.3006, + "mean_token_accuracy": 0.6607035100460052, + "num_tokens": 886670601.0, + "step": 5281 + }, + { + "entropy": 1.730804314215978, + "epoch": 0.5802642058718519, + "grad_norm": 0.5982744693756104, + "learning_rate": 1.6992448869341147e-05, + "loss": 1.4283, + "mean_token_accuracy": 0.6605020463466644, + "num_tokens": 886849250.0, + "step": 5282 + }, + { + "entropy": 1.6990150213241577, + "epoch": 0.5803740627832249, + "grad_norm": 0.7501466274261475, + "learning_rate": 1.6991254163368077e-05, + "loss": 1.4045, + "mean_token_accuracy": 0.6659552901983261, + "num_tokens": 886994164.0, + "step": 5283 + }, + { + "entropy": 1.7572198311487834, + "epoch": 0.5804839196945978, + "grad_norm": 0.7074136734008789, + "learning_rate": 1.699005926777451e-05, + "loss": 1.6011, + "mean_token_accuracy": 0.6400540322065353, + "num_tokens": 887194351.0, + "step": 5284 + }, + { + "entropy": 1.6333990295728047, + "epoch": 0.5805937766059707, + "grad_norm": 0.8395227193832397, + "learning_rate": 1.698886418259827e-05, + "loss": 1.2298, + "mean_token_accuracy": 0.68232361972332, + "num_tokens": 887338240.0, + "step": 5285 + }, + { + "entropy": 1.7285989026228588, + "epoch": 0.5807036335173437, + "grad_norm": 0.6749279499053955, + "learning_rate": 1.6987668907877176e-05, + "loss": 1.5952, + "mean_token_accuracy": 0.637232648829619, + "num_tokens": 887543389.0, + "step": 5286 + }, + { + "entropy": 1.6935764054457347, + "epoch": 0.5808134904287166, + "grad_norm": 0.6975464224815369, + "learning_rate": 1.6986473443649058e-05, + "loss": 1.3784, + "mean_token_accuracy": 0.6656419287125269, + "num_tokens": 887737148.0, + "step": 5287 + }, + { + "entropy": 1.7259495158990223, + "epoch": 0.5809233473400895, + "grad_norm": 0.6545588970184326, + "learning_rate": 1.698527778995175e-05, + "loss": 1.4611, + "mean_token_accuracy": 0.6462489118178686, + "num_tokens": 887956818.0, + "step": 5288 + }, + { + "entropy": 1.7162467340628307, + "epoch": 0.5810332042514624, + "grad_norm": 0.7039337158203125, + "learning_rate": 1.6984081946823102e-05, + "loss": 1.416, + "mean_token_accuracy": 0.6598154058059057, + "num_tokens": 888168154.0, + "step": 5289 + }, + { + "entropy": 1.7691878577073414, + "epoch": 0.5811430611628354, + "grad_norm": 0.7095263004302979, + "learning_rate": 1.698288591430096e-05, + "loss": 1.3154, + "mean_token_accuracy": 0.661227265993754, + "num_tokens": 888288120.0, + "step": 5290 + }, + { + "entropy": 1.66457137465477, + "epoch": 0.5812529180742083, + "grad_norm": 0.7228137850761414, + "learning_rate": 1.6981689692423166e-05, + "loss": 1.3557, + "mean_token_accuracy": 0.668318991859754, + "num_tokens": 888435126.0, + "step": 5291 + }, + { + "entropy": 1.7254151900609334, + "epoch": 0.5813627749855813, + "grad_norm": 0.6591452360153198, + "learning_rate": 1.6980493281227595e-05, + "loss": 1.426, + "mean_token_accuracy": 0.6703950862089793, + "num_tokens": 888628454.0, + "step": 5292 + }, + { + "entropy": 1.7352023720741272, + "epoch": 0.5814726318969542, + "grad_norm": 0.7988258004188538, + "learning_rate": 1.6979296680752103e-05, + "loss": 1.4883, + "mean_token_accuracy": 0.6490569015343984, + "num_tokens": 888814092.0, + "step": 5293 + }, + { + "entropy": 1.6795012454191844, + "epoch": 0.5815824888083272, + "grad_norm": 0.8168243169784546, + "learning_rate": 1.6978099891034564e-05, + "loss": 1.3261, + "mean_token_accuracy": 0.6650880227486292, + "num_tokens": 888976522.0, + "step": 5294 + }, + { + "entropy": 1.7140100101629894, + "epoch": 0.5816923457197001, + "grad_norm": 0.6555765867233276, + "learning_rate": 1.6976902912112862e-05, + "loss": 1.3571, + "mean_token_accuracy": 0.668232391277949, + "num_tokens": 889116020.0, + "step": 5295 + }, + { + "entropy": 1.6489605208237965, + "epoch": 0.5818022026310731, + "grad_norm": 0.5820825695991516, + "learning_rate": 1.6975705744024875e-05, + "loss": 1.3159, + "mean_token_accuracy": 0.6601169308026632, + "num_tokens": 889297392.0, + "step": 5296 + }, + { + "entropy": 1.726332853237788, + "epoch": 0.581912059542446, + "grad_norm": 0.7054926156997681, + "learning_rate": 1.697450838680849e-05, + "loss": 1.5213, + "mean_token_accuracy": 0.6465317706267039, + "num_tokens": 889440904.0, + "step": 5297 + }, + { + "entropy": 1.675535907347997, + "epoch": 0.582021916453819, + "grad_norm": 0.6241676211357117, + "learning_rate": 1.697331084050161e-05, + "loss": 1.3933, + "mean_token_accuracy": 0.6527293970187505, + "num_tokens": 889625922.0, + "step": 5298 + }, + { + "entropy": 1.6990026930967967, + "epoch": 0.5821317733651918, + "grad_norm": 0.7096611261367798, + "learning_rate": 1.6972113105142134e-05, + "loss": 1.3727, + "mean_token_accuracy": 0.6669509063164393, + "num_tokens": 889747954.0, + "step": 5299 + }, + { + "entropy": 1.7199326157569885, + "epoch": 0.5822416302765647, + "grad_norm": 0.7114792466163635, + "learning_rate": 1.697091518076797e-05, + "loss": 1.7058, + "mean_token_accuracy": 0.6259814451138178, + "num_tokens": 889963566.0, + "step": 5300 + }, + { + "entropy": 1.728680282831192, + "epoch": 0.5823514871879377, + "grad_norm": 0.6678593158721924, + "learning_rate": 1.6969717067417027e-05, + "loss": 1.3901, + "mean_token_accuracy": 0.6541063139835993, + "num_tokens": 890097701.0, + "step": 5301 + }, + { + "entropy": 1.7306519746780396, + "epoch": 0.5824613440993106, + "grad_norm": 0.621885359287262, + "learning_rate": 1.6968518765127234e-05, + "loss": 1.458, + "mean_token_accuracy": 0.6357903728882471, + "num_tokens": 890300403.0, + "step": 5302 + }, + { + "entropy": 1.7454725603262584, + "epoch": 0.5825712010106836, + "grad_norm": 0.7045135498046875, + "learning_rate": 1.696732027393651e-05, + "loss": 1.5412, + "mean_token_accuracy": 0.6473953574895859, + "num_tokens": 890504802.0, + "step": 5303 + }, + { + "entropy": 1.7602245509624481, + "epoch": 0.5826810579220565, + "grad_norm": 0.6078582406044006, + "learning_rate": 1.6966121593882783e-05, + "loss": 1.4724, + "mean_token_accuracy": 0.6533922801415125, + "num_tokens": 890684992.0, + "step": 5304 + }, + { + "entropy": 1.6792764365673065, + "epoch": 0.5827909148334295, + "grad_norm": 0.626315176486969, + "learning_rate": 1.6964922725004e-05, + "loss": 1.2841, + "mean_token_accuracy": 0.6726724753777186, + "num_tokens": 890831358.0, + "step": 5305 + }, + { + "entropy": 1.681016316016515, + "epoch": 0.5829007717448024, + "grad_norm": 0.6014984846115112, + "learning_rate": 1.6963723667338104e-05, + "loss": 1.3824, + "mean_token_accuracy": 0.6497796426216761, + "num_tokens": 891039826.0, + "step": 5306 + }, + { + "entropy": 1.6937636534372966, + "epoch": 0.5830106286561754, + "grad_norm": 0.616007924079895, + "learning_rate": 1.696252442092304e-05, + "loss": 1.3501, + "mean_token_accuracy": 0.6541839092969894, + "num_tokens": 891155706.0, + "step": 5307 + }, + { + "entropy": 1.689687172571818, + "epoch": 0.5831204855675483, + "grad_norm": 0.5852237343788147, + "learning_rate": 1.696132498579676e-05, + "loss": 1.3242, + "mean_token_accuracy": 0.6588374376296997, + "num_tokens": 891334512.0, + "step": 5308 + }, + { + "entropy": 1.7258604069550831, + "epoch": 0.5832303424789212, + "grad_norm": 0.6953949332237244, + "learning_rate": 1.6960125361997232e-05, + "loss": 1.4621, + "mean_token_accuracy": 0.6593173642953237, + "num_tokens": 891501515.0, + "step": 5309 + }, + { + "entropy": 1.710837850968043, + "epoch": 0.5833401993902941, + "grad_norm": 0.6133494973182678, + "learning_rate": 1.6958925549562423e-05, + "loss": 1.5322, + "mean_token_accuracy": 0.6437032918135325, + "num_tokens": 891691280.0, + "step": 5310 + }, + { + "entropy": 1.637757400671641, + "epoch": 0.5834500563016671, + "grad_norm": 0.6457231044769287, + "learning_rate": 1.6957725548530307e-05, + "loss": 1.2902, + "mean_token_accuracy": 0.6730124702056249, + "num_tokens": 891841543.0, + "step": 5311 + }, + { + "entropy": 1.676180859406789, + "epoch": 0.58355991321304, + "grad_norm": 0.6418126225471497, + "learning_rate": 1.6956525358938866e-05, + "loss": 1.5549, + "mean_token_accuracy": 0.6378814553221067, + "num_tokens": 892038000.0, + "step": 5312 + }, + { + "entropy": 1.7321598728497822, + "epoch": 0.5836697701244129, + "grad_norm": 0.6593100428581238, + "learning_rate": 1.6955324980826073e-05, + "loss": 1.4845, + "mean_token_accuracy": 0.6519307891527811, + "num_tokens": 892184783.0, + "step": 5313 + }, + { + "entropy": 1.7277933657169342, + "epoch": 0.5837796270357859, + "grad_norm": 0.7424345016479492, + "learning_rate": 1.695412441422993e-05, + "loss": 1.455, + "mean_token_accuracy": 0.6451116353273392, + "num_tokens": 892375390.0, + "step": 5314 + }, + { + "entropy": 1.7113395134607952, + "epoch": 0.5838894839471588, + "grad_norm": 0.705172061920166, + "learning_rate": 1.6952923659188437e-05, + "loss": 1.276, + "mean_token_accuracy": 0.6679862240950266, + "num_tokens": 892535909.0, + "step": 5315 + }, + { + "entropy": 1.589612990617752, + "epoch": 0.5839993408585318, + "grad_norm": 0.5760466456413269, + "learning_rate": 1.6951722715739584e-05, + "loss": 1.3377, + "mean_token_accuracy": 0.6666668653488159, + "num_tokens": 892712710.0, + "step": 5316 + }, + { + "entropy": 1.6956795851389568, + "epoch": 0.5841091977699047, + "grad_norm": 0.8694625496864319, + "learning_rate": 1.6950521583921397e-05, + "loss": 1.4102, + "mean_token_accuracy": 0.6633008221785227, + "num_tokens": 892862269.0, + "step": 5317 + }, + { + "entropy": 1.7499979138374329, + "epoch": 0.5842190546812777, + "grad_norm": 0.6729763150215149, + "learning_rate": 1.694932026377188e-05, + "loss": 1.5164, + "mean_token_accuracy": 0.6467028011878332, + "num_tokens": 893046579.0, + "step": 5318 + }, + { + "entropy": 1.7273075977961223, + "epoch": 0.5843289115926505, + "grad_norm": 0.7413749098777771, + "learning_rate": 1.6948118755329058e-05, + "loss": 1.6088, + "mean_token_accuracy": 0.6405004958311716, + "num_tokens": 893236120.0, + "step": 5319 + }, + { + "entropy": 1.763388415177663, + "epoch": 0.5844387685040235, + "grad_norm": 0.7787388563156128, + "learning_rate": 1.6946917058630955e-05, + "loss": 1.3354, + "mean_token_accuracy": 0.6630240182081858, + "num_tokens": 893371667.0, + "step": 5320 + }, + { + "entropy": 1.7534803748130798, + "epoch": 0.5845486254153964, + "grad_norm": 0.8445398211479187, + "learning_rate": 1.6945715173715613e-05, + "loss": 0.913, + "mean_token_accuracy": 0.6981311688820521, + "num_tokens": 893492510.0, + "step": 5321 + }, + { + "entropy": 1.7531782786051433, + "epoch": 0.5846584823267694, + "grad_norm": 0.7116334438323975, + "learning_rate": 1.694451310062106e-05, + "loss": 1.3404, + "mean_token_accuracy": 0.6567851354678472, + "num_tokens": 893645939.0, + "step": 5322 + }, + { + "entropy": 1.7355563342571259, + "epoch": 0.5847683392381423, + "grad_norm": 0.6662753224372864, + "learning_rate": 1.6943310839385346e-05, + "loss": 1.3407, + "mean_token_accuracy": 0.6634679039319357, + "num_tokens": 893779093.0, + "step": 5323 + }, + { + "entropy": 1.7133808135986328, + "epoch": 0.5848781961495153, + "grad_norm": 0.7315980792045593, + "learning_rate": 1.6942108390046523e-05, + "loss": 1.5366, + "mean_token_accuracy": 0.6463624636332194, + "num_tokens": 893920200.0, + "step": 5324 + }, + { + "entropy": 1.733003169298172, + "epoch": 0.5849880530608882, + "grad_norm": 0.7187158465385437, + "learning_rate": 1.6940905752642648e-05, + "loss": 1.3796, + "mean_token_accuracy": 0.6481821984052658, + "num_tokens": 894070681.0, + "step": 5325 + }, + { + "entropy": 1.7708965142567952, + "epoch": 0.5850979099722611, + "grad_norm": 0.8311673998832703, + "learning_rate": 1.693970292721178e-05, + "loss": 1.518, + "mean_token_accuracy": 0.6394187857707342, + "num_tokens": 894259155.0, + "step": 5326 + }, + { + "entropy": 1.7388906975587208, + "epoch": 0.5852077668836341, + "grad_norm": 0.7034797072410583, + "learning_rate": 1.6938499913791996e-05, + "loss": 1.4722, + "mean_token_accuracy": 0.6401112427314123, + "num_tokens": 894413541.0, + "step": 5327 + }, + { + "entropy": 1.6831568082173665, + "epoch": 0.585317623795007, + "grad_norm": 0.6965767741203308, + "learning_rate": 1.6937296712421364e-05, + "loss": 1.2915, + "mean_token_accuracy": 0.6696919500827789, + "num_tokens": 894541182.0, + "step": 5328 + }, + { + "entropy": 1.6878312130769093, + "epoch": 0.58542748070638, + "grad_norm": 0.6519395709037781, + "learning_rate": 1.6936093323137963e-05, + "loss": 1.2841, + "mean_token_accuracy": 0.6752176831165949, + "num_tokens": 894678154.0, + "step": 5329 + }, + { + "entropy": 1.6015850404898326, + "epoch": 0.5855373376177528, + "grad_norm": 0.7328821420669556, + "learning_rate": 1.6934889745979886e-05, + "loss": 1.2998, + "mean_token_accuracy": 0.6716904441515604, + "num_tokens": 894839877.0, + "step": 5330 + }, + { + "entropy": 1.6900799870491028, + "epoch": 0.5856471945291258, + "grad_norm": 0.6380017399787903, + "learning_rate": 1.6933685980985224e-05, + "loss": 1.4289, + "mean_token_accuracy": 0.6551753083864847, + "num_tokens": 894977460.0, + "step": 5331 + }, + { + "entropy": 1.7345438599586487, + "epoch": 0.5857570514404987, + "grad_norm": 0.7761285901069641, + "learning_rate": 1.6932482028192074e-05, + "loss": 1.2974, + "mean_token_accuracy": 0.6700637092192968, + "num_tokens": 895105165.0, + "step": 5332 + }, + { + "entropy": 1.6849959095319111, + "epoch": 0.5858669083518717, + "grad_norm": 0.6113293170928955, + "learning_rate": 1.6931277887638537e-05, + "loss": 1.4208, + "mean_token_accuracy": 0.6713494658470154, + "num_tokens": 895276035.0, + "step": 5333 + }, + { + "entropy": 1.7273829380671184, + "epoch": 0.5859767652632446, + "grad_norm": 0.6847710013389587, + "learning_rate": 1.6930073559362732e-05, + "loss": 1.3862, + "mean_token_accuracy": 0.6546449114878973, + "num_tokens": 895459349.0, + "step": 5334 + }, + { + "entropy": 1.7116826176643372, + "epoch": 0.5860866221746176, + "grad_norm": 0.6733460426330566, + "learning_rate": 1.692886904340277e-05, + "loss": 1.3885, + "mean_token_accuracy": 0.6552889744440714, + "num_tokens": 895648082.0, + "step": 5335 + }, + { + "entropy": 1.7135928471883137, + "epoch": 0.5861964790859905, + "grad_norm": 0.7363564968109131, + "learning_rate": 1.6927664339796773e-05, + "loss": 1.2452, + "mean_token_accuracy": 0.6762679914633433, + "num_tokens": 895792045.0, + "step": 5336 + }, + { + "entropy": 1.708931068579356, + "epoch": 0.5863063359973635, + "grad_norm": 0.6197008490562439, + "learning_rate": 1.692645944858287e-05, + "loss": 1.5095, + "mean_token_accuracy": 0.6281079649925232, + "num_tokens": 896003794.0, + "step": 5337 + }, + { + "entropy": 1.7107830742994945, + "epoch": 0.5864161929087364, + "grad_norm": 0.7372785210609436, + "learning_rate": 1.69252543697992e-05, + "loss": 1.4289, + "mean_token_accuracy": 0.6556178480386734, + "num_tokens": 896152637.0, + "step": 5338 + }, + { + "entropy": 1.6730316678682964, + "epoch": 0.5865260498201094, + "grad_norm": 0.7120349407196045, + "learning_rate": 1.6924049103483896e-05, + "loss": 1.4772, + "mean_token_accuracy": 0.6575778424739838, + "num_tokens": 896352786.0, + "step": 5339 + }, + { + "entropy": 1.7376192808151245, + "epoch": 0.5866359067314822, + "grad_norm": 0.7281277775764465, + "learning_rate": 1.692284364967511e-05, + "loss": 1.4657, + "mean_token_accuracy": 0.6524485051631927, + "num_tokens": 896493812.0, + "step": 5340 + }, + { + "entropy": 1.7086794475714366, + "epoch": 0.5867457636428551, + "grad_norm": 0.6899880766868591, + "learning_rate": 1.6921638008410984e-05, + "loss": 1.3757, + "mean_token_accuracy": 0.6638755599657694, + "num_tokens": 896656202.0, + "step": 5341 + }, + { + "entropy": 1.7046737869580586, + "epoch": 0.5868556205542281, + "grad_norm": 0.6136064529418945, + "learning_rate": 1.692043217972969e-05, + "loss": 1.4449, + "mean_token_accuracy": 0.6559189210335413, + "num_tokens": 896874423.0, + "step": 5342 + }, + { + "entropy": 1.7691023747126262, + "epoch": 0.586965477465601, + "grad_norm": 0.6861811876296997, + "learning_rate": 1.6919226163669385e-05, + "loss": 1.5529, + "mean_token_accuracy": 0.6326283564170202, + "num_tokens": 897049573.0, + "step": 5343 + }, + { + "entropy": 1.703104058901469, + "epoch": 0.587075334376974, + "grad_norm": 0.6944000720977783, + "learning_rate": 1.691801996026824e-05, + "loss": 1.3242, + "mean_token_accuracy": 0.6579422255357107, + "num_tokens": 897206230.0, + "step": 5344 + }, + { + "entropy": 1.766764263312022, + "epoch": 0.5871851912883469, + "grad_norm": 0.6695455312728882, + "learning_rate": 1.6916813569564428e-05, + "loss": 1.3056, + "mean_token_accuracy": 0.6696594009796778, + "num_tokens": 897363683.0, + "step": 5345 + }, + { + "entropy": 1.779427985350291, + "epoch": 0.5872950481997199, + "grad_norm": 0.7089075446128845, + "learning_rate": 1.6915606991596132e-05, + "loss": 1.3939, + "mean_token_accuracy": 0.6523445149262747, + "num_tokens": 897537706.0, + "step": 5346 + }, + { + "entropy": 1.7377333045005798, + "epoch": 0.5874049051110928, + "grad_norm": 0.7925550937652588, + "learning_rate": 1.691440022640154e-05, + "loss": 1.4406, + "mean_token_accuracy": 0.651837890346845, + "num_tokens": 897722349.0, + "step": 5347 + }, + { + "entropy": 1.673417071501414, + "epoch": 0.5875147620224658, + "grad_norm": 0.77927565574646, + "learning_rate": 1.6913193274018848e-05, + "loss": 1.3656, + "mean_token_accuracy": 0.6586094995339712, + "num_tokens": 897887572.0, + "step": 5348 + }, + { + "entropy": 1.6529791951179504, + "epoch": 0.5876246189338387, + "grad_norm": 0.7137171626091003, + "learning_rate": 1.6911986134486252e-05, + "loss": 1.3981, + "mean_token_accuracy": 0.6643014947573344, + "num_tokens": 898057976.0, + "step": 5349 + }, + { + "entropy": 1.7111935218175252, + "epoch": 0.5877344758452117, + "grad_norm": 0.6308448910713196, + "learning_rate": 1.691077880784196e-05, + "loss": 1.2921, + "mean_token_accuracy": 0.6741099208593369, + "num_tokens": 898208741.0, + "step": 5350 + }, + { + "entropy": 1.7605046530564625, + "epoch": 0.5878443327565845, + "grad_norm": 0.6674914956092834, + "learning_rate": 1.6909571294124184e-05, + "loss": 1.5404, + "mean_token_accuracy": 0.6369107812643051, + "num_tokens": 898382084.0, + "step": 5351 + }, + { + "entropy": 1.723831405242284, + "epoch": 0.5879541896679575, + "grad_norm": 0.8111042976379395, + "learning_rate": 1.6908363593371134e-05, + "loss": 1.4523, + "mean_token_accuracy": 0.6531597375869751, + "num_tokens": 898528919.0, + "step": 5352 + }, + { + "entropy": 1.7647210558255513, + "epoch": 0.5880640465793304, + "grad_norm": 0.6686208844184875, + "learning_rate": 1.6907155705621044e-05, + "loss": 1.452, + "mean_token_accuracy": 0.6530319501956304, + "num_tokens": 898678535.0, + "step": 5353 + }, + { + "entropy": 1.7188012103239696, + "epoch": 0.5881739034907033, + "grad_norm": 0.6231117248535156, + "learning_rate": 1.6905947630912137e-05, + "loss": 1.4291, + "mean_token_accuracy": 0.6495647728443146, + "num_tokens": 898862122.0, + "step": 5354 + }, + { + "entropy": 1.7267694274584453, + "epoch": 0.5882837604020763, + "grad_norm": 0.6327919960021973, + "learning_rate": 1.6904739369282646e-05, + "loss": 1.3621, + "mean_token_accuracy": 0.6660894205172857, + "num_tokens": 899026487.0, + "step": 5355 + }, + { + "entropy": 1.732596476872762, + "epoch": 0.5883936173134492, + "grad_norm": 0.6500836610794067, + "learning_rate": 1.6903530920770818e-05, + "loss": 1.3427, + "mean_token_accuracy": 0.660812055071195, + "num_tokens": 899202132.0, + "step": 5356 + }, + { + "entropy": 1.7914599478244781, + "epoch": 0.5885034742248222, + "grad_norm": 0.7583115100860596, + "learning_rate": 1.6902322285414893e-05, + "loss": 1.3587, + "mean_token_accuracy": 0.6623788376649221, + "num_tokens": 899361031.0, + "step": 5357 + }, + { + "entropy": 1.6479543348153431, + "epoch": 0.5886133311361951, + "grad_norm": 0.6269407868385315, + "learning_rate": 1.6901113463253126e-05, + "loss": 1.2791, + "mean_token_accuracy": 0.6785383919874827, + "num_tokens": 899498069.0, + "step": 5358 + }, + { + "entropy": 1.6799676318963368, + "epoch": 0.5887231880475681, + "grad_norm": 0.6469370722770691, + "learning_rate": 1.6899904454323782e-05, + "loss": 1.3679, + "mean_token_accuracy": 0.6630029827356339, + "num_tokens": 899695117.0, + "step": 5359 + }, + { + "entropy": 1.7061224579811096, + "epoch": 0.588833044958941, + "grad_norm": 0.6230675578117371, + "learning_rate": 1.689869525866512e-05, + "loss": 1.3548, + "mean_token_accuracy": 0.6579580803712209, + "num_tokens": 899856534.0, + "step": 5360 + }, + { + "entropy": 1.725221852461497, + "epoch": 0.588942901870314, + "grad_norm": 0.6036698818206787, + "learning_rate": 1.689748587631541e-05, + "loss": 1.392, + "mean_token_accuracy": 0.6504359195629755, + "num_tokens": 900066575.0, + "step": 5361 + }, + { + "entropy": 1.7007695138454437, + "epoch": 0.5890527587816868, + "grad_norm": 0.7017976641654968, + "learning_rate": 1.689627630731293e-05, + "loss": 1.3983, + "mean_token_accuracy": 0.6593217353026072, + "num_tokens": 900232711.0, + "step": 5362 + }, + { + "entropy": 1.6633349259694417, + "epoch": 0.5891626156930598, + "grad_norm": 0.6416093111038208, + "learning_rate": 1.6895066551695958e-05, + "loss": 1.2961, + "mean_token_accuracy": 0.6652289082606634, + "num_tokens": 900425804.0, + "step": 5363 + }, + { + "entropy": 1.7397405008474986, + "epoch": 0.5892724726044327, + "grad_norm": 0.7236183881759644, + "learning_rate": 1.689385660950279e-05, + "loss": 1.317, + "mean_token_accuracy": 0.6659122854471207, + "num_tokens": 900551096.0, + "step": 5364 + }, + { + "entropy": 1.753376970688502, + "epoch": 0.5893823295158057, + "grad_norm": 0.7134180068969727, + "learning_rate": 1.6892646480771714e-05, + "loss": 1.409, + "mean_token_accuracy": 0.6489014178514481, + "num_tokens": 900751781.0, + "step": 5365 + }, + { + "entropy": 1.7056757907072704, + "epoch": 0.5894921864271786, + "grad_norm": 0.7589015960693359, + "learning_rate": 1.6891436165541033e-05, + "loss": 1.3076, + "mean_token_accuracy": 0.6677842835585276, + "num_tokens": 900921952.0, + "step": 5366 + }, + { + "entropy": 1.6635324656963348, + "epoch": 0.5896020433385515, + "grad_norm": 0.6845018267631531, + "learning_rate": 1.6890225663849053e-05, + "loss": 1.4531, + "mean_token_accuracy": 0.6488099843263626, + "num_tokens": 901075512.0, + "step": 5367 + }, + { + "entropy": 1.7018173734347026, + "epoch": 0.5897119002499245, + "grad_norm": 0.6907767653465271, + "learning_rate": 1.6889014975734086e-05, + "loss": 1.4226, + "mean_token_accuracy": 0.6496013949314753, + "num_tokens": 901238201.0, + "step": 5368 + }, + { + "entropy": 1.618338406085968, + "epoch": 0.5898217571612974, + "grad_norm": 0.707096517086029, + "learning_rate": 1.6887804101234442e-05, + "loss": 1.1916, + "mean_token_accuracy": 0.6871578395366669, + "num_tokens": 901386796.0, + "step": 5369 + }, + { + "entropy": 1.7351085146268208, + "epoch": 0.5899316140726704, + "grad_norm": 0.7170294523239136, + "learning_rate": 1.6886593040388458e-05, + "loss": 1.4842, + "mean_token_accuracy": 0.6426356732845306, + "num_tokens": 901541333.0, + "step": 5370 + }, + { + "entropy": 1.6429972449938457, + "epoch": 0.5900414709840432, + "grad_norm": 0.5985599756240845, + "learning_rate": 1.6885381793234457e-05, + "loss": 1.3731, + "mean_token_accuracy": 0.6494886229435602, + "num_tokens": 901705291.0, + "step": 5371 + }, + { + "entropy": 1.7603021562099457, + "epoch": 0.5901513278954162, + "grad_norm": 0.6294792890548706, + "learning_rate": 1.688417035981077e-05, + "loss": 1.5715, + "mean_token_accuracy": 0.6342183103164037, + "num_tokens": 901907928.0, + "step": 5372 + }, + { + "entropy": 1.7004207074642181, + "epoch": 0.5902611848067891, + "grad_norm": 0.6703773140907288, + "learning_rate": 1.688295874015575e-05, + "loss": 1.315, + "mean_token_accuracy": 0.667037362853686, + "num_tokens": 902036622.0, + "step": 5373 + }, + { + "entropy": 1.7578519781430562, + "epoch": 0.5903710417181621, + "grad_norm": 0.7326668500900269, + "learning_rate": 1.688174693430773e-05, + "loss": 1.5802, + "mean_token_accuracy": 0.6321331361929575, + "num_tokens": 902229056.0, + "step": 5374 + }, + { + "entropy": 1.6765822271505992, + "epoch": 0.590480898629535, + "grad_norm": 0.7106185555458069, + "learning_rate": 1.688053494230507e-05, + "loss": 1.3502, + "mean_token_accuracy": 0.6803947786490122, + "num_tokens": 902370906.0, + "step": 5375 + }, + { + "entropy": 1.670622855424881, + "epoch": 0.590590755540908, + "grad_norm": 0.6781371831893921, + "learning_rate": 1.687932276418613e-05, + "loss": 1.327, + "mean_token_accuracy": 0.6617625802755356, + "num_tokens": 902526144.0, + "step": 5376 + }, + { + "entropy": 1.6448892652988434, + "epoch": 0.5907006124522809, + "grad_norm": 0.6684084534645081, + "learning_rate": 1.6878110399989274e-05, + "loss": 1.436, + "mean_token_accuracy": 0.648414189616839, + "num_tokens": 902715944.0, + "step": 5377 + }, + { + "entropy": 1.6711215178171794, + "epoch": 0.5908104693636539, + "grad_norm": 0.6007981896400452, + "learning_rate": 1.6876897849752875e-05, + "loss": 1.3683, + "mean_token_accuracy": 0.6540864855051041, + "num_tokens": 902883356.0, + "step": 5378 + }, + { + "entropy": 1.6590981384118397, + "epoch": 0.5909203262750268, + "grad_norm": 0.7310410737991333, + "learning_rate": 1.6875685113515304e-05, + "loss": 1.5214, + "mean_token_accuracy": 0.6452625741561254, + "num_tokens": 903075535.0, + "step": 5379 + }, + { + "entropy": 1.7468369503815968, + "epoch": 0.5910301831863997, + "grad_norm": 0.6988155841827393, + "learning_rate": 1.6874472191314947e-05, + "loss": 1.3991, + "mean_token_accuracy": 0.6527721385161082, + "num_tokens": 903201717.0, + "step": 5380 + }, + { + "entropy": 1.670701116323471, + "epoch": 0.5911400400977727, + "grad_norm": 0.8909992575645447, + "learning_rate": 1.6873259083190193e-05, + "loss": 1.2956, + "mean_token_accuracy": 0.6799812763929367, + "num_tokens": 903322154.0, + "step": 5381 + }, + { + "entropy": 1.6859600047270458, + "epoch": 0.5912498970091455, + "grad_norm": 0.694366991519928, + "learning_rate": 1.6872045789179435e-05, + "loss": 1.2805, + "mean_token_accuracy": 0.6731400340795517, + "num_tokens": 903459004.0, + "step": 5382 + }, + { + "entropy": 1.7458996375401814, + "epoch": 0.5913597539205185, + "grad_norm": 0.6606221795082092, + "learning_rate": 1.6870832309321076e-05, + "loss": 1.5232, + "mean_token_accuracy": 0.6352545966704687, + "num_tokens": 903637357.0, + "step": 5383 + }, + { + "entropy": 1.7042790353298187, + "epoch": 0.5914696108318914, + "grad_norm": 0.7215031385421753, + "learning_rate": 1.6869618643653517e-05, + "loss": 1.3872, + "mean_token_accuracy": 0.6490479856729507, + "num_tokens": 903786222.0, + "step": 5384 + }, + { + "entropy": 1.6865171492099762, + "epoch": 0.5915794677432644, + "grad_norm": 0.6201615333557129, + "learning_rate": 1.6868404792215177e-05, + "loss": 1.3853, + "mean_token_accuracy": 0.6442459026972452, + "num_tokens": 903972153.0, + "step": 5385 + }, + { + "entropy": 1.7092716892560322, + "epoch": 0.5916893246546373, + "grad_norm": 0.7146571278572083, + "learning_rate": 1.686719075504447e-05, + "loss": 1.4497, + "mean_token_accuracy": 0.648409311970075, + "num_tokens": 904151519.0, + "step": 5386 + }, + { + "entropy": 1.7341859141985576, + "epoch": 0.5917991815660103, + "grad_norm": 0.7208521962165833, + "learning_rate": 1.6865976532179815e-05, + "loss": 1.4917, + "mean_token_accuracy": 0.6417253216107687, + "num_tokens": 904338083.0, + "step": 5387 + }, + { + "entropy": 1.7087008953094482, + "epoch": 0.5919090384773832, + "grad_norm": 0.614952027797699, + "learning_rate": 1.6864762123659645e-05, + "loss": 1.6023, + "mean_token_accuracy": 0.6175388197104136, + "num_tokens": 904544903.0, + "step": 5388 + }, + { + "entropy": 1.6603162388006847, + "epoch": 0.5920188953887562, + "grad_norm": 0.702585756778717, + "learning_rate": 1.68635475295224e-05, + "loss": 1.3771, + "mean_token_accuracy": 0.6522153516610464, + "num_tokens": 904681339.0, + "step": 5389 + }, + { + "entropy": 1.7155030568440754, + "epoch": 0.5921287523001291, + "grad_norm": 0.6054456830024719, + "learning_rate": 1.6862332749806515e-05, + "loss": 1.5448, + "mean_token_accuracy": 0.6397515883048376, + "num_tokens": 904863737.0, + "step": 5390 + }, + { + "entropy": 1.630701909462611, + "epoch": 0.5922386092115021, + "grad_norm": 0.6318526864051819, + "learning_rate": 1.6861117784550444e-05, + "loss": 1.3956, + "mean_token_accuracy": 0.6545381247997284, + "num_tokens": 905036452.0, + "step": 5391 + }, + { + "entropy": 1.7371818919976552, + "epoch": 0.592348466122875, + "grad_norm": 0.642082691192627, + "learning_rate": 1.6859902633792633e-05, + "loss": 1.4205, + "mean_token_accuracy": 0.6491349885861079, + "num_tokens": 905230942.0, + "step": 5392 + }, + { + "entropy": 1.6382679243882496, + "epoch": 0.592458323034248, + "grad_norm": 0.7798934578895569, + "learning_rate": 1.6858687297571544e-05, + "loss": 1.3769, + "mean_token_accuracy": 0.6772749076286951, + "num_tokens": 905357564.0, + "step": 5393 + }, + { + "entropy": 1.726676990588506, + "epoch": 0.5925681799456208, + "grad_norm": 0.7513710260391235, + "learning_rate": 1.6857471775925646e-05, + "loss": 1.4649, + "mean_token_accuracy": 0.6569699744383494, + "num_tokens": 905517686.0, + "step": 5394 + }, + { + "entropy": 1.6619043449560802, + "epoch": 0.5926780368569937, + "grad_norm": 0.5446877479553223, + "learning_rate": 1.68562560688934e-05, + "loss": 1.5784, + "mean_token_accuracy": 0.6213619013627371, + "num_tokens": 905793640.0, + "step": 5395 + }, + { + "entropy": 1.6814130246639252, + "epoch": 0.5927878937683667, + "grad_norm": 0.7586706876754761, + "learning_rate": 1.6855040176513294e-05, + "loss": 1.3387, + "mean_token_accuracy": 0.6601586639881134, + "num_tokens": 905963864.0, + "step": 5396 + }, + { + "entropy": 1.6784147222836812, + "epoch": 0.5928977506797396, + "grad_norm": 0.6829437613487244, + "learning_rate": 1.68538240988238e-05, + "loss": 1.3593, + "mean_token_accuracy": 0.656915009021759, + "num_tokens": 906143020.0, + "step": 5397 + }, + { + "entropy": 1.6552779972553253, + "epoch": 0.5930076075911126, + "grad_norm": 0.6679306626319885, + "learning_rate": 1.6852607835863416e-05, + "loss": 1.3276, + "mean_token_accuracy": 0.6578433761994044, + "num_tokens": 906300883.0, + "step": 5398 + }, + { + "entropy": 1.64952618877093, + "epoch": 0.5931174645024855, + "grad_norm": 0.5992361307144165, + "learning_rate": 1.6851391387670627e-05, + "loss": 1.3282, + "mean_token_accuracy": 0.6626505106687546, + "num_tokens": 906476656.0, + "step": 5399 + }, + { + "entropy": 1.722427527109782, + "epoch": 0.5932273214138585, + "grad_norm": 0.7228727340698242, + "learning_rate": 1.685017475428394e-05, + "loss": 1.435, + "mean_token_accuracy": 0.6515922645727793, + "num_tokens": 906654249.0, + "step": 5400 + }, + { + "entropy": 1.7722227871418, + "epoch": 0.5933371783252314, + "grad_norm": 0.7820917963981628, + "learning_rate": 1.6848957935741854e-05, + "loss": 1.3257, + "mean_token_accuracy": 0.6733630647261938, + "num_tokens": 906818481.0, + "step": 5401 + }, + { + "entropy": 1.6531193753083546, + "epoch": 0.5934470352366044, + "grad_norm": 0.7379235625267029, + "learning_rate": 1.684774093208289e-05, + "loss": 1.3019, + "mean_token_accuracy": 0.6712036629517873, + "num_tokens": 906975820.0, + "step": 5402 + }, + { + "entropy": 1.6952547132968903, + "epoch": 0.5935568921479772, + "grad_norm": 0.7943975925445557, + "learning_rate": 1.684652374334556e-05, + "loss": 1.2802, + "mean_token_accuracy": 0.6676329722007116, + "num_tokens": 907096991.0, + "step": 5403 + }, + { + "entropy": 1.6849872171878815, + "epoch": 0.5936667490593502, + "grad_norm": 0.7202121019363403, + "learning_rate": 1.6845306369568382e-05, + "loss": 1.2806, + "mean_token_accuracy": 0.6844563235839208, + "num_tokens": 907235353.0, + "step": 5404 + }, + { + "entropy": 1.7030098736286163, + "epoch": 0.5937766059707231, + "grad_norm": 0.6924136877059937, + "learning_rate": 1.68440888107899e-05, + "loss": 1.4624, + "mean_token_accuracy": 0.6487593750158945, + "num_tokens": 907394416.0, + "step": 5405 + }, + { + "entropy": 1.688938041528066, + "epoch": 0.5938864628820961, + "grad_norm": 0.7259036898612976, + "learning_rate": 1.6842871067048633e-05, + "loss": 1.5779, + "mean_token_accuracy": 0.6511958241462708, + "num_tokens": 907558715.0, + "step": 5406 + }, + { + "entropy": 1.723609745502472, + "epoch": 0.593996319793469, + "grad_norm": 0.6626251339912415, + "learning_rate": 1.6841653138383137e-05, + "loss": 1.4507, + "mean_token_accuracy": 0.6560012847185135, + "num_tokens": 907730055.0, + "step": 5407 + }, + { + "entropy": 1.7112099329630535, + "epoch": 0.5941061767048419, + "grad_norm": 0.6510260701179504, + "learning_rate": 1.6840435024831944e-05, + "loss": 1.556, + "mean_token_accuracy": 0.6182466248671213, + "num_tokens": 907988991.0, + "step": 5408 + }, + { + "entropy": 1.7646510402361553, + "epoch": 0.5942160336162149, + "grad_norm": 0.9172123670578003, + "learning_rate": 1.6839216726433616e-05, + "loss": 1.3719, + "mean_token_accuracy": 0.6614715158939362, + "num_tokens": 908184419.0, + "step": 5409 + }, + { + "entropy": 1.7178015510241191, + "epoch": 0.5943258905275878, + "grad_norm": 0.7179532647132874, + "learning_rate": 1.6837998243226712e-05, + "loss": 1.415, + "mean_token_accuracy": 0.6340557535489401, + "num_tokens": 908434034.0, + "step": 5410 + }, + { + "entropy": 1.7165958086649578, + "epoch": 0.5944357474389608, + "grad_norm": 0.6283994317054749, + "learning_rate": 1.6836779575249796e-05, + "loss": 1.3807, + "mean_token_accuracy": 0.6633997658888499, + "num_tokens": 908589399.0, + "step": 5411 + }, + { + "entropy": 1.7015175918738048, + "epoch": 0.5945456043503337, + "grad_norm": 0.6560877561569214, + "learning_rate": 1.6835560722541434e-05, + "loss": 1.3933, + "mean_token_accuracy": 0.653649906317393, + "num_tokens": 908770306.0, + "step": 5412 + }, + { + "entropy": 1.6911265850067139, + "epoch": 0.5946554612617067, + "grad_norm": 0.6705179214477539, + "learning_rate": 1.6834341685140205e-05, + "loss": 1.4012, + "mean_token_accuracy": 0.6415108740329742, + "num_tokens": 908992577.0, + "step": 5413 + }, + { + "entropy": 1.6993640164534252, + "epoch": 0.5947653181730795, + "grad_norm": 0.6580267548561096, + "learning_rate": 1.683312246308469e-05, + "loss": 1.4285, + "mean_token_accuracy": 0.6524773985147476, + "num_tokens": 909181844.0, + "step": 5414 + }, + { + "entropy": 1.730893741051356, + "epoch": 0.5948751750844525, + "grad_norm": 0.6987996697425842, + "learning_rate": 1.6831903056413477e-05, + "loss": 1.3224, + "mean_token_accuracy": 0.6611655751864115, + "num_tokens": 909304767.0, + "step": 5415 + }, + { + "entropy": 1.7225580215454102, + "epoch": 0.5949850319958254, + "grad_norm": 0.8656018376350403, + "learning_rate": 1.683068346516516e-05, + "loss": 1.3047, + "mean_token_accuracy": 0.6596146573623022, + "num_tokens": 909433583.0, + "step": 5416 + }, + { + "entropy": 1.7331815858681996, + "epoch": 0.5950948889071984, + "grad_norm": 0.6861293911933899, + "learning_rate": 1.682946368937834e-05, + "loss": 1.4714, + "mean_token_accuracy": 0.6402883330980936, + "num_tokens": 909643336.0, + "step": 5417 + }, + { + "entropy": 1.7125715414683025, + "epoch": 0.5952047458185713, + "grad_norm": 0.6575915813446045, + "learning_rate": 1.6828243729091626e-05, + "loss": 1.4151, + "mean_token_accuracy": 0.66132952272892, + "num_tokens": 909778117.0, + "step": 5418 + }, + { + "entropy": 1.657194048166275, + "epoch": 0.5953146027299443, + "grad_norm": 0.7357683777809143, + "learning_rate": 1.6827023584343615e-05, + "loss": 1.2943, + "mean_token_accuracy": 0.6715851227442423, + "num_tokens": 909929876.0, + "step": 5419 + }, + { + "entropy": 1.662133087714513, + "epoch": 0.5954244596413172, + "grad_norm": 0.6675532460212708, + "learning_rate": 1.682580325517294e-05, + "loss": 1.3832, + "mean_token_accuracy": 0.6506559252738953, + "num_tokens": 910095389.0, + "step": 5420 + }, + { + "entropy": 1.7440082132816315, + "epoch": 0.5955343165526901, + "grad_norm": 0.7201164960861206, + "learning_rate": 1.6824582741618215e-05, + "loss": 1.4109, + "mean_token_accuracy": 0.6457555194695791, + "num_tokens": 910265221.0, + "step": 5421 + }, + { + "entropy": 1.7047818501790364, + "epoch": 0.5956441734640631, + "grad_norm": 0.5827786922454834, + "learning_rate": 1.682336204371807e-05, + "loss": 1.4126, + "mean_token_accuracy": 0.6369202633698782, + "num_tokens": 910458062.0, + "step": 5422 + }, + { + "entropy": 1.6782648364702861, + "epoch": 0.595754030375436, + "grad_norm": 0.8056417107582092, + "learning_rate": 1.682214116151114e-05, + "loss": 1.591, + "mean_token_accuracy": 0.6487656235694885, + "num_tokens": 910712200.0, + "step": 5423 + }, + { + "entropy": 1.6988307734330494, + "epoch": 0.595863887286809, + "grad_norm": 0.561825692653656, + "learning_rate": 1.6820920095036068e-05, + "loss": 1.3425, + "mean_token_accuracy": 0.6547723909219106, + "num_tokens": 910893187.0, + "step": 5424 + }, + { + "entropy": 1.611896812915802, + "epoch": 0.5959737441981818, + "grad_norm": 0.633956789970398, + "learning_rate": 1.6819698844331497e-05, + "loss": 1.3578, + "mean_token_accuracy": 0.667489156126976, + "num_tokens": 911126313.0, + "step": 5425 + }, + { + "entropy": 1.6960194905598958, + "epoch": 0.5960836011095548, + "grad_norm": 0.7324186563491821, + "learning_rate": 1.6818477409436078e-05, + "loss": 1.4714, + "mean_token_accuracy": 0.645858551065127, + "num_tokens": 911338849.0, + "step": 5426 + }, + { + "entropy": 1.655552089214325, + "epoch": 0.5961934580209277, + "grad_norm": 0.7095927000045776, + "learning_rate": 1.6817255790388472e-05, + "loss": 1.4871, + "mean_token_accuracy": 0.656248539686203, + "num_tokens": 911516630.0, + "step": 5427 + }, + { + "entropy": 1.7007441520690918, + "epoch": 0.5963033149323007, + "grad_norm": 0.7084668874740601, + "learning_rate": 1.6816033987227342e-05, + "loss": 1.5267, + "mean_token_accuracy": 0.6427704642216364, + "num_tokens": 911699013.0, + "step": 5428 + }, + { + "entropy": 1.7174124121665955, + "epoch": 0.5964131718436736, + "grad_norm": 0.6124807596206665, + "learning_rate": 1.6814811999991357e-05, + "loss": 1.4031, + "mean_token_accuracy": 0.6490861674149832, + "num_tokens": 911857669.0, + "step": 5429 + }, + { + "entropy": 1.6959696312745411, + "epoch": 0.5965230287550466, + "grad_norm": 0.9495405554771423, + "learning_rate": 1.6813589828719195e-05, + "loss": 1.1984, + "mean_token_accuracy": 0.678948904077212, + "num_tokens": 911990431.0, + "step": 5430 + }, + { + "entropy": 1.7092759013175964, + "epoch": 0.5966328856664195, + "grad_norm": 0.6147773861885071, + "learning_rate": 1.6812367473449528e-05, + "loss": 1.396, + "mean_token_accuracy": 0.6636812587579092, + "num_tokens": 912200494.0, + "step": 5431 + }, + { + "entropy": 1.709659606218338, + "epoch": 0.5967427425777925, + "grad_norm": 0.748707115650177, + "learning_rate": 1.6811144934221057e-05, + "loss": 1.2594, + "mean_token_accuracy": 0.668513630827268, + "num_tokens": 912323384.0, + "step": 5432 + }, + { + "entropy": 1.6889064808686574, + "epoch": 0.5968525994891654, + "grad_norm": 0.65199875831604, + "learning_rate": 1.6809922211072462e-05, + "loss": 1.3911, + "mean_token_accuracy": 0.659337967634201, + "num_tokens": 912475513.0, + "step": 5433 + }, + { + "entropy": 1.6920311848322551, + "epoch": 0.5969624564005382, + "grad_norm": 0.6664896607398987, + "learning_rate": 1.680869930404245e-05, + "loss": 1.5205, + "mean_token_accuracy": 0.648727094133695, + "num_tokens": 912661477.0, + "step": 5434 + }, + { + "entropy": 1.7712519864241283, + "epoch": 0.5970723133119112, + "grad_norm": 0.6912421584129333, + "learning_rate": 1.680747621316972e-05, + "loss": 1.3873, + "mean_token_accuracy": 0.6598270038763682, + "num_tokens": 912808322.0, + "step": 5435 + }, + { + "entropy": 1.6839702626069386, + "epoch": 0.5971821702232841, + "grad_norm": 0.6543471813201904, + "learning_rate": 1.680625293849299e-05, + "loss": 1.3244, + "mean_token_accuracy": 0.6574779450893402, + "num_tokens": 912985667.0, + "step": 5436 + }, + { + "entropy": 1.7119268377621968, + "epoch": 0.5972920271346571, + "grad_norm": 0.6256197690963745, + "learning_rate": 1.6805029480050965e-05, + "loss": 1.3939, + "mean_token_accuracy": 0.6478220820426941, + "num_tokens": 913142770.0, + "step": 5437 + }, + { + "entropy": 1.6727095246315002, + "epoch": 0.59740188404603, + "grad_norm": 0.6122837662696838, + "learning_rate": 1.6803805837882373e-05, + "loss": 1.3201, + "mean_token_accuracy": 0.6651313950618108, + "num_tokens": 913284604.0, + "step": 5438 + }, + { + "entropy": 1.6385413606961567, + "epoch": 0.597511740957403, + "grad_norm": 0.7011650800704956, + "learning_rate": 1.6802582012025948e-05, + "loss": 1.2644, + "mean_token_accuracy": 0.6758585671583811, + "num_tokens": 913418447.0, + "step": 5439 + }, + { + "entropy": 1.7271398703257244, + "epoch": 0.5976215978687759, + "grad_norm": 0.8002787232398987, + "learning_rate": 1.680135800252041e-05, + "loss": 1.475, + "mean_token_accuracy": 0.6593929280837377, + "num_tokens": 913595488.0, + "step": 5440 + }, + { + "entropy": 1.7234635253747304, + "epoch": 0.5977314547801489, + "grad_norm": 0.6236562728881836, + "learning_rate": 1.6800133809404507e-05, + "loss": 1.621, + "mean_token_accuracy": 0.6271956562995911, + "num_tokens": 913811760.0, + "step": 5441 + }, + { + "entropy": 1.663605233033498, + "epoch": 0.5978413116915218, + "grad_norm": 0.686917781829834, + "learning_rate": 1.6798909432716987e-05, + "loss": 1.3732, + "mean_token_accuracy": 0.659597580631574, + "num_tokens": 913955134.0, + "step": 5442 + }, + { + "entropy": 1.6567329565684001, + "epoch": 0.5979511686028948, + "grad_norm": 0.8035587668418884, + "learning_rate": 1.679768487249659e-05, + "loss": 1.4351, + "mean_token_accuracy": 0.6736761331558228, + "num_tokens": 914122511.0, + "step": 5443 + }, + { + "entropy": 1.7484365304311116, + "epoch": 0.5980610255142677, + "grad_norm": 0.6443024277687073, + "learning_rate": 1.6796460128782084e-05, + "loss": 1.6444, + "mean_token_accuracy": 0.618500699599584, + "num_tokens": 914346097.0, + "step": 5444 + }, + { + "entropy": 1.687730719645818, + "epoch": 0.5981708824256406, + "grad_norm": 0.6121844053268433, + "learning_rate": 1.6795235201612226e-05, + "loss": 1.4166, + "mean_token_accuracy": 0.6520683070023855, + "num_tokens": 914609850.0, + "step": 5445 + }, + { + "entropy": 1.6414126654465993, + "epoch": 0.5982807393370135, + "grad_norm": 0.7661187052726746, + "learning_rate": 1.6794010091025785e-05, + "loss": 1.206, + "mean_token_accuracy": 0.6839122573534647, + "num_tokens": 914733707.0, + "step": 5446 + }, + { + "entropy": 1.6634565393129985, + "epoch": 0.5983905962483865, + "grad_norm": 0.6488881707191467, + "learning_rate": 1.6792784797061533e-05, + "loss": 1.5076, + "mean_token_accuracy": 0.641075556476911, + "num_tokens": 914935285.0, + "step": 5447 + }, + { + "entropy": 1.713909884293874, + "epoch": 0.5985004531597594, + "grad_norm": 0.690524697303772, + "learning_rate": 1.6791559319758256e-05, + "loss": 1.3711, + "mean_token_accuracy": 0.658517986536026, + "num_tokens": 915077128.0, + "step": 5448 + }, + { + "entropy": 1.6755700409412384, + "epoch": 0.5986103100711323, + "grad_norm": 0.5590389370918274, + "learning_rate": 1.6790333659154735e-05, + "loss": 0.9613, + "mean_token_accuracy": 0.689162497719129, + "num_tokens": 915211004.0, + "step": 5449 + }, + { + "entropy": 1.7446764012177784, + "epoch": 0.5987201669825053, + "grad_norm": 0.6876842975616455, + "learning_rate": 1.678910781528976e-05, + "loss": 1.3854, + "mean_token_accuracy": 0.6610653201738993, + "num_tokens": 915326622.0, + "step": 5450 + }, + { + "entropy": 1.690832147995631, + "epoch": 0.5988300238938782, + "grad_norm": 0.5802832841873169, + "learning_rate": 1.6787881788202135e-05, + "loss": 1.4026, + "mean_token_accuracy": 0.6581792682409286, + "num_tokens": 915547066.0, + "step": 5451 + }, + { + "entropy": 1.728159526983897, + "epoch": 0.5989398808052512, + "grad_norm": 0.7079800367355347, + "learning_rate": 1.6786655577930658e-05, + "loss": 1.5643, + "mean_token_accuracy": 0.6469894895950953, + "num_tokens": 915692436.0, + "step": 5452 + }, + { + "entropy": 1.6673387587070465, + "epoch": 0.5990497377166241, + "grad_norm": 0.6151366829872131, + "learning_rate": 1.678542918451414e-05, + "loss": 1.4437, + "mean_token_accuracy": 0.6431677391131719, + "num_tokens": 915919308.0, + "step": 5453 + }, + { + "entropy": 1.7032847106456757, + "epoch": 0.5991595946279971, + "grad_norm": 0.7041369676589966, + "learning_rate": 1.6784202607991396e-05, + "loss": 1.4604, + "mean_token_accuracy": 0.6400700211524963, + "num_tokens": 916121360.0, + "step": 5454 + }, + { + "entropy": 1.759365479151408, + "epoch": 0.59926945153937, + "grad_norm": 0.745280385017395, + "learning_rate": 1.6782975848401244e-05, + "loss": 1.3423, + "mean_token_accuracy": 0.6642535577217737, + "num_tokens": 916264675.0, + "step": 5455 + }, + { + "entropy": 1.792235126097997, + "epoch": 0.5993793084507429, + "grad_norm": 0.69027179479599, + "learning_rate": 1.6781748905782512e-05, + "loss": 1.5797, + "mean_token_accuracy": 0.6236337820688883, + "num_tokens": 916506247.0, + "step": 5456 + }, + { + "entropy": 1.687427928050359, + "epoch": 0.5994891653621158, + "grad_norm": 0.7181799411773682, + "learning_rate": 1.6780521780174032e-05, + "loss": 1.4468, + "mean_token_accuracy": 0.6493152479330698, + "num_tokens": 916654838.0, + "step": 5457 + }, + { + "entropy": 1.7065490186214447, + "epoch": 0.5995990222734888, + "grad_norm": 0.7162134051322937, + "learning_rate": 1.6779294471614647e-05, + "loss": 1.322, + "mean_token_accuracy": 0.662808025876681, + "num_tokens": 916801918.0, + "step": 5458 + }, + { + "entropy": 1.6874915262063344, + "epoch": 0.5997088791848617, + "grad_norm": 0.6525692343711853, + "learning_rate": 1.6778066980143194e-05, + "loss": 1.3566, + "mean_token_accuracy": 0.6498881727457047, + "num_tokens": 916998423.0, + "step": 5459 + }, + { + "entropy": 1.747247964143753, + "epoch": 0.5998187360962347, + "grad_norm": 0.6652222275733948, + "learning_rate": 1.6776839305798523e-05, + "loss": 1.5106, + "mean_token_accuracy": 0.6408848663171133, + "num_tokens": 917183535.0, + "step": 5460 + }, + { + "entropy": 1.6877395709355671, + "epoch": 0.5999285930076076, + "grad_norm": 0.6594512462615967, + "learning_rate": 1.6775611448619494e-05, + "loss": 1.4626, + "mean_token_accuracy": 0.6533342649539312, + "num_tokens": 917332369.0, + "step": 5461 + }, + { + "entropy": 1.6684443255265553, + "epoch": 0.6000384499189805, + "grad_norm": 0.5910911560058594, + "learning_rate": 1.6774383408644957e-05, + "loss": 1.402, + "mean_token_accuracy": 0.664065291484197, + "num_tokens": 917501787.0, + "step": 5462 + }, + { + "entropy": 1.7669705549875896, + "epoch": 0.6001483068303535, + "grad_norm": 0.6818894743919373, + "learning_rate": 1.6773155185913795e-05, + "loss": 1.5071, + "mean_token_accuracy": 0.6319067428509394, + "num_tokens": 917694774.0, + "step": 5463 + }, + { + "entropy": 1.640412410100301, + "epoch": 0.6002581637417264, + "grad_norm": 0.6774711012840271, + "learning_rate": 1.6771926780464874e-05, + "loss": 1.3312, + "mean_token_accuracy": 0.6684557646512985, + "num_tokens": 917895672.0, + "step": 5464 + }, + { + "entropy": 1.6497747401396434, + "epoch": 0.6003680206530994, + "grad_norm": 0.6946020722389221, + "learning_rate": 1.677069819233707e-05, + "loss": 1.4321, + "mean_token_accuracy": 0.6589450190464655, + "num_tokens": 918106673.0, + "step": 5465 + }, + { + "entropy": 1.751279056072235, + "epoch": 0.6004778775644722, + "grad_norm": 0.7799018025398254, + "learning_rate": 1.6769469421569265e-05, + "loss": 1.3527, + "mean_token_accuracy": 0.6531901061534882, + "num_tokens": 918293869.0, + "step": 5466 + }, + { + "entropy": 1.7017297645409901, + "epoch": 0.6005877344758452, + "grad_norm": 0.7740174531936646, + "learning_rate": 1.6768240468200354e-05, + "loss": 1.3378, + "mean_token_accuracy": 0.6581835597753525, + "num_tokens": 918457119.0, + "step": 5467 + }, + { + "entropy": 1.6877376039822896, + "epoch": 0.6006975913872181, + "grad_norm": 0.6579194664955139, + "learning_rate": 1.6767011332269233e-05, + "loss": 1.4362, + "mean_token_accuracy": 0.6469058791796366, + "num_tokens": 918644483.0, + "step": 5468 + }, + { + "entropy": 1.7218892375628154, + "epoch": 0.6008074482985911, + "grad_norm": 0.63138747215271, + "learning_rate": 1.67657820138148e-05, + "loss": 1.4232, + "mean_token_accuracy": 0.6666744997104009, + "num_tokens": 918799975.0, + "step": 5469 + }, + { + "entropy": 1.7323795755704243, + "epoch": 0.600917305209964, + "grad_norm": 0.7608441710472107, + "learning_rate": 1.6764552512875967e-05, + "loss": 1.4874, + "mean_token_accuracy": 0.6452402472496033, + "num_tokens": 918920586.0, + "step": 5470 + }, + { + "entropy": 1.6932736833890278, + "epoch": 0.601027162121337, + "grad_norm": 0.6323309540748596, + "learning_rate": 1.6763322829491643e-05, + "loss": 1.4062, + "mean_token_accuracy": 0.6532955567042033, + "num_tokens": 919121611.0, + "step": 5471 + }, + { + "entropy": 1.6725764473279316, + "epoch": 0.6011370190327099, + "grad_norm": 0.8088108897209167, + "learning_rate": 1.6762092963700746e-05, + "loss": 1.2678, + "mean_token_accuracy": 0.6674557526906332, + "num_tokens": 919245911.0, + "step": 5472 + }, + { + "entropy": 1.758307288090388, + "epoch": 0.6012468759440829, + "grad_norm": 0.7137458324432373, + "learning_rate": 1.676086291554221e-05, + "loss": 1.3491, + "mean_token_accuracy": 0.6584253559509913, + "num_tokens": 919384474.0, + "step": 5473 + }, + { + "entropy": 1.6837130685647328, + "epoch": 0.6013567328554558, + "grad_norm": 1.0152652263641357, + "learning_rate": 1.675963268505495e-05, + "loss": 1.3494, + "mean_token_accuracy": 0.6644057482481003, + "num_tokens": 919528252.0, + "step": 5474 + }, + { + "entropy": 1.654979646205902, + "epoch": 0.6014665897668287, + "grad_norm": 0.7153851985931396, + "learning_rate": 1.6758402272277915e-05, + "loss": 1.4471, + "mean_token_accuracy": 0.6493087609608968, + "num_tokens": 919760402.0, + "step": 5475 + }, + { + "entropy": 1.7279058396816254, + "epoch": 0.6015764466782016, + "grad_norm": 0.6236484050750732, + "learning_rate": 1.6757171677250045e-05, + "loss": 1.4912, + "mean_token_accuracy": 0.6396552622318268, + "num_tokens": 919969841.0, + "step": 5476 + }, + { + "entropy": 1.710862507422765, + "epoch": 0.6016863035895745, + "grad_norm": 0.7031605839729309, + "learning_rate": 1.675594090001028e-05, + "loss": 1.3442, + "mean_token_accuracy": 0.6595994979143143, + "num_tokens": 920101743.0, + "step": 5477 + }, + { + "entropy": 1.682221661011378, + "epoch": 0.6017961605009475, + "grad_norm": 0.6580245494842529, + "learning_rate": 1.6754709940597584e-05, + "loss": 1.3735, + "mean_token_accuracy": 0.6630414674679438, + "num_tokens": 920264770.0, + "step": 5478 + }, + { + "entropy": 1.731015940507253, + "epoch": 0.6019060174123204, + "grad_norm": 0.6210698485374451, + "learning_rate": 1.675347879905091e-05, + "loss": 1.3219, + "mean_token_accuracy": 0.6625956445932388, + "num_tokens": 920391189.0, + "step": 5479 + }, + { + "entropy": 1.682970017194748, + "epoch": 0.6020158743236934, + "grad_norm": 0.7015461325645447, + "learning_rate": 1.6752247475409226e-05, + "loss": 1.6362, + "mean_token_accuracy": 0.6371884370843569, + "num_tokens": 920593115.0, + "step": 5480 + }, + { + "entropy": 1.6944019198417664, + "epoch": 0.6021257312350663, + "grad_norm": 0.6100954413414001, + "learning_rate": 1.67510159697115e-05, + "loss": 1.3737, + "mean_token_accuracy": 0.6664390613635381, + "num_tokens": 920749916.0, + "step": 5481 + }, + { + "entropy": 1.7003353436787922, + "epoch": 0.6022355881464393, + "grad_norm": 0.7275522351264954, + "learning_rate": 1.6749784281996712e-05, + "loss": 1.414, + "mean_token_accuracy": 0.6477395196755728, + "num_tokens": 920929325.0, + "step": 5482 + }, + { + "entropy": 1.687132587035497, + "epoch": 0.6023454450578122, + "grad_norm": 0.6985853910446167, + "learning_rate": 1.674855241230384e-05, + "loss": 1.3656, + "mean_token_accuracy": 0.6509382625420889, + "num_tokens": 921084679.0, + "step": 5483 + }, + { + "entropy": 1.6675028403600056, + "epoch": 0.6024553019691852, + "grad_norm": 0.722407877445221, + "learning_rate": 1.6747320360671873e-05, + "loss": 1.2328, + "mean_token_accuracy": 0.6737691164016724, + "num_tokens": 921194486.0, + "step": 5484 + }, + { + "entropy": 1.712370087703069, + "epoch": 0.6025651588805581, + "grad_norm": 0.7098826169967651, + "learning_rate": 1.674608812713981e-05, + "loss": 1.3426, + "mean_token_accuracy": 0.6622453580300013, + "num_tokens": 921331468.0, + "step": 5485 + }, + { + "entropy": 1.7794211904207866, + "epoch": 0.6026750157919311, + "grad_norm": 0.8509989380836487, + "learning_rate": 1.6744855711746647e-05, + "loss": 1.4642, + "mean_token_accuracy": 0.6489201337099075, + "num_tokens": 921468742.0, + "step": 5486 + }, + { + "entropy": 1.7174873848756154, + "epoch": 0.6027848727033039, + "grad_norm": 0.6344740986824036, + "learning_rate": 1.674362311453139e-05, + "loss": 1.3383, + "mean_token_accuracy": 0.662666474779447, + "num_tokens": 921629136.0, + "step": 5487 + }, + { + "entropy": 1.7565912504990895, + "epoch": 0.6028947296146769, + "grad_norm": 0.6211560964584351, + "learning_rate": 1.6742390335533044e-05, + "loss": 1.59, + "mean_token_accuracy": 0.6265095522006353, + "num_tokens": 921833335.0, + "step": 5488 + }, + { + "entropy": 1.6729480028152466, + "epoch": 0.6030045865260498, + "grad_norm": 0.6941152215003967, + "learning_rate": 1.6741157374790636e-05, + "loss": 1.3861, + "mean_token_accuracy": 0.6670657147963842, + "num_tokens": 922007520.0, + "step": 5489 + }, + { + "entropy": 1.7290611962477367, + "epoch": 0.6031144434374227, + "grad_norm": 0.6478551030158997, + "learning_rate": 1.673992423234318e-05, + "loss": 1.4021, + "mean_token_accuracy": 0.6645165433486303, + "num_tokens": 922188846.0, + "step": 5490 + }, + { + "entropy": 1.6348287463188171, + "epoch": 0.6032243003487957, + "grad_norm": 0.5874102115631104, + "learning_rate": 1.6738690908229714e-05, + "loss": 1.4658, + "mean_token_accuracy": 0.645725334684054, + "num_tokens": 922380313.0, + "step": 5491 + }, + { + "entropy": 1.7524794340133667, + "epoch": 0.6033341572601686, + "grad_norm": 0.8978797793388367, + "learning_rate": 1.6737457402489266e-05, + "loss": 1.53, + "mean_token_accuracy": 0.6540175626675288, + "num_tokens": 922512596.0, + "step": 5492 + }, + { + "entropy": 1.6377544303735097, + "epoch": 0.6034440141715416, + "grad_norm": 0.7050206661224365, + "learning_rate": 1.673622371516087e-05, + "loss": 1.4592, + "mean_token_accuracy": 0.6444994062185287, + "num_tokens": 922750463.0, + "step": 5493 + }, + { + "entropy": 1.6732242107391357, + "epoch": 0.6035538710829145, + "grad_norm": 0.738034188747406, + "learning_rate": 1.673498984628359e-05, + "loss": 1.3387, + "mean_token_accuracy": 0.66953673462073, + "num_tokens": 922907666.0, + "step": 5494 + }, + { + "entropy": 1.6891792019208272, + "epoch": 0.6036637279942875, + "grad_norm": 0.6616693735122681, + "learning_rate": 1.673375579589646e-05, + "loss": 1.376, + "mean_token_accuracy": 0.6620072374741236, + "num_tokens": 923087081.0, + "step": 5495 + }, + { + "entropy": 1.740149160226186, + "epoch": 0.6037735849056604, + "grad_norm": 0.7156772613525391, + "learning_rate": 1.673252156403854e-05, + "loss": 1.2699, + "mean_token_accuracy": 0.6738031804561615, + "num_tokens": 923216649.0, + "step": 5496 + }, + { + "entropy": 1.640553206205368, + "epoch": 0.6038834418170334, + "grad_norm": 0.5988604426383972, + "learning_rate": 1.6731287150748894e-05, + "loss": 1.3356, + "mean_token_accuracy": 0.6715440601110458, + "num_tokens": 923402586.0, + "step": 5497 + }, + { + "entropy": 1.646197388569514, + "epoch": 0.6039932987284062, + "grad_norm": 0.6301091313362122, + "learning_rate": 1.67300525560666e-05, + "loss": 1.4681, + "mean_token_accuracy": 0.6478618681430817, + "num_tokens": 923608007.0, + "step": 5498 + }, + { + "entropy": 1.7597610652446747, + "epoch": 0.6041031556397792, + "grad_norm": 0.6319087147712708, + "learning_rate": 1.6728817780030718e-05, + "loss": 1.4828, + "mean_token_accuracy": 0.6508717288573583, + "num_tokens": 923815695.0, + "step": 5499 + }, + { + "entropy": 1.7612847288449605, + "epoch": 0.6042130125511521, + "grad_norm": 0.7959107160568237, + "learning_rate": 1.6727582822680336e-05, + "loss": 1.5721, + "mean_token_accuracy": 0.6293148795763651, + "num_tokens": 923990182.0, + "step": 5500 + }, + { + "entropy": 1.705323855082194, + "epoch": 0.6043228694625251, + "grad_norm": 0.6401039361953735, + "learning_rate": 1.672634768405454e-05, + "loss": 1.5592, + "mean_token_accuracy": 0.6398047258456548, + "num_tokens": 924204209.0, + "step": 5501 + }, + { + "entropy": 1.726824293533961, + "epoch": 0.604432726373898, + "grad_norm": 0.7112311720848083, + "learning_rate": 1.672511236419242e-05, + "loss": 1.3496, + "mean_token_accuracy": 0.6554846813281378, + "num_tokens": 924376901.0, + "step": 5502 + }, + { + "entropy": 1.6738229095935822, + "epoch": 0.6045425832852709, + "grad_norm": 0.6750335693359375, + "learning_rate": 1.672387686313307e-05, + "loss": 1.4294, + "mean_token_accuracy": 0.6564183880885442, + "num_tokens": 924580944.0, + "step": 5503 + }, + { + "entropy": 1.7278579473495483, + "epoch": 0.6046524401966439, + "grad_norm": 0.7934318780899048, + "learning_rate": 1.6722641180915602e-05, + "loss": 1.3263, + "mean_token_accuracy": 0.6604352543751398, + "num_tokens": 924708411.0, + "step": 5504 + }, + { + "entropy": 1.7267205317815144, + "epoch": 0.6047622971080168, + "grad_norm": 0.636107325553894, + "learning_rate": 1.6721405317579116e-05, + "loss": 1.3168, + "mean_token_accuracy": 0.6644595215717951, + "num_tokens": 924842526.0, + "step": 5505 + }, + { + "entropy": 1.719021886587143, + "epoch": 0.6048721540193898, + "grad_norm": 0.7312912940979004, + "learning_rate": 1.6720169273162733e-05, + "loss": 1.1689, + "mean_token_accuracy": 0.6934764335552851, + "num_tokens": 924950234.0, + "step": 5506 + }, + { + "entropy": 1.6559423406918843, + "epoch": 0.6049820109307626, + "grad_norm": 0.6602075099945068, + "learning_rate": 1.6718933047705567e-05, + "loss": 1.3758, + "mean_token_accuracy": 0.6585381329059601, + "num_tokens": 925108872.0, + "step": 5507 + }, + { + "entropy": 1.6688361366589863, + "epoch": 0.6050918678421356, + "grad_norm": 0.7089828848838806, + "learning_rate": 1.6717696641246747e-05, + "loss": 1.4099, + "mean_token_accuracy": 0.6570945431788763, + "num_tokens": 925316481.0, + "step": 5508 + }, + { + "entropy": 1.7144095798333485, + "epoch": 0.6052017247535085, + "grad_norm": 0.6471745371818542, + "learning_rate": 1.6716460053825405e-05, + "loss": 1.4705, + "mean_token_accuracy": 0.6423929333686829, + "num_tokens": 925487174.0, + "step": 5509 + }, + { + "entropy": 1.6315881411234539, + "epoch": 0.6053115816648815, + "grad_norm": 0.6560130715370178, + "learning_rate": 1.671522328548068e-05, + "loss": 1.2288, + "mean_token_accuracy": 0.6750545849402746, + "num_tokens": 925663600.0, + "step": 5510 + }, + { + "entropy": 1.6976039409637451, + "epoch": 0.6054214385762544, + "grad_norm": 0.6420303583145142, + "learning_rate": 1.6713986336251712e-05, + "loss": 1.3936, + "mean_token_accuracy": 0.6477069954077402, + "num_tokens": 925858263.0, + "step": 5511 + }, + { + "entropy": 1.6715827286243439, + "epoch": 0.6055312954876274, + "grad_norm": 0.6958088874816895, + "learning_rate": 1.671274920617765e-05, + "loss": 1.3601, + "mean_token_accuracy": 0.6639330287774404, + "num_tokens": 925989227.0, + "step": 5512 + }, + { + "entropy": 1.7799000144004822, + "epoch": 0.6056411523990003, + "grad_norm": 0.7925567030906677, + "learning_rate": 1.671151189529765e-05, + "loss": 1.4278, + "mean_token_accuracy": 0.6552622616291046, + "num_tokens": 926130198.0, + "step": 5513 + }, + { + "entropy": 1.7007222870985668, + "epoch": 0.6057510093103733, + "grad_norm": 0.7391920685768127, + "learning_rate": 1.6710274403650878e-05, + "loss": 1.3117, + "mean_token_accuracy": 0.6625027805566788, + "num_tokens": 926257462.0, + "step": 5514 + }, + { + "entropy": 1.7189228733380635, + "epoch": 0.6058608662217462, + "grad_norm": 0.6808819770812988, + "learning_rate": 1.6709036731276487e-05, + "loss": 1.5554, + "mean_token_accuracy": 0.6345362067222595, + "num_tokens": 926453546.0, + "step": 5515 + }, + { + "entropy": 1.7309414744377136, + "epoch": 0.6059707231331191, + "grad_norm": 0.7075436115264893, + "learning_rate": 1.670779887821366e-05, + "loss": 1.5652, + "mean_token_accuracy": 0.6544856876134872, + "num_tokens": 926613311.0, + "step": 5516 + }, + { + "entropy": 1.7191075285275776, + "epoch": 0.6060805800444921, + "grad_norm": 0.6736297011375427, + "learning_rate": 1.670656084450157e-05, + "loss": 1.4221, + "mean_token_accuracy": 0.6376722504695257, + "num_tokens": 926779157.0, + "step": 5517 + }, + { + "entropy": 1.776821494102478, + "epoch": 0.6061904369558649, + "grad_norm": 0.7885594964027405, + "learning_rate": 1.6705322630179398e-05, + "loss": 1.3079, + "mean_token_accuracy": 0.6540821691354116, + "num_tokens": 926900181.0, + "step": 5518 + }, + { + "entropy": 1.6601528028647106, + "epoch": 0.6063002938672379, + "grad_norm": 1.221449613571167, + "learning_rate": 1.6704084235286336e-05, + "loss": 1.4191, + "mean_token_accuracy": 0.6419506842891375, + "num_tokens": 927094727.0, + "step": 5519 + }, + { + "entropy": 1.6796456972757976, + "epoch": 0.6064101507786108, + "grad_norm": 0.695866048336029, + "learning_rate": 1.6702845659861585e-05, + "loss": 1.4795, + "mean_token_accuracy": 0.664507215221723, + "num_tokens": 927270945.0, + "step": 5520 + }, + { + "entropy": 1.752757598956426, + "epoch": 0.6065200076899838, + "grad_norm": 0.6824564337730408, + "learning_rate": 1.6701606903944328e-05, + "loss": 1.4799, + "mean_token_accuracy": 0.6392989705006281, + "num_tokens": 927413568.0, + "step": 5521 + }, + { + "entropy": 1.7567788263161976, + "epoch": 0.6066298646013567, + "grad_norm": 0.6957651376724243, + "learning_rate": 1.6700367967573786e-05, + "loss": 1.4415, + "mean_token_accuracy": 0.6470659871896108, + "num_tokens": 927553465.0, + "step": 5522 + }, + { + "entropy": 1.7466229895750682, + "epoch": 0.6067397215127297, + "grad_norm": 0.6938997507095337, + "learning_rate": 1.669912885078917e-05, + "loss": 1.3197, + "mean_token_accuracy": 0.6793912301460902, + "num_tokens": 927687114.0, + "step": 5523 + }, + { + "entropy": 1.7211142977078755, + "epoch": 0.6068495784241026, + "grad_norm": 0.7023671865463257, + "learning_rate": 1.669788955362969e-05, + "loss": 1.4972, + "mean_token_accuracy": 0.6480685224135717, + "num_tokens": 927872297.0, + "step": 5524 + }, + { + "entropy": 1.7373531758785248, + "epoch": 0.6069594353354756, + "grad_norm": 0.7833293080329895, + "learning_rate": 1.6696650076134576e-05, + "loss": 1.449, + "mean_token_accuracy": 0.652032271027565, + "num_tokens": 928010124.0, + "step": 5525 + }, + { + "entropy": 1.6578099131584167, + "epoch": 0.6070692922468485, + "grad_norm": 0.6297708749771118, + "learning_rate": 1.6695410418343054e-05, + "loss": 1.283, + "mean_token_accuracy": 0.6739129473765691, + "num_tokens": 928139059.0, + "step": 5526 + }, + { + "entropy": 1.7742358942826588, + "epoch": 0.6071791491582215, + "grad_norm": 0.6884430646896362, + "learning_rate": 1.6694170580294356e-05, + "loss": 1.4999, + "mean_token_accuracy": 0.622523158788681, + "num_tokens": 928385655.0, + "step": 5527 + }, + { + "entropy": 1.7077501217524211, + "epoch": 0.6072890060695944, + "grad_norm": 0.7137337923049927, + "learning_rate": 1.6692930562027725e-05, + "loss": 1.4385, + "mean_token_accuracy": 0.6593621472517649, + "num_tokens": 928531980.0, + "step": 5528 + }, + { + "entropy": 1.7548049290974934, + "epoch": 0.6073988629809672, + "grad_norm": 0.5949137806892395, + "learning_rate": 1.6691690363582412e-05, + "loss": 1.3751, + "mean_token_accuracy": 0.6646410326162974, + "num_tokens": 928686528.0, + "step": 5529 + }, + { + "entropy": 1.7064041197299957, + "epoch": 0.6075087198923402, + "grad_norm": 0.660309910774231, + "learning_rate": 1.669044998499766e-05, + "loss": 1.4325, + "mean_token_accuracy": 0.6318048238754272, + "num_tokens": 928910481.0, + "step": 5530 + }, + { + "entropy": 1.6907892227172852, + "epoch": 0.6076185768037131, + "grad_norm": 0.6489412784576416, + "learning_rate": 1.668920942631273e-05, + "loss": 1.4504, + "mean_token_accuracy": 0.6479069888591766, + "num_tokens": 929134057.0, + "step": 5531 + }, + { + "entropy": 1.7279614110787709, + "epoch": 0.6077284337150861, + "grad_norm": 0.6888580322265625, + "learning_rate": 1.6687968687566885e-05, + "loss": 1.4058, + "mean_token_accuracy": 0.6593583077192307, + "num_tokens": 929299191.0, + "step": 5532 + }, + { + "entropy": 1.7116707563400269, + "epoch": 0.607838290626459, + "grad_norm": 0.5436456203460693, + "learning_rate": 1.6686727768799393e-05, + "loss": 1.2332, + "mean_token_accuracy": 0.684593141078949, + "num_tokens": 929531505.0, + "step": 5533 + }, + { + "entropy": 1.675088753302892, + "epoch": 0.607948147537832, + "grad_norm": 0.6057883501052856, + "learning_rate": 1.6685486670049533e-05, + "loss": 1.4831, + "mean_token_accuracy": 0.6440512239933014, + "num_tokens": 929694373.0, + "step": 5534 + }, + { + "entropy": 1.7029797037442524, + "epoch": 0.6080580044492049, + "grad_norm": 0.695314347743988, + "learning_rate": 1.668424539135658e-05, + "loss": 1.4127, + "mean_token_accuracy": 0.6551998356978098, + "num_tokens": 929877449.0, + "step": 5535 + }, + { + "entropy": 1.6930132706960042, + "epoch": 0.6081678613605779, + "grad_norm": 0.718914806842804, + "learning_rate": 1.668300393275982e-05, + "loss": 1.3504, + "mean_token_accuracy": 0.6599302838246027, + "num_tokens": 930056568.0, + "step": 5536 + }, + { + "entropy": 1.699955701828003, + "epoch": 0.6082777182719508, + "grad_norm": 0.8041271567344666, + "learning_rate": 1.6681762294298548e-05, + "loss": 1.3738, + "mean_token_accuracy": 0.6663634926080704, + "num_tokens": 930193126.0, + "step": 5537 + }, + { + "entropy": 1.6799223522345226, + "epoch": 0.6083875751833238, + "grad_norm": 0.6648184061050415, + "learning_rate": 1.6680520476012064e-05, + "loss": 1.5068, + "mean_token_accuracy": 0.6448338876167933, + "num_tokens": 930414595.0, + "step": 5538 + }, + { + "entropy": 1.7127573291460674, + "epoch": 0.6084974320946966, + "grad_norm": 0.7391481399536133, + "learning_rate": 1.667927847793966e-05, + "loss": 1.3562, + "mean_token_accuracy": 0.6645816365877787, + "num_tokens": 930543870.0, + "step": 5539 + }, + { + "entropy": 1.6624590853850048, + "epoch": 0.6086072890060696, + "grad_norm": 0.6188486814498901, + "learning_rate": 1.6678036300120653e-05, + "loss": 1.37, + "mean_token_accuracy": 0.6666365414857864, + "num_tokens": 930747185.0, + "step": 5540 + }, + { + "entropy": 1.7099547684192657, + "epoch": 0.6087171459174425, + "grad_norm": 0.5991356372833252, + "learning_rate": 1.6676793942594357e-05, + "loss": 1.3754, + "mean_token_accuracy": 0.6520405461390814, + "num_tokens": 930921087.0, + "step": 5541 + }, + { + "entropy": 1.6891375382741292, + "epoch": 0.6088270028288155, + "grad_norm": 0.8991249799728394, + "learning_rate": 1.667555140540009e-05, + "loss": 1.3934, + "mean_token_accuracy": 0.6690708696842194, + "num_tokens": 931109978.0, + "step": 5542 + }, + { + "entropy": 1.7334303557872772, + "epoch": 0.6089368597401884, + "grad_norm": 0.694080114364624, + "learning_rate": 1.667430868857718e-05, + "loss": 1.6048, + "mean_token_accuracy": 0.6428815325101217, + "num_tokens": 931323499.0, + "step": 5543 + }, + { + "entropy": 1.7433924575646718, + "epoch": 0.6090467166515613, + "grad_norm": 0.7323341965675354, + "learning_rate": 1.6673065792164954e-05, + "loss": 1.3905, + "mean_token_accuracy": 0.6570341090361277, + "num_tokens": 931470697.0, + "step": 5544 + }, + { + "entropy": 1.7255782683690388, + "epoch": 0.6091565735629343, + "grad_norm": 0.6894171237945557, + "learning_rate": 1.6671822716202754e-05, + "loss": 1.236, + "mean_token_accuracy": 0.6878236333529154, + "num_tokens": 931601393.0, + "step": 5545 + }, + { + "entropy": 1.706501563390096, + "epoch": 0.6092664304743072, + "grad_norm": 0.7516751289367676, + "learning_rate": 1.667057946072992e-05, + "loss": 1.3684, + "mean_token_accuracy": 0.6665105720361074, + "num_tokens": 931776426.0, + "step": 5546 + }, + { + "entropy": 1.7160039345423381, + "epoch": 0.6093762873856802, + "grad_norm": 1.0031617879867554, + "learning_rate": 1.6669336025785802e-05, + "loss": 1.3054, + "mean_token_accuracy": 0.6578457355499268, + "num_tokens": 931890991.0, + "step": 5547 + }, + { + "entropy": 1.6809919476509094, + "epoch": 0.6094861442970531, + "grad_norm": 0.5633423924446106, + "learning_rate": 1.6668092411409752e-05, + "loss": 1.3895, + "mean_token_accuracy": 0.6533411145210266, + "num_tokens": 932072346.0, + "step": 5548 + }, + { + "entropy": 1.6685432493686676, + "epoch": 0.609596001208426, + "grad_norm": 0.7651619911193848, + "learning_rate": 1.6666848617641134e-05, + "loss": 1.3651, + "mean_token_accuracy": 0.6608080416917801, + "num_tokens": 932234097.0, + "step": 5549 + }, + { + "entropy": 1.6282340387503307, + "epoch": 0.6097058581197989, + "grad_norm": 0.5770187377929688, + "learning_rate": 1.666560464451931e-05, + "loss": 1.3481, + "mean_token_accuracy": 0.6698790689309438, + "num_tokens": 932407066.0, + "step": 5550 + }, + { + "entropy": 1.7054801086584728, + "epoch": 0.6098157150311719, + "grad_norm": 0.7385131120681763, + "learning_rate": 1.666436049208365e-05, + "loss": 1.276, + "mean_token_accuracy": 0.668360099196434, + "num_tokens": 932523952.0, + "step": 5551 + }, + { + "entropy": 1.743846187988917, + "epoch": 0.6099255719425448, + "grad_norm": 0.7189563512802124, + "learning_rate": 1.6663116160373532e-05, + "loss": 1.4179, + "mean_token_accuracy": 0.6598916351795197, + "num_tokens": 932686362.0, + "step": 5552 + }, + { + "entropy": 1.7578631341457367, + "epoch": 0.6100354288539178, + "grad_norm": 0.7211261987686157, + "learning_rate": 1.6661871649428344e-05, + "loss": 1.2996, + "mean_token_accuracy": 0.6601702322562536, + "num_tokens": 932795312.0, + "step": 5553 + }, + { + "entropy": 1.7131268680095673, + "epoch": 0.6101452857652907, + "grad_norm": 0.6948407888412476, + "learning_rate": 1.6660626959287468e-05, + "loss": 1.4821, + "mean_token_accuracy": 0.6617609262466431, + "num_tokens": 932979185.0, + "step": 5554 + }, + { + "entropy": 1.7209480007489522, + "epoch": 0.6102551426766637, + "grad_norm": 0.6682031750679016, + "learning_rate": 1.66593820899903e-05, + "loss": 1.3365, + "mean_token_accuracy": 0.6623590737581253, + "num_tokens": 933132863.0, + "step": 5555 + }, + { + "entropy": 1.7183450758457184, + "epoch": 0.6103649995880366, + "grad_norm": 0.8200631737709045, + "learning_rate": 1.6658137041576236e-05, + "loss": 1.2704, + "mean_token_accuracy": 0.6729947527249655, + "num_tokens": 933246415.0, + "step": 5556 + }, + { + "entropy": 1.7393971184889476, + "epoch": 0.6104748564994095, + "grad_norm": 0.6439294815063477, + "learning_rate": 1.6656891814084685e-05, + "loss": 1.3755, + "mean_token_accuracy": 0.6616205821434656, + "num_tokens": 933432973.0, + "step": 5557 + }, + { + "entropy": 1.6542038420836132, + "epoch": 0.6105847134107825, + "grad_norm": 0.631920337677002, + "learning_rate": 1.665564640755506e-05, + "loss": 1.3404, + "mean_token_accuracy": 0.6656839350859324, + "num_tokens": 933603655.0, + "step": 5558 + }, + { + "entropy": 1.7492648462454479, + "epoch": 0.6106945703221554, + "grad_norm": 0.79639732837677, + "learning_rate": 1.6654400822026774e-05, + "loss": 1.3435, + "mean_token_accuracy": 0.656333123644193, + "num_tokens": 933731281.0, + "step": 5559 + }, + { + "entropy": 1.6991788546244304, + "epoch": 0.6108044272335283, + "grad_norm": 0.6638270020484924, + "learning_rate": 1.6653155057539248e-05, + "loss": 1.26, + "mean_token_accuracy": 0.6817802836497625, + "num_tokens": 933874973.0, + "step": 5560 + }, + { + "entropy": 1.6869306564331055, + "epoch": 0.6109142841449012, + "grad_norm": 0.7141402959823608, + "learning_rate": 1.665190911413191e-05, + "loss": 1.3595, + "mean_token_accuracy": 0.6624280711015066, + "num_tokens": 934074045.0, + "step": 5561 + }, + { + "entropy": 1.723333050807317, + "epoch": 0.6110241410562742, + "grad_norm": 0.6675291657447815, + "learning_rate": 1.6650662991844196e-05, + "loss": 1.6021, + "mean_token_accuracy": 0.6266117841005325, + "num_tokens": 934258442.0, + "step": 5562 + }, + { + "entropy": 1.6694819529851277, + "epoch": 0.6111339979676471, + "grad_norm": 0.7414196133613586, + "learning_rate": 1.6649416690715552e-05, + "loss": 1.3676, + "mean_token_accuracy": 0.6675353596607844, + "num_tokens": 934488821.0, + "step": 5563 + }, + { + "entropy": 1.683532973130544, + "epoch": 0.6112438548790201, + "grad_norm": 0.9000768065452576, + "learning_rate": 1.6648170210785405e-05, + "loss": 1.1238, + "mean_token_accuracy": 0.6990447590748469, + "num_tokens": 934594435.0, + "step": 5564 + }, + { + "entropy": 1.6689512928326924, + "epoch": 0.611353711790393, + "grad_norm": 0.6170567274093628, + "learning_rate": 1.664692355209322e-05, + "loss": 1.3279, + "mean_token_accuracy": 0.6679205298423767, + "num_tokens": 934723939.0, + "step": 5565 + }, + { + "entropy": 1.6680874923865001, + "epoch": 0.611463568701766, + "grad_norm": 0.6829505562782288, + "learning_rate": 1.6645676714678455e-05, + "loss": 1.5437, + "mean_token_accuracy": 0.6364776839812597, + "num_tokens": 934951400.0, + "step": 5566 + }, + { + "entropy": 1.6776273846626282, + "epoch": 0.6115734256131389, + "grad_norm": 0.7855686545372009, + "learning_rate": 1.664442969858056e-05, + "loss": 1.4367, + "mean_token_accuracy": 0.6567247211933136, + "num_tokens": 935139302.0, + "step": 5567 + }, + { + "entropy": 1.7481131454308827, + "epoch": 0.6116832825245119, + "grad_norm": 0.7802530527114868, + "learning_rate": 1.664318250383901e-05, + "loss": 1.449, + "mean_token_accuracy": 0.6351684182882309, + "num_tokens": 935338558.0, + "step": 5568 + }, + { + "entropy": 1.6226297517617543, + "epoch": 0.6117931394358848, + "grad_norm": 1.6184296607971191, + "learning_rate": 1.6641935130493276e-05, + "loss": 1.3117, + "mean_token_accuracy": 0.6598619123299917, + "num_tokens": 935512009.0, + "step": 5569 + }, + { + "entropy": 1.7038521965344746, + "epoch": 0.6119029963472576, + "grad_norm": 0.6111783981323242, + "learning_rate": 1.6640687578582835e-05, + "loss": 1.3716, + "mean_token_accuracy": 0.665923555692037, + "num_tokens": 935663636.0, + "step": 5570 + }, + { + "entropy": 1.724490185578664, + "epoch": 0.6120128532586306, + "grad_norm": 0.601383626461029, + "learning_rate": 1.6639439848147177e-05, + "loss": 1.4497, + "mean_token_accuracy": 0.6473473062117895, + "num_tokens": 935844408.0, + "step": 5571 + }, + { + "entropy": 1.6795639892419179, + "epoch": 0.6121227101700035, + "grad_norm": 0.7149496078491211, + "learning_rate": 1.6638191939225787e-05, + "loss": 1.2712, + "mean_token_accuracy": 0.6758081962664922, + "num_tokens": 935969137.0, + "step": 5572 + }, + { + "entropy": 1.6430913706620534, + "epoch": 0.6122325670813765, + "grad_norm": 0.6253581643104553, + "learning_rate": 1.6636943851858166e-05, + "loss": 1.4073, + "mean_token_accuracy": 0.6706066131591797, + "num_tokens": 936162195.0, + "step": 5573 + }, + { + "entropy": 1.6803169250488281, + "epoch": 0.6123424239927494, + "grad_norm": 0.646298348903656, + "learning_rate": 1.6635695586083808e-05, + "loss": 1.3594, + "mean_token_accuracy": 0.6621912568807602, + "num_tokens": 936321911.0, + "step": 5574 + }, + { + "entropy": 1.7561656534671783, + "epoch": 0.6124522809041224, + "grad_norm": 0.6321209669113159, + "learning_rate": 1.663444714194223e-05, + "loss": 1.499, + "mean_token_accuracy": 0.6410553604364395, + "num_tokens": 936510584.0, + "step": 5575 + }, + { + "entropy": 1.6951843996842701, + "epoch": 0.6125621378154953, + "grad_norm": 0.7173265218734741, + "learning_rate": 1.6633198519472933e-05, + "loss": 1.4487, + "mean_token_accuracy": 0.651951809724172, + "num_tokens": 936713499.0, + "step": 5576 + }, + { + "entropy": 1.7019581099351246, + "epoch": 0.6126719947268683, + "grad_norm": 0.5802896022796631, + "learning_rate": 1.6631949718715445e-05, + "loss": 1.3271, + "mean_token_accuracy": 0.6597871532042822, + "num_tokens": 936940791.0, + "step": 5577 + }, + { + "entropy": 1.6554476817448933, + "epoch": 0.6127818516382412, + "grad_norm": 0.7390364408493042, + "learning_rate": 1.6630700739709282e-05, + "loss": 1.4072, + "mean_token_accuracy": 0.6636865039666494, + "num_tokens": 937082308.0, + "step": 5578 + }, + { + "entropy": 1.6889912883440654, + "epoch": 0.6128917085496142, + "grad_norm": 0.686810314655304, + "learning_rate": 1.6629451582493983e-05, + "loss": 1.4255, + "mean_token_accuracy": 0.6632570077975591, + "num_tokens": 937229279.0, + "step": 5579 + }, + { + "entropy": 1.7240253388881683, + "epoch": 0.613001565460987, + "grad_norm": 0.6423957347869873, + "learning_rate": 1.6628202247109072e-05, + "loss": 1.4287, + "mean_token_accuracy": 0.6554444034894308, + "num_tokens": 937395643.0, + "step": 5580 + }, + { + "entropy": 1.7453702688217163, + "epoch": 0.61311142237236, + "grad_norm": 0.7353253960609436, + "learning_rate": 1.66269527335941e-05, + "loss": 1.4087, + "mean_token_accuracy": 0.6449921876192093, + "num_tokens": 937539093.0, + "step": 5581 + }, + { + "entropy": 1.6367293000221252, + "epoch": 0.6132212792837329, + "grad_norm": 0.608282744884491, + "learning_rate": 1.662570304198861e-05, + "loss": 1.4691, + "mean_token_accuracy": 0.6490232745806376, + "num_tokens": 937756052.0, + "step": 5582 + }, + { + "entropy": 1.686044067144394, + "epoch": 0.6133311361951058, + "grad_norm": 0.6353262066841125, + "learning_rate": 1.6624453172332154e-05, + "loss": 1.4763, + "mean_token_accuracy": 0.6541228095690409, + "num_tokens": 937904214.0, + "step": 5583 + }, + { + "entropy": 1.69670374194781, + "epoch": 0.6134409931064788, + "grad_norm": 0.6989722847938538, + "learning_rate": 1.662320312466429e-05, + "loss": 1.2754, + "mean_token_accuracy": 0.6773638278245926, + "num_tokens": 938031279.0, + "step": 5584 + }, + { + "entropy": 1.724222093820572, + "epoch": 0.6135508500178517, + "grad_norm": 0.6987892985343933, + "learning_rate": 1.6621952899024578e-05, + "loss": 1.4692, + "mean_token_accuracy": 0.6530686269203821, + "num_tokens": 938194279.0, + "step": 5585 + }, + { + "entropy": 1.777810384829839, + "epoch": 0.6136607069292247, + "grad_norm": 0.6809293627738953, + "learning_rate": 1.662070249545259e-05, + "loss": 1.5097, + "mean_token_accuracy": 0.6327020525932312, + "num_tokens": 938368428.0, + "step": 5586 + }, + { + "entropy": 1.6900599499543507, + "epoch": 0.6137705638405976, + "grad_norm": 0.5528995990753174, + "learning_rate": 1.6619451913987905e-05, + "loss": 1.401, + "mean_token_accuracy": 0.6550226360559464, + "num_tokens": 938568730.0, + "step": 5587 + }, + { + "entropy": 1.7115299503008525, + "epoch": 0.6138804207519706, + "grad_norm": 0.7030913233757019, + "learning_rate": 1.6618201154670096e-05, + "loss": 1.2342, + "mean_token_accuracy": 0.6826610863208771, + "num_tokens": 938711244.0, + "step": 5588 + }, + { + "entropy": 1.674224187930425, + "epoch": 0.6139902776633435, + "grad_norm": 0.6814902424812317, + "learning_rate": 1.6616950217538752e-05, + "loss": 1.3691, + "mean_token_accuracy": 0.6572864949703217, + "num_tokens": 938882831.0, + "step": 5589 + }, + { + "entropy": 1.677575667699178, + "epoch": 0.6141001345747165, + "grad_norm": 0.7386172413825989, + "learning_rate": 1.6615699102633466e-05, + "loss": 1.4307, + "mean_token_accuracy": 0.6639702320098877, + "num_tokens": 939044197.0, + "step": 5590 + }, + { + "entropy": 1.6663434406121571, + "epoch": 0.6142099914860893, + "grad_norm": 0.7466452717781067, + "learning_rate": 1.6614447809993833e-05, + "loss": 1.3802, + "mean_token_accuracy": 0.657066822052002, + "num_tokens": 939214259.0, + "step": 5591 + }, + { + "entropy": 1.665649155775706, + "epoch": 0.6143198483974623, + "grad_norm": 0.701424777507782, + "learning_rate": 1.6613196339659454e-05, + "loss": 1.214, + "mean_token_accuracy": 0.6855147878328959, + "num_tokens": 939396313.0, + "step": 5592 + }, + { + "entropy": 1.6514354248841603, + "epoch": 0.6144297053088352, + "grad_norm": 0.6594340801239014, + "learning_rate": 1.6611944691669944e-05, + "loss": 1.5042, + "mean_token_accuracy": 0.6493265976508459, + "num_tokens": 939601857.0, + "step": 5593 + }, + { + "entropy": 1.7074782649676006, + "epoch": 0.6145395622202082, + "grad_norm": 0.6715470552444458, + "learning_rate": 1.6610692866064912e-05, + "loss": 1.5188, + "mean_token_accuracy": 0.6534653852383295, + "num_tokens": 939812970.0, + "step": 5594 + }, + { + "entropy": 1.7174999515215557, + "epoch": 0.6146494191315811, + "grad_norm": 0.7222854495048523, + "learning_rate": 1.660944086288398e-05, + "loss": 1.2172, + "mean_token_accuracy": 0.6784104257822037, + "num_tokens": 939938382.0, + "step": 5595 + }, + { + "entropy": 1.7256175378958385, + "epoch": 0.6147592760429541, + "grad_norm": 0.6657436490058899, + "learning_rate": 1.660818868216677e-05, + "loss": 1.3559, + "mean_token_accuracy": 0.6577880332867304, + "num_tokens": 940127967.0, + "step": 5596 + }, + { + "entropy": 1.6247844000657399, + "epoch": 0.614869132954327, + "grad_norm": 0.6978784203529358, + "learning_rate": 1.660693632395292e-05, + "loss": 1.3566, + "mean_token_accuracy": 0.6696316401163737, + "num_tokens": 940261869.0, + "step": 5597 + }, + { + "entropy": 1.6865403950214386, + "epoch": 0.6149789898656999, + "grad_norm": 0.5856928825378418, + "learning_rate": 1.6605683788282057e-05, + "loss": 1.4175, + "mean_token_accuracy": 0.652377262711525, + "num_tokens": 940444933.0, + "step": 5598 + }, + { + "entropy": 1.7653738756974537, + "epoch": 0.6150888467770729, + "grad_norm": 0.6951313614845276, + "learning_rate": 1.6604431075193833e-05, + "loss": 1.4452, + "mean_token_accuracy": 0.6424042185147604, + "num_tokens": 940576065.0, + "step": 5599 + }, + { + "entropy": 1.6887817184130351, + "epoch": 0.6151987036884458, + "grad_norm": 0.7151344418525696, + "learning_rate": 1.6603178184727888e-05, + "loss": 1.3428, + "mean_token_accuracy": 0.6740106294552485, + "num_tokens": 940717460.0, + "step": 5600 + }, + { + "entropy": 1.7164513369401295, + "epoch": 0.6153085605998188, + "grad_norm": 0.6633734703063965, + "learning_rate": 1.6601925116923875e-05, + "loss": 1.4313, + "mean_token_accuracy": 0.647556280096372, + "num_tokens": 940948397.0, + "step": 5601 + }, + { + "entropy": 1.6911031305789948, + "epoch": 0.6154184175111916, + "grad_norm": 0.687568187713623, + "learning_rate": 1.660067187182146e-05, + "loss": 1.4035, + "mean_token_accuracy": 0.6575697958469391, + "num_tokens": 941105151.0, + "step": 5602 + }, + { + "entropy": 1.7053539156913757, + "epoch": 0.6155282744225646, + "grad_norm": 0.700892448425293, + "learning_rate": 1.6599418449460305e-05, + "loss": 1.3932, + "mean_token_accuracy": 0.6440112143754959, + "num_tokens": 941275369.0, + "step": 5603 + }, + { + "entropy": 1.691804975271225, + "epoch": 0.6156381313339375, + "grad_norm": 0.6623601317405701, + "learning_rate": 1.6598164849880077e-05, + "loss": 1.4609, + "mean_token_accuracy": 0.6414239406585693, + "num_tokens": 941451129.0, + "step": 5604 + }, + { + "entropy": 1.6720061600208282, + "epoch": 0.6157479882453105, + "grad_norm": 0.7391043901443481, + "learning_rate": 1.6596911073120455e-05, + "loss": 1.3943, + "mean_token_accuracy": 0.6600146691004435, + "num_tokens": 941580404.0, + "step": 5605 + }, + { + "entropy": 1.7468764384587605, + "epoch": 0.6158578451566834, + "grad_norm": 0.5990471839904785, + "learning_rate": 1.6595657119221124e-05, + "loss": 1.3827, + "mean_token_accuracy": 0.6457860618829727, + "num_tokens": 941801078.0, + "step": 5606 + }, + { + "entropy": 1.7029780944188435, + "epoch": 0.6159677020680564, + "grad_norm": 0.6301769614219666, + "learning_rate": 1.659440298822176e-05, + "loss": 1.4816, + "mean_token_accuracy": 0.6631426165501276, + "num_tokens": 941977723.0, + "step": 5607 + }, + { + "entropy": 1.7141645848751068, + "epoch": 0.6160775589794293, + "grad_norm": 0.7286873459815979, + "learning_rate": 1.6593148680162063e-05, + "loss": 1.3945, + "mean_token_accuracy": 0.6584265381097794, + "num_tokens": 942126547.0, + "step": 5608 + }, + { + "entropy": 1.7053893009821575, + "epoch": 0.6161874158908023, + "grad_norm": 0.7642189860343933, + "learning_rate": 1.659189419508173e-05, + "loss": 1.5418, + "mean_token_accuracy": 0.6277876098950704, + "num_tokens": 942298233.0, + "step": 5609 + }, + { + "entropy": 1.6716360052426655, + "epoch": 0.6162972728021752, + "grad_norm": 0.6552688479423523, + "learning_rate": 1.659063953302047e-05, + "loss": 1.402, + "mean_token_accuracy": 0.6565060516198477, + "num_tokens": 942466556.0, + "step": 5610 + }, + { + "entropy": 1.7290156185626984, + "epoch": 0.616407129713548, + "grad_norm": 0.650862991809845, + "learning_rate": 1.6589384694017984e-05, + "loss": 1.48, + "mean_token_accuracy": 0.6433103134234747, + "num_tokens": 942692434.0, + "step": 5611 + }, + { + "entropy": 1.7001596788565319, + "epoch": 0.616516986624921, + "grad_norm": 0.6333754062652588, + "learning_rate": 1.6588129678113992e-05, + "loss": 1.5025, + "mean_token_accuracy": 0.6430030663808187, + "num_tokens": 942845785.0, + "step": 5612 + }, + { + "entropy": 1.7286210159460704, + "epoch": 0.6166268435362939, + "grad_norm": 0.8158491849899292, + "learning_rate": 1.6586874485348216e-05, + "loss": 1.2876, + "mean_token_accuracy": 0.6615132391452789, + "num_tokens": 942966008.0, + "step": 5613 + }, + { + "entropy": 1.6635615924994152, + "epoch": 0.6167367004476669, + "grad_norm": 0.7491524815559387, + "learning_rate": 1.658561911576038e-05, + "loss": 1.3309, + "mean_token_accuracy": 0.6790938824415207, + "num_tokens": 943116395.0, + "step": 5614 + }, + { + "entropy": 1.686743954817454, + "epoch": 0.6168465573590398, + "grad_norm": 0.6865391135215759, + "learning_rate": 1.6584363569390213e-05, + "loss": 1.4197, + "mean_token_accuracy": 0.6580019642909368, + "num_tokens": 943271670.0, + "step": 5615 + }, + { + "entropy": 1.6392175356547039, + "epoch": 0.6169564142704128, + "grad_norm": 0.6303647756576538, + "learning_rate": 1.6583107846277455e-05, + "loss": 1.2924, + "mean_token_accuracy": 0.6750722229480743, + "num_tokens": 943457679.0, + "step": 5616 + }, + { + "entropy": 1.6606386701265972, + "epoch": 0.6170662711817857, + "grad_norm": 0.7184935808181763, + "learning_rate": 1.658185194646185e-05, + "loss": 1.4147, + "mean_token_accuracy": 0.6562165568272272, + "num_tokens": 943637649.0, + "step": 5617 + }, + { + "entropy": 1.6875253518422444, + "epoch": 0.6171761280931587, + "grad_norm": 0.5899779200553894, + "learning_rate": 1.658059586998315e-05, + "loss": 1.3247, + "mean_token_accuracy": 0.6731048425038656, + "num_tokens": 943799454.0, + "step": 5618 + }, + { + "entropy": 1.6526349087556202, + "epoch": 0.6172859850045316, + "grad_norm": 0.6579746603965759, + "learning_rate": 1.65793396168811e-05, + "loss": 1.4656, + "mean_token_accuracy": 0.6476482550303141, + "num_tokens": 944009136.0, + "step": 5619 + }, + { + "entropy": 1.669644723335902, + "epoch": 0.6173958419159046, + "grad_norm": 1.451614499092102, + "learning_rate": 1.6578083187195467e-05, + "loss": 1.3359, + "mean_token_accuracy": 0.6453322917222977, + "num_tokens": 944230639.0, + "step": 5620 + }, + { + "entropy": 1.6753608882427216, + "epoch": 0.6175056988272775, + "grad_norm": 0.6836093068122864, + "learning_rate": 1.6576826580966015e-05, + "loss": 1.3984, + "mean_token_accuracy": 0.655582994222641, + "num_tokens": 944397785.0, + "step": 5621 + }, + { + "entropy": 1.710491806268692, + "epoch": 0.6176155557386505, + "grad_norm": 0.7552167177200317, + "learning_rate": 1.657556979823252e-05, + "loss": 1.5175, + "mean_token_accuracy": 0.6515548129876455, + "num_tokens": 944603736.0, + "step": 5622 + }, + { + "entropy": 1.7582048177719116, + "epoch": 0.6177254126500233, + "grad_norm": 0.5882629752159119, + "learning_rate": 1.6574312839034745e-05, + "loss": 1.4611, + "mean_token_accuracy": 0.6304621398448944, + "num_tokens": 944782321.0, + "step": 5623 + }, + { + "entropy": 1.7265077730019887, + "epoch": 0.6178352695613962, + "grad_norm": 0.6855106353759766, + "learning_rate": 1.6573055703412486e-05, + "loss": 1.4187, + "mean_token_accuracy": 0.6633595625559489, + "num_tokens": 944970965.0, + "step": 5624 + }, + { + "entropy": 1.6792520582675934, + "epoch": 0.6179451264727692, + "grad_norm": 0.628589928150177, + "learning_rate": 1.6571798391405523e-05, + "loss": 1.417, + "mean_token_accuracy": 0.652607669432958, + "num_tokens": 945171431.0, + "step": 5625 + }, + { + "entropy": 1.61854421099027, + "epoch": 0.6180549833841421, + "grad_norm": 0.8567890524864197, + "learning_rate": 1.6570540903053653e-05, + "loss": 1.4852, + "mean_token_accuracy": 0.6646982729434967, + "num_tokens": 945370917.0, + "step": 5626 + }, + { + "entropy": 1.742018034060796, + "epoch": 0.6181648402955151, + "grad_norm": 0.7597964406013489, + "learning_rate": 1.6569283238396672e-05, + "loss": 1.4677, + "mean_token_accuracy": 0.6531796753406525, + "num_tokens": 945532898.0, + "step": 5627 + }, + { + "entropy": 1.7409979899724324, + "epoch": 0.618274697206888, + "grad_norm": 0.6351725459098816, + "learning_rate": 1.6568025397474388e-05, + "loss": 1.4706, + "mean_token_accuracy": 0.6394537637631098, + "num_tokens": 945718529.0, + "step": 5628 + }, + { + "entropy": 1.779521683851878, + "epoch": 0.618384554118261, + "grad_norm": 0.6235055923461914, + "learning_rate": 1.6566767380326604e-05, + "loss": 1.3624, + "mean_token_accuracy": 0.6629078437884649, + "num_tokens": 945854934.0, + "step": 5629 + }, + { + "entropy": 1.704407960176468, + "epoch": 0.6184944110296339, + "grad_norm": 0.7060872316360474, + "learning_rate": 1.656550918699315e-05, + "loss": 1.4615, + "mean_token_accuracy": 0.6496474295854568, + "num_tokens": 946015936.0, + "step": 5630 + }, + { + "entropy": 1.680654654900233, + "epoch": 0.6186042679410069, + "grad_norm": 0.6293652057647705, + "learning_rate": 1.656425081751383e-05, + "loss": 1.3839, + "mean_token_accuracy": 0.6551440358161926, + "num_tokens": 946154876.0, + "step": 5631 + }, + { + "entropy": 1.6304692129294078, + "epoch": 0.6187141248523798, + "grad_norm": 0.6845012903213501, + "learning_rate": 1.656299227192848e-05, + "loss": 1.4222, + "mean_token_accuracy": 0.6561487466096878, + "num_tokens": 946303938.0, + "step": 5632 + }, + { + "entropy": 1.6892323593298595, + "epoch": 0.6188239817637528, + "grad_norm": 0.6096817255020142, + "learning_rate": 1.6561733550276934e-05, + "loss": 1.3853, + "mean_token_accuracy": 0.6653565714756647, + "num_tokens": 946472387.0, + "step": 5633 + }, + { + "entropy": 1.740951379140218, + "epoch": 0.6189338386751256, + "grad_norm": 0.8123107552528381, + "learning_rate": 1.6560474652599025e-05, + "loss": 1.5593, + "mean_token_accuracy": 0.6589376678069433, + "num_tokens": 946664473.0, + "step": 5634 + }, + { + "entropy": 1.7017356554667156, + "epoch": 0.6190436955864986, + "grad_norm": 0.5863003730773926, + "learning_rate": 1.6559215578934602e-05, + "loss": 1.3029, + "mean_token_accuracy": 0.6699723253647486, + "num_tokens": 946817763.0, + "step": 5635 + }, + { + "entropy": 1.7194437483946483, + "epoch": 0.6191535524978715, + "grad_norm": 0.8233284950256348, + "learning_rate": 1.655795632932351e-05, + "loss": 1.4635, + "mean_token_accuracy": 0.6592583407958349, + "num_tokens": 946962720.0, + "step": 5636 + }, + { + "entropy": 1.6594391167163849, + "epoch": 0.6192634094092445, + "grad_norm": 0.5975894927978516, + "learning_rate": 1.6556696903805604e-05, + "loss": 1.4232, + "mean_token_accuracy": 0.6385360260804495, + "num_tokens": 947145238.0, + "step": 5637 + }, + { + "entropy": 1.7108287413914998, + "epoch": 0.6193732663206174, + "grad_norm": 0.6864363551139832, + "learning_rate": 1.6555437302420746e-05, + "loss": 1.5409, + "mean_token_accuracy": 0.6319515456755956, + "num_tokens": 947358881.0, + "step": 5638 + }, + { + "entropy": 1.6026353538036346, + "epoch": 0.6194831232319903, + "grad_norm": 0.6471695899963379, + "learning_rate": 1.6554177525208798e-05, + "loss": 1.4167, + "mean_token_accuracy": 0.6550784210364023, + "num_tokens": 947560663.0, + "step": 5639 + }, + { + "entropy": 1.7613280514876049, + "epoch": 0.6195929801433633, + "grad_norm": 0.7547305822372437, + "learning_rate": 1.6552917572209637e-05, + "loss": 1.3111, + "mean_token_accuracy": 0.6676426778237025, + "num_tokens": 947677967.0, + "step": 5640 + }, + { + "entropy": 1.6681643426418304, + "epoch": 0.6197028370547362, + "grad_norm": 0.5885694622993469, + "learning_rate": 1.6551657443463132e-05, + "loss": 1.3086, + "mean_token_accuracy": 0.6593044847249985, + "num_tokens": 947840975.0, + "step": 5641 + }, + { + "entropy": 1.684307485818863, + "epoch": 0.6198126939661092, + "grad_norm": 0.6141138076782227, + "learning_rate": 1.6550397139009174e-05, + "loss": 1.3726, + "mean_token_accuracy": 0.6515480875968933, + "num_tokens": 948049432.0, + "step": 5642 + }, + { + "entropy": 1.698849121729533, + "epoch": 0.619922550877482, + "grad_norm": 0.6681082844734192, + "learning_rate": 1.654913665888765e-05, + "loss": 1.5047, + "mean_token_accuracy": 0.6400974442561468, + "num_tokens": 948241513.0, + "step": 5643 + }, + { + "entropy": 1.6541813611984253, + "epoch": 0.620032407788855, + "grad_norm": 0.6509910821914673, + "learning_rate": 1.654787600313845e-05, + "loss": 1.3013, + "mean_token_accuracy": 0.6745069374640783, + "num_tokens": 948397238.0, + "step": 5644 + }, + { + "entropy": 1.7138587733109791, + "epoch": 0.6201422647002279, + "grad_norm": 0.6395068764686584, + "learning_rate": 1.654661517180147e-05, + "loss": 1.3956, + "mean_token_accuracy": 0.6503481864929199, + "num_tokens": 948558440.0, + "step": 5645 + }, + { + "entropy": 1.7701470851898193, + "epoch": 0.6202521216116009, + "grad_norm": 0.7288310527801514, + "learning_rate": 1.6545354164916624e-05, + "loss": 1.3433, + "mean_token_accuracy": 0.6570984820524851, + "num_tokens": 948686589.0, + "step": 5646 + }, + { + "entropy": 1.68064480026563, + "epoch": 0.6203619785229738, + "grad_norm": 0.6537264585494995, + "learning_rate": 1.6544092982523817e-05, + "loss": 1.4848, + "mean_token_accuracy": 0.6528479357560476, + "num_tokens": 948869305.0, + "step": 5647 + }, + { + "entropy": 1.7192702094713848, + "epoch": 0.6204718354343468, + "grad_norm": 0.7808331251144409, + "learning_rate": 1.654283162466296e-05, + "loss": 1.4711, + "mean_token_accuracy": 0.6476468493541082, + "num_tokens": 949060303.0, + "step": 5648 + }, + { + "entropy": 1.6918814182281494, + "epoch": 0.6205816923457197, + "grad_norm": 0.6144691705703735, + "learning_rate": 1.654157009137399e-05, + "loss": 1.3763, + "mean_token_accuracy": 0.6593097994724909, + "num_tokens": 949221427.0, + "step": 5649 + }, + { + "entropy": 1.7113316158453624, + "epoch": 0.6206915492570927, + "grad_norm": 0.7773605585098267, + "learning_rate": 1.6540308382696814e-05, + "loss": 1.2532, + "mean_token_accuracy": 0.6766321261723837, + "num_tokens": 949344612.0, + "step": 5650 + }, + { + "entropy": 1.686803976694743, + "epoch": 0.6208014061684656, + "grad_norm": 0.8215593099594116, + "learning_rate": 1.6539046498671377e-05, + "loss": 1.398, + "mean_token_accuracy": 0.6599339644114176, + "num_tokens": 949479081.0, + "step": 5651 + }, + { + "entropy": 1.7015974322954814, + "epoch": 0.6209112630798385, + "grad_norm": 0.6959190368652344, + "learning_rate": 1.6537784439337618e-05, + "loss": 1.3426, + "mean_token_accuracy": 0.6654083828131357, + "num_tokens": 949657684.0, + "step": 5652 + }, + { + "entropy": 1.6332708994547527, + "epoch": 0.6210211199912115, + "grad_norm": 0.6165181398391724, + "learning_rate": 1.6536522204735473e-05, + "loss": 1.3202, + "mean_token_accuracy": 0.6605967779954275, + "num_tokens": 949808395.0, + "step": 5653 + }, + { + "entropy": 1.7021582822004955, + "epoch": 0.6211309769025843, + "grad_norm": 0.736033022403717, + "learning_rate": 1.6535259794904895e-05, + "loss": 1.284, + "mean_token_accuracy": 0.6535018235445023, + "num_tokens": 949936209.0, + "step": 5654 + }, + { + "entropy": 1.7629179656505585, + "epoch": 0.6212408338139573, + "grad_norm": 0.6508721113204956, + "learning_rate": 1.6533997209885843e-05, + "loss": 1.4651, + "mean_token_accuracy": 0.647722914814949, + "num_tokens": 950155281.0, + "step": 5655 + }, + { + "entropy": 1.7487357060114543, + "epoch": 0.6213506907253302, + "grad_norm": 0.6706015467643738, + "learning_rate": 1.653273444971827e-05, + "loss": 1.4312, + "mean_token_accuracy": 0.6624927769104639, + "num_tokens": 950338044.0, + "step": 5656 + }, + { + "entropy": 1.699324498573939, + "epoch": 0.6214605476367032, + "grad_norm": 0.6364463567733765, + "learning_rate": 1.6531471514442143e-05, + "loss": 1.4349, + "mean_token_accuracy": 0.6458380470673243, + "num_tokens": 950510346.0, + "step": 5657 + }, + { + "entropy": 1.7458198368549347, + "epoch": 0.6215704045480761, + "grad_norm": 0.702688992023468, + "learning_rate": 1.653020840409744e-05, + "loss": 1.331, + "mean_token_accuracy": 0.6683972229560217, + "num_tokens": 950676071.0, + "step": 5658 + }, + { + "entropy": 1.6945001284281414, + "epoch": 0.6216802614594491, + "grad_norm": 0.722691535949707, + "learning_rate": 1.652894511872413e-05, + "loss": 1.3648, + "mean_token_accuracy": 0.6641036917765936, + "num_tokens": 950826008.0, + "step": 5659 + }, + { + "entropy": 1.6684882044792175, + "epoch": 0.621790118370822, + "grad_norm": 0.5960844159126282, + "learning_rate": 1.6527681658362195e-05, + "loss": 1.4113, + "mean_token_accuracy": 0.6465084751447042, + "num_tokens": 951009559.0, + "step": 5660 + }, + { + "entropy": 1.7376012802124023, + "epoch": 0.621899975282195, + "grad_norm": 0.9526035785675049, + "learning_rate": 1.652641802305163e-05, + "loss": 1.2984, + "mean_token_accuracy": 0.6551149984200796, + "num_tokens": 951117465.0, + "step": 5661 + }, + { + "entropy": 1.6809014678001404, + "epoch": 0.6220098321935679, + "grad_norm": 0.6386826038360596, + "learning_rate": 1.6525154212832427e-05, + "loss": 1.5054, + "mean_token_accuracy": 0.6454970935980479, + "num_tokens": 951316564.0, + "step": 5662 + }, + { + "entropy": 1.747914433479309, + "epoch": 0.6221196891049409, + "grad_norm": 0.637520432472229, + "learning_rate": 1.652389022774458e-05, + "loss": 1.4074, + "mean_token_accuracy": 0.660906101266543, + "num_tokens": 951550546.0, + "step": 5663 + }, + { + "entropy": 1.6579786936442058, + "epoch": 0.6222295460163138, + "grad_norm": 0.7350385189056396, + "learning_rate": 1.6522626067828096e-05, + "loss": 1.4238, + "mean_token_accuracy": 0.6841726005077362, + "num_tokens": 951679384.0, + "step": 5664 + }, + { + "entropy": 1.7201216916243236, + "epoch": 0.6223394029276866, + "grad_norm": 0.7172040343284607, + "learning_rate": 1.6521361733122988e-05, + "loss": 1.473, + "mean_token_accuracy": 0.6475165237983068, + "num_tokens": 951851937.0, + "step": 5665 + }, + { + "entropy": 1.6948122183481853, + "epoch": 0.6224492598390596, + "grad_norm": 0.7633089423179626, + "learning_rate": 1.6520097223669265e-05, + "loss": 1.3301, + "mean_token_accuracy": 0.6592757950226465, + "num_tokens": 952008420.0, + "step": 5666 + }, + { + "entropy": 1.7121588389078777, + "epoch": 0.6225591167504325, + "grad_norm": 0.8232876062393188, + "learning_rate": 1.6518832539506956e-05, + "loss": 1.2325, + "mean_token_accuracy": 0.67981685201327, + "num_tokens": 952133734.0, + "step": 5667 + }, + { + "entropy": 1.6649379134178162, + "epoch": 0.6226689736618055, + "grad_norm": 0.6807710528373718, + "learning_rate": 1.6517567680676082e-05, + "loss": 1.3521, + "mean_token_accuracy": 0.6662793705860773, + "num_tokens": 952280589.0, + "step": 5668 + }, + { + "entropy": 1.7094309329986572, + "epoch": 0.6227788305731784, + "grad_norm": 0.6411172151565552, + "learning_rate": 1.6516302647216678e-05, + "loss": 1.3567, + "mean_token_accuracy": 0.6611761103073756, + "num_tokens": 952461092.0, + "step": 5669 + }, + { + "entropy": 1.686687747637431, + "epoch": 0.6228886874845514, + "grad_norm": 0.6205955743789673, + "learning_rate": 1.651503743916878e-05, + "loss": 1.395, + "mean_token_accuracy": 0.6670850316683451, + "num_tokens": 952633888.0, + "step": 5670 + }, + { + "entropy": 1.7188594837983449, + "epoch": 0.6229985443959243, + "grad_norm": 0.795706570148468, + "learning_rate": 1.6513772056572434e-05, + "loss": 1.436, + "mean_token_accuracy": 0.6592790633440018, + "num_tokens": 952787296.0, + "step": 5671 + }, + { + "entropy": 1.6902817885080974, + "epoch": 0.6231084013072973, + "grad_norm": 0.5516853928565979, + "learning_rate": 1.6512506499467683e-05, + "loss": 1.4364, + "mean_token_accuracy": 0.6347800940275192, + "num_tokens": 953006969.0, + "step": 5672 + }, + { + "entropy": 1.7436016698678334, + "epoch": 0.6232182582186702, + "grad_norm": 0.8173310160636902, + "learning_rate": 1.651124076789459e-05, + "loss": 1.1976, + "mean_token_accuracy": 0.6767911414305369, + "num_tokens": 953116217.0, + "step": 5673 + }, + { + "entropy": 1.7197916905085247, + "epoch": 0.6233281151300432, + "grad_norm": 0.7681940793991089, + "learning_rate": 1.6509974861893207e-05, + "loss": 1.382, + "mean_token_accuracy": 0.658002108335495, + "num_tokens": 953280585.0, + "step": 5674 + }, + { + "entropy": 1.7034264703591664, + "epoch": 0.623437972041416, + "grad_norm": 0.6472874879837036, + "learning_rate": 1.6508708781503604e-05, + "loss": 1.2812, + "mean_token_accuracy": 0.6792033066352209, + "num_tokens": 953424705.0, + "step": 5675 + }, + { + "entropy": 1.676049013932546, + "epoch": 0.623547828952789, + "grad_norm": 0.6681833863258362, + "learning_rate": 1.650744252676585e-05, + "loss": 1.2813, + "mean_token_accuracy": 0.6753066728512446, + "num_tokens": 953580943.0, + "step": 5676 + }, + { + "entropy": 1.6816561023394268, + "epoch": 0.6236576858641619, + "grad_norm": 0.910898745059967, + "learning_rate": 1.6506176097720025e-05, + "loss": 1.238, + "mean_token_accuracy": 0.6695930411418279, + "num_tokens": 953734061.0, + "step": 5677 + }, + { + "entropy": 1.735485553741455, + "epoch": 0.6237675427755348, + "grad_norm": 0.6783795952796936, + "learning_rate": 1.6504909494406202e-05, + "loss": 1.3318, + "mean_token_accuracy": 0.6575172245502472, + "num_tokens": 953846716.0, + "step": 5678 + }, + { + "entropy": 1.8034160832564037, + "epoch": 0.6238773996869078, + "grad_norm": 0.689171314239502, + "learning_rate": 1.6503642716864475e-05, + "loss": 1.401, + "mean_token_accuracy": 0.6448919673760732, + "num_tokens": 953994690.0, + "step": 5679 + }, + { + "entropy": 1.6796988149483998, + "epoch": 0.6239872565982807, + "grad_norm": 0.7800900936126709, + "learning_rate": 1.650237576513494e-05, + "loss": 1.2953, + "mean_token_accuracy": 0.6639833003282547, + "num_tokens": 954154318.0, + "step": 5680 + }, + { + "entropy": 1.7101227541764576, + "epoch": 0.6240971135096537, + "grad_norm": 0.7329297065734863, + "learning_rate": 1.650110863925769e-05, + "loss": 1.3773, + "mean_token_accuracy": 0.6709673305352529, + "num_tokens": 954305906.0, + "step": 5681 + }, + { + "entropy": 1.683126191298167, + "epoch": 0.6242069704210266, + "grad_norm": 0.7751897573471069, + "learning_rate": 1.6499841339272826e-05, + "loss": 1.2098, + "mean_token_accuracy": 0.6865563889344534, + "num_tokens": 954448481.0, + "step": 5682 + }, + { + "entropy": 1.686317543188731, + "epoch": 0.6243168273323996, + "grad_norm": 0.6909394860267639, + "learning_rate": 1.649857386522047e-05, + "loss": 1.4806, + "mean_token_accuracy": 0.658604254325231, + "num_tokens": 954588846.0, + "step": 5683 + }, + { + "entropy": 1.738376796245575, + "epoch": 0.6244266842437725, + "grad_norm": 0.6977959871292114, + "learning_rate": 1.6497306217140723e-05, + "loss": 1.3653, + "mean_token_accuracy": 0.6561718732118607, + "num_tokens": 954703540.0, + "step": 5684 + }, + { + "entropy": 1.7150452435016632, + "epoch": 0.6245365411551455, + "grad_norm": 0.7359358072280884, + "learning_rate": 1.6496038395073714e-05, + "loss": 1.3309, + "mean_token_accuracy": 0.6774442195892334, + "num_tokens": 954823216.0, + "step": 5685 + }, + { + "entropy": 1.6885569989681244, + "epoch": 0.6246463980665183, + "grad_norm": 0.6183052659034729, + "learning_rate": 1.649477039905956e-05, + "loss": 1.4629, + "mean_token_accuracy": 0.6438476542631785, + "num_tokens": 955010837.0, + "step": 5686 + }, + { + "entropy": 1.7190197507540386, + "epoch": 0.6247562549778913, + "grad_norm": 0.731508731842041, + "learning_rate": 1.6493502229138404e-05, + "loss": 1.3833, + "mean_token_accuracy": 0.6536182264486948, + "num_tokens": 955149407.0, + "step": 5687 + }, + { + "entropy": 1.735072563091914, + "epoch": 0.6248661118892642, + "grad_norm": 0.6480312943458557, + "learning_rate": 1.6492233885350378e-05, + "loss": 1.4092, + "mean_token_accuracy": 0.6455465306838354, + "num_tokens": 955323698.0, + "step": 5688 + }, + { + "entropy": 1.7098636428515117, + "epoch": 0.6249759688006372, + "grad_norm": 0.6097153425216675, + "learning_rate": 1.6490965367735627e-05, + "loss": 1.4695, + "mean_token_accuracy": 0.6430481324593226, + "num_tokens": 955565112.0, + "step": 5689 + }, + { + "entropy": 1.6718948781490326, + "epoch": 0.6250858257120101, + "grad_norm": 0.8378857374191284, + "learning_rate": 1.6489696676334292e-05, + "loss": 1.4418, + "mean_token_accuracy": 0.6423271497090658, + "num_tokens": 955780285.0, + "step": 5690 + }, + { + "entropy": 1.782025973002116, + "epoch": 0.6251956826233831, + "grad_norm": 0.7587185502052307, + "learning_rate": 1.6488427811186533e-05, + "loss": 1.6231, + "mean_token_accuracy": 0.6388049274682999, + "num_tokens": 955942572.0, + "step": 5691 + }, + { + "entropy": 1.7340028285980225, + "epoch": 0.625305539534756, + "grad_norm": 0.6405612230300903, + "learning_rate": 1.6487158772332504e-05, + "loss": 1.5302, + "mean_token_accuracy": 0.6268052359422048, + "num_tokens": 956175191.0, + "step": 5692 + }, + { + "entropy": 1.7250635226567586, + "epoch": 0.6254153964461289, + "grad_norm": 0.748051106929779, + "learning_rate": 1.6485889559812377e-05, + "loss": 1.3172, + "mean_token_accuracy": 0.6627877404292425, + "num_tokens": 956336559.0, + "step": 5693 + }, + { + "entropy": 1.6929832597573597, + "epoch": 0.6255252533575019, + "grad_norm": 0.631056010723114, + "learning_rate": 1.6484620173666314e-05, + "loss": 1.5161, + "mean_token_accuracy": 0.646266758441925, + "num_tokens": 956509627.0, + "step": 5694 + }, + { + "entropy": 1.692749907573064, + "epoch": 0.6256351102688748, + "grad_norm": 0.6480849981307983, + "learning_rate": 1.6483350613934497e-05, + "loss": 1.2836, + "mean_token_accuracy": 0.6649422496557236, + "num_tokens": 956687120.0, + "step": 5695 + }, + { + "entropy": 1.6549834311008453, + "epoch": 0.6257449671802477, + "grad_norm": 0.7371950149536133, + "learning_rate": 1.64820808806571e-05, + "loss": 1.3504, + "mean_token_accuracy": 0.6728880554437637, + "num_tokens": 956826533.0, + "step": 5696 + }, + { + "entropy": 1.7084954679012299, + "epoch": 0.6258548240916206, + "grad_norm": 0.7008212208747864, + "learning_rate": 1.6480810973874316e-05, + "loss": 1.357, + "mean_token_accuracy": 0.6564209510882696, + "num_tokens": 956956502.0, + "step": 5697 + }, + { + "entropy": 1.7462473213672638, + "epoch": 0.6259646810029936, + "grad_norm": 0.716570258140564, + "learning_rate": 1.6479540893626332e-05, + "loss": 1.4223, + "mean_token_accuracy": 0.6491911063591639, + "num_tokens": 957127208.0, + "step": 5698 + }, + { + "entropy": 1.7282981077829997, + "epoch": 0.6260745379143665, + "grad_norm": 0.8346961736679077, + "learning_rate": 1.647827063995335e-05, + "loss": 1.5449, + "mean_token_accuracy": 0.6457051436106364, + "num_tokens": 957332479.0, + "step": 5699 + }, + { + "entropy": 1.6583941678206127, + "epoch": 0.6261843948257395, + "grad_norm": 0.7575819492340088, + "learning_rate": 1.6477000212895573e-05, + "loss": 1.3436, + "mean_token_accuracy": 0.6612271418174108, + "num_tokens": 957498700.0, + "step": 5700 + }, + { + "entropy": 1.7637586295604706, + "epoch": 0.6262942517371124, + "grad_norm": 0.5901393294334412, + "learning_rate": 1.6475729612493202e-05, + "loss": 1.389, + "mean_token_accuracy": 0.6556883603334427, + "num_tokens": 957713899.0, + "step": 5701 + }, + { + "entropy": 1.7112191021442413, + "epoch": 0.6264041086484854, + "grad_norm": 0.6072997450828552, + "learning_rate": 1.647445883878646e-05, + "loss": 1.349, + "mean_token_accuracy": 0.6535293956597646, + "num_tokens": 957877257.0, + "step": 5702 + }, + { + "entropy": 1.6826953887939453, + "epoch": 0.6265139655598583, + "grad_norm": 0.7811943292617798, + "learning_rate": 1.6473187891815563e-05, + "loss": 1.3923, + "mean_token_accuracy": 0.6466284741957983, + "num_tokens": 958124142.0, + "step": 5703 + }, + { + "entropy": 1.7394831478595734, + "epoch": 0.6266238224712313, + "grad_norm": 0.7273539900779724, + "learning_rate": 1.6471916771620734e-05, + "loss": 1.4109, + "mean_token_accuracy": 0.6454877008994421, + "num_tokens": 958342923.0, + "step": 5704 + }, + { + "entropy": 1.7591275970141094, + "epoch": 0.6267336793826042, + "grad_norm": 0.641975998878479, + "learning_rate": 1.6470645478242203e-05, + "loss": 1.4495, + "mean_token_accuracy": 0.6493928283452988, + "num_tokens": 958589574.0, + "step": 5705 + }, + { + "entropy": 1.6759761174519856, + "epoch": 0.626843536293977, + "grad_norm": 0.677434504032135, + "learning_rate": 1.6469374011720213e-05, + "loss": 1.2512, + "mean_token_accuracy": 0.68131522834301, + "num_tokens": 958743999.0, + "step": 5706 + }, + { + "entropy": 1.7099489271640778, + "epoch": 0.62695339320535, + "grad_norm": 0.6134753823280334, + "learning_rate": 1.6468102372094995e-05, + "loss": 1.3581, + "mean_token_accuracy": 0.659825325012207, + "num_tokens": 958909549.0, + "step": 5707 + }, + { + "entropy": 1.7220982710520427, + "epoch": 0.6270632501167229, + "grad_norm": 0.7816711664199829, + "learning_rate": 1.6466830559406805e-05, + "loss": 1.3075, + "mean_token_accuracy": 0.6649517863988876, + "num_tokens": 959021472.0, + "step": 5708 + }, + { + "entropy": 1.6756068567434947, + "epoch": 0.6271731070280959, + "grad_norm": 0.6301890015602112, + "learning_rate": 1.6465558573695888e-05, + "loss": 1.3314, + "mean_token_accuracy": 0.6648290057977041, + "num_tokens": 959159822.0, + "step": 5709 + }, + { + "entropy": 1.7089245716730754, + "epoch": 0.6272829639394688, + "grad_norm": 0.7451719045639038, + "learning_rate": 1.6464286415002504e-05, + "loss": 1.2651, + "mean_token_accuracy": 0.6750001311302185, + "num_tokens": 959259070.0, + "step": 5710 + }, + { + "entropy": 1.7253338595231373, + "epoch": 0.6273928208508418, + "grad_norm": 0.7505415081977844, + "learning_rate": 1.646301408336692e-05, + "loss": 1.3964, + "mean_token_accuracy": 0.6479530483484268, + "num_tokens": 959434654.0, + "step": 5711 + }, + { + "entropy": 1.708227703968684, + "epoch": 0.6275026777622147, + "grad_norm": 0.6948199272155762, + "learning_rate": 1.64617415788294e-05, + "loss": 1.3378, + "mean_token_accuracy": 0.6625229269266129, + "num_tokens": 959588905.0, + "step": 5712 + }, + { + "entropy": 1.7235769430796306, + "epoch": 0.6276125346735877, + "grad_norm": 0.6836848855018616, + "learning_rate": 1.6460468901430225e-05, + "loss": 1.4183, + "mean_token_accuracy": 0.6595809658368429, + "num_tokens": 959737641.0, + "step": 5713 + }, + { + "entropy": 1.704572280248006, + "epoch": 0.6277223915849606, + "grad_norm": 0.6386615633964539, + "learning_rate": 1.6459196051209663e-05, + "loss": 1.521, + "mean_token_accuracy": 0.63822074731191, + "num_tokens": 960007106.0, + "step": 5714 + }, + { + "entropy": 1.6550373832384746, + "epoch": 0.6278322484963336, + "grad_norm": 0.6144903898239136, + "learning_rate": 1.645792302820801e-05, + "loss": 1.3233, + "mean_token_accuracy": 0.6593523075183233, + "num_tokens": 960197922.0, + "step": 5715 + }, + { + "entropy": 1.6816544930140178, + "epoch": 0.6279421054077065, + "grad_norm": 0.7425700426101685, + "learning_rate": 1.645664983246555e-05, + "loss": 1.2967, + "mean_token_accuracy": 0.6748471558094025, + "num_tokens": 960332207.0, + "step": 5716 + }, + { + "entropy": 1.6723963618278503, + "epoch": 0.6280519623190794, + "grad_norm": 0.7503743767738342, + "learning_rate": 1.6455376464022585e-05, + "loss": 1.2984, + "mean_token_accuracy": 0.6709683686494827, + "num_tokens": 960458091.0, + "step": 5717 + }, + { + "entropy": 1.6904148161411285, + "epoch": 0.6281618192304523, + "grad_norm": 0.70115727186203, + "learning_rate": 1.645410292291941e-05, + "loss": 1.5021, + "mean_token_accuracy": 0.6557003756364187, + "num_tokens": 960652785.0, + "step": 5718 + }, + { + "entropy": 1.7101349135239918, + "epoch": 0.6282716761418252, + "grad_norm": 0.7017475962638855, + "learning_rate": 1.6452829209196337e-05, + "loss": 1.3333, + "mean_token_accuracy": 0.6754453778266907, + "num_tokens": 960782480.0, + "step": 5719 + }, + { + "entropy": 1.7187654972076416, + "epoch": 0.6283815330531982, + "grad_norm": 0.7320044636726379, + "learning_rate": 1.6451555322893676e-05, + "loss": 1.5041, + "mean_token_accuracy": 0.6363671620686849, + "num_tokens": 960975432.0, + "step": 5720 + }, + { + "entropy": 1.629583348830541, + "epoch": 0.6284913899645711, + "grad_norm": 0.6926023364067078, + "learning_rate": 1.6450281264051746e-05, + "loss": 1.3311, + "mean_token_accuracy": 0.6710839569568634, + "num_tokens": 961145110.0, + "step": 5721 + }, + { + "entropy": 1.6936370134353638, + "epoch": 0.6286012468759441, + "grad_norm": 0.776731014251709, + "learning_rate": 1.644900703271087e-05, + "loss": 1.2384, + "mean_token_accuracy": 0.6770918766657511, + "num_tokens": 961334917.0, + "step": 5722 + }, + { + "entropy": 1.7333524624506633, + "epoch": 0.628711103787317, + "grad_norm": 0.6435438990592957, + "learning_rate": 1.6447732628911375e-05, + "loss": 1.4127, + "mean_token_accuracy": 0.6506403088569641, + "num_tokens": 961515935.0, + "step": 5723 + }, + { + "entropy": 1.729872743288676, + "epoch": 0.62882096069869, + "grad_norm": 0.6495027542114258, + "learning_rate": 1.64464580526936e-05, + "loss": 1.3786, + "mean_token_accuracy": 0.6543400337298712, + "num_tokens": 961656876.0, + "step": 5724 + }, + { + "entropy": 1.734206845362981, + "epoch": 0.6289308176100629, + "grad_norm": 0.6567356586456299, + "learning_rate": 1.6445183304097882e-05, + "loss": 1.5658, + "mean_token_accuracy": 0.6441596001386642, + "num_tokens": 961848097.0, + "step": 5725 + }, + { + "entropy": 1.7201481660207112, + "epoch": 0.6290406745214359, + "grad_norm": 0.7097489833831787, + "learning_rate": 1.6443908383164565e-05, + "loss": 1.4798, + "mean_token_accuracy": 0.6604090680678686, + "num_tokens": 961977227.0, + "step": 5726 + }, + { + "entropy": 1.6500455737113953, + "epoch": 0.6291505314328087, + "grad_norm": 0.5988762378692627, + "learning_rate": 1.6442633289934e-05, + "loss": 1.4845, + "mean_token_accuracy": 0.6477457582950592, + "num_tokens": 962218485.0, + "step": 5727 + }, + { + "entropy": 1.6345641314983368, + "epoch": 0.6292603883441817, + "grad_norm": 0.6215258240699768, + "learning_rate": 1.6441358024446543e-05, + "loss": 1.3618, + "mean_token_accuracy": 0.6687282770872116, + "num_tokens": 962381448.0, + "step": 5728 + }, + { + "entropy": 1.696586012840271, + "epoch": 0.6293702452555546, + "grad_norm": 0.6843472719192505, + "learning_rate": 1.6440082586742558e-05, + "loss": 1.3164, + "mean_token_accuracy": 0.6669615209102631, + "num_tokens": 962529434.0, + "step": 5729 + }, + { + "entropy": 1.6618889768918355, + "epoch": 0.6294801021669276, + "grad_norm": 0.7052629590034485, + "learning_rate": 1.643880697686241e-05, + "loss": 1.5781, + "mean_token_accuracy": 0.638151670495669, + "num_tokens": 962704605.0, + "step": 5730 + }, + { + "entropy": 1.707265595595042, + "epoch": 0.6295899590783005, + "grad_norm": 0.7040795683860779, + "learning_rate": 1.6437531194846473e-05, + "loss": 1.2903, + "mean_token_accuracy": 0.6720566848913828, + "num_tokens": 962824714.0, + "step": 5731 + }, + { + "entropy": 1.6971515615781148, + "epoch": 0.6296998159896734, + "grad_norm": 0.6653143763542175, + "learning_rate": 1.6436255240735123e-05, + "loss": 1.2856, + "mean_token_accuracy": 0.6751369287570318, + "num_tokens": 962983287.0, + "step": 5732 + }, + { + "entropy": 1.7146797279516857, + "epoch": 0.6298096729010464, + "grad_norm": 0.6092875003814697, + "learning_rate": 1.643497911456874e-05, + "loss": 1.3847, + "mean_token_accuracy": 0.6554857790470123, + "num_tokens": 963192937.0, + "step": 5733 + }, + { + "entropy": 1.723339209953944, + "epoch": 0.6299195298124193, + "grad_norm": 0.7470570802688599, + "learning_rate": 1.6433702816387726e-05, + "loss": 1.379, + "mean_token_accuracy": 0.6623584628105164, + "num_tokens": 963342862.0, + "step": 5734 + }, + { + "entropy": 1.7229750553766887, + "epoch": 0.6300293867237923, + "grad_norm": 0.6485400795936584, + "learning_rate": 1.643242634623246e-05, + "loss": 1.365, + "mean_token_accuracy": 0.647914802034696, + "num_tokens": 963467152.0, + "step": 5735 + }, + { + "entropy": 1.7394628127415974, + "epoch": 0.6301392436351652, + "grad_norm": 0.7105817794799805, + "learning_rate": 1.643114970414335e-05, + "loss": 1.4991, + "mean_token_accuracy": 0.6525513231754303, + "num_tokens": 963653661.0, + "step": 5736 + }, + { + "entropy": 1.677705059448878, + "epoch": 0.6302491005465382, + "grad_norm": 0.6255174875259399, + "learning_rate": 1.64298728901608e-05, + "loss": 1.2898, + "mean_token_accuracy": 0.6665566811958948, + "num_tokens": 963820917.0, + "step": 5737 + }, + { + "entropy": 1.6519030431906383, + "epoch": 0.630358957457911, + "grad_norm": 0.6910622119903564, + "learning_rate": 1.6428595904325216e-05, + "loss": 1.326, + "mean_token_accuracy": 0.6713027606407801, + "num_tokens": 963981593.0, + "step": 5738 + }, + { + "entropy": 1.750054806470871, + "epoch": 0.630468814369284, + "grad_norm": 0.7107712030410767, + "learning_rate": 1.642731874667702e-05, + "loss": 1.3285, + "mean_token_accuracy": 0.6556506305932999, + "num_tokens": 964125721.0, + "step": 5739 + }, + { + "entropy": 1.6838609278202057, + "epoch": 0.6305786712806569, + "grad_norm": 0.6880468726158142, + "learning_rate": 1.6426041417256633e-05, + "loss": 1.412, + "mean_token_accuracy": 0.6564341684182485, + "num_tokens": 964285108.0, + "step": 5740 + }, + { + "entropy": 1.6792670687039692, + "epoch": 0.6306885281920299, + "grad_norm": 0.6007390022277832, + "learning_rate": 1.6424763916104477e-05, + "loss": 1.3549, + "mean_token_accuracy": 0.6552864263455073, + "num_tokens": 964458866.0, + "step": 5741 + }, + { + "entropy": 1.7109587788581848, + "epoch": 0.6307983851034028, + "grad_norm": 0.730143129825592, + "learning_rate": 1.6423486243260993e-05, + "loss": 1.5021, + "mean_token_accuracy": 0.663687601685524, + "num_tokens": 964577666.0, + "step": 5742 + }, + { + "entropy": 1.6826227903366089, + "epoch": 0.6309082420147758, + "grad_norm": 0.662407636642456, + "learning_rate": 1.642220839876661e-05, + "loss": 1.3491, + "mean_token_accuracy": 0.6631719022989273, + "num_tokens": 964731662.0, + "step": 5743 + }, + { + "entropy": 1.6952384213606517, + "epoch": 0.6310180989261487, + "grad_norm": 0.638367772102356, + "learning_rate": 1.6420930382661773e-05, + "loss": 1.4605, + "mean_token_accuracy": 0.6421840240557989, + "num_tokens": 964979467.0, + "step": 5744 + }, + { + "entropy": 1.6700897018114726, + "epoch": 0.6311279558375217, + "grad_norm": 0.692187488079071, + "learning_rate": 1.641965219498693e-05, + "loss": 1.3674, + "mean_token_accuracy": 0.6602755586306254, + "num_tokens": 965202445.0, + "step": 5745 + }, + { + "entropy": 1.6474164128303528, + "epoch": 0.6312378127488946, + "grad_norm": 0.5569962859153748, + "learning_rate": 1.6418373835782542e-05, + "loss": 1.4016, + "mean_token_accuracy": 0.6398782779773077, + "num_tokens": 965414482.0, + "step": 5746 + }, + { + "entropy": 1.7786280512809753, + "epoch": 0.6313476696602675, + "grad_norm": 0.7615206241607666, + "learning_rate": 1.6417095305089062e-05, + "loss": 1.3846, + "mean_token_accuracy": 0.6566232194503149, + "num_tokens": 965560099.0, + "step": 5747 + }, + { + "entropy": 1.7956977883974712, + "epoch": 0.6314575265716404, + "grad_norm": 0.7831846475601196, + "learning_rate": 1.641581660294696e-05, + "loss": 1.4214, + "mean_token_accuracy": 0.6555542002121607, + "num_tokens": 965726148.0, + "step": 5748 + }, + { + "entropy": 1.6914733946323395, + "epoch": 0.6315673834830133, + "grad_norm": 0.6474779844284058, + "learning_rate": 1.6414537729396698e-05, + "loss": 1.4031, + "mean_token_accuracy": 0.6505423734585444, + "num_tokens": 965937834.0, + "step": 5749 + }, + { + "entropy": 1.7051812211672466, + "epoch": 0.6316772403943863, + "grad_norm": 0.7757088541984558, + "learning_rate": 1.641325868447876e-05, + "loss": 1.276, + "mean_token_accuracy": 0.6699098100264868, + "num_tokens": 966046845.0, + "step": 5750 + }, + { + "entropy": 1.6874237755934398, + "epoch": 0.6317870973057592, + "grad_norm": 0.8079215884208679, + "learning_rate": 1.641197946823362e-05, + "loss": 1.4308, + "mean_token_accuracy": 0.6719500770171484, + "num_tokens": 966238160.0, + "step": 5751 + }, + { + "entropy": 1.719059665997823, + "epoch": 0.6318969542171322, + "grad_norm": 0.6266113519668579, + "learning_rate": 1.641070008070177e-05, + "loss": 1.3024, + "mean_token_accuracy": 0.6625022441148758, + "num_tokens": 966383153.0, + "step": 5752 + }, + { + "entropy": 1.6473219096660614, + "epoch": 0.6320068111285051, + "grad_norm": 0.7965893149375916, + "learning_rate": 1.6409420521923705e-05, + "loss": 1.3459, + "mean_token_accuracy": 0.6579365134239197, + "num_tokens": 966589655.0, + "step": 5753 + }, + { + "entropy": 1.6831459005673726, + "epoch": 0.6321166680398781, + "grad_norm": 0.6689232587814331, + "learning_rate": 1.6408140791939914e-05, + "loss": 1.2523, + "mean_token_accuracy": 0.6745945314566294, + "num_tokens": 966736286.0, + "step": 5754 + }, + { + "entropy": 1.7096228897571564, + "epoch": 0.632226524951251, + "grad_norm": 0.6370944380760193, + "learning_rate": 1.6406860890790904e-05, + "loss": 1.4404, + "mean_token_accuracy": 0.6629820168018341, + "num_tokens": 966930006.0, + "step": 5755 + }, + { + "entropy": 1.643300195535024, + "epoch": 0.632336381862624, + "grad_norm": 0.5664902925491333, + "learning_rate": 1.6405580818517183e-05, + "loss": 1.4674, + "mean_token_accuracy": 0.6461287786563238, + "num_tokens": 967215614.0, + "step": 5756 + }, + { + "entropy": 1.6857891182104747, + "epoch": 0.6324462387739969, + "grad_norm": 0.6653056740760803, + "learning_rate": 1.6404300575159266e-05, + "loss": 1.3096, + "mean_token_accuracy": 0.6652368158102036, + "num_tokens": 967351993.0, + "step": 5757 + }, + { + "entropy": 1.6991856694221497, + "epoch": 0.6325560956853699, + "grad_norm": 0.7402186989784241, + "learning_rate": 1.640302016075767e-05, + "loss": 1.4144, + "mean_token_accuracy": 0.6631848861773809, + "num_tokens": 967491867.0, + "step": 5758 + }, + { + "entropy": 1.7615701655546825, + "epoch": 0.6326659525967427, + "grad_norm": 0.8464493751525879, + "learning_rate": 1.6401739575352922e-05, + "loss": 1.4789, + "mean_token_accuracy": 0.6543097992738088, + "num_tokens": 967645170.0, + "step": 5759 + }, + { + "entropy": 1.6885365744431813, + "epoch": 0.6327758095081156, + "grad_norm": 0.7495446801185608, + "learning_rate": 1.640045881898555e-05, + "loss": 1.3634, + "mean_token_accuracy": 0.669794961810112, + "num_tokens": 967769391.0, + "step": 5760 + }, + { + "entropy": 1.7515502472718556, + "epoch": 0.6328856664194886, + "grad_norm": 0.6737968325614929, + "learning_rate": 1.639917789169609e-05, + "loss": 1.376, + "mean_token_accuracy": 0.6509286512931188, + "num_tokens": 967905792.0, + "step": 5761 + }, + { + "entropy": 1.6751770774523418, + "epoch": 0.6329955233308615, + "grad_norm": 0.6613587141036987, + "learning_rate": 1.639789679352508e-05, + "loss": 1.3934, + "mean_token_accuracy": 0.6460143725077311, + "num_tokens": 968068487.0, + "step": 5762 + }, + { + "entropy": 1.7160434822241466, + "epoch": 0.6331053802422345, + "grad_norm": 0.7124457359313965, + "learning_rate": 1.639661552451307e-05, + "loss": 1.2378, + "mean_token_accuracy": 0.6820594320694605, + "num_tokens": 968185469.0, + "step": 5763 + }, + { + "entropy": 1.7458227773507435, + "epoch": 0.6332152371536074, + "grad_norm": 0.6111082434654236, + "learning_rate": 1.6395334084700613e-05, + "loss": 1.4604, + "mean_token_accuracy": 0.6464193016290665, + "num_tokens": 968396545.0, + "step": 5764 + }, + { + "entropy": 1.7225351532300313, + "epoch": 0.6333250940649804, + "grad_norm": 0.7520685791969299, + "learning_rate": 1.6394052474128262e-05, + "loss": 1.3592, + "mean_token_accuracy": 0.6575525949398676, + "num_tokens": 968577101.0, + "step": 5765 + }, + { + "entropy": 1.7581463356812794, + "epoch": 0.6334349509763533, + "grad_norm": 0.6961262822151184, + "learning_rate": 1.639277069283658e-05, + "loss": 1.4337, + "mean_token_accuracy": 0.6417265683412552, + "num_tokens": 968738011.0, + "step": 5766 + }, + { + "entropy": 1.6993624071280162, + "epoch": 0.6335448078877263, + "grad_norm": 0.7254540920257568, + "learning_rate": 1.6391488740866137e-05, + "loss": 1.6107, + "mean_token_accuracy": 0.6395700896779696, + "num_tokens": 968951008.0, + "step": 5767 + }, + { + "entropy": 1.712354451417923, + "epoch": 0.6336546647990992, + "grad_norm": 0.8201763033866882, + "learning_rate": 1.6390206618257504e-05, + "loss": 1.3487, + "mean_token_accuracy": 0.6735559155543646, + "num_tokens": 969087698.0, + "step": 5768 + }, + { + "entropy": 1.6842971344788868, + "epoch": 0.6337645217104722, + "grad_norm": 0.6519754528999329, + "learning_rate": 1.6388924325051262e-05, + "loss": 1.401, + "mean_token_accuracy": 0.6581608355045319, + "num_tokens": 969272928.0, + "step": 5769 + }, + { + "entropy": 1.673642873764038, + "epoch": 0.633874378621845, + "grad_norm": 0.6902750134468079, + "learning_rate": 1.6387641861287988e-05, + "loss": 1.401, + "mean_token_accuracy": 0.6717785199483236, + "num_tokens": 969452469.0, + "step": 5770 + }, + { + "entropy": 1.6944104433059692, + "epoch": 0.633984235533218, + "grad_norm": 0.7268493175506592, + "learning_rate": 1.6386359227008283e-05, + "loss": 1.4569, + "mean_token_accuracy": 0.6476317048072815, + "num_tokens": 969642687.0, + "step": 5771 + }, + { + "entropy": 1.6414244969685872, + "epoch": 0.6340940924445909, + "grad_norm": 0.5762456655502319, + "learning_rate": 1.6385076422252735e-05, + "loss": 1.4876, + "mean_token_accuracy": 0.6473345657189687, + "num_tokens": 969895358.0, + "step": 5772 + }, + { + "entropy": 1.7236258288224537, + "epoch": 0.6342039493559638, + "grad_norm": 0.773324728012085, + "learning_rate": 1.638379344706194e-05, + "loss": 1.4069, + "mean_token_accuracy": 0.6589711755514145, + "num_tokens": 970022354.0, + "step": 5773 + }, + { + "entropy": 1.6633692582448323, + "epoch": 0.6343138062673368, + "grad_norm": 0.6954154968261719, + "learning_rate": 1.6382510301476514e-05, + "loss": 1.3966, + "mean_token_accuracy": 0.6627478500207266, + "num_tokens": 970183041.0, + "step": 5774 + }, + { + "entropy": 1.722163478533427, + "epoch": 0.6344236631787097, + "grad_norm": 0.6416156888008118, + "learning_rate": 1.638122698553706e-05, + "loss": 1.3794, + "mean_token_accuracy": 0.6767031103372574, + "num_tokens": 970325617.0, + "step": 5775 + }, + { + "entropy": 1.7074719667434692, + "epoch": 0.6345335200900827, + "grad_norm": 0.6638447046279907, + "learning_rate": 1.6379943499284194e-05, + "loss": 1.366, + "mean_token_accuracy": 0.6629780729611715, + "num_tokens": 970503187.0, + "step": 5776 + }, + { + "entropy": 1.7575515409310658, + "epoch": 0.6346433770014556, + "grad_norm": 0.6173842549324036, + "learning_rate": 1.6378659842758545e-05, + "loss": 1.4809, + "mean_token_accuracy": 0.6375833203395208, + "num_tokens": 970734107.0, + "step": 5777 + }, + { + "entropy": 1.6983816027641296, + "epoch": 0.6347532339128286, + "grad_norm": 0.638751208782196, + "learning_rate": 1.6377376016000735e-05, + "loss": 1.3994, + "mean_token_accuracy": 0.6642539451519648, + "num_tokens": 970909962.0, + "step": 5778 + }, + { + "entropy": 1.681634436051051, + "epoch": 0.6348630908242014, + "grad_norm": 0.7870696187019348, + "learning_rate": 1.6376092019051396e-05, + "loss": 1.5343, + "mean_token_accuracy": 0.6600727339585623, + "num_tokens": 971073686.0, + "step": 5779 + }, + { + "entropy": 1.6831410626570384, + "epoch": 0.6349729477355744, + "grad_norm": 0.7157701849937439, + "learning_rate": 1.6374807851951166e-05, + "loss": 1.4874, + "mean_token_accuracy": 0.6633025457461675, + "num_tokens": 971233291.0, + "step": 5780 + }, + { + "entropy": 1.6157586574554443, + "epoch": 0.6350828046469473, + "grad_norm": 0.6183757185935974, + "learning_rate": 1.637352351474069e-05, + "loss": 1.318, + "mean_token_accuracy": 0.6748292644818624, + "num_tokens": 971387042.0, + "step": 5781 + }, + { + "entropy": 1.7227964301904042, + "epoch": 0.6351926615583203, + "grad_norm": 0.7202012538909912, + "learning_rate": 1.6372239007460618e-05, + "loss": 1.38, + "mean_token_accuracy": 0.6581563999255499, + "num_tokens": 971608939.0, + "step": 5782 + }, + { + "entropy": 1.6987042327721913, + "epoch": 0.6353025184696932, + "grad_norm": 0.598973274230957, + "learning_rate": 1.63709543301516e-05, + "loss": 1.5169, + "mean_token_accuracy": 0.6307604809602102, + "num_tokens": 971829170.0, + "step": 5783 + }, + { + "entropy": 1.6923502782980602, + "epoch": 0.6354123753810662, + "grad_norm": 0.6629777550697327, + "learning_rate": 1.6369669482854298e-05, + "loss": 1.3937, + "mean_token_accuracy": 0.6485069692134857, + "num_tokens": 971977025.0, + "step": 5784 + }, + { + "entropy": 1.6766403019428253, + "epoch": 0.6355222322924391, + "grad_norm": 0.7367562055587769, + "learning_rate": 1.6368384465609376e-05, + "loss": 1.4578, + "mean_token_accuracy": 0.6594801992177963, + "num_tokens": 972163071.0, + "step": 5785 + }, + { + "entropy": 1.7118110756079357, + "epoch": 0.6356320892038121, + "grad_norm": 0.6576919555664062, + "learning_rate": 1.636709927845751e-05, + "loss": 1.4844, + "mean_token_accuracy": 0.6427266945441564, + "num_tokens": 972368158.0, + "step": 5786 + }, + { + "entropy": 1.712723731994629, + "epoch": 0.635741946115185, + "grad_norm": 0.6602995991706848, + "learning_rate": 1.6365813921439365e-05, + "loss": 1.3296, + "mean_token_accuracy": 0.6641115595897039, + "num_tokens": 972525592.0, + "step": 5787 + }, + { + "entropy": 1.7280906836191814, + "epoch": 0.6358518030265579, + "grad_norm": 0.7627761960029602, + "learning_rate": 1.6364528394595627e-05, + "loss": 1.3437, + "mean_token_accuracy": 0.6574230591456095, + "num_tokens": 972716798.0, + "step": 5788 + }, + { + "entropy": 1.7323518792788188, + "epoch": 0.6359616599379309, + "grad_norm": 0.714829683303833, + "learning_rate": 1.6363242697966984e-05, + "loss": 1.495, + "mean_token_accuracy": 0.6422079453865687, + "num_tokens": 972883401.0, + "step": 5789 + }, + { + "entropy": 1.6686233679453533, + "epoch": 0.6360715168493037, + "grad_norm": 0.6240226030349731, + "learning_rate": 1.636195683159413e-05, + "loss": 1.4466, + "mean_token_accuracy": 0.662479097644488, + "num_tokens": 973067632.0, + "step": 5790 + }, + { + "entropy": 1.6985422770182292, + "epoch": 0.6361813737606767, + "grad_norm": 0.7517790198326111, + "learning_rate": 1.6360670795517754e-05, + "loss": 1.2413, + "mean_token_accuracy": 0.6770857026179632, + "num_tokens": 973199330.0, + "step": 5791 + }, + { + "entropy": 1.7671143412590027, + "epoch": 0.6362912306720496, + "grad_norm": 0.6133571863174438, + "learning_rate": 1.6359384589778563e-05, + "loss": 1.3633, + "mean_token_accuracy": 0.6553646673758825, + "num_tokens": 973335730.0, + "step": 5792 + }, + { + "entropy": 1.7020770212014515, + "epoch": 0.6364010875834226, + "grad_norm": 0.800118088722229, + "learning_rate": 1.6358098214417263e-05, + "loss": 1.5184, + "mean_token_accuracy": 0.6433244993289312, + "num_tokens": 973529438.0, + "step": 5793 + }, + { + "entropy": 1.6572450300057728, + "epoch": 0.6365109444947955, + "grad_norm": 0.6743999123573303, + "learning_rate": 1.635681166947457e-05, + "loss": 1.3541, + "mean_token_accuracy": 0.665309856335322, + "num_tokens": 973686576.0, + "step": 5794 + }, + { + "entropy": 1.7047623197237651, + "epoch": 0.6366208014061685, + "grad_norm": 0.7407748699188232, + "learning_rate": 1.6355524954991205e-05, + "loss": 1.3828, + "mean_token_accuracy": 0.6678665081659952, + "num_tokens": 973906181.0, + "step": 5795 + }, + { + "entropy": 1.6628866891066234, + "epoch": 0.6367306583175414, + "grad_norm": 0.5496436953544617, + "learning_rate": 1.6354238071007887e-05, + "loss": 1.2899, + "mean_token_accuracy": 0.6707668304443359, + "num_tokens": 974089134.0, + "step": 5796 + }, + { + "entropy": 1.7366754313309987, + "epoch": 0.6368405152289144, + "grad_norm": 0.5878070592880249, + "learning_rate": 1.6352951017565346e-05, + "loss": 1.5668, + "mean_token_accuracy": 0.6261717478434244, + "num_tokens": 974292842.0, + "step": 5797 + }, + { + "entropy": 1.6523142755031586, + "epoch": 0.6369503721402873, + "grad_norm": 0.6288219690322876, + "learning_rate": 1.6351663794704316e-05, + "loss": 1.4322, + "mean_token_accuracy": 0.6486099511384964, + "num_tokens": 974493940.0, + "step": 5798 + }, + { + "entropy": 1.6582373181978862, + "epoch": 0.6370602290516603, + "grad_norm": 0.7235569953918457, + "learning_rate": 1.635037640246554e-05, + "loss": 1.2853, + "mean_token_accuracy": 0.6691482861836752, + "num_tokens": 974630137.0, + "step": 5799 + }, + { + "entropy": 1.744762162367503, + "epoch": 0.6371700859630332, + "grad_norm": 0.7221118211746216, + "learning_rate": 1.634908884088976e-05, + "loss": 1.4468, + "mean_token_accuracy": 0.6466242223978043, + "num_tokens": 974816980.0, + "step": 5800 + }, + { + "entropy": 1.7268809576829274, + "epoch": 0.637279942874406, + "grad_norm": 0.6569739580154419, + "learning_rate": 1.634780111001773e-05, + "loss": 1.3366, + "mean_token_accuracy": 0.6608653118213018, + "num_tokens": 974959442.0, + "step": 5801 + }, + { + "entropy": 1.7274185717105865, + "epoch": 0.637389799785779, + "grad_norm": 0.6722861528396606, + "learning_rate": 1.6346513209890206e-05, + "loss": 1.2967, + "mean_token_accuracy": 0.6615037868420283, + "num_tokens": 975083415.0, + "step": 5802 + }, + { + "entropy": 1.6528538862864177, + "epoch": 0.6374996566971519, + "grad_norm": 0.6227561235427856, + "learning_rate": 1.6345225140547946e-05, + "loss": 1.3146, + "mean_token_accuracy": 0.6666281570990881, + "num_tokens": 975209878.0, + "step": 5803 + }, + { + "entropy": 1.6657472550868988, + "epoch": 0.6376095136085249, + "grad_norm": 0.6815557479858398, + "learning_rate": 1.634393690203172e-05, + "loss": 1.3838, + "mean_token_accuracy": 0.6573799202839533, + "num_tokens": 975397134.0, + "step": 5804 + }, + { + "entropy": 1.6994928816954296, + "epoch": 0.6377193705198978, + "grad_norm": 0.6876154541969299, + "learning_rate": 1.63426484943823e-05, + "loss": 1.3021, + "mean_token_accuracy": 0.6681809027989706, + "num_tokens": 975549830.0, + "step": 5805 + }, + { + "entropy": 1.6880747079849243, + "epoch": 0.6378292274312708, + "grad_norm": 0.649737536907196, + "learning_rate": 1.6341359917640462e-05, + "loss": 1.2683, + "mean_token_accuracy": 0.6772895157337189, + "num_tokens": 975724955.0, + "step": 5806 + }, + { + "entropy": 1.7019491692384083, + "epoch": 0.6379390843426437, + "grad_norm": 0.680798351764679, + "learning_rate": 1.634007117184699e-05, + "loss": 1.3433, + "mean_token_accuracy": 0.6644106159607569, + "num_tokens": 975899143.0, + "step": 5807 + }, + { + "entropy": 1.8168910245100658, + "epoch": 0.6380489412540167, + "grad_norm": 1.041638970375061, + "learning_rate": 1.633878225704267e-05, + "loss": 1.4986, + "mean_token_accuracy": 0.640401303768158, + "num_tokens": 976090892.0, + "step": 5808 + }, + { + "entropy": 1.7052318652470906, + "epoch": 0.6381587981653896, + "grad_norm": 0.5862450003623962, + "learning_rate": 1.63374931732683e-05, + "loss": 1.3986, + "mean_token_accuracy": 0.6688533673683802, + "num_tokens": 976245143.0, + "step": 5809 + }, + { + "entropy": 1.6716302533944447, + "epoch": 0.6382686550767626, + "grad_norm": 0.6785920858383179, + "learning_rate": 1.633620392056467e-05, + "loss": 1.3501, + "mean_token_accuracy": 0.6693478226661682, + "num_tokens": 976369806.0, + "step": 5810 + }, + { + "entropy": 1.6544945339361827, + "epoch": 0.6383785119881354, + "grad_norm": 0.5717010498046875, + "learning_rate": 1.6334914498972595e-05, + "loss": 1.3667, + "mean_token_accuracy": 0.6694445610046387, + "num_tokens": 976553064.0, + "step": 5811 + }, + { + "entropy": 1.7117507060368855, + "epoch": 0.6384883688995084, + "grad_norm": 0.6321550607681274, + "learning_rate": 1.633362490853288e-05, + "loss": 1.5681, + "mean_token_accuracy": 0.6327792455752691, + "num_tokens": 976731552.0, + "step": 5812 + }, + { + "entropy": 1.70002148548762, + "epoch": 0.6385982258108813, + "grad_norm": 0.634379506111145, + "learning_rate": 1.633233514928634e-05, + "loss": 1.3679, + "mean_token_accuracy": 0.6579567342996597, + "num_tokens": 976914840.0, + "step": 5813 + }, + { + "entropy": 1.7061622142791748, + "epoch": 0.6387080827222542, + "grad_norm": 0.6758495569229126, + "learning_rate": 1.6331045221273795e-05, + "loss": 1.3905, + "mean_token_accuracy": 0.6592559516429901, + "num_tokens": 977102576.0, + "step": 5814 + }, + { + "entropy": 1.6816378434499104, + "epoch": 0.6388179396336272, + "grad_norm": 0.6217886805534363, + "learning_rate": 1.6329755124536074e-05, + "loss": 1.3363, + "mean_token_accuracy": 0.6602163165807724, + "num_tokens": 977243005.0, + "step": 5815 + }, + { + "entropy": 1.6444752017656963, + "epoch": 0.6389277965450001, + "grad_norm": 0.7023751735687256, + "learning_rate": 1.6328464859113998e-05, + "loss": 1.3488, + "mean_token_accuracy": 0.6646647155284882, + "num_tokens": 977397257.0, + "step": 5816 + }, + { + "entropy": 1.756317913532257, + "epoch": 0.6390376534563731, + "grad_norm": 0.753580629825592, + "learning_rate": 1.6327174425048415e-05, + "loss": 1.34, + "mean_token_accuracy": 0.6718757003545761, + "num_tokens": 977535347.0, + "step": 5817 + }, + { + "entropy": 1.6862174967924755, + "epoch": 0.639147510367746, + "grad_norm": 0.6223485469818115, + "learning_rate": 1.632588382238016e-05, + "loss": 1.2357, + "mean_token_accuracy": 0.6681343664725622, + "num_tokens": 977680666.0, + "step": 5818 + }, + { + "entropy": 1.6936693688233693, + "epoch": 0.639257367279119, + "grad_norm": 0.6414377689361572, + "learning_rate": 1.6324593051150084e-05, + "loss": 1.3485, + "mean_token_accuracy": 0.6657200207312902, + "num_tokens": 977840285.0, + "step": 5819 + }, + { + "entropy": 1.6185721854368846, + "epoch": 0.6393672241904919, + "grad_norm": 0.5484555959701538, + "learning_rate": 1.632330211139904e-05, + "loss": 1.2784, + "mean_token_accuracy": 0.6813697318236033, + "num_tokens": 978022560.0, + "step": 5820 + }, + { + "entropy": 1.6399174928665161, + "epoch": 0.6394770811018649, + "grad_norm": 0.6649972796440125, + "learning_rate": 1.6322011003167877e-05, + "loss": 1.3435, + "mean_token_accuracy": 0.6716059247652689, + "num_tokens": 978163797.0, + "step": 5821 + }, + { + "entropy": 1.761791964371999, + "epoch": 0.6395869380132377, + "grad_norm": 0.7113659381866455, + "learning_rate": 1.6320719726497465e-05, + "loss": 1.3612, + "mean_token_accuracy": 0.655978669722875, + "num_tokens": 978316603.0, + "step": 5822 + }, + { + "entropy": 1.7087388435999553, + "epoch": 0.6396967949246107, + "grad_norm": 0.6851733326911926, + "learning_rate": 1.6319428281428674e-05, + "loss": 1.3932, + "mean_token_accuracy": 0.6515816897153854, + "num_tokens": 978468692.0, + "step": 5823 + }, + { + "entropy": 1.7219915489355724, + "epoch": 0.6398066518359836, + "grad_norm": 0.6899517774581909, + "learning_rate": 1.6318136668002374e-05, + "loss": 1.5738, + "mean_token_accuracy": 0.6320692549149195, + "num_tokens": 978646354.0, + "step": 5824 + }, + { + "entropy": 1.705000917116801, + "epoch": 0.6399165087473566, + "grad_norm": 0.7402633428573608, + "learning_rate": 1.6316844886259443e-05, + "loss": 1.2827, + "mean_token_accuracy": 0.6657779663801193, + "num_tokens": 978781744.0, + "step": 5825 + }, + { + "entropy": 1.7203875084718068, + "epoch": 0.6400263656587295, + "grad_norm": 0.8369673490524292, + "learning_rate": 1.631555293624077e-05, + "loss": 1.4431, + "mean_token_accuracy": 0.6510690748691559, + "num_tokens": 978918388.0, + "step": 5826 + }, + { + "entropy": 1.6911202172438304, + "epoch": 0.6401362225701024, + "grad_norm": 0.7420011162757874, + "learning_rate": 1.6314260817987237e-05, + "loss": 1.3414, + "mean_token_accuracy": 0.6616135090589523, + "num_tokens": 979058916.0, + "step": 5827 + }, + { + "entropy": 1.7454005479812622, + "epoch": 0.6402460794814754, + "grad_norm": 0.6640864610671997, + "learning_rate": 1.6312968531539748e-05, + "loss": 1.4186, + "mean_token_accuracy": 0.6549846281607946, + "num_tokens": 979208637.0, + "step": 5828 + }, + { + "entropy": 1.7146745920181274, + "epoch": 0.6403559363928483, + "grad_norm": 0.736393392086029, + "learning_rate": 1.6311676076939197e-05, + "loss": 1.4239, + "mean_token_accuracy": 0.6446640143791834, + "num_tokens": 979366751.0, + "step": 5829 + }, + { + "entropy": 1.7976812819639842, + "epoch": 0.6404657933042213, + "grad_norm": 0.7656483054161072, + "learning_rate": 1.6310383454226496e-05, + "loss": 1.5592, + "mean_token_accuracy": 0.633850152293841, + "num_tokens": 979523642.0, + "step": 5830 + }, + { + "entropy": 1.7306662797927856, + "epoch": 0.6405756502155942, + "grad_norm": 0.7166682481765747, + "learning_rate": 1.6309090663442546e-05, + "loss": 1.416, + "mean_token_accuracy": 0.6429750323295593, + "num_tokens": 979684950.0, + "step": 5831 + }, + { + "entropy": 1.6555648644765217, + "epoch": 0.6406855071269671, + "grad_norm": 0.6912305951118469, + "learning_rate": 1.6307797704628272e-05, + "loss": 1.2916, + "mean_token_accuracy": 0.6630461364984512, + "num_tokens": 979822099.0, + "step": 5832 + }, + { + "entropy": 1.7157772084077199, + "epoch": 0.64079536403834, + "grad_norm": 0.5763871073722839, + "learning_rate": 1.6306504577824594e-05, + "loss": 1.4631, + "mean_token_accuracy": 0.6457369774580002, + "num_tokens": 980073550.0, + "step": 5833 + }, + { + "entropy": 1.6614607473214467, + "epoch": 0.640905220949713, + "grad_norm": 0.6631821990013123, + "learning_rate": 1.6305211283072432e-05, + "loss": 1.4923, + "mean_token_accuracy": 0.6427841186523438, + "num_tokens": 980267997.0, + "step": 5834 + }, + { + "entropy": 1.7480806112289429, + "epoch": 0.6410150778610859, + "grad_norm": 0.6579576730728149, + "learning_rate": 1.6303917820412726e-05, + "loss": 1.4038, + "mean_token_accuracy": 0.6576637079318365, + "num_tokens": 980443398.0, + "step": 5835 + }, + { + "entropy": 1.757957011461258, + "epoch": 0.6411249347724589, + "grad_norm": 0.8210548758506775, + "learning_rate": 1.630262418988641e-05, + "loss": 1.6428, + "mean_token_accuracy": 0.6485659529765447, + "num_tokens": 980594525.0, + "step": 5836 + }, + { + "entropy": 1.736217776934306, + "epoch": 0.6412347916838318, + "grad_norm": 0.6790369749069214, + "learning_rate": 1.6301330391534432e-05, + "loss": 1.5209, + "mean_token_accuracy": 0.6406088074048361, + "num_tokens": 980755445.0, + "step": 5837 + }, + { + "entropy": 1.7437133093674977, + "epoch": 0.6413446485952048, + "grad_norm": 0.7322378754615784, + "learning_rate": 1.6300036425397732e-05, + "loss": 1.4433, + "mean_token_accuracy": 0.6456714073816935, + "num_tokens": 980925437.0, + "step": 5838 + }, + { + "entropy": 1.7406888504823048, + "epoch": 0.6414545055065777, + "grad_norm": 0.8085801601409912, + "learning_rate": 1.629874229151727e-05, + "loss": 1.4783, + "mean_token_accuracy": 0.6532542854547501, + "num_tokens": 981085144.0, + "step": 5839 + }, + { + "entropy": 1.6952376067638397, + "epoch": 0.6415643624179507, + "grad_norm": 0.7089763879776001, + "learning_rate": 1.6297447989934e-05, + "loss": 1.3368, + "mean_token_accuracy": 0.6644267588853836, + "num_tokens": 981266976.0, + "step": 5840 + }, + { + "entropy": 1.6870313982168834, + "epoch": 0.6416742193293236, + "grad_norm": 0.6560161709785461, + "learning_rate": 1.6296153520688886e-05, + "loss": 1.4609, + "mean_token_accuracy": 0.6392157872517904, + "num_tokens": 981482379.0, + "step": 5841 + }, + { + "entropy": 1.713809609413147, + "epoch": 0.6417840762406964, + "grad_norm": 0.6234894394874573, + "learning_rate": 1.6294858883822902e-05, + "loss": 1.3206, + "mean_token_accuracy": 0.6720296243826548, + "num_tokens": 981646187.0, + "step": 5842 + }, + { + "entropy": 1.7012270887692769, + "epoch": 0.6418939331520694, + "grad_norm": 0.6372230052947998, + "learning_rate": 1.6293564079377024e-05, + "loss": 1.399, + "mean_token_accuracy": 0.646788035829862, + "num_tokens": 981809605.0, + "step": 5843 + }, + { + "entropy": 1.6442164182662964, + "epoch": 0.6420037900634423, + "grad_norm": 0.6384155750274658, + "learning_rate": 1.6292269107392223e-05, + "loss": 1.3745, + "mean_token_accuracy": 0.6641842971245447, + "num_tokens": 981981077.0, + "step": 5844 + }, + { + "entropy": 1.723905752102534, + "epoch": 0.6421136469748153, + "grad_norm": 0.6278502345085144, + "learning_rate": 1.6290973967909492e-05, + "loss": 1.3378, + "mean_token_accuracy": 0.66019007563591, + "num_tokens": 982155308.0, + "step": 5845 + }, + { + "entropy": 1.704160491625468, + "epoch": 0.6422235038861882, + "grad_norm": 0.6128730177879333, + "learning_rate": 1.6289678660969818e-05, + "loss": 1.4123, + "mean_token_accuracy": 0.663148025671641, + "num_tokens": 982334839.0, + "step": 5846 + }, + { + "entropy": 1.6866064369678497, + "epoch": 0.6423333607975612, + "grad_norm": 0.5298371911048889, + "learning_rate": 1.6288383186614198e-05, + "loss": 1.4093, + "mean_token_accuracy": 0.6518398175636927, + "num_tokens": 982563768.0, + "step": 5847 + }, + { + "entropy": 1.716744065284729, + "epoch": 0.6424432177089341, + "grad_norm": 0.6819528937339783, + "learning_rate": 1.6287087544883633e-05, + "loss": 1.354, + "mean_token_accuracy": 0.6657714794079462, + "num_tokens": 982751166.0, + "step": 5848 + }, + { + "entropy": 1.6699997087319691, + "epoch": 0.6425530746203071, + "grad_norm": 0.8962133526802063, + "learning_rate": 1.628579173581913e-05, + "loss": 1.3544, + "mean_token_accuracy": 0.6710561861594518, + "num_tokens": 982903845.0, + "step": 5849 + }, + { + "entropy": 1.6850681801637013, + "epoch": 0.64266293153168, + "grad_norm": 0.6950148940086365, + "learning_rate": 1.62844957594617e-05, + "loss": 1.4295, + "mean_token_accuracy": 0.6709064096212387, + "num_tokens": 983059833.0, + "step": 5850 + }, + { + "entropy": 1.6466986139615376, + "epoch": 0.642772788443053, + "grad_norm": 0.7298344373703003, + "learning_rate": 1.6283199615852364e-05, + "loss": 1.2233, + "mean_token_accuracy": 0.6797444274028143, + "num_tokens": 983193013.0, + "step": 5851 + }, + { + "entropy": 1.6685850421587627, + "epoch": 0.6428826453544259, + "grad_norm": 0.7035362124443054, + "learning_rate": 1.6281903305032135e-05, + "loss": 1.3384, + "mean_token_accuracy": 0.6661973843971888, + "num_tokens": 983321092.0, + "step": 5852 + }, + { + "entropy": 1.722305456797282, + "epoch": 0.6429925022657988, + "grad_norm": 0.6722835302352905, + "learning_rate": 1.6280606827042053e-05, + "loss": 1.3151, + "mean_token_accuracy": 0.6609906901915868, + "num_tokens": 983476645.0, + "step": 5853 + }, + { + "entropy": 1.7051582833131154, + "epoch": 0.6431023591771717, + "grad_norm": 0.591584324836731, + "learning_rate": 1.6279310181923137e-05, + "loss": 1.4214, + "mean_token_accuracy": 0.65077872077624, + "num_tokens": 983641414.0, + "step": 5854 + }, + { + "entropy": 1.7542580962181091, + "epoch": 0.6432122160885446, + "grad_norm": 0.7439432144165039, + "learning_rate": 1.627801336971644e-05, + "loss": 1.3505, + "mean_token_accuracy": 0.6534154663483301, + "num_tokens": 983766157.0, + "step": 5855 + }, + { + "entropy": 1.7984492977460225, + "epoch": 0.6433220729999176, + "grad_norm": 0.7941092848777771, + "learning_rate": 1.627671639046299e-05, + "loss": 1.7088, + "mean_token_accuracy": 0.6389260292053223, + "num_tokens": 983977429.0, + "step": 5856 + }, + { + "entropy": 1.720232754945755, + "epoch": 0.6434319299112905, + "grad_norm": 0.8250980973243713, + "learning_rate": 1.6275419244203853e-05, + "loss": 1.519, + "mean_token_accuracy": 0.6374199092388153, + "num_tokens": 984183890.0, + "step": 5857 + }, + { + "entropy": 1.736306478579839, + "epoch": 0.6435417868226635, + "grad_norm": 0.6806954741477966, + "learning_rate": 1.627412193098007e-05, + "loss": 1.5107, + "mean_token_accuracy": 0.6463406682014465, + "num_tokens": 984373464.0, + "step": 5858 + }, + { + "entropy": 1.7540569305419922, + "epoch": 0.6436516437340364, + "grad_norm": 0.7410135865211487, + "learning_rate": 1.62728244508327e-05, + "loss": 1.2862, + "mean_token_accuracy": 0.6681255847215652, + "num_tokens": 984513422.0, + "step": 5859 + }, + { + "entropy": 1.6755077838897705, + "epoch": 0.6437615006454094, + "grad_norm": 0.6239484548568726, + "learning_rate": 1.6271526803802818e-05, + "loss": 1.3604, + "mean_token_accuracy": 0.6560343901316324, + "num_tokens": 984693720.0, + "step": 5860 + }, + { + "entropy": 1.645962009827296, + "epoch": 0.6438713575567823, + "grad_norm": 0.5675744414329529, + "learning_rate": 1.6270228989931487e-05, + "loss": 1.3538, + "mean_token_accuracy": 0.6722413251797358, + "num_tokens": 984879033.0, + "step": 5861 + }, + { + "entropy": 1.7259068687756856, + "epoch": 0.6439812144681553, + "grad_norm": 0.6161162853240967, + "learning_rate": 1.6268931009259782e-05, + "loss": 1.4002, + "mean_token_accuracy": 0.6652526358763377, + "num_tokens": 985044446.0, + "step": 5862 + }, + { + "entropy": 1.7283404767513275, + "epoch": 0.6440910713795281, + "grad_norm": 0.7567533254623413, + "learning_rate": 1.6267632861828784e-05, + "loss": 1.3376, + "mean_token_accuracy": 0.6561006804307302, + "num_tokens": 985207012.0, + "step": 5863 + }, + { + "entropy": 1.6944616238276164, + "epoch": 0.6442009282909011, + "grad_norm": 0.676316499710083, + "learning_rate": 1.6266334547679584e-05, + "loss": 1.4184, + "mean_token_accuracy": 0.65648120145003, + "num_tokens": 985355826.0, + "step": 5864 + }, + { + "entropy": 1.7600885530312855, + "epoch": 0.644310785202274, + "grad_norm": 0.672926127910614, + "learning_rate": 1.626503606685326e-05, + "loss": 1.4699, + "mean_token_accuracy": 0.6459670712550482, + "num_tokens": 985505942.0, + "step": 5865 + }, + { + "entropy": 1.7543242474397023, + "epoch": 0.644420642113647, + "grad_norm": 0.7283417582511902, + "learning_rate": 1.6263737419390924e-05, + "loss": 1.3375, + "mean_token_accuracy": 0.6589344541231791, + "num_tokens": 985632534.0, + "step": 5866 + }, + { + "entropy": 1.7102359334627788, + "epoch": 0.6445304990250199, + "grad_norm": 0.6508517265319824, + "learning_rate": 1.626243860533367e-05, + "loss": 1.3067, + "mean_token_accuracy": 0.6687440226475397, + "num_tokens": 985812243.0, + "step": 5867 + }, + { + "entropy": 1.651482840379079, + "epoch": 0.6446403559363928, + "grad_norm": 0.8385653495788574, + "learning_rate": 1.6261139624722607e-05, + "loss": 1.3541, + "mean_token_accuracy": 0.6784233748912811, + "num_tokens": 986033493.0, + "step": 5868 + }, + { + "entropy": 1.8157791793346405, + "epoch": 0.6447502128477658, + "grad_norm": 0.7561879754066467, + "learning_rate": 1.6259840477598842e-05, + "loss": 1.6814, + "mean_token_accuracy": 0.6107426683108012, + "num_tokens": 986264905.0, + "step": 5869 + }, + { + "entropy": 1.7169578472773235, + "epoch": 0.6448600697591387, + "grad_norm": 0.6490621566772461, + "learning_rate": 1.6258541164003497e-05, + "loss": 1.48, + "mean_token_accuracy": 0.6457877457141876, + "num_tokens": 986432715.0, + "step": 5870 + }, + { + "entropy": 1.694730967283249, + "epoch": 0.6449699266705117, + "grad_norm": 0.6287488341331482, + "learning_rate": 1.6257241683977695e-05, + "loss": 1.3854, + "mean_token_accuracy": 0.6665887037913004, + "num_tokens": 986596620.0, + "step": 5871 + }, + { + "entropy": 1.7151194314161937, + "epoch": 0.6450797835818846, + "grad_norm": 0.7435504794120789, + "learning_rate": 1.625594203756256e-05, + "loss": 1.5081, + "mean_token_accuracy": 0.6542889624834061, + "num_tokens": 986763978.0, + "step": 5872 + }, + { + "entropy": 1.7020180424054463, + "epoch": 0.6451896404932576, + "grad_norm": 0.6360806226730347, + "learning_rate": 1.625464222479923e-05, + "loss": 1.3542, + "mean_token_accuracy": 0.664187510808309, + "num_tokens": 986915228.0, + "step": 5873 + }, + { + "entropy": 1.6966914137204487, + "epoch": 0.6452994974046304, + "grad_norm": 0.8072082996368408, + "learning_rate": 1.625334224572884e-05, + "loss": 1.4327, + "mean_token_accuracy": 0.6710225045681, + "num_tokens": 987091477.0, + "step": 5874 + }, + { + "entropy": 1.6498130361239116, + "epoch": 0.6454093543160034, + "grad_norm": 0.7897175550460815, + "learning_rate": 1.6252042100392535e-05, + "loss": 1.4103, + "mean_token_accuracy": 0.6601500709851583, + "num_tokens": 987264375.0, + "step": 5875 + }, + { + "entropy": 1.687047004699707, + "epoch": 0.6455192112273763, + "grad_norm": 0.607562243938446, + "learning_rate": 1.6250741788831466e-05, + "loss": 1.4087, + "mean_token_accuracy": 0.655857135852178, + "num_tokens": 987426445.0, + "step": 5876 + }, + { + "entropy": 1.7364888588587444, + "epoch": 0.6456290681387493, + "grad_norm": 0.5785548090934753, + "learning_rate": 1.6249441311086788e-05, + "loss": 1.5631, + "mean_token_accuracy": 0.6313952604929606, + "num_tokens": 987658927.0, + "step": 5877 + }, + { + "entropy": 1.6707642773787181, + "epoch": 0.6457389250501222, + "grad_norm": 0.8916179537773132, + "learning_rate": 1.624814066719965e-05, + "loss": 1.3344, + "mean_token_accuracy": 0.6687265535195669, + "num_tokens": 987785339.0, + "step": 5878 + }, + { + "entropy": 1.6992899874846141, + "epoch": 0.6458487819614952, + "grad_norm": 0.6406787633895874, + "learning_rate": 1.624683985721123e-05, + "loss": 1.3642, + "mean_token_accuracy": 0.6527419139941534, + "num_tokens": 987956404.0, + "step": 5879 + }, + { + "entropy": 1.6912338038285573, + "epoch": 0.6459586388728681, + "grad_norm": 0.7404332160949707, + "learning_rate": 1.6245538881162693e-05, + "loss": 1.2955, + "mean_token_accuracy": 0.6724090029795965, + "num_tokens": 988096917.0, + "step": 5880 + }, + { + "entropy": 1.7176282107830048, + "epoch": 0.6460684957842411, + "grad_norm": 0.737346887588501, + "learning_rate": 1.624423773909521e-05, + "loss": 1.3986, + "mean_token_accuracy": 0.6630405435959498, + "num_tokens": 988245509.0, + "step": 5881 + }, + { + "entropy": 1.709346890449524, + "epoch": 0.646178352695614, + "grad_norm": 0.6211323738098145, + "learning_rate": 1.6242936431049973e-05, + "loss": 1.4867, + "mean_token_accuracy": 0.6363749404748281, + "num_tokens": 988448841.0, + "step": 5882 + }, + { + "entropy": 1.7240298589070637, + "epoch": 0.6462882096069869, + "grad_norm": 0.6329193115234375, + "learning_rate": 1.6241634957068155e-05, + "loss": 1.4412, + "mean_token_accuracy": 0.6437714745601019, + "num_tokens": 988619793.0, + "step": 5883 + }, + { + "entropy": 1.724859396616618, + "epoch": 0.6463980665183598, + "grad_norm": 0.6122381687164307, + "learning_rate": 1.6240333317190953e-05, + "loss": 1.4146, + "mean_token_accuracy": 0.6419190764427185, + "num_tokens": 988803888.0, + "step": 5884 + }, + { + "entropy": 1.7188538114229839, + "epoch": 0.6465079234297327, + "grad_norm": 0.6661576628684998, + "learning_rate": 1.6239031511459564e-05, + "loss": 1.381, + "mean_token_accuracy": 0.657758911450704, + "num_tokens": 988978211.0, + "step": 5885 + }, + { + "entropy": 1.7248981595039368, + "epoch": 0.6466177803411057, + "grad_norm": 0.7169018387794495, + "learning_rate": 1.6237729539915187e-05, + "loss": 1.4442, + "mean_token_accuracy": 0.6563170303901037, + "num_tokens": 989143411.0, + "step": 5886 + }, + { + "entropy": 1.7199460367361705, + "epoch": 0.6467276372524786, + "grad_norm": 0.5790041089057922, + "learning_rate": 1.6236427402599032e-05, + "loss": 1.5012, + "mean_token_accuracy": 0.6406532824039459, + "num_tokens": 989369510.0, + "step": 5887 + }, + { + "entropy": 1.7657626469930012, + "epoch": 0.6468374941638516, + "grad_norm": 0.8561536073684692, + "learning_rate": 1.623512509955231e-05, + "loss": 1.4471, + "mean_token_accuracy": 0.659746582309405, + "num_tokens": 989548102.0, + "step": 5888 + }, + { + "entropy": 1.6926214396953583, + "epoch": 0.6469473510752245, + "grad_norm": 0.6689512133598328, + "learning_rate": 1.6233822630816234e-05, + "loss": 1.2409, + "mean_token_accuracy": 0.6762634168068568, + "num_tokens": 989679599.0, + "step": 5889 + }, + { + "entropy": 1.7333985964457195, + "epoch": 0.6470572079865975, + "grad_norm": 0.7683215141296387, + "learning_rate": 1.6232519996432035e-05, + "loss": 1.3786, + "mean_token_accuracy": 0.6628393729527792, + "num_tokens": 989827866.0, + "step": 5890 + }, + { + "entropy": 1.7282644311587017, + "epoch": 0.6471670648979704, + "grad_norm": 0.731306254863739, + "learning_rate": 1.623121719644093e-05, + "loss": 1.3069, + "mean_token_accuracy": 0.6695700138807297, + "num_tokens": 989936813.0, + "step": 5891 + }, + { + "entropy": 1.6467431485652924, + "epoch": 0.6472769218093434, + "grad_norm": 0.7342338562011719, + "learning_rate": 1.6229914230884163e-05, + "loss": 1.4412, + "mean_token_accuracy": 0.6496950487295786, + "num_tokens": 990122059.0, + "step": 5892 + }, + { + "entropy": 1.7180868089199066, + "epoch": 0.6473867787207163, + "grad_norm": 0.7283189296722412, + "learning_rate": 1.6228611099802964e-05, + "loss": 1.2448, + "mean_token_accuracy": 0.6772717932860056, + "num_tokens": 990230529.0, + "step": 5893 + }, + { + "entropy": 1.7020895679791768, + "epoch": 0.6474966356320893, + "grad_norm": 0.6616482734680176, + "learning_rate": 1.6227307803238585e-05, + "loss": 1.6071, + "mean_token_accuracy": 0.6459571321805319, + "num_tokens": 990458221.0, + "step": 5894 + }, + { + "entropy": 1.7780593534310658, + "epoch": 0.6476064925434621, + "grad_norm": 0.686166524887085, + "learning_rate": 1.6226004341232265e-05, + "loss": 1.3715, + "mean_token_accuracy": 0.6415807555119196, + "num_tokens": 990660990.0, + "step": 5895 + }, + { + "entropy": 1.6996767024199169, + "epoch": 0.647716349454835, + "grad_norm": 0.7599523067474365, + "learning_rate": 1.622470071382526e-05, + "loss": 1.326, + "mean_token_accuracy": 0.6618270923693975, + "num_tokens": 990797885.0, + "step": 5896 + }, + { + "entropy": 1.6373738249142964, + "epoch": 0.647826206366208, + "grad_norm": 0.60347580909729, + "learning_rate": 1.622339692105884e-05, + "loss": 1.315, + "mean_token_accuracy": 0.6724487642447153, + "num_tokens": 990944305.0, + "step": 5897 + }, + { + "entropy": 1.648904373248418, + "epoch": 0.6479360632775809, + "grad_norm": 0.6465940475463867, + "learning_rate": 1.6222092962974255e-05, + "loss": 1.3056, + "mean_token_accuracy": 0.6669845134019852, + "num_tokens": 991127904.0, + "step": 5898 + }, + { + "entropy": 1.7032305300235748, + "epoch": 0.6480459201889539, + "grad_norm": 0.7024495601654053, + "learning_rate": 1.622078883961278e-05, + "loss": 1.3523, + "mean_token_accuracy": 0.6614350527524948, + "num_tokens": 991310770.0, + "step": 5899 + }, + { + "entropy": 1.7386066218217213, + "epoch": 0.6481557771003268, + "grad_norm": 0.7756279110908508, + "learning_rate": 1.6219484551015694e-05, + "loss": 1.4412, + "mean_token_accuracy": 0.6557717273632685, + "num_tokens": 991453676.0, + "step": 5900 + }, + { + "entropy": 1.6870729128519695, + "epoch": 0.6482656340116998, + "grad_norm": 0.7938646078109741, + "learning_rate": 1.6218180097224273e-05, + "loss": 1.4533, + "mean_token_accuracy": 0.6631547510623932, + "num_tokens": 991615628.0, + "step": 5901 + }, + { + "entropy": 1.6689063012599945, + "epoch": 0.6483754909230727, + "grad_norm": 0.7687073945999146, + "learning_rate": 1.6216875478279802e-05, + "loss": 1.2669, + "mean_token_accuracy": 0.6779245138168335, + "num_tokens": 991759251.0, + "step": 5902 + }, + { + "entropy": 1.7391593058904011, + "epoch": 0.6484853478344457, + "grad_norm": 0.7042144536972046, + "learning_rate": 1.6215570694223574e-05, + "loss": 1.5169, + "mean_token_accuracy": 0.6359234601259232, + "num_tokens": 992007896.0, + "step": 5903 + }, + { + "entropy": 1.6700084805488586, + "epoch": 0.6485952047458186, + "grad_norm": 0.7073227167129517, + "learning_rate": 1.6214265745096885e-05, + "loss": 1.337, + "mean_token_accuracy": 0.6727247337500254, + "num_tokens": 992143508.0, + "step": 5904 + }, + { + "entropy": 1.7103844285011292, + "epoch": 0.6487050616571916, + "grad_norm": 1.0630773305892944, + "learning_rate": 1.6212960630941035e-05, + "loss": 1.5428, + "mean_token_accuracy": 0.6607689758141836, + "num_tokens": 992350316.0, + "step": 5905 + }, + { + "entropy": 1.6851065456867218, + "epoch": 0.6488149185685644, + "grad_norm": 0.7298637628555298, + "learning_rate": 1.6211655351797326e-05, + "loss": 1.4339, + "mean_token_accuracy": 0.6611693799495697, + "num_tokens": 992508315.0, + "step": 5906 + }, + { + "entropy": 1.7009667754173279, + "epoch": 0.6489247754799374, + "grad_norm": 0.7233312726020813, + "learning_rate": 1.6210349907707076e-05, + "loss": 1.5053, + "mean_token_accuracy": 0.6501431415478388, + "num_tokens": 992700402.0, + "step": 5907 + }, + { + "entropy": 1.6802937785784404, + "epoch": 0.6490346323913103, + "grad_norm": 0.6345446109771729, + "learning_rate": 1.62090442987116e-05, + "loss": 1.3397, + "mean_token_accuracy": 0.6666805297136307, + "num_tokens": 992846905.0, + "step": 5908 + }, + { + "entropy": 1.7321950197219849, + "epoch": 0.6491444893026832, + "grad_norm": 0.6220639944076538, + "learning_rate": 1.6207738524852217e-05, + "loss": 1.4515, + "mean_token_accuracy": 0.6630930602550507, + "num_tokens": 993004017.0, + "step": 5909 + }, + { + "entropy": 1.714138279358546, + "epoch": 0.6492543462140562, + "grad_norm": 0.6821590065956116, + "learning_rate": 1.620643258617026e-05, + "loss": 1.4696, + "mean_token_accuracy": 0.6569486111402512, + "num_tokens": 993141938.0, + "step": 5910 + }, + { + "entropy": 1.7272202372550964, + "epoch": 0.6493642031254291, + "grad_norm": 0.798577606678009, + "learning_rate": 1.6205126482707058e-05, + "loss": 1.3874, + "mean_token_accuracy": 0.6723422755797704, + "num_tokens": 993281834.0, + "step": 5911 + }, + { + "entropy": 1.7264328002929688, + "epoch": 0.6494740600368021, + "grad_norm": 0.7377980351448059, + "learning_rate": 1.6203820214503942e-05, + "loss": 1.4137, + "mean_token_accuracy": 0.6667486826578776, + "num_tokens": 993403525.0, + "step": 5912 + }, + { + "entropy": 1.6910780568917592, + "epoch": 0.649583916948175, + "grad_norm": 4.774479389190674, + "learning_rate": 1.6202513781602266e-05, + "loss": 1.1981, + "mean_token_accuracy": 0.6723181953032812, + "num_tokens": 993567459.0, + "step": 5913 + }, + { + "entropy": 1.7071150839328766, + "epoch": 0.649693773859548, + "grad_norm": 0.7636151313781738, + "learning_rate": 1.6201207184043372e-05, + "loss": 1.4607, + "mean_token_accuracy": 0.658385788400968, + "num_tokens": 993712733.0, + "step": 5914 + }, + { + "entropy": 1.790613979101181, + "epoch": 0.6498036307709208, + "grad_norm": 0.7202587723731995, + "learning_rate": 1.6199900421868616e-05, + "loss": 1.5591, + "mean_token_accuracy": 0.6297978659470876, + "num_tokens": 993897946.0, + "step": 5915 + }, + { + "entropy": 1.6419591108957927, + "epoch": 0.6499134876822938, + "grad_norm": 0.6573414206504822, + "learning_rate": 1.6198593495119352e-05, + "loss": 1.4063, + "mean_token_accuracy": 0.6666153768698374, + "num_tokens": 994054233.0, + "step": 5916 + }, + { + "entropy": 1.7275590499242146, + "epoch": 0.6500233445936667, + "grad_norm": 0.6958465576171875, + "learning_rate": 1.6197286403836947e-05, + "loss": 1.499, + "mean_token_accuracy": 0.6515706777572632, + "num_tokens": 994235070.0, + "step": 5917 + }, + { + "entropy": 1.7174121638139088, + "epoch": 0.6501332015050397, + "grad_norm": 0.677852213382721, + "learning_rate": 1.619597914806277e-05, + "loss": 1.5725, + "mean_token_accuracy": 0.6482388724883398, + "num_tokens": 994427969.0, + "step": 5918 + }, + { + "entropy": 1.6932408213615417, + "epoch": 0.6502430584164126, + "grad_norm": 0.8987300395965576, + "learning_rate": 1.6194671727838193e-05, + "loss": 1.445, + "mean_token_accuracy": 0.6716097990671793, + "num_tokens": 994573905.0, + "step": 5919 + }, + { + "entropy": 1.694073627392451, + "epoch": 0.6503529153277856, + "grad_norm": 0.7888936400413513, + "learning_rate": 1.61933641432046e-05, + "loss": 1.382, + "mean_token_accuracy": 0.6525902102390925, + "num_tokens": 994713145.0, + "step": 5920 + }, + { + "entropy": 1.7361188928286235, + "epoch": 0.6504627722391585, + "grad_norm": 0.6191293001174927, + "learning_rate": 1.619205639420337e-05, + "loss": 1.418, + "mean_token_accuracy": 0.6470876733462015, + "num_tokens": 994913257.0, + "step": 5921 + }, + { + "entropy": 1.7108994523684184, + "epoch": 0.6505726291505314, + "grad_norm": 0.7681605815887451, + "learning_rate": 1.6190748480875893e-05, + "loss": 1.3487, + "mean_token_accuracy": 0.661737248301506, + "num_tokens": 995089384.0, + "step": 5922 + }, + { + "entropy": 1.6949030856291454, + "epoch": 0.6506824860619044, + "grad_norm": 0.6882128119468689, + "learning_rate": 1.6189440403263568e-05, + "loss": 1.3357, + "mean_token_accuracy": 0.6652708401282629, + "num_tokens": 995250744.0, + "step": 5923 + }, + { + "entropy": 1.7186113198598225, + "epoch": 0.6507923429732773, + "grad_norm": 0.6824566721916199, + "learning_rate": 1.618813216140779e-05, + "loss": 1.5937, + "mean_token_accuracy": 0.6373369594415029, + "num_tokens": 995448756.0, + "step": 5924 + }, + { + "entropy": 1.7350513140360515, + "epoch": 0.6509021998846503, + "grad_norm": 0.6943637132644653, + "learning_rate": 1.618682375534997e-05, + "loss": 1.3293, + "mean_token_accuracy": 0.6592056502898535, + "num_tokens": 995579361.0, + "step": 5925 + }, + { + "entropy": 1.688408652941386, + "epoch": 0.6510120567960231, + "grad_norm": 0.6207675337791443, + "learning_rate": 1.6185515185131516e-05, + "loss": 1.3164, + "mean_token_accuracy": 0.6692043642203013, + "num_tokens": 995739892.0, + "step": 5926 + }, + { + "entropy": 1.720192591349284, + "epoch": 0.6511219137073961, + "grad_norm": 0.7376704216003418, + "learning_rate": 1.6184206450793838e-05, + "loss": 1.37, + "mean_token_accuracy": 0.6484977056582769, + "num_tokens": 995890285.0, + "step": 5927 + }, + { + "entropy": 1.7388510406017303, + "epoch": 0.651231770618769, + "grad_norm": 0.69366455078125, + "learning_rate": 1.6182897552378366e-05, + "loss": 1.3361, + "mean_token_accuracy": 0.6686133096615473, + "num_tokens": 996020909.0, + "step": 5928 + }, + { + "entropy": 1.7152843674023945, + "epoch": 0.651341627530142, + "grad_norm": 0.7456363439559937, + "learning_rate": 1.618158848992652e-05, + "loss": 1.341, + "mean_token_accuracy": 0.6697397083044052, + "num_tokens": 996149476.0, + "step": 5929 + }, + { + "entropy": 1.7538359761238098, + "epoch": 0.6514514844415149, + "grad_norm": 0.8604733943939209, + "learning_rate": 1.6180279263479736e-05, + "loss": 1.7488, + "mean_token_accuracy": 0.6189997419714928, + "num_tokens": 996320305.0, + "step": 5930 + }, + { + "entropy": 1.7112461129824321, + "epoch": 0.6515613413528879, + "grad_norm": 0.711691677570343, + "learning_rate": 1.6178969873079445e-05, + "loss": 1.6228, + "mean_token_accuracy": 0.6372633626063665, + "num_tokens": 996484824.0, + "step": 5931 + }, + { + "entropy": 1.709233562151591, + "epoch": 0.6516711982642608, + "grad_norm": 0.6133729219436646, + "learning_rate": 1.6177660318767094e-05, + "loss": 1.4192, + "mean_token_accuracy": 0.6441813757022222, + "num_tokens": 996650733.0, + "step": 5932 + }, + { + "entropy": 1.6950431764125824, + "epoch": 0.6517810551756338, + "grad_norm": 0.796097457408905, + "learning_rate": 1.6176350600584127e-05, + "loss": 1.3982, + "mean_token_accuracy": 0.654624213774999, + "num_tokens": 996826862.0, + "step": 5933 + }, + { + "entropy": 1.797725111246109, + "epoch": 0.6518909120870067, + "grad_norm": 0.6709982752799988, + "learning_rate": 1.617504071857199e-05, + "loss": 1.4719, + "mean_token_accuracy": 0.6494367867708206, + "num_tokens": 996948195.0, + "step": 5934 + }, + { + "entropy": 1.729837417602539, + "epoch": 0.6520007689983797, + "grad_norm": 0.5902916789054871, + "learning_rate": 1.6173730672772154e-05, + "loss": 1.4126, + "mean_token_accuracy": 0.6551411052544912, + "num_tokens": 997153677.0, + "step": 5935 + }, + { + "entropy": 1.6588162382443745, + "epoch": 0.6521106259097526, + "grad_norm": 0.5839589238166809, + "learning_rate": 1.617242046322607e-05, + "loss": 1.443, + "mean_token_accuracy": 0.6415523837010065, + "num_tokens": 997350676.0, + "step": 5936 + }, + { + "entropy": 1.7090435028076172, + "epoch": 0.6522204828211254, + "grad_norm": 1.012498378753662, + "learning_rate": 1.6171110089975203e-05, + "loss": 1.3634, + "mean_token_accuracy": 0.6589536915222803, + "num_tokens": 997514467.0, + "step": 5937 + }, + { + "entropy": 1.6660225788752239, + "epoch": 0.6523303397324984, + "grad_norm": 0.7169985175132751, + "learning_rate": 1.616979955306104e-05, + "loss": 1.4925, + "mean_token_accuracy": 0.6534644216299057, + "num_tokens": 997686163.0, + "step": 5938 + }, + { + "entropy": 1.7207870185375214, + "epoch": 0.6524401966438713, + "grad_norm": 0.8183472752571106, + "learning_rate": 1.6168488852525048e-05, + "loss": 1.41, + "mean_token_accuracy": 0.658767968416214, + "num_tokens": 997864009.0, + "step": 5939 + }, + { + "entropy": 1.682630827029546, + "epoch": 0.6525500535552443, + "grad_norm": 0.6713327169418335, + "learning_rate": 1.6167177988408713e-05, + "loss": 1.429, + "mean_token_accuracy": 0.6574962337811788, + "num_tokens": 998095171.0, + "step": 5940 + }, + { + "entropy": 1.6624448994795482, + "epoch": 0.6526599104666172, + "grad_norm": 0.6571035385131836, + "learning_rate": 1.6165866960753525e-05, + "loss": 1.5789, + "mean_token_accuracy": 0.6504167219003042, + "num_tokens": 998273104.0, + "step": 5941 + }, + { + "entropy": 1.70240314801534, + "epoch": 0.6527697673779902, + "grad_norm": 0.8170379400253296, + "learning_rate": 1.6164555769600974e-05, + "loss": 1.4324, + "mean_token_accuracy": 0.649359330534935, + "num_tokens": 998471102.0, + "step": 5942 + }, + { + "entropy": 1.7425579031308491, + "epoch": 0.6528796242893631, + "grad_norm": 0.6321161985397339, + "learning_rate": 1.616324441499256e-05, + "loss": 1.3006, + "mean_token_accuracy": 0.6682200183471044, + "num_tokens": 998615680.0, + "step": 5943 + }, + { + "entropy": 1.7166448334852855, + "epoch": 0.6529894812007361, + "grad_norm": 0.6595907211303711, + "learning_rate": 1.6161932896969784e-05, + "loss": 1.3832, + "mean_token_accuracy": 0.6572774350643158, + "num_tokens": 998779804.0, + "step": 5944 + }, + { + "entropy": 1.6734323004881542, + "epoch": 0.653099338112109, + "grad_norm": 0.7008593678474426, + "learning_rate": 1.616062121557416e-05, + "loss": 1.3083, + "mean_token_accuracy": 0.6746059507131577, + "num_tokens": 998937935.0, + "step": 5945 + }, + { + "entropy": 1.7303222517172496, + "epoch": 0.653209195023482, + "grad_norm": 0.7347795367240906, + "learning_rate": 1.6159309370847204e-05, + "loss": 1.4708, + "mean_token_accuracy": 0.6639884759982427, + "num_tokens": 999103145.0, + "step": 5946 + }, + { + "entropy": 1.6455318927764893, + "epoch": 0.6533190519348548, + "grad_norm": 0.6177557706832886, + "learning_rate": 1.6157997362830427e-05, + "loss": 1.364, + "mean_token_accuracy": 0.6617141962051392, + "num_tokens": 999262435.0, + "step": 5947 + }, + { + "entropy": 1.690677026907603, + "epoch": 0.6534289088462278, + "grad_norm": 0.7298170924186707, + "learning_rate": 1.6156685191565357e-05, + "loss": 1.3552, + "mean_token_accuracy": 0.6585644831260046, + "num_tokens": 999440995.0, + "step": 5948 + }, + { + "entropy": 1.7055202027161915, + "epoch": 0.6535387657576007, + "grad_norm": 0.7108672261238098, + "learning_rate": 1.6155372857093528e-05, + "loss": 1.3826, + "mean_token_accuracy": 0.6667246073484421, + "num_tokens": 999613356.0, + "step": 5949 + }, + { + "entropy": 1.6879318157831829, + "epoch": 0.6536486226689736, + "grad_norm": 0.7076107859611511, + "learning_rate": 1.615406035945647e-05, + "loss": 1.54, + "mean_token_accuracy": 0.6409845153490702, + "num_tokens": 999816032.0, + "step": 5950 + }, + { + "entropy": 1.7342401643594105, + "epoch": 0.6537584795803466, + "grad_norm": 0.9219125509262085, + "learning_rate": 1.615274769869572e-05, + "loss": 1.4078, + "mean_token_accuracy": 0.6552455176909765, + "num_tokens": 999966674.0, + "step": 5951 + }, + { + "entropy": 1.7484307487805684, + "epoch": 0.6538683364917195, + "grad_norm": 0.64826899766922, + "learning_rate": 1.615143487485283e-05, + "loss": 1.4302, + "mean_token_accuracy": 0.6445176502068838, + "num_tokens": 1000183638.0, + "step": 5952 + }, + { + "entropy": 1.735739419857661, + "epoch": 0.6539781934030925, + "grad_norm": 0.823397159576416, + "learning_rate": 1.615012188796935e-05, + "loss": 1.283, + "mean_token_accuracy": 0.6683905571699142, + "num_tokens": 1000379339.0, + "step": 5953 + }, + { + "entropy": 1.7015184263388317, + "epoch": 0.6540880503144654, + "grad_norm": 0.6577404737472534, + "learning_rate": 1.614880873808683e-05, + "loss": 1.3707, + "mean_token_accuracy": 0.6538449923197428, + "num_tokens": 1000517761.0, + "step": 5954 + }, + { + "entropy": 1.6893195311228435, + "epoch": 0.6541979072258384, + "grad_norm": 0.7297143340110779, + "learning_rate": 1.6147495425246834e-05, + "loss": 1.3336, + "mean_token_accuracy": 0.6523545185724894, + "num_tokens": 1000677812.0, + "step": 5955 + }, + { + "entropy": 1.6598160068194072, + "epoch": 0.6543077641372113, + "grad_norm": 0.6622530221939087, + "learning_rate": 1.6146181949490926e-05, + "loss": 1.3761, + "mean_token_accuracy": 0.6585600723822912, + "num_tokens": 1000849086.0, + "step": 5956 + }, + { + "entropy": 1.700823446114858, + "epoch": 0.6544176210485843, + "grad_norm": 0.866723895072937, + "learning_rate": 1.6144868310860683e-05, + "loss": 1.3303, + "mean_token_accuracy": 0.6645294477542242, + "num_tokens": 1000997207.0, + "step": 5957 + }, + { + "entropy": 1.7490895291169484, + "epoch": 0.6545274779599571, + "grad_norm": 0.6628153324127197, + "learning_rate": 1.6143554509397673e-05, + "loss": 1.3853, + "mean_token_accuracy": 0.6574032058318456, + "num_tokens": 1001126891.0, + "step": 5958 + }, + { + "entropy": 1.695980042219162, + "epoch": 0.6546373348713301, + "grad_norm": 0.6430051326751709, + "learning_rate": 1.6142240545143478e-05, + "loss": 1.5197, + "mean_token_accuracy": 0.6624879688024521, + "num_tokens": 1001291749.0, + "step": 5959 + }, + { + "entropy": 1.7520456314086914, + "epoch": 0.654747191782703, + "grad_norm": 0.7919518351554871, + "learning_rate": 1.614092641813969e-05, + "loss": 1.4127, + "mean_token_accuracy": 0.6570608119169871, + "num_tokens": 1001420291.0, + "step": 5960 + }, + { + "entropy": 1.7864876786867778, + "epoch": 0.654857048694076, + "grad_norm": 0.7036291360855103, + "learning_rate": 1.61396121284279e-05, + "loss": 1.4129, + "mean_token_accuracy": 0.6543021847804388, + "num_tokens": 1001557544.0, + "step": 5961 + }, + { + "entropy": 1.7022630870342255, + "epoch": 0.6549669056054489, + "grad_norm": 0.6970117092132568, + "learning_rate": 1.6138297676049697e-05, + "loss": 1.3648, + "mean_token_accuracy": 0.6602020363012949, + "num_tokens": 1001702592.0, + "step": 5962 + }, + { + "entropy": 1.676472932100296, + "epoch": 0.6550767625168218, + "grad_norm": 0.7215110063552856, + "learning_rate": 1.613698306104669e-05, + "loss": 1.3701, + "mean_token_accuracy": 0.6555512299140295, + "num_tokens": 1001865740.0, + "step": 5963 + }, + { + "entropy": 1.686583936214447, + "epoch": 0.6551866194281948, + "grad_norm": 0.6436832547187805, + "learning_rate": 1.6135668283460485e-05, + "loss": 1.3576, + "mean_token_accuracy": 0.6521365145842234, + "num_tokens": 1002008262.0, + "step": 5964 + }, + { + "entropy": 1.7136501669883728, + "epoch": 0.6552964763395677, + "grad_norm": 0.8774862289428711, + "learning_rate": 1.613435334333269e-05, + "loss": 1.3359, + "mean_token_accuracy": 0.654481420914332, + "num_tokens": 1002180351.0, + "step": 5965 + }, + { + "entropy": 1.6534929970900218, + "epoch": 0.6554063332509407, + "grad_norm": 0.7280681133270264, + "learning_rate": 1.6133038240704927e-05, + "loss": 1.2625, + "mean_token_accuracy": 0.6773078391949335, + "num_tokens": 1002293818.0, + "step": 5966 + }, + { + "entropy": 1.649037887652715, + "epoch": 0.6555161901623136, + "grad_norm": 0.696632981300354, + "learning_rate": 1.6131722975618817e-05, + "loss": 1.2936, + "mean_token_accuracy": 0.6795135736465454, + "num_tokens": 1002424977.0, + "step": 5967 + }, + { + "entropy": 1.7681506077448528, + "epoch": 0.6556260470736865, + "grad_norm": 0.7785711288452148, + "learning_rate": 1.6130407548115986e-05, + "loss": 1.2863, + "mean_token_accuracy": 0.6662353525559107, + "num_tokens": 1002526808.0, + "step": 5968 + }, + { + "entropy": 1.6666353146235149, + "epoch": 0.6557359039850594, + "grad_norm": 0.7321269512176514, + "learning_rate": 1.612909195823807e-05, + "loss": 1.3184, + "mean_token_accuracy": 0.6688967347145081, + "num_tokens": 1002680478.0, + "step": 5969 + }, + { + "entropy": 1.6807195643583934, + "epoch": 0.6558457608964324, + "grad_norm": 1.010912299156189, + "learning_rate": 1.6127776206026706e-05, + "loss": 1.3433, + "mean_token_accuracy": 0.6613064755996069, + "num_tokens": 1002807257.0, + "step": 5970 + }, + { + "entropy": 1.7417626976966858, + "epoch": 0.6559556178078053, + "grad_norm": 0.7049437165260315, + "learning_rate": 1.612646029152353e-05, + "loss": 1.4176, + "mean_token_accuracy": 0.6610104193290075, + "num_tokens": 1002995627.0, + "step": 5971 + }, + { + "entropy": 1.7317078411579132, + "epoch": 0.6560654747191783, + "grad_norm": 0.7525424957275391, + "learning_rate": 1.61251442147702e-05, + "loss": 1.3605, + "mean_token_accuracy": 0.6603756298621496, + "num_tokens": 1003115921.0, + "step": 5972 + }, + { + "entropy": 1.7446248630682628, + "epoch": 0.6561753316305512, + "grad_norm": 0.7605143189430237, + "learning_rate": 1.6123827975808366e-05, + "loss": 1.4968, + "mean_token_accuracy": 0.646631787220637, + "num_tokens": 1003238280.0, + "step": 5973 + }, + { + "entropy": 1.6948024133841197, + "epoch": 0.6562851885419242, + "grad_norm": 0.635215699672699, + "learning_rate": 1.612251157467968e-05, + "loss": 1.4391, + "mean_token_accuracy": 0.6588515788316727, + "num_tokens": 1003439468.0, + "step": 5974 + }, + { + "entropy": 1.695201168457667, + "epoch": 0.6563950454532971, + "grad_norm": 0.7155786752700806, + "learning_rate": 1.6121195011425818e-05, + "loss": 1.2564, + "mean_token_accuracy": 0.6733829925457636, + "num_tokens": 1003559915.0, + "step": 5975 + }, + { + "entropy": 1.7206557989120483, + "epoch": 0.65650490236467, + "grad_norm": 0.6989328861236572, + "learning_rate": 1.611987828608844e-05, + "loss": 1.4135, + "mean_token_accuracy": 0.6583918134371439, + "num_tokens": 1003742377.0, + "step": 5976 + }, + { + "entropy": 1.6666361689567566, + "epoch": 0.656614759276043, + "grad_norm": 0.605663001537323, + "learning_rate": 1.6118561398709218e-05, + "loss": 1.498, + "mean_token_accuracy": 0.6402202894290289, + "num_tokens": 1003918193.0, + "step": 5977 + }, + { + "entropy": 1.659655769666036, + "epoch": 0.6567246161874158, + "grad_norm": 2.3435916900634766, + "learning_rate": 1.6117244349329837e-05, + "loss": 1.258, + "mean_token_accuracy": 0.6593276808659235, + "num_tokens": 1004142681.0, + "step": 5978 + }, + { + "entropy": 1.6928699215253193, + "epoch": 0.6568344730987888, + "grad_norm": 0.7714124917984009, + "learning_rate": 1.6115927137991977e-05, + "loss": 1.3056, + "mean_token_accuracy": 0.6578250130017599, + "num_tokens": 1004262570.0, + "step": 5979 + }, + { + "entropy": 1.6898990372816722, + "epoch": 0.6569443300101617, + "grad_norm": 0.607197642326355, + "learning_rate": 1.6114609764737324e-05, + "loss": 1.4254, + "mean_token_accuracy": 0.6577767829100291, + "num_tokens": 1004466029.0, + "step": 5980 + }, + { + "entropy": 1.7032975753148396, + "epoch": 0.6570541869215347, + "grad_norm": 0.7361001372337341, + "learning_rate": 1.611329222960758e-05, + "loss": 1.3741, + "mean_token_accuracy": 0.6660144229729971, + "num_tokens": 1004631550.0, + "step": 5981 + }, + { + "entropy": 1.775945911804835, + "epoch": 0.6571640438329076, + "grad_norm": 0.7697334885597229, + "learning_rate": 1.6111974532644444e-05, + "loss": 1.5169, + "mean_token_accuracy": 0.6399200161298116, + "num_tokens": 1004854848.0, + "step": 5982 + }, + { + "entropy": 1.674217273791631, + "epoch": 0.6572739007442806, + "grad_norm": 0.7080762386322021, + "learning_rate": 1.6110656673889615e-05, + "loss": 1.4399, + "mean_token_accuracy": 0.6500117778778076, + "num_tokens": 1005128683.0, + "step": 5983 + }, + { + "entropy": 1.7122255663077037, + "epoch": 0.6573837576556535, + "grad_norm": 0.6646968722343445, + "learning_rate": 1.6109338653384806e-05, + "loss": 1.4163, + "mean_token_accuracy": 0.6497417688369751, + "num_tokens": 1005307086.0, + "step": 5984 + }, + { + "entropy": 1.8177510897318523, + "epoch": 0.6574936145670265, + "grad_norm": 0.7551073431968689, + "learning_rate": 1.6108020471171733e-05, + "loss": 1.3673, + "mean_token_accuracy": 0.6538062343994776, + "num_tokens": 1005454014.0, + "step": 5985 + }, + { + "entropy": 1.69020011027654, + "epoch": 0.6576034714783994, + "grad_norm": 0.7540128827095032, + "learning_rate": 1.610670212729211e-05, + "loss": 1.3295, + "mean_token_accuracy": 0.6638794293006262, + "num_tokens": 1005580386.0, + "step": 5986 + }, + { + "entropy": 1.6580841739972432, + "epoch": 0.6577133283897724, + "grad_norm": 0.6165665984153748, + "learning_rate": 1.610538362178767e-05, + "loss": 1.3407, + "mean_token_accuracy": 0.6651915510495504, + "num_tokens": 1005745720.0, + "step": 5987 + }, + { + "entropy": 1.7150371372699738, + "epoch": 0.6578231853011453, + "grad_norm": 0.7540215849876404, + "learning_rate": 1.6104064954700137e-05, + "loss": 1.4733, + "mean_token_accuracy": 0.6606999586025873, + "num_tokens": 1005910475.0, + "step": 5988 + }, + { + "entropy": 1.704669823249181, + "epoch": 0.6579330422125182, + "grad_norm": 0.7729107141494751, + "learning_rate": 1.6102746126071245e-05, + "loss": 1.5419, + "mean_token_accuracy": 0.6427391221125921, + "num_tokens": 1006067481.0, + "step": 5989 + }, + { + "entropy": 1.676388919353485, + "epoch": 0.6580428991238911, + "grad_norm": 0.7432935833930969, + "learning_rate": 1.610142713594274e-05, + "loss": 1.263, + "mean_token_accuracy": 0.6768547048171362, + "num_tokens": 1006217348.0, + "step": 5990 + }, + { + "entropy": 1.726919690767924, + "epoch": 0.658152756035264, + "grad_norm": 0.73004549741745, + "learning_rate": 1.6100107984356362e-05, + "loss": 1.4621, + "mean_token_accuracy": 0.6482563465833664, + "num_tokens": 1006448796.0, + "step": 5991 + }, + { + "entropy": 1.6418760021527607, + "epoch": 0.658262612946637, + "grad_norm": 0.6615251898765564, + "learning_rate": 1.6098788671353872e-05, + "loss": 1.3493, + "mean_token_accuracy": 0.6650320092837015, + "num_tokens": 1006618329.0, + "step": 5992 + }, + { + "entropy": 1.619303782780965, + "epoch": 0.6583724698580099, + "grad_norm": 0.6965547800064087, + "learning_rate": 1.6097469196977012e-05, + "loss": 1.2817, + "mean_token_accuracy": 0.6767748643954595, + "num_tokens": 1006751014.0, + "step": 5993 + }, + { + "entropy": 1.6654784083366394, + "epoch": 0.6584823267693829, + "grad_norm": 0.7190507054328918, + "learning_rate": 1.609614956126755e-05, + "loss": 1.3126, + "mean_token_accuracy": 0.6592614303032557, + "num_tokens": 1006902725.0, + "step": 5994 + }, + { + "entropy": 1.689567784468333, + "epoch": 0.6585921836807558, + "grad_norm": 1.1087243556976318, + "learning_rate": 1.6094829764267254e-05, + "loss": 1.3326, + "mean_token_accuracy": 0.6678081601858139, + "num_tokens": 1007055528.0, + "step": 5995 + }, + { + "entropy": 1.7132113973299663, + "epoch": 0.6587020405921288, + "grad_norm": 0.6450228095054626, + "learning_rate": 1.6093509806017883e-05, + "loss": 1.3825, + "mean_token_accuracy": 0.645787293712298, + "num_tokens": 1007254824.0, + "step": 5996 + }, + { + "entropy": 1.6614897946516674, + "epoch": 0.6588118975035017, + "grad_norm": 0.6441484093666077, + "learning_rate": 1.609218968656123e-05, + "loss": 1.4582, + "mean_token_accuracy": 0.6584440817435583, + "num_tokens": 1007429624.0, + "step": 5997 + }, + { + "entropy": 1.6524133781592052, + "epoch": 0.6589217544148747, + "grad_norm": 0.6148684024810791, + "learning_rate": 1.6090869405939067e-05, + "loss": 1.3715, + "mean_token_accuracy": 0.6544036467870077, + "num_tokens": 1007626408.0, + "step": 5998 + }, + { + "entropy": 1.6548963785171509, + "epoch": 0.6590316113262475, + "grad_norm": 0.6921628713607788, + "learning_rate": 1.608954896419318e-05, + "loss": 1.3684, + "mean_token_accuracy": 0.6596562564373016, + "num_tokens": 1007779392.0, + "step": 5999 + }, + { + "entropy": 1.6788422763347626, + "epoch": 0.6591414682376205, + "grad_norm": 0.7359298467636108, + "learning_rate": 1.608822836136536e-05, + "loss": 1.3503, + "mean_token_accuracy": 0.6542917539676031, + "num_tokens": 1007942114.0, + "step": 6000 + }, + { + "entropy": 1.6935390035311382, + "epoch": 0.6592513251489934, + "grad_norm": 0.5704781413078308, + "learning_rate": 1.6086907597497406e-05, + "loss": 1.4471, + "mean_token_accuracy": 0.648592452208201, + "num_tokens": 1008134066.0, + "step": 6001 + }, + { + "entropy": 1.6664865513642628, + "epoch": 0.6593611820603664, + "grad_norm": 0.6393500566482544, + "learning_rate": 1.608558667263112e-05, + "loss": 1.4475, + "mean_token_accuracy": 0.6586243808269501, + "num_tokens": 1008313195.0, + "step": 6002 + }, + { + "entropy": 1.7087785402933757, + "epoch": 0.6594710389717393, + "grad_norm": 0.7804690599441528, + "learning_rate": 1.6084265586808304e-05, + "loss": 1.5634, + "mean_token_accuracy": 0.6387995928525925, + "num_tokens": 1008491339.0, + "step": 6003 + }, + { + "entropy": 1.6799262662728627, + "epoch": 0.6595808958831122, + "grad_norm": 1.2888646125793457, + "learning_rate": 1.6082944340070777e-05, + "loss": 1.2409, + "mean_token_accuracy": 0.6644798517227173, + "num_tokens": 1008699500.0, + "step": 6004 + }, + { + "entropy": 1.6629238029321034, + "epoch": 0.6596907527944852, + "grad_norm": 1.3199480772018433, + "learning_rate": 1.6081622932460352e-05, + "loss": 1.0079, + "mean_token_accuracy": 0.6921218782663345, + "num_tokens": 1008848456.0, + "step": 6005 + }, + { + "entropy": 1.6802177329858143, + "epoch": 0.6598006097058581, + "grad_norm": 0.7184219360351562, + "learning_rate": 1.6080301364018852e-05, + "loss": 1.3617, + "mean_token_accuracy": 0.6671679069598516, + "num_tokens": 1008991473.0, + "step": 6006 + }, + { + "entropy": 1.7172418534755707, + "epoch": 0.6599104666172311, + "grad_norm": 0.7799577116966248, + "learning_rate": 1.6078979634788102e-05, + "loss": 1.4283, + "mean_token_accuracy": 0.645437479019165, + "num_tokens": 1009116554.0, + "step": 6007 + }, + { + "entropy": 1.7622934381167095, + "epoch": 0.660020323528604, + "grad_norm": 0.7395654916763306, + "learning_rate": 1.607765774480993e-05, + "loss": 1.5234, + "mean_token_accuracy": 0.6265956362088522, + "num_tokens": 1009265660.0, + "step": 6008 + }, + { + "entropy": 1.7269906798998516, + "epoch": 0.660130180439977, + "grad_norm": 0.7985388040542603, + "learning_rate": 1.6076335694126187e-05, + "loss": 1.4323, + "mean_token_accuracy": 0.6677784125010172, + "num_tokens": 1009437681.0, + "step": 6009 + }, + { + "entropy": 1.7253870368003845, + "epoch": 0.6602400373513498, + "grad_norm": 0.6861431002616882, + "learning_rate": 1.60750134827787e-05, + "loss": 1.3246, + "mean_token_accuracy": 0.666627456744512, + "num_tokens": 1009607245.0, + "step": 6010 + }, + { + "entropy": 1.714435617129008, + "epoch": 0.6603498942627228, + "grad_norm": 0.8335552215576172, + "learning_rate": 1.6073691110809325e-05, + "loss": 1.4504, + "mean_token_accuracy": 0.6543582628170649, + "num_tokens": 1009790328.0, + "step": 6011 + }, + { + "entropy": 1.6384719610214233, + "epoch": 0.6604597511740957, + "grad_norm": 0.6027631759643555, + "learning_rate": 1.6072368578259914e-05, + "loss": 1.379, + "mean_token_accuracy": 0.6624562293291092, + "num_tokens": 1009931118.0, + "step": 6012 + }, + { + "entropy": 1.652736137310664, + "epoch": 0.6605696080854687, + "grad_norm": 0.6054850220680237, + "learning_rate": 1.6071045885172322e-05, + "loss": 1.2664, + "mean_token_accuracy": 0.681120495001475, + "num_tokens": 1010055574.0, + "step": 6013 + }, + { + "entropy": 1.6163685818513234, + "epoch": 0.6606794649968416, + "grad_norm": 0.5509641766548157, + "learning_rate": 1.6069723031588412e-05, + "loss": 1.2797, + "mean_token_accuracy": 0.6819182386000952, + "num_tokens": 1010220664.0, + "step": 6014 + }, + { + "entropy": 1.6701125005880992, + "epoch": 0.6607893219082146, + "grad_norm": 0.7778674364089966, + "learning_rate": 1.6068400017550055e-05, + "loss": 1.4248, + "mean_token_accuracy": 0.6782306134700775, + "num_tokens": 1010398174.0, + "step": 6015 + }, + { + "entropy": 1.7769503196080525, + "epoch": 0.6608991788195875, + "grad_norm": 0.7546883225440979, + "learning_rate": 1.6067076843099125e-05, + "loss": 1.4241, + "mean_token_accuracy": 0.642538994550705, + "num_tokens": 1010594125.0, + "step": 6016 + }, + { + "entropy": 1.6197692056496937, + "epoch": 0.6610090357309604, + "grad_norm": 0.7641433477401733, + "learning_rate": 1.6065753508277488e-05, + "loss": 1.2113, + "mean_token_accuracy": 0.6883720109860102, + "num_tokens": 1010715132.0, + "step": 6017 + }, + { + "entropy": 1.6694122155507405, + "epoch": 0.6611188926423334, + "grad_norm": 0.6795452237129211, + "learning_rate": 1.6064430013127036e-05, + "loss": 1.476, + "mean_token_accuracy": 0.6509568393230438, + "num_tokens": 1010891272.0, + "step": 6018 + }, + { + "entropy": 1.6499930421511333, + "epoch": 0.6612287495537063, + "grad_norm": 0.6633215546607971, + "learning_rate": 1.6063106357689662e-05, + "loss": 1.5788, + "mean_token_accuracy": 0.6357754915952682, + "num_tokens": 1011164958.0, + "step": 6019 + }, + { + "entropy": 1.7087593972682953, + "epoch": 0.6613386064650792, + "grad_norm": 0.7861476540565491, + "learning_rate": 1.606178254200725e-05, + "loss": 1.5698, + "mean_token_accuracy": 0.6314490288496017, + "num_tokens": 1011325949.0, + "step": 6020 + }, + { + "entropy": 1.6702162524064381, + "epoch": 0.6614484633764521, + "grad_norm": 0.6822460293769836, + "learning_rate": 1.60604585661217e-05, + "loss": 1.3802, + "mean_token_accuracy": 0.6540361742178599, + "num_tokens": 1011516122.0, + "step": 6021 + }, + { + "entropy": 1.7486283381779988, + "epoch": 0.6615583202878251, + "grad_norm": 0.7449422478675842, + "learning_rate": 1.6059134430074917e-05, + "loss": 1.3651, + "mean_token_accuracy": 0.6583732018868128, + "num_tokens": 1011658572.0, + "step": 6022 + }, + { + "entropy": 1.7109368344148, + "epoch": 0.661668177199198, + "grad_norm": 0.6396523118019104, + "learning_rate": 1.6057810133908812e-05, + "loss": 1.5154, + "mean_token_accuracy": 0.6504169950882593, + "num_tokens": 1011820872.0, + "step": 6023 + }, + { + "entropy": 1.6798317929108937, + "epoch": 0.661778034110571, + "grad_norm": 0.7457844614982605, + "learning_rate": 1.605648567766529e-05, + "loss": 1.3378, + "mean_token_accuracy": 0.6725800782442093, + "num_tokens": 1011939184.0, + "step": 6024 + }, + { + "entropy": 1.7137371897697449, + "epoch": 0.6618878910219439, + "grad_norm": 0.7281384468078613, + "learning_rate": 1.6055161061386282e-05, + "loss": 1.3466, + "mean_token_accuracy": 0.6607188185056051, + "num_tokens": 1012063344.0, + "step": 6025 + }, + { + "entropy": 1.7295754949251811, + "epoch": 0.6619977479333169, + "grad_norm": 0.9593626856803894, + "learning_rate": 1.6053836285113703e-05, + "loss": 1.5888, + "mean_token_accuracy": 0.639353816707929, + "num_tokens": 1012202910.0, + "step": 6026 + }, + { + "entropy": 1.74645792444547, + "epoch": 0.6621076048446898, + "grad_norm": 0.6115472912788391, + "learning_rate": 1.6052511348889475e-05, + "loss": 1.272, + "mean_token_accuracy": 0.6679097364346186, + "num_tokens": 1012353481.0, + "step": 6027 + }, + { + "entropy": 1.6415168742338817, + "epoch": 0.6622174617560628, + "grad_norm": 0.6827746629714966, + "learning_rate": 1.6051186252755548e-05, + "loss": 1.3812, + "mean_token_accuracy": 0.663339634736379, + "num_tokens": 1012531799.0, + "step": 6028 + }, + { + "entropy": 1.7094530860582988, + "epoch": 0.6623273186674357, + "grad_norm": 0.6740764379501343, + "learning_rate": 1.604986099675385e-05, + "loss": 1.2458, + "mean_token_accuracy": 0.6794106811285019, + "num_tokens": 1012644516.0, + "step": 6029 + }, + { + "entropy": 1.6730037033557892, + "epoch": 0.6624371755788087, + "grad_norm": 0.6922171115875244, + "learning_rate": 1.604853558092632e-05, + "loss": 1.4073, + "mean_token_accuracy": 0.6618214547634125, + "num_tokens": 1012839703.0, + "step": 6030 + }, + { + "entropy": 1.6856864591439564, + "epoch": 0.6625470324901815, + "grad_norm": 0.6982408761978149, + "learning_rate": 1.6047210005314927e-05, + "loss": 1.3656, + "mean_token_accuracy": 0.6610573281844457, + "num_tokens": 1012997627.0, + "step": 6031 + }, + { + "entropy": 1.7156126201152802, + "epoch": 0.6626568894015544, + "grad_norm": 0.8553928136825562, + "learning_rate": 1.6045884269961602e-05, + "loss": 1.4574, + "mean_token_accuracy": 0.6595203479131063, + "num_tokens": 1013141489.0, + "step": 6032 + }, + { + "entropy": 1.7379735112190247, + "epoch": 0.6627667463129274, + "grad_norm": 0.6452311873435974, + "learning_rate": 1.6044558374908313e-05, + "loss": 1.3783, + "mean_token_accuracy": 0.6580146799484888, + "num_tokens": 1013350551.0, + "step": 6033 + }, + { + "entropy": 1.6629939476648967, + "epoch": 0.6628766032243003, + "grad_norm": 0.7060854434967041, + "learning_rate": 1.604323232019703e-05, + "loss": 1.3811, + "mean_token_accuracy": 0.6662647575139999, + "num_tokens": 1013522519.0, + "step": 6034 + }, + { + "entropy": 1.6722846726576488, + "epoch": 0.6629864601356733, + "grad_norm": 0.581295371055603, + "learning_rate": 1.6041906105869716e-05, + "loss": 1.3575, + "mean_token_accuracy": 0.6602567632993063, + "num_tokens": 1013698530.0, + "step": 6035 + }, + { + "entropy": 1.8078128496805828, + "epoch": 0.6630963170470462, + "grad_norm": 0.7791680693626404, + "learning_rate": 1.6040579731968342e-05, + "loss": 1.3535, + "mean_token_accuracy": 0.6552262306213379, + "num_tokens": 1013835125.0, + "step": 6036 + }, + { + "entropy": 1.6595743894577026, + "epoch": 0.6632061739584192, + "grad_norm": 0.758542001247406, + "learning_rate": 1.6039253198534893e-05, + "loss": 1.4194, + "mean_token_accuracy": 0.6629291425148646, + "num_tokens": 1013973848.0, + "step": 6037 + }, + { + "entropy": 1.6578982969125111, + "epoch": 0.6633160308697921, + "grad_norm": 0.7612101435661316, + "learning_rate": 1.6037926505611353e-05, + "loss": 1.3237, + "mean_token_accuracy": 0.6753773540258408, + "num_tokens": 1014105985.0, + "step": 6038 + }, + { + "entropy": 1.7778889040152233, + "epoch": 0.6634258877811651, + "grad_norm": 0.6914167404174805, + "learning_rate": 1.6036599653239705e-05, + "loss": 1.4301, + "mean_token_accuracy": 0.6623369753360748, + "num_tokens": 1014286159.0, + "step": 6039 + }, + { + "entropy": 1.730593462785085, + "epoch": 0.663535744692538, + "grad_norm": 0.6559991240501404, + "learning_rate": 1.6035272641461953e-05, + "loss": 1.5694, + "mean_token_accuracy": 0.6481152127186457, + "num_tokens": 1014489852.0, + "step": 6040 + }, + { + "entropy": 1.7553011178970337, + "epoch": 0.663645601603911, + "grad_norm": 0.6367934942245483, + "learning_rate": 1.6033945470320088e-05, + "loss": 1.3236, + "mean_token_accuracy": 0.6699994951486588, + "num_tokens": 1014642102.0, + "step": 6041 + }, + { + "entropy": 1.692327857017517, + "epoch": 0.6637554585152838, + "grad_norm": 0.6805416941642761, + "learning_rate": 1.6032618139856116e-05, + "loss": 1.3413, + "mean_token_accuracy": 0.6565508594115576, + "num_tokens": 1014788802.0, + "step": 6042 + }, + { + "entropy": 1.7228071590264638, + "epoch": 0.6638653154266568, + "grad_norm": 0.5855072140693665, + "learning_rate": 1.6031290650112047e-05, + "loss": 1.2014, + "mean_token_accuracy": 0.6715217183033625, + "num_tokens": 1014978143.0, + "step": 6043 + }, + { + "entropy": 1.7340465486049652, + "epoch": 0.6639751723380297, + "grad_norm": 0.7417824864387512, + "learning_rate": 1.6029963001129897e-05, + "loss": 1.4479, + "mean_token_accuracy": 0.6491431444883347, + "num_tokens": 1015185187.0, + "step": 6044 + }, + { + "entropy": 1.722789963086446, + "epoch": 0.6640850292494026, + "grad_norm": 0.6903808116912842, + "learning_rate": 1.6028635192951686e-05, + "loss": 1.4496, + "mean_token_accuracy": 0.6459407409032186, + "num_tokens": 1015347694.0, + "step": 6045 + }, + { + "entropy": 1.6786029835542042, + "epoch": 0.6641948861607756, + "grad_norm": 0.7279839515686035, + "learning_rate": 1.6027307225619434e-05, + "loss": 1.2846, + "mean_token_accuracy": 0.6784352113803228, + "num_tokens": 1015469969.0, + "step": 6046 + }, + { + "entropy": 1.719456136226654, + "epoch": 0.6643047430721485, + "grad_norm": 0.7053220868110657, + "learning_rate": 1.6025979099175176e-05, + "loss": 1.3669, + "mean_token_accuracy": 0.6673380633195242, + "num_tokens": 1015623741.0, + "step": 6047 + }, + { + "entropy": 1.7085906167825062, + "epoch": 0.6644145999835215, + "grad_norm": 0.6075806021690369, + "learning_rate": 1.6024650813660946e-05, + "loss": 1.4022, + "mean_token_accuracy": 0.6560228218634924, + "num_tokens": 1015805903.0, + "step": 6048 + }, + { + "entropy": 1.718571404616038, + "epoch": 0.6645244568948944, + "grad_norm": 0.7968881130218506, + "learning_rate": 1.6023322369118777e-05, + "loss": 1.5538, + "mean_token_accuracy": 0.6444092392921448, + "num_tokens": 1015977088.0, + "step": 6049 + }, + { + "entropy": 1.6368082066377003, + "epoch": 0.6646343138062674, + "grad_norm": 0.6820107102394104, + "learning_rate": 1.6021993765590724e-05, + "loss": 1.3924, + "mean_token_accuracy": 0.6599597285191218, + "num_tokens": 1016132000.0, + "step": 6050 + }, + { + "entropy": 1.7395052810509999, + "epoch": 0.6647441707176402, + "grad_norm": 1.213407039642334, + "learning_rate": 1.6020665003118828e-05, + "loss": 1.4726, + "mean_token_accuracy": 0.625076542297999, + "num_tokens": 1016336850.0, + "step": 6051 + }, + { + "entropy": 1.6540366212526958, + "epoch": 0.6648540276290132, + "grad_norm": 0.6258625388145447, + "learning_rate": 1.6019336081745143e-05, + "loss": 1.4585, + "mean_token_accuracy": 0.6550316015879313, + "num_tokens": 1016548471.0, + "step": 6052 + }, + { + "entropy": 1.7461306750774384, + "epoch": 0.6649638845403861, + "grad_norm": 0.7281144857406616, + "learning_rate": 1.601800700151174e-05, + "loss": 1.4278, + "mean_token_accuracy": 0.6685409446557363, + "num_tokens": 1016688609.0, + "step": 6053 + }, + { + "entropy": 1.6394382019837697, + "epoch": 0.6650737414517591, + "grad_norm": 0.7716277241706848, + "learning_rate": 1.6016677762460677e-05, + "loss": 1.2538, + "mean_token_accuracy": 0.6766884575287501, + "num_tokens": 1016848324.0, + "step": 6054 + }, + { + "entropy": 1.6753660937150319, + "epoch": 0.665183598363132, + "grad_norm": 0.721605122089386, + "learning_rate": 1.601534836463402e-05, + "loss": 1.4277, + "mean_token_accuracy": 0.65249036749204, + "num_tokens": 1017024377.0, + "step": 6055 + }, + { + "entropy": 1.709774265686671, + "epoch": 0.665293455274505, + "grad_norm": 0.7922242283821106, + "learning_rate": 1.601401880807385e-05, + "loss": 1.523, + "mean_token_accuracy": 0.667452315489451, + "num_tokens": 1017182257.0, + "step": 6056 + }, + { + "entropy": 1.7641392350196838, + "epoch": 0.6654033121858779, + "grad_norm": 0.8408894538879395, + "learning_rate": 1.601268909282224e-05, + "loss": 1.3981, + "mean_token_accuracy": 0.6577701171239217, + "num_tokens": 1017329247.0, + "step": 6057 + }, + { + "entropy": 1.6443076431751251, + "epoch": 0.6655131690972508, + "grad_norm": 0.7164208292961121, + "learning_rate": 1.601135921892128e-05, + "loss": 1.4252, + "mean_token_accuracy": 0.659592812259992, + "num_tokens": 1017531523.0, + "step": 6058 + }, + { + "entropy": 1.7700778742631276, + "epoch": 0.6656230260086238, + "grad_norm": 0.8573195338249207, + "learning_rate": 1.601002918641306e-05, + "loss": 1.451, + "mean_token_accuracy": 0.6490327517191569, + "num_tokens": 1017706081.0, + "step": 6059 + }, + { + "entropy": 1.6549913088480632, + "epoch": 0.6657328829199967, + "grad_norm": 0.6733830571174622, + "learning_rate": 1.6008698995339674e-05, + "loss": 1.4763, + "mean_token_accuracy": 0.6520965496699015, + "num_tokens": 1017895248.0, + "step": 6060 + }, + { + "entropy": 1.689163068930308, + "epoch": 0.6658427398313697, + "grad_norm": 0.7045179605484009, + "learning_rate": 1.6007368645743222e-05, + "loss": 1.3279, + "mean_token_accuracy": 0.6655579805374146, + "num_tokens": 1018012601.0, + "step": 6061 + }, + { + "entropy": 1.664953072865804, + "epoch": 0.6659525967427425, + "grad_norm": 0.5830453634262085, + "learning_rate": 1.6006038137665808e-05, + "loss": 1.4024, + "mean_token_accuracy": 0.6456159402926763, + "num_tokens": 1018229159.0, + "step": 6062 + }, + { + "entropy": 1.687490314245224, + "epoch": 0.6660624536541155, + "grad_norm": 0.6919242143630981, + "learning_rate": 1.600470747114954e-05, + "loss": 1.4013, + "mean_token_accuracy": 0.6645840257406235, + "num_tokens": 1018361148.0, + "step": 6063 + }, + { + "entropy": 1.7308449447154999, + "epoch": 0.6661723105654884, + "grad_norm": 0.7102833986282349, + "learning_rate": 1.600337664623654e-05, + "loss": 1.3756, + "mean_token_accuracy": 0.6476593216260275, + "num_tokens": 1018503933.0, + "step": 6064 + }, + { + "entropy": 1.7429296175638835, + "epoch": 0.6662821674768614, + "grad_norm": 0.6323913931846619, + "learning_rate": 1.6002045662968924e-05, + "loss": 1.5065, + "mean_token_accuracy": 0.6419780949751536, + "num_tokens": 1018704772.0, + "step": 6065 + }, + { + "entropy": 1.7037050426006317, + "epoch": 0.6663920243882343, + "grad_norm": 0.6635475158691406, + "learning_rate": 1.6000714521388812e-05, + "loss": 1.5844, + "mean_token_accuracy": 0.6312363793452581, + "num_tokens": 1018884802.0, + "step": 6066 + }, + { + "entropy": 1.672195961078008, + "epoch": 0.6665018812996073, + "grad_norm": 0.647525429725647, + "learning_rate": 1.599938322153834e-05, + "loss": 1.4732, + "mean_token_accuracy": 0.6414629220962524, + "num_tokens": 1019090452.0, + "step": 6067 + }, + { + "entropy": 1.6700976292292278, + "epoch": 0.6666117382109802, + "grad_norm": 0.6400693655014038, + "learning_rate": 1.5998051763459646e-05, + "loss": 1.3905, + "mean_token_accuracy": 0.6553190549214681, + "num_tokens": 1019247192.0, + "step": 6068 + }, + { + "entropy": 1.6819157501061757, + "epoch": 0.6667215951223532, + "grad_norm": 0.7235324382781982, + "learning_rate": 1.5996720147194865e-05, + "loss": 1.29, + "mean_token_accuracy": 0.6759899059931437, + "num_tokens": 1019407725.0, + "step": 6069 + }, + { + "entropy": 1.6707827945550282, + "epoch": 0.6668314520337261, + "grad_norm": 0.6338719129562378, + "learning_rate": 1.599538837278614e-05, + "loss": 1.455, + "mean_token_accuracy": 0.6488498498996099, + "num_tokens": 1019603082.0, + "step": 6070 + }, + { + "entropy": 1.7216089765230815, + "epoch": 0.666941308945099, + "grad_norm": 0.7594712376594543, + "learning_rate": 1.5994056440275626e-05, + "loss": 1.4081, + "mean_token_accuracy": 0.6522118002176285, + "num_tokens": 1019850616.0, + "step": 6071 + }, + { + "entropy": 1.701182136933009, + "epoch": 0.667051165856472, + "grad_norm": 0.6392696499824524, + "learning_rate": 1.5992724349705476e-05, + "loss": 1.4646, + "mean_token_accuracy": 0.648558313647906, + "num_tokens": 1020043487.0, + "step": 6072 + }, + { + "entropy": 1.6712729334831238, + "epoch": 0.6671610227678448, + "grad_norm": 0.7950101494789124, + "learning_rate": 1.5991392101117847e-05, + "loss": 1.1902, + "mean_token_accuracy": 0.6798241138458252, + "num_tokens": 1020178559.0, + "step": 6073 + }, + { + "entropy": 1.6639493604501088, + "epoch": 0.6672708796792178, + "grad_norm": 0.7325677871704102, + "learning_rate": 1.599005969455491e-05, + "loss": 1.5157, + "mean_token_accuracy": 0.6516855508089066, + "num_tokens": 1020341544.0, + "step": 6074 + }, + { + "entropy": 1.7209535439809163, + "epoch": 0.6673807365905907, + "grad_norm": 0.7139765620231628, + "learning_rate": 1.598872713005883e-05, + "loss": 1.3183, + "mean_token_accuracy": 0.6776246974865595, + "num_tokens": 1020498782.0, + "step": 6075 + }, + { + "entropy": 1.7369226813316345, + "epoch": 0.6674905935019637, + "grad_norm": 0.7224074602127075, + "learning_rate": 1.598739440767179e-05, + "loss": 1.4519, + "mean_token_accuracy": 0.6571163336435953, + "num_tokens": 1020617813.0, + "step": 6076 + }, + { + "entropy": 1.7044867078463237, + "epoch": 0.6676004504133366, + "grad_norm": 0.8380149006843567, + "learning_rate": 1.598606152743596e-05, + "loss": 1.4734, + "mean_token_accuracy": 0.6627581169207891, + "num_tokens": 1020769525.0, + "step": 6077 + }, + { + "entropy": 1.7417923708756764, + "epoch": 0.6677103073247096, + "grad_norm": 0.5832151770591736, + "learning_rate": 1.598472848939353e-05, + "loss": 1.377, + "mean_token_accuracy": 0.6470159838596979, + "num_tokens": 1020952842.0, + "step": 6078 + }, + { + "entropy": 1.7149604658285778, + "epoch": 0.6678201642360825, + "grad_norm": 0.7015511393547058, + "learning_rate": 1.598339529358669e-05, + "loss": 1.3452, + "mean_token_accuracy": 0.6668038119872411, + "num_tokens": 1021135830.0, + "step": 6079 + }, + { + "entropy": 1.7343719899654388, + "epoch": 0.6679300211474555, + "grad_norm": 0.5889387726783752, + "learning_rate": 1.5982061940057633e-05, + "loss": 1.4322, + "mean_token_accuracy": 0.6490494459867477, + "num_tokens": 1021313312.0, + "step": 6080 + }, + { + "entropy": 1.7713063756624858, + "epoch": 0.6680398780588284, + "grad_norm": 0.6595631241798401, + "learning_rate": 1.598072842884856e-05, + "loss": 1.419, + "mean_token_accuracy": 0.6418772985537847, + "num_tokens": 1021484880.0, + "step": 6081 + }, + { + "entropy": 1.695546378691991, + "epoch": 0.6681497349702014, + "grad_norm": 0.7553979754447937, + "learning_rate": 1.597939476000168e-05, + "loss": 1.2275, + "mean_token_accuracy": 0.6762440800666809, + "num_tokens": 1021614110.0, + "step": 6082 + }, + { + "entropy": 1.7010157803694408, + "epoch": 0.6682595918815742, + "grad_norm": 0.6879470348358154, + "learning_rate": 1.59780609335592e-05, + "loss": 1.5772, + "mean_token_accuracy": 0.6345613052447637, + "num_tokens": 1021797865.0, + "step": 6083 + }, + { + "entropy": 1.6960657437642415, + "epoch": 0.6683694487929472, + "grad_norm": 0.6763353943824768, + "learning_rate": 1.597672694956333e-05, + "loss": 1.3334, + "mean_token_accuracy": 0.6662516544262568, + "num_tokens": 1021966095.0, + "step": 6084 + }, + { + "entropy": 1.7035949130853016, + "epoch": 0.6684793057043201, + "grad_norm": 0.6424010396003723, + "learning_rate": 1.5975392808056297e-05, + "loss": 1.412, + "mean_token_accuracy": 0.653632586201032, + "num_tokens": 1022124115.0, + "step": 6085 + }, + { + "entropy": 1.7454820175965626, + "epoch": 0.668589162615693, + "grad_norm": 0.7985848188400269, + "learning_rate": 1.5974058509080322e-05, + "loss": 1.5415, + "mean_token_accuracy": 0.6329482396443685, + "num_tokens": 1022284660.0, + "step": 6086 + }, + { + "entropy": 1.7059528827667236, + "epoch": 0.668699019527066, + "grad_norm": 0.6430754065513611, + "learning_rate": 1.5972724052677636e-05, + "loss": 1.3376, + "mean_token_accuracy": 0.6498004595438639, + "num_tokens": 1022462814.0, + "step": 6087 + }, + { + "entropy": 1.6707301139831543, + "epoch": 0.6688088764384389, + "grad_norm": 0.7058833837509155, + "learning_rate": 1.597138943889048e-05, + "loss": 1.3133, + "mean_token_accuracy": 0.6808335582415262, + "num_tokens": 1022597570.0, + "step": 6088 + }, + { + "entropy": 1.7404019236564636, + "epoch": 0.6689187333498119, + "grad_norm": 0.8028521537780762, + "learning_rate": 1.5970054667761086e-05, + "loss": 1.5094, + "mean_token_accuracy": 0.65455295642217, + "num_tokens": 1022767945.0, + "step": 6089 + }, + { + "entropy": 1.6390142341454823, + "epoch": 0.6690285902611848, + "grad_norm": 0.7467424273490906, + "learning_rate": 1.59687197393317e-05, + "loss": 1.2249, + "mean_token_accuracy": 0.684510534008344, + "num_tokens": 1022885224.0, + "step": 6090 + }, + { + "entropy": 1.698186457157135, + "epoch": 0.6691384471725578, + "grad_norm": 0.6739535927772522, + "learning_rate": 1.5967384653644573e-05, + "loss": 1.5205, + "mean_token_accuracy": 0.6390059242645899, + "num_tokens": 1023125924.0, + "step": 6091 + }, + { + "entropy": 1.7658388912677765, + "epoch": 0.6692483040839307, + "grad_norm": 0.6793767213821411, + "learning_rate": 1.596604941074196e-05, + "loss": 1.3172, + "mean_token_accuracy": 0.6668230046828588, + "num_tokens": 1023261228.0, + "step": 6092 + }, + { + "entropy": 1.6249745587507884, + "epoch": 0.6693581609953037, + "grad_norm": 0.6603535413742065, + "learning_rate": 1.596471401066612e-05, + "loss": 1.383, + "mean_token_accuracy": 0.6608263403177261, + "num_tokens": 1023451278.0, + "step": 6093 + }, + { + "entropy": 1.6451916893323262, + "epoch": 0.6694680179066765, + "grad_norm": 0.6392114758491516, + "learning_rate": 1.5963378453459322e-05, + "loss": 1.4267, + "mean_token_accuracy": 0.6546541998783747, + "num_tokens": 1023641335.0, + "step": 6094 + }, + { + "entropy": 1.663988600174586, + "epoch": 0.6695778748180495, + "grad_norm": 0.5604124069213867, + "learning_rate": 1.596204273916383e-05, + "loss": 1.4705, + "mean_token_accuracy": 0.6506281395753225, + "num_tokens": 1023914875.0, + "step": 6095 + }, + { + "entropy": 1.7463338673114777, + "epoch": 0.6696877317294224, + "grad_norm": 0.693678617477417, + "learning_rate": 1.5960706867821922e-05, + "loss": 1.4098, + "mean_token_accuracy": 0.6399700790643692, + "num_tokens": 1024096467.0, + "step": 6096 + }, + { + "entropy": 1.6922602653503418, + "epoch": 0.6697975886407954, + "grad_norm": 0.6785783767700195, + "learning_rate": 1.5959370839475878e-05, + "loss": 1.3892, + "mean_token_accuracy": 0.6647334198156992, + "num_tokens": 1024245472.0, + "step": 6097 + }, + { + "entropy": 1.7134245534737904, + "epoch": 0.6699074455521683, + "grad_norm": 0.7681015133857727, + "learning_rate": 1.595803465416798e-05, + "loss": 1.2404, + "mean_token_accuracy": 0.6760171254475912, + "num_tokens": 1024351814.0, + "step": 6098 + }, + { + "entropy": 1.7330508331457775, + "epoch": 0.6700173024635412, + "grad_norm": 0.7412785291671753, + "learning_rate": 1.595669831194052e-05, + "loss": 1.4319, + "mean_token_accuracy": 0.662334273258845, + "num_tokens": 1024520510.0, + "step": 6099 + }, + { + "entropy": 1.7534303267796834, + "epoch": 0.6701271593749142, + "grad_norm": 0.6362935900688171, + "learning_rate": 1.595536181283579e-05, + "loss": 1.3547, + "mean_token_accuracy": 0.6595286975304285, + "num_tokens": 1024702512.0, + "step": 6100 + }, + { + "entropy": 1.7748811344305675, + "epoch": 0.6702370162862871, + "grad_norm": 0.771950364112854, + "learning_rate": 1.5954025156896094e-05, + "loss": 1.4267, + "mean_token_accuracy": 0.6493061731259028, + "num_tokens": 1024826301.0, + "step": 6101 + }, + { + "entropy": 1.6831330458323162, + "epoch": 0.6703468731976601, + "grad_norm": 0.5883938670158386, + "learning_rate": 1.5952688344163738e-05, + "loss": 1.3333, + "mean_token_accuracy": 0.6650530050198237, + "num_tokens": 1025013672.0, + "step": 6102 + }, + { + "entropy": 1.6784860491752625, + "epoch": 0.670456730109033, + "grad_norm": 0.6166484355926514, + "learning_rate": 1.595135137468102e-05, + "loss": 1.3632, + "mean_token_accuracy": 0.6598242670297623, + "num_tokens": 1025172109.0, + "step": 6103 + }, + { + "entropy": 1.7556925614674885, + "epoch": 0.670566587020406, + "grad_norm": 0.7905380725860596, + "learning_rate": 1.5950014248490268e-05, + "loss": 1.347, + "mean_token_accuracy": 0.673460324605306, + "num_tokens": 1025321964.0, + "step": 6104 + }, + { + "entropy": 1.742532879114151, + "epoch": 0.6706764439317788, + "grad_norm": 0.7387831807136536, + "learning_rate": 1.5948676965633792e-05, + "loss": 1.303, + "mean_token_accuracy": 0.6769290367762247, + "num_tokens": 1025430147.0, + "step": 6105 + }, + { + "entropy": 1.7955981294314067, + "epoch": 0.6707863008431518, + "grad_norm": 0.8614792227745056, + "learning_rate": 1.594733952615392e-05, + "loss": 1.4031, + "mean_token_accuracy": 0.6521740754445394, + "num_tokens": 1025548085.0, + "step": 6106 + }, + { + "entropy": 1.7102086047331493, + "epoch": 0.6708961577545247, + "grad_norm": 0.6578072309494019, + "learning_rate": 1.5946001930092983e-05, + "loss": 1.4516, + "mean_token_accuracy": 0.6479578018188477, + "num_tokens": 1025705026.0, + "step": 6107 + }, + { + "entropy": 1.635363906621933, + "epoch": 0.6710060146658977, + "grad_norm": 0.6915444731712341, + "learning_rate": 1.5944664177493313e-05, + "loss": 1.4012, + "mean_token_accuracy": 0.6578799436489741, + "num_tokens": 1025840042.0, + "step": 6108 + }, + { + "entropy": 1.7048865755399067, + "epoch": 0.6711158715772706, + "grad_norm": 0.666670024394989, + "learning_rate": 1.594332626839725e-05, + "loss": 1.5948, + "mean_token_accuracy": 0.6330678189794222, + "num_tokens": 1026035070.0, + "step": 6109 + }, + { + "entropy": 1.711029291152954, + "epoch": 0.6712257284886436, + "grad_norm": 0.7791758179664612, + "learning_rate": 1.594198820284714e-05, + "loss": 1.4302, + "mean_token_accuracy": 0.6709011346101761, + "num_tokens": 1026175979.0, + "step": 6110 + }, + { + "entropy": 1.6606249113877614, + "epoch": 0.6713355854000165, + "grad_norm": 0.7085611820220947, + "learning_rate": 1.5940649980885324e-05, + "loss": 1.5086, + "mean_token_accuracy": 0.645029549797376, + "num_tokens": 1026404722.0, + "step": 6111 + }, + { + "entropy": 1.7165546814600627, + "epoch": 0.6714454423113894, + "grad_norm": 0.7058658599853516, + "learning_rate": 1.5939311602554168e-05, + "loss": 1.3256, + "mean_token_accuracy": 0.6548676739136378, + "num_tokens": 1026548934.0, + "step": 6112 + }, + { + "entropy": 1.7104195555051167, + "epoch": 0.6715552992227624, + "grad_norm": 0.6687393188476562, + "learning_rate": 1.5937973067896025e-05, + "loss": 1.4702, + "mean_token_accuracy": 0.6464681526025137, + "num_tokens": 1026779833.0, + "step": 6113 + }, + { + "entropy": 1.6975898842016857, + "epoch": 0.6716651561341352, + "grad_norm": 0.575406014919281, + "learning_rate": 1.593663437695326e-05, + "loss": 1.3619, + "mean_token_accuracy": 0.6549033125241598, + "num_tokens": 1026985027.0, + "step": 6114 + }, + { + "entropy": 1.7248026132583618, + "epoch": 0.6717750130455082, + "grad_norm": 0.7494511008262634, + "learning_rate": 1.593529552976824e-05, + "loss": 1.3871, + "mean_token_accuracy": 0.6491140872240067, + "num_tokens": 1027159348.0, + "step": 6115 + }, + { + "entropy": 1.629348337650299, + "epoch": 0.6718848699568811, + "grad_norm": 0.650229275226593, + "learning_rate": 1.593395652638334e-05, + "loss": 1.4643, + "mean_token_accuracy": 0.6616547207037607, + "num_tokens": 1027322685.0, + "step": 6116 + }, + { + "entropy": 1.6977149446805317, + "epoch": 0.6719947268682541, + "grad_norm": 0.7905436754226685, + "learning_rate": 1.593261736684094e-05, + "loss": 1.4179, + "mean_token_accuracy": 0.6655702938636144, + "num_tokens": 1027467183.0, + "step": 6117 + }, + { + "entropy": 1.6418705681959789, + "epoch": 0.672104583779627, + "grad_norm": 0.6807728409767151, + "learning_rate": 1.593127805118342e-05, + "loss": 1.4285, + "mean_token_accuracy": 0.6624650160471598, + "num_tokens": 1027669991.0, + "step": 6118 + }, + { + "entropy": 1.693480223417282, + "epoch": 0.672214440691, + "grad_norm": 0.6987659335136414, + "learning_rate": 1.5929938579453178e-05, + "loss": 1.4377, + "mean_token_accuracy": 0.6526973893245062, + "num_tokens": 1027818472.0, + "step": 6119 + }, + { + "entropy": 1.7187944451967876, + "epoch": 0.6723242976023729, + "grad_norm": 0.6811079382896423, + "learning_rate": 1.5928598951692596e-05, + "loss": 1.277, + "mean_token_accuracy": 0.6688454498847326, + "num_tokens": 1027933443.0, + "step": 6120 + }, + { + "entropy": 1.6929615139961243, + "epoch": 0.6724341545137459, + "grad_norm": 0.6487001776695251, + "learning_rate": 1.592725916794408e-05, + "loss": 1.4681, + "mean_token_accuracy": 0.6509808599948883, + "num_tokens": 1028117633.0, + "step": 6121 + }, + { + "entropy": 1.700806051492691, + "epoch": 0.6725440114251188, + "grad_norm": 0.6837039589881897, + "learning_rate": 1.5925919228250034e-05, + "loss": 1.3143, + "mean_token_accuracy": 0.6903966218233109, + "num_tokens": 1028242732.0, + "step": 6122 + }, + { + "entropy": 1.6499686141808827, + "epoch": 0.6726538683364918, + "grad_norm": 0.6767922043800354, + "learning_rate": 1.592457913265286e-05, + "loss": 1.3945, + "mean_token_accuracy": 0.6531237810850143, + "num_tokens": 1028426037.0, + "step": 6123 + }, + { + "entropy": 1.6716104646523793, + "epoch": 0.6727637252478647, + "grad_norm": 0.6955103874206543, + "learning_rate": 1.5923238881194976e-05, + "loss": 1.4092, + "mean_token_accuracy": 0.649432510137558, + "num_tokens": 1028628426.0, + "step": 6124 + }, + { + "entropy": 1.6994199852148693, + "epoch": 0.6728735821592375, + "grad_norm": 2.9740710258483887, + "learning_rate": 1.5921898473918802e-05, + "loss": 1.017, + "mean_token_accuracy": 0.6964697390794754, + "num_tokens": 1028793307.0, + "step": 6125 + }, + { + "entropy": 1.6781040628751118, + "epoch": 0.6729834390706105, + "grad_norm": 0.6323825120925903, + "learning_rate": 1.592055791086676e-05, + "loss": 1.406, + "mean_token_accuracy": 0.6498600840568542, + "num_tokens": 1029003857.0, + "step": 6126 + }, + { + "entropy": 1.691665271917979, + "epoch": 0.6730932959819834, + "grad_norm": 0.6065126657485962, + "learning_rate": 1.5919217192081273e-05, + "loss": 1.4436, + "mean_token_accuracy": 0.6468114952246348, + "num_tokens": 1029184665.0, + "step": 6127 + }, + { + "entropy": 1.7233734627564747, + "epoch": 0.6732031528933564, + "grad_norm": 0.8082062005996704, + "learning_rate": 1.5917876317604785e-05, + "loss": 1.4065, + "mean_token_accuracy": 0.6604608694712321, + "num_tokens": 1029348548.0, + "step": 6128 + }, + { + "entropy": 1.6546673774719238, + "epoch": 0.6733130098047293, + "grad_norm": 0.6178780198097229, + "learning_rate": 1.591653528747972e-05, + "loss": 1.2925, + "mean_token_accuracy": 0.6696785638729731, + "num_tokens": 1029529896.0, + "step": 6129 + }, + { + "entropy": 1.7272561589876811, + "epoch": 0.6734228667161023, + "grad_norm": 0.6660004258155823, + "learning_rate": 1.591519410174853e-05, + "loss": 1.4051, + "mean_token_accuracy": 0.6591061949729919, + "num_tokens": 1029681501.0, + "step": 6130 + }, + { + "entropy": 1.7080715497334797, + "epoch": 0.6735327236274752, + "grad_norm": 0.5781081318855286, + "learning_rate": 1.5913852760453667e-05, + "loss": 1.394, + "mean_token_accuracy": 0.6472490082184473, + "num_tokens": 1029910927.0, + "step": 6131 + }, + { + "entropy": 1.6690656940142314, + "epoch": 0.6736425805388482, + "grad_norm": 0.6107634902000427, + "learning_rate": 1.5912511263637576e-05, + "loss": 1.3232, + "mean_token_accuracy": 0.66473917166392, + "num_tokens": 1030091804.0, + "step": 6132 + }, + { + "entropy": 1.7400188446044922, + "epoch": 0.6737524374502211, + "grad_norm": 0.7845686674118042, + "learning_rate": 1.5911169611342716e-05, + "loss": 1.3731, + "mean_token_accuracy": 0.6524705936511358, + "num_tokens": 1030234959.0, + "step": 6133 + }, + { + "entropy": 1.6985375185807545, + "epoch": 0.6738622943615941, + "grad_norm": 0.6205952763557434, + "learning_rate": 1.5909827803611553e-05, + "loss": 1.3825, + "mean_token_accuracy": 0.6545093754927317, + "num_tokens": 1030412195.0, + "step": 6134 + }, + { + "entropy": 1.7195107738176982, + "epoch": 0.673972151272967, + "grad_norm": 0.8514347672462463, + "learning_rate": 1.590848584048655e-05, + "loss": 1.4261, + "mean_token_accuracy": 0.6498820533355077, + "num_tokens": 1030556047.0, + "step": 6135 + }, + { + "entropy": 1.717844436566035, + "epoch": 0.6740820081843399, + "grad_norm": 0.8321981430053711, + "learning_rate": 1.5907143722010183e-05, + "loss": 1.4249, + "mean_token_accuracy": 0.6527943263451258, + "num_tokens": 1030723162.0, + "step": 6136 + }, + { + "entropy": 1.7162880500157673, + "epoch": 0.6741918650957128, + "grad_norm": 0.6410884857177734, + "learning_rate": 1.590580144822493e-05, + "loss": 1.3525, + "mean_token_accuracy": 0.6689903736114502, + "num_tokens": 1030890509.0, + "step": 6137 + }, + { + "entropy": 1.7301206290721893, + "epoch": 0.6743017220070858, + "grad_norm": 0.735442578792572, + "learning_rate": 1.5904459019173266e-05, + "loss": 1.2337, + "mean_token_accuracy": 0.6780840853850046, + "num_tokens": 1030998724.0, + "step": 6138 + }, + { + "entropy": 1.693600704272588, + "epoch": 0.6744115789184587, + "grad_norm": 0.6858952641487122, + "learning_rate": 1.590311643489769e-05, + "loss": 1.3579, + "mean_token_accuracy": 0.6619731138149897, + "num_tokens": 1031180321.0, + "step": 6139 + }, + { + "entropy": 1.691060831149419, + "epoch": 0.6745214358298316, + "grad_norm": 0.7244452238082886, + "learning_rate": 1.5901773695440684e-05, + "loss": 1.5419, + "mean_token_accuracy": 0.638955608010292, + "num_tokens": 1031349446.0, + "step": 6140 + }, + { + "entropy": 1.673604021469752, + "epoch": 0.6746312927412046, + "grad_norm": 0.5594522953033447, + "learning_rate": 1.5900430800844752e-05, + "loss": 1.3823, + "mean_token_accuracy": 0.6514309992392858, + "num_tokens": 1031556297.0, + "step": 6141 + }, + { + "entropy": 1.69135985771815, + "epoch": 0.6747411496525775, + "grad_norm": 0.7993478775024414, + "learning_rate": 1.5899087751152395e-05, + "loss": 1.2584, + "mean_token_accuracy": 0.675132155418396, + "num_tokens": 1031680407.0, + "step": 6142 + }, + { + "entropy": 1.6793262263139088, + "epoch": 0.6748510065639505, + "grad_norm": 0.7275417447090149, + "learning_rate": 1.5897744546406117e-05, + "loss": 1.3664, + "mean_token_accuracy": 0.6670361111561457, + "num_tokens": 1031833401.0, + "step": 6143 + }, + { + "entropy": 1.7409860094388325, + "epoch": 0.6749608634753234, + "grad_norm": 0.6921661496162415, + "learning_rate": 1.5896401186648428e-05, + "loss": 1.3467, + "mean_token_accuracy": 0.6612462997436523, + "num_tokens": 1031967537.0, + "step": 6144 + }, + { + "entropy": 1.7186478873093922, + "epoch": 0.6750707203866964, + "grad_norm": 0.5948835611343384, + "learning_rate": 1.589505767192185e-05, + "loss": 1.4484, + "mean_token_accuracy": 0.651375338435173, + "num_tokens": 1032134696.0, + "step": 6145 + }, + { + "entropy": 1.753745198249817, + "epoch": 0.6751805772980692, + "grad_norm": 0.7294279932975769, + "learning_rate": 1.58937140022689e-05, + "loss": 1.4038, + "mean_token_accuracy": 0.6501768082380295, + "num_tokens": 1032292941.0, + "step": 6146 + }, + { + "entropy": 1.7068160772323608, + "epoch": 0.6752904342094422, + "grad_norm": 0.8932915925979614, + "learning_rate": 1.5892370177732112e-05, + "loss": 1.4417, + "mean_token_accuracy": 0.6559292525053024, + "num_tokens": 1032429050.0, + "step": 6147 + }, + { + "entropy": 1.7068482637405396, + "epoch": 0.6754002911208151, + "grad_norm": 0.7544932961463928, + "learning_rate": 1.5891026198354007e-05, + "loss": 1.4954, + "mean_token_accuracy": 0.6591473271449407, + "num_tokens": 1032581362.0, + "step": 6148 + }, + { + "entropy": 1.7536579171816509, + "epoch": 0.6755101480321881, + "grad_norm": 0.7992092967033386, + "learning_rate": 1.588968206417713e-05, + "loss": 1.3389, + "mean_token_accuracy": 0.6687672038873037, + "num_tokens": 1032691286.0, + "step": 6149 + }, + { + "entropy": 1.7206375002861023, + "epoch": 0.675620004943561, + "grad_norm": 0.7990248799324036, + "learning_rate": 1.588833777524402e-05, + "loss": 1.3008, + "mean_token_accuracy": 0.6754638602336248, + "num_tokens": 1032812888.0, + "step": 6150 + }, + { + "entropy": 1.7509271105130513, + "epoch": 0.675729861854934, + "grad_norm": 0.6977331638336182, + "learning_rate": 1.588699333159722e-05, + "loss": 1.5671, + "mean_token_accuracy": 0.6463130315144857, + "num_tokens": 1032974721.0, + "step": 6151 + }, + { + "entropy": 1.6518239478270214, + "epoch": 0.6758397187663069, + "grad_norm": 0.8337535262107849, + "learning_rate": 1.5885648733279286e-05, + "loss": 1.5285, + "mean_token_accuracy": 0.6606726894776026, + "num_tokens": 1033125949.0, + "step": 6152 + }, + { + "entropy": 1.7431319256623585, + "epoch": 0.6759495756776798, + "grad_norm": 0.6203035116195679, + "learning_rate": 1.588430398033277e-05, + "loss": 1.4083, + "mean_token_accuracy": 0.6556287507216135, + "num_tokens": 1033291109.0, + "step": 6153 + }, + { + "entropy": 1.7404470642407734, + "epoch": 0.6760594325890528, + "grad_norm": 0.6632646322250366, + "learning_rate": 1.588295907280023e-05, + "loss": 1.49, + "mean_token_accuracy": 0.6481446127096812, + "num_tokens": 1033508731.0, + "step": 6154 + }, + { + "entropy": 1.6975667675336201, + "epoch": 0.6761692895004257, + "grad_norm": 0.6166266202926636, + "learning_rate": 1.588161401072424e-05, + "loss": 1.3478, + "mean_token_accuracy": 0.6563719709714254, + "num_tokens": 1033700041.0, + "step": 6155 + }, + { + "entropy": 1.6918539802233379, + "epoch": 0.6762791464117986, + "grad_norm": 0.6156592965126038, + "learning_rate": 1.5880268794147365e-05, + "loss": 1.4967, + "mean_token_accuracy": 0.6441917419433594, + "num_tokens": 1033902009.0, + "step": 6156 + }, + { + "entropy": 1.6845496892929077, + "epoch": 0.6763890033231715, + "grad_norm": 0.7185717821121216, + "learning_rate": 1.587892342311218e-05, + "loss": 1.3168, + "mean_token_accuracy": 0.6657861719528834, + "num_tokens": 1034057209.0, + "step": 6157 + }, + { + "entropy": 1.716551125049591, + "epoch": 0.6764988602345445, + "grad_norm": 0.8153448104858398, + "learning_rate": 1.587757789766127e-05, + "loss": 1.482, + "mean_token_accuracy": 0.6406472225983938, + "num_tokens": 1034276379.0, + "step": 6158 + }, + { + "entropy": 1.6670089562733967, + "epoch": 0.6766087171459174, + "grad_norm": 0.6375927925109863, + "learning_rate": 1.5876232217837216e-05, + "loss": 1.3635, + "mean_token_accuracy": 0.667756125330925, + "num_tokens": 1034474837.0, + "step": 6159 + }, + { + "entropy": 1.762067049741745, + "epoch": 0.6767185740572904, + "grad_norm": 0.6941974759101868, + "learning_rate": 1.587488638368261e-05, + "loss": 1.4817, + "mean_token_accuracy": 0.6408074299494425, + "num_tokens": 1034653291.0, + "step": 6160 + }, + { + "entropy": 1.6927312711874645, + "epoch": 0.6768284309686633, + "grad_norm": 0.6574122309684753, + "learning_rate": 1.5873540395240046e-05, + "loss": 1.2565, + "mean_token_accuracy": 0.67339259882768, + "num_tokens": 1034797989.0, + "step": 6161 + }, + { + "entropy": 1.6968045830726624, + "epoch": 0.6769382878800363, + "grad_norm": 0.8542110919952393, + "learning_rate": 1.5872194252552127e-05, + "loss": 1.6373, + "mean_token_accuracy": 0.6415252710382143, + "num_tokens": 1034991775.0, + "step": 6162 + }, + { + "entropy": 1.6781950394312541, + "epoch": 0.6770481447914092, + "grad_norm": 0.6783519983291626, + "learning_rate": 1.587084795566145e-05, + "loss": 1.2693, + "mean_token_accuracy": 0.6693742970625559, + "num_tokens": 1035130150.0, + "step": 6163 + }, + { + "entropy": 1.6896279752254486, + "epoch": 0.6771580017027822, + "grad_norm": 0.5958226323127747, + "learning_rate": 1.5869501504610636e-05, + "loss": 1.449, + "mean_token_accuracy": 0.6530238687992096, + "num_tokens": 1035318883.0, + "step": 6164 + }, + { + "entropy": 1.7746345500151317, + "epoch": 0.6772678586141551, + "grad_norm": 0.7527851462364197, + "learning_rate": 1.5868154899442293e-05, + "loss": 1.4133, + "mean_token_accuracy": 0.6448677480220795, + "num_tokens": 1035489150.0, + "step": 6165 + }, + { + "entropy": 1.610521674156189, + "epoch": 0.677377715525528, + "grad_norm": 0.6777754426002502, + "learning_rate": 1.5866808140199037e-05, + "loss": 1.4785, + "mean_token_accuracy": 0.65462859471639, + "num_tokens": 1035662720.0, + "step": 6166 + }, + { + "entropy": 1.7118818759918213, + "epoch": 0.6774875724369009, + "grad_norm": 0.6027317047119141, + "learning_rate": 1.5865461226923497e-05, + "loss": 1.4702, + "mean_token_accuracy": 0.6561297823985418, + "num_tokens": 1035828628.0, + "step": 6167 + }, + { + "entropy": 1.6369926929473877, + "epoch": 0.6775974293482738, + "grad_norm": 0.707374095916748, + "learning_rate": 1.5864114159658305e-05, + "loss": 1.4544, + "mean_token_accuracy": 0.6614964008331299, + "num_tokens": 1036021623.0, + "step": 6168 + }, + { + "entropy": 1.6904229422410328, + "epoch": 0.6777072862596468, + "grad_norm": 0.6496714949607849, + "learning_rate": 1.5862766938446092e-05, + "loss": 1.499, + "mean_token_accuracy": 0.6475720703601837, + "num_tokens": 1036206904.0, + "step": 6169 + }, + { + "entropy": 1.6665221452713013, + "epoch": 0.6778171431710197, + "grad_norm": 0.6841485500335693, + "learning_rate": 1.5861419563329493e-05, + "loss": 1.4886, + "mean_token_accuracy": 0.6600009699662527, + "num_tokens": 1036450534.0, + "step": 6170 + }, + { + "entropy": 1.6735211809476216, + "epoch": 0.6779270000823927, + "grad_norm": 0.6442497968673706, + "learning_rate": 1.586007203435115e-05, + "loss": 1.2632, + "mean_token_accuracy": 0.6747845361630121, + "num_tokens": 1036623414.0, + "step": 6171 + }, + { + "entropy": 1.6664145290851593, + "epoch": 0.6780368569937656, + "grad_norm": 0.7533520460128784, + "learning_rate": 1.585872435155373e-05, + "loss": 1.4729, + "mean_token_accuracy": 0.662231961886088, + "num_tokens": 1036784462.0, + "step": 6172 + }, + { + "entropy": 1.7120993534723918, + "epoch": 0.6781467139051386, + "grad_norm": 0.6117445826530457, + "learning_rate": 1.5857376514979866e-05, + "loss": 1.3504, + "mean_token_accuracy": 0.659230629603068, + "num_tokens": 1036938244.0, + "step": 6173 + }, + { + "entropy": 1.7340730726718903, + "epoch": 0.6782565708165115, + "grad_norm": 0.7913200259208679, + "learning_rate": 1.5856028524672227e-05, + "loss": 1.452, + "mean_token_accuracy": 0.668962687253952, + "num_tokens": 1037128398.0, + "step": 6174 + }, + { + "entropy": 1.7015940447648366, + "epoch": 0.6783664277278845, + "grad_norm": 0.6943639516830444, + "learning_rate": 1.585468038067347e-05, + "loss": 1.5163, + "mean_token_accuracy": 0.6409951796134313, + "num_tokens": 1037288363.0, + "step": 6175 + }, + { + "entropy": 1.673763672510783, + "epoch": 0.6784762846392574, + "grad_norm": 0.6866292357444763, + "learning_rate": 1.5853332083026268e-05, + "loss": 1.3081, + "mean_token_accuracy": 0.6622492522001266, + "num_tokens": 1037442624.0, + "step": 6176 + }, + { + "entropy": 1.703792671362559, + "epoch": 0.6785861415506304, + "grad_norm": 0.6945456266403198, + "learning_rate": 1.5851983631773297e-05, + "loss": 1.4855, + "mean_token_accuracy": 0.6414697915315628, + "num_tokens": 1037590916.0, + "step": 6177 + }, + { + "entropy": 1.652319351832072, + "epoch": 0.6786959984620032, + "grad_norm": 0.6403668522834778, + "learning_rate": 1.5850635026957226e-05, + "loss": 1.5006, + "mean_token_accuracy": 0.6437227378288904, + "num_tokens": 1037788898.0, + "step": 6178 + }, + { + "entropy": 1.675025353829066, + "epoch": 0.6788058553733762, + "grad_norm": 0.663935124874115, + "learning_rate": 1.5849286268620744e-05, + "loss": 1.3069, + "mean_token_accuracy": 0.6672319124142329, + "num_tokens": 1037951523.0, + "step": 6179 + }, + { + "entropy": 1.7284078498681386, + "epoch": 0.6789157122847491, + "grad_norm": 0.7808162569999695, + "learning_rate": 1.5847937356806536e-05, + "loss": 1.4562, + "mean_token_accuracy": 0.651724100112915, + "num_tokens": 1038103992.0, + "step": 6180 + }, + { + "entropy": 1.6787553032239277, + "epoch": 0.679025569196122, + "grad_norm": 0.6067541837692261, + "learning_rate": 1.584658829155729e-05, + "loss": 1.4918, + "mean_token_accuracy": 0.646204670270284, + "num_tokens": 1038323458.0, + "step": 6181 + }, + { + "entropy": 1.6700343191623688, + "epoch": 0.679135426107495, + "grad_norm": 0.919571042060852, + "learning_rate": 1.5845239072915715e-05, + "loss": 1.3506, + "mean_token_accuracy": 0.669276679555575, + "num_tokens": 1038462666.0, + "step": 6182 + }, + { + "entropy": 1.720471332470576, + "epoch": 0.6792452830188679, + "grad_norm": 0.7604859471321106, + "learning_rate": 1.5843889700924503e-05, + "loss": 1.414, + "mean_token_accuracy": 0.6454981962839762, + "num_tokens": 1038631829.0, + "step": 6183 + }, + { + "entropy": 1.734667807817459, + "epoch": 0.6793551399302409, + "grad_norm": 0.7525255084037781, + "learning_rate": 1.5842540175626368e-05, + "loss": 1.3427, + "mean_token_accuracy": 0.6558729112148285, + "num_tokens": 1038806094.0, + "step": 6184 + }, + { + "entropy": 1.7814012865225475, + "epoch": 0.6794649968416138, + "grad_norm": 0.678103506565094, + "learning_rate": 1.584119049706402e-05, + "loss": 1.568, + "mean_token_accuracy": 0.6203633447488149, + "num_tokens": 1039038882.0, + "step": 6185 + }, + { + "entropy": 1.7217522263526917, + "epoch": 0.6795748537529868, + "grad_norm": 0.8400986790657043, + "learning_rate": 1.5839840665280168e-05, + "loss": 1.2395, + "mean_token_accuracy": 0.6749545534451803, + "num_tokens": 1039160977.0, + "step": 6186 + }, + { + "entropy": 1.76739635070165, + "epoch": 0.6796847106643596, + "grad_norm": 0.7183486819267273, + "learning_rate": 1.583849068031754e-05, + "loss": 1.3549, + "mean_token_accuracy": 0.6573175837596258, + "num_tokens": 1039304153.0, + "step": 6187 + }, + { + "entropy": 1.7237287163734436, + "epoch": 0.6797945675757326, + "grad_norm": 0.7184303402900696, + "learning_rate": 1.583714054221887e-05, + "loss": 1.2839, + "mean_token_accuracy": 0.6642322937647501, + "num_tokens": 1039447894.0, + "step": 6188 + }, + { + "entropy": 1.7578026056289673, + "epoch": 0.6799044244871055, + "grad_norm": 0.7254183292388916, + "learning_rate": 1.5835790251026875e-05, + "loss": 1.3413, + "mean_token_accuracy": 0.6657581379016241, + "num_tokens": 1039613902.0, + "step": 6189 + }, + { + "entropy": 1.7511154313882191, + "epoch": 0.6800142813984785, + "grad_norm": 0.6806755065917969, + "learning_rate": 1.5834439806784302e-05, + "loss": 1.3657, + "mean_token_accuracy": 0.656483938296636, + "num_tokens": 1039774043.0, + "step": 6190 + }, + { + "entropy": 1.7452365458011627, + "epoch": 0.6801241383098514, + "grad_norm": 0.654483437538147, + "learning_rate": 1.5833089209533883e-05, + "loss": 1.3834, + "mean_token_accuracy": 0.6558238019545873, + "num_tokens": 1039942032.0, + "step": 6191 + }, + { + "entropy": 1.7572171986103058, + "epoch": 0.6802339952212244, + "grad_norm": 0.6855461597442627, + "learning_rate": 1.583173845931837e-05, + "loss": 1.3906, + "mean_token_accuracy": 0.6548371364672979, + "num_tokens": 1040063994.0, + "step": 6192 + }, + { + "entropy": 1.6424880524476368, + "epoch": 0.6803438521325973, + "grad_norm": 0.6944648623466492, + "learning_rate": 1.5830387556180513e-05, + "loss": 1.3699, + "mean_token_accuracy": 0.6685495773951212, + "num_tokens": 1040204031.0, + "step": 6193 + }, + { + "entropy": 1.6730513870716095, + "epoch": 0.6804537090439702, + "grad_norm": 0.6423060894012451, + "learning_rate": 1.5829036500163068e-05, + "loss": 1.489, + "mean_token_accuracy": 0.6339628795782725, + "num_tokens": 1040432452.0, + "step": 6194 + }, + { + "entropy": 1.7136758367220561, + "epoch": 0.6805635659553432, + "grad_norm": 1.136176347732544, + "learning_rate": 1.582768529130879e-05, + "loss": 1.3342, + "mean_token_accuracy": 0.667153442899386, + "num_tokens": 1040602648.0, + "step": 6195 + }, + { + "entropy": 1.6480139096577961, + "epoch": 0.6806734228667161, + "grad_norm": 0.6092087626457214, + "learning_rate": 1.582633392966045e-05, + "loss": 1.4523, + "mean_token_accuracy": 0.6621314485867819, + "num_tokens": 1040799318.0, + "step": 6196 + }, + { + "entropy": 1.7159779965877533, + "epoch": 0.6807832797780891, + "grad_norm": 0.7131394743919373, + "learning_rate": 1.5824982415260815e-05, + "loss": 1.3849, + "mean_token_accuracy": 0.6469999005397161, + "num_tokens": 1040951208.0, + "step": 6197 + }, + { + "entropy": 1.6891703208287556, + "epoch": 0.6808931366894619, + "grad_norm": 0.6422286033630371, + "learning_rate": 1.5823630748152663e-05, + "loss": 1.4928, + "mean_token_accuracy": 0.6543713063001633, + "num_tokens": 1041111600.0, + "step": 6198 + }, + { + "entropy": 1.6836450199286144, + "epoch": 0.6810029936008349, + "grad_norm": 0.6734887361526489, + "learning_rate": 1.582227892837877e-05, + "loss": 1.3554, + "mean_token_accuracy": 0.6502887507279714, + "num_tokens": 1041265524.0, + "step": 6199 + }, + { + "entropy": 1.7192556262016296, + "epoch": 0.6811128505122078, + "grad_norm": 0.6905456781387329, + "learning_rate": 1.582092695598192e-05, + "loss": 1.4584, + "mean_token_accuracy": 0.6490669349829356, + "num_tokens": 1041441178.0, + "step": 6200 + }, + { + "entropy": 1.7496082484722137, + "epoch": 0.6812227074235808, + "grad_norm": 0.7641483545303345, + "learning_rate": 1.5819574831004908e-05, + "loss": 1.5886, + "mean_token_accuracy": 0.6521731615066528, + "num_tokens": 1041595991.0, + "step": 6201 + }, + { + "entropy": 1.6868942181269329, + "epoch": 0.6813325643349537, + "grad_norm": 0.7057383060455322, + "learning_rate": 1.5818222553490522e-05, + "loss": 1.5181, + "mean_token_accuracy": 0.6546740233898163, + "num_tokens": 1041778203.0, + "step": 6202 + }, + { + "entropy": 1.7335337499777477, + "epoch": 0.6814424212463267, + "grad_norm": 0.6150233149528503, + "learning_rate": 1.5816870123481563e-05, + "loss": 1.5563, + "mean_token_accuracy": 0.6352181782325109, + "num_tokens": 1041979155.0, + "step": 6203 + }, + { + "entropy": 1.7341649134953816, + "epoch": 0.6815522781576996, + "grad_norm": 0.6595054864883423, + "learning_rate": 1.5815517541020832e-05, + "loss": 1.32, + "mean_token_accuracy": 0.6687678645054499, + "num_tokens": 1042146615.0, + "step": 6204 + }, + { + "entropy": 1.7071249385674794, + "epoch": 0.6816621350690726, + "grad_norm": 0.6596662998199463, + "learning_rate": 1.5814164806151146e-05, + "loss": 1.36, + "mean_token_accuracy": 0.6612856537103653, + "num_tokens": 1042321366.0, + "step": 6205 + }, + { + "entropy": 1.6722217202186584, + "epoch": 0.6817719919804455, + "grad_norm": 0.7822057008743286, + "learning_rate": 1.5812811918915313e-05, + "loss": 1.2883, + "mean_token_accuracy": 0.6672490139802297, + "num_tokens": 1042459772.0, + "step": 6206 + }, + { + "entropy": 1.7275211314360301, + "epoch": 0.6818818488918184, + "grad_norm": 0.6742196679115295, + "learning_rate": 1.581145887935615e-05, + "loss": 1.331, + "mean_token_accuracy": 0.6632985124985377, + "num_tokens": 1042570398.0, + "step": 6207 + }, + { + "entropy": 1.6800503234068553, + "epoch": 0.6819917058031914, + "grad_norm": 0.6366540193557739, + "learning_rate": 1.581010568751648e-05, + "loss": 1.3628, + "mean_token_accuracy": 0.6498023221890131, + "num_tokens": 1042718102.0, + "step": 6208 + }, + { + "entropy": 1.7179384032885234, + "epoch": 0.6821015627145642, + "grad_norm": 0.6339711546897888, + "learning_rate": 1.5808752343439133e-05, + "loss": 1.4198, + "mean_token_accuracy": 0.6559414863586426, + "num_tokens": 1042906564.0, + "step": 6209 + }, + { + "entropy": 1.6670476098855336, + "epoch": 0.6822114196259372, + "grad_norm": 0.6430389881134033, + "learning_rate": 1.5807398847166943e-05, + "loss": 1.3518, + "mean_token_accuracy": 0.6729106456041336, + "num_tokens": 1043114289.0, + "step": 6210 + }, + { + "entropy": 1.714191863934199, + "epoch": 0.6823212765373101, + "grad_norm": 0.7927173376083374, + "learning_rate": 1.5806045198742743e-05, + "loss": 1.4358, + "mean_token_accuracy": 0.6493388712406158, + "num_tokens": 1043303392.0, + "step": 6211 + }, + { + "entropy": 1.7568532327810924, + "epoch": 0.6824311334486831, + "grad_norm": 0.7937549352645874, + "learning_rate": 1.5804691398209386e-05, + "loss": 1.3607, + "mean_token_accuracy": 0.673745925227801, + "num_tokens": 1043483335.0, + "step": 6212 + }, + { + "entropy": 1.7293648322423298, + "epoch": 0.682540990360056, + "grad_norm": 0.6161656975746155, + "learning_rate": 1.5803337445609705e-05, + "loss": 1.4712, + "mean_token_accuracy": 0.6608046044905981, + "num_tokens": 1043684540.0, + "step": 6213 + }, + { + "entropy": 1.6890058120091755, + "epoch": 0.682650847271429, + "grad_norm": 0.7689279913902283, + "learning_rate": 1.5801983340986556e-05, + "loss": 1.3705, + "mean_token_accuracy": 0.6688697884480158, + "num_tokens": 1043864929.0, + "step": 6214 + }, + { + "entropy": 1.6985827287038167, + "epoch": 0.6827607041828019, + "grad_norm": 0.7502182126045227, + "learning_rate": 1.58006290843828e-05, + "loss": 1.4028, + "mean_token_accuracy": 0.6728604584932327, + "num_tokens": 1044030942.0, + "step": 6215 + }, + { + "entropy": 1.7380037407080333, + "epoch": 0.6828705610941749, + "grad_norm": 0.7070371508598328, + "learning_rate": 1.57992746758413e-05, + "loss": 1.3499, + "mean_token_accuracy": 0.6579258392254511, + "num_tokens": 1044225669.0, + "step": 6216 + }, + { + "entropy": 1.7195264895757039, + "epoch": 0.6829804180055478, + "grad_norm": 0.6869642734527588, + "learning_rate": 1.5797920115404913e-05, + "loss": 1.5184, + "mean_token_accuracy": 0.639784961938858, + "num_tokens": 1044399612.0, + "step": 6217 + }, + { + "entropy": 1.7750992178916931, + "epoch": 0.6830902749169208, + "grad_norm": 0.7000865936279297, + "learning_rate": 1.579656540311652e-05, + "loss": 1.5207, + "mean_token_accuracy": 0.6384162952502569, + "num_tokens": 1044622029.0, + "step": 6218 + }, + { + "entropy": 1.7211995124816895, + "epoch": 0.6832001318282936, + "grad_norm": 0.6937360763549805, + "learning_rate": 1.5795210539018996e-05, + "loss": 1.3323, + "mean_token_accuracy": 0.6666232148806254, + "num_tokens": 1044760684.0, + "step": 6219 + }, + { + "entropy": 1.723791241645813, + "epoch": 0.6833099887396665, + "grad_norm": 0.6360770463943481, + "learning_rate": 1.5793855523155214e-05, + "loss": 1.4283, + "mean_token_accuracy": 0.6450154383977255, + "num_tokens": 1044965265.0, + "step": 6220 + }, + { + "entropy": 1.6446198523044586, + "epoch": 0.6834198456510395, + "grad_norm": 0.7213537096977234, + "learning_rate": 1.5792500355568068e-05, + "loss": 1.2592, + "mean_token_accuracy": 0.683139756321907, + "num_tokens": 1045099732.0, + "step": 6221 + }, + { + "entropy": 1.6635667781035106, + "epoch": 0.6835297025624124, + "grad_norm": 0.7380548119544983, + "learning_rate": 1.5791145036300442e-05, + "loss": 1.4444, + "mean_token_accuracy": 0.6551113228003184, + "num_tokens": 1045308158.0, + "step": 6222 + }, + { + "entropy": 1.6907614171504974, + "epoch": 0.6836395594737854, + "grad_norm": 0.7286980748176575, + "learning_rate": 1.578978956539524e-05, + "loss": 1.4878, + "mean_token_accuracy": 0.6524398873249689, + "num_tokens": 1045510413.0, + "step": 6223 + }, + { + "entropy": 1.662454883257548, + "epoch": 0.6837494163851583, + "grad_norm": 0.6473574042320251, + "learning_rate": 1.5788433942895355e-05, + "loss": 1.2923, + "mean_token_accuracy": 0.6852824489275614, + "num_tokens": 1045657586.0, + "step": 6224 + }, + { + "entropy": 1.670077880223592, + "epoch": 0.6838592732965313, + "grad_norm": 0.7075192332267761, + "learning_rate": 1.5787078168843692e-05, + "loss": 1.2946, + "mean_token_accuracy": 0.6709181269009908, + "num_tokens": 1045801180.0, + "step": 6225 + }, + { + "entropy": 1.6907998820145924, + "epoch": 0.6839691302079042, + "grad_norm": 0.8029499053955078, + "learning_rate": 1.578572224328316e-05, + "loss": 1.5165, + "mean_token_accuracy": 0.6446396013100942, + "num_tokens": 1046004096.0, + "step": 6226 + }, + { + "entropy": 1.6825261414051056, + "epoch": 0.6840789871192772, + "grad_norm": 0.6166555285453796, + "learning_rate": 1.578436616625668e-05, + "loss": 1.4492, + "mean_token_accuracy": 0.6457721889019012, + "num_tokens": 1046213155.0, + "step": 6227 + }, + { + "entropy": 1.6593104998270671, + "epoch": 0.6841888440306501, + "grad_norm": 0.6207210421562195, + "learning_rate": 1.5783009937807163e-05, + "loss": 1.438, + "mean_token_accuracy": 0.64504507680734, + "num_tokens": 1046418954.0, + "step": 6228 + }, + { + "entropy": 1.6873585283756256, + "epoch": 0.684298700942023, + "grad_norm": 0.7427178621292114, + "learning_rate": 1.578165355797754e-05, + "loss": 1.414, + "mean_token_accuracy": 0.6532572110493978, + "num_tokens": 1046565869.0, + "step": 6229 + }, + { + "entropy": 1.7066354652245839, + "epoch": 0.6844085578533959, + "grad_norm": 0.6317359209060669, + "learning_rate": 1.5780297026810735e-05, + "loss": 1.3242, + "mean_token_accuracy": 0.6639684538046519, + "num_tokens": 1046700589.0, + "step": 6230 + }, + { + "entropy": 1.677226612965266, + "epoch": 0.6845184147647689, + "grad_norm": 0.6120962500572205, + "learning_rate": 1.5778940344349683e-05, + "loss": 1.369, + "mean_token_accuracy": 0.6590713312228521, + "num_tokens": 1046882958.0, + "step": 6231 + }, + { + "entropy": 1.7622087995211284, + "epoch": 0.6846282716761418, + "grad_norm": 0.7370443940162659, + "learning_rate": 1.5777583510637322e-05, + "loss": 1.4434, + "mean_token_accuracy": 0.6658334483702978, + "num_tokens": 1047045020.0, + "step": 6232 + }, + { + "entropy": 1.6845203638076782, + "epoch": 0.6847381285875148, + "grad_norm": 0.7158510088920593, + "learning_rate": 1.5776226525716597e-05, + "loss": 1.5932, + "mean_token_accuracy": 0.6466062217950821, + "num_tokens": 1047231565.0, + "step": 6233 + }, + { + "entropy": 1.679573267698288, + "epoch": 0.6848479854988877, + "grad_norm": 0.6586665511131287, + "learning_rate": 1.5774869389630452e-05, + "loss": 1.4108, + "mean_token_accuracy": 0.6435726036628088, + "num_tokens": 1047446628.0, + "step": 6234 + }, + { + "entropy": 1.7317407031853993, + "epoch": 0.6849578424102606, + "grad_norm": 0.7508777976036072, + "learning_rate": 1.5773512102421845e-05, + "loss": 1.2569, + "mean_token_accuracy": 0.6819585313399633, + "num_tokens": 1047551424.0, + "step": 6235 + }, + { + "entropy": 1.718717743953069, + "epoch": 0.6850676993216336, + "grad_norm": 0.6454728841781616, + "learning_rate": 1.5772154664133728e-05, + "loss": 1.3644, + "mean_token_accuracy": 0.6561121046543121, + "num_tokens": 1047686224.0, + "step": 6236 + }, + { + "entropy": 1.662040372689565, + "epoch": 0.6851775562330065, + "grad_norm": 0.7368992567062378, + "learning_rate": 1.5770797074809072e-05, + "loss": 1.3664, + "mean_token_accuracy": 0.6641696294148763, + "num_tokens": 1047865517.0, + "step": 6237 + }, + { + "entropy": 1.6964517335096996, + "epoch": 0.6852874131443795, + "grad_norm": 0.7247259020805359, + "learning_rate": 1.5769439334490836e-05, + "loss": 1.4358, + "mean_token_accuracy": 0.6385401288668314, + "num_tokens": 1048103495.0, + "step": 6238 + }, + { + "entropy": 1.6373733182748158, + "epoch": 0.6853972700557524, + "grad_norm": 0.664681613445282, + "learning_rate": 1.576808144322199e-05, + "loss": 1.5413, + "mean_token_accuracy": 0.6597782919804255, + "num_tokens": 1048314364.0, + "step": 6239 + }, + { + "entropy": 1.689589689175288, + "epoch": 0.6855071269671253, + "grad_norm": 0.7749432921409607, + "learning_rate": 1.576672340104552e-05, + "loss": 1.331, + "mean_token_accuracy": 0.6749097357193629, + "num_tokens": 1048518232.0, + "step": 6240 + }, + { + "entropy": 1.64166193207105, + "epoch": 0.6856169838784982, + "grad_norm": 0.6244519352912903, + "learning_rate": 1.57653652080044e-05, + "loss": 1.4521, + "mean_token_accuracy": 0.64791539311409, + "num_tokens": 1048744637.0, + "step": 6241 + }, + { + "entropy": 1.7174479762713115, + "epoch": 0.6857268407898712, + "grad_norm": 0.6479034423828125, + "learning_rate": 1.576400686414162e-05, + "loss": 1.2893, + "mean_token_accuracy": 0.6633307288090388, + "num_tokens": 1048870167.0, + "step": 6242 + }, + { + "entropy": 1.6295473476250966, + "epoch": 0.6858366977012441, + "grad_norm": 0.6615036725997925, + "learning_rate": 1.5762648369500168e-05, + "loss": 1.2225, + "mean_token_accuracy": 0.6706995218992233, + "num_tokens": 1049004380.0, + "step": 6243 + }, + { + "entropy": 1.6601495742797852, + "epoch": 0.6859465546126171, + "grad_norm": 0.6963624358177185, + "learning_rate": 1.576128972412304e-05, + "loss": 1.4818, + "mean_token_accuracy": 0.6445038865009943, + "num_tokens": 1049221111.0, + "step": 6244 + }, + { + "entropy": 1.740762710571289, + "epoch": 0.68605641152399, + "grad_norm": 0.746908962726593, + "learning_rate": 1.575993092805324e-05, + "loss": 1.3535, + "mean_token_accuracy": 0.6617808093627294, + "num_tokens": 1049354467.0, + "step": 6245 + }, + { + "entropy": 1.7130144536495209, + "epoch": 0.686166268435363, + "grad_norm": 0.6983469724655151, + "learning_rate": 1.575857198133377e-05, + "loss": 1.3761, + "mean_token_accuracy": 0.6857681721448898, + "num_tokens": 1049515088.0, + "step": 6246 + }, + { + "entropy": 1.6818876961867015, + "epoch": 0.6862761253467359, + "grad_norm": 0.6924529671669006, + "learning_rate": 1.575721288400764e-05, + "loss": 1.3247, + "mean_token_accuracy": 0.661658505598704, + "num_tokens": 1049683399.0, + "step": 6247 + }, + { + "entropy": 1.683126876751582, + "epoch": 0.6863859822581088, + "grad_norm": 0.7934529185295105, + "learning_rate": 1.5755853636117868e-05, + "loss": 1.27, + "mean_token_accuracy": 0.6863173047701517, + "num_tokens": 1049839321.0, + "step": 6248 + }, + { + "entropy": 1.7363257110118866, + "epoch": 0.6864958391694818, + "grad_norm": 0.8478215932846069, + "learning_rate": 1.575449423770747e-05, + "loss": 1.4775, + "mean_token_accuracy": 0.641467904051145, + "num_tokens": 1050004112.0, + "step": 6249 + }, + { + "entropy": 1.7177755236625671, + "epoch": 0.6866056960808546, + "grad_norm": 0.7801303863525391, + "learning_rate": 1.575313468881947e-05, + "loss": 1.2044, + "mean_token_accuracy": 0.6855588108301163, + "num_tokens": 1050126993.0, + "step": 6250 + }, + { + "entropy": 1.6946475009123485, + "epoch": 0.6867155529922276, + "grad_norm": 0.6409901976585388, + "learning_rate": 1.5751774989496905e-05, + "loss": 1.4604, + "mean_token_accuracy": 0.6573443065087, + "num_tokens": 1050290667.0, + "step": 6251 + }, + { + "entropy": 1.7695001463095348, + "epoch": 0.6868254099036005, + "grad_norm": 0.8223657608032227, + "learning_rate": 1.5750415139782796e-05, + "loss": 1.3509, + "mean_token_accuracy": 0.661173606912295, + "num_tokens": 1050391272.0, + "step": 6252 + }, + { + "entropy": 1.7316470444202423, + "epoch": 0.6869352668149735, + "grad_norm": 0.6836245656013489, + "learning_rate": 1.5749055139720194e-05, + "loss": 1.2609, + "mean_token_accuracy": 0.6763073106606802, + "num_tokens": 1050517210.0, + "step": 6253 + }, + { + "entropy": 1.7469671567281086, + "epoch": 0.6870451237263464, + "grad_norm": 0.7463729381561279, + "learning_rate": 1.5747694989352133e-05, + "loss": 1.3966, + "mean_token_accuracy": 0.650151307384173, + "num_tokens": 1050692998.0, + "step": 6254 + }, + { + "entropy": 1.6787622570991516, + "epoch": 0.6871549806377194, + "grad_norm": 0.6898899078369141, + "learning_rate": 1.5746334688721668e-05, + "loss": 1.4101, + "mean_token_accuracy": 0.6534204135338465, + "num_tokens": 1050859750.0, + "step": 6255 + }, + { + "entropy": 1.677261749903361, + "epoch": 0.6872648375490923, + "grad_norm": 0.6710191369056702, + "learning_rate": 1.5744974237871844e-05, + "loss": 1.4762, + "mean_token_accuracy": 0.6464169124762217, + "num_tokens": 1051093727.0, + "step": 6256 + }, + { + "entropy": 1.6988216042518616, + "epoch": 0.6873746944604653, + "grad_norm": 0.6617063879966736, + "learning_rate": 1.5743613636845728e-05, + "loss": 1.4268, + "mean_token_accuracy": 0.6438167144854864, + "num_tokens": 1051229009.0, + "step": 6257 + }, + { + "entropy": 1.6750045617421467, + "epoch": 0.6874845513718382, + "grad_norm": 0.83284592628479, + "learning_rate": 1.5742252885686376e-05, + "loss": 1.3649, + "mean_token_accuracy": 0.6614518413941065, + "num_tokens": 1051365725.0, + "step": 6258 + }, + { + "entropy": 1.7287048399448395, + "epoch": 0.6875944082832112, + "grad_norm": 0.7290322780609131, + "learning_rate": 1.574089198443686e-05, + "loss": 1.4128, + "mean_token_accuracy": 0.6501226375500361, + "num_tokens": 1051502305.0, + "step": 6259 + }, + { + "entropy": 1.6979427337646484, + "epoch": 0.687704265194584, + "grad_norm": 0.8345046043395996, + "learning_rate": 1.5739530933140246e-05, + "loss": 1.3527, + "mean_token_accuracy": 0.6696512003739675, + "num_tokens": 1051651829.0, + "step": 6260 + }, + { + "entropy": 1.7011751234531403, + "epoch": 0.6878141221059569, + "grad_norm": 0.6528536677360535, + "learning_rate": 1.5738169731839614e-05, + "loss": 1.49, + "mean_token_accuracy": 0.6397085040807724, + "num_tokens": 1051851383.0, + "step": 6261 + }, + { + "entropy": 1.6753086646397908, + "epoch": 0.6879239790173299, + "grad_norm": 0.7235574126243591, + "learning_rate": 1.5736808380578046e-05, + "loss": 1.262, + "mean_token_accuracy": 0.6736620018879572, + "num_tokens": 1051992386.0, + "step": 6262 + }, + { + "entropy": 1.710617204507192, + "epoch": 0.6880338359287028, + "grad_norm": 0.6896214485168457, + "learning_rate": 1.5735446879398623e-05, + "loss": 1.3153, + "mean_token_accuracy": 0.6659458925326666, + "num_tokens": 1052110869.0, + "step": 6263 + }, + { + "entropy": 1.7205885648727417, + "epoch": 0.6881436928400758, + "grad_norm": 0.7014085650444031, + "learning_rate": 1.5734085228344444e-05, + "loss": 1.5599, + "mean_token_accuracy": 0.6437405745188395, + "num_tokens": 1052279865.0, + "step": 6264 + }, + { + "entropy": 1.7538027067979176, + "epoch": 0.6882535497514487, + "grad_norm": 0.7048670053482056, + "learning_rate": 1.57327234274586e-05, + "loss": 1.4151, + "mean_token_accuracy": 0.6546515574057897, + "num_tokens": 1052427858.0, + "step": 6265 + }, + { + "entropy": 1.7071273426214855, + "epoch": 0.6883634066628217, + "grad_norm": 0.7168692350387573, + "learning_rate": 1.5731361476784194e-05, + "loss": 1.4215, + "mean_token_accuracy": 0.6494481960932413, + "num_tokens": 1052601924.0, + "step": 6266 + }, + { + "entropy": 1.7125795582930248, + "epoch": 0.6884732635741946, + "grad_norm": 0.9502230882644653, + "learning_rate": 1.5729999376364325e-05, + "loss": 1.4322, + "mean_token_accuracy": 0.6579029063383738, + "num_tokens": 1052754664.0, + "step": 6267 + }, + { + "entropy": 1.7782972554365795, + "epoch": 0.6885831204855676, + "grad_norm": 0.7288692593574524, + "learning_rate": 1.572863712624211e-05, + "loss": 1.4447, + "mean_token_accuracy": 0.6494678606589636, + "num_tokens": 1052874157.0, + "step": 6268 + }, + { + "entropy": 1.762984275817871, + "epoch": 0.6886929773969405, + "grad_norm": 0.8050958514213562, + "learning_rate": 1.5727274726460663e-05, + "loss": 1.3848, + "mean_token_accuracy": 0.6562901983658472, + "num_tokens": 1053025611.0, + "step": 6269 + }, + { + "entropy": 1.7107179462909698, + "epoch": 0.6888028343083135, + "grad_norm": 0.6803024411201477, + "learning_rate": 1.57259121770631e-05, + "loss": 1.4613, + "mean_token_accuracy": 0.6524020483096441, + "num_tokens": 1053217911.0, + "step": 6270 + }, + { + "entropy": 1.7233806550502777, + "epoch": 0.6889126912196863, + "grad_norm": 0.811576783657074, + "learning_rate": 1.5724549478092544e-05, + "loss": 1.446, + "mean_token_accuracy": 0.6530528217554092, + "num_tokens": 1053439531.0, + "step": 6271 + }, + { + "entropy": 1.7882909178733826, + "epoch": 0.6890225481310593, + "grad_norm": 1.5311130285263062, + "learning_rate": 1.572318662959213e-05, + "loss": 1.5377, + "mean_token_accuracy": 0.6330806364615759, + "num_tokens": 1053607774.0, + "step": 6272 + }, + { + "entropy": 1.714803675810496, + "epoch": 0.6891324050424322, + "grad_norm": 0.6682481169700623, + "learning_rate": 1.572182363160498e-05, + "loss": 1.4491, + "mean_token_accuracy": 0.6306456079085668, + "num_tokens": 1053808448.0, + "step": 6273 + }, + { + "entropy": 1.7131713926792145, + "epoch": 0.6892422619538052, + "grad_norm": 0.6053488254547119, + "learning_rate": 1.5720460484174248e-05, + "loss": 1.3343, + "mean_token_accuracy": 0.6529037654399872, + "num_tokens": 1053985060.0, + "step": 6274 + }, + { + "entropy": 1.7150470713774364, + "epoch": 0.6893521188651781, + "grad_norm": 0.5887476801872253, + "learning_rate": 1.571909718734306e-05, + "loss": 1.4858, + "mean_token_accuracy": 0.638887827595075, + "num_tokens": 1054203107.0, + "step": 6275 + }, + { + "entropy": 1.6366633176803589, + "epoch": 0.689461975776551, + "grad_norm": 0.7320172786712646, + "learning_rate": 1.5717733741154578e-05, + "loss": 1.3864, + "mean_token_accuracy": 0.6679045160611471, + "num_tokens": 1054457206.0, + "step": 6276 + }, + { + "entropy": 1.7303629020849864, + "epoch": 0.689571832687924, + "grad_norm": 0.7379525303840637, + "learning_rate": 1.5716370145651952e-05, + "loss": 1.2187, + "mean_token_accuracy": 0.6834086825450262, + "num_tokens": 1054595268.0, + "step": 6277 + }, + { + "entropy": 1.688062181075414, + "epoch": 0.6896816895992969, + "grad_norm": 0.620086669921875, + "learning_rate": 1.571500640087833e-05, + "loss": 1.4843, + "mean_token_accuracy": 0.6472740769386292, + "num_tokens": 1054820248.0, + "step": 6278 + }, + { + "entropy": 1.6793719629446666, + "epoch": 0.6897915465106699, + "grad_norm": 0.7746139764785767, + "learning_rate": 1.5713642506876882e-05, + "loss": 1.3796, + "mean_token_accuracy": 0.6590060293674469, + "num_tokens": 1054990280.0, + "step": 6279 + }, + { + "entropy": 1.7257548173268635, + "epoch": 0.6899014034220428, + "grad_norm": 0.5978219509124756, + "learning_rate": 1.5712278463690774e-05, + "loss": 1.5817, + "mean_token_accuracy": 0.6256022801001867, + "num_tokens": 1055207569.0, + "step": 6280 + }, + { + "entropy": 1.7200091977914174, + "epoch": 0.6900112603334158, + "grad_norm": 0.6994427442550659, + "learning_rate": 1.5710914271363177e-05, + "loss": 1.2819, + "mean_token_accuracy": 0.6736390839020411, + "num_tokens": 1055332726.0, + "step": 6281 + }, + { + "entropy": 1.6921034355958302, + "epoch": 0.6901211172447886, + "grad_norm": 0.6004651784896851, + "learning_rate": 1.5709549929937263e-05, + "loss": 1.355, + "mean_token_accuracy": 0.6671501100063324, + "num_tokens": 1055490790.0, + "step": 6282 + }, + { + "entropy": 1.683781623840332, + "epoch": 0.6902309741561616, + "grad_norm": 0.7334898114204407, + "learning_rate": 1.5708185439456216e-05, + "loss": 1.3117, + "mean_token_accuracy": 0.6710262993971506, + "num_tokens": 1055607208.0, + "step": 6283 + }, + { + "entropy": 1.6216355661551158, + "epoch": 0.6903408310675345, + "grad_norm": 0.6549416780471802, + "learning_rate": 1.570682079996322e-05, + "loss": 1.3981, + "mean_token_accuracy": 0.6610698650280634, + "num_tokens": 1055755985.0, + "step": 6284 + }, + { + "entropy": 1.6326968371868134, + "epoch": 0.6904506879789075, + "grad_norm": 0.7157843708992004, + "learning_rate": 1.570545601150147e-05, + "loss": 1.4508, + "mean_token_accuracy": 0.6708127508560816, + "num_tokens": 1055946453.0, + "step": 6285 + }, + { + "entropy": 1.7335582971572876, + "epoch": 0.6905605448902804, + "grad_norm": 0.8362358212471008, + "learning_rate": 1.570409107411416e-05, + "loss": 1.3361, + "mean_token_accuracy": 0.654664600888888, + "num_tokens": 1056076709.0, + "step": 6286 + }, + { + "entropy": 1.7172273596127827, + "epoch": 0.6906704018016534, + "grad_norm": 0.6369051933288574, + "learning_rate": 1.5702725987844483e-05, + "loss": 1.4074, + "mean_token_accuracy": 0.6473657737175623, + "num_tokens": 1056239339.0, + "step": 6287 + }, + { + "entropy": 1.7123624682426453, + "epoch": 0.6907802587130263, + "grad_norm": 0.7884210348129272, + "learning_rate": 1.5701360752735648e-05, + "loss": 1.3425, + "mean_token_accuracy": 0.6655222127834955, + "num_tokens": 1056418414.0, + "step": 6288 + }, + { + "entropy": 1.6548800269762676, + "epoch": 0.6908901156243992, + "grad_norm": 0.5178220868110657, + "learning_rate": 1.5699995368830866e-05, + "loss": 1.3778, + "mean_token_accuracy": 0.6649152934551239, + "num_tokens": 1056649643.0, + "step": 6289 + }, + { + "entropy": 1.7105699678262074, + "epoch": 0.6909999725357722, + "grad_norm": 0.7892933487892151, + "learning_rate": 1.5698629836173346e-05, + "loss": 1.4091, + "mean_token_accuracy": 0.6649167090654373, + "num_tokens": 1056777360.0, + "step": 6290 + }, + { + "entropy": 1.7512224813302357, + "epoch": 0.691109829447145, + "grad_norm": 0.893334686756134, + "learning_rate": 1.5697264154806307e-05, + "loss": 1.4202, + "mean_token_accuracy": 0.6645146906375885, + "num_tokens": 1056975900.0, + "step": 6291 + }, + { + "entropy": 1.7694937487443287, + "epoch": 0.691219686358518, + "grad_norm": 0.6863387823104858, + "learning_rate": 1.569589832477298e-05, + "loss": 1.3598, + "mean_token_accuracy": 0.653782253464063, + "num_tokens": 1057120050.0, + "step": 6292 + }, + { + "entropy": 1.7361672918001811, + "epoch": 0.6913295432698909, + "grad_norm": 0.8135111927986145, + "learning_rate": 1.5694532346116583e-05, + "loss": 1.4873, + "mean_token_accuracy": 0.6366982012987137, + "num_tokens": 1057309306.0, + "step": 6293 + }, + { + "entropy": 1.6364782353242238, + "epoch": 0.6914394001812639, + "grad_norm": 0.6994947195053101, + "learning_rate": 1.5693166218880352e-05, + "loss": 1.3949, + "mean_token_accuracy": 0.6615471492211024, + "num_tokens": 1057484747.0, + "step": 6294 + }, + { + "entropy": 1.683074374993642, + "epoch": 0.6915492570926368, + "grad_norm": 0.6820729970932007, + "learning_rate": 1.5691799943107525e-05, + "loss": 1.3869, + "mean_token_accuracy": 0.6594087183475494, + "num_tokens": 1057653783.0, + "step": 6295 + }, + { + "entropy": 1.7118146419525146, + "epoch": 0.6916591140040098, + "grad_norm": 0.6727958917617798, + "learning_rate": 1.569043351884135e-05, + "loss": 1.3533, + "mean_token_accuracy": 0.6669302682081858, + "num_tokens": 1057793762.0, + "step": 6296 + }, + { + "entropy": 1.7263545592625935, + "epoch": 0.6917689709153827, + "grad_norm": 0.6792665123939514, + "learning_rate": 1.568906694612506e-05, + "loss": 1.4398, + "mean_token_accuracy": 0.6504052480061849, + "num_tokens": 1057966395.0, + "step": 6297 + }, + { + "entropy": 1.6999529401461284, + "epoch": 0.6918788278267557, + "grad_norm": 0.689646303653717, + "learning_rate": 1.5687700225001918e-05, + "loss": 1.227, + "mean_token_accuracy": 0.6813174386819204, + "num_tokens": 1058117252.0, + "step": 6298 + }, + { + "entropy": 1.6544790466626484, + "epoch": 0.6919886847381286, + "grad_norm": 0.7752074003219604, + "learning_rate": 1.5686333355515174e-05, + "loss": 1.3064, + "mean_token_accuracy": 0.6794366339842478, + "num_tokens": 1058273397.0, + "step": 6299 + }, + { + "entropy": 1.655057470003764, + "epoch": 0.6920985416495016, + "grad_norm": 0.6356586217880249, + "learning_rate": 1.5684966337708092e-05, + "loss": 1.3602, + "mean_token_accuracy": 0.6644938240448633, + "num_tokens": 1058443764.0, + "step": 6300 + }, + { + "entropy": 1.7445928851763408, + "epoch": 0.6922083985608745, + "grad_norm": 0.6769724488258362, + "learning_rate": 1.568359917162394e-05, + "loss": 1.3361, + "mean_token_accuracy": 0.6576990932226181, + "num_tokens": 1058589677.0, + "step": 6301 + }, + { + "entropy": 1.7161558667818706, + "epoch": 0.6923182554722473, + "grad_norm": 0.5942689180374146, + "learning_rate": 1.5682231857305978e-05, + "loss": 1.439, + "mean_token_accuracy": 0.6340092917283376, + "num_tokens": 1058821342.0, + "step": 6302 + }, + { + "entropy": 1.6530260841051738, + "epoch": 0.6924281123836203, + "grad_norm": 0.7436016798019409, + "learning_rate": 1.5680864394797492e-05, + "loss": 1.2778, + "mean_token_accuracy": 0.6772982229789098, + "num_tokens": 1058986237.0, + "step": 6303 + }, + { + "entropy": 1.7283445000648499, + "epoch": 0.6925379692949932, + "grad_norm": 0.588995635509491, + "learning_rate": 1.5679496784141757e-05, + "loss": 1.4262, + "mean_token_accuracy": 0.6482555766900381, + "num_tokens": 1059187307.0, + "step": 6304 + }, + { + "entropy": 1.6520406504472096, + "epoch": 0.6926478262063662, + "grad_norm": 0.5398334860801697, + "learning_rate": 1.5678129025382055e-05, + "loss": 1.3553, + "mean_token_accuracy": 0.6467950393756231, + "num_tokens": 1059396490.0, + "step": 6305 + }, + { + "entropy": 1.739362935225169, + "epoch": 0.6927576831177391, + "grad_norm": 0.7606070041656494, + "learning_rate": 1.5676761118561677e-05, + "loss": 1.2727, + "mean_token_accuracy": 0.6647885292768478, + "num_tokens": 1059501468.0, + "step": 6306 + }, + { + "entropy": 1.7537157237529755, + "epoch": 0.6928675400291121, + "grad_norm": 0.7173927426338196, + "learning_rate": 1.567539306372392e-05, + "loss": 1.3187, + "mean_token_accuracy": 0.664209653933843, + "num_tokens": 1059610640.0, + "step": 6307 + }, + { + "entropy": 1.699594388405482, + "epoch": 0.692977396940485, + "grad_norm": 0.6624416708946228, + "learning_rate": 1.5674024860912082e-05, + "loss": 1.3283, + "mean_token_accuracy": 0.6614074061314265, + "num_tokens": 1059746715.0, + "step": 6308 + }, + { + "entropy": 1.6853571037451427, + "epoch": 0.693087253851858, + "grad_norm": 0.6250735521316528, + "learning_rate": 1.5672656510169458e-05, + "loss": 1.4415, + "mean_token_accuracy": 0.6549326082070669, + "num_tokens": 1059928735.0, + "step": 6309 + }, + { + "entropy": 1.7501811683177948, + "epoch": 0.6931971107632309, + "grad_norm": 0.6659023761749268, + "learning_rate": 1.5671288011539364e-05, + "loss": 1.4479, + "mean_token_accuracy": 0.6498491813739141, + "num_tokens": 1060079413.0, + "step": 6310 + }, + { + "entropy": 1.6326484680175781, + "epoch": 0.6933069676746039, + "grad_norm": 0.5456228256225586, + "learning_rate": 1.5669919365065108e-05, + "loss": 1.4142, + "mean_token_accuracy": 0.6575722495714823, + "num_tokens": 1060259112.0, + "step": 6311 + }, + { + "entropy": 1.615660309791565, + "epoch": 0.6934168245859768, + "grad_norm": 0.8097618222236633, + "learning_rate": 1.5668550570790005e-05, + "loss": 1.5787, + "mean_token_accuracy": 0.6560301234324774, + "num_tokens": 1060428273.0, + "step": 6312 + }, + { + "entropy": 1.7176474730173747, + "epoch": 0.6935266814973498, + "grad_norm": 0.8573592901229858, + "learning_rate": 1.5667181628757388e-05, + "loss": 1.3698, + "mean_token_accuracy": 0.6745945662260056, + "num_tokens": 1060607857.0, + "step": 6313 + }, + { + "entropy": 1.6333107848962147, + "epoch": 0.6936365384087226, + "grad_norm": 0.6240670680999756, + "learning_rate": 1.566581253901057e-05, + "loss": 1.2348, + "mean_token_accuracy": 0.6824917644262314, + "num_tokens": 1060749584.0, + "step": 6314 + }, + { + "entropy": 1.7455682655175526, + "epoch": 0.6937463953200955, + "grad_norm": 0.6654044985771179, + "learning_rate": 1.5664443301592887e-05, + "loss": 1.4652, + "mean_token_accuracy": 0.6368297090133032, + "num_tokens": 1060897523.0, + "step": 6315 + }, + { + "entropy": 1.7202888826529186, + "epoch": 0.6938562522314685, + "grad_norm": 0.6972677707672119, + "learning_rate": 1.5663073916547676e-05, + "loss": 1.3784, + "mean_token_accuracy": 0.6670991877714793, + "num_tokens": 1061067739.0, + "step": 6316 + }, + { + "entropy": 1.6854785978794098, + "epoch": 0.6939661091428414, + "grad_norm": 1.7558538913726807, + "learning_rate": 1.5661704383918277e-05, + "loss": 1.4532, + "mean_token_accuracy": 0.6418699026107788, + "num_tokens": 1061300533.0, + "step": 6317 + }, + { + "entropy": 1.6643561919530232, + "epoch": 0.6940759660542144, + "grad_norm": 0.7051677107810974, + "learning_rate": 1.5660334703748037e-05, + "loss": 1.3727, + "mean_token_accuracy": 0.6603148529926935, + "num_tokens": 1061439950.0, + "step": 6318 + }, + { + "entropy": 1.7029017508029938, + "epoch": 0.6941858229655873, + "grad_norm": 0.7083843946456909, + "learning_rate": 1.5658964876080304e-05, + "loss": 1.3186, + "mean_token_accuracy": 0.6736210584640503, + "num_tokens": 1061611747.0, + "step": 6319 + }, + { + "entropy": 1.6688139041264851, + "epoch": 0.6942956798769603, + "grad_norm": 0.6067943572998047, + "learning_rate": 1.565759490095843e-05, + "loss": 1.5312, + "mean_token_accuracy": 0.6481608798106512, + "num_tokens": 1061835071.0, + "step": 6320 + }, + { + "entropy": 1.7166197299957275, + "epoch": 0.6944055367883332, + "grad_norm": 0.6531895995140076, + "learning_rate": 1.5656224778425776e-05, + "loss": 1.4703, + "mean_token_accuracy": 0.6500556915998459, + "num_tokens": 1062056631.0, + "step": 6321 + }, + { + "entropy": 1.6937325994173686, + "epoch": 0.6945153936997062, + "grad_norm": 0.6659431457519531, + "learning_rate": 1.565485450852571e-05, + "loss": 1.5078, + "mean_token_accuracy": 0.6421338419119517, + "num_tokens": 1062239900.0, + "step": 6322 + }, + { + "entropy": 1.7239821255207062, + "epoch": 0.694625250611079, + "grad_norm": 0.7423164248466492, + "learning_rate": 1.5653484091301588e-05, + "loss": 1.2976, + "mean_token_accuracy": 0.6711178521315256, + "num_tokens": 1062353554.0, + "step": 6323 + }, + { + "entropy": 1.633136639992396, + "epoch": 0.694735107522452, + "grad_norm": 0.6296790242195129, + "learning_rate": 1.5652113526796798e-05, + "loss": 1.3606, + "mean_token_accuracy": 0.6649264395236969, + "num_tokens": 1062555732.0, + "step": 6324 + }, + { + "entropy": 1.7429419159889221, + "epoch": 0.6948449644338249, + "grad_norm": 0.910716712474823, + "learning_rate": 1.5650742815054706e-05, + "loss": 1.4169, + "mean_token_accuracy": 0.6470295091470083, + "num_tokens": 1062773381.0, + "step": 6325 + }, + { + "entropy": 1.6380923291047413, + "epoch": 0.6949548213451979, + "grad_norm": 0.5968899726867676, + "learning_rate": 1.564937195611871e-05, + "loss": 1.3611, + "mean_token_accuracy": 0.6606245140234629, + "num_tokens": 1062970446.0, + "step": 6326 + }, + { + "entropy": 1.7097805937131245, + "epoch": 0.6950646782565708, + "grad_norm": 0.8309935331344604, + "learning_rate": 1.5648000950032177e-05, + "loss": 1.3135, + "mean_token_accuracy": 0.6708834419647852, + "num_tokens": 1063075251.0, + "step": 6327 + }, + { + "entropy": 1.687490463256836, + "epoch": 0.6951745351679438, + "grad_norm": 0.6703983545303345, + "learning_rate": 1.564662979683851e-05, + "loss": 1.3279, + "mean_token_accuracy": 0.6707485318183899, + "num_tokens": 1063191541.0, + "step": 6328 + }, + { + "entropy": 1.6575310031572978, + "epoch": 0.6952843920793167, + "grad_norm": 0.6347379088401794, + "learning_rate": 1.5645258496581105e-05, + "loss": 1.4302, + "mean_token_accuracy": 0.6440733820199966, + "num_tokens": 1063388167.0, + "step": 6329 + }, + { + "entropy": 1.6861818035443623, + "epoch": 0.6953942489906896, + "grad_norm": 0.6363089680671692, + "learning_rate": 1.564388704930336e-05, + "loss": 1.3444, + "mean_token_accuracy": 0.662903368473053, + "num_tokens": 1063543659.0, + "step": 6330 + }, + { + "entropy": 1.7344481647014618, + "epoch": 0.6955041059020626, + "grad_norm": 0.8041152358055115, + "learning_rate": 1.5642515455048684e-05, + "loss": 1.3584, + "mean_token_accuracy": 0.686911458770434, + "num_tokens": 1063676616.0, + "step": 6331 + }, + { + "entropy": 1.7182823022206624, + "epoch": 0.6956139628134355, + "grad_norm": 0.7374937534332275, + "learning_rate": 1.5641143713860485e-05, + "loss": 1.406, + "mean_token_accuracy": 0.6564443459113439, + "num_tokens": 1063821043.0, + "step": 6332 + }, + { + "entropy": 1.6801859041055043, + "epoch": 0.6957238197248085, + "grad_norm": 0.8506401181221008, + "learning_rate": 1.563977182578218e-05, + "loss": 1.4541, + "mean_token_accuracy": 0.6540651917457581, + "num_tokens": 1063987781.0, + "step": 6333 + }, + { + "entropy": 1.6580947836240132, + "epoch": 0.6958336766361813, + "grad_norm": 0.5996966361999512, + "learning_rate": 1.563839979085719e-05, + "loss": 1.3718, + "mean_token_accuracy": 0.663277710477511, + "num_tokens": 1064197609.0, + "step": 6334 + }, + { + "entropy": 1.7448161741097767, + "epoch": 0.6959435335475543, + "grad_norm": 0.6750478148460388, + "learning_rate": 1.563702760912893e-05, + "loss": 1.3593, + "mean_token_accuracy": 0.6488836805025736, + "num_tokens": 1064346954.0, + "step": 6335 + }, + { + "entropy": 1.6830095052719116, + "epoch": 0.6960533904589272, + "grad_norm": 0.7102033495903015, + "learning_rate": 1.5635655280640844e-05, + "loss": 1.4087, + "mean_token_accuracy": 0.6654968212048212, + "num_tokens": 1064517282.0, + "step": 6336 + }, + { + "entropy": 1.7004373967647552, + "epoch": 0.6961632473703002, + "grad_norm": 0.6220065355300903, + "learning_rate": 1.563428280543635e-05, + "loss": 1.363, + "mean_token_accuracy": 0.6625643819570541, + "num_tokens": 1064668244.0, + "step": 6337 + }, + { + "entropy": 1.6941338976224263, + "epoch": 0.6962731042816731, + "grad_norm": 0.6489022970199585, + "learning_rate": 1.5632910183558895e-05, + "loss": 1.3424, + "mean_token_accuracy": 0.6700575947761536, + "num_tokens": 1064823692.0, + "step": 6338 + }, + { + "entropy": 1.676298052072525, + "epoch": 0.6963829611930461, + "grad_norm": 0.7490513920783997, + "learning_rate": 1.5631537415051927e-05, + "loss": 1.3607, + "mean_token_accuracy": 0.6670024891694387, + "num_tokens": 1064991450.0, + "step": 6339 + }, + { + "entropy": 1.6780159771442413, + "epoch": 0.696492818104419, + "grad_norm": 0.8336478471755981, + "learning_rate": 1.5630164499958876e-05, + "loss": 1.5349, + "mean_token_accuracy": 0.6541972657044729, + "num_tokens": 1065167869.0, + "step": 6340 + }, + { + "entropy": 1.693702240784963, + "epoch": 0.696602675015792, + "grad_norm": 0.6953732967376709, + "learning_rate": 1.562879143832321e-05, + "loss": 1.2524, + "mean_token_accuracy": 0.6819103260835012, + "num_tokens": 1065309858.0, + "step": 6341 + }, + { + "entropy": 1.684336523214976, + "epoch": 0.6967125319271649, + "grad_norm": 0.6483939290046692, + "learning_rate": 1.562741823018838e-05, + "loss": 1.2624, + "mean_token_accuracy": 0.6634860585133234, + "num_tokens": 1065503318.0, + "step": 6342 + }, + { + "entropy": 1.7288965284824371, + "epoch": 0.6968223888385378, + "grad_norm": 0.7463001012802124, + "learning_rate": 1.562604487559785e-05, + "loss": 1.5298, + "mean_token_accuracy": 0.6451147546370825, + "num_tokens": 1065693529.0, + "step": 6343 + }, + { + "entropy": 1.7293485403060913, + "epoch": 0.6969322457499108, + "grad_norm": 0.6564697623252869, + "learning_rate": 1.5624671374595083e-05, + "loss": 1.3069, + "mean_token_accuracy": 0.6628097891807556, + "num_tokens": 1065829037.0, + "step": 6344 + }, + { + "entropy": 1.738810787598292, + "epoch": 0.6970421026612836, + "grad_norm": 0.6779906749725342, + "learning_rate": 1.5623297727223554e-05, + "loss": 1.3215, + "mean_token_accuracy": 0.6662501196066538, + "num_tokens": 1065959965.0, + "step": 6345 + }, + { + "entropy": 1.7129548887411754, + "epoch": 0.6971519595726566, + "grad_norm": 0.814060628414154, + "learning_rate": 1.5621923933526734e-05, + "loss": 1.3439, + "mean_token_accuracy": 0.6748589227596918, + "num_tokens": 1066076653.0, + "step": 6346 + }, + { + "entropy": 1.7468621532122295, + "epoch": 0.6972618164840295, + "grad_norm": 0.6097841858863831, + "learning_rate": 1.56205499935481e-05, + "loss": 1.4377, + "mean_token_accuracy": 0.6586494793494543, + "num_tokens": 1066260701.0, + "step": 6347 + }, + { + "entropy": 1.6852293213208516, + "epoch": 0.6973716733954025, + "grad_norm": 0.6476978063583374, + "learning_rate": 1.561917590733115e-05, + "loss": 1.332, + "mean_token_accuracy": 0.6707625389099121, + "num_tokens": 1066460345.0, + "step": 6348 + }, + { + "entropy": 1.7005867660045624, + "epoch": 0.6974815303067754, + "grad_norm": 0.6457695364952087, + "learning_rate": 1.5617801674919353e-05, + "loss": 1.4474, + "mean_token_accuracy": 0.649574855963389, + "num_tokens": 1066634701.0, + "step": 6349 + }, + { + "entropy": 1.6940444807211559, + "epoch": 0.6975913872181484, + "grad_norm": 0.7139136791229248, + "learning_rate": 1.5616427296356217e-05, + "loss": 1.3646, + "mean_token_accuracy": 0.6607652654250463, + "num_tokens": 1066769091.0, + "step": 6350 + }, + { + "entropy": 1.6971173187096913, + "epoch": 0.6977012441295213, + "grad_norm": 0.7305136919021606, + "learning_rate": 1.561505277168524e-05, + "loss": 1.3967, + "mean_token_accuracy": 0.6508905241886774, + "num_tokens": 1066944238.0, + "step": 6351 + }, + { + "entropy": 1.6950480441252391, + "epoch": 0.6978111010408943, + "grad_norm": 0.8133467435836792, + "learning_rate": 1.561367810094992e-05, + "loss": 1.4793, + "mean_token_accuracy": 0.6544815003871918, + "num_tokens": 1067126672.0, + "step": 6352 + }, + { + "entropy": 1.7079233924547832, + "epoch": 0.6979209579522672, + "grad_norm": 0.7765207290649414, + "learning_rate": 1.5612303284193765e-05, + "loss": 1.4357, + "mean_token_accuracy": 0.6562918275594711, + "num_tokens": 1067302213.0, + "step": 6353 + }, + { + "entropy": 1.7212556799252827, + "epoch": 0.6980308148636402, + "grad_norm": 0.7137874364852905, + "learning_rate": 1.5610928321460296e-05, + "loss": 1.3325, + "mean_token_accuracy": 0.6592159370581309, + "num_tokens": 1067451247.0, + "step": 6354 + }, + { + "entropy": 1.7126038074493408, + "epoch": 0.698140671775013, + "grad_norm": 0.8555010557174683, + "learning_rate": 1.5609553212793018e-05, + "loss": 1.4302, + "mean_token_accuracy": 0.6475658317406973, + "num_tokens": 1067578152.0, + "step": 6355 + }, + { + "entropy": 1.695282369852066, + "epoch": 0.6982505286863859, + "grad_norm": 0.6748037338256836, + "learning_rate": 1.5608177958235462e-05, + "loss": 1.267, + "mean_token_accuracy": 0.6690774112939835, + "num_tokens": 1067684477.0, + "step": 6356 + }, + { + "entropy": 1.69649139046669, + "epoch": 0.6983603855977589, + "grad_norm": 0.7423410415649414, + "learning_rate": 1.560680255783115e-05, + "loss": 1.2596, + "mean_token_accuracy": 0.6732803036769232, + "num_tokens": 1067800400.0, + "step": 6357 + }, + { + "entropy": 1.7296584745248158, + "epoch": 0.6984702425091318, + "grad_norm": 0.6657732129096985, + "learning_rate": 1.560542701162361e-05, + "loss": 1.4479, + "mean_token_accuracy": 0.6483340859413147, + "num_tokens": 1068016618.0, + "step": 6358 + }, + { + "entropy": 1.7768322229385376, + "epoch": 0.6985800994205048, + "grad_norm": 0.7509652972221375, + "learning_rate": 1.5604051319656378e-05, + "loss": 1.5285, + "mean_token_accuracy": 0.6426715403795242, + "num_tokens": 1068197524.0, + "step": 6359 + }, + { + "entropy": 1.729514628648758, + "epoch": 0.6986899563318777, + "grad_norm": 0.8853446841239929, + "learning_rate": 1.5602675481973003e-05, + "loss": 1.3558, + "mean_token_accuracy": 0.6598193844159445, + "num_tokens": 1068352214.0, + "step": 6360 + }, + { + "entropy": 1.714291383822759, + "epoch": 0.6987998132432507, + "grad_norm": 0.6338637471199036, + "learning_rate": 1.5601299498617017e-05, + "loss": 1.5695, + "mean_token_accuracy": 0.6288912991682688, + "num_tokens": 1068538787.0, + "step": 6361 + }, + { + "entropy": 1.7057184378306072, + "epoch": 0.6989096701546236, + "grad_norm": 0.7257465124130249, + "learning_rate": 1.5599923369631977e-05, + "loss": 1.3388, + "mean_token_accuracy": 0.661540021498998, + "num_tokens": 1068693499.0, + "step": 6362 + }, + { + "entropy": 1.749087264140447, + "epoch": 0.6990195270659966, + "grad_norm": 0.7464898228645325, + "learning_rate": 1.559854709506144e-05, + "loss": 1.2842, + "mean_token_accuracy": 0.6702013909816742, + "num_tokens": 1068847863.0, + "step": 6363 + }, + { + "entropy": 1.7329839169979095, + "epoch": 0.6991293839773695, + "grad_norm": 0.6883919835090637, + "learning_rate": 1.5597170674948956e-05, + "loss": 1.4929, + "mean_token_accuracy": 0.6517574687798818, + "num_tokens": 1069021234.0, + "step": 6364 + }, + { + "entropy": 1.7091910441716511, + "epoch": 0.6992392408887425, + "grad_norm": 0.5777117013931274, + "learning_rate": 1.5595794109338087e-05, + "loss": 1.4065, + "mean_token_accuracy": 0.6439725557963053, + "num_tokens": 1069203920.0, + "step": 6365 + }, + { + "entropy": 1.6956571837266285, + "epoch": 0.6993490978001153, + "grad_norm": 0.6748632192611694, + "learning_rate": 1.559441739827241e-05, + "loss": 1.4705, + "mean_token_accuracy": 0.6408219436804453, + "num_tokens": 1069401303.0, + "step": 6366 + }, + { + "entropy": 1.768018513917923, + "epoch": 0.6994589547114883, + "grad_norm": 0.6776396036148071, + "learning_rate": 1.5593040541795494e-05, + "loss": 1.415, + "mean_token_accuracy": 0.6665412137905756, + "num_tokens": 1069527841.0, + "step": 6367 + }, + { + "entropy": 1.6495687067508698, + "epoch": 0.6995688116228612, + "grad_norm": 0.6302627921104431, + "learning_rate": 1.559166353995091e-05, + "loss": 1.421, + "mean_token_accuracy": 0.6526271998882294, + "num_tokens": 1069725307.0, + "step": 6368 + }, + { + "entropy": 1.7440635164578755, + "epoch": 0.6996786685342341, + "grad_norm": 0.6958877444267273, + "learning_rate": 1.559028639278225e-05, + "loss": 1.4643, + "mean_token_accuracy": 0.6413827786842982, + "num_tokens": 1069924701.0, + "step": 6369 + }, + { + "entropy": 1.7379199266433716, + "epoch": 0.6997885254456071, + "grad_norm": 0.7230368256568909, + "learning_rate": 1.5588909100333093e-05, + "loss": 1.4683, + "mean_token_accuracy": 0.6515718946854273, + "num_tokens": 1070076085.0, + "step": 6370 + }, + { + "entropy": 1.6385211845239003, + "epoch": 0.69989838235698, + "grad_norm": 0.628541886806488, + "learning_rate": 1.5587531662647025e-05, + "loss": 1.4062, + "mean_token_accuracy": 0.6495350897312164, + "num_tokens": 1070269052.0, + "step": 6371 + }, + { + "entropy": 1.7529702385266621, + "epoch": 0.700008239268353, + "grad_norm": 0.6730430126190186, + "learning_rate": 1.558615407976765e-05, + "loss": 1.3968, + "mean_token_accuracy": 0.6596626192331314, + "num_tokens": 1070390227.0, + "step": 6372 + }, + { + "entropy": 1.6995338002840679, + "epoch": 0.7001180961797259, + "grad_norm": 0.600246250629425, + "learning_rate": 1.5584776351738568e-05, + "loss": 1.4458, + "mean_token_accuracy": 0.6408328165610632, + "num_tokens": 1070624225.0, + "step": 6373 + }, + { + "entropy": 1.6173172891139984, + "epoch": 0.7002279530910989, + "grad_norm": 0.7701708674430847, + "learning_rate": 1.5583398478603375e-05, + "loss": 1.3347, + "mean_token_accuracy": 0.6675042559703191, + "num_tokens": 1070802383.0, + "step": 6374 + }, + { + "entropy": 1.6439649661382039, + "epoch": 0.7003378100024718, + "grad_norm": 0.6842703819274902, + "learning_rate": 1.558202046040569e-05, + "loss": 1.3865, + "mean_token_accuracy": 0.6518467565377554, + "num_tokens": 1070971297.0, + "step": 6375 + }, + { + "entropy": 1.7387069861094158, + "epoch": 0.7004476669138447, + "grad_norm": 0.7097147107124329, + "learning_rate": 1.5580642297189122e-05, + "loss": 1.3293, + "mean_token_accuracy": 0.6582437654336294, + "num_tokens": 1071111781.0, + "step": 6376 + }, + { + "entropy": 1.7173890272776287, + "epoch": 0.7005575238252176, + "grad_norm": 0.706751823425293, + "learning_rate": 1.5579263988997286e-05, + "loss": 1.4515, + "mean_token_accuracy": 0.6454547345638275, + "num_tokens": 1071299496.0, + "step": 6377 + }, + { + "entropy": 1.7025466759999592, + "epoch": 0.7006673807365906, + "grad_norm": 0.7652823328971863, + "learning_rate": 1.5577885535873813e-05, + "loss": 1.3607, + "mean_token_accuracy": 0.6740467697381973, + "num_tokens": 1071422802.0, + "step": 6378 + }, + { + "entropy": 1.7247331937154133, + "epoch": 0.7007772376479635, + "grad_norm": 0.6709319353103638, + "learning_rate": 1.5576506937862322e-05, + "loss": 1.3397, + "mean_token_accuracy": 0.6705234696467718, + "num_tokens": 1071594636.0, + "step": 6379 + }, + { + "entropy": 1.7250055472056072, + "epoch": 0.7008870945593365, + "grad_norm": 0.6866453289985657, + "learning_rate": 1.5575128195006452e-05, + "loss": 1.4093, + "mean_token_accuracy": 0.6612274398406347, + "num_tokens": 1071753971.0, + "step": 6380 + }, + { + "entropy": 1.7455697258313496, + "epoch": 0.7009969514707094, + "grad_norm": 0.8073441982269287, + "learning_rate": 1.5573749307349832e-05, + "loss": 1.5399, + "mean_token_accuracy": 0.629800001780192, + "num_tokens": 1071920504.0, + "step": 6381 + }, + { + "entropy": 1.7188852628072102, + "epoch": 0.7011068083820824, + "grad_norm": 0.7286099195480347, + "learning_rate": 1.5572370274936112e-05, + "loss": 1.3478, + "mean_token_accuracy": 0.66085384786129, + "num_tokens": 1072063218.0, + "step": 6382 + }, + { + "entropy": 1.7210610608259838, + "epoch": 0.7012166652934553, + "grad_norm": 0.886602520942688, + "learning_rate": 1.5570991097808926e-05, + "loss": 1.3156, + "mean_token_accuracy": 0.6739104390144348, + "num_tokens": 1072190834.0, + "step": 6383 + }, + { + "entropy": 1.6681243975957234, + "epoch": 0.7013265222048282, + "grad_norm": 0.7629004716873169, + "learning_rate": 1.5569611776011936e-05, + "loss": 1.3262, + "mean_token_accuracy": 0.6660947451988856, + "num_tokens": 1072319190.0, + "step": 6384 + }, + { + "entropy": 1.7308462460835774, + "epoch": 0.7014363791162012, + "grad_norm": 0.7029445767402649, + "learning_rate": 1.5568232309588793e-05, + "loss": 1.5264, + "mean_token_accuracy": 0.6421166161696116, + "num_tokens": 1072545984.0, + "step": 6385 + }, + { + "entropy": 1.7237797677516937, + "epoch": 0.701546236027574, + "grad_norm": 0.6271055936813354, + "learning_rate": 1.5566852698583156e-05, + "loss": 1.4193, + "mean_token_accuracy": 0.6527849485476812, + "num_tokens": 1072742663.0, + "step": 6386 + }, + { + "entropy": 1.7033430834611256, + "epoch": 0.701656092938947, + "grad_norm": 0.851382851600647, + "learning_rate": 1.5565472943038686e-05, + "loss": 1.3205, + "mean_token_accuracy": 0.6494818925857544, + "num_tokens": 1072892465.0, + "step": 6387 + }, + { + "entropy": 1.7225984930992126, + "epoch": 0.7017659498503199, + "grad_norm": 0.8029654622077942, + "learning_rate": 1.5564093042999058e-05, + "loss": 1.2164, + "mean_token_accuracy": 0.6834103514750799, + "num_tokens": 1073004684.0, + "step": 6388 + }, + { + "entropy": 1.660697062810262, + "epoch": 0.7018758067616929, + "grad_norm": 0.654461145401001, + "learning_rate": 1.556271299850794e-05, + "loss": 1.2874, + "mean_token_accuracy": 0.6679888367652893, + "num_tokens": 1073149632.0, + "step": 6389 + }, + { + "entropy": 1.7544045547644298, + "epoch": 0.7019856636730658, + "grad_norm": 0.7389849424362183, + "learning_rate": 1.5561332809609013e-05, + "loss": 1.4401, + "mean_token_accuracy": 0.6510027199983597, + "num_tokens": 1073278621.0, + "step": 6390 + }, + { + "entropy": 1.713613510131836, + "epoch": 0.7020955205844388, + "grad_norm": 0.6665468215942383, + "learning_rate": 1.5559952476345958e-05, + "loss": 1.3568, + "mean_token_accuracy": 0.6602018525203069, + "num_tokens": 1073419861.0, + "step": 6391 + }, + { + "entropy": 1.7007086873054504, + "epoch": 0.7022053774958117, + "grad_norm": 0.5884419083595276, + "learning_rate": 1.555857199876246e-05, + "loss": 1.4787, + "mean_token_accuracy": 0.63471091290315, + "num_tokens": 1073629064.0, + "step": 6392 + }, + { + "entropy": 1.6845936278502147, + "epoch": 0.7023152344071847, + "grad_norm": 0.6721514463424683, + "learning_rate": 1.5557191376902214e-05, + "loss": 1.5321, + "mean_token_accuracy": 0.6354875167210897, + "num_tokens": 1073831920.0, + "step": 6393 + }, + { + "entropy": 1.6878847082455952, + "epoch": 0.7024250913185576, + "grad_norm": 0.787539005279541, + "learning_rate": 1.5555810610808914e-05, + "loss": 1.3595, + "mean_token_accuracy": 0.6564808338880539, + "num_tokens": 1073990510.0, + "step": 6394 + }, + { + "entropy": 1.734977275133133, + "epoch": 0.7025349482299306, + "grad_norm": 0.7654755711555481, + "learning_rate": 1.555442970052626e-05, + "loss": 1.4424, + "mean_token_accuracy": 0.666431744893392, + "num_tokens": 1074150701.0, + "step": 6395 + }, + { + "entropy": 1.6856712996959686, + "epoch": 0.7026448051413035, + "grad_norm": 0.7252474427223206, + "learning_rate": 1.5553048646097958e-05, + "loss": 1.4068, + "mean_token_accuracy": 0.6496947507063547, + "num_tokens": 1074315075.0, + "step": 6396 + }, + { + "entropy": 1.7249629298845928, + "epoch": 0.7027546620526763, + "grad_norm": 0.7137119174003601, + "learning_rate": 1.555166744756772e-05, + "loss": 1.4618, + "mean_token_accuracy": 0.6392665853103002, + "num_tokens": 1074445490.0, + "step": 6397 + }, + { + "entropy": 1.6709438264369965, + "epoch": 0.7028645189640493, + "grad_norm": 0.6605518460273743, + "learning_rate": 1.555028610497926e-05, + "loss": 1.4832, + "mean_token_accuracy": 0.6422385623057684, + "num_tokens": 1074664978.0, + "step": 6398 + }, + { + "entropy": 1.6293116410573323, + "epoch": 0.7029743758754222, + "grad_norm": 0.5970544815063477, + "learning_rate": 1.554890461837629e-05, + "loss": 1.3538, + "mean_token_accuracy": 0.6604219327370325, + "num_tokens": 1074807024.0, + "step": 6399 + }, + { + "entropy": 1.7081689337889354, + "epoch": 0.7030842327867952, + "grad_norm": 0.759528636932373, + "learning_rate": 1.5547522987802542e-05, + "loss": 1.4654, + "mean_token_accuracy": 0.6531898428996404, + "num_tokens": 1074948098.0, + "step": 6400 + }, + { + "entropy": 1.6969364682833354, + "epoch": 0.7031940896981681, + "grad_norm": 0.7736058235168457, + "learning_rate": 1.554614121330174e-05, + "loss": 1.3684, + "mean_token_accuracy": 0.6516063958406448, + "num_tokens": 1075134536.0, + "step": 6401 + }, + { + "entropy": 1.7077515522638957, + "epoch": 0.7033039466095411, + "grad_norm": 0.5851559042930603, + "learning_rate": 1.5544759294917616e-05, + "loss": 1.3913, + "mean_token_accuracy": 0.6567753752072653, + "num_tokens": 1075319222.0, + "step": 6402 + }, + { + "entropy": 1.6978352069854736, + "epoch": 0.703413803520914, + "grad_norm": 0.7662501931190491, + "learning_rate": 1.554337723269391e-05, + "loss": 1.3474, + "mean_token_accuracy": 0.666194369395574, + "num_tokens": 1075447222.0, + "step": 6403 + }, + { + "entropy": 1.7218878070513408, + "epoch": 0.703523660432287, + "grad_norm": 0.6417670249938965, + "learning_rate": 1.5541995026674363e-05, + "loss": 1.4205, + "mean_token_accuracy": 0.6567677110433578, + "num_tokens": 1075603408.0, + "step": 6404 + }, + { + "entropy": 1.6632341345151265, + "epoch": 0.7036335173436599, + "grad_norm": 0.7193872332572937, + "learning_rate": 1.5540612676902715e-05, + "loss": 1.328, + "mean_token_accuracy": 0.6630524943272272, + "num_tokens": 1075726060.0, + "step": 6405 + }, + { + "entropy": 1.794555813074112, + "epoch": 0.7037433742550329, + "grad_norm": 0.7477422952651978, + "learning_rate": 1.5539230183422725e-05, + "loss": 1.3365, + "mean_token_accuracy": 0.6674585938453674, + "num_tokens": 1075847782.0, + "step": 6406 + }, + { + "entropy": 1.7614405552546184, + "epoch": 0.7038532311664057, + "grad_norm": 0.624266505241394, + "learning_rate": 1.5537847546278145e-05, + "loss": 1.3842, + "mean_token_accuracy": 0.653992493947347, + "num_tokens": 1076038754.0, + "step": 6407 + }, + { + "entropy": 1.741749346256256, + "epoch": 0.7039630880777787, + "grad_norm": 0.7164651155471802, + "learning_rate": 1.553646476551274e-05, + "loss": 1.6044, + "mean_token_accuracy": 0.6402417123317719, + "num_tokens": 1076211380.0, + "step": 6408 + }, + { + "entropy": 1.7253131071726482, + "epoch": 0.7040729449891516, + "grad_norm": 0.669684648513794, + "learning_rate": 1.5535081841170257e-05, + "loss": 1.5255, + "mean_token_accuracy": 0.6500623474518458, + "num_tokens": 1076421027.0, + "step": 6409 + }, + { + "entropy": 1.7519688804944356, + "epoch": 0.7041828019005245, + "grad_norm": 0.5820850133895874, + "learning_rate": 1.553369877329449e-05, + "loss": 1.3844, + "mean_token_accuracy": 0.650462418794632, + "num_tokens": 1076583424.0, + "step": 6410 + }, + { + "entropy": 1.6753594875335693, + "epoch": 0.7042926588118975, + "grad_norm": 0.6438754200935364, + "learning_rate": 1.5532315561929194e-05, + "loss": 1.3457, + "mean_token_accuracy": 0.6627227415641149, + "num_tokens": 1076765313.0, + "step": 6411 + }, + { + "entropy": 1.7350413004557292, + "epoch": 0.7044025157232704, + "grad_norm": 0.7367886900901794, + "learning_rate": 1.553093220711815e-05, + "loss": 1.5004, + "mean_token_accuracy": 0.6472184459368387, + "num_tokens": 1076924086.0, + "step": 6412 + }, + { + "entropy": 1.6722463369369507, + "epoch": 0.7045123726346434, + "grad_norm": 0.7393024563789368, + "learning_rate": 1.552954870890515e-05, + "loss": 1.357, + "mean_token_accuracy": 0.663534477353096, + "num_tokens": 1077094084.0, + "step": 6413 + }, + { + "entropy": 1.665319134791692, + "epoch": 0.7046222295460163, + "grad_norm": 0.8351560235023499, + "learning_rate": 1.5528165067333972e-05, + "loss": 1.4145, + "mean_token_accuracy": 0.6641974002122879, + "num_tokens": 1077301938.0, + "step": 6414 + }, + { + "entropy": 1.665222058693568, + "epoch": 0.7047320864573893, + "grad_norm": 0.6075441837310791, + "learning_rate": 1.5526781282448408e-05, + "loss": 1.3895, + "mean_token_accuracy": 0.6595604221026102, + "num_tokens": 1077518144.0, + "step": 6415 + }, + { + "entropy": 1.6636256277561188, + "epoch": 0.7048419433687622, + "grad_norm": 0.6443570852279663, + "learning_rate": 1.5525397354292256e-05, + "loss": 1.2649, + "mean_token_accuracy": 0.6825617849826813, + "num_tokens": 1077663050.0, + "step": 6416 + }, + { + "entropy": 1.7030467987060547, + "epoch": 0.7049518002801352, + "grad_norm": 0.6067739129066467, + "learning_rate": 1.5524013282909317e-05, + "loss": 1.4999, + "mean_token_accuracy": 0.6428120483954748, + "num_tokens": 1077865926.0, + "step": 6417 + }, + { + "entropy": 1.6580960551897685, + "epoch": 0.705061657191508, + "grad_norm": 0.657632052898407, + "learning_rate": 1.5522629068343398e-05, + "loss": 1.2896, + "mean_token_accuracy": 0.6659079343080521, + "num_tokens": 1078018210.0, + "step": 6418 + }, + { + "entropy": 1.743414322535197, + "epoch": 0.705171514102881, + "grad_norm": 0.7279876470565796, + "learning_rate": 1.5521244710638308e-05, + "loss": 1.3474, + "mean_token_accuracy": 0.6605549802382787, + "num_tokens": 1078149814.0, + "step": 6419 + }, + { + "entropy": 1.6969486773014069, + "epoch": 0.7052813710142539, + "grad_norm": 0.8344591856002808, + "learning_rate": 1.5519860209837858e-05, + "loss": 1.3424, + "mean_token_accuracy": 0.670002485315005, + "num_tokens": 1078317881.0, + "step": 6420 + }, + { + "entropy": 1.6994330783685048, + "epoch": 0.7053912279256269, + "grad_norm": 0.6680699586868286, + "learning_rate": 1.551847556598587e-05, + "loss": 1.3801, + "mean_token_accuracy": 0.669528936346372, + "num_tokens": 1078477165.0, + "step": 6421 + }, + { + "entropy": 1.666198472181956, + "epoch": 0.7055010848369998, + "grad_norm": 0.6620866656303406, + "learning_rate": 1.5517090779126164e-05, + "loss": 1.3215, + "mean_token_accuracy": 0.6595088789860407, + "num_tokens": 1078619965.0, + "step": 6422 + }, + { + "entropy": 1.7299580574035645, + "epoch": 0.7056109417483728, + "grad_norm": 0.7888288497924805, + "learning_rate": 1.5515705849302574e-05, + "loss": 1.2851, + "mean_token_accuracy": 0.6729756246010462, + "num_tokens": 1078758890.0, + "step": 6423 + }, + { + "entropy": 1.7276004652182262, + "epoch": 0.7057207986597457, + "grad_norm": 0.7002907991409302, + "learning_rate": 1.5514320776558928e-05, + "loss": 1.4228, + "mean_token_accuracy": 0.6579409589370092, + "num_tokens": 1078958010.0, + "step": 6424 + }, + { + "entropy": 1.6907508472601573, + "epoch": 0.7058306555711186, + "grad_norm": 0.632900595664978, + "learning_rate": 1.551293556093906e-05, + "loss": 1.4111, + "mean_token_accuracy": 0.6516719460487366, + "num_tokens": 1079164270.0, + "step": 6425 + }, + { + "entropy": 1.7010155816872914, + "epoch": 0.7059405124824916, + "grad_norm": 0.6720937490463257, + "learning_rate": 1.551155020248682e-05, + "loss": 1.2768, + "mean_token_accuracy": 0.6753781239191691, + "num_tokens": 1079285399.0, + "step": 6426 + }, + { + "entropy": 1.6723153193791707, + "epoch": 0.7060503693938645, + "grad_norm": 0.8205432295799255, + "learning_rate": 1.5510164701246045e-05, + "loss": 1.4409, + "mean_token_accuracy": 0.655280739068985, + "num_tokens": 1079479151.0, + "step": 6427 + }, + { + "entropy": 1.731699009736379, + "epoch": 0.7061602263052374, + "grad_norm": 0.6112235188484192, + "learning_rate": 1.550877905726059e-05, + "loss": 1.4701, + "mean_token_accuracy": 0.6529090950886408, + "num_tokens": 1079685790.0, + "step": 6428 + }, + { + "entropy": 1.746723433335622, + "epoch": 0.7062700832166103, + "grad_norm": 0.7355782985687256, + "learning_rate": 1.5507393270574315e-05, + "loss": 1.3453, + "mean_token_accuracy": 0.6606174210707346, + "num_tokens": 1079837134.0, + "step": 6429 + }, + { + "entropy": 1.7096926669279735, + "epoch": 0.7063799401279833, + "grad_norm": 0.7809394001960754, + "learning_rate": 1.5506007341231068e-05, + "loss": 1.3517, + "mean_token_accuracy": 0.6668333212534586, + "num_tokens": 1079974934.0, + "step": 6430 + }, + { + "entropy": 1.7233172257741292, + "epoch": 0.7064897970393562, + "grad_norm": 0.9166316390037537, + "learning_rate": 1.550462126927472e-05, + "loss": 1.3369, + "mean_token_accuracy": 0.6595128228267034, + "num_tokens": 1080104310.0, + "step": 6431 + }, + { + "entropy": 1.7122790416081746, + "epoch": 0.7065996539507292, + "grad_norm": 0.8295903205871582, + "learning_rate": 1.550323505474914e-05, + "loss": 1.5162, + "mean_token_accuracy": 0.6423207471768061, + "num_tokens": 1080270460.0, + "step": 6432 + }, + { + "entropy": 1.6761127014954884, + "epoch": 0.7067095108621021, + "grad_norm": 0.9063708782196045, + "learning_rate": 1.55018486976982e-05, + "loss": 1.5275, + "mean_token_accuracy": 0.6545315235853195, + "num_tokens": 1080457268.0, + "step": 6433 + }, + { + "entropy": 1.7378019988536835, + "epoch": 0.7068193677734751, + "grad_norm": 0.6509607434272766, + "learning_rate": 1.5500462198165778e-05, + "loss": 1.4512, + "mean_token_accuracy": 0.6503981401522955, + "num_tokens": 1080644349.0, + "step": 6434 + }, + { + "entropy": 1.60017196337382, + "epoch": 0.706929224684848, + "grad_norm": 0.6047476530075073, + "learning_rate": 1.5499075556195752e-05, + "loss": 1.3142, + "mean_token_accuracy": 0.6764021714528402, + "num_tokens": 1080833259.0, + "step": 6435 + }, + { + "entropy": 1.7449369231859844, + "epoch": 0.707039081596221, + "grad_norm": 0.7219707369804382, + "learning_rate": 1.5497688771832017e-05, + "loss": 1.3236, + "mean_token_accuracy": 0.6659722030162811, + "num_tokens": 1080997620.0, + "step": 6436 + }, + { + "entropy": 1.7626902063687642, + "epoch": 0.7071489385075939, + "grad_norm": 0.7538440823554993, + "learning_rate": 1.549630184511845e-05, + "loss": 1.5468, + "mean_token_accuracy": 0.6329626242319742, + "num_tokens": 1081153212.0, + "step": 6437 + }, + { + "entropy": 1.6917728781700134, + "epoch": 0.7072587954189667, + "grad_norm": 0.7224356532096863, + "learning_rate": 1.5494914776098967e-05, + "loss": 1.6, + "mean_token_accuracy": 0.6502460787693659, + "num_tokens": 1081310174.0, + "step": 6438 + }, + { + "entropy": 1.7162715196609497, + "epoch": 0.7073686523303397, + "grad_norm": 0.7522397637367249, + "learning_rate": 1.549352756481745e-05, + "loss": 1.3803, + "mean_token_accuracy": 0.6606674641370773, + "num_tokens": 1081482148.0, + "step": 6439 + }, + { + "entropy": 1.6705755194028218, + "epoch": 0.7074785092417126, + "grad_norm": 0.7202532291412354, + "learning_rate": 1.5492140211317813e-05, + "loss": 1.355, + "mean_token_accuracy": 0.6613196780284246, + "num_tokens": 1081621256.0, + "step": 6440 + }, + { + "entropy": 1.6998901466528575, + "epoch": 0.7075883661530856, + "grad_norm": 0.6419969797134399, + "learning_rate": 1.549075271564396e-05, + "loss": 1.3242, + "mean_token_accuracy": 0.6660476873318354, + "num_tokens": 1081781478.0, + "step": 6441 + }, + { + "entropy": 1.714255303144455, + "epoch": 0.7076982230644585, + "grad_norm": 0.6204527020454407, + "learning_rate": 1.548936507783981e-05, + "loss": 1.3608, + "mean_token_accuracy": 0.655594398578008, + "num_tokens": 1081946139.0, + "step": 6442 + }, + { + "entropy": 1.6719560623168945, + "epoch": 0.7078080799758315, + "grad_norm": 0.6580803394317627, + "learning_rate": 1.5487977297949276e-05, + "loss": 1.4012, + "mean_token_accuracy": 0.6551670630772909, + "num_tokens": 1082090613.0, + "step": 6443 + }, + { + "entropy": 1.6676383117834728, + "epoch": 0.7079179368872044, + "grad_norm": 0.6761298179626465, + "learning_rate": 1.5486589376016284e-05, + "loss": 1.3466, + "mean_token_accuracy": 0.671358272433281, + "num_tokens": 1082206340.0, + "step": 6444 + }, + { + "entropy": 1.7442449033260345, + "epoch": 0.7080277937985774, + "grad_norm": 0.6918967962265015, + "learning_rate": 1.548520131208476e-05, + "loss": 1.3748, + "mean_token_accuracy": 0.6563903441031774, + "num_tokens": 1082338856.0, + "step": 6445 + }, + { + "entropy": 1.6667213837305705, + "epoch": 0.7081376507099503, + "grad_norm": 0.5648940205574036, + "learning_rate": 1.5483813106198634e-05, + "loss": 1.3901, + "mean_token_accuracy": 0.6636083672444025, + "num_tokens": 1082552762.0, + "step": 6446 + }, + { + "entropy": 1.6465531090895336, + "epoch": 0.7082475076213233, + "grad_norm": 0.7094516754150391, + "learning_rate": 1.5482424758401847e-05, + "loss": 1.3485, + "mean_token_accuracy": 0.6755081762870153, + "num_tokens": 1082720364.0, + "step": 6447 + }, + { + "entropy": 1.7297605971495311, + "epoch": 0.7083573645326962, + "grad_norm": 0.6798611283302307, + "learning_rate": 1.5481036268738334e-05, + "loss": 1.3216, + "mean_token_accuracy": 0.6573556611935297, + "num_tokens": 1082836557.0, + "step": 6448 + }, + { + "entropy": 1.6604599058628082, + "epoch": 0.7084672214440692, + "grad_norm": 0.659517228603363, + "learning_rate": 1.547964763725204e-05, + "loss": 1.3595, + "mean_token_accuracy": 0.6587564200162888, + "num_tokens": 1082997407.0, + "step": 6449 + }, + { + "entropy": 1.7796454230944316, + "epoch": 0.708577078355442, + "grad_norm": 0.7438165545463562, + "learning_rate": 1.547825886398692e-05, + "loss": 1.4302, + "mean_token_accuracy": 0.6503714273373286, + "num_tokens": 1083170024.0, + "step": 6450 + }, + { + "entropy": 1.6703903377056122, + "epoch": 0.7086869352668149, + "grad_norm": 0.7155895233154297, + "learning_rate": 1.5476869948986925e-05, + "loss": 1.5655, + "mean_token_accuracy": 0.6468634754419327, + "num_tokens": 1083357698.0, + "step": 6451 + }, + { + "entropy": 1.7359613676865895, + "epoch": 0.7087967921781879, + "grad_norm": 0.6262257695198059, + "learning_rate": 1.5475480892296013e-05, + "loss": 1.5612, + "mean_token_accuracy": 0.628671204050382, + "num_tokens": 1083543537.0, + "step": 6452 + }, + { + "entropy": 1.678429941336314, + "epoch": 0.7089066490895608, + "grad_norm": 0.7694371342658997, + "learning_rate": 1.5474091693958146e-05, + "loss": 1.3765, + "mean_token_accuracy": 0.6632258395353953, + "num_tokens": 1083695268.0, + "step": 6453 + }, + { + "entropy": 1.6886884073416393, + "epoch": 0.7090165060009338, + "grad_norm": 0.733223021030426, + "learning_rate": 1.5472702354017296e-05, + "loss": 1.3159, + "mean_token_accuracy": 0.6743916422128677, + "num_tokens": 1083825522.0, + "step": 6454 + }, + { + "entropy": 1.7432435353597004, + "epoch": 0.7091263629123067, + "grad_norm": 0.7304172515869141, + "learning_rate": 1.547131287251743e-05, + "loss": 1.2952, + "mean_token_accuracy": 0.6635237882534663, + "num_tokens": 1084003481.0, + "step": 6455 + }, + { + "entropy": 1.6784548958142598, + "epoch": 0.7092362198236797, + "grad_norm": 0.7074428200721741, + "learning_rate": 1.5469923249502525e-05, + "loss": 1.5661, + "mean_token_accuracy": 0.6370118310054144, + "num_tokens": 1084177783.0, + "step": 6456 + }, + { + "entropy": 1.6755680044492085, + "epoch": 0.7093460767350526, + "grad_norm": 0.6471802592277527, + "learning_rate": 1.5468533485016564e-05, + "loss": 1.4748, + "mean_token_accuracy": 0.634057030081749, + "num_tokens": 1084376301.0, + "step": 6457 + }, + { + "entropy": 1.6814270714918773, + "epoch": 0.7094559336464256, + "grad_norm": 0.6327021718025208, + "learning_rate": 1.5467143579103535e-05, + "loss": 1.3108, + "mean_token_accuracy": 0.6754618585109711, + "num_tokens": 1084510548.0, + "step": 6458 + }, + { + "entropy": 1.6858037908871968, + "epoch": 0.7095657905577984, + "grad_norm": 0.7395240664482117, + "learning_rate": 1.546575353180742e-05, + "loss": 1.239, + "mean_token_accuracy": 0.6764448136091232, + "num_tokens": 1084615745.0, + "step": 6459 + }, + { + "entropy": 1.7345422605673473, + "epoch": 0.7096756474691714, + "grad_norm": 0.628578245639801, + "learning_rate": 1.5464363343172223e-05, + "loss": 1.6155, + "mean_token_accuracy": 0.6141057461500168, + "num_tokens": 1084869481.0, + "step": 6460 + }, + { + "entropy": 1.6727848052978516, + "epoch": 0.7097855043805443, + "grad_norm": 0.7392853498458862, + "learning_rate": 1.5462973013241934e-05, + "loss": 1.4858, + "mean_token_accuracy": 0.6492108752330145, + "num_tokens": 1085037273.0, + "step": 6461 + }, + { + "entropy": 1.7004869282245636, + "epoch": 0.7098953612919173, + "grad_norm": 0.7187851071357727, + "learning_rate": 1.546158254206056e-05, + "loss": 1.478, + "mean_token_accuracy": 0.6374183098475138, + "num_tokens": 1085232886.0, + "step": 6462 + }, + { + "entropy": 1.6797158320744832, + "epoch": 0.7100052182032902, + "grad_norm": 0.7203065752983093, + "learning_rate": 1.546019192967211e-05, + "loss": 1.5025, + "mean_token_accuracy": 0.6410057172179222, + "num_tokens": 1085413273.0, + "step": 6463 + }, + { + "entropy": 1.742477943499883, + "epoch": 0.7101150751146631, + "grad_norm": 0.7050339579582214, + "learning_rate": 1.5458801176120597e-05, + "loss": 1.4738, + "mean_token_accuracy": 0.6423740684986115, + "num_tokens": 1085552171.0, + "step": 6464 + }, + { + "entropy": 1.7188653250535328, + "epoch": 0.7102249320260361, + "grad_norm": 0.7164783477783203, + "learning_rate": 1.5457410281450034e-05, + "loss": 1.506, + "mean_token_accuracy": 0.6500173856814703, + "num_tokens": 1085724958.0, + "step": 6465 + }, + { + "entropy": 1.76152570048968, + "epoch": 0.710334788937409, + "grad_norm": 0.7042224407196045, + "learning_rate": 1.5456019245704445e-05, + "loss": 1.451, + "mean_token_accuracy": 0.6454960157473882, + "num_tokens": 1085889597.0, + "step": 6466 + }, + { + "entropy": 1.666865090529124, + "epoch": 0.710444645848782, + "grad_norm": 0.6657201051712036, + "learning_rate": 1.5454628068927854e-05, + "loss": 1.3749, + "mean_token_accuracy": 0.6552201559146246, + "num_tokens": 1086027428.0, + "step": 6467 + }, + { + "entropy": 1.7004301051298778, + "epoch": 0.7105545027601549, + "grad_norm": 0.6426774263381958, + "learning_rate": 1.5453236751164293e-05, + "loss": 1.4649, + "mean_token_accuracy": 0.6523342033227285, + "num_tokens": 1086215530.0, + "step": 6468 + }, + { + "entropy": 1.680479904015859, + "epoch": 0.7106643596715279, + "grad_norm": 0.6741653680801392, + "learning_rate": 1.5451845292457793e-05, + "loss": 1.3898, + "mean_token_accuracy": 0.6664047390222549, + "num_tokens": 1086377606.0, + "step": 6469 + }, + { + "entropy": 1.6704957087834675, + "epoch": 0.7107742165829007, + "grad_norm": 0.7036837339401245, + "learning_rate": 1.54504536928524e-05, + "loss": 1.3221, + "mean_token_accuracy": 0.6773081024487814, + "num_tokens": 1086513650.0, + "step": 6470 + }, + { + "entropy": 1.6654736300309498, + "epoch": 0.7108840734942737, + "grad_norm": 0.642352819442749, + "learning_rate": 1.5449061952392148e-05, + "loss": 1.366, + "mean_token_accuracy": 0.6635124981403351, + "num_tokens": 1086681158.0, + "step": 6471 + }, + { + "entropy": 1.6999973754088085, + "epoch": 0.7109939304056466, + "grad_norm": 0.5569196939468384, + "learning_rate": 1.5447670071121093e-05, + "loss": 1.3281, + "mean_token_accuracy": 0.6624209682146708, + "num_tokens": 1086840513.0, + "step": 6472 + }, + { + "entropy": 1.7996805508931477, + "epoch": 0.7111037873170196, + "grad_norm": 0.66071617603302, + "learning_rate": 1.5446278049083284e-05, + "loss": 1.6934, + "mean_token_accuracy": 0.6085737546284994, + "num_tokens": 1087130957.0, + "step": 6473 + }, + { + "entropy": 1.734180857737859, + "epoch": 0.7112136442283925, + "grad_norm": 0.7350216507911682, + "learning_rate": 1.5444885886322778e-05, + "loss": 1.5484, + "mean_token_accuracy": 0.6372219175100327, + "num_tokens": 1087298330.0, + "step": 6474 + }, + { + "entropy": 1.7409184575080872, + "epoch": 0.7113235011397655, + "grad_norm": 0.6457538604736328, + "learning_rate": 1.544349358288364e-05, + "loss": 1.3968, + "mean_token_accuracy": 0.6600100994110107, + "num_tokens": 1087439869.0, + "step": 6475 + }, + { + "entropy": 1.747595449288686, + "epoch": 0.7114333580511384, + "grad_norm": 0.8765959739685059, + "learning_rate": 1.5442101138809928e-05, + "loss": 1.3413, + "mean_token_accuracy": 0.6616929272810618, + "num_tokens": 1087554965.0, + "step": 6476 + }, + { + "entropy": 1.775346169869105, + "epoch": 0.7115432149625114, + "grad_norm": 0.7475898265838623, + "learning_rate": 1.5440708554145713e-05, + "loss": 1.546, + "mean_token_accuracy": 0.6328155199686686, + "num_tokens": 1087769878.0, + "step": 6477 + }, + { + "entropy": 1.6867960194746654, + "epoch": 0.7116530718738843, + "grad_norm": 0.6512202620506287, + "learning_rate": 1.5439315828935083e-05, + "loss": 1.3724, + "mean_token_accuracy": 0.6561179707447687, + "num_tokens": 1087958250.0, + "step": 6478 + }, + { + "entropy": 1.7009850045045216, + "epoch": 0.7117629287852572, + "grad_norm": 0.6758841872215271, + "learning_rate": 1.54379229632221e-05, + "loss": 1.422, + "mean_token_accuracy": 0.6556328684091568, + "num_tokens": 1088149996.0, + "step": 6479 + }, + { + "entropy": 1.7897284130255382, + "epoch": 0.7118727856966302, + "grad_norm": 0.84705650806427, + "learning_rate": 1.5436529957050858e-05, + "loss": 1.2395, + "mean_token_accuracy": 0.6755161037047704, + "num_tokens": 1088256500.0, + "step": 6480 + }, + { + "entropy": 1.6992632150650024, + "epoch": 0.711982642608003, + "grad_norm": 0.8954823613166809, + "learning_rate": 1.543513681046544e-05, + "loss": 1.4168, + "mean_token_accuracy": 0.6409216324488322, + "num_tokens": 1088463599.0, + "step": 6481 + }, + { + "entropy": 1.7095726033051808, + "epoch": 0.712092499519376, + "grad_norm": 0.88639235496521, + "learning_rate": 1.5433743523509945e-05, + "loss": 1.365, + "mean_token_accuracy": 0.6730570693810781, + "num_tokens": 1088601030.0, + "step": 6482 + }, + { + "entropy": 1.7022682825724285, + "epoch": 0.7122023564307489, + "grad_norm": 0.5840624570846558, + "learning_rate": 1.543235009622846e-05, + "loss": 1.5923, + "mean_token_accuracy": 0.6384105285008749, + "num_tokens": 1088900721.0, + "step": 6483 + }, + { + "entropy": 1.6645398636658986, + "epoch": 0.7123122133421219, + "grad_norm": 0.5561464428901672, + "learning_rate": 1.5430956528665095e-05, + "loss": 1.4083, + "mean_token_accuracy": 0.6542864640553793, + "num_tokens": 1089140437.0, + "step": 6484 + }, + { + "entropy": 1.7146425247192383, + "epoch": 0.7124220702534948, + "grad_norm": 0.6508673429489136, + "learning_rate": 1.5429562820863954e-05, + "loss": 1.4767, + "mean_token_accuracy": 0.6512588014205297, + "num_tokens": 1089324853.0, + "step": 6485 + }, + { + "entropy": 1.6801698704560597, + "epoch": 0.7125319271648678, + "grad_norm": 0.6502287983894348, + "learning_rate": 1.542816897286914e-05, + "loss": 1.2781, + "mean_token_accuracy": 0.675328845779101, + "num_tokens": 1089463656.0, + "step": 6486 + }, + { + "entropy": 1.6508076985677083, + "epoch": 0.7126417840762407, + "grad_norm": 0.7556900978088379, + "learning_rate": 1.5426774984724775e-05, + "loss": 1.3662, + "mean_token_accuracy": 0.6651237408320109, + "num_tokens": 1089678421.0, + "step": 6487 + }, + { + "entropy": 1.651640792687734, + "epoch": 0.7127516409876137, + "grad_norm": 0.8668679594993591, + "learning_rate": 1.542538085647498e-05, + "loss": 1.4303, + "mean_token_accuracy": 0.6480504920085272, + "num_tokens": 1089818077.0, + "step": 6488 + }, + { + "entropy": 1.7374186714490254, + "epoch": 0.7128614978989866, + "grad_norm": 0.7118296027183533, + "learning_rate": 1.542398658816387e-05, + "loss": 1.4924, + "mean_token_accuracy": 0.6559328337510427, + "num_tokens": 1090014597.0, + "step": 6489 + }, + { + "entropy": 1.7288208802541096, + "epoch": 0.7129713548103596, + "grad_norm": 0.796561062335968, + "learning_rate": 1.5422592179835586e-05, + "loss": 1.5408, + "mean_token_accuracy": 0.6503265549739202, + "num_tokens": 1090158154.0, + "step": 6490 + }, + { + "entropy": 1.76965993642807, + "epoch": 0.7130812117217324, + "grad_norm": 0.6486510634422302, + "learning_rate": 1.5421197631534246e-05, + "loss": 1.4316, + "mean_token_accuracy": 0.6440114875634512, + "num_tokens": 1090373107.0, + "step": 6491 + }, + { + "entropy": 1.771783361832301, + "epoch": 0.7131910686331053, + "grad_norm": 0.6636711359024048, + "learning_rate": 1.5419802943303995e-05, + "loss": 1.3121, + "mean_token_accuracy": 0.6703186631202698, + "num_tokens": 1090523447.0, + "step": 6492 + }, + { + "entropy": 1.7251704931259155, + "epoch": 0.7133009255444783, + "grad_norm": 0.6942716836929321, + "learning_rate": 1.5418408115188973e-05, + "loss": 1.3039, + "mean_token_accuracy": 0.6602616558472315, + "num_tokens": 1090656225.0, + "step": 6493 + }, + { + "entropy": 1.6484019656976063, + "epoch": 0.7134107824558512, + "grad_norm": 0.7084615230560303, + "learning_rate": 1.5417013147233324e-05, + "loss": 1.2269, + "mean_token_accuracy": 0.6825538575649261, + "num_tokens": 1090806963.0, + "step": 6494 + }, + { + "entropy": 1.7776933411757152, + "epoch": 0.7135206393672242, + "grad_norm": 0.6260969042778015, + "learning_rate": 1.5415618039481196e-05, + "loss": 1.445, + "mean_token_accuracy": 0.6529321223497391, + "num_tokens": 1091004761.0, + "step": 6495 + }, + { + "entropy": 1.6919648945331573, + "epoch": 0.7136304962785971, + "grad_norm": 0.7151902914047241, + "learning_rate": 1.5414222791976753e-05, + "loss": 1.5192, + "mean_token_accuracy": 0.6683632185061773, + "num_tokens": 1091163822.0, + "step": 6496 + }, + { + "entropy": 1.7350868582725525, + "epoch": 0.7137403531899701, + "grad_norm": 0.7141695618629456, + "learning_rate": 1.5412827404764146e-05, + "loss": 1.3278, + "mean_token_accuracy": 0.6679353018601736, + "num_tokens": 1091293463.0, + "step": 6497 + }, + { + "entropy": 1.7138068477312725, + "epoch": 0.713850210101343, + "grad_norm": 0.7359333038330078, + "learning_rate": 1.5411431877887536e-05, + "loss": 1.4096, + "mean_token_accuracy": 0.6525428295135498, + "num_tokens": 1091461909.0, + "step": 6498 + }, + { + "entropy": 1.6973837018013, + "epoch": 0.713960067012716, + "grad_norm": 15.46304702758789, + "learning_rate": 1.54100362113911e-05, + "loss": 1.4589, + "mean_token_accuracy": 0.6513221810261408, + "num_tokens": 1091655585.0, + "step": 6499 + }, + { + "entropy": 1.7354335486888885, + "epoch": 0.7140699239240889, + "grad_norm": 0.6850374341011047, + "learning_rate": 1.5408640405319004e-05, + "loss": 1.4875, + "mean_token_accuracy": 0.6419627815485001, + "num_tokens": 1091795441.0, + "step": 6500 + }, + { + "entropy": 1.6926626861095428, + "epoch": 0.7141797808354619, + "grad_norm": 0.6201228499412537, + "learning_rate": 1.5407244459715424e-05, + "loss": 1.3315, + "mean_token_accuracy": 0.6752594908078512, + "num_tokens": 1091959354.0, + "step": 6501 + }, + { + "entropy": 1.6494509776433308, + "epoch": 0.7142896377468347, + "grad_norm": 0.5546920299530029, + "learning_rate": 1.5405848374624545e-05, + "loss": 1.514, + "mean_token_accuracy": 0.6437575320402781, + "num_tokens": 1092182556.0, + "step": 6502 + }, + { + "entropy": 1.7340616683165233, + "epoch": 0.7143994946582077, + "grad_norm": 0.7549980282783508, + "learning_rate": 1.540445215009055e-05, + "loss": 1.4259, + "mean_token_accuracy": 0.6449489444494247, + "num_tokens": 1092358579.0, + "step": 6503 + }, + { + "entropy": 1.6930972735087078, + "epoch": 0.7145093515695806, + "grad_norm": 0.6418382525444031, + "learning_rate": 1.5403055786157626e-05, + "loss": 1.3439, + "mean_token_accuracy": 0.6560649822155634, + "num_tokens": 1092489483.0, + "step": 6504 + }, + { + "entropy": 1.6835120022296906, + "epoch": 0.7146192084809535, + "grad_norm": 0.6219762563705444, + "learning_rate": 1.5401659282869973e-05, + "loss": 1.2959, + "mean_token_accuracy": 0.6700324018796285, + "num_tokens": 1092617242.0, + "step": 6505 + }, + { + "entropy": 1.7238627175490062, + "epoch": 0.7147290653923265, + "grad_norm": 0.734199583530426, + "learning_rate": 1.5400262640271786e-05, + "loss": 1.5356, + "mean_token_accuracy": 0.6461377541224161, + "num_tokens": 1092778452.0, + "step": 6506 + }, + { + "entropy": 1.668906291325887, + "epoch": 0.7148389223036994, + "grad_norm": 0.6679801344871521, + "learning_rate": 1.5398865858407272e-05, + "loss": 1.5205, + "mean_token_accuracy": 0.6480654130379359, + "num_tokens": 1092994198.0, + "step": 6507 + }, + { + "entropy": 1.7181902726491292, + "epoch": 0.7149487792150724, + "grad_norm": 0.7857739925384521, + "learning_rate": 1.539746893732063e-05, + "loss": 1.376, + "mean_token_accuracy": 0.6500228643417358, + "num_tokens": 1093171424.0, + "step": 6508 + }, + { + "entropy": 1.6858000059922535, + "epoch": 0.7150586361264453, + "grad_norm": 0.6876581907272339, + "learning_rate": 1.539607187705608e-05, + "loss": 1.3498, + "mean_token_accuracy": 0.6653678317864736, + "num_tokens": 1093348791.0, + "step": 6509 + }, + { + "entropy": 1.714595099290212, + "epoch": 0.7151684930378183, + "grad_norm": 0.5665342807769775, + "learning_rate": 1.5394674677657843e-05, + "loss": 1.3115, + "mean_token_accuracy": 0.655341257651647, + "num_tokens": 1093555039.0, + "step": 6510 + }, + { + "entropy": 1.6599359611670177, + "epoch": 0.7152783499491912, + "grad_norm": 0.7044631242752075, + "learning_rate": 1.5393277339170126e-05, + "loss": 1.3531, + "mean_token_accuracy": 0.6782778998215994, + "num_tokens": 1093719986.0, + "step": 6511 + }, + { + "entropy": 1.6935375332832336, + "epoch": 0.7153882068605641, + "grad_norm": 0.7041305303573608, + "learning_rate": 1.539187986163716e-05, + "loss": 1.445, + "mean_token_accuracy": 0.6444460153579712, + "num_tokens": 1093903707.0, + "step": 6512 + }, + { + "entropy": 1.6425399382909138, + "epoch": 0.715498063771937, + "grad_norm": 0.783403217792511, + "learning_rate": 1.5390482245103178e-05, + "loss": 1.4552, + "mean_token_accuracy": 0.6741680949926376, + "num_tokens": 1094040376.0, + "step": 6513 + }, + { + "entropy": 1.8000925381978352, + "epoch": 0.71560792068331, + "grad_norm": 0.6443293690681458, + "learning_rate": 1.538908448961241e-05, + "loss": 1.3937, + "mean_token_accuracy": 0.645576020081838, + "num_tokens": 1094221651.0, + "step": 6514 + }, + { + "entropy": 1.7409981389840443, + "epoch": 0.7157177775946829, + "grad_norm": 0.712158203125, + "learning_rate": 1.5387686595209097e-05, + "loss": 1.3776, + "mean_token_accuracy": 0.664316713809967, + "num_tokens": 1094377410.0, + "step": 6515 + }, + { + "entropy": 1.69818913936615, + "epoch": 0.7158276345060559, + "grad_norm": 0.681666910648346, + "learning_rate": 1.5386288561937482e-05, + "loss": 1.3804, + "mean_token_accuracy": 0.6551361183325449, + "num_tokens": 1094534065.0, + "step": 6516 + }, + { + "entropy": 1.718794455130895, + "epoch": 0.7159374914174288, + "grad_norm": 0.6347442865371704, + "learning_rate": 1.5384890389841803e-05, + "loss": 1.3203, + "mean_token_accuracy": 0.6684871315956116, + "num_tokens": 1094666102.0, + "step": 6517 + }, + { + "entropy": 1.7151610056559246, + "epoch": 0.7160473483288017, + "grad_norm": 0.6894080638885498, + "learning_rate": 1.5383492078966328e-05, + "loss": 1.3328, + "mean_token_accuracy": 0.667813797791799, + "num_tokens": 1094812585.0, + "step": 6518 + }, + { + "entropy": 1.6722050309181213, + "epoch": 0.7161572052401747, + "grad_norm": 0.6094774007797241, + "learning_rate": 1.5382093629355303e-05, + "loss": 1.4698, + "mean_token_accuracy": 0.6564723700284958, + "num_tokens": 1094988414.0, + "step": 6519 + }, + { + "entropy": 1.7040863831837971, + "epoch": 0.7162670621515476, + "grad_norm": 0.6441994905471802, + "learning_rate": 1.5380695041052983e-05, + "loss": 1.3583, + "mean_token_accuracy": 0.6653302560249964, + "num_tokens": 1095160094.0, + "step": 6520 + }, + { + "entropy": 1.7137998640537262, + "epoch": 0.7163769190629206, + "grad_norm": 0.6766939163208008, + "learning_rate": 1.5379296314103645e-05, + "loss": 1.0347, + "mean_token_accuracy": 0.6831163018941879, + "num_tokens": 1095308938.0, + "step": 6521 + }, + { + "entropy": 1.773234248161316, + "epoch": 0.7164867759742934, + "grad_norm": 0.6776031255722046, + "learning_rate": 1.5377897448551548e-05, + "loss": 1.489, + "mean_token_accuracy": 0.647934744755427, + "num_tokens": 1095487258.0, + "step": 6522 + }, + { + "entropy": 1.7177372376124065, + "epoch": 0.7165966328856664, + "grad_norm": 0.629857063293457, + "learning_rate": 1.537649844444097e-05, + "loss": 1.3285, + "mean_token_accuracy": 0.6603048046429952, + "num_tokens": 1095675612.0, + "step": 6523 + }, + { + "entropy": 1.7716063459714253, + "epoch": 0.7167064897970393, + "grad_norm": 0.7026961445808411, + "learning_rate": 1.537509930181619e-05, + "loss": 1.3509, + "mean_token_accuracy": 0.6556298683087031, + "num_tokens": 1095820438.0, + "step": 6524 + }, + { + "entropy": 1.7175563077131908, + "epoch": 0.7168163467084123, + "grad_norm": 0.765849232673645, + "learning_rate": 1.537370002072149e-05, + "loss": 1.4239, + "mean_token_accuracy": 0.6597336481014887, + "num_tokens": 1095962592.0, + "step": 6525 + }, + { + "entropy": 1.6588062246640523, + "epoch": 0.7169262036197852, + "grad_norm": 0.8724434971809387, + "learning_rate": 1.5372300601201152e-05, + "loss": 1.454, + "mean_token_accuracy": 0.6489892651637396, + "num_tokens": 1096119263.0, + "step": 6526 + }, + { + "entropy": 1.752094993988673, + "epoch": 0.7170360605311582, + "grad_norm": 0.7585995197296143, + "learning_rate": 1.537090104329947e-05, + "loss": 1.348, + "mean_token_accuracy": 0.677097295721372, + "num_tokens": 1096272863.0, + "step": 6527 + }, + { + "entropy": 1.7106123467286427, + "epoch": 0.7171459174425311, + "grad_norm": 0.691969633102417, + "learning_rate": 1.5369501347060744e-05, + "loss": 1.4732, + "mean_token_accuracy": 0.6419112334648768, + "num_tokens": 1096429369.0, + "step": 6528 + }, + { + "entropy": 1.6728369891643524, + "epoch": 0.7172557743539041, + "grad_norm": 0.6538815498352051, + "learning_rate": 1.5368101512529264e-05, + "loss": 1.3228, + "mean_token_accuracy": 0.6591041833162308, + "num_tokens": 1096543106.0, + "step": 6529 + }, + { + "entropy": 1.7897494733333588, + "epoch": 0.717365631265277, + "grad_norm": 0.8466572761535645, + "learning_rate": 1.536670153974934e-05, + "loss": 1.3401, + "mean_token_accuracy": 0.6769247204065323, + "num_tokens": 1096698071.0, + "step": 6530 + }, + { + "entropy": 1.7067344685395558, + "epoch": 0.71747548817665, + "grad_norm": 0.695894181728363, + "learning_rate": 1.5365301428765286e-05, + "loss": 1.4081, + "mean_token_accuracy": 0.6609604756037394, + "num_tokens": 1096850429.0, + "step": 6531 + }, + { + "entropy": 1.7233928342660267, + "epoch": 0.7175853450880229, + "grad_norm": 0.6567378044128418, + "learning_rate": 1.5363901179621403e-05, + "loss": 1.4852, + "mean_token_accuracy": 0.6497254719336828, + "num_tokens": 1097059613.0, + "step": 6532 + }, + { + "entropy": 1.7379729052384694, + "epoch": 0.7176952019993957, + "grad_norm": 0.7090989351272583, + "learning_rate": 1.5362500792362013e-05, + "loss": 1.3564, + "mean_token_accuracy": 0.6722868382930756, + "num_tokens": 1097197673.0, + "step": 6533 + }, + { + "entropy": 1.689707726240158, + "epoch": 0.7178050589107687, + "grad_norm": 0.6554761528968811, + "learning_rate": 1.5361100267031444e-05, + "loss": 1.29, + "mean_token_accuracy": 0.6656525383392969, + "num_tokens": 1097348154.0, + "step": 6534 + }, + { + "entropy": 1.7208319107691448, + "epoch": 0.7179149158221416, + "grad_norm": 0.6578717827796936, + "learning_rate": 1.5359699603674014e-05, + "loss": 1.3293, + "mean_token_accuracy": 0.6660318821668625, + "num_tokens": 1097489928.0, + "step": 6535 + }, + { + "entropy": 1.7080370386441548, + "epoch": 0.7180247727335146, + "grad_norm": 0.5738018155097961, + "learning_rate": 1.5358298802334053e-05, + "loss": 1.4117, + "mean_token_accuracy": 0.6500293960173925, + "num_tokens": 1097700016.0, + "step": 6536 + }, + { + "entropy": 1.6456352074940999, + "epoch": 0.7181346296448875, + "grad_norm": 0.612850546836853, + "learning_rate": 1.53568978630559e-05, + "loss": 1.4237, + "mean_token_accuracy": 0.6467505743106207, + "num_tokens": 1097920130.0, + "step": 6537 + }, + { + "entropy": 1.7054031888643901, + "epoch": 0.7182444865562605, + "grad_norm": 0.6343128681182861, + "learning_rate": 1.53554967858839e-05, + "loss": 1.31, + "mean_token_accuracy": 0.6704280972480774, + "num_tokens": 1098071482.0, + "step": 6538 + }, + { + "entropy": 1.739743580420812, + "epoch": 0.7183543434676334, + "grad_norm": 0.7319331169128418, + "learning_rate": 1.535409557086238e-05, + "loss": 1.2582, + "mean_token_accuracy": 0.6795140455166498, + "num_tokens": 1098233791.0, + "step": 6539 + }, + { + "entropy": 1.724317838748296, + "epoch": 0.7184642003790064, + "grad_norm": 0.6251640915870667, + "learning_rate": 1.5352694218035703e-05, + "loss": 1.4219, + "mean_token_accuracy": 0.6388923674821854, + "num_tokens": 1098447007.0, + "step": 6540 + }, + { + "entropy": 1.7108652492364247, + "epoch": 0.7185740572903793, + "grad_norm": 0.7806166410446167, + "learning_rate": 1.5351292727448214e-05, + "loss": 1.3493, + "mean_token_accuracy": 0.6638698279857635, + "num_tokens": 1098660254.0, + "step": 6541 + }, + { + "entropy": 1.6957137882709503, + "epoch": 0.7186839142017523, + "grad_norm": 0.6760042309761047, + "learning_rate": 1.534989109914427e-05, + "loss": 1.2932, + "mean_token_accuracy": 0.6657718569040298, + "num_tokens": 1098799723.0, + "step": 6542 + }, + { + "entropy": 1.6598234574000041, + "epoch": 0.7187937711131251, + "grad_norm": 0.6579678058624268, + "learning_rate": 1.5348489333168233e-05, + "loss": 1.3738, + "mean_token_accuracy": 0.65997414290905, + "num_tokens": 1098964913.0, + "step": 6543 + }, + { + "entropy": 1.6748213072617848, + "epoch": 0.7189036280244981, + "grad_norm": 0.693806529045105, + "learning_rate": 1.534708742956447e-05, + "loss": 1.3726, + "mean_token_accuracy": 0.658714180191358, + "num_tokens": 1099132726.0, + "step": 6544 + }, + { + "entropy": 1.7303595145543416, + "epoch": 0.719013484935871, + "grad_norm": 0.9831480979919434, + "learning_rate": 1.5345685388377342e-05, + "loss": 1.4561, + "mean_token_accuracy": 0.6566809763511022, + "num_tokens": 1099260996.0, + "step": 6545 + }, + { + "entropy": 1.6850965122381847, + "epoch": 0.7191233418472439, + "grad_norm": 0.6742528080940247, + "learning_rate": 1.5344283209651237e-05, + "loss": 1.4858, + "mean_token_accuracy": 0.650047724445661, + "num_tokens": 1099473098.0, + "step": 6546 + }, + { + "entropy": 1.726267506678899, + "epoch": 0.7192331987586169, + "grad_norm": 0.7677439451217651, + "learning_rate": 1.5342880893430526e-05, + "loss": 1.4177, + "mean_token_accuracy": 0.6506613542636236, + "num_tokens": 1099657221.0, + "step": 6547 + }, + { + "entropy": 1.6909742454687755, + "epoch": 0.7193430556699898, + "grad_norm": 0.6657822132110596, + "learning_rate": 1.534147843975959e-05, + "loss": 1.4182, + "mean_token_accuracy": 0.659657746553421, + "num_tokens": 1099834559.0, + "step": 6548 + }, + { + "entropy": 1.701712469259898, + "epoch": 0.7194529125813628, + "grad_norm": 0.7202948927879333, + "learning_rate": 1.5340075848682812e-05, + "loss": 1.296, + "mean_token_accuracy": 0.6672409772872925, + "num_tokens": 1099955050.0, + "step": 6549 + }, + { + "entropy": 1.711505303780238, + "epoch": 0.7195627694927357, + "grad_norm": 0.7047147154808044, + "learning_rate": 1.53386731202446e-05, + "loss": 1.4693, + "mean_token_accuracy": 0.6619550883769989, + "num_tokens": 1100138035.0, + "step": 6550 + }, + { + "entropy": 1.7319445709387462, + "epoch": 0.7196726264041087, + "grad_norm": 0.8525048494338989, + "learning_rate": 1.533727025448933e-05, + "loss": 1.3342, + "mean_token_accuracy": 0.6697620848814646, + "num_tokens": 1100271260.0, + "step": 6551 + }, + { + "entropy": 1.7205536564191182, + "epoch": 0.7197824833154816, + "grad_norm": 0.8322303891181946, + "learning_rate": 1.5335867251461415e-05, + "loss": 1.3912, + "mean_token_accuracy": 0.6587785333395004, + "num_tokens": 1100450079.0, + "step": 6552 + }, + { + "entropy": 1.7100428342819214, + "epoch": 0.7198923402268546, + "grad_norm": 0.7583587169647217, + "learning_rate": 1.5334464111205253e-05, + "loss": 1.4353, + "mean_token_accuracy": 0.6676417837540308, + "num_tokens": 1100568632.0, + "step": 6553 + }, + { + "entropy": 1.6718673606713612, + "epoch": 0.7200021971382274, + "grad_norm": 0.6357502937316895, + "learning_rate": 1.5333060833765255e-05, + "loss": 1.3762, + "mean_token_accuracy": 0.6590482493241628, + "num_tokens": 1100788024.0, + "step": 6554 + }, + { + "entropy": 1.7551223436991374, + "epoch": 0.7201120540496004, + "grad_norm": 0.7153880000114441, + "learning_rate": 1.5331657419185838e-05, + "loss": 1.3293, + "mean_token_accuracy": 0.6558401187260946, + "num_tokens": 1100984331.0, + "step": 6555 + }, + { + "entropy": 1.6930421988169353, + "epoch": 0.7202219109609733, + "grad_norm": 0.6794725656509399, + "learning_rate": 1.5330253867511415e-05, + "loss": 1.3988, + "mean_token_accuracy": 0.6546763380368551, + "num_tokens": 1101147632.0, + "step": 6556 + }, + { + "entropy": 1.679756999015808, + "epoch": 0.7203317678723463, + "grad_norm": 0.77561354637146, + "learning_rate": 1.5328850178786403e-05, + "loss": 1.3257, + "mean_token_accuracy": 0.6894521017869314, + "num_tokens": 1101300740.0, + "step": 6557 + }, + { + "entropy": 1.7333133816719055, + "epoch": 0.7204416247837192, + "grad_norm": 0.7283507585525513, + "learning_rate": 1.532744635305524e-05, + "loss": 1.3716, + "mean_token_accuracy": 0.6582729518413544, + "num_tokens": 1101477366.0, + "step": 6558 + }, + { + "entropy": 1.7297575374444325, + "epoch": 0.7205514816950921, + "grad_norm": 0.7023628950119019, + "learning_rate": 1.5326042390362347e-05, + "loss": 1.4311, + "mean_token_accuracy": 0.651911993821462, + "num_tokens": 1101638434.0, + "step": 6559 + }, + { + "entropy": 1.7308916648228962, + "epoch": 0.7206613386064651, + "grad_norm": 0.6646205186843872, + "learning_rate": 1.532463829075216e-05, + "loss": 1.4038, + "mean_token_accuracy": 0.6484263290961584, + "num_tokens": 1101821514.0, + "step": 6560 + }, + { + "entropy": 1.746131847302119, + "epoch": 0.720771195517838, + "grad_norm": 0.6808525323867798, + "learning_rate": 1.532323405426912e-05, + "loss": 1.4595, + "mean_token_accuracy": 0.6446107079585394, + "num_tokens": 1102044527.0, + "step": 6561 + }, + { + "entropy": 1.7083693246046703, + "epoch": 0.720881052429211, + "grad_norm": 0.7566711902618408, + "learning_rate": 1.5321829680957673e-05, + "loss": 1.2904, + "mean_token_accuracy": 0.680796946088473, + "num_tokens": 1102175522.0, + "step": 6562 + }, + { + "entropy": 1.7244884669780731, + "epoch": 0.7209909093405839, + "grad_norm": 0.6379838585853577, + "learning_rate": 1.532042517086226e-05, + "loss": 1.3748, + "mean_token_accuracy": 0.6597454150517782, + "num_tokens": 1102364222.0, + "step": 6563 + }, + { + "entropy": 1.7356492678324382, + "epoch": 0.7211007662519568, + "grad_norm": 0.6635233163833618, + "learning_rate": 1.531902052402734e-05, + "loss": 1.4433, + "mean_token_accuracy": 0.64767458041509, + "num_tokens": 1102498467.0, + "step": 6564 + }, + { + "entropy": 1.7014682590961456, + "epoch": 0.7212106231633297, + "grad_norm": 0.6852055788040161, + "learning_rate": 1.5317615740497366e-05, + "loss": 1.4805, + "mean_token_accuracy": 0.6671850581963857, + "num_tokens": 1102659134.0, + "step": 6565 + }, + { + "entropy": 1.7902653813362122, + "epoch": 0.7213204800747027, + "grad_norm": 0.6710415482521057, + "learning_rate": 1.53162108203168e-05, + "loss": 1.4562, + "mean_token_accuracy": 0.6423666675885519, + "num_tokens": 1102819522.0, + "step": 6566 + }, + { + "entropy": 1.6991690297921498, + "epoch": 0.7214303369860756, + "grad_norm": 0.592832088470459, + "learning_rate": 1.5314805763530106e-05, + "loss": 1.4588, + "mean_token_accuracy": 0.6513507117827734, + "num_tokens": 1103004741.0, + "step": 6567 + }, + { + "entropy": 1.7278131941954296, + "epoch": 0.7215401938974486, + "grad_norm": 0.6481438875198364, + "learning_rate": 1.5313400570181755e-05, + "loss": 1.3835, + "mean_token_accuracy": 0.6460322539011637, + "num_tokens": 1103174131.0, + "step": 6568 + }, + { + "entropy": 1.7156480550765991, + "epoch": 0.7216500508088215, + "grad_norm": 0.5778603553771973, + "learning_rate": 1.531199524031622e-05, + "loss": 1.5037, + "mean_token_accuracy": 0.6403802782297134, + "num_tokens": 1103460063.0, + "step": 6569 + }, + { + "entropy": 1.7245876292387645, + "epoch": 0.7217599077201945, + "grad_norm": 0.6985930800437927, + "learning_rate": 1.5310589773977974e-05, + "loss": 1.6586, + "mean_token_accuracy": 0.6096041947603226, + "num_tokens": 1103734337.0, + "step": 6570 + }, + { + "entropy": 1.6952235698699951, + "epoch": 0.7218697646315674, + "grad_norm": 0.7454794049263, + "learning_rate": 1.530918417121151e-05, + "loss": 1.4224, + "mean_token_accuracy": 0.6474892646074295, + "num_tokens": 1103880813.0, + "step": 6571 + }, + { + "entropy": 1.6598933935165405, + "epoch": 0.7219796215429404, + "grad_norm": 0.647855818271637, + "learning_rate": 1.5307778432061307e-05, + "loss": 1.5853, + "mean_token_accuracy": 0.6572377036015192, + "num_tokens": 1104135140.0, + "step": 6572 + }, + { + "entropy": 1.6665717562039692, + "epoch": 0.7220894784543133, + "grad_norm": 0.695473849773407, + "learning_rate": 1.5306372556571854e-05, + "loss": 1.3111, + "mean_token_accuracy": 0.6695192058881124, + "num_tokens": 1104325457.0, + "step": 6573 + }, + { + "entropy": 1.6647779544194539, + "epoch": 0.7221993353656861, + "grad_norm": 0.7145929336547852, + "learning_rate": 1.5304966544787655e-05, + "loss": 1.4169, + "mean_token_accuracy": 0.6621694614489874, + "num_tokens": 1104482078.0, + "step": 6574 + }, + { + "entropy": 1.705965260664622, + "epoch": 0.7223091922770591, + "grad_norm": 0.6520200371742249, + "learning_rate": 1.53035603967532e-05, + "loss": 1.3999, + "mean_token_accuracy": 0.6532257596651713, + "num_tokens": 1104709750.0, + "step": 6575 + }, + { + "entropy": 1.7335670789082844, + "epoch": 0.722419049188432, + "grad_norm": 0.6884702444076538, + "learning_rate": 1.5302154112513e-05, + "loss": 1.4801, + "mean_token_accuracy": 0.6404634515444437, + "num_tokens": 1104882328.0, + "step": 6576 + }, + { + "entropy": 1.7464761237303417, + "epoch": 0.722528906099805, + "grad_norm": 0.680620014667511, + "learning_rate": 1.5300747692111562e-05, + "loss": 1.3174, + "mean_token_accuracy": 0.6647460460662842, + "num_tokens": 1105029762.0, + "step": 6577 + }, + { + "entropy": 1.771151453256607, + "epoch": 0.7226387630111779, + "grad_norm": 0.6430426836013794, + "learning_rate": 1.5299341135593397e-05, + "loss": 1.416, + "mean_token_accuracy": 0.6499434957901636, + "num_tokens": 1105183987.0, + "step": 6578 + }, + { + "entropy": 1.7155382533868153, + "epoch": 0.7227486199225509, + "grad_norm": 0.589311957359314, + "learning_rate": 1.5297934443003023e-05, + "loss": 1.4938, + "mean_token_accuracy": 0.6517259627580643, + "num_tokens": 1105391076.0, + "step": 6579 + }, + { + "entropy": 1.6656960149606068, + "epoch": 0.7228584768339238, + "grad_norm": 0.7947255373001099, + "learning_rate": 1.529652761438496e-05, + "loss": 1.3701, + "mean_token_accuracy": 0.6710091133912405, + "num_tokens": 1105518257.0, + "step": 6580 + }, + { + "entropy": 1.695709377527237, + "epoch": 0.7229683337452968, + "grad_norm": 0.7417528033256531, + "learning_rate": 1.529512064978373e-05, + "loss": 1.5117, + "mean_token_accuracy": 0.6543413400650024, + "num_tokens": 1105681961.0, + "step": 6581 + }, + { + "entropy": 1.6597294012705486, + "epoch": 0.7230781906566697, + "grad_norm": 0.6216127872467041, + "learning_rate": 1.5293713549243872e-05, + "loss": 1.3874, + "mean_token_accuracy": 0.6591449032227198, + "num_tokens": 1105851772.0, + "step": 6582 + }, + { + "entropy": 1.677135815223058, + "epoch": 0.7231880475680427, + "grad_norm": 0.7040608525276184, + "learning_rate": 1.5292306312809914e-05, + "loss": 1.2326, + "mean_token_accuracy": 0.6805572162071863, + "num_tokens": 1106010660.0, + "step": 6583 + }, + { + "entropy": 1.6714986264705658, + "epoch": 0.7232979044794156, + "grad_norm": 0.7146863341331482, + "learning_rate": 1.52908989405264e-05, + "loss": 1.3671, + "mean_token_accuracy": 0.6705887715021769, + "num_tokens": 1106194910.0, + "step": 6584 + }, + { + "entropy": 1.7175088028113048, + "epoch": 0.7234077613907886, + "grad_norm": 0.5585004687309265, + "learning_rate": 1.5289491432437857e-05, + "loss": 1.4606, + "mean_token_accuracy": 0.6325180033842722, + "num_tokens": 1106435793.0, + "step": 6585 + }, + { + "entropy": 1.7380537887414296, + "epoch": 0.7235176183021614, + "grad_norm": 0.8111526966094971, + "learning_rate": 1.528808378858885e-05, + "loss": 1.4263, + "mean_token_accuracy": 0.6445368727048238, + "num_tokens": 1106592659.0, + "step": 6586 + }, + { + "entropy": 1.7423686981201172, + "epoch": 0.7236274752135343, + "grad_norm": 0.6948752403259277, + "learning_rate": 1.528667600902392e-05, + "loss": 1.354, + "mean_token_accuracy": 0.65903340280056, + "num_tokens": 1106736777.0, + "step": 6587 + }, + { + "entropy": 1.7096178233623505, + "epoch": 0.7237373321249073, + "grad_norm": 0.8738431334495544, + "learning_rate": 1.528526809378763e-05, + "loss": 1.302, + "mean_token_accuracy": 0.6691812723875046, + "num_tokens": 1106864737.0, + "step": 6588 + }, + { + "entropy": 1.7112048765023549, + "epoch": 0.7238471890362802, + "grad_norm": 0.7258959412574768, + "learning_rate": 1.5283860042924538e-05, + "loss": 1.419, + "mean_token_accuracy": 0.6464797953764597, + "num_tokens": 1107049074.0, + "step": 6589 + }, + { + "entropy": 1.6991894841194153, + "epoch": 0.7239570459476532, + "grad_norm": 0.6656256914138794, + "learning_rate": 1.5282451856479202e-05, + "loss": 1.3974, + "mean_token_accuracy": 0.6695433159669241, + "num_tokens": 1107213955.0, + "step": 6590 + }, + { + "entropy": 1.6625440021355946, + "epoch": 0.7240669028590261, + "grad_norm": 0.6745372414588928, + "learning_rate": 1.5281043534496193e-05, + "loss": 1.2792, + "mean_token_accuracy": 0.6713592559099197, + "num_tokens": 1107357394.0, + "step": 6591 + }, + { + "entropy": 1.695921152830124, + "epoch": 0.7241767597703991, + "grad_norm": 0.7285088896751404, + "learning_rate": 1.5279635077020087e-05, + "loss": 1.2813, + "mean_token_accuracy": 0.6724530756473541, + "num_tokens": 1107477762.0, + "step": 6592 + }, + { + "entropy": 1.691762089729309, + "epoch": 0.724286616681772, + "grad_norm": 0.730449914932251, + "learning_rate": 1.527822648409546e-05, + "loss": 1.3207, + "mean_token_accuracy": 0.6625142047802607, + "num_tokens": 1107709610.0, + "step": 6593 + }, + { + "entropy": 1.7176588773727417, + "epoch": 0.724396473593145, + "grad_norm": 0.6654216051101685, + "learning_rate": 1.5276817755766894e-05, + "loss": 1.3717, + "mean_token_accuracy": 0.6538281142711639, + "num_tokens": 1107913145.0, + "step": 6594 + }, + { + "entropy": 1.7409211297829945, + "epoch": 0.7245063305045178, + "grad_norm": 0.6343408823013306, + "learning_rate": 1.5275408892078967e-05, + "loss": 1.4509, + "mean_token_accuracy": 0.6523208022117615, + "num_tokens": 1108067869.0, + "step": 6595 + }, + { + "entropy": 1.7824281652768452, + "epoch": 0.7246161874158908, + "grad_norm": 0.7533055543899536, + "learning_rate": 1.527399989307628e-05, + "loss": 1.4195, + "mean_token_accuracy": 0.6448671966791153, + "num_tokens": 1108219158.0, + "step": 6596 + }, + { + "entropy": 1.7014999488989513, + "epoch": 0.7247260443272637, + "grad_norm": 0.5905172824859619, + "learning_rate": 1.5272590758803423e-05, + "loss": 1.4628, + "mean_token_accuracy": 0.6379047979911169, + "num_tokens": 1108422283.0, + "step": 6597 + }, + { + "entropy": 1.6868136525154114, + "epoch": 0.7248359012386367, + "grad_norm": 0.7195769548416138, + "learning_rate": 1.527118148930499e-05, + "loss": 1.4474, + "mean_token_accuracy": 0.6536327004432678, + "num_tokens": 1108576791.0, + "step": 6598 + }, + { + "entropy": 1.7102086345354717, + "epoch": 0.7249457581500096, + "grad_norm": 0.6632969975471497, + "learning_rate": 1.526977208462559e-05, + "loss": 1.3659, + "mean_token_accuracy": 0.6555024435122808, + "num_tokens": 1108758285.0, + "step": 6599 + }, + { + "entropy": 1.684918999671936, + "epoch": 0.7250556150613825, + "grad_norm": 1.003880262374878, + "learning_rate": 1.526836254480983e-05, + "loss": 1.523, + "mean_token_accuracy": 0.657150665918986, + "num_tokens": 1108915576.0, + "step": 6600 + }, + { + "entropy": 1.760528455177943, + "epoch": 0.7251654719727555, + "grad_norm": 0.6639096140861511, + "learning_rate": 1.5266952869902315e-05, + "loss": 1.3828, + "mean_token_accuracy": 0.6530559410651525, + "num_tokens": 1109037423.0, + "step": 6601 + }, + { + "entropy": 1.646875262260437, + "epoch": 0.7252753288841284, + "grad_norm": 0.7310366630554199, + "learning_rate": 1.526554305994766e-05, + "loss": 1.2516, + "mean_token_accuracy": 0.6824596722920736, + "num_tokens": 1109183215.0, + "step": 6602 + }, + { + "entropy": 1.6977095107237499, + "epoch": 0.7253851857955014, + "grad_norm": 0.6350131630897522, + "learning_rate": 1.5264133114990498e-05, + "loss": 1.4548, + "mean_token_accuracy": 0.6472870657841364, + "num_tokens": 1109397845.0, + "step": 6603 + }, + { + "entropy": 1.6626974542935689, + "epoch": 0.7254950427068743, + "grad_norm": 0.6890853047370911, + "learning_rate": 1.526272303507544e-05, + "loss": 1.3251, + "mean_token_accuracy": 0.6578503499428431, + "num_tokens": 1109534813.0, + "step": 6604 + }, + { + "entropy": 1.7217269043127696, + "epoch": 0.7256048996182473, + "grad_norm": 0.7920450568199158, + "learning_rate": 1.526131282024712e-05, + "loss": 1.4178, + "mean_token_accuracy": 0.6627766042947769, + "num_tokens": 1109701662.0, + "step": 6605 + }, + { + "entropy": 1.7304079035917919, + "epoch": 0.7257147565296201, + "grad_norm": 0.6736690402030945, + "learning_rate": 1.525990247055017e-05, + "loss": 1.582, + "mean_token_accuracy": 0.6449010322491328, + "num_tokens": 1109891057.0, + "step": 6606 + }, + { + "entropy": 1.7143594821294148, + "epoch": 0.7258246134409931, + "grad_norm": 0.813389778137207, + "learning_rate": 1.5258491986029224e-05, + "loss": 1.5102, + "mean_token_accuracy": 0.6494471182425817, + "num_tokens": 1110042199.0, + "step": 6607 + }, + { + "entropy": 1.712285617987315, + "epoch": 0.725934470352366, + "grad_norm": 0.651279628276825, + "learning_rate": 1.5257081366728928e-05, + "loss": 1.3512, + "mean_token_accuracy": 0.6540268957614899, + "num_tokens": 1110181003.0, + "step": 6608 + }, + { + "entropy": 1.7569693525632222, + "epoch": 0.726044327263739, + "grad_norm": 0.6964418292045593, + "learning_rate": 1.5255670612693925e-05, + "loss": 1.4252, + "mean_token_accuracy": 0.652028406659762, + "num_tokens": 1110325250.0, + "step": 6609 + }, + { + "entropy": 1.7216349244117737, + "epoch": 0.7261541841751119, + "grad_norm": 0.8492372035980225, + "learning_rate": 1.5254259723968865e-05, + "loss": 1.406, + "mean_token_accuracy": 0.6618664065996805, + "num_tokens": 1110484997.0, + "step": 6610 + }, + { + "entropy": 1.6996253232161205, + "epoch": 0.7262640410864849, + "grad_norm": 0.6752820014953613, + "learning_rate": 1.52528487005984e-05, + "loss": 1.3428, + "mean_token_accuracy": 0.6618599245945612, + "num_tokens": 1110628770.0, + "step": 6611 + }, + { + "entropy": 1.7417829434076946, + "epoch": 0.7263738979978578, + "grad_norm": 0.651860237121582, + "learning_rate": 1.525143754262719e-05, + "loss": 1.5468, + "mean_token_accuracy": 0.6249453624089559, + "num_tokens": 1110855352.0, + "step": 6612 + }, + { + "entropy": 1.6957463920116425, + "epoch": 0.7264837549092307, + "grad_norm": 0.6212682127952576, + "learning_rate": 1.5250026250099896e-05, + "loss": 1.328, + "mean_token_accuracy": 0.6674534380435944, + "num_tokens": 1111037352.0, + "step": 6613 + }, + { + "entropy": 1.721895823876063, + "epoch": 0.7265936118206037, + "grad_norm": 0.5673272013664246, + "learning_rate": 1.5248614823061191e-05, + "loss": 1.3398, + "mean_token_accuracy": 0.6586939742167791, + "num_tokens": 1111222031.0, + "step": 6614 + }, + { + "entropy": 1.6733955939610798, + "epoch": 0.7267034687319766, + "grad_norm": 0.696190595626831, + "learning_rate": 1.524720326155574e-05, + "loss": 1.3081, + "mean_token_accuracy": 0.670863464474678, + "num_tokens": 1111351621.0, + "step": 6615 + }, + { + "entropy": 1.7015343010425568, + "epoch": 0.7268133256433496, + "grad_norm": 0.8680276870727539, + "learning_rate": 1.5245791565628219e-05, + "loss": 1.554, + "mean_token_accuracy": 0.6325680613517761, + "num_tokens": 1111559646.0, + "step": 6616 + }, + { + "entropy": 1.6907628178596497, + "epoch": 0.7269231825547224, + "grad_norm": 0.8936082124710083, + "learning_rate": 1.5244379735323305e-05, + "loss": 1.4222, + "mean_token_accuracy": 0.6638303697109222, + "num_tokens": 1111676130.0, + "step": 6617 + }, + { + "entropy": 1.6800566116968791, + "epoch": 0.7270330394660954, + "grad_norm": 0.6707502603530884, + "learning_rate": 1.5242967770685688e-05, + "loss": 1.475, + "mean_token_accuracy": 0.6595744838317236, + "num_tokens": 1111839395.0, + "step": 6618 + }, + { + "entropy": 1.6874233186244965, + "epoch": 0.7271428963774683, + "grad_norm": 0.7718756198883057, + "learning_rate": 1.5241555671760053e-05, + "loss": 1.296, + "mean_token_accuracy": 0.673300489783287, + "num_tokens": 1111977599.0, + "step": 6619 + }, + { + "entropy": 1.69405393799146, + "epoch": 0.7272527532888413, + "grad_norm": 0.6836499571800232, + "learning_rate": 1.5240143438591091e-05, + "loss": 1.52, + "mean_token_accuracy": 0.6380777706702551, + "num_tokens": 1112169745.0, + "step": 6620 + }, + { + "entropy": 1.6723734835783641, + "epoch": 0.7273626102002142, + "grad_norm": 0.7240423560142517, + "learning_rate": 1.52387310712235e-05, + "loss": 1.445, + "mean_token_accuracy": 0.6521526724100113, + "num_tokens": 1112355144.0, + "step": 6621 + }, + { + "entropy": 1.6805169483025868, + "epoch": 0.7274724671115872, + "grad_norm": 0.6674152612686157, + "learning_rate": 1.5237318569701982e-05, + "loss": 1.4642, + "mean_token_accuracy": 0.6453837553660074, + "num_tokens": 1112530036.0, + "step": 6622 + }, + { + "entropy": 1.7313962876796722, + "epoch": 0.7275823240229601, + "grad_norm": 0.6953855156898499, + "learning_rate": 1.523590593407124e-05, + "loss": 1.3161, + "mean_token_accuracy": 0.6699342131614685, + "num_tokens": 1112683314.0, + "step": 6623 + }, + { + "entropy": 1.6776468753814697, + "epoch": 0.7276921809343331, + "grad_norm": 0.7314421534538269, + "learning_rate": 1.5234493164375983e-05, + "loss": 1.3186, + "mean_token_accuracy": 0.6604682207107544, + "num_tokens": 1112849572.0, + "step": 6624 + }, + { + "entropy": 1.774099330107371, + "epoch": 0.727802037845706, + "grad_norm": 0.9143105149269104, + "learning_rate": 1.5233080260660929e-05, + "loss": 1.3862, + "mean_token_accuracy": 0.6627939840157827, + "num_tokens": 1112960638.0, + "step": 6625 + }, + { + "entropy": 1.74024565021197, + "epoch": 0.727911894757079, + "grad_norm": 0.7166746854782104, + "learning_rate": 1.5231667222970788e-05, + "loss": 1.5266, + "mean_token_accuracy": 0.6335213532050451, + "num_tokens": 1113149401.0, + "step": 6626 + }, + { + "entropy": 1.6582687099774678, + "epoch": 0.7280217516684518, + "grad_norm": 0.6647648215293884, + "learning_rate": 1.5230254051350288e-05, + "loss": 1.3167, + "mean_token_accuracy": 0.6622982124487559, + "num_tokens": 1113296414.0, + "step": 6627 + }, + { + "entropy": 1.7777949670950572, + "epoch": 0.7281316085798247, + "grad_norm": 0.7266597151756287, + "learning_rate": 1.5228840745844154e-05, + "loss": 1.4685, + "mean_token_accuracy": 0.6511821498473486, + "num_tokens": 1113480827.0, + "step": 6628 + }, + { + "entropy": 1.702185720205307, + "epoch": 0.7282414654911977, + "grad_norm": 0.7669931650161743, + "learning_rate": 1.5227427306497113e-05, + "loss": 1.4266, + "mean_token_accuracy": 0.6498915751775106, + "num_tokens": 1113636000.0, + "step": 6629 + }, + { + "entropy": 1.69204247991244, + "epoch": 0.7283513224025706, + "grad_norm": 0.6504570841789246, + "learning_rate": 1.5226013733353906e-05, + "loss": 1.2188, + "mean_token_accuracy": 0.681893065571785, + "num_tokens": 1113756966.0, + "step": 6630 + }, + { + "entropy": 1.7312454879283905, + "epoch": 0.7284611793139436, + "grad_norm": 0.622042179107666, + "learning_rate": 1.5224600026459266e-05, + "loss": 1.4279, + "mean_token_accuracy": 0.6588011731704077, + "num_tokens": 1113941442.0, + "step": 6631 + }, + { + "entropy": 1.6893315315246582, + "epoch": 0.7285710362253165, + "grad_norm": 0.7400401830673218, + "learning_rate": 1.5223186185857941e-05, + "loss": 1.314, + "mean_token_accuracy": 0.6682803531487783, + "num_tokens": 1114137617.0, + "step": 6632 + }, + { + "entropy": 1.695889800786972, + "epoch": 0.7286808931366895, + "grad_norm": 0.7676869034767151, + "learning_rate": 1.5221772211594674e-05, + "loss": 1.4751, + "mean_token_accuracy": 0.6432386587063471, + "num_tokens": 1114363719.0, + "step": 6633 + }, + { + "entropy": 1.7293170789877574, + "epoch": 0.7287907500480624, + "grad_norm": 0.628367006778717, + "learning_rate": 1.5220358103714223e-05, + "loss": 1.472, + "mean_token_accuracy": 0.6526259730259577, + "num_tokens": 1114546800.0, + "step": 6634 + }, + { + "entropy": 1.6271907488505046, + "epoch": 0.7289006069594354, + "grad_norm": 0.5525145530700684, + "learning_rate": 1.5218943862261334e-05, + "loss": 1.3507, + "mean_token_accuracy": 0.6575760791699091, + "num_tokens": 1114731169.0, + "step": 6635 + }, + { + "entropy": 1.70510795712471, + "epoch": 0.7290104638708083, + "grad_norm": 0.5792953372001648, + "learning_rate": 1.5217529487280777e-05, + "loss": 1.3888, + "mean_token_accuracy": 0.6515243798494339, + "num_tokens": 1114929310.0, + "step": 6636 + }, + { + "entropy": 1.772267738978068, + "epoch": 0.7291203207821813, + "grad_norm": 0.7042428851127625, + "learning_rate": 1.5216114978817311e-05, + "loss": 1.393, + "mean_token_accuracy": 0.6505443006753922, + "num_tokens": 1115053969.0, + "step": 6637 + }, + { + "entropy": 1.7222981949647267, + "epoch": 0.7292301776935541, + "grad_norm": 0.7960110902786255, + "learning_rate": 1.5214700336915707e-05, + "loss": 1.3253, + "mean_token_accuracy": 0.6682598739862442, + "num_tokens": 1115182490.0, + "step": 6638 + }, + { + "entropy": 1.656973163286845, + "epoch": 0.7293400346049271, + "grad_norm": 0.8022185564041138, + "learning_rate": 1.5213285561620735e-05, + "loss": 1.3047, + "mean_token_accuracy": 0.678027073542277, + "num_tokens": 1115345148.0, + "step": 6639 + }, + { + "entropy": 1.6861250897248585, + "epoch": 0.7294498915163, + "grad_norm": 0.6490318179130554, + "learning_rate": 1.5211870652977174e-05, + "loss": 1.4897, + "mean_token_accuracy": 0.63862211505572, + "num_tokens": 1115591859.0, + "step": 6640 + }, + { + "entropy": 1.6945749620596569, + "epoch": 0.7295597484276729, + "grad_norm": 0.6802365779876709, + "learning_rate": 1.5210455611029805e-05, + "loss": 1.2691, + "mean_token_accuracy": 0.6764042377471924, + "num_tokens": 1115753223.0, + "step": 6641 + }, + { + "entropy": 1.6721658011277516, + "epoch": 0.7296696053390459, + "grad_norm": 0.7338157892227173, + "learning_rate": 1.5209040435823412e-05, + "loss": 1.2733, + "mean_token_accuracy": 0.6703089773654938, + "num_tokens": 1115947252.0, + "step": 6642 + }, + { + "entropy": 1.7122320334116619, + "epoch": 0.7297794622504188, + "grad_norm": 0.6685588955879211, + "learning_rate": 1.5207625127402788e-05, + "loss": 1.3575, + "mean_token_accuracy": 0.6568591793378195, + "num_tokens": 1116088045.0, + "step": 6643 + }, + { + "entropy": 1.7311648031075795, + "epoch": 0.7298893191617918, + "grad_norm": 0.635732114315033, + "learning_rate": 1.5206209685812723e-05, + "loss": 1.4057, + "mean_token_accuracy": 0.6452435304721197, + "num_tokens": 1116281246.0, + "step": 6644 + }, + { + "entropy": 1.6677932838598888, + "epoch": 0.7299991760731647, + "grad_norm": 0.6619640588760376, + "learning_rate": 1.5204794111098016e-05, + "loss": 1.4132, + "mean_token_accuracy": 0.6569789250691732, + "num_tokens": 1116507132.0, + "step": 6645 + }, + { + "entropy": 1.692247857650121, + "epoch": 0.7301090329845377, + "grad_norm": 0.6537544131278992, + "learning_rate": 1.5203378403303473e-05, + "loss": 1.3817, + "mean_token_accuracy": 0.6712391922871271, + "num_tokens": 1116674442.0, + "step": 6646 + }, + { + "entropy": 1.676435798406601, + "epoch": 0.7302188898959106, + "grad_norm": 0.5650108456611633, + "learning_rate": 1.5201962562473893e-05, + "loss": 1.4422, + "mean_token_accuracy": 0.6645029336214066, + "num_tokens": 1116857222.0, + "step": 6647 + }, + { + "entropy": 1.7497854729493458, + "epoch": 0.7303287468072835, + "grad_norm": 0.6814372539520264, + "learning_rate": 1.5200546588654097e-05, + "loss": 1.5072, + "mean_token_accuracy": 0.641438439488411, + "num_tokens": 1117030729.0, + "step": 6648 + }, + { + "entropy": 1.6965225736300151, + "epoch": 0.7304386037186564, + "grad_norm": 0.7143113017082214, + "learning_rate": 1.519913048188889e-05, + "loss": 1.4258, + "mean_token_accuracy": 0.6576728324095408, + "num_tokens": 1117220830.0, + "step": 6649 + }, + { + "entropy": 1.6916110416253407, + "epoch": 0.7305484606300294, + "grad_norm": 0.8342239260673523, + "learning_rate": 1.5197714242223098e-05, + "loss": 1.4201, + "mean_token_accuracy": 0.6518561790386835, + "num_tokens": 1117356587.0, + "step": 6650 + }, + { + "entropy": 1.6956707139809926, + "epoch": 0.7306583175414023, + "grad_norm": 0.6960749626159668, + "learning_rate": 1.519629786970154e-05, + "loss": 1.5202, + "mean_token_accuracy": 0.6355159282684326, + "num_tokens": 1117540824.0, + "step": 6651 + }, + { + "entropy": 1.6752854486306508, + "epoch": 0.7307681744527753, + "grad_norm": 0.6621298789978027, + "learning_rate": 1.5194881364369048e-05, + "loss": 1.198, + "mean_token_accuracy": 0.6848858445882797, + "num_tokens": 1117657941.0, + "step": 6652 + }, + { + "entropy": 1.7179746131102245, + "epoch": 0.7308780313641482, + "grad_norm": 0.6317608952522278, + "learning_rate": 1.5193464726270448e-05, + "loss": 1.3602, + "mean_token_accuracy": 0.6555174241463343, + "num_tokens": 1117821762.0, + "step": 6653 + }, + { + "entropy": 1.7452267309029896, + "epoch": 0.7309878882755211, + "grad_norm": 0.695980429649353, + "learning_rate": 1.519204795545058e-05, + "loss": 1.2723, + "mean_token_accuracy": 0.670079380273819, + "num_tokens": 1117938624.0, + "step": 6654 + }, + { + "entropy": 1.685364653666814, + "epoch": 0.7310977451868941, + "grad_norm": 0.6580285429954529, + "learning_rate": 1.5190631051954285e-05, + "loss": 1.4589, + "mean_token_accuracy": 0.6547621041536331, + "num_tokens": 1118134302.0, + "step": 6655 + }, + { + "entropy": 1.701217790444692, + "epoch": 0.731207602098267, + "grad_norm": 0.6681925058364868, + "learning_rate": 1.5189214015826406e-05, + "loss": 1.2259, + "mean_token_accuracy": 0.6868906915187836, + "num_tokens": 1118254275.0, + "step": 6656 + }, + { + "entropy": 1.7343595921993256, + "epoch": 0.73131745900964, + "grad_norm": 0.7051990628242493, + "learning_rate": 1.5187796847111787e-05, + "loss": 1.2701, + "mean_token_accuracy": 0.6716774702072144, + "num_tokens": 1118370267.0, + "step": 6657 + }, + { + "entropy": 1.7071207165718079, + "epoch": 0.7314273159210128, + "grad_norm": 0.6622605919837952, + "learning_rate": 1.5186379545855287e-05, + "loss": 1.4456, + "mean_token_accuracy": 0.6554889182249705, + "num_tokens": 1118535104.0, + "step": 6658 + }, + { + "entropy": 1.7222228546937306, + "epoch": 0.7315371728323858, + "grad_norm": 0.7932170033454895, + "learning_rate": 1.5184962112101762e-05, + "loss": 1.4314, + "mean_token_accuracy": 0.6692963143189748, + "num_tokens": 1118679063.0, + "step": 6659 + }, + { + "entropy": 1.6951739291350048, + "epoch": 0.7316470297437587, + "grad_norm": 0.7392410039901733, + "learning_rate": 1.5183544545896067e-05, + "loss": 1.4841, + "mean_token_accuracy": 0.6538377950588862, + "num_tokens": 1118849347.0, + "step": 6660 + }, + { + "entropy": 1.6979187230269115, + "epoch": 0.7317568866551317, + "grad_norm": 0.7129770517349243, + "learning_rate": 1.5182126847283079e-05, + "loss": 1.4111, + "mean_token_accuracy": 0.6520341485738754, + "num_tokens": 1119029143.0, + "step": 6661 + }, + { + "entropy": 1.738596349954605, + "epoch": 0.7318667435665046, + "grad_norm": 0.707251250743866, + "learning_rate": 1.5180709016307657e-05, + "loss": 1.3563, + "mean_token_accuracy": 0.6612697641054789, + "num_tokens": 1119159190.0, + "step": 6662 + }, + { + "entropy": 1.760772128899892, + "epoch": 0.7319766004778776, + "grad_norm": 0.9078652262687683, + "learning_rate": 1.5179291053014678e-05, + "loss": 1.5109, + "mean_token_accuracy": 0.648401752114296, + "num_tokens": 1119289035.0, + "step": 6663 + }, + { + "entropy": 1.6850820978482564, + "epoch": 0.7320864573892505, + "grad_norm": 0.6886321902275085, + "learning_rate": 1.5177872957449022e-05, + "loss": 1.2989, + "mean_token_accuracy": 0.6829408456881841, + "num_tokens": 1119453081.0, + "step": 6664 + }, + { + "entropy": 1.6729042033354442, + "epoch": 0.7321963143006235, + "grad_norm": 0.6785295605659485, + "learning_rate": 1.517645472965557e-05, + "loss": 1.4541, + "mean_token_accuracy": 0.6468785454829534, + "num_tokens": 1119599099.0, + "step": 6665 + }, + { + "entropy": 1.7227231860160828, + "epoch": 0.7323061712119964, + "grad_norm": 0.7047486305236816, + "learning_rate": 1.5175036369679207e-05, + "loss": 1.3855, + "mean_token_accuracy": 0.6632230083147684, + "num_tokens": 1119738114.0, + "step": 6666 + }, + { + "entropy": 1.6837018032868702, + "epoch": 0.7324160281233693, + "grad_norm": 0.6316218972206116, + "learning_rate": 1.5173617877564824e-05, + "loss": 1.4426, + "mean_token_accuracy": 0.6650246977806091, + "num_tokens": 1119931974.0, + "step": 6667 + }, + { + "entropy": 1.7190644443035126, + "epoch": 0.7325258850347423, + "grad_norm": 0.6559016108512878, + "learning_rate": 1.5172199253357317e-05, + "loss": 1.4886, + "mean_token_accuracy": 0.647669846812884, + "num_tokens": 1120121272.0, + "step": 6668 + }, + { + "entropy": 1.7384677827358246, + "epoch": 0.7326357419461151, + "grad_norm": 0.689016580581665, + "learning_rate": 1.517078049710158e-05, + "loss": 1.5965, + "mean_token_accuracy": 0.6388440877199173, + "num_tokens": 1120326088.0, + "step": 6669 + }, + { + "entropy": 1.7198711037635803, + "epoch": 0.7327455988574881, + "grad_norm": 0.7162541747093201, + "learning_rate": 1.5169361608842526e-05, + "loss": 1.4903, + "mean_token_accuracy": 0.6306491643190384, + "num_tokens": 1120527353.0, + "step": 6670 + }, + { + "entropy": 1.7130565146605174, + "epoch": 0.732855455768861, + "grad_norm": 0.7176745533943176, + "learning_rate": 1.5167942588625051e-05, + "loss": 1.4091, + "mean_token_accuracy": 0.663971463839213, + "num_tokens": 1120649569.0, + "step": 6671 + }, + { + "entropy": 1.691343088944753, + "epoch": 0.732965312680234, + "grad_norm": 0.7076248526573181, + "learning_rate": 1.516652343649407e-05, + "loss": 1.2405, + "mean_token_accuracy": 0.684244821468989, + "num_tokens": 1120793554.0, + "step": 6672 + }, + { + "entropy": 1.7118805746237438, + "epoch": 0.7330751695916069, + "grad_norm": 0.6533283591270447, + "learning_rate": 1.51651041524945e-05, + "loss": 1.3292, + "mean_token_accuracy": 0.6786542683839798, + "num_tokens": 1120953250.0, + "step": 6673 + }, + { + "entropy": 1.717886209487915, + "epoch": 0.7331850265029799, + "grad_norm": 0.6892806887626648, + "learning_rate": 1.5163684736671268e-05, + "loss": 1.4064, + "mean_token_accuracy": 0.6795545766750971, + "num_tokens": 1121130654.0, + "step": 6674 + }, + { + "entropy": 1.6847498218218486, + "epoch": 0.7332948834143528, + "grad_norm": 0.6819150447845459, + "learning_rate": 1.516226518906928e-05, + "loss": 1.4565, + "mean_token_accuracy": 0.635857825477918, + "num_tokens": 1121314947.0, + "step": 6675 + }, + { + "entropy": 1.7709390620390575, + "epoch": 0.7334047403257258, + "grad_norm": 0.7383188605308533, + "learning_rate": 1.5160845509733481e-05, + "loss": 1.4823, + "mean_token_accuracy": 0.6510516554117203, + "num_tokens": 1121457352.0, + "step": 6676 + }, + { + "entropy": 1.7243086993694305, + "epoch": 0.7335145972370987, + "grad_norm": 0.7374492287635803, + "learning_rate": 1.5159425698708794e-05, + "loss": 1.3823, + "mean_token_accuracy": 0.6554542581240336, + "num_tokens": 1121595888.0, + "step": 6677 + }, + { + "entropy": 1.7116446495056152, + "epoch": 0.7336244541484717, + "grad_norm": 0.7225131988525391, + "learning_rate": 1.515800575604016e-05, + "loss": 1.3005, + "mean_token_accuracy": 0.6611469139655431, + "num_tokens": 1121715635.0, + "step": 6678 + }, + { + "entropy": 1.7593301236629486, + "epoch": 0.7337343110598445, + "grad_norm": 0.6853554844856262, + "learning_rate": 1.5156585681772513e-05, + "loss": 1.4137, + "mean_token_accuracy": 0.652967189749082, + "num_tokens": 1121900551.0, + "step": 6679 + }, + { + "entropy": 1.6201580166816711, + "epoch": 0.7338441679712175, + "grad_norm": 0.5897448658943176, + "learning_rate": 1.5155165475950808e-05, + "loss": 1.3441, + "mean_token_accuracy": 0.6597137997547785, + "num_tokens": 1122074093.0, + "step": 6680 + }, + { + "entropy": 1.6828961670398712, + "epoch": 0.7339540248825904, + "grad_norm": 0.6277378797531128, + "learning_rate": 1.5153745138619984e-05, + "loss": 1.447, + "mean_token_accuracy": 0.6535770297050476, + "num_tokens": 1122262412.0, + "step": 6681 + }, + { + "entropy": 1.6912165582180023, + "epoch": 0.7340638817939633, + "grad_norm": 0.6900094747543335, + "learning_rate": 1.5152324669825001e-05, + "loss": 1.3992, + "mean_token_accuracy": 0.6562529653310776, + "num_tokens": 1122393860.0, + "step": 6682 + }, + { + "entropy": 1.7647963762283325, + "epoch": 0.7341737387053363, + "grad_norm": 0.7445904612541199, + "learning_rate": 1.515090406961081e-05, + "loss": 1.4102, + "mean_token_accuracy": 0.6538730363051096, + "num_tokens": 1122528985.0, + "step": 6683 + }, + { + "entropy": 1.6255736549695332, + "epoch": 0.7342835956167092, + "grad_norm": 0.7534268498420715, + "learning_rate": 1.514948333802238e-05, + "loss": 1.509, + "mean_token_accuracy": 0.6601062913735708, + "num_tokens": 1122731476.0, + "step": 6684 + }, + { + "entropy": 1.744109223286311, + "epoch": 0.7343934525280822, + "grad_norm": 0.6984800100326538, + "learning_rate": 1.5148062475104667e-05, + "loss": 1.3975, + "mean_token_accuracy": 0.6600144853194555, + "num_tokens": 1122910393.0, + "step": 6685 + }, + { + "entropy": 1.706167111794154, + "epoch": 0.7345033094394551, + "grad_norm": 0.7030271887779236, + "learning_rate": 1.5146641480902648e-05, + "loss": 1.3823, + "mean_token_accuracy": 0.6593709588050842, + "num_tokens": 1123088442.0, + "step": 6686 + }, + { + "entropy": 1.6892346441745758, + "epoch": 0.7346131663508281, + "grad_norm": 0.7129636406898499, + "learning_rate": 1.5145220355461296e-05, + "loss": 1.4505, + "mean_token_accuracy": 0.6492991894483566, + "num_tokens": 1123239118.0, + "step": 6687 + }, + { + "entropy": 1.6550804773966472, + "epoch": 0.734723023262201, + "grad_norm": 0.5693633556365967, + "learning_rate": 1.5143799098825587e-05, + "loss": 1.5521, + "mean_token_accuracy": 0.634413423637549, + "num_tokens": 1123485150.0, + "step": 6688 + }, + { + "entropy": 1.6616708040237427, + "epoch": 0.734832880173574, + "grad_norm": 0.5824199318885803, + "learning_rate": 1.5142377711040503e-05, + "loss": 1.3501, + "mean_token_accuracy": 0.65452907482783, + "num_tokens": 1123694935.0, + "step": 6689 + }, + { + "entropy": 1.6947866678237915, + "epoch": 0.7349427370849468, + "grad_norm": 0.6208266019821167, + "learning_rate": 1.5140956192151031e-05, + "loss": 1.581, + "mean_token_accuracy": 0.6196437428394953, + "num_tokens": 1123927009.0, + "step": 6690 + }, + { + "entropy": 1.6772996087869008, + "epoch": 0.7350525939963198, + "grad_norm": 0.6417631506919861, + "learning_rate": 1.513953454220216e-05, + "loss": 1.3748, + "mean_token_accuracy": 0.6572922120491663, + "num_tokens": 1124106818.0, + "step": 6691 + }, + { + "entropy": 1.721063772837321, + "epoch": 0.7351624509076927, + "grad_norm": 0.7408942580223083, + "learning_rate": 1.513811276123889e-05, + "loss": 1.4452, + "mean_token_accuracy": 0.6464604238669077, + "num_tokens": 1124260837.0, + "step": 6692 + }, + { + "entropy": 1.7039151688416798, + "epoch": 0.7352723078190657, + "grad_norm": 0.7227491736412048, + "learning_rate": 1.5136690849306212e-05, + "loss": 1.3876, + "mean_token_accuracy": 0.6548017660776774, + "num_tokens": 1124411691.0, + "step": 6693 + }, + { + "entropy": 1.6541140377521515, + "epoch": 0.7353821647304386, + "grad_norm": 0.5854305624961853, + "learning_rate": 1.5135268806449135e-05, + "loss": 1.4233, + "mean_token_accuracy": 0.6561457067728043, + "num_tokens": 1124624577.0, + "step": 6694 + }, + { + "entropy": 1.6717216869195302, + "epoch": 0.7354920216418115, + "grad_norm": 0.6732227206230164, + "learning_rate": 1.5133846632712663e-05, + "loss": 1.3833, + "mean_token_accuracy": 0.6642757703860601, + "num_tokens": 1124813586.0, + "step": 6695 + }, + { + "entropy": 1.6729457378387451, + "epoch": 0.7356018785531845, + "grad_norm": 0.7442759871482849, + "learning_rate": 1.5132424328141809e-05, + "loss": 1.3488, + "mean_token_accuracy": 0.660913089911143, + "num_tokens": 1124978782.0, + "step": 6696 + }, + { + "entropy": 1.6723896364370983, + "epoch": 0.7357117354645574, + "grad_norm": 0.663541853427887, + "learning_rate": 1.5131001892781582e-05, + "loss": 1.3012, + "mean_token_accuracy": 0.6680503934621811, + "num_tokens": 1125122428.0, + "step": 6697 + }, + { + "entropy": 1.746447930733363, + "epoch": 0.7358215923759304, + "grad_norm": 0.8296138048171997, + "learning_rate": 1.5129579326677014e-05, + "loss": 1.3793, + "mean_token_accuracy": 0.6552826712528864, + "num_tokens": 1125290284.0, + "step": 6698 + }, + { + "entropy": 1.7190218269824982, + "epoch": 0.7359314492873033, + "grad_norm": 0.7751715779304504, + "learning_rate": 1.5128156629873119e-05, + "loss": 1.406, + "mean_token_accuracy": 0.6542918781439463, + "num_tokens": 1125443248.0, + "step": 6699 + }, + { + "entropy": 1.7631977200508118, + "epoch": 0.7360413061986762, + "grad_norm": 0.6478453874588013, + "learning_rate": 1.5126733802414923e-05, + "loss": 1.3893, + "mean_token_accuracy": 0.6554517845312754, + "num_tokens": 1125635355.0, + "step": 6700 + }, + { + "entropy": 1.6814933717250824, + "epoch": 0.7361511631100491, + "grad_norm": 0.6561465859413147, + "learning_rate": 1.5125310844347465e-05, + "loss": 1.4587, + "mean_token_accuracy": 0.6471677968899409, + "num_tokens": 1125845108.0, + "step": 6701 + }, + { + "entropy": 1.5653251310189564, + "epoch": 0.7362610200214221, + "grad_norm": 0.704756498336792, + "learning_rate": 1.5123887755715776e-05, + "loss": 1.288, + "mean_token_accuracy": 0.6854538271824518, + "num_tokens": 1125981759.0, + "step": 6702 + }, + { + "entropy": 1.7314506371815999, + "epoch": 0.736370876932795, + "grad_norm": 0.724329948425293, + "learning_rate": 1.5122464536564899e-05, + "loss": 1.4032, + "mean_token_accuracy": 0.6662786255280176, + "num_tokens": 1126133016.0, + "step": 6703 + }, + { + "entropy": 1.6579938729604085, + "epoch": 0.736480733844168, + "grad_norm": 0.6255090236663818, + "learning_rate": 1.5121041186939877e-05, + "loss": 1.3151, + "mean_token_accuracy": 0.6845894455909729, + "num_tokens": 1126274029.0, + "step": 6704 + }, + { + "entropy": 1.7049907743930817, + "epoch": 0.7365905907555409, + "grad_norm": 0.6977643370628357, + "learning_rate": 1.5119617706885759e-05, + "loss": 1.455, + "mean_token_accuracy": 0.6570224811633428, + "num_tokens": 1126430889.0, + "step": 6705 + }, + { + "entropy": 1.6987931430339813, + "epoch": 0.7367004476669139, + "grad_norm": 0.5982023477554321, + "learning_rate": 1.5118194096447595e-05, + "loss": 1.5509, + "mean_token_accuracy": 0.6404446264108022, + "num_tokens": 1126622602.0, + "step": 6706 + }, + { + "entropy": 1.678564767042796, + "epoch": 0.7368103045782868, + "grad_norm": 0.7278105020523071, + "learning_rate": 1.5116770355670443e-05, + "loss": 1.2952, + "mean_token_accuracy": 0.67676875491937, + "num_tokens": 1126798370.0, + "step": 6707 + }, + { + "entropy": 1.6989206870396931, + "epoch": 0.7369201614896597, + "grad_norm": 0.6880453824996948, + "learning_rate": 1.5115346484599369e-05, + "loss": 1.382, + "mean_token_accuracy": 0.6712295562028885, + "num_tokens": 1126918091.0, + "step": 6708 + }, + { + "entropy": 1.7392151554425557, + "epoch": 0.7370300184010327, + "grad_norm": 1.0246655941009521, + "learning_rate": 1.5113922483279428e-05, + "loss": 1.6634, + "mean_token_accuracy": 0.6349669992923737, + "num_tokens": 1127107995.0, + "step": 6709 + }, + { + "entropy": 1.707019825776418, + "epoch": 0.7371398753124055, + "grad_norm": 0.5890080332756042, + "learning_rate": 1.5112498351755698e-05, + "loss": 1.4471, + "mean_token_accuracy": 0.6488803972800573, + "num_tokens": 1127258053.0, + "step": 6710 + }, + { + "entropy": 1.7253030637900035, + "epoch": 0.7372497322237785, + "grad_norm": 0.8130516409873962, + "learning_rate": 1.5111074090073245e-05, + "loss": 1.3519, + "mean_token_accuracy": 0.6647070497274399, + "num_tokens": 1127396234.0, + "step": 6711 + }, + { + "entropy": 1.673587401707967, + "epoch": 0.7373595891351514, + "grad_norm": 0.7452590465545654, + "learning_rate": 1.5109649698277154e-05, + "loss": 1.3056, + "mean_token_accuracy": 0.6725350320339203, + "num_tokens": 1127561050.0, + "step": 6712 + }, + { + "entropy": 1.679761916399002, + "epoch": 0.7374694460465244, + "grad_norm": 0.6038820743560791, + "learning_rate": 1.5108225176412494e-05, + "loss": 1.3374, + "mean_token_accuracy": 0.6583557625611623, + "num_tokens": 1127773253.0, + "step": 6713 + }, + { + "entropy": 1.717817982037862, + "epoch": 0.7375793029578973, + "grad_norm": 0.7298420667648315, + "learning_rate": 1.5106800524524367e-05, + "loss": 1.2114, + "mean_token_accuracy": 0.685623566309611, + "num_tokens": 1127909193.0, + "step": 6714 + }, + { + "entropy": 1.695088545481364, + "epoch": 0.7376891598692703, + "grad_norm": 0.6675203442573547, + "learning_rate": 1.510537574265785e-05, + "loss": 1.3353, + "mean_token_accuracy": 0.6633873581886292, + "num_tokens": 1128068140.0, + "step": 6715 + }, + { + "entropy": 1.734484702348709, + "epoch": 0.7377990167806432, + "grad_norm": 0.7346919178962708, + "learning_rate": 1.5103950830858041e-05, + "loss": 1.3236, + "mean_token_accuracy": 0.6552731692790985, + "num_tokens": 1128212239.0, + "step": 6716 + }, + { + "entropy": 1.7688794334729512, + "epoch": 0.7379088736920162, + "grad_norm": 0.6720606684684753, + "learning_rate": 1.5102525789170038e-05, + "loss": 1.5391, + "mean_token_accuracy": 0.6279580891132355, + "num_tokens": 1128447592.0, + "step": 6717 + }, + { + "entropy": 1.6898958086967468, + "epoch": 0.7380187306033891, + "grad_norm": 0.6676945090293884, + "learning_rate": 1.5101100617638943e-05, + "loss": 1.4134, + "mean_token_accuracy": 0.6584409524997076, + "num_tokens": 1128590607.0, + "step": 6718 + }, + { + "entropy": 1.6703394452730815, + "epoch": 0.7381285875147621, + "grad_norm": 0.6979921460151672, + "learning_rate": 1.5099675316309857e-05, + "loss": 1.3771, + "mean_token_accuracy": 0.6564718584219614, + "num_tokens": 1128763833.0, + "step": 6719 + }, + { + "entropy": 1.6774761080741882, + "epoch": 0.738238444426135, + "grad_norm": 0.6202614903450012, + "learning_rate": 1.50982498852279e-05, + "loss": 1.4414, + "mean_token_accuracy": 0.6443298210700353, + "num_tokens": 1128979972.0, + "step": 6720 + }, + { + "entropy": 1.6933607856432598, + "epoch": 0.738348301337508, + "grad_norm": 0.6849549412727356, + "learning_rate": 1.5096824324438178e-05, + "loss": 1.3212, + "mean_token_accuracy": 0.6674212664365768, + "num_tokens": 1129105832.0, + "step": 6721 + }, + { + "entropy": 1.6850066979726155, + "epoch": 0.7384581582488808, + "grad_norm": 0.6466884613037109, + "learning_rate": 1.5095398633985812e-05, + "loss": 1.3967, + "mean_token_accuracy": 0.6724070111910502, + "num_tokens": 1129257945.0, + "step": 6722 + }, + { + "entropy": 1.751002699136734, + "epoch": 0.7385680151602537, + "grad_norm": 0.8040537238121033, + "learning_rate": 1.5093972813915927e-05, + "loss": 1.3518, + "mean_token_accuracy": 0.6617087076107661, + "num_tokens": 1129373817.0, + "step": 6723 + }, + { + "entropy": 1.7356711030006409, + "epoch": 0.7386778720716267, + "grad_norm": 0.7240248918533325, + "learning_rate": 1.5092546864273648e-05, + "loss": 1.268, + "mean_token_accuracy": 0.6819742123285929, + "num_tokens": 1129493491.0, + "step": 6724 + }, + { + "entropy": 1.7143625020980835, + "epoch": 0.7387877289829996, + "grad_norm": 0.6687737107276917, + "learning_rate": 1.50911207851041e-05, + "loss": 1.3479, + "mean_token_accuracy": 0.655364657441775, + "num_tokens": 1129641923.0, + "step": 6725 + }, + { + "entropy": 1.776674618323644, + "epoch": 0.7388975858943726, + "grad_norm": 0.7817396521568298, + "learning_rate": 1.5089694576452425e-05, + "loss": 1.3725, + "mean_token_accuracy": 0.6603845258553823, + "num_tokens": 1129787182.0, + "step": 6726 + }, + { + "entropy": 1.681551843881607, + "epoch": 0.7390074428057455, + "grad_norm": 0.643803596496582, + "learning_rate": 1.5088268238363762e-05, + "loss": 1.358, + "mean_token_accuracy": 0.6494590491056442, + "num_tokens": 1129974180.0, + "step": 6727 + }, + { + "entropy": 1.7054549753665924, + "epoch": 0.7391172997171185, + "grad_norm": 0.7394840121269226, + "learning_rate": 1.5086841770883249e-05, + "loss": 1.479, + "mean_token_accuracy": 0.6421494533618292, + "num_tokens": 1130137129.0, + "step": 6728 + }, + { + "entropy": 1.661314348379771, + "epoch": 0.7392271566284914, + "grad_norm": 0.8410232663154602, + "learning_rate": 1.5085415174056035e-05, + "loss": 1.2465, + "mean_token_accuracy": 0.6789174030224482, + "num_tokens": 1130250429.0, + "step": 6729 + }, + { + "entropy": 1.718008428812027, + "epoch": 0.7393370135398644, + "grad_norm": 0.6935721039772034, + "learning_rate": 1.5083988447927276e-05, + "loss": 1.2534, + "mean_token_accuracy": 0.6772228926420212, + "num_tokens": 1130373436.0, + "step": 6730 + }, + { + "entropy": 1.6716387967268627, + "epoch": 0.7394468704512372, + "grad_norm": 0.7610313892364502, + "learning_rate": 1.5082561592542115e-05, + "loss": 1.2726, + "mean_token_accuracy": 0.674076090256373, + "num_tokens": 1130495850.0, + "step": 6731 + }, + { + "entropy": 1.725304255882899, + "epoch": 0.7395567273626102, + "grad_norm": 0.8041244149208069, + "learning_rate": 1.5081134607945726e-05, + "loss": 1.274, + "mean_token_accuracy": 0.6769175430138906, + "num_tokens": 1130644766.0, + "step": 6732 + }, + { + "entropy": 1.7322080036004384, + "epoch": 0.7396665842739831, + "grad_norm": 0.6417209506034851, + "learning_rate": 1.5079707494183265e-05, + "loss": 1.3872, + "mean_token_accuracy": 0.6669703970352808, + "num_tokens": 1130810397.0, + "step": 6733 + }, + { + "entropy": 1.6604767839113872, + "epoch": 0.7397764411853561, + "grad_norm": 0.6830531358718872, + "learning_rate": 1.5078280251299898e-05, + "loss": 1.3999, + "mean_token_accuracy": 0.657826155424118, + "num_tokens": 1130962006.0, + "step": 6734 + }, + { + "entropy": 1.7110504806041718, + "epoch": 0.739886298096729, + "grad_norm": 0.7035920023918152, + "learning_rate": 1.5076852879340798e-05, + "loss": 1.3654, + "mean_token_accuracy": 0.6586341708898544, + "num_tokens": 1131090714.0, + "step": 6735 + }, + { + "entropy": 1.6222879389921825, + "epoch": 0.7399961550081019, + "grad_norm": 0.7651909589767456, + "learning_rate": 1.5075425378351143e-05, + "loss": 1.4796, + "mean_token_accuracy": 0.6476244777441025, + "num_tokens": 1131286901.0, + "step": 6736 + }, + { + "entropy": 1.6963482002417247, + "epoch": 0.7401060119194749, + "grad_norm": 0.6514533162117004, + "learning_rate": 1.507399774837611e-05, + "loss": 1.4149, + "mean_token_accuracy": 0.6608110070228577, + "num_tokens": 1131442624.0, + "step": 6737 + }, + { + "entropy": 1.6943640112876892, + "epoch": 0.7402158688308478, + "grad_norm": 0.7217374444007874, + "learning_rate": 1.5072569989460887e-05, + "loss": 1.4165, + "mean_token_accuracy": 0.664705902338028, + "num_tokens": 1131640297.0, + "step": 6738 + }, + { + "entropy": 1.7367797791957855, + "epoch": 0.7403257257422208, + "grad_norm": 0.6871684789657593, + "learning_rate": 1.5071142101650657e-05, + "loss": 1.4446, + "mean_token_accuracy": 0.6475637157758077, + "num_tokens": 1131801764.0, + "step": 6739 + }, + { + "entropy": 1.6569513181845348, + "epoch": 0.7404355826535937, + "grad_norm": 0.6715342998504639, + "learning_rate": 1.5069714084990614e-05, + "loss": 1.2538, + "mean_token_accuracy": 0.6771769026915232, + "num_tokens": 1131961068.0, + "step": 6740 + }, + { + "entropy": 1.648532897233963, + "epoch": 0.7405454395649667, + "grad_norm": 0.724288821220398, + "learning_rate": 1.5068285939525953e-05, + "loss": 1.3426, + "mean_token_accuracy": 0.6620112607876459, + "num_tokens": 1132131553.0, + "step": 6741 + }, + { + "entropy": 1.7068750858306885, + "epoch": 0.7406552964763395, + "grad_norm": 0.7151614427566528, + "learning_rate": 1.506685766530188e-05, + "loss": 1.328, + "mean_token_accuracy": 0.6632406115531921, + "num_tokens": 1132259340.0, + "step": 6742 + }, + { + "entropy": 1.713490217924118, + "epoch": 0.7407651533877125, + "grad_norm": 0.7277394533157349, + "learning_rate": 1.506542926236359e-05, + "loss": 1.3877, + "mean_token_accuracy": 0.6614874800046285, + "num_tokens": 1132391138.0, + "step": 6743 + }, + { + "entropy": 1.700178434451421, + "epoch": 0.7408750102990854, + "grad_norm": 0.6973790526390076, + "learning_rate": 1.5064000730756295e-05, + "loss": 1.4149, + "mean_token_accuracy": 0.6597117880980173, + "num_tokens": 1132531234.0, + "step": 6744 + }, + { + "entropy": 1.6388778189818065, + "epoch": 0.7409848672104584, + "grad_norm": 0.6643558144569397, + "learning_rate": 1.5062572070525207e-05, + "loss": 1.4025, + "mean_token_accuracy": 0.6725161075592041, + "num_tokens": 1132734802.0, + "step": 6745 + }, + { + "entropy": 1.7185613016287486, + "epoch": 0.7410947241218313, + "grad_norm": 0.8322924971580505, + "learning_rate": 1.5061143281715552e-05, + "loss": 1.4067, + "mean_token_accuracy": 0.6537605971097946, + "num_tokens": 1132895353.0, + "step": 6746 + }, + { + "entropy": 1.7120015025138855, + "epoch": 0.7412045810332043, + "grad_norm": 0.725736677646637, + "learning_rate": 1.5059714364372531e-05, + "loss": 1.4964, + "mean_token_accuracy": 0.6400942405064901, + "num_tokens": 1133050277.0, + "step": 6747 + }, + { + "entropy": 1.677657534678777, + "epoch": 0.7413144379445772, + "grad_norm": 0.7027127742767334, + "learning_rate": 1.5058285318541389e-05, + "loss": 1.5079, + "mean_token_accuracy": 0.6412277817726135, + "num_tokens": 1133231985.0, + "step": 6748 + }, + { + "entropy": 1.6989895105361938, + "epoch": 0.7414242948559501, + "grad_norm": 0.6422317028045654, + "learning_rate": 1.505685614426734e-05, + "loss": 1.398, + "mean_token_accuracy": 0.6447745362917582, + "num_tokens": 1133401721.0, + "step": 6749 + }, + { + "entropy": 1.7608485122521718, + "epoch": 0.7415341517673231, + "grad_norm": 0.6693912744522095, + "learning_rate": 1.5055426841595624e-05, + "loss": 1.3246, + "mean_token_accuracy": 0.6594583491484324, + "num_tokens": 1133512280.0, + "step": 6750 + }, + { + "entropy": 1.6934813757737477, + "epoch": 0.741644008678696, + "grad_norm": 0.692389965057373, + "learning_rate": 1.5053997410571474e-05, + "loss": 1.6025, + "mean_token_accuracy": 0.6274192283550898, + "num_tokens": 1133772590.0, + "step": 6751 + }, + { + "entropy": 1.7201940218607585, + "epoch": 0.741753865590069, + "grad_norm": 0.700623095035553, + "learning_rate": 1.5052567851240138e-05, + "loss": 1.3729, + "mean_token_accuracy": 0.6719879905382792, + "num_tokens": 1133893711.0, + "step": 6752 + }, + { + "entropy": 1.7405705253283184, + "epoch": 0.7418637225014418, + "grad_norm": 0.7587204575538635, + "learning_rate": 1.5051138163646848e-05, + "loss": 1.5049, + "mean_token_accuracy": 0.6506867110729218, + "num_tokens": 1134043053.0, + "step": 6753 + }, + { + "entropy": 1.7238063216209412, + "epoch": 0.7419735794128148, + "grad_norm": 0.6716615557670593, + "learning_rate": 1.5049708347836866e-05, + "loss": 1.6108, + "mean_token_accuracy": 0.6179195394118627, + "num_tokens": 1134248453.0, + "step": 6754 + }, + { + "entropy": 1.6737729807694752, + "epoch": 0.7420834363241877, + "grad_norm": 0.7254316210746765, + "learning_rate": 1.5048278403855439e-05, + "loss": 1.3895, + "mean_token_accuracy": 0.6687343964974085, + "num_tokens": 1134417667.0, + "step": 6755 + }, + { + "entropy": 1.6903795301914215, + "epoch": 0.7421932932355607, + "grad_norm": 0.6885725855827332, + "learning_rate": 1.5046848331747822e-05, + "loss": 1.324, + "mean_token_accuracy": 0.6620573401451111, + "num_tokens": 1134596695.0, + "step": 6756 + }, + { + "entropy": 1.6956494649251301, + "epoch": 0.7423031501469336, + "grad_norm": 0.7012706398963928, + "learning_rate": 1.5045418131559281e-05, + "loss": 1.3519, + "mean_token_accuracy": 0.6658426324526469, + "num_tokens": 1134747187.0, + "step": 6757 + }, + { + "entropy": 1.7318655947844188, + "epoch": 0.7424130070583066, + "grad_norm": 0.8121592402458191, + "learning_rate": 1.5043987803335081e-05, + "loss": 1.4545, + "mean_token_accuracy": 0.6543787519137064, + "num_tokens": 1134884449.0, + "step": 6758 + }, + { + "entropy": 1.7426222761472066, + "epoch": 0.7425228639696795, + "grad_norm": 0.7535271644592285, + "learning_rate": 1.5042557347120486e-05, + "loss": 1.3212, + "mean_token_accuracy": 0.6727963835000992, + "num_tokens": 1135027665.0, + "step": 6759 + }, + { + "entropy": 1.6979150076707203, + "epoch": 0.7426327208810525, + "grad_norm": 0.8373980522155762, + "learning_rate": 1.5041126762960774e-05, + "loss": 1.3267, + "mean_token_accuracy": 0.6588234305381775, + "num_tokens": 1135172577.0, + "step": 6760 + }, + { + "entropy": 1.700280745824178, + "epoch": 0.7427425777924254, + "grad_norm": 0.7200369834899902, + "learning_rate": 1.503969605090122e-05, + "loss": 1.3094, + "mean_token_accuracy": 0.6659737030665079, + "num_tokens": 1135299828.0, + "step": 6761 + }, + { + "entropy": 1.7104643682638805, + "epoch": 0.7428524347037982, + "grad_norm": 0.8385793566703796, + "learning_rate": 1.5038265210987109e-05, + "loss": 1.3116, + "mean_token_accuracy": 0.677546814084053, + "num_tokens": 1135459026.0, + "step": 6762 + }, + { + "entropy": 1.6308595538139343, + "epoch": 0.7429622916151712, + "grad_norm": 0.6230477690696716, + "learning_rate": 1.5036834243263718e-05, + "loss": 1.4281, + "mean_token_accuracy": 0.6566774696111679, + "num_tokens": 1135670612.0, + "step": 6763 + }, + { + "entropy": 1.709012786547343, + "epoch": 0.7430721485265441, + "grad_norm": 0.9405829906463623, + "learning_rate": 1.5035403147776348e-05, + "loss": 1.3462, + "mean_token_accuracy": 0.662533774971962, + "num_tokens": 1135792652.0, + "step": 6764 + }, + { + "entropy": 1.6998238166173298, + "epoch": 0.7431820054379171, + "grad_norm": 0.7661788463592529, + "learning_rate": 1.5033971924570283e-05, + "loss": 1.3654, + "mean_token_accuracy": 0.6749661912520727, + "num_tokens": 1135970182.0, + "step": 6765 + }, + { + "entropy": 1.6655668715635936, + "epoch": 0.74329186234929, + "grad_norm": 0.7080719470977783, + "learning_rate": 1.5032540573690828e-05, + "loss": 1.4305, + "mean_token_accuracy": 0.651120533545812, + "num_tokens": 1136142615.0, + "step": 6766 + }, + { + "entropy": 1.7483255763848622, + "epoch": 0.743401719260663, + "grad_norm": 0.7917311191558838, + "learning_rate": 1.5031109095183278e-05, + "loss": 1.5119, + "mean_token_accuracy": 0.6534301191568375, + "num_tokens": 1136350269.0, + "step": 6767 + }, + { + "entropy": 1.633179912964503, + "epoch": 0.7435115761720359, + "grad_norm": 0.5961988568305969, + "learning_rate": 1.5029677489092944e-05, + "loss": 1.3725, + "mean_token_accuracy": 0.6645344644784927, + "num_tokens": 1136531760.0, + "step": 6768 + }, + { + "entropy": 1.7083572447299957, + "epoch": 0.7436214330834089, + "grad_norm": 0.7430605292320251, + "learning_rate": 1.5028245755465129e-05, + "loss": 1.3712, + "mean_token_accuracy": 0.6828558494647344, + "num_tokens": 1136674495.0, + "step": 6769 + }, + { + "entropy": 1.6955423951148987, + "epoch": 0.7437312899947818, + "grad_norm": 0.6523454785346985, + "learning_rate": 1.5026813894345159e-05, + "loss": 1.4493, + "mean_token_accuracy": 0.6469675749540329, + "num_tokens": 1136869151.0, + "step": 6770 + }, + { + "entropy": 1.7560264070828755, + "epoch": 0.7438411469061548, + "grad_norm": 0.6248143911361694, + "learning_rate": 1.5025381905778336e-05, + "loss": 1.392, + "mean_token_accuracy": 0.6438594659169515, + "num_tokens": 1137053577.0, + "step": 6771 + }, + { + "entropy": 1.7154277463754017, + "epoch": 0.7439510038175277, + "grad_norm": 0.6773508191108704, + "learning_rate": 1.5023949789809991e-05, + "loss": 1.3957, + "mean_token_accuracy": 0.6574154595534006, + "num_tokens": 1137208972.0, + "step": 6772 + }, + { + "entropy": 1.6735303401947021, + "epoch": 0.7440608607289007, + "grad_norm": 0.7054452896118164, + "learning_rate": 1.5022517546485451e-05, + "loss": 1.2731, + "mean_token_accuracy": 0.6664699663718542, + "num_tokens": 1137320522.0, + "step": 6773 + }, + { + "entropy": 1.7122906744480133, + "epoch": 0.7441707176402735, + "grad_norm": 0.6373655796051025, + "learning_rate": 1.502108517585004e-05, + "loss": 1.4198, + "mean_token_accuracy": 0.6484536776940028, + "num_tokens": 1137490263.0, + "step": 6774 + }, + { + "entropy": 1.6470833718776703, + "epoch": 0.7442805745516465, + "grad_norm": 0.6568727493286133, + "learning_rate": 1.50196526779491e-05, + "loss": 1.441, + "mean_token_accuracy": 0.647697259982427, + "num_tokens": 1137726786.0, + "step": 6775 + }, + { + "entropy": 1.6590462823708851, + "epoch": 0.7443904314630194, + "grad_norm": 0.8282439112663269, + "learning_rate": 1.501822005282796e-05, + "loss": 1.3672, + "mean_token_accuracy": 0.6522834698359171, + "num_tokens": 1137919330.0, + "step": 6776 + }, + { + "entropy": 1.6876604358355205, + "epoch": 0.7445002883743923, + "grad_norm": 0.6368933320045471, + "learning_rate": 1.5016787300531965e-05, + "loss": 1.3694, + "mean_token_accuracy": 0.6590426663557688, + "num_tokens": 1138061019.0, + "step": 6777 + }, + { + "entropy": 1.6871869663397472, + "epoch": 0.7446101452857653, + "grad_norm": 0.6821638345718384, + "learning_rate": 1.5015354421106464e-05, + "loss": 1.5103, + "mean_token_accuracy": 0.6524971077839533, + "num_tokens": 1138230811.0, + "step": 6778 + }, + { + "entropy": 1.6363123655319214, + "epoch": 0.7447200021971382, + "grad_norm": 0.6159114837646484, + "learning_rate": 1.5013921414596806e-05, + "loss": 1.3712, + "mean_token_accuracy": 0.6576385100682577, + "num_tokens": 1138422996.0, + "step": 6779 + }, + { + "entropy": 1.6642758349577587, + "epoch": 0.7448298591085112, + "grad_norm": 0.6397916674613953, + "learning_rate": 1.5012488281048344e-05, + "loss": 1.3105, + "mean_token_accuracy": 0.6616799881060919, + "num_tokens": 1138559976.0, + "step": 6780 + }, + { + "entropy": 1.6806075970331829, + "epoch": 0.7449397160198841, + "grad_norm": 0.6554465889930725, + "learning_rate": 1.5011055020506432e-05, + "loss": 1.3143, + "mean_token_accuracy": 0.6696526308854421, + "num_tokens": 1138716321.0, + "step": 6781 + }, + { + "entropy": 1.7737302879492443, + "epoch": 0.7450495729312571, + "grad_norm": 0.7125297784805298, + "learning_rate": 1.500962163301644e-05, + "loss": 1.3339, + "mean_token_accuracy": 0.6506142367919286, + "num_tokens": 1138844468.0, + "step": 6782 + }, + { + "entropy": 1.7082662880420685, + "epoch": 0.74515942984263, + "grad_norm": 0.690348744392395, + "learning_rate": 1.500818811862373e-05, + "loss": 1.5161, + "mean_token_accuracy": 0.643420398235321, + "num_tokens": 1138993642.0, + "step": 6783 + }, + { + "entropy": 1.7463260293006897, + "epoch": 0.745269286754003, + "grad_norm": 1.0192396640777588, + "learning_rate": 1.500675447737367e-05, + "loss": 1.5818, + "mean_token_accuracy": 0.6677199751138687, + "num_tokens": 1139162652.0, + "step": 6784 + }, + { + "entropy": 1.6664335330327351, + "epoch": 0.7453791436653758, + "grad_norm": 0.6164513826370239, + "learning_rate": 1.5005320709311638e-05, + "loss": 1.4417, + "mean_token_accuracy": 0.6483336140712103, + "num_tokens": 1139341551.0, + "step": 6785 + }, + { + "entropy": 1.7031769156455994, + "epoch": 0.7454890005767488, + "grad_norm": 0.8442233204841614, + "learning_rate": 1.5003886814483011e-05, + "loss": 1.302, + "mean_token_accuracy": 0.6629199633995692, + "num_tokens": 1139484565.0, + "step": 6786 + }, + { + "entropy": 1.7150315344333649, + "epoch": 0.7455988574881217, + "grad_norm": 0.7291525602340698, + "learning_rate": 1.5002452792933166e-05, + "loss": 1.3973, + "mean_token_accuracy": 0.6579045653343201, + "num_tokens": 1139653727.0, + "step": 6787 + }, + { + "entropy": 1.6477097769578297, + "epoch": 0.7457087143994947, + "grad_norm": 0.6176000833511353, + "learning_rate": 1.50010186447075e-05, + "loss": 1.2534, + "mean_token_accuracy": 0.6797188719113668, + "num_tokens": 1139808285.0, + "step": 6788 + }, + { + "entropy": 1.6606711745262146, + "epoch": 0.7458185713108676, + "grad_norm": 0.705842912197113, + "learning_rate": 1.4999584369851392e-05, + "loss": 1.4349, + "mean_token_accuracy": 0.6436782528956732, + "num_tokens": 1140008061.0, + "step": 6789 + }, + { + "entropy": 1.7517486015955608, + "epoch": 0.7459284282222405, + "grad_norm": 0.625238835811615, + "learning_rate": 1.4998149968410243e-05, + "loss": 1.4634, + "mean_token_accuracy": 0.625215545296669, + "num_tokens": 1140198927.0, + "step": 6790 + }, + { + "entropy": 1.7464244266351063, + "epoch": 0.7460382851336135, + "grad_norm": 0.7666485905647278, + "learning_rate": 1.4996715440429447e-05, + "loss": 1.4979, + "mean_token_accuracy": 0.6369941085577011, + "num_tokens": 1140359162.0, + "step": 6791 + }, + { + "entropy": 1.667342593272527, + "epoch": 0.7461481420449864, + "grad_norm": 0.6632201671600342, + "learning_rate": 1.4995280785954413e-05, + "loss": 1.3564, + "mean_token_accuracy": 0.6578471561272939, + "num_tokens": 1140549770.0, + "step": 6792 + }, + { + "entropy": 1.7453742424647014, + "epoch": 0.7462579989563594, + "grad_norm": 0.7090116739273071, + "learning_rate": 1.4993846005030537e-05, + "loss": 1.4562, + "mean_token_accuracy": 0.653822178641955, + "num_tokens": 1140699614.0, + "step": 6793 + }, + { + "entropy": 1.7000041206677754, + "epoch": 0.7463678558677322, + "grad_norm": 0.7104377150535583, + "learning_rate": 1.4992411097703237e-05, + "loss": 1.3719, + "mean_token_accuracy": 0.6573426475127538, + "num_tokens": 1140867840.0, + "step": 6794 + }, + { + "entropy": 1.6723633507887523, + "epoch": 0.7464777127791052, + "grad_norm": 0.7225411534309387, + "learning_rate": 1.4990976064017925e-05, + "loss": 1.2767, + "mean_token_accuracy": 0.6692603131135305, + "num_tokens": 1140998012.0, + "step": 6795 + }, + { + "entropy": 1.6979938050111134, + "epoch": 0.7465875696904781, + "grad_norm": 0.6646689176559448, + "learning_rate": 1.4989540904020018e-05, + "loss": 1.2908, + "mean_token_accuracy": 0.6694482167561849, + "num_tokens": 1141117803.0, + "step": 6796 + }, + { + "entropy": 1.68376824259758, + "epoch": 0.7466974266018511, + "grad_norm": 0.6983800530433655, + "learning_rate": 1.4988105617754942e-05, + "loss": 1.5562, + "mean_token_accuracy": 0.6386436770359675, + "num_tokens": 1141368730.0, + "step": 6797 + }, + { + "entropy": 1.6934349636236827, + "epoch": 0.746807283513224, + "grad_norm": 0.9968954920768738, + "learning_rate": 1.498667020526812e-05, + "loss": 1.4534, + "mean_token_accuracy": 0.6503161440292994, + "num_tokens": 1141576192.0, + "step": 6798 + }, + { + "entropy": 1.7412570118904114, + "epoch": 0.746917140424597, + "grad_norm": 0.5900823473930359, + "learning_rate": 1.4985234666604978e-05, + "loss": 1.5797, + "mean_token_accuracy": 0.6377448340257009, + "num_tokens": 1141774836.0, + "step": 6799 + }, + { + "entropy": 1.659286359945933, + "epoch": 0.7470269973359699, + "grad_norm": 0.6186773180961609, + "learning_rate": 1.4983799001810957e-05, + "loss": 1.39, + "mean_token_accuracy": 0.6724599103132883, + "num_tokens": 1141944431.0, + "step": 6800 + }, + { + "entropy": 1.6813781360785167, + "epoch": 0.7471368542473429, + "grad_norm": 0.7697616219520569, + "learning_rate": 1.4982363210931495e-05, + "loss": 1.3546, + "mean_token_accuracy": 0.6622022340695063, + "num_tokens": 1142128048.0, + "step": 6801 + }, + { + "entropy": 1.7053708632787068, + "epoch": 0.7472467111587158, + "grad_norm": 0.625506579875946, + "learning_rate": 1.498092729401203e-05, + "loss": 1.3282, + "mean_token_accuracy": 0.6647703299919764, + "num_tokens": 1142326201.0, + "step": 6802 + }, + { + "entropy": 1.668170581261317, + "epoch": 0.7473565680700887, + "grad_norm": 0.6538956761360168, + "learning_rate": 1.4979491251098008e-05, + "loss": 1.3052, + "mean_token_accuracy": 0.6595268547534943, + "num_tokens": 1142502759.0, + "step": 6803 + }, + { + "entropy": 1.6516262590885162, + "epoch": 0.7474664249814617, + "grad_norm": 0.5898981690406799, + "learning_rate": 1.4978055082234883e-05, + "loss": 1.3253, + "mean_token_accuracy": 0.6630469312270483, + "num_tokens": 1142677014.0, + "step": 6804 + }, + { + "entropy": 1.7548083265622456, + "epoch": 0.7475762818928345, + "grad_norm": 0.7785349488258362, + "learning_rate": 1.4976618787468109e-05, + "loss": 1.3407, + "mean_token_accuracy": 0.6611214627822241, + "num_tokens": 1142821725.0, + "step": 6805 + }, + { + "entropy": 1.6554772853851318, + "epoch": 0.7476861388042075, + "grad_norm": 0.6662288308143616, + "learning_rate": 1.497518236684314e-05, + "loss": 1.4193, + "mean_token_accuracy": 0.6535109380880991, + "num_tokens": 1143014793.0, + "step": 6806 + }, + { + "entropy": 1.6847680111726124, + "epoch": 0.7477959957155804, + "grad_norm": 0.7195385098457336, + "learning_rate": 1.4973745820405442e-05, + "loss": 1.3784, + "mean_token_accuracy": 0.6672850747903188, + "num_tokens": 1143166310.0, + "step": 6807 + }, + { + "entropy": 1.7305493354797363, + "epoch": 0.7479058526269534, + "grad_norm": 0.7656927704811096, + "learning_rate": 1.497230914820048e-05, + "loss": 1.4317, + "mean_token_accuracy": 0.656554693977038, + "num_tokens": 1143316576.0, + "step": 6808 + }, + { + "entropy": 1.6923631529013317, + "epoch": 0.7480157095383263, + "grad_norm": 0.7529177069664001, + "learning_rate": 1.4970872350273717e-05, + "loss": 1.1828, + "mean_token_accuracy": 0.688283234834671, + "num_tokens": 1143455530.0, + "step": 6809 + }, + { + "entropy": 1.7461569805939992, + "epoch": 0.7481255664496993, + "grad_norm": 0.5634061694145203, + "learning_rate": 1.496943542667064e-05, + "loss": 1.3979, + "mean_token_accuracy": 0.6512090861797333, + "num_tokens": 1143626650.0, + "step": 6810 + }, + { + "entropy": 1.6781017482280731, + "epoch": 0.7482354233610722, + "grad_norm": 0.7326881885528564, + "learning_rate": 1.4967998377436717e-05, + "loss": 1.4369, + "mean_token_accuracy": 0.6579112311204275, + "num_tokens": 1143767777.0, + "step": 6811 + }, + { + "entropy": 1.6676452855269115, + "epoch": 0.7483452802724452, + "grad_norm": 0.8091786503791809, + "learning_rate": 1.4966561202617435e-05, + "loss": 1.4482, + "mean_token_accuracy": 0.6634651124477386, + "num_tokens": 1143955831.0, + "step": 6812 + }, + { + "entropy": 1.7290521661440532, + "epoch": 0.7484551371838181, + "grad_norm": 0.6584422588348389, + "learning_rate": 1.4965123902258279e-05, + "loss": 1.3347, + "mean_token_accuracy": 0.6581239700317383, + "num_tokens": 1144071834.0, + "step": 6813 + }, + { + "entropy": 1.6535277366638184, + "epoch": 0.7485649940951911, + "grad_norm": 0.6501061916351318, + "learning_rate": 1.4963686476404737e-05, + "loss": 1.4189, + "mean_token_accuracy": 0.6524655272563299, + "num_tokens": 1144258749.0, + "step": 6814 + }, + { + "entropy": 1.6799985071023305, + "epoch": 0.748674851006564, + "grad_norm": 0.8732005953788757, + "learning_rate": 1.4962248925102305e-05, + "loss": 1.4393, + "mean_token_accuracy": 0.6538581599791845, + "num_tokens": 1144394205.0, + "step": 6815 + }, + { + "entropy": 1.7935606241226196, + "epoch": 0.7487847079179369, + "grad_norm": 0.7209190726280212, + "learning_rate": 1.496081124839648e-05, + "loss": 1.4205, + "mean_token_accuracy": 0.6454559167226156, + "num_tokens": 1144522534.0, + "step": 6816 + }, + { + "entropy": 1.6710129082202911, + "epoch": 0.7488945648293098, + "grad_norm": 0.5794231295585632, + "learning_rate": 1.4959373446332762e-05, + "loss": 1.3592, + "mean_token_accuracy": 0.6566446522871653, + "num_tokens": 1144710283.0, + "step": 6817 + }, + { + "entropy": 1.6727528870105743, + "epoch": 0.7490044217406827, + "grad_norm": 1.6803122758865356, + "learning_rate": 1.4957935518956658e-05, + "loss": 1.3172, + "mean_token_accuracy": 0.6655105352401733, + "num_tokens": 1144865494.0, + "step": 6818 + }, + { + "entropy": 1.6749801139036815, + "epoch": 0.7491142786520557, + "grad_norm": 0.6018057465553284, + "learning_rate": 1.4956497466313682e-05, + "loss": 1.3718, + "mean_token_accuracy": 0.6562728782494863, + "num_tokens": 1145060596.0, + "step": 6819 + }, + { + "entropy": 1.6236327687899272, + "epoch": 0.7492241355634286, + "grad_norm": 0.7080875635147095, + "learning_rate": 1.4955059288449343e-05, + "loss": 1.4342, + "mean_token_accuracy": 0.6543013006448746, + "num_tokens": 1145299929.0, + "step": 6820 + }, + { + "entropy": 1.7417059342066448, + "epoch": 0.7493339924748016, + "grad_norm": 0.6856288313865662, + "learning_rate": 1.4953620985409156e-05, + "loss": 1.4524, + "mean_token_accuracy": 0.6499587247769038, + "num_tokens": 1145473231.0, + "step": 6821 + }, + { + "entropy": 1.7538205881913502, + "epoch": 0.7494438493861745, + "grad_norm": 0.6540340781211853, + "learning_rate": 1.495218255723865e-05, + "loss": 1.4127, + "mean_token_accuracy": 0.6443704416354498, + "num_tokens": 1145635824.0, + "step": 6822 + }, + { + "entropy": 1.8043759365876515, + "epoch": 0.7495537062975475, + "grad_norm": 0.7414424419403076, + "learning_rate": 1.4950744003983346e-05, + "loss": 1.2803, + "mean_token_accuracy": 0.6785912364721298, + "num_tokens": 1145771367.0, + "step": 6823 + }, + { + "entropy": 1.6939500371615093, + "epoch": 0.7496635632089204, + "grad_norm": 0.7015360593795776, + "learning_rate": 1.4949305325688776e-05, + "loss": 1.2973, + "mean_token_accuracy": 0.6717601070801417, + "num_tokens": 1145898356.0, + "step": 6824 + }, + { + "entropy": 1.719683289527893, + "epoch": 0.7497734201202934, + "grad_norm": 0.7223207950592041, + "learning_rate": 1.4947866522400469e-05, + "loss": 1.3485, + "mean_token_accuracy": 0.6610392481088638, + "num_tokens": 1146043519.0, + "step": 6825 + }, + { + "entropy": 1.726772169272105, + "epoch": 0.7498832770316662, + "grad_norm": 0.7367146611213684, + "learning_rate": 1.494642759416397e-05, + "loss": 1.3865, + "mean_token_accuracy": 0.6567636330922445, + "num_tokens": 1146210946.0, + "step": 6826 + }, + { + "entropy": 1.6836354335149128, + "epoch": 0.7499931339430392, + "grad_norm": 0.6234486103057861, + "learning_rate": 1.494498854102481e-05, + "loss": 1.4224, + "mean_token_accuracy": 0.6549923866987228, + "num_tokens": 1146431560.0, + "step": 6827 + }, + { + "entropy": 1.7541552980740864, + "epoch": 0.7501029908544121, + "grad_norm": 0.6722849607467651, + "learning_rate": 1.4943549363028544e-05, + "loss": 1.4151, + "mean_token_accuracy": 0.6468459516763687, + "num_tokens": 1146608356.0, + "step": 6828 + }, + { + "entropy": 1.7377861142158508, + "epoch": 0.7502128477657851, + "grad_norm": 0.6956329941749573, + "learning_rate": 1.4942110060220718e-05, + "loss": 1.492, + "mean_token_accuracy": 0.645117849111557, + "num_tokens": 1146763510.0, + "step": 6829 + }, + { + "entropy": 1.7091150482495625, + "epoch": 0.750322704677158, + "grad_norm": 0.5567704439163208, + "learning_rate": 1.4940670632646886e-05, + "loss": 1.512, + "mean_token_accuracy": 0.6451121767361959, + "num_tokens": 1146972167.0, + "step": 6830 + }, + { + "entropy": 1.7323314944903057, + "epoch": 0.7504325615885309, + "grad_norm": 0.6515035629272461, + "learning_rate": 1.49392310803526e-05, + "loss": 1.4759, + "mean_token_accuracy": 0.6642122765382131, + "num_tokens": 1147157436.0, + "step": 6831 + }, + { + "entropy": 1.6438543101151784, + "epoch": 0.7505424184999039, + "grad_norm": 0.826503574848175, + "learning_rate": 1.4937791403383429e-05, + "loss": 1.2688, + "mean_token_accuracy": 0.6719188491503397, + "num_tokens": 1147301711.0, + "step": 6832 + }, + { + "entropy": 1.7450671792030334, + "epoch": 0.7506522754112768, + "grad_norm": 0.7970007658004761, + "learning_rate": 1.4936351601784936e-05, + "loss": 1.4347, + "mean_token_accuracy": 0.6608226199944814, + "num_tokens": 1147448757.0, + "step": 6833 + }, + { + "entropy": 1.7097887794176738, + "epoch": 0.7507621323226498, + "grad_norm": 0.6782563328742981, + "learning_rate": 1.4934911675602684e-05, + "loss": 1.4367, + "mean_token_accuracy": 0.6495286400119463, + "num_tokens": 1147583520.0, + "step": 6834 + }, + { + "entropy": 1.6653445859750111, + "epoch": 0.7508719892340227, + "grad_norm": 0.6241797804832458, + "learning_rate": 1.4933471624882252e-05, + "loss": 1.4375, + "mean_token_accuracy": 0.6648624936739603, + "num_tokens": 1147772528.0, + "step": 6835 + }, + { + "entropy": 1.6597016155719757, + "epoch": 0.7509818461453956, + "grad_norm": 0.6311061382293701, + "learning_rate": 1.4932031449669216e-05, + "loss": 1.3328, + "mean_token_accuracy": 0.6625900516907374, + "num_tokens": 1147914543.0, + "step": 6836 + }, + { + "entropy": 1.7220198810100555, + "epoch": 0.7510917030567685, + "grad_norm": 0.6737807393074036, + "learning_rate": 1.4930591150009153e-05, + "loss": 1.3936, + "mean_token_accuracy": 0.6545198758443197, + "num_tokens": 1148060878.0, + "step": 6837 + }, + { + "entropy": 1.7228451172510784, + "epoch": 0.7512015599681415, + "grad_norm": 0.5968870520591736, + "learning_rate": 1.4929150725947657e-05, + "loss": 1.4637, + "mean_token_accuracy": 0.6343479951222738, + "num_tokens": 1148284690.0, + "step": 6838 + }, + { + "entropy": 1.7416771451632183, + "epoch": 0.7513114168795144, + "grad_norm": 0.6965200901031494, + "learning_rate": 1.4927710177530308e-05, + "loss": 1.553, + "mean_token_accuracy": 0.6420968150099119, + "num_tokens": 1148448260.0, + "step": 6839 + }, + { + "entropy": 1.6946365038553874, + "epoch": 0.7514212737908874, + "grad_norm": 0.6708457469940186, + "learning_rate": 1.4926269504802702e-05, + "loss": 1.3724, + "mean_token_accuracy": 0.6507139702637991, + "num_tokens": 1148660677.0, + "step": 6840 + }, + { + "entropy": 1.7274916470050812, + "epoch": 0.7515311307022603, + "grad_norm": 0.8043560981750488, + "learning_rate": 1.4924828707810434e-05, + "loss": 1.4065, + "mean_token_accuracy": 0.6550286362568537, + "num_tokens": 1148845422.0, + "step": 6841 + }, + { + "entropy": 1.6953127483526866, + "epoch": 0.7516409876136333, + "grad_norm": 0.615106463432312, + "learning_rate": 1.4923387786599111e-05, + "loss": 1.3215, + "mean_token_accuracy": 0.6688494135936102, + "num_tokens": 1148998827.0, + "step": 6842 + }, + { + "entropy": 1.6872650881608326, + "epoch": 0.7517508445250062, + "grad_norm": 0.6760332584381104, + "learning_rate": 1.4921946741214328e-05, + "loss": 1.4611, + "mean_token_accuracy": 0.6474284629027048, + "num_tokens": 1149163526.0, + "step": 6843 + }, + { + "entropy": 1.7162937223911285, + "epoch": 0.7518607014363791, + "grad_norm": 0.6299443244934082, + "learning_rate": 1.49205055717017e-05, + "loss": 1.4696, + "mean_token_accuracy": 0.6503280848264694, + "num_tokens": 1149379885.0, + "step": 6844 + }, + { + "entropy": 1.707916518052419, + "epoch": 0.7519705583477521, + "grad_norm": 0.6752136945724487, + "learning_rate": 1.4919064278106837e-05, + "loss": 1.3816, + "mean_token_accuracy": 0.66462242603302, + "num_tokens": 1149545097.0, + "step": 6845 + }, + { + "entropy": 1.708972801764806, + "epoch": 0.752080415259125, + "grad_norm": 0.746783435344696, + "learning_rate": 1.4917622860475355e-05, + "loss": 1.292, + "mean_token_accuracy": 0.6678305218617121, + "num_tokens": 1149699714.0, + "step": 6846 + }, + { + "entropy": 1.7258944114049275, + "epoch": 0.7521902721704979, + "grad_norm": 0.7096854448318481, + "learning_rate": 1.4916181318852872e-05, + "loss": 1.5354, + "mean_token_accuracy": 0.6471205502748489, + "num_tokens": 1149911864.0, + "step": 6847 + }, + { + "entropy": 1.7283643583456676, + "epoch": 0.7523001290818708, + "grad_norm": 0.8113722801208496, + "learning_rate": 1.491473965328502e-05, + "loss": 1.5158, + "mean_token_accuracy": 0.6531914075215658, + "num_tokens": 1150075121.0, + "step": 6848 + }, + { + "entropy": 1.6454201638698578, + "epoch": 0.7524099859932438, + "grad_norm": 0.6000940203666687, + "learning_rate": 1.4913297863817417e-05, + "loss": 1.3858, + "mean_token_accuracy": 0.6618871788183848, + "num_tokens": 1150257842.0, + "step": 6849 + }, + { + "entropy": 1.7098387082417805, + "epoch": 0.7525198429046167, + "grad_norm": 0.6875969767570496, + "learning_rate": 1.4911855950495707e-05, + "loss": 1.5528, + "mean_token_accuracy": 0.6489702612161636, + "num_tokens": 1150479789.0, + "step": 6850 + }, + { + "entropy": 1.7067073086897533, + "epoch": 0.7526296998159897, + "grad_norm": 0.6003955602645874, + "learning_rate": 1.4910413913365511e-05, + "loss": 1.4514, + "mean_token_accuracy": 0.6396404554446539, + "num_tokens": 1150666377.0, + "step": 6851 + }, + { + "entropy": 1.7154656648635864, + "epoch": 0.7527395567273626, + "grad_norm": 0.7262822389602661, + "learning_rate": 1.490897175247248e-05, + "loss": 1.3599, + "mean_token_accuracy": 0.6746832331021627, + "num_tokens": 1150801744.0, + "step": 6852 + }, + { + "entropy": 1.7396377523740132, + "epoch": 0.7528494136387356, + "grad_norm": 0.6769723892211914, + "learning_rate": 1.4907529467862254e-05, + "loss": 1.6661, + "mean_token_accuracy": 0.6230086013674736, + "num_tokens": 1151017918.0, + "step": 6853 + }, + { + "entropy": 1.6834344764550526, + "epoch": 0.7529592705501085, + "grad_norm": 0.580190122127533, + "learning_rate": 1.4906087059580483e-05, + "loss": 1.3398, + "mean_token_accuracy": 0.6577950765689214, + "num_tokens": 1151202898.0, + "step": 6854 + }, + { + "entropy": 1.7219790021578472, + "epoch": 0.7530691274614815, + "grad_norm": 0.7568797469139099, + "learning_rate": 1.4904644527672813e-05, + "loss": 1.3304, + "mean_token_accuracy": 0.657778725028038, + "num_tokens": 1151353607.0, + "step": 6855 + }, + { + "entropy": 1.7640255590279896, + "epoch": 0.7531789843728544, + "grad_norm": 0.7201240658760071, + "learning_rate": 1.4903201872184909e-05, + "loss": 1.5171, + "mean_token_accuracy": 0.6397636433442434, + "num_tokens": 1151519388.0, + "step": 6856 + }, + { + "entropy": 1.6551097631454468, + "epoch": 0.7532888412842272, + "grad_norm": 0.6681106686592102, + "learning_rate": 1.4901759093162423e-05, + "loss": 1.2624, + "mean_token_accuracy": 0.6762852072715759, + "num_tokens": 1151672154.0, + "step": 6857 + }, + { + "entropy": 1.6735802292823792, + "epoch": 0.7533986981956002, + "grad_norm": 0.635806679725647, + "learning_rate": 1.4900316190651013e-05, + "loss": 1.422, + "mean_token_accuracy": 0.6669272085030874, + "num_tokens": 1151816351.0, + "step": 6858 + }, + { + "entropy": 1.7274777193864186, + "epoch": 0.7535085551069731, + "grad_norm": 0.7612840533256531, + "learning_rate": 1.4898873164696361e-05, + "loss": 1.2724, + "mean_token_accuracy": 0.668310264746348, + "num_tokens": 1151944344.0, + "step": 6859 + }, + { + "entropy": 1.7404861251513164, + "epoch": 0.7536184120183461, + "grad_norm": 0.7002642750740051, + "learning_rate": 1.4897430015344128e-05, + "loss": 1.376, + "mean_token_accuracy": 0.6616505285104116, + "num_tokens": 1152096710.0, + "step": 6860 + }, + { + "entropy": 1.7473096946875255, + "epoch": 0.753728268929719, + "grad_norm": 0.9548206925392151, + "learning_rate": 1.489598674263999e-05, + "loss": 1.5039, + "mean_token_accuracy": 0.6459807008504868, + "num_tokens": 1152222865.0, + "step": 6861 + }, + { + "entropy": 1.7407631874084473, + "epoch": 0.753838125841092, + "grad_norm": 0.7236599326133728, + "learning_rate": 1.4894543346629628e-05, + "loss": 1.4074, + "mean_token_accuracy": 0.6461221228043238, + "num_tokens": 1152370024.0, + "step": 6862 + }, + { + "entropy": 1.706801136334737, + "epoch": 0.7539479827524649, + "grad_norm": 0.6131123304367065, + "learning_rate": 1.4893099827358725e-05, + "loss": 1.4282, + "mean_token_accuracy": 0.640210434794426, + "num_tokens": 1152551295.0, + "step": 6863 + }, + { + "entropy": 1.7038409014542897, + "epoch": 0.7540578396638379, + "grad_norm": 0.849195122718811, + "learning_rate": 1.4891656184872967e-05, + "loss": 1.4797, + "mean_token_accuracy": 0.6482670257488886, + "num_tokens": 1152715725.0, + "step": 6864 + }, + { + "entropy": 1.671141008536021, + "epoch": 0.7541676965752108, + "grad_norm": 0.7273076772689819, + "learning_rate": 1.4890212419218042e-05, + "loss": 1.3456, + "mean_token_accuracy": 0.6643229325612386, + "num_tokens": 1152856152.0, + "step": 6865 + }, + { + "entropy": 1.7077520688374836, + "epoch": 0.7542775534865838, + "grad_norm": 0.6826111078262329, + "learning_rate": 1.4888768530439648e-05, + "loss": 1.3934, + "mean_token_accuracy": 0.6760113835334778, + "num_tokens": 1153011787.0, + "step": 6866 + }, + { + "entropy": 1.7440832058588664, + "epoch": 0.7543874103979566, + "grad_norm": 0.766875684261322, + "learning_rate": 1.4887324518583482e-05, + "loss": 1.5279, + "mean_token_accuracy": 0.6481309731801351, + "num_tokens": 1153194581.0, + "step": 6867 + }, + { + "entropy": 1.696602463722229, + "epoch": 0.7544972673093296, + "grad_norm": 0.6051673293113708, + "learning_rate": 1.4885880383695245e-05, + "loss": 1.3711, + "mean_token_accuracy": 0.6530511478583018, + "num_tokens": 1153426371.0, + "step": 6868 + }, + { + "entropy": 1.6467144290606182, + "epoch": 0.7546071242207025, + "grad_norm": 0.7292453646659851, + "learning_rate": 1.4884436125820647e-05, + "loss": 1.4731, + "mean_token_accuracy": 0.6530012140671412, + "num_tokens": 1153573291.0, + "step": 6869 + }, + { + "entropy": 1.7314301331837971, + "epoch": 0.7547169811320755, + "grad_norm": 0.6091862916946411, + "learning_rate": 1.4882991745005398e-05, + "loss": 1.4244, + "mean_token_accuracy": 0.6590274671713511, + "num_tokens": 1153732528.0, + "step": 6870 + }, + { + "entropy": 1.7819582720597584, + "epoch": 0.7548268380434484, + "grad_norm": 0.7857415676116943, + "learning_rate": 1.4881547241295207e-05, + "loss": 1.491, + "mean_token_accuracy": 0.650216872493426, + "num_tokens": 1153921075.0, + "step": 6871 + }, + { + "entropy": 1.723918507496516, + "epoch": 0.7549366949548213, + "grad_norm": 0.8788068294525146, + "learning_rate": 1.4880102614735793e-05, + "loss": 1.3552, + "mean_token_accuracy": 0.6704768786827723, + "num_tokens": 1154081656.0, + "step": 6872 + }, + { + "entropy": 1.762155642112096, + "epoch": 0.7550465518661943, + "grad_norm": 0.6839030981063843, + "learning_rate": 1.4878657865372885e-05, + "loss": 1.4846, + "mean_token_accuracy": 0.6490776985883713, + "num_tokens": 1154251448.0, + "step": 6873 + }, + { + "entropy": 1.69782950480779, + "epoch": 0.7551564087775672, + "grad_norm": 0.710850715637207, + "learning_rate": 1.48772129932522e-05, + "loss": 1.3765, + "mean_token_accuracy": 0.6583685626586279, + "num_tokens": 1154429721.0, + "step": 6874 + }, + { + "entropy": 1.6837405562400818, + "epoch": 0.7552662656889402, + "grad_norm": 0.6713438630104065, + "learning_rate": 1.487576799841947e-05, + "loss": 1.3999, + "mean_token_accuracy": 0.6706645538409551, + "num_tokens": 1154576709.0, + "step": 6875 + }, + { + "entropy": 1.697175920009613, + "epoch": 0.7553761226003131, + "grad_norm": 0.6505449414253235, + "learning_rate": 1.4874322880920433e-05, + "loss": 1.4259, + "mean_token_accuracy": 0.6693024138609568, + "num_tokens": 1154713459.0, + "step": 6876 + }, + { + "entropy": 1.6716215113798778, + "epoch": 0.7554859795116861, + "grad_norm": 0.6013683080673218, + "learning_rate": 1.4872877640800818e-05, + "loss": 1.4185, + "mean_token_accuracy": 0.6550382524728775, + "num_tokens": 1154916105.0, + "step": 6877 + }, + { + "entropy": 1.727581520875295, + "epoch": 0.7555958364230589, + "grad_norm": 0.634597897529602, + "learning_rate": 1.4871432278106376e-05, + "loss": 1.4924, + "mean_token_accuracy": 0.6589693377415339, + "num_tokens": 1155124626.0, + "step": 6878 + }, + { + "entropy": 1.6853844324747722, + "epoch": 0.7557056933344319, + "grad_norm": 0.7113041281700134, + "learning_rate": 1.4869986792882842e-05, + "loss": 1.393, + "mean_token_accuracy": 0.6586426496505737, + "num_tokens": 1155315768.0, + "step": 6879 + }, + { + "entropy": 1.720129370689392, + "epoch": 0.7558155502458048, + "grad_norm": 0.758216381072998, + "learning_rate": 1.4868541185175973e-05, + "loss": 1.2764, + "mean_token_accuracy": 0.6894825349251429, + "num_tokens": 1155458375.0, + "step": 6880 + }, + { + "entropy": 1.7519591550032299, + "epoch": 0.7559254071571778, + "grad_norm": 0.785953164100647, + "learning_rate": 1.4867095455031515e-05, + "loss": 1.4353, + "mean_token_accuracy": 0.6612924883762995, + "num_tokens": 1155625543.0, + "step": 6881 + }, + { + "entropy": 1.7473149696985881, + "epoch": 0.7560352640685507, + "grad_norm": 0.7062848210334778, + "learning_rate": 1.4865649602495233e-05, + "loss": 1.4931, + "mean_token_accuracy": 0.6371675978104273, + "num_tokens": 1155805083.0, + "step": 6882 + }, + { + "entropy": 1.6948178907235463, + "epoch": 0.7561451209799237, + "grad_norm": 0.6578991413116455, + "learning_rate": 1.4864203627612878e-05, + "loss": 1.2472, + "mean_token_accuracy": 0.6706608285506567, + "num_tokens": 1155948315.0, + "step": 6883 + }, + { + "entropy": 1.7165430684884389, + "epoch": 0.7562549778912966, + "grad_norm": 0.6664071083068848, + "learning_rate": 1.4862757530430228e-05, + "loss": 1.3434, + "mean_token_accuracy": 0.6707089493672053, + "num_tokens": 1156127831.0, + "step": 6884 + }, + { + "entropy": 1.643526017665863, + "epoch": 0.7563648348026695, + "grad_norm": 0.7331792116165161, + "learning_rate": 1.4861311310993037e-05, + "loss": 1.3877, + "mean_token_accuracy": 0.6548557827870051, + "num_tokens": 1156308750.0, + "step": 6885 + }, + { + "entropy": 1.709174503882726, + "epoch": 0.7564746917140425, + "grad_norm": 0.6824013590812683, + "learning_rate": 1.485986496934708e-05, + "loss": 1.3598, + "mean_token_accuracy": 0.6605821500221888, + "num_tokens": 1156454330.0, + "step": 6886 + }, + { + "entropy": 1.6876440346240997, + "epoch": 0.7565845486254154, + "grad_norm": 0.7022239565849304, + "learning_rate": 1.485841850553814e-05, + "loss": 1.4559, + "mean_token_accuracy": 0.659210721651713, + "num_tokens": 1156623224.0, + "step": 6887 + }, + { + "entropy": 1.670984039704005, + "epoch": 0.7566944055367884, + "grad_norm": 0.8806616067886353, + "learning_rate": 1.4856971919611993e-05, + "loss": 1.5488, + "mean_token_accuracy": 0.6296228965123495, + "num_tokens": 1156899766.0, + "step": 6888 + }, + { + "entropy": 1.6433234910170238, + "epoch": 0.7568042624481612, + "grad_norm": 0.6585554480552673, + "learning_rate": 1.485552521161442e-05, + "loss": 1.2847, + "mean_token_accuracy": 0.663401777545611, + "num_tokens": 1157043630.0, + "step": 6889 + }, + { + "entropy": 1.7140738268693287, + "epoch": 0.7569141193595342, + "grad_norm": 0.6763997077941895, + "learning_rate": 1.4854078381591215e-05, + "loss": 1.3165, + "mean_token_accuracy": 0.6577816307544708, + "num_tokens": 1157198891.0, + "step": 6890 + }, + { + "entropy": 1.7322425842285156, + "epoch": 0.7570239762709071, + "grad_norm": 0.6586790680885315, + "learning_rate": 1.4852631429588164e-05, + "loss": 1.4056, + "mean_token_accuracy": 0.6535748243331909, + "num_tokens": 1157367746.0, + "step": 6891 + }, + { + "entropy": 1.7426089147726695, + "epoch": 0.7571338331822801, + "grad_norm": 0.6329144835472107, + "learning_rate": 1.4851184355651063e-05, + "loss": 1.347, + "mean_token_accuracy": 0.6626821060975393, + "num_tokens": 1157518999.0, + "step": 6892 + }, + { + "entropy": 1.727944056193034, + "epoch": 0.757243690093653, + "grad_norm": 0.6308918595314026, + "learning_rate": 1.4849737159825714e-05, + "loss": 1.3709, + "mean_token_accuracy": 0.6626657843589783, + "num_tokens": 1157679031.0, + "step": 6893 + }, + { + "entropy": 1.7070819934209187, + "epoch": 0.757353547005026, + "grad_norm": 0.6136558055877686, + "learning_rate": 1.4848289842157922e-05, + "loss": 1.3357, + "mean_token_accuracy": 0.6673356592655182, + "num_tokens": 1157823732.0, + "step": 6894 + }, + { + "entropy": 1.7385432024796803, + "epoch": 0.7574634039163989, + "grad_norm": 0.6450533270835876, + "learning_rate": 1.4846842402693485e-05, + "loss": 1.3576, + "mean_token_accuracy": 0.6694497863451639, + "num_tokens": 1157968223.0, + "step": 6895 + }, + { + "entropy": 1.7011185189088185, + "epoch": 0.7575732608277719, + "grad_norm": 0.7629391551017761, + "learning_rate": 1.4845394841478223e-05, + "loss": 1.417, + "mean_token_accuracy": 0.6586320847272873, + "num_tokens": 1158093485.0, + "step": 6896 + }, + { + "entropy": 1.7235964337984722, + "epoch": 0.7576831177391448, + "grad_norm": 0.6074236631393433, + "learning_rate": 1.4843947158557943e-05, + "loss": 1.5449, + "mean_token_accuracy": 0.6520265738169352, + "num_tokens": 1158302727.0, + "step": 6897 + }, + { + "entropy": 1.6836207310358684, + "epoch": 0.7577929746505176, + "grad_norm": 0.6065682172775269, + "learning_rate": 1.484249935397847e-05, + "loss": 1.3707, + "mean_token_accuracy": 0.6710223456223806, + "num_tokens": 1158455241.0, + "step": 6898 + }, + { + "entropy": 1.7246917287508647, + "epoch": 0.7579028315618906, + "grad_norm": 0.7016457915306091, + "learning_rate": 1.4841051427785625e-05, + "loss": 1.4724, + "mean_token_accuracy": 0.6549848715464274, + "num_tokens": 1158625284.0, + "step": 6899 + }, + { + "entropy": 1.6922398805618286, + "epoch": 0.7580126884732635, + "grad_norm": 0.6993584036827087, + "learning_rate": 1.4839603380025236e-05, + "loss": 1.37, + "mean_token_accuracy": 0.662392814954122, + "num_tokens": 1158788784.0, + "step": 6900 + }, + { + "entropy": 1.6558389564355214, + "epoch": 0.7581225453846365, + "grad_norm": 0.7079626321792603, + "learning_rate": 1.4838155210743124e-05, + "loss": 1.2161, + "mean_token_accuracy": 0.68675068517526, + "num_tokens": 1158934601.0, + "step": 6901 + }, + { + "entropy": 1.7894011040528615, + "epoch": 0.7582324022960094, + "grad_norm": 0.7190823554992676, + "learning_rate": 1.4836706919985131e-05, + "loss": 1.517, + "mean_token_accuracy": 0.6554691096146902, + "num_tokens": 1159073338.0, + "step": 6902 + }, + { + "entropy": 1.7235852877298992, + "epoch": 0.7583422592073824, + "grad_norm": 0.6851528882980347, + "learning_rate": 1.4835258507797094e-05, + "loss": 1.3269, + "mean_token_accuracy": 0.6659359286228815, + "num_tokens": 1159221997.0, + "step": 6903 + }, + { + "entropy": 1.6805242598056793, + "epoch": 0.7584521161187553, + "grad_norm": 0.5390611886978149, + "learning_rate": 1.4833809974224853e-05, + "loss": 1.3438, + "mean_token_accuracy": 0.6687274475892385, + "num_tokens": 1159431862.0, + "step": 6904 + }, + { + "entropy": 1.7057210902372997, + "epoch": 0.7585619730301283, + "grad_norm": 0.6051385402679443, + "learning_rate": 1.4832361319314252e-05, + "loss": 1.4902, + "mean_token_accuracy": 0.6540891279776891, + "num_tokens": 1159648613.0, + "step": 6905 + }, + { + "entropy": 1.6478383739789326, + "epoch": 0.7586718299415012, + "grad_norm": 0.6398272514343262, + "learning_rate": 1.4830912543111146e-05, + "loss": 1.4191, + "mean_token_accuracy": 0.6588562329610189, + "num_tokens": 1159819964.0, + "step": 6906 + }, + { + "entropy": 1.7082193195819855, + "epoch": 0.7587816868528742, + "grad_norm": 0.7104562520980835, + "learning_rate": 1.4829463645661382e-05, + "loss": 1.3546, + "mean_token_accuracy": 0.6578278988599777, + "num_tokens": 1159996948.0, + "step": 6907 + }, + { + "entropy": 1.7297697563966115, + "epoch": 0.7588915437642471, + "grad_norm": 0.6430516242980957, + "learning_rate": 1.4828014627010819e-05, + "loss": 1.3365, + "mean_token_accuracy": 0.6678340236345927, + "num_tokens": 1160156500.0, + "step": 6908 + }, + { + "entropy": 1.639371891816457, + "epoch": 0.75900140067562, + "grad_norm": 0.6583812832832336, + "learning_rate": 1.4826565487205319e-05, + "loss": 1.3132, + "mean_token_accuracy": 0.6856881082057953, + "num_tokens": 1160300199.0, + "step": 6909 + }, + { + "entropy": 1.7579985360304515, + "epoch": 0.7591112575869929, + "grad_norm": 0.8110305666923523, + "learning_rate": 1.4825116226290746e-05, + "loss": 1.6068, + "mean_token_accuracy": 0.6306049029032389, + "num_tokens": 1160545710.0, + "step": 6910 + }, + { + "entropy": 1.6936591267585754, + "epoch": 0.7592211144983658, + "grad_norm": 0.6707553863525391, + "learning_rate": 1.4823666844312962e-05, + "loss": 1.29, + "mean_token_accuracy": 0.6731418470541636, + "num_tokens": 1160680981.0, + "step": 6911 + }, + { + "entropy": 1.705474744240443, + "epoch": 0.7593309714097388, + "grad_norm": 0.764737069606781, + "learning_rate": 1.4822217341317852e-05, + "loss": 1.4305, + "mean_token_accuracy": 0.6555500676234564, + "num_tokens": 1160819936.0, + "step": 6912 + }, + { + "entropy": 1.6776454746723175, + "epoch": 0.7594408283211117, + "grad_norm": 0.6505389213562012, + "learning_rate": 1.4820767717351285e-05, + "loss": 1.3035, + "mean_token_accuracy": 0.6752181301514307, + "num_tokens": 1161006775.0, + "step": 6913 + }, + { + "entropy": 1.6627070903778076, + "epoch": 0.7595506852324847, + "grad_norm": 0.6468442678451538, + "learning_rate": 1.481931797245914e-05, + "loss": 1.4297, + "mean_token_accuracy": 0.6586330334345499, + "num_tokens": 1161241820.0, + "step": 6914 + }, + { + "entropy": 1.6682479977607727, + "epoch": 0.7596605421438576, + "grad_norm": 0.6730937361717224, + "learning_rate": 1.4817868106687303e-05, + "loss": 1.4197, + "mean_token_accuracy": 0.6521937002738317, + "num_tokens": 1161416726.0, + "step": 6915 + }, + { + "entropy": 1.6636716326077778, + "epoch": 0.7597703990552306, + "grad_norm": 0.6824373006820679, + "learning_rate": 1.4816418120081662e-05, + "loss": 1.3944, + "mean_token_accuracy": 0.6513000329335531, + "num_tokens": 1161583847.0, + "step": 6916 + }, + { + "entropy": 1.7242677907148998, + "epoch": 0.7598802559666035, + "grad_norm": 0.686144232749939, + "learning_rate": 1.4814968012688102e-05, + "loss": 1.4005, + "mean_token_accuracy": 0.656681497891744, + "num_tokens": 1161736676.0, + "step": 6917 + }, + { + "entropy": 1.6589552164077759, + "epoch": 0.7599901128779765, + "grad_norm": 0.6163228154182434, + "learning_rate": 1.4813517784552529e-05, + "loss": 1.4136, + "mean_token_accuracy": 0.6552829394737879, + "num_tokens": 1161954113.0, + "step": 6918 + }, + { + "entropy": 1.678790142138799, + "epoch": 0.7600999697893494, + "grad_norm": 0.6521669030189514, + "learning_rate": 1.4812067435720834e-05, + "loss": 1.1138, + "mean_token_accuracy": 0.6720156023899714, + "num_tokens": 1162176443.0, + "step": 6919 + }, + { + "entropy": 1.6968080500761669, + "epoch": 0.7602098267007223, + "grad_norm": 0.6458204388618469, + "learning_rate": 1.4810616966238922e-05, + "loss": 1.4491, + "mean_token_accuracy": 0.6454138110081354, + "num_tokens": 1162348356.0, + "step": 6920 + }, + { + "entropy": 1.6829048295815785, + "epoch": 0.7603196836120952, + "grad_norm": 0.7455350756645203, + "learning_rate": 1.4809166376152701e-05, + "loss": 1.3664, + "mean_token_accuracy": 0.6672768096129099, + "num_tokens": 1162488148.0, + "step": 6921 + }, + { + "entropy": 1.6659102042516072, + "epoch": 0.7604295405234682, + "grad_norm": 0.7293592095375061, + "learning_rate": 1.4807715665508083e-05, + "loss": 1.3741, + "mean_token_accuracy": 0.6664891839027405, + "num_tokens": 1162650930.0, + "step": 6922 + }, + { + "entropy": 1.6555348932743073, + "epoch": 0.7605393974348411, + "grad_norm": 0.727997899055481, + "learning_rate": 1.4806264834350976e-05, + "loss": 1.3346, + "mean_token_accuracy": 0.6639738827943802, + "num_tokens": 1162797968.0, + "step": 6923 + }, + { + "entropy": 1.7288571496804555, + "epoch": 0.7606492543462141, + "grad_norm": 0.7030077576637268, + "learning_rate": 1.4804813882727305e-05, + "loss": 1.3352, + "mean_token_accuracy": 0.6633950720230738, + "num_tokens": 1163010773.0, + "step": 6924 + }, + { + "entropy": 1.6670528650283813, + "epoch": 0.760759111257587, + "grad_norm": 0.8227211236953735, + "learning_rate": 1.4803362810682988e-05, + "loss": 1.2828, + "mean_token_accuracy": 0.678699125846227, + "num_tokens": 1163144830.0, + "step": 6925 + }, + { + "entropy": 1.7120730479558308, + "epoch": 0.7608689681689599, + "grad_norm": 0.6343841552734375, + "learning_rate": 1.480191161826395e-05, + "loss": 1.4498, + "mean_token_accuracy": 0.638987218340238, + "num_tokens": 1163435423.0, + "step": 6926 + }, + { + "entropy": 1.7124398946762085, + "epoch": 0.7609788250803329, + "grad_norm": 0.5454217791557312, + "learning_rate": 1.4800460305516125e-05, + "loss": 1.523, + "mean_token_accuracy": 0.6400202016035715, + "num_tokens": 1163644758.0, + "step": 6927 + }, + { + "entropy": 1.7238081296284993, + "epoch": 0.7610886819917058, + "grad_norm": 0.9346860647201538, + "learning_rate": 1.4799008872485442e-05, + "loss": 1.4065, + "mean_token_accuracy": 0.6679123987754186, + "num_tokens": 1163827583.0, + "step": 6928 + }, + { + "entropy": 1.7018209397792816, + "epoch": 0.7611985389030788, + "grad_norm": 0.7219953536987305, + "learning_rate": 1.4797557319217844e-05, + "loss": 1.3688, + "mean_token_accuracy": 0.6602154572804769, + "num_tokens": 1163970324.0, + "step": 6929 + }, + { + "entropy": 1.668403019507726, + "epoch": 0.7613083958144516, + "grad_norm": 0.7923089861869812, + "learning_rate": 1.4796105645759265e-05, + "loss": 1.3472, + "mean_token_accuracy": 0.6825543294350306, + "num_tokens": 1164133261.0, + "step": 6930 + }, + { + "entropy": 1.717042436202367, + "epoch": 0.7614182527258246, + "grad_norm": 0.6521219611167908, + "learning_rate": 1.4794653852155652e-05, + "loss": 1.3194, + "mean_token_accuracy": 0.679710810383161, + "num_tokens": 1164291576.0, + "step": 6931 + }, + { + "entropy": 1.6769898136456807, + "epoch": 0.7615281096371975, + "grad_norm": 0.720014214515686, + "learning_rate": 1.4793201938452954e-05, + "loss": 1.2698, + "mean_token_accuracy": 0.6756969839334488, + "num_tokens": 1164403028.0, + "step": 6932 + }, + { + "entropy": 1.6898697714010875, + "epoch": 0.7616379665485705, + "grad_norm": 0.7772789001464844, + "learning_rate": 1.4791749904697126e-05, + "loss": 1.4018, + "mean_token_accuracy": 0.6722168525060018, + "num_tokens": 1164542964.0, + "step": 6933 + }, + { + "entropy": 1.7483848134676616, + "epoch": 0.7617478234599434, + "grad_norm": 0.7039276957511902, + "learning_rate": 1.4790297750934122e-05, + "loss": 1.5323, + "mean_token_accuracy": 0.6407303462425867, + "num_tokens": 1164715324.0, + "step": 6934 + }, + { + "entropy": 1.7075275778770447, + "epoch": 0.7618576803713164, + "grad_norm": 0.8316227197647095, + "learning_rate": 1.4788845477209902e-05, + "loss": 1.2911, + "mean_token_accuracy": 0.6719946066538492, + "num_tokens": 1164865136.0, + "step": 6935 + }, + { + "entropy": 1.7587328751881917, + "epoch": 0.7619675372826893, + "grad_norm": 0.7186470031738281, + "learning_rate": 1.478739308357043e-05, + "loss": 1.5645, + "mean_token_accuracy": 0.6294473161300024, + "num_tokens": 1165022876.0, + "step": 6936 + }, + { + "entropy": 1.6595724324385326, + "epoch": 0.7620773941940623, + "grad_norm": 0.7300217151641846, + "learning_rate": 1.4785940570061674e-05, + "loss": 1.2741, + "mean_token_accuracy": 0.6697218616803488, + "num_tokens": 1165153628.0, + "step": 6937 + }, + { + "entropy": 1.7758424580097198, + "epoch": 0.7621872511054352, + "grad_norm": 0.7241067886352539, + "learning_rate": 1.4784487936729603e-05, + "loss": 1.4515, + "mean_token_accuracy": 0.6553726196289062, + "num_tokens": 1165289807.0, + "step": 6938 + }, + { + "entropy": 1.6955066323280334, + "epoch": 0.7622971080168081, + "grad_norm": 0.7136008143424988, + "learning_rate": 1.4783035183620195e-05, + "loss": 1.3052, + "mean_token_accuracy": 0.6689305007457733, + "num_tokens": 1165443874.0, + "step": 6939 + }, + { + "entropy": 1.6818625926971436, + "epoch": 0.762406964928181, + "grad_norm": 0.7151510119438171, + "learning_rate": 1.478158231077943e-05, + "loss": 1.3418, + "mean_token_accuracy": 0.6611978759368261, + "num_tokens": 1165609704.0, + "step": 6940 + }, + { + "entropy": 1.6454001367092133, + "epoch": 0.7625168218395539, + "grad_norm": 0.7376065850257874, + "learning_rate": 1.4780129318253287e-05, + "loss": 1.262, + "mean_token_accuracy": 0.6749721119801203, + "num_tokens": 1165748280.0, + "step": 6941 + }, + { + "entropy": 1.7588698168595631, + "epoch": 0.7626266787509269, + "grad_norm": 0.8318473100662231, + "learning_rate": 1.4778676206087757e-05, + "loss": 1.3082, + "mean_token_accuracy": 0.6613359103600184, + "num_tokens": 1165874711.0, + "step": 6942 + }, + { + "entropy": 1.6534738838672638, + "epoch": 0.7627365356622998, + "grad_norm": 0.7632639408111572, + "learning_rate": 1.4777222974328823e-05, + "loss": 1.2516, + "mean_token_accuracy": 0.6722172896067301, + "num_tokens": 1166003519.0, + "step": 6943 + }, + { + "entropy": 1.716422309478124, + "epoch": 0.7628463925736728, + "grad_norm": 0.6576639413833618, + "learning_rate": 1.4775769623022488e-05, + "loss": 1.3884, + "mean_token_accuracy": 0.6642766098181406, + "num_tokens": 1166160623.0, + "step": 6944 + }, + { + "entropy": 1.6622655391693115, + "epoch": 0.7629562494850457, + "grad_norm": 0.619766116142273, + "learning_rate": 1.477431615221474e-05, + "loss": 1.3108, + "mean_token_accuracy": 0.6632679601510366, + "num_tokens": 1166325809.0, + "step": 6945 + }, + { + "entropy": 1.6929753025372822, + "epoch": 0.7630661063964187, + "grad_norm": 0.6698241829872131, + "learning_rate": 1.4772862561951595e-05, + "loss": 1.3187, + "mean_token_accuracy": 0.6679337720076243, + "num_tokens": 1166463053.0, + "step": 6946 + }, + { + "entropy": 1.6616682608922322, + "epoch": 0.7631759633077916, + "grad_norm": 0.5817018747329712, + "learning_rate": 1.4771408852279045e-05, + "loss": 1.358, + "mean_token_accuracy": 0.6618844419717789, + "num_tokens": 1166652937.0, + "step": 6947 + }, + { + "entropy": 1.6677273412545521, + "epoch": 0.7632858202191646, + "grad_norm": 0.7584317326545715, + "learning_rate": 1.4769955023243104e-05, + "loss": 1.2932, + "mean_token_accuracy": 0.6745211482048035, + "num_tokens": 1166780191.0, + "step": 6948 + }, + { + "entropy": 1.6915989518165588, + "epoch": 0.7633956771305375, + "grad_norm": 0.6527446508407593, + "learning_rate": 1.4768501074889787e-05, + "loss": 1.431, + "mean_token_accuracy": 0.641153042515119, + "num_tokens": 1166943637.0, + "step": 6949 + }, + { + "entropy": 1.722637156645457, + "epoch": 0.7635055340419105, + "grad_norm": 0.712783694267273, + "learning_rate": 1.476704700726511e-05, + "loss": 1.4764, + "mean_token_accuracy": 0.6418820122877756, + "num_tokens": 1167072984.0, + "step": 6950 + }, + { + "entropy": 1.6762286921342213, + "epoch": 0.7636153909532833, + "grad_norm": 0.7010881900787354, + "learning_rate": 1.4765592820415087e-05, + "loss": 1.3241, + "mean_token_accuracy": 0.6702330708503723, + "num_tokens": 1167222121.0, + "step": 6951 + }, + { + "entropy": 1.6860096454620361, + "epoch": 0.7637252478646562, + "grad_norm": 0.720114529132843, + "learning_rate": 1.4764138514385755e-05, + "loss": 1.3242, + "mean_token_accuracy": 0.6637054880460104, + "num_tokens": 1167366067.0, + "step": 6952 + }, + { + "entropy": 1.685429056485494, + "epoch": 0.7638351047760292, + "grad_norm": 0.6480314135551453, + "learning_rate": 1.4762684089223133e-05, + "loss": 1.4365, + "mean_token_accuracy": 0.6541512509187063, + "num_tokens": 1167560550.0, + "step": 6953 + }, + { + "entropy": 1.7416847745577495, + "epoch": 0.7639449616874021, + "grad_norm": 0.7204356789588928, + "learning_rate": 1.4761229544973253e-05, + "loss": 1.3083, + "mean_token_accuracy": 0.6680977592865626, + "num_tokens": 1167682119.0, + "step": 6954 + }, + { + "entropy": 1.7352920869986217, + "epoch": 0.7640548185987751, + "grad_norm": 0.6415309906005859, + "learning_rate": 1.4759774881682154e-05, + "loss": 1.4291, + "mean_token_accuracy": 0.6526644130547842, + "num_tokens": 1167920712.0, + "step": 6955 + }, + { + "entropy": 1.7168652017911274, + "epoch": 0.764164675510148, + "grad_norm": 0.7147775292396545, + "learning_rate": 1.4758320099395878e-05, + "loss": 1.4244, + "mean_token_accuracy": 0.6459483454624811, + "num_tokens": 1168098347.0, + "step": 6956 + }, + { + "entropy": 1.716566542784373, + "epoch": 0.764274532421521, + "grad_norm": 0.6769205331802368, + "learning_rate": 1.475686519816046e-05, + "loss": 1.3462, + "mean_token_accuracy": 0.6656129608551661, + "num_tokens": 1168227173.0, + "step": 6957 + }, + { + "entropy": 1.7634214858214061, + "epoch": 0.7643843893328939, + "grad_norm": 0.6557965874671936, + "learning_rate": 1.475541017802195e-05, + "loss": 1.3621, + "mean_token_accuracy": 0.6519608447949091, + "num_tokens": 1168383497.0, + "step": 6958 + }, + { + "entropy": 1.7473195095856984, + "epoch": 0.7644942462442669, + "grad_norm": 0.7062838673591614, + "learning_rate": 1.4753955039026404e-05, + "loss": 1.3984, + "mean_token_accuracy": 0.6513441602389017, + "num_tokens": 1168533793.0, + "step": 6959 + }, + { + "entropy": 1.6654168864091237, + "epoch": 0.7646041031556398, + "grad_norm": 0.6767547726631165, + "learning_rate": 1.4752499781219872e-05, + "loss": 1.2874, + "mean_token_accuracy": 0.6723661124706268, + "num_tokens": 1168672243.0, + "step": 6960 + }, + { + "entropy": 1.6357990304629009, + "epoch": 0.7647139600670128, + "grad_norm": 0.7614895105361938, + "learning_rate": 1.4751044404648408e-05, + "loss": 1.2983, + "mean_token_accuracy": 0.6715351541837057, + "num_tokens": 1168857075.0, + "step": 6961 + }, + { + "entropy": 1.7005026539166768, + "epoch": 0.7648238169783856, + "grad_norm": 0.666313648223877, + "learning_rate": 1.4749588909358083e-05, + "loss": 1.3576, + "mean_token_accuracy": 0.6531829734643301, + "num_tokens": 1169003694.0, + "step": 6962 + }, + { + "entropy": 1.6825850903987885, + "epoch": 0.7649336738897586, + "grad_norm": 0.6583350896835327, + "learning_rate": 1.474813329539496e-05, + "loss": 1.4258, + "mean_token_accuracy": 0.6811217963695526, + "num_tokens": 1169167340.0, + "step": 6963 + }, + { + "entropy": 1.6577100853125255, + "epoch": 0.7650435308011315, + "grad_norm": 0.9120500087738037, + "learning_rate": 1.4746677562805105e-05, + "loss": 1.1814, + "mean_token_accuracy": 0.6874327609936396, + "num_tokens": 1169279246.0, + "step": 6964 + }, + { + "entropy": 1.7242598036924999, + "epoch": 0.7651533877125045, + "grad_norm": 0.9223476052284241, + "learning_rate": 1.4745221711634595e-05, + "loss": 1.2861, + "mean_token_accuracy": 0.6691893190145493, + "num_tokens": 1169427675.0, + "step": 6965 + }, + { + "entropy": 1.727004200220108, + "epoch": 0.7652632446238774, + "grad_norm": 0.7496103644371033, + "learning_rate": 1.4743765741929503e-05, + "loss": 1.4509, + "mean_token_accuracy": 0.646850789586703, + "num_tokens": 1169595749.0, + "step": 6966 + }, + { + "entropy": 1.644486020008723, + "epoch": 0.7653731015352503, + "grad_norm": 0.5539238452911377, + "learning_rate": 1.4742309653735911e-05, + "loss": 1.3391, + "mean_token_accuracy": 0.655859500169754, + "num_tokens": 1169788192.0, + "step": 6967 + }, + { + "entropy": 1.7506780723730724, + "epoch": 0.7654829584466233, + "grad_norm": 0.7031539678573608, + "learning_rate": 1.4740853447099912e-05, + "loss": 1.4266, + "mean_token_accuracy": 0.6525140305360159, + "num_tokens": 1169919266.0, + "step": 6968 + }, + { + "entropy": 1.7322252094745636, + "epoch": 0.7655928153579962, + "grad_norm": 0.712948203086853, + "learning_rate": 1.4739397122067583e-05, + "loss": 1.4078, + "mean_token_accuracy": 0.6520160535971323, + "num_tokens": 1170084069.0, + "step": 6969 + }, + { + "entropy": 1.6913042962551117, + "epoch": 0.7657026722693692, + "grad_norm": 0.6860613226890564, + "learning_rate": 1.4737940678685016e-05, + "loss": 1.3426, + "mean_token_accuracy": 0.6700414170821508, + "num_tokens": 1170236546.0, + "step": 6970 + }, + { + "entropy": 1.7662197053432465, + "epoch": 0.765812529180742, + "grad_norm": 0.696919858455658, + "learning_rate": 1.4736484116998315e-05, + "loss": 1.4445, + "mean_token_accuracy": 0.6526474754015604, + "num_tokens": 1170412715.0, + "step": 6971 + }, + { + "entropy": 1.703002353509267, + "epoch": 0.765922386092115, + "grad_norm": 0.6996424198150635, + "learning_rate": 1.4735027437053576e-05, + "loss": 1.2528, + "mean_token_accuracy": 0.6738860954840978, + "num_tokens": 1170516111.0, + "step": 6972 + }, + { + "entropy": 1.7598189612229664, + "epoch": 0.7660322430034879, + "grad_norm": 0.6697201132774353, + "learning_rate": 1.47335706388969e-05, + "loss": 1.4846, + "mean_token_accuracy": 0.6326592018206915, + "num_tokens": 1170683915.0, + "step": 6973 + }, + { + "entropy": 1.744869331518809, + "epoch": 0.7661420999148609, + "grad_norm": 0.6344386339187622, + "learning_rate": 1.4732113722574395e-05, + "loss": 1.3379, + "mean_token_accuracy": 0.6556073526541392, + "num_tokens": 1170823388.0, + "step": 6974 + }, + { + "entropy": 1.6862431168556213, + "epoch": 0.7662519568262338, + "grad_norm": 0.6660764217376709, + "learning_rate": 1.4730656688132173e-05, + "loss": 1.4486, + "mean_token_accuracy": 0.6581073055664698, + "num_tokens": 1170965077.0, + "step": 6975 + }, + { + "entropy": 1.7214731673399608, + "epoch": 0.7663618137376068, + "grad_norm": 0.6826305389404297, + "learning_rate": 1.472919953561635e-05, + "loss": 1.3649, + "mean_token_accuracy": 0.6637348333994547, + "num_tokens": 1171116343.0, + "step": 6976 + }, + { + "entropy": 1.7036389410495758, + "epoch": 0.7664716706489797, + "grad_norm": 0.6430275440216064, + "learning_rate": 1.472774226507304e-05, + "loss": 1.3781, + "mean_token_accuracy": 0.6487658222516378, + "num_tokens": 1171264476.0, + "step": 6977 + }, + { + "entropy": 1.6874217987060547, + "epoch": 0.7665815275603527, + "grad_norm": 0.6152638792991638, + "learning_rate": 1.4726284876548367e-05, + "loss": 1.3798, + "mean_token_accuracy": 0.6498878498872122, + "num_tokens": 1171468339.0, + "step": 6978 + }, + { + "entropy": 1.7156427005926769, + "epoch": 0.7666913844717256, + "grad_norm": 0.6952628493309021, + "learning_rate": 1.4724827370088457e-05, + "loss": 1.4389, + "mean_token_accuracy": 0.6459887872139612, + "num_tokens": 1171683447.0, + "step": 6979 + }, + { + "entropy": 1.6867701709270477, + "epoch": 0.7668012413830985, + "grad_norm": 0.6468155980110168, + "learning_rate": 1.472336974573944e-05, + "loss": 1.4697, + "mean_token_accuracy": 0.6447963615258535, + "num_tokens": 1171853214.0, + "step": 6980 + }, + { + "entropy": 1.710288276274999, + "epoch": 0.7669110982944715, + "grad_norm": 0.5480025410652161, + "learning_rate": 1.4721912003547447e-05, + "loss": 1.4186, + "mean_token_accuracy": 0.648127923409144, + "num_tokens": 1172038628.0, + "step": 6981 + }, + { + "entropy": 1.6937748491764069, + "epoch": 0.7670209552058443, + "grad_norm": 0.6765931844711304, + "learning_rate": 1.4720454143558618e-05, + "loss": 1.4487, + "mean_token_accuracy": 0.6704505582650503, + "num_tokens": 1172249770.0, + "step": 6982 + }, + { + "entropy": 1.7202177445093791, + "epoch": 0.7671308121172173, + "grad_norm": 0.7446685433387756, + "learning_rate": 1.4718996165819093e-05, + "loss": 1.3326, + "mean_token_accuracy": 0.6584400335947672, + "num_tokens": 1172391864.0, + "step": 6983 + }, + { + "entropy": 1.6548251410325368, + "epoch": 0.7672406690285902, + "grad_norm": 0.7036621570587158, + "learning_rate": 1.471753807037501e-05, + "loss": 1.2375, + "mean_token_accuracy": 0.680665984749794, + "num_tokens": 1172535215.0, + "step": 6984 + }, + { + "entropy": 1.6808474858601887, + "epoch": 0.7673505259399632, + "grad_norm": 0.6487118601799011, + "learning_rate": 1.4716079857272527e-05, + "loss": 1.3899, + "mean_token_accuracy": 0.6563947548468908, + "num_tokens": 1172692357.0, + "step": 6985 + }, + { + "entropy": 1.695367197195689, + "epoch": 0.7674603828513361, + "grad_norm": 1.2500559091567993, + "learning_rate": 1.4714621526557788e-05, + "loss": 1.3457, + "mean_token_accuracy": 0.6743065714836121, + "num_tokens": 1172820590.0, + "step": 6986 + }, + { + "entropy": 1.6751854817072551, + "epoch": 0.7675702397627091, + "grad_norm": 0.5937096476554871, + "learning_rate": 1.4713163078276953e-05, + "loss": 1.4562, + "mean_token_accuracy": 0.6525317927201589, + "num_tokens": 1173063549.0, + "step": 6987 + }, + { + "entropy": 1.6620129545529683, + "epoch": 0.767680096674082, + "grad_norm": 0.7316376566886902, + "learning_rate": 1.471170451247618e-05, + "loss": 1.3667, + "mean_token_accuracy": 0.666183148821195, + "num_tokens": 1173292913.0, + "step": 6988 + }, + { + "entropy": 1.7417064011096954, + "epoch": 0.767789953585455, + "grad_norm": 0.6786331534385681, + "learning_rate": 1.471024582920163e-05, + "loss": 1.3281, + "mean_token_accuracy": 0.6628505686918894, + "num_tokens": 1173481347.0, + "step": 6989 + }, + { + "entropy": 1.6276845037937164, + "epoch": 0.7678998104968279, + "grad_norm": 0.8080840706825256, + "learning_rate": 1.4708787028499475e-05, + "loss": 1.2766, + "mean_token_accuracy": 0.670835038026174, + "num_tokens": 1173610214.0, + "step": 6990 + }, + { + "entropy": 1.725032518307368, + "epoch": 0.7680096674082009, + "grad_norm": 0.9934976696968079, + "learning_rate": 1.470732811041588e-05, + "loss": 1.3939, + "mean_token_accuracy": 0.6700426588455836, + "num_tokens": 1173748546.0, + "step": 6991 + }, + { + "entropy": 1.7025299568970997, + "epoch": 0.7681195243195738, + "grad_norm": 0.6891453266143799, + "learning_rate": 1.4705869074997022e-05, + "loss": 1.5514, + "mean_token_accuracy": 0.6387580533822378, + "num_tokens": 1173940859.0, + "step": 6992 + }, + { + "entropy": 1.7296819686889648, + "epoch": 0.7682293812309466, + "grad_norm": 0.6920235753059387, + "learning_rate": 1.4704409922289074e-05, + "loss": 1.3289, + "mean_token_accuracy": 0.6593838532765707, + "num_tokens": 1174139950.0, + "step": 6993 + }, + { + "entropy": 1.7431517243385315, + "epoch": 0.7683392381423196, + "grad_norm": 0.6889626979827881, + "learning_rate": 1.4702950652338224e-05, + "loss": 1.5143, + "mean_token_accuracy": 0.6448341409365336, + "num_tokens": 1174281330.0, + "step": 6994 + }, + { + "entropy": 1.6172963281472523, + "epoch": 0.7684490950536925, + "grad_norm": 0.6094475388526917, + "learning_rate": 1.4701491265190652e-05, + "loss": 1.3748, + "mean_token_accuracy": 0.6714093685150146, + "num_tokens": 1174444111.0, + "step": 6995 + }, + { + "entropy": 1.7066385547320049, + "epoch": 0.7685589519650655, + "grad_norm": 0.5568619966506958, + "learning_rate": 1.4700031760892552e-05, + "loss": 1.3955, + "mean_token_accuracy": 0.6417905241250992, + "num_tokens": 1174677634.0, + "step": 6996 + }, + { + "entropy": 1.7028604447841644, + "epoch": 0.7686688088764384, + "grad_norm": 0.8622896075248718, + "learning_rate": 1.4698572139490113e-05, + "loss": 1.3625, + "mean_token_accuracy": 0.6595305403073629, + "num_tokens": 1174829291.0, + "step": 6997 + }, + { + "entropy": 1.748704065879186, + "epoch": 0.7687786657878114, + "grad_norm": 0.7469777464866638, + "learning_rate": 1.4697112401029532e-05, + "loss": 1.6273, + "mean_token_accuracy": 0.6403237904111544, + "num_tokens": 1175037549.0, + "step": 6998 + }, + { + "entropy": 1.6959306299686432, + "epoch": 0.7688885226991843, + "grad_norm": 0.7172491550445557, + "learning_rate": 1.4695652545557009e-05, + "loss": 1.3416, + "mean_token_accuracy": 0.6578481743733088, + "num_tokens": 1175213614.0, + "step": 6999 + }, + { + "entropy": 1.7542682588100433, + "epoch": 0.7689983796105573, + "grad_norm": 0.665587306022644, + "learning_rate": 1.469419257311875e-05, + "loss": 1.3778, + "mean_token_accuracy": 0.6552835355202357, + "num_tokens": 1175337289.0, + "step": 7000 + }, + { + "entropy": 1.6769481201966603, + "epoch": 0.7691082365219302, + "grad_norm": 0.8862442374229431, + "learning_rate": 1.4692732483760958e-05, + "loss": 1.347, + "mean_token_accuracy": 0.6755407452583313, + "num_tokens": 1175483112.0, + "step": 7001 + }, + { + "entropy": 1.719422310590744, + "epoch": 0.7692180934333032, + "grad_norm": 0.755631148815155, + "learning_rate": 1.4691272277529852e-05, + "loss": 1.3332, + "mean_token_accuracy": 0.6593481749296188, + "num_tokens": 1175630991.0, + "step": 7002 + }, + { + "entropy": 1.6748768985271454, + "epoch": 0.769327950344676, + "grad_norm": 0.6282011270523071, + "learning_rate": 1.4689811954471638e-05, + "loss": 1.3524, + "mean_token_accuracy": 0.6604679971933365, + "num_tokens": 1175801098.0, + "step": 7003 + }, + { + "entropy": 1.6717474361260731, + "epoch": 0.769437807256049, + "grad_norm": 0.7004813551902771, + "learning_rate": 1.4688351514632539e-05, + "loss": 1.255, + "mean_token_accuracy": 0.6758040388425192, + "num_tokens": 1175949273.0, + "step": 7004 + }, + { + "entropy": 1.7221704920132954, + "epoch": 0.7695476641674219, + "grad_norm": 0.6526414752006531, + "learning_rate": 1.4686890958058774e-05, + "loss": 1.3306, + "mean_token_accuracy": 0.6652916769186655, + "num_tokens": 1176111924.0, + "step": 7005 + }, + { + "entropy": 1.7045084337393444, + "epoch": 0.7696575210787948, + "grad_norm": 0.6869589686393738, + "learning_rate": 1.4685430284796575e-05, + "loss": 1.3229, + "mean_token_accuracy": 0.6639479349056879, + "num_tokens": 1176252812.0, + "step": 7006 + }, + { + "entropy": 1.6703706979751587, + "epoch": 0.7697673779901678, + "grad_norm": 0.6456193923950195, + "learning_rate": 1.4683969494892168e-05, + "loss": 1.3546, + "mean_token_accuracy": 0.6619254897038142, + "num_tokens": 1176420918.0, + "step": 7007 + }, + { + "entropy": 1.7053408324718475, + "epoch": 0.7698772349015407, + "grad_norm": 0.724219560623169, + "learning_rate": 1.4682508588391786e-05, + "loss": 1.3881, + "mean_token_accuracy": 0.6580935915311178, + "num_tokens": 1176594950.0, + "step": 7008 + }, + { + "entropy": 1.6640128095944722, + "epoch": 0.7699870918129137, + "grad_norm": 0.7082245349884033, + "learning_rate": 1.4681047565341664e-05, + "loss": 1.4806, + "mean_token_accuracy": 0.6520075996716818, + "num_tokens": 1176779499.0, + "step": 7009 + }, + { + "entropy": 1.6447947323322296, + "epoch": 0.7700969487242866, + "grad_norm": 0.7441786527633667, + "learning_rate": 1.4679586425788051e-05, + "loss": 1.3688, + "mean_token_accuracy": 0.657518689831098, + "num_tokens": 1176947062.0, + "step": 7010 + }, + { + "entropy": 1.7106069127718608, + "epoch": 0.7702068056356596, + "grad_norm": 0.8191734552383423, + "learning_rate": 1.467812516977718e-05, + "loss": 1.4011, + "mean_token_accuracy": 0.6573437452316284, + "num_tokens": 1177110501.0, + "step": 7011 + }, + { + "entropy": 1.7322260042031605, + "epoch": 0.7703166625470325, + "grad_norm": 0.7164145708084106, + "learning_rate": 1.4676663797355307e-05, + "loss": 1.3584, + "mean_token_accuracy": 0.6518987119197845, + "num_tokens": 1177271454.0, + "step": 7012 + }, + { + "entropy": 1.6704432566960652, + "epoch": 0.7704265194584055, + "grad_norm": 0.7696778774261475, + "learning_rate": 1.4675202308568682e-05, + "loss": 1.2934, + "mean_token_accuracy": 0.6695187787214915, + "num_tokens": 1177410556.0, + "step": 7013 + }, + { + "entropy": 1.6978066364924114, + "epoch": 0.7705363763697783, + "grad_norm": 0.8118786811828613, + "learning_rate": 1.4673740703463559e-05, + "loss": 1.2362, + "mean_token_accuracy": 0.6846217463413874, + "num_tokens": 1177543278.0, + "step": 7014 + }, + { + "entropy": 1.6863240996996562, + "epoch": 0.7706462332811513, + "grad_norm": 0.7545218467712402, + "learning_rate": 1.46722789820862e-05, + "loss": 1.3926, + "mean_token_accuracy": 0.6479389071464539, + "num_tokens": 1177711545.0, + "step": 7015 + }, + { + "entropy": 1.6915642023086548, + "epoch": 0.7707560901925242, + "grad_norm": 0.6925393342971802, + "learning_rate": 1.4670817144482864e-05, + "loss": 1.2654, + "mean_token_accuracy": 0.6772432029247284, + "num_tokens": 1177877618.0, + "step": 7016 + }, + { + "entropy": 1.7470574875672658, + "epoch": 0.7708659471038972, + "grad_norm": 0.67853844165802, + "learning_rate": 1.466935519069982e-05, + "loss": 1.6348, + "mean_token_accuracy": 0.6177881682912508, + "num_tokens": 1178092438.0, + "step": 7017 + }, + { + "entropy": 1.7246305743853252, + "epoch": 0.7709758040152701, + "grad_norm": 0.6471522450447083, + "learning_rate": 1.4667893120783337e-05, + "loss": 1.4376, + "mean_token_accuracy": 0.6511177718639374, + "num_tokens": 1178263593.0, + "step": 7018 + }, + { + "entropy": 1.7667359312375386, + "epoch": 0.7710856609266431, + "grad_norm": 0.7176552414894104, + "learning_rate": 1.4666430934779692e-05, + "loss": 1.3189, + "mean_token_accuracy": 0.6782967547575632, + "num_tokens": 1178415863.0, + "step": 7019 + }, + { + "entropy": 1.7315512498219807, + "epoch": 0.771195517838016, + "grad_norm": 0.6310048699378967, + "learning_rate": 1.4664968632735157e-05, + "loss": 1.5482, + "mean_token_accuracy": 0.6332679738601049, + "num_tokens": 1178636690.0, + "step": 7020 + }, + { + "entropy": 1.6622104545434315, + "epoch": 0.7713053747493889, + "grad_norm": 0.5793983340263367, + "learning_rate": 1.4663506214696019e-05, + "loss": 1.5111, + "mean_token_accuracy": 0.6406310300032297, + "num_tokens": 1178883992.0, + "step": 7021 + }, + { + "entropy": 1.6803521513938904, + "epoch": 0.7714152316607619, + "grad_norm": 0.5114834308624268, + "learning_rate": 1.4662043680708557e-05, + "loss": 1.55, + "mean_token_accuracy": 0.6302276899417242, + "num_tokens": 1179137785.0, + "step": 7022 + }, + { + "entropy": 1.7061661183834076, + "epoch": 0.7715250885721348, + "grad_norm": 0.6063534617424011, + "learning_rate": 1.4660581030819063e-05, + "loss": 1.3733, + "mean_token_accuracy": 0.6526365379492441, + "num_tokens": 1179295006.0, + "step": 7023 + }, + { + "entropy": 1.7167406380176544, + "epoch": 0.7716349454835078, + "grad_norm": 0.6147019267082214, + "learning_rate": 1.4659118265073832e-05, + "loss": 1.394, + "mean_token_accuracy": 0.6613097737232844, + "num_tokens": 1179510820.0, + "step": 7024 + }, + { + "entropy": 1.6936827500661213, + "epoch": 0.7717448023948806, + "grad_norm": 0.7615833878517151, + "learning_rate": 1.4657655383519157e-05, + "loss": 1.3649, + "mean_token_accuracy": 0.6567181398471197, + "num_tokens": 1179665016.0, + "step": 7025 + }, + { + "entropy": 1.7460854351520538, + "epoch": 0.7718546593062536, + "grad_norm": 0.8454451560974121, + "learning_rate": 1.4656192386201333e-05, + "loss": 1.4548, + "mean_token_accuracy": 0.6643540759881338, + "num_tokens": 1179813832.0, + "step": 7026 + }, + { + "entropy": 1.6930663386980693, + "epoch": 0.7719645162176265, + "grad_norm": 0.7514932155609131, + "learning_rate": 1.465472927316667e-05, + "loss": 1.5265, + "mean_token_accuracy": 0.6348920861879984, + "num_tokens": 1180021268.0, + "step": 7027 + }, + { + "entropy": 1.7139446039994557, + "epoch": 0.7720743731289995, + "grad_norm": 0.6872261762619019, + "learning_rate": 1.4653266044461474e-05, + "loss": 1.4672, + "mean_token_accuracy": 0.648734375834465, + "num_tokens": 1180163008.0, + "step": 7028 + }, + { + "entropy": 1.7607576847076416, + "epoch": 0.7721842300403724, + "grad_norm": 0.6853843927383423, + "learning_rate": 1.465180270013205e-05, + "loss": 1.4754, + "mean_token_accuracy": 0.6421893537044525, + "num_tokens": 1180316292.0, + "step": 7029 + }, + { + "entropy": 1.6858061254024506, + "epoch": 0.7722940869517454, + "grad_norm": 0.684089720249176, + "learning_rate": 1.465033924022472e-05, + "loss": 1.4033, + "mean_token_accuracy": 0.6613581776618958, + "num_tokens": 1180452591.0, + "step": 7030 + }, + { + "entropy": 1.692560573418935, + "epoch": 0.7724039438631183, + "grad_norm": 0.6754666566848755, + "learning_rate": 1.4648875664785797e-05, + "loss": 1.4735, + "mean_token_accuracy": 0.6475921819607416, + "num_tokens": 1180654465.0, + "step": 7031 + }, + { + "entropy": 1.7072215179602306, + "epoch": 0.7725138007744913, + "grad_norm": 0.6174911260604858, + "learning_rate": 1.4647411973861601e-05, + "loss": 1.5394, + "mean_token_accuracy": 0.6355844636758169, + "num_tokens": 1180883593.0, + "step": 7032 + }, + { + "entropy": 1.7164014180501301, + "epoch": 0.7726236576858642, + "grad_norm": 0.760221004486084, + "learning_rate": 1.464594816749846e-05, + "loss": 1.3237, + "mean_token_accuracy": 0.6593389511108398, + "num_tokens": 1181049298.0, + "step": 7033 + }, + { + "entropy": 1.7105709115664165, + "epoch": 0.772733514597237, + "grad_norm": 0.7115726470947266, + "learning_rate": 1.4644484245742704e-05, + "loss": 1.568, + "mean_token_accuracy": 0.642639140288035, + "num_tokens": 1181262923.0, + "step": 7034 + }, + { + "entropy": 1.7091182271639507, + "epoch": 0.77284337150861, + "grad_norm": 0.7895721197128296, + "learning_rate": 1.4643020208640664e-05, + "loss": 1.4203, + "mean_token_accuracy": 0.652391200264295, + "num_tokens": 1181432024.0, + "step": 7035 + }, + { + "entropy": 1.6460919280846913, + "epoch": 0.7729532284199829, + "grad_norm": 0.5189076662063599, + "learning_rate": 1.4641556056238675e-05, + "loss": 1.532, + "mean_token_accuracy": 0.6269241819779078, + "num_tokens": 1181703567.0, + "step": 7036 + }, + { + "entropy": 1.738191584746043, + "epoch": 0.7730630853313559, + "grad_norm": 0.8178884983062744, + "learning_rate": 1.4640091788583079e-05, + "loss": 1.5218, + "mean_token_accuracy": 0.6404098868370056, + "num_tokens": 1181891405.0, + "step": 7037 + }, + { + "entropy": 1.7580572366714478, + "epoch": 0.7731729422427288, + "grad_norm": 0.594333291053772, + "learning_rate": 1.4638627405720216e-05, + "loss": 1.5078, + "mean_token_accuracy": 0.6458245019117991, + "num_tokens": 1182086324.0, + "step": 7038 + }, + { + "entropy": 1.7033535142739613, + "epoch": 0.7732827991541018, + "grad_norm": 0.6162420511245728, + "learning_rate": 1.4637162907696438e-05, + "loss": 1.2653, + "mean_token_accuracy": 0.6843320180972418, + "num_tokens": 1182290496.0, + "step": 7039 + }, + { + "entropy": 1.7195685009161632, + "epoch": 0.7733926560654747, + "grad_norm": 0.7166089415550232, + "learning_rate": 1.4635698294558092e-05, + "loss": 1.4405, + "mean_token_accuracy": 0.6487057308355967, + "num_tokens": 1182489457.0, + "step": 7040 + }, + { + "entropy": 1.7417178054650624, + "epoch": 0.7735025129768477, + "grad_norm": 0.8565784096717834, + "learning_rate": 1.463423356635153e-05, + "loss": 1.4438, + "mean_token_accuracy": 0.643385499715805, + "num_tokens": 1182731014.0, + "step": 7041 + }, + { + "entropy": 1.7084386845429738, + "epoch": 0.7736123698882206, + "grad_norm": 0.5510115027427673, + "learning_rate": 1.4632768723123119e-05, + "loss": 1.5365, + "mean_token_accuracy": 0.642001653711001, + "num_tokens": 1182947354.0, + "step": 7042 + }, + { + "entropy": 1.7248138189315796, + "epoch": 0.7737222267995936, + "grad_norm": 0.7305212616920471, + "learning_rate": 1.4631303764919208e-05, + "loss": 1.1954, + "mean_token_accuracy": 0.6862581819295883, + "num_tokens": 1183074561.0, + "step": 7043 + }, + { + "entropy": 1.7128371099630992, + "epoch": 0.7738320837109665, + "grad_norm": 0.7132003307342529, + "learning_rate": 1.4629838691786176e-05, + "loss": 1.3073, + "mean_token_accuracy": 0.6683625827232996, + "num_tokens": 1183198631.0, + "step": 7044 + }, + { + "entropy": 1.7525557577610016, + "epoch": 0.7739419406223395, + "grad_norm": 0.724991500377655, + "learning_rate": 1.462837350377038e-05, + "loss": 1.5135, + "mean_token_accuracy": 0.6378008325894674, + "num_tokens": 1183375784.0, + "step": 7045 + }, + { + "entropy": 1.717278391122818, + "epoch": 0.7740517975337123, + "grad_norm": 0.674461841583252, + "learning_rate": 1.4626908200918201e-05, + "loss": 1.3914, + "mean_token_accuracy": 0.6517154922087988, + "num_tokens": 1183544082.0, + "step": 7046 + }, + { + "entropy": 1.5977116922537486, + "epoch": 0.7741616544450852, + "grad_norm": 0.6767549514770508, + "learning_rate": 1.4625442783276012e-05, + "loss": 1.2301, + "mean_token_accuracy": 0.6752993414799372, + "num_tokens": 1183679452.0, + "step": 7047 + }, + { + "entropy": 1.7543257574240367, + "epoch": 0.7742715113564582, + "grad_norm": 0.6589480638504028, + "learning_rate": 1.462397725089019e-05, + "loss": 1.4896, + "mean_token_accuracy": 0.65002969900767, + "num_tokens": 1183809390.0, + "step": 7048 + }, + { + "entropy": 1.6917062997817993, + "epoch": 0.7743813682678311, + "grad_norm": 0.6566320061683655, + "learning_rate": 1.462251160380712e-05, + "loss": 1.3941, + "mean_token_accuracy": 0.6552205433448156, + "num_tokens": 1183967409.0, + "step": 7049 + }, + { + "entropy": 1.6994734903176625, + "epoch": 0.7744912251792041, + "grad_norm": 0.7120063900947571, + "learning_rate": 1.4621045842073194e-05, + "loss": 1.4939, + "mean_token_accuracy": 0.6527921060721079, + "num_tokens": 1184183390.0, + "step": 7050 + }, + { + "entropy": 1.6695085167884827, + "epoch": 0.774601082090577, + "grad_norm": 0.8632370829582214, + "learning_rate": 1.4619579965734797e-05, + "loss": 1.4014, + "mean_token_accuracy": 0.6676080425580343, + "num_tokens": 1184346247.0, + "step": 7051 + }, + { + "entropy": 1.6956228117148082, + "epoch": 0.77471093900195, + "grad_norm": 0.6049069166183472, + "learning_rate": 1.4618113974838324e-05, + "loss": 1.3815, + "mean_token_accuracy": 0.6569184164206187, + "num_tokens": 1184519454.0, + "step": 7052 + }, + { + "entropy": 1.6839020152886708, + "epoch": 0.7748207959133229, + "grad_norm": 0.6516950726509094, + "learning_rate": 1.4616647869430174e-05, + "loss": 1.3496, + "mean_token_accuracy": 0.665029858549436, + "num_tokens": 1184698002.0, + "step": 7053 + }, + { + "entropy": 1.706792841355006, + "epoch": 0.7749306528246959, + "grad_norm": 0.6899370551109314, + "learning_rate": 1.4615181649556751e-05, + "loss": 1.5164, + "mean_token_accuracy": 0.6479889204104742, + "num_tokens": 1184859559.0, + "step": 7054 + }, + { + "entropy": 1.726404498020808, + "epoch": 0.7750405097360688, + "grad_norm": 0.5813102126121521, + "learning_rate": 1.4613715315264453e-05, + "loss": 1.4043, + "mean_token_accuracy": 0.6609879980484644, + "num_tokens": 1185057723.0, + "step": 7055 + }, + { + "entropy": 1.7247681121031444, + "epoch": 0.7751503666474417, + "grad_norm": 0.6459679007530212, + "learning_rate": 1.4612248866599698e-05, + "loss": 1.438, + "mean_token_accuracy": 0.6407895038525263, + "num_tokens": 1185236310.0, + "step": 7056 + }, + { + "entropy": 1.696563959121704, + "epoch": 0.7752602235588146, + "grad_norm": 0.8122643828392029, + "learning_rate": 1.4610782303608895e-05, + "loss": 1.5251, + "mean_token_accuracy": 0.662423754731814, + "num_tokens": 1185411741.0, + "step": 7057 + }, + { + "entropy": 1.6828083793322246, + "epoch": 0.7753700804701876, + "grad_norm": 0.7139939665794373, + "learning_rate": 1.4609315626338455e-05, + "loss": 1.2945, + "mean_token_accuracy": 0.671471560994784, + "num_tokens": 1185540548.0, + "step": 7058 + }, + { + "entropy": 1.6743756433327992, + "epoch": 0.7754799373815605, + "grad_norm": 0.6616625785827637, + "learning_rate": 1.4607848834834808e-05, + "loss": 1.307, + "mean_token_accuracy": 0.6813914626836777, + "num_tokens": 1185668124.0, + "step": 7059 + }, + { + "entropy": 1.6928254266579945, + "epoch": 0.7755897942929334, + "grad_norm": 0.7479018568992615, + "learning_rate": 1.4606381929144366e-05, + "loss": 1.373, + "mean_token_accuracy": 0.6539994676907858, + "num_tokens": 1185800426.0, + "step": 7060 + }, + { + "entropy": 1.6936748921871185, + "epoch": 0.7756996512043064, + "grad_norm": 0.8162828087806702, + "learning_rate": 1.4604914909313562e-05, + "loss": 1.211, + "mean_token_accuracy": 0.6736436436573664, + "num_tokens": 1185908583.0, + "step": 7061 + }, + { + "entropy": 1.713008721669515, + "epoch": 0.7758095081156793, + "grad_norm": 0.6247701644897461, + "learning_rate": 1.4603447775388825e-05, + "loss": 1.5041, + "mean_token_accuracy": 0.6550228893756866, + "num_tokens": 1186056753.0, + "step": 7062 + }, + { + "entropy": 1.752008448044459, + "epoch": 0.7759193650270523, + "grad_norm": 0.6326964497566223, + "learning_rate": 1.4601980527416593e-05, + "loss": 1.4098, + "mean_token_accuracy": 0.6500469148159027, + "num_tokens": 1186254903.0, + "step": 7063 + }, + { + "entropy": 1.71945525209109, + "epoch": 0.7760292219384252, + "grad_norm": 0.6365431547164917, + "learning_rate": 1.4600513165443298e-05, + "loss": 1.4776, + "mean_token_accuracy": 0.6456809441248575, + "num_tokens": 1186449812.0, + "step": 7064 + }, + { + "entropy": 1.7330725888411205, + "epoch": 0.7761390788497982, + "grad_norm": 0.7301865816116333, + "learning_rate": 1.4599045689515383e-05, + "loss": 1.2947, + "mean_token_accuracy": 0.6618055999279022, + "num_tokens": 1186602119.0, + "step": 7065 + }, + { + "entropy": 1.732138842344284, + "epoch": 0.776248935761171, + "grad_norm": 0.639707624912262, + "learning_rate": 1.4597578099679293e-05, + "loss": 1.3862, + "mean_token_accuracy": 0.6675709386666616, + "num_tokens": 1186752684.0, + "step": 7066 + }, + { + "entropy": 1.7195107837518055, + "epoch": 0.776358792672544, + "grad_norm": 0.6329506635665894, + "learning_rate": 1.4596110395981477e-05, + "loss": 1.5057, + "mean_token_accuracy": 0.6421088526646296, + "num_tokens": 1186923028.0, + "step": 7067 + }, + { + "entropy": 1.7601061860720317, + "epoch": 0.7764686495839169, + "grad_norm": 0.7304174304008484, + "learning_rate": 1.459464257846839e-05, + "loss": 1.4543, + "mean_token_accuracy": 0.6482264697551727, + "num_tokens": 1187126827.0, + "step": 7068 + }, + { + "entropy": 1.6720358630021412, + "epoch": 0.7765785064952899, + "grad_norm": 0.6124829649925232, + "learning_rate": 1.4593174647186484e-05, + "loss": 1.3228, + "mean_token_accuracy": 0.6597183843453726, + "num_tokens": 1187279117.0, + "step": 7069 + }, + { + "entropy": 1.7097432514031727, + "epoch": 0.7766883634066628, + "grad_norm": 0.6492157578468323, + "learning_rate": 1.459170660218222e-05, + "loss": 1.4168, + "mean_token_accuracy": 0.6490695029497147, + "num_tokens": 1187477267.0, + "step": 7070 + }, + { + "entropy": 1.6731528639793396, + "epoch": 0.7767982203180358, + "grad_norm": 0.6846409440040588, + "learning_rate": 1.4590238443502062e-05, + "loss": 1.3208, + "mean_token_accuracy": 0.6802329818407694, + "num_tokens": 1187640442.0, + "step": 7071 + }, + { + "entropy": 1.694802353779475, + "epoch": 0.7769080772294087, + "grad_norm": 0.6815143823623657, + "learning_rate": 1.458877017119247e-05, + "loss": 1.442, + "mean_token_accuracy": 0.6708350131909052, + "num_tokens": 1187819222.0, + "step": 7072 + }, + { + "entropy": 1.7885936399300892, + "epoch": 0.7770179341407817, + "grad_norm": 0.6548523902893066, + "learning_rate": 1.4587301785299925e-05, + "loss": 1.4269, + "mean_token_accuracy": 0.6466637452443441, + "num_tokens": 1188034722.0, + "step": 7073 + }, + { + "entropy": 1.7135749161243439, + "epoch": 0.7771277910521546, + "grad_norm": 0.7812349796295166, + "learning_rate": 1.4585833285870891e-05, + "loss": 1.3671, + "mean_token_accuracy": 0.6483140687147776, + "num_tokens": 1188216014.0, + "step": 7074 + }, + { + "entropy": 1.6287512878576915, + "epoch": 0.7772376479635275, + "grad_norm": 0.7831335663795471, + "learning_rate": 1.4584364672951851e-05, + "loss": 1.3522, + "mean_token_accuracy": 0.6784834712743759, + "num_tokens": 1188410952.0, + "step": 7075 + }, + { + "entropy": 1.7114653885364532, + "epoch": 0.7773475048749005, + "grad_norm": 0.7758569121360779, + "learning_rate": 1.4582895946589287e-05, + "loss": 1.3488, + "mean_token_accuracy": 0.6737766712903976, + "num_tokens": 1188548783.0, + "step": 7076 + }, + { + "entropy": 1.7152994672457378, + "epoch": 0.7774573617862733, + "grad_norm": 0.6986987590789795, + "learning_rate": 1.4581427106829675e-05, + "loss": 1.4113, + "mean_token_accuracy": 0.6604458590348562, + "num_tokens": 1188715804.0, + "step": 7077 + }, + { + "entropy": 1.7589812874794006, + "epoch": 0.7775672186976463, + "grad_norm": 0.9140593409538269, + "learning_rate": 1.4579958153719513e-05, + "loss": 1.2027, + "mean_token_accuracy": 0.6834556410710017, + "num_tokens": 1188807708.0, + "step": 7078 + }, + { + "entropy": 1.6676159103711445, + "epoch": 0.7776770756090192, + "grad_norm": 0.6604957580566406, + "learning_rate": 1.4578489087305286e-05, + "loss": 1.395, + "mean_token_accuracy": 0.6646223912636439, + "num_tokens": 1189019253.0, + "step": 7079 + }, + { + "entropy": 1.701384961605072, + "epoch": 0.7777869325203922, + "grad_norm": 0.6634955406188965, + "learning_rate": 1.4577019907633494e-05, + "loss": 1.3598, + "mean_token_accuracy": 0.6663598666588465, + "num_tokens": 1189162079.0, + "step": 7080 + }, + { + "entropy": 1.7295205891132355, + "epoch": 0.7778967894317651, + "grad_norm": 0.7290459275245667, + "learning_rate": 1.4575550614750636e-05, + "loss": 1.4276, + "mean_token_accuracy": 0.6518198847770691, + "num_tokens": 1189348501.0, + "step": 7081 + }, + { + "entropy": 1.6786798735459645, + "epoch": 0.7780066463431381, + "grad_norm": 0.8376657962799072, + "learning_rate": 1.4574081208703205e-05, + "loss": 1.5544, + "mean_token_accuracy": 0.6468896766503652, + "num_tokens": 1189516193.0, + "step": 7082 + }, + { + "entropy": 1.6363192001978557, + "epoch": 0.778116503254511, + "grad_norm": 0.7634339332580566, + "learning_rate": 1.457261168953772e-05, + "loss": 1.4173, + "mean_token_accuracy": 0.6621982008218765, + "num_tokens": 1189666179.0, + "step": 7083 + }, + { + "entropy": 1.6522767841815948, + "epoch": 0.778226360165884, + "grad_norm": 0.6967000365257263, + "learning_rate": 1.4571142057300683e-05, + "loss": 1.2449, + "mean_token_accuracy": 0.677667478720347, + "num_tokens": 1189784559.0, + "step": 7084 + }, + { + "entropy": 1.667193869749705, + "epoch": 0.7783362170772569, + "grad_norm": 0.6377148032188416, + "learning_rate": 1.4569672312038607e-05, + "loss": 1.4232, + "mean_token_accuracy": 0.6659876654545466, + "num_tokens": 1189964900.0, + "step": 7085 + }, + { + "entropy": 1.7085439264774323, + "epoch": 0.7784460739886299, + "grad_norm": 0.743248701095581, + "learning_rate": 1.4568202453798014e-05, + "loss": 1.372, + "mean_token_accuracy": 0.6602604488531748, + "num_tokens": 1190156829.0, + "step": 7086 + }, + { + "entropy": 1.6639246940612793, + "epoch": 0.7785559309000027, + "grad_norm": 0.718908429145813, + "learning_rate": 1.4566732482625423e-05, + "loss": 1.3575, + "mean_token_accuracy": 0.6681891083717346, + "num_tokens": 1190316623.0, + "step": 7087 + }, + { + "entropy": 1.6937816043694813, + "epoch": 0.7786657878113756, + "grad_norm": 0.7970039248466492, + "learning_rate": 1.4565262398567352e-05, + "loss": 1.2439, + "mean_token_accuracy": 0.6761472771565119, + "num_tokens": 1190464941.0, + "step": 7088 + }, + { + "entropy": 1.639806220928828, + "epoch": 0.7787756447227486, + "grad_norm": 0.5538852214813232, + "learning_rate": 1.4563792201670334e-05, + "loss": 1.3917, + "mean_token_accuracy": 0.6550086786349615, + "num_tokens": 1190671240.0, + "step": 7089 + }, + { + "entropy": 1.7520277798175812, + "epoch": 0.7788855016341215, + "grad_norm": 0.7734472751617432, + "learning_rate": 1.45623218919809e-05, + "loss": 1.3325, + "mean_token_accuracy": 0.6582324057817459, + "num_tokens": 1190786339.0, + "step": 7090 + }, + { + "entropy": 1.715962419907252, + "epoch": 0.7789953585454945, + "grad_norm": 0.682809054851532, + "learning_rate": 1.456085146954558e-05, + "loss": 1.3316, + "mean_token_accuracy": 0.6533773044745127, + "num_tokens": 1190936395.0, + "step": 7091 + }, + { + "entropy": 1.6548383732636769, + "epoch": 0.7791052154568674, + "grad_norm": 0.7674005627632141, + "learning_rate": 1.4559380934410918e-05, + "loss": 1.4276, + "mean_token_accuracy": 0.6647045860687891, + "num_tokens": 1191063187.0, + "step": 7092 + }, + { + "entropy": 1.7424963613351185, + "epoch": 0.7792150723682404, + "grad_norm": 0.8039253950119019, + "learning_rate": 1.4557910286623456e-05, + "loss": 1.4081, + "mean_token_accuracy": 0.6619451393683752, + "num_tokens": 1191216376.0, + "step": 7093 + }, + { + "entropy": 1.673016995191574, + "epoch": 0.7793249292796133, + "grad_norm": 0.750651478767395, + "learning_rate": 1.455643952622973e-05, + "loss": 1.4322, + "mean_token_accuracy": 0.6569622804721197, + "num_tokens": 1191362087.0, + "step": 7094 + }, + { + "entropy": 1.7055266002813976, + "epoch": 0.7794347861909863, + "grad_norm": 0.74098801612854, + "learning_rate": 1.4554968653276303e-05, + "loss": 1.4917, + "mean_token_accuracy": 0.6539320250352224, + "num_tokens": 1191506306.0, + "step": 7095 + }, + { + "entropy": 1.6832468211650848, + "epoch": 0.7795446431023592, + "grad_norm": 0.6400901675224304, + "learning_rate": 1.4553497667809716e-05, + "loss": 1.5095, + "mean_token_accuracy": 0.637568806608518, + "num_tokens": 1191702148.0, + "step": 7096 + }, + { + "entropy": 1.6865523755550385, + "epoch": 0.7796545000137322, + "grad_norm": 0.7597500681877136, + "learning_rate": 1.455202656987653e-05, + "loss": 1.5815, + "mean_token_accuracy": 0.6512685567140579, + "num_tokens": 1191868681.0, + "step": 7097 + }, + { + "entropy": 1.7562141319115956, + "epoch": 0.779764356925105, + "grad_norm": 0.7682708501815796, + "learning_rate": 1.4550555359523303e-05, + "loss": 1.3168, + "mean_token_accuracy": 0.6706572075684866, + "num_tokens": 1192009219.0, + "step": 7098 + }, + { + "entropy": 1.7148883839448292, + "epoch": 0.779874213836478, + "grad_norm": 0.7574900984764099, + "learning_rate": 1.45490840367966e-05, + "loss": 1.6325, + "mean_token_accuracy": 0.6384792327880859, + "num_tokens": 1192191135.0, + "step": 7099 + }, + { + "entropy": 1.6949690977732341, + "epoch": 0.7799840707478509, + "grad_norm": 0.643990695476532, + "learning_rate": 1.4547612601742984e-05, + "loss": 1.3238, + "mean_token_accuracy": 0.6610787808895111, + "num_tokens": 1192365786.0, + "step": 7100 + }, + { + "entropy": 1.6968937317530315, + "epoch": 0.7800939276592238, + "grad_norm": 0.6925724744796753, + "learning_rate": 1.4546141054409026e-05, + "loss": 1.3673, + "mean_token_accuracy": 0.6582049876451492, + "num_tokens": 1192499916.0, + "step": 7101 + }, + { + "entropy": 1.6771467129389446, + "epoch": 0.7802037845705968, + "grad_norm": 6.278994560241699, + "learning_rate": 1.4544669394841307e-05, + "loss": 1.3181, + "mean_token_accuracy": 0.676389808456103, + "num_tokens": 1192644459.0, + "step": 7102 + }, + { + "entropy": 1.7438280681769054, + "epoch": 0.7803136414819697, + "grad_norm": 0.6767034530639648, + "learning_rate": 1.4543197623086398e-05, + "loss": 1.2602, + "mean_token_accuracy": 0.6726950407028198, + "num_tokens": 1192759155.0, + "step": 7103 + }, + { + "entropy": 1.6500231822331746, + "epoch": 0.7804234983933427, + "grad_norm": 0.642570972442627, + "learning_rate": 1.454172573919088e-05, + "loss": 1.3393, + "mean_token_accuracy": 0.6680295219024023, + "num_tokens": 1192903425.0, + "step": 7104 + }, + { + "entropy": 1.6823652784029643, + "epoch": 0.7805333553047156, + "grad_norm": 0.7715455889701843, + "learning_rate": 1.4540253743201336e-05, + "loss": 1.1928, + "mean_token_accuracy": 0.6783884565035502, + "num_tokens": 1193030964.0, + "step": 7105 + }, + { + "entropy": 1.67123677333196, + "epoch": 0.7806432122160886, + "grad_norm": 0.6010688543319702, + "learning_rate": 1.4538781635164359e-05, + "loss": 1.4498, + "mean_token_accuracy": 0.6451991299788157, + "num_tokens": 1193238226.0, + "step": 7106 + }, + { + "entropy": 1.7276023924350739, + "epoch": 0.7807530691274615, + "grad_norm": 0.7072981595993042, + "learning_rate": 1.4537309415126535e-05, + "loss": 1.2735, + "mean_token_accuracy": 0.677293395002683, + "num_tokens": 1193368968.0, + "step": 7107 + }, + { + "entropy": 1.7027284701665242, + "epoch": 0.7808629260388344, + "grad_norm": 1.048176884651184, + "learning_rate": 1.4535837083134465e-05, + "loss": 1.5693, + "mean_token_accuracy": 0.6457755664984385, + "num_tokens": 1193493783.0, + "step": 7108 + }, + { + "entropy": 1.710277110338211, + "epoch": 0.7809727829502073, + "grad_norm": 0.7665229439735413, + "learning_rate": 1.4534364639234744e-05, + "loss": 1.3597, + "mean_token_accuracy": 0.6668632626533508, + "num_tokens": 1193646619.0, + "step": 7109 + }, + { + "entropy": 1.7380196849505107, + "epoch": 0.7810826398615803, + "grad_norm": 0.6514268517494202, + "learning_rate": 1.4532892083473973e-05, + "loss": 1.4335, + "mean_token_accuracy": 0.6352319270372391, + "num_tokens": 1193853184.0, + "step": 7110 + }, + { + "entropy": 1.659955104192098, + "epoch": 0.7811924967729532, + "grad_norm": 0.6248233914375305, + "learning_rate": 1.4531419415898762e-05, + "loss": 1.4731, + "mean_token_accuracy": 0.6436507304509481, + "num_tokens": 1194058800.0, + "step": 7111 + }, + { + "entropy": 1.766392429669698, + "epoch": 0.7813023536843262, + "grad_norm": 0.6671506762504578, + "learning_rate": 1.4529946636555716e-05, + "loss": 1.4857, + "mean_token_accuracy": 0.642402226726214, + "num_tokens": 1194250202.0, + "step": 7112 + }, + { + "entropy": 1.737959663073222, + "epoch": 0.7814122105956991, + "grad_norm": 0.5975798964500427, + "learning_rate": 1.452847374549145e-05, + "loss": 1.3968, + "mean_token_accuracy": 0.6597791264454523, + "num_tokens": 1194441641.0, + "step": 7113 + }, + { + "entropy": 1.7000750998655956, + "epoch": 0.7815220675070721, + "grad_norm": 0.6063849329948425, + "learning_rate": 1.452700074275258e-05, + "loss": 1.4046, + "mean_token_accuracy": 0.6620944837729136, + "num_tokens": 1194625979.0, + "step": 7114 + }, + { + "entropy": 1.7005607883135478, + "epoch": 0.781631924418445, + "grad_norm": 0.5850129127502441, + "learning_rate": 1.4525527628385728e-05, + "loss": 1.3936, + "mean_token_accuracy": 0.6443201154470444, + "num_tokens": 1194826016.0, + "step": 7115 + }, + { + "entropy": 1.728336493174235, + "epoch": 0.7817417813298179, + "grad_norm": 0.6136082410812378, + "learning_rate": 1.4524054402437511e-05, + "loss": 1.4068, + "mean_token_accuracy": 0.6534546116987864, + "num_tokens": 1194994162.0, + "step": 7116 + }, + { + "entropy": 1.7266682982444763, + "epoch": 0.7818516382411909, + "grad_norm": 0.6669444441795349, + "learning_rate": 1.4522581064954563e-05, + "loss": 1.3809, + "mean_token_accuracy": 0.6540538171927134, + "num_tokens": 1195189176.0, + "step": 7117 + }, + { + "entropy": 1.756045748790105, + "epoch": 0.7819614951525637, + "grad_norm": 0.674505889415741, + "learning_rate": 1.4521107615983511e-05, + "loss": 1.3878, + "mean_token_accuracy": 0.6560290704170862, + "num_tokens": 1195346345.0, + "step": 7118 + }, + { + "entropy": 1.6180338263511658, + "epoch": 0.7820713520639367, + "grad_norm": 0.6228286623954773, + "learning_rate": 1.4519634055570988e-05, + "loss": 1.2565, + "mean_token_accuracy": 0.6812761723995209, + "num_tokens": 1195480456.0, + "step": 7119 + }, + { + "entropy": 1.7572944561640422, + "epoch": 0.7821812089753096, + "grad_norm": 0.6308412551879883, + "learning_rate": 1.4518160383763635e-05, + "loss": 1.4338, + "mean_token_accuracy": 0.6442923347155253, + "num_tokens": 1195630768.0, + "step": 7120 + }, + { + "entropy": 1.7162999709447224, + "epoch": 0.7822910658866826, + "grad_norm": 0.7594370245933533, + "learning_rate": 1.4516686600608089e-05, + "loss": 1.4204, + "mean_token_accuracy": 0.656131515900294, + "num_tokens": 1195813359.0, + "step": 7121 + }, + { + "entropy": 1.721927394469579, + "epoch": 0.7824009227980555, + "grad_norm": 0.7334505915641785, + "learning_rate": 1.4515212706151001e-05, + "loss": 1.2533, + "mean_token_accuracy": 0.6717882007360458, + "num_tokens": 1195918100.0, + "step": 7122 + }, + { + "entropy": 1.7734851737817128, + "epoch": 0.7825107797094285, + "grad_norm": 0.949683427810669, + "learning_rate": 1.4513738700439014e-05, + "loss": 1.5294, + "mean_token_accuracy": 0.642572283744812, + "num_tokens": 1196061449.0, + "step": 7123 + }, + { + "entropy": 1.7282946904500325, + "epoch": 0.7826206366208014, + "grad_norm": 0.7295317649841309, + "learning_rate": 1.4512264583518776e-05, + "loss": 1.4733, + "mean_token_accuracy": 0.6531463364760081, + "num_tokens": 1196213722.0, + "step": 7124 + }, + { + "entropy": 1.6672942737738292, + "epoch": 0.7827304935321744, + "grad_norm": 0.620469868183136, + "learning_rate": 1.451079035543695e-05, + "loss": 1.3513, + "mean_token_accuracy": 0.6611681828896204, + "num_tokens": 1196354639.0, + "step": 7125 + }, + { + "entropy": 1.743516246477763, + "epoch": 0.7828403504435473, + "grad_norm": 0.7476531863212585, + "learning_rate": 1.4509316016240189e-05, + "loss": 1.4161, + "mean_token_accuracy": 0.6624071647723516, + "num_tokens": 1196531954.0, + "step": 7126 + }, + { + "entropy": 1.6648136377334595, + "epoch": 0.7829502073549203, + "grad_norm": 0.6160597801208496, + "learning_rate": 1.4507841565975163e-05, + "loss": 1.3679, + "mean_token_accuracy": 0.6603780339161555, + "num_tokens": 1196696042.0, + "step": 7127 + }, + { + "entropy": 1.6777517398198445, + "epoch": 0.7830600642662932, + "grad_norm": 0.667905330657959, + "learning_rate": 1.4506367004688526e-05, + "loss": 1.2421, + "mean_token_accuracy": 0.6805467208226522, + "num_tokens": 1196869852.0, + "step": 7128 + }, + { + "entropy": 1.773158888022105, + "epoch": 0.783169921177666, + "grad_norm": 0.8235062956809998, + "learning_rate": 1.4504892332426954e-05, + "loss": 1.497, + "mean_token_accuracy": 0.6440630505482355, + "num_tokens": 1197042159.0, + "step": 7129 + }, + { + "entropy": 1.67943408091863, + "epoch": 0.783279778089039, + "grad_norm": 0.6340872049331665, + "learning_rate": 1.450341754923712e-05, + "loss": 1.3416, + "mean_token_accuracy": 0.6593078672885895, + "num_tokens": 1197192533.0, + "step": 7130 + }, + { + "entropy": 1.6706339716911316, + "epoch": 0.7833896350004119, + "grad_norm": 0.5731471180915833, + "learning_rate": 1.4501942655165701e-05, + "loss": 1.4853, + "mean_token_accuracy": 0.6310961991548538, + "num_tokens": 1197393580.0, + "step": 7131 + }, + { + "entropy": 1.698110560576121, + "epoch": 0.7834994919117849, + "grad_norm": 0.7156584858894348, + "learning_rate": 1.4500467650259373e-05, + "loss": 1.3617, + "mean_token_accuracy": 0.6580530057350794, + "num_tokens": 1197545276.0, + "step": 7132 + }, + { + "entropy": 1.6475163499514263, + "epoch": 0.7836093488231578, + "grad_norm": 0.8746734261512756, + "learning_rate": 1.4498992534564823e-05, + "loss": 1.3337, + "mean_token_accuracy": 0.6697489966948827, + "num_tokens": 1197712691.0, + "step": 7133 + }, + { + "entropy": 1.7037550906340282, + "epoch": 0.7837192057345308, + "grad_norm": 0.6587815284729004, + "learning_rate": 1.4497517308128734e-05, + "loss": 1.4479, + "mean_token_accuracy": 0.6581203639507294, + "num_tokens": 1197877900.0, + "step": 7134 + }, + { + "entropy": 1.6985561152299244, + "epoch": 0.7838290626459037, + "grad_norm": 0.6671653985977173, + "learning_rate": 1.44960419709978e-05, + "loss": 1.2729, + "mean_token_accuracy": 0.6706394900878271, + "num_tokens": 1198012365.0, + "step": 7135 + }, + { + "entropy": 1.68305508295695, + "epoch": 0.7839389195572767, + "grad_norm": 0.6679463982582092, + "learning_rate": 1.449456652321871e-05, + "loss": 1.2377, + "mean_token_accuracy": 0.6831518908341726, + "num_tokens": 1198152717.0, + "step": 7136 + }, + { + "entropy": 1.7280430893103282, + "epoch": 0.7840487764686496, + "grad_norm": 0.8825252056121826, + "learning_rate": 1.4493090964838167e-05, + "loss": 1.3264, + "mean_token_accuracy": 0.6597543060779572, + "num_tokens": 1198339044.0, + "step": 7137 + }, + { + "entropy": 1.7101606527964275, + "epoch": 0.7841586333800226, + "grad_norm": 0.6220462322235107, + "learning_rate": 1.449161529590287e-05, + "loss": 1.3644, + "mean_token_accuracy": 0.6578847219546636, + "num_tokens": 1198461351.0, + "step": 7138 + }, + { + "entropy": 1.6869953870773315, + "epoch": 0.7842684902913954, + "grad_norm": 0.6071659922599792, + "learning_rate": 1.449013951645952e-05, + "loss": 1.552, + "mean_token_accuracy": 0.636215329170227, + "num_tokens": 1198670100.0, + "step": 7139 + }, + { + "entropy": 1.6908225218454997, + "epoch": 0.7843783472027684, + "grad_norm": 0.72711580991745, + "learning_rate": 1.4488663626554826e-05, + "loss": 1.4751, + "mean_token_accuracy": 0.6628256092468897, + "num_tokens": 1198842301.0, + "step": 7140 + }, + { + "entropy": 1.7508656183878581, + "epoch": 0.7844882041141413, + "grad_norm": 0.6913683414459229, + "learning_rate": 1.4487187626235504e-05, + "loss": 1.4579, + "mean_token_accuracy": 0.6502855817476908, + "num_tokens": 1198992313.0, + "step": 7141 + }, + { + "entropy": 1.6920421818892162, + "epoch": 0.7845980610255142, + "grad_norm": 0.711681604385376, + "learning_rate": 1.4485711515548261e-05, + "loss": 1.37, + "mean_token_accuracy": 0.6575459539890289, + "num_tokens": 1199136804.0, + "step": 7142 + }, + { + "entropy": 1.6301299730936687, + "epoch": 0.7847079179368872, + "grad_norm": 0.7222129106521606, + "learning_rate": 1.4484235294539824e-05, + "loss": 1.3653, + "mean_token_accuracy": 0.669685035943985, + "num_tokens": 1199302100.0, + "step": 7143 + }, + { + "entropy": 1.6759273211161296, + "epoch": 0.7848177748482601, + "grad_norm": 0.727353036403656, + "learning_rate": 1.4482758963256904e-05, + "loss": 1.2621, + "mean_token_accuracy": 0.6747185587882996, + "num_tokens": 1199431204.0, + "step": 7144 + }, + { + "entropy": 1.7488112548987071, + "epoch": 0.7849276317596331, + "grad_norm": 0.6163308024406433, + "learning_rate": 1.4481282521746236e-05, + "loss": 1.5528, + "mean_token_accuracy": 0.6342363655567169, + "num_tokens": 1199615484.0, + "step": 7145 + }, + { + "entropy": 1.7138899366060893, + "epoch": 0.785037488671006, + "grad_norm": 0.619773268699646, + "learning_rate": 1.4479805970054544e-05, + "loss": 1.4131, + "mean_token_accuracy": 0.6630217432975769, + "num_tokens": 1199768737.0, + "step": 7146 + }, + { + "entropy": 1.7128514150778453, + "epoch": 0.785147345582379, + "grad_norm": 0.7631600499153137, + "learning_rate": 1.447832930822856e-05, + "loss": 1.4182, + "mean_token_accuracy": 0.6703273256619772, + "num_tokens": 1199917429.0, + "step": 7147 + }, + { + "entropy": 1.7253048022588093, + "epoch": 0.7852572024937519, + "grad_norm": 0.6738438010215759, + "learning_rate": 1.4476852536315022e-05, + "loss": 1.2802, + "mean_token_accuracy": 0.6651460230350494, + "num_tokens": 1200032163.0, + "step": 7148 + }, + { + "entropy": 1.673990160226822, + "epoch": 0.7853670594051249, + "grad_norm": 0.6391650438308716, + "learning_rate": 1.4475375654360669e-05, + "loss": 1.3706, + "mean_token_accuracy": 0.6688804576794306, + "num_tokens": 1200202637.0, + "step": 7149 + }, + { + "entropy": 1.7560782929261525, + "epoch": 0.7854769163164977, + "grad_norm": 0.683594286441803, + "learning_rate": 1.447389866241224e-05, + "loss": 1.4088, + "mean_token_accuracy": 0.6569011211395264, + "num_tokens": 1200359101.0, + "step": 7150 + }, + { + "entropy": 1.7138656278451283, + "epoch": 0.7855867732278707, + "grad_norm": 0.6900503635406494, + "learning_rate": 1.4472421560516485e-05, + "loss": 1.4651, + "mean_token_accuracy": 0.6426846434672674, + "num_tokens": 1200560872.0, + "step": 7151 + }, + { + "entropy": 1.7244854867458344, + "epoch": 0.7856966301392436, + "grad_norm": 0.6141315698623657, + "learning_rate": 1.4470944348720155e-05, + "loss": 1.4302, + "mean_token_accuracy": 0.6623003830512365, + "num_tokens": 1200716958.0, + "step": 7152 + }, + { + "entropy": 1.712331473827362, + "epoch": 0.7858064870506166, + "grad_norm": 0.6378352046012878, + "learning_rate": 1.4469467027069996e-05, + "loss": 1.4881, + "mean_token_accuracy": 0.6507144321997961, + "num_tokens": 1200953835.0, + "step": 7153 + }, + { + "entropy": 1.7112789849440257, + "epoch": 0.7859163439619895, + "grad_norm": 0.6243149042129517, + "learning_rate": 1.446798959561277e-05, + "loss": 1.3715, + "mean_token_accuracy": 0.6659561494986216, + "num_tokens": 1201154401.0, + "step": 7154 + }, + { + "entropy": 1.6723881363868713, + "epoch": 0.7860262008733624, + "grad_norm": 0.7144157290458679, + "learning_rate": 1.4466512054395238e-05, + "loss": 1.2879, + "mean_token_accuracy": 0.672190397977829, + "num_tokens": 1201292259.0, + "step": 7155 + }, + { + "entropy": 1.6763904094696045, + "epoch": 0.7861360577847354, + "grad_norm": 0.6170639395713806, + "learning_rate": 1.446503440346416e-05, + "loss": 1.2683, + "mean_token_accuracy": 0.6757438133160273, + "num_tokens": 1201434618.0, + "step": 7156 + }, + { + "entropy": 1.7109374403953552, + "epoch": 0.7862459146961083, + "grad_norm": 0.7301081418991089, + "learning_rate": 1.4463556642866305e-05, + "loss": 1.3605, + "mean_token_accuracy": 0.6616794069608053, + "num_tokens": 1201592457.0, + "step": 7157 + }, + { + "entropy": 1.6911123394966125, + "epoch": 0.7863557716074813, + "grad_norm": 0.6597540974617004, + "learning_rate": 1.4462078772648445e-05, + "loss": 1.4674, + "mean_token_accuracy": 0.6407536615928014, + "num_tokens": 1201781440.0, + "step": 7158 + }, + { + "entropy": 1.6666575372219086, + "epoch": 0.7864656285188542, + "grad_norm": 0.5943217873573303, + "learning_rate": 1.4460600792857349e-05, + "loss": 1.4631, + "mean_token_accuracy": 0.6552510807911555, + "num_tokens": 1201987350.0, + "step": 7159 + }, + { + "entropy": 1.736344705025355, + "epoch": 0.7865754854302272, + "grad_norm": 0.6110522150993347, + "learning_rate": 1.4459122703539796e-05, + "loss": 1.5263, + "mean_token_accuracy": 0.6426798502604166, + "num_tokens": 1202207575.0, + "step": 7160 + }, + { + "entropy": 1.7043259739875793, + "epoch": 0.7866853423416, + "grad_norm": 0.763015866279602, + "learning_rate": 1.4457644504742572e-05, + "loss": 1.4422, + "mean_token_accuracy": 0.654599666595459, + "num_tokens": 1202348000.0, + "step": 7161 + }, + { + "entropy": 1.7122917970021565, + "epoch": 0.786795199252973, + "grad_norm": 0.8961231112480164, + "learning_rate": 1.4456166196512453e-05, + "loss": 1.4143, + "mean_token_accuracy": 0.6518704841534296, + "num_tokens": 1202509044.0, + "step": 7162 + }, + { + "entropy": 1.6719779272874196, + "epoch": 0.7869050561643459, + "grad_norm": 0.6332946419715881, + "learning_rate": 1.4454687778896235e-05, + "loss": 1.4086, + "mean_token_accuracy": 0.6405781507492065, + "num_tokens": 1202724539.0, + "step": 7163 + }, + { + "entropy": 1.7168918947378795, + "epoch": 0.7870149130757189, + "grad_norm": 0.7975315451622009, + "learning_rate": 1.4453209251940706e-05, + "loss": 1.3936, + "mean_token_accuracy": 0.6588208178679148, + "num_tokens": 1202860338.0, + "step": 7164 + }, + { + "entropy": 1.7201037506262462, + "epoch": 0.7871247699870918, + "grad_norm": 0.7638601660728455, + "learning_rate": 1.4451730615692658e-05, + "loss": 1.3508, + "mean_token_accuracy": 0.6692859182755152, + "num_tokens": 1202992554.0, + "step": 7165 + }, + { + "entropy": 1.6601012448469799, + "epoch": 0.7872346268984648, + "grad_norm": 0.5404378175735474, + "learning_rate": 1.445025187019889e-05, + "loss": 1.3115, + "mean_token_accuracy": 0.6680645495653152, + "num_tokens": 1203174793.0, + "step": 7166 + }, + { + "entropy": 1.7655375202496846, + "epoch": 0.7873444838098377, + "grad_norm": 0.7130011320114136, + "learning_rate": 1.444877301550621e-05, + "loss": 1.4636, + "mean_token_accuracy": 0.6438860942920049, + "num_tokens": 1203369080.0, + "step": 7167 + }, + { + "entropy": 1.7369131445884705, + "epoch": 0.7874543407212107, + "grad_norm": 0.6770559549331665, + "learning_rate": 1.4447294051661414e-05, + "loss": 1.3676, + "mean_token_accuracy": 0.6571057687203089, + "num_tokens": 1203515924.0, + "step": 7168 + }, + { + "entropy": 1.7329241931438446, + "epoch": 0.7875641976325836, + "grad_norm": 0.6496652960777283, + "learning_rate": 1.4445814978711317e-05, + "loss": 1.5801, + "mean_token_accuracy": 0.6215295642614365, + "num_tokens": 1203718060.0, + "step": 7169 + }, + { + "entropy": 1.7130872507890065, + "epoch": 0.7876740545439564, + "grad_norm": 0.6565669178962708, + "learning_rate": 1.4444335796702726e-05, + "loss": 1.449, + "mean_token_accuracy": 0.6431066493193308, + "num_tokens": 1203949259.0, + "step": 7170 + }, + { + "entropy": 1.7333206037680309, + "epoch": 0.7877839114553294, + "grad_norm": 0.7088605165481567, + "learning_rate": 1.4442856505682462e-05, + "loss": 1.4145, + "mean_token_accuracy": 0.6694990048805872, + "num_tokens": 1204132043.0, + "step": 7171 + }, + { + "entropy": 1.697807510693868, + "epoch": 0.7878937683667023, + "grad_norm": 0.8567109704017639, + "learning_rate": 1.4441377105697339e-05, + "loss": 1.4784, + "mean_token_accuracy": 0.6585593720277151, + "num_tokens": 1204333339.0, + "step": 7172 + }, + { + "entropy": 1.6664861639340718, + "epoch": 0.7880036252780753, + "grad_norm": 0.711167573928833, + "learning_rate": 1.443989759679418e-05, + "loss": 1.3998, + "mean_token_accuracy": 0.6541877388954163, + "num_tokens": 1204476596.0, + "step": 7173 + }, + { + "entropy": 1.7098850707213085, + "epoch": 0.7881134821894482, + "grad_norm": 0.6929422616958618, + "learning_rate": 1.4438417979019817e-05, + "loss": 1.5313, + "mean_token_accuracy": 0.634042297800382, + "num_tokens": 1204657222.0, + "step": 7174 + }, + { + "entropy": 1.7105639080206554, + "epoch": 0.7882233391008212, + "grad_norm": 0.6769076585769653, + "learning_rate": 1.443693825242107e-05, + "loss": 1.4749, + "mean_token_accuracy": 0.6523736665646235, + "num_tokens": 1204808131.0, + "step": 7175 + }, + { + "entropy": 1.6820484797159831, + "epoch": 0.7883331960121941, + "grad_norm": 0.6611175537109375, + "learning_rate": 1.4435458417044777e-05, + "loss": 1.3882, + "mean_token_accuracy": 0.6505205978949865, + "num_tokens": 1204971165.0, + "step": 7176 + }, + { + "entropy": 1.6880492369333904, + "epoch": 0.7884430529235671, + "grad_norm": 0.7320232391357422, + "learning_rate": 1.4433978472937776e-05, + "loss": 1.312, + "mean_token_accuracy": 0.6604448159535726, + "num_tokens": 1205093123.0, + "step": 7177 + }, + { + "entropy": 1.736960728963216, + "epoch": 0.78855290983494, + "grad_norm": 0.6706869602203369, + "learning_rate": 1.44324984201469e-05, + "loss": 1.3149, + "mean_token_accuracy": 0.6695601592461268, + "num_tokens": 1205218514.0, + "step": 7178 + }, + { + "entropy": 1.7094795008500416, + "epoch": 0.788662766746313, + "grad_norm": 0.6586654186248779, + "learning_rate": 1.4431018258718996e-05, + "loss": 1.3724, + "mean_token_accuracy": 0.649998739361763, + "num_tokens": 1205350578.0, + "step": 7179 + }, + { + "entropy": 1.6713014940420787, + "epoch": 0.7887726236576859, + "grad_norm": 0.6533616781234741, + "learning_rate": 1.4429537988700913e-05, + "loss": 1.3621, + "mean_token_accuracy": 0.6759164482355118, + "num_tokens": 1205503254.0, + "step": 7180 + }, + { + "entropy": 1.683766891558965, + "epoch": 0.7888824805690589, + "grad_norm": 0.67979496717453, + "learning_rate": 1.4428057610139495e-05, + "loss": 1.3932, + "mean_token_accuracy": 0.6734537233908972, + "num_tokens": 1205639667.0, + "step": 7181 + }, + { + "entropy": 1.7113385399182637, + "epoch": 0.7889923374804317, + "grad_norm": 0.712390124797821, + "learning_rate": 1.4426577123081597e-05, + "loss": 1.2857, + "mean_token_accuracy": 0.6757234086592993, + "num_tokens": 1205769605.0, + "step": 7182 + }, + { + "entropy": 1.6981197694937389, + "epoch": 0.7891021943918046, + "grad_norm": 0.6102924942970276, + "learning_rate": 1.4425096527574082e-05, + "loss": 1.3276, + "mean_token_accuracy": 0.6542087992032369, + "num_tokens": 1205933495.0, + "step": 7183 + }, + { + "entropy": 1.6887332499027252, + "epoch": 0.7892120513031776, + "grad_norm": 0.745152473449707, + "learning_rate": 1.4423615823663804e-05, + "loss": 1.3603, + "mean_token_accuracy": 0.6610602786143621, + "num_tokens": 1206050631.0, + "step": 7184 + }, + { + "entropy": 1.7006172637144725, + "epoch": 0.7893219082145505, + "grad_norm": 0.6827918887138367, + "learning_rate": 1.4422135011397627e-05, + "loss": 1.2546, + "mean_token_accuracy": 0.6747910380363464, + "num_tokens": 1206167560.0, + "step": 7185 + }, + { + "entropy": 1.7094734410444896, + "epoch": 0.7894317651259235, + "grad_norm": 0.7063429951667786, + "learning_rate": 1.4420654090822416e-05, + "loss": 1.3133, + "mean_token_accuracy": 0.6641270716985067, + "num_tokens": 1206342612.0, + "step": 7186 + }, + { + "entropy": 1.6872341831525166, + "epoch": 0.7895416220372964, + "grad_norm": 0.7216169238090515, + "learning_rate": 1.4419173061985048e-05, + "loss": 1.3068, + "mean_token_accuracy": 0.6675632099310557, + "num_tokens": 1206480850.0, + "step": 7187 + }, + { + "entropy": 1.7280444105466206, + "epoch": 0.7896514789486694, + "grad_norm": 0.7706807851791382, + "learning_rate": 1.4417691924932394e-05, + "loss": 1.3474, + "mean_token_accuracy": 0.6651838620503744, + "num_tokens": 1206625169.0, + "step": 7188 + }, + { + "entropy": 1.719985653956731, + "epoch": 0.7897613358600423, + "grad_norm": 0.7672920227050781, + "learning_rate": 1.441621067971133e-05, + "loss": 1.3996, + "mean_token_accuracy": 0.6526385049025217, + "num_tokens": 1206774389.0, + "step": 7189 + }, + { + "entropy": 1.7050624787807465, + "epoch": 0.7898711927714153, + "grad_norm": 0.8137822151184082, + "learning_rate": 1.4414729326368736e-05, + "loss": 1.4153, + "mean_token_accuracy": 0.657172903418541, + "num_tokens": 1206920548.0, + "step": 7190 + }, + { + "entropy": 1.7352370421091716, + "epoch": 0.7899810496827882, + "grad_norm": 0.6931704878807068, + "learning_rate": 1.4413247864951499e-05, + "loss": 1.4766, + "mean_token_accuracy": 0.6547851413488388, + "num_tokens": 1207111742.0, + "step": 7191 + }, + { + "entropy": 1.6695275406042736, + "epoch": 0.7900909065941611, + "grad_norm": 0.7764499187469482, + "learning_rate": 1.4411766295506502e-05, + "loss": 1.1244, + "mean_token_accuracy": 0.7065702676773071, + "num_tokens": 1207238801.0, + "step": 7192 + }, + { + "entropy": 1.651973952849706, + "epoch": 0.790200763505534, + "grad_norm": 0.8136328458786011, + "learning_rate": 1.4410284618080644e-05, + "loss": 1.3584, + "mean_token_accuracy": 0.6729863931735357, + "num_tokens": 1207392700.0, + "step": 7193 + }, + { + "entropy": 1.6933129529158275, + "epoch": 0.790310620416907, + "grad_norm": 0.7404091954231262, + "learning_rate": 1.440880283272081e-05, + "loss": 1.407, + "mean_token_accuracy": 0.6515365193287531, + "num_tokens": 1207583914.0, + "step": 7194 + }, + { + "entropy": 1.7414989471435547, + "epoch": 0.7904204773282799, + "grad_norm": 0.7047650814056396, + "learning_rate": 1.4407320939473903e-05, + "loss": 1.386, + "mean_token_accuracy": 0.6629294902086258, + "num_tokens": 1207747681.0, + "step": 7195 + }, + { + "entropy": 1.7473087112108867, + "epoch": 0.7905303342396528, + "grad_norm": 0.6341284513473511, + "learning_rate": 1.4405838938386827e-05, + "loss": 1.4854, + "mean_token_accuracy": 0.6382193118333817, + "num_tokens": 1207993633.0, + "step": 7196 + }, + { + "entropy": 1.7010388871033986, + "epoch": 0.7906401911510258, + "grad_norm": 0.7494300603866577, + "learning_rate": 1.440435682950648e-05, + "loss": 1.3331, + "mean_token_accuracy": 0.669193853934606, + "num_tokens": 1208164918.0, + "step": 7197 + }, + { + "entropy": 1.7257753908634186, + "epoch": 0.7907500480623987, + "grad_norm": 0.7151653170585632, + "learning_rate": 1.4402874612879774e-05, + "loss": 1.2647, + "mean_token_accuracy": 0.6737553824981054, + "num_tokens": 1208275864.0, + "step": 7198 + }, + { + "entropy": 1.7049545844395955, + "epoch": 0.7908599049737717, + "grad_norm": 0.8426851630210876, + "learning_rate": 1.4401392288553622e-05, + "loss": 1.4961, + "mean_token_accuracy": 0.6478038181861242, + "num_tokens": 1208466701.0, + "step": 7199 + }, + { + "entropy": 1.7350122928619385, + "epoch": 0.7909697618851446, + "grad_norm": 0.816241979598999, + "learning_rate": 1.4399909856574931e-05, + "loss": 1.3144, + "mean_token_accuracy": 0.6634030193090439, + "num_tokens": 1208595401.0, + "step": 7200 + }, + { + "entropy": 1.7607790033022563, + "epoch": 0.7910796187965176, + "grad_norm": 0.6901513934135437, + "learning_rate": 1.4398427316990633e-05, + "loss": 1.348, + "mean_token_accuracy": 0.6634863515694936, + "num_tokens": 1208778298.0, + "step": 7201 + }, + { + "entropy": 1.7141740421454112, + "epoch": 0.7911894757078904, + "grad_norm": 0.6500587463378906, + "learning_rate": 1.4396944669847637e-05, + "loss": 1.4433, + "mean_token_accuracy": 0.6416673759619395, + "num_tokens": 1208970192.0, + "step": 7202 + }, + { + "entropy": 1.68813360730807, + "epoch": 0.7912993326192634, + "grad_norm": 0.6901952624320984, + "learning_rate": 1.4395461915192875e-05, + "loss": 1.366, + "mean_token_accuracy": 0.6568211714426676, + "num_tokens": 1209101142.0, + "step": 7203 + }, + { + "entropy": 1.6981943150361378, + "epoch": 0.7914091895306363, + "grad_norm": 0.7170037627220154, + "learning_rate": 1.439397905307327e-05, + "loss": 1.5666, + "mean_token_accuracy": 0.6461018125216166, + "num_tokens": 1209262785.0, + "step": 7204 + }, + { + "entropy": 1.5867635409037273, + "epoch": 0.7915190464420093, + "grad_norm": 0.7785094976425171, + "learning_rate": 1.4392496083535764e-05, + "loss": 1.3767, + "mean_token_accuracy": 0.6633408665657043, + "num_tokens": 1209442179.0, + "step": 7205 + }, + { + "entropy": 1.7308302025000255, + "epoch": 0.7916289033533822, + "grad_norm": 0.6985065937042236, + "learning_rate": 1.4391013006627276e-05, + "loss": 1.4598, + "mean_token_accuracy": 0.6464549154043198, + "num_tokens": 1209665973.0, + "step": 7206 + }, + { + "entropy": 1.7229714790980022, + "epoch": 0.7917387602647552, + "grad_norm": 0.6461872458457947, + "learning_rate": 1.438952982239476e-05, + "loss": 1.4052, + "mean_token_accuracy": 0.6459956765174866, + "num_tokens": 1209842142.0, + "step": 7207 + }, + { + "entropy": 1.6630838414033253, + "epoch": 0.7918486171761281, + "grad_norm": 0.7045498490333557, + "learning_rate": 1.4388046530885156e-05, + "loss": 1.2883, + "mean_token_accuracy": 0.6830503195524216, + "num_tokens": 1209971675.0, + "step": 7208 + }, + { + "entropy": 1.712927410999934, + "epoch": 0.7919584740875011, + "grad_norm": 0.8392392992973328, + "learning_rate": 1.43865631321454e-05, + "loss": 1.3648, + "mean_token_accuracy": 0.6657893657684326, + "num_tokens": 1210119563.0, + "step": 7209 + }, + { + "entropy": 1.67608709136645, + "epoch": 0.792068330998874, + "grad_norm": 0.6663906574249268, + "learning_rate": 1.438507962622245e-05, + "loss": 1.2885, + "mean_token_accuracy": 0.6710440864165624, + "num_tokens": 1210253870.0, + "step": 7210 + }, + { + "entropy": 1.7072576979796092, + "epoch": 0.7921781879102469, + "grad_norm": 0.6991833448410034, + "learning_rate": 1.4383596013163254e-05, + "loss": 1.5431, + "mean_token_accuracy": 0.641920750339826, + "num_tokens": 1210486487.0, + "step": 7211 + }, + { + "entropy": 1.6564313073952992, + "epoch": 0.7922880448216199, + "grad_norm": 0.8407886028289795, + "learning_rate": 1.4382112293014767e-05, + "loss": 1.2964, + "mean_token_accuracy": 0.6637519697348276, + "num_tokens": 1210607227.0, + "step": 7212 + }, + { + "entropy": 1.72148593266805, + "epoch": 0.7923979017329927, + "grad_norm": 0.8114281892776489, + "learning_rate": 1.4380628465823954e-05, + "loss": 1.371, + "mean_token_accuracy": 0.6630014181137085, + "num_tokens": 1210763126.0, + "step": 7213 + }, + { + "entropy": 1.6811268826325734, + "epoch": 0.7925077586443657, + "grad_norm": 0.6577451825141907, + "learning_rate": 1.4379144531637773e-05, + "loss": 1.4008, + "mean_token_accuracy": 0.6475434551636378, + "num_tokens": 1210944851.0, + "step": 7214 + }, + { + "entropy": 1.6897524297237396, + "epoch": 0.7926176155557386, + "grad_norm": 0.6809960603713989, + "learning_rate": 1.4377660490503187e-05, + "loss": 1.3815, + "mean_token_accuracy": 0.665686676899592, + "num_tokens": 1211129884.0, + "step": 7215 + }, + { + "entropy": 1.7260896265506744, + "epoch": 0.7927274724671116, + "grad_norm": 0.6479694247245789, + "learning_rate": 1.437617634246717e-05, + "loss": 1.3552, + "mean_token_accuracy": 0.6604562699794769, + "num_tokens": 1211273138.0, + "step": 7216 + }, + { + "entropy": 1.704200655221939, + "epoch": 0.7928373293784845, + "grad_norm": 0.8002914190292358, + "learning_rate": 1.4374692087576694e-05, + "loss": 1.2478, + "mean_token_accuracy": 0.6780698845783869, + "num_tokens": 1211401771.0, + "step": 7217 + }, + { + "entropy": 1.7003964483737946, + "epoch": 0.7929471862898575, + "grad_norm": 0.6824570894241333, + "learning_rate": 1.4373207725878736e-05, + "loss": 1.417, + "mean_token_accuracy": 0.6604044139385223, + "num_tokens": 1211525125.0, + "step": 7218 + }, + { + "entropy": 1.756047526995341, + "epoch": 0.7930570432012304, + "grad_norm": 0.7336795926094055, + "learning_rate": 1.437172325742027e-05, + "loss": 1.4541, + "mean_token_accuracy": 0.6522673020760218, + "num_tokens": 1211688977.0, + "step": 7219 + }, + { + "entropy": 1.674008419116338, + "epoch": 0.7931669001126034, + "grad_norm": 0.5697746872901917, + "learning_rate": 1.4370238682248284e-05, + "loss": 1.4325, + "mean_token_accuracy": 0.6417450805505117, + "num_tokens": 1211915500.0, + "step": 7220 + }, + { + "entropy": 1.669522186120351, + "epoch": 0.7932767570239763, + "grad_norm": 0.7043665051460266, + "learning_rate": 1.4368754000409759e-05, + "loss": 1.2599, + "mean_token_accuracy": 0.6736998210350672, + "num_tokens": 1212019533.0, + "step": 7221 + }, + { + "entropy": 1.7614585657914479, + "epoch": 0.7933866139353493, + "grad_norm": 0.6837732791900635, + "learning_rate": 1.4367269211951688e-05, + "loss": 1.3755, + "mean_token_accuracy": 0.6557194739580154, + "num_tokens": 1212154651.0, + "step": 7222 + }, + { + "entropy": 1.7001692553361256, + "epoch": 0.7934964708467221, + "grad_norm": 0.7408974170684814, + "learning_rate": 1.436578431692107e-05, + "loss": 1.4553, + "mean_token_accuracy": 0.6416794806718826, + "num_tokens": 1212298701.0, + "step": 7223 + }, + { + "entropy": 1.7324201961358388, + "epoch": 0.793606327758095, + "grad_norm": 0.7774471640586853, + "learning_rate": 1.436429931536489e-05, + "loss": 1.4997, + "mean_token_accuracy": 0.6496329059203466, + "num_tokens": 1212477054.0, + "step": 7224 + }, + { + "entropy": 1.6642400324344635, + "epoch": 0.793716184669468, + "grad_norm": 0.5790720582008362, + "learning_rate": 1.4362814207330154e-05, + "loss": 1.372, + "mean_token_accuracy": 0.6617165555556616, + "num_tokens": 1212697010.0, + "step": 7225 + }, + { + "entropy": 1.662980963786443, + "epoch": 0.7938260415808409, + "grad_norm": 0.6963479518890381, + "learning_rate": 1.4361328992863863e-05, + "loss": 1.3915, + "mean_token_accuracy": 0.6623529940843582, + "num_tokens": 1212835136.0, + "step": 7226 + }, + { + "entropy": 1.697847972313563, + "epoch": 0.7939358984922139, + "grad_norm": 0.676906943321228, + "learning_rate": 1.4359843672013025e-05, + "loss": 1.2866, + "mean_token_accuracy": 0.669852097829183, + "num_tokens": 1212964012.0, + "step": 7227 + }, + { + "entropy": 1.7281867067019145, + "epoch": 0.7940457554035868, + "grad_norm": 0.7022289633750916, + "learning_rate": 1.4358358244824646e-05, + "loss": 1.356, + "mean_token_accuracy": 0.6543090840180715, + "num_tokens": 1213170176.0, + "step": 7228 + }, + { + "entropy": 1.726097176472346, + "epoch": 0.7941556123149598, + "grad_norm": 0.799540638923645, + "learning_rate": 1.4356872711345746e-05, + "loss": 1.5969, + "mean_token_accuracy": 0.6627245545387268, + "num_tokens": 1213333780.0, + "step": 7229 + }, + { + "entropy": 1.6588218410809834, + "epoch": 0.7942654692263327, + "grad_norm": 0.7514909505844116, + "learning_rate": 1.4355387071623335e-05, + "loss": 1.4477, + "mean_token_accuracy": 0.6484199364980062, + "num_tokens": 1213525486.0, + "step": 7230 + }, + { + "entropy": 1.7335455020268757, + "epoch": 0.7943753261377057, + "grad_norm": 0.6512316465377808, + "learning_rate": 1.4353901325704439e-05, + "loss": 1.3723, + "mean_token_accuracy": 0.6515757242838541, + "num_tokens": 1213713590.0, + "step": 7231 + }, + { + "entropy": 1.7147459487120311, + "epoch": 0.7944851830490786, + "grad_norm": 0.7964367270469666, + "learning_rate": 1.4352415473636071e-05, + "loss": 1.2251, + "mean_token_accuracy": 0.6848846276601156, + "num_tokens": 1213837877.0, + "step": 7232 + }, + { + "entropy": 1.680985818306605, + "epoch": 0.7945950399604516, + "grad_norm": 1.0966331958770752, + "learning_rate": 1.4350929515465269e-05, + "loss": 1.447, + "mean_token_accuracy": 0.6554000427325567, + "num_tokens": 1213994774.0, + "step": 7233 + }, + { + "entropy": 1.7075146635373433, + "epoch": 0.7947048968718244, + "grad_norm": 0.6517575979232788, + "learning_rate": 1.4349443451239052e-05, + "loss": 1.3756, + "mean_token_accuracy": 0.6637918055057526, + "num_tokens": 1214150146.0, + "step": 7234 + }, + { + "entropy": 1.7310162385304768, + "epoch": 0.7948147537831974, + "grad_norm": 0.6323092579841614, + "learning_rate": 1.4347957281004466e-05, + "loss": 1.5208, + "mean_token_accuracy": 0.6204556177059809, + "num_tokens": 1214404039.0, + "step": 7235 + }, + { + "entropy": 1.722548524538676, + "epoch": 0.7949246106945703, + "grad_norm": 0.7502648234367371, + "learning_rate": 1.4346471004808536e-05, + "loss": 1.2681, + "mean_token_accuracy": 0.6832453906536102, + "num_tokens": 1214523130.0, + "step": 7236 + }, + { + "entropy": 1.7112720509370167, + "epoch": 0.7950344676059432, + "grad_norm": 0.71775221824646, + "learning_rate": 1.4344984622698308e-05, + "loss": 1.2868, + "mean_token_accuracy": 0.6676936894655228, + "num_tokens": 1214672632.0, + "step": 7237 + }, + { + "entropy": 1.6758712430795033, + "epoch": 0.7951443245173162, + "grad_norm": 0.7727818489074707, + "learning_rate": 1.4343498134720823e-05, + "loss": 1.3304, + "mean_token_accuracy": 0.6693208316961924, + "num_tokens": 1214805275.0, + "step": 7238 + }, + { + "entropy": 1.7423097888628643, + "epoch": 0.7952541814286891, + "grad_norm": 0.6385967135429382, + "learning_rate": 1.434201154092313e-05, + "loss": 1.5397, + "mean_token_accuracy": 0.6438859502474467, + "num_tokens": 1215023577.0, + "step": 7239 + }, + { + "entropy": 1.7130355834960938, + "epoch": 0.7953640383400621, + "grad_norm": 0.6498627662658691, + "learning_rate": 1.4340524841352278e-05, + "loss": 1.343, + "mean_token_accuracy": 0.6625998119513193, + "num_tokens": 1215162413.0, + "step": 7240 + }, + { + "entropy": 1.6157074769337971, + "epoch": 0.795473895251435, + "grad_norm": 0.669402003288269, + "learning_rate": 1.433903803605532e-05, + "loss": 1.298, + "mean_token_accuracy": 0.6726977676153183, + "num_tokens": 1215324282.0, + "step": 7241 + }, + { + "entropy": 1.679003765185674, + "epoch": 0.795583752162808, + "grad_norm": 0.6385429501533508, + "learning_rate": 1.4337551125079315e-05, + "loss": 1.5047, + "mean_token_accuracy": 0.6625163654486338, + "num_tokens": 1215524603.0, + "step": 7242 + }, + { + "entropy": 1.7556110223134358, + "epoch": 0.7956936090741809, + "grad_norm": 0.7021380662918091, + "learning_rate": 1.4336064108471315e-05, + "loss": 1.3767, + "mean_token_accuracy": 0.6701732029517492, + "num_tokens": 1215652188.0, + "step": 7243 + }, + { + "entropy": 1.7281469702720642, + "epoch": 0.7958034659855538, + "grad_norm": 0.6799027323722839, + "learning_rate": 1.4334576986278392e-05, + "loss": 1.431, + "mean_token_accuracy": 0.6460000276565552, + "num_tokens": 1215885275.0, + "step": 7244 + }, + { + "entropy": 1.7195940514405568, + "epoch": 0.7959133228969267, + "grad_norm": 0.720520555973053, + "learning_rate": 1.4333089758547611e-05, + "loss": 1.3475, + "mean_token_accuracy": 0.6720231225093206, + "num_tokens": 1216073716.0, + "step": 7245 + }, + { + "entropy": 1.7394606570402782, + "epoch": 0.7960231798082997, + "grad_norm": 0.7158997058868408, + "learning_rate": 1.4331602425326038e-05, + "loss": 1.5034, + "mean_token_accuracy": 0.6467889149983724, + "num_tokens": 1216232381.0, + "step": 7246 + }, + { + "entropy": 1.6945938964684804, + "epoch": 0.7961330367196726, + "grad_norm": 0.5890840291976929, + "learning_rate": 1.4330114986660755e-05, + "loss": 1.5163, + "mean_token_accuracy": 0.6420510311921438, + "num_tokens": 1216438001.0, + "step": 7247 + }, + { + "entropy": 1.7162805596987407, + "epoch": 0.7962428936310456, + "grad_norm": 0.7011001110076904, + "learning_rate": 1.4328627442598827e-05, + "loss": 1.4178, + "mean_token_accuracy": 0.6583675543467203, + "num_tokens": 1216579926.0, + "step": 7248 + }, + { + "entropy": 1.7298036813735962, + "epoch": 0.7963527505424185, + "grad_norm": 0.6295740008354187, + "learning_rate": 1.4327139793187343e-05, + "loss": 1.3503, + "mean_token_accuracy": 0.6574052224556605, + "num_tokens": 1216725414.0, + "step": 7249 + }, + { + "entropy": 1.6822136640548706, + "epoch": 0.7964626074537914, + "grad_norm": 0.8162563443183899, + "learning_rate": 1.4325652038473386e-05, + "loss": 1.516, + "mean_token_accuracy": 0.6439789732297262, + "num_tokens": 1216913110.0, + "step": 7250 + }, + { + "entropy": 1.68292702237765, + "epoch": 0.7965724643651644, + "grad_norm": 0.6158664226531982, + "learning_rate": 1.432416417850404e-05, + "loss": 1.3995, + "mean_token_accuracy": 0.6543232848246893, + "num_tokens": 1217101697.0, + "step": 7251 + }, + { + "entropy": 1.7199612259864807, + "epoch": 0.7966823212765373, + "grad_norm": 0.6690497398376465, + "learning_rate": 1.4322676213326392e-05, + "loss": 1.3827, + "mean_token_accuracy": 0.6606669773658117, + "num_tokens": 1217278700.0, + "step": 7252 + }, + { + "entropy": 1.6890186369419098, + "epoch": 0.7967921781879103, + "grad_norm": 0.6910893321037292, + "learning_rate": 1.4321188142987545e-05, + "loss": 1.3698, + "mean_token_accuracy": 0.656900112827619, + "num_tokens": 1217443444.0, + "step": 7253 + }, + { + "entropy": 1.7530109186967213, + "epoch": 0.7969020350992831, + "grad_norm": 0.6086611747741699, + "learning_rate": 1.4319699967534584e-05, + "loss": 1.6023, + "mean_token_accuracy": 0.6309465765953064, + "num_tokens": 1217637360.0, + "step": 7254 + }, + { + "entropy": 1.7479777733484905, + "epoch": 0.7970118920106561, + "grad_norm": 0.8288069367408752, + "learning_rate": 1.4318211687014618e-05, + "loss": 1.5313, + "mean_token_accuracy": 0.6323782056570053, + "num_tokens": 1217803903.0, + "step": 7255 + }, + { + "entropy": 1.7118703424930573, + "epoch": 0.797121748922029, + "grad_norm": 0.6841630339622498, + "learning_rate": 1.4316723301474744e-05, + "loss": 1.3312, + "mean_token_accuracy": 0.6639546205600103, + "num_tokens": 1217950265.0, + "step": 7256 + }, + { + "entropy": 1.6801489094893138, + "epoch": 0.797231605833402, + "grad_norm": 0.785036027431488, + "learning_rate": 1.4315234810962077e-05, + "loss": 1.5764, + "mean_token_accuracy": 0.6401002655426661, + "num_tokens": 1218123645.0, + "step": 7257 + }, + { + "entropy": 1.7071903347969055, + "epoch": 0.7973414627447749, + "grad_norm": 0.6923706531524658, + "learning_rate": 1.431374621552372e-05, + "loss": 1.3916, + "mean_token_accuracy": 0.6654524803161621, + "num_tokens": 1218284414.0, + "step": 7258 + }, + { + "entropy": 1.6701487104098003, + "epoch": 0.7974513196561479, + "grad_norm": 0.6866686344146729, + "learning_rate": 1.4312257515206788e-05, + "loss": 1.4887, + "mean_token_accuracy": 0.6344787627458572, + "num_tokens": 1218489559.0, + "step": 7259 + }, + { + "entropy": 1.6827106575171153, + "epoch": 0.7975611765675208, + "grad_norm": 0.6273086667060852, + "learning_rate": 1.4310768710058398e-05, + "loss": 1.4382, + "mean_token_accuracy": 0.6519757409890493, + "num_tokens": 1218759113.0, + "step": 7260 + }, + { + "entropy": 1.703356256087621, + "epoch": 0.7976710334788938, + "grad_norm": 0.7796327471733093, + "learning_rate": 1.4309279800125673e-05, + "loss": 1.4105, + "mean_token_accuracy": 0.6624956379334132, + "num_tokens": 1218906762.0, + "step": 7261 + }, + { + "entropy": 1.661956379810969, + "epoch": 0.7977808903902667, + "grad_norm": 0.5531797409057617, + "learning_rate": 1.4307790785455729e-05, + "loss": 1.4883, + "mean_token_accuracy": 0.6515692820151647, + "num_tokens": 1219102172.0, + "step": 7262 + }, + { + "entropy": 1.7754519681135814, + "epoch": 0.7978907473016397, + "grad_norm": 0.7891526222229004, + "learning_rate": 1.4306301666095702e-05, + "loss": 1.4952, + "mean_token_accuracy": 0.644075925151507, + "num_tokens": 1219278956.0, + "step": 7263 + }, + { + "entropy": 1.6872022251288097, + "epoch": 0.7980006042130126, + "grad_norm": 0.6932452321052551, + "learning_rate": 1.4304812442092713e-05, + "loss": 1.1952, + "mean_token_accuracy": 0.6833833257357279, + "num_tokens": 1219417575.0, + "step": 7264 + }, + { + "entropy": 1.7105295658111572, + "epoch": 0.7981104611243854, + "grad_norm": 0.6524655818939209, + "learning_rate": 1.43033231134939e-05, + "loss": 1.3186, + "mean_token_accuracy": 0.6636832803487778, + "num_tokens": 1219558072.0, + "step": 7265 + }, + { + "entropy": 1.6755134363969166, + "epoch": 0.7982203180357584, + "grad_norm": 0.6954984664916992, + "learning_rate": 1.43018336803464e-05, + "loss": 1.3372, + "mean_token_accuracy": 0.6699787775675455, + "num_tokens": 1219718681.0, + "step": 7266 + }, + { + "entropy": 1.7518522143363953, + "epoch": 0.7983301749471313, + "grad_norm": 0.6694498658180237, + "learning_rate": 1.4300344142697353e-05, + "loss": 1.4487, + "mean_token_accuracy": 0.6395488778750101, + "num_tokens": 1219883865.0, + "step": 7267 + }, + { + "entropy": 1.6969805459181468, + "epoch": 0.7984400318585043, + "grad_norm": 0.6084674000740051, + "learning_rate": 1.4298854500593897e-05, + "loss": 1.3776, + "mean_token_accuracy": 0.6505701790253321, + "num_tokens": 1220093180.0, + "step": 7268 + }, + { + "entropy": 1.7364482978979747, + "epoch": 0.7985498887698772, + "grad_norm": 0.7209011912345886, + "learning_rate": 1.4297364754083187e-05, + "loss": 1.5423, + "mean_token_accuracy": 0.6337501257658005, + "num_tokens": 1220270122.0, + "step": 7269 + }, + { + "entropy": 1.7650366127490997, + "epoch": 0.7986597456812502, + "grad_norm": 0.7382558584213257, + "learning_rate": 1.4295874903212365e-05, + "loss": 1.4628, + "mean_token_accuracy": 0.6457639882961909, + "num_tokens": 1220450090.0, + "step": 7270 + }, + { + "entropy": 1.7180620034535725, + "epoch": 0.7987696025926231, + "grad_norm": 0.7159477472305298, + "learning_rate": 1.4294384948028592e-05, + "loss": 1.4328, + "mean_token_accuracy": 0.6519068032503128, + "num_tokens": 1220627437.0, + "step": 7271 + }, + { + "entropy": 1.7323042750358582, + "epoch": 0.7988794595039961, + "grad_norm": 0.6927447319030762, + "learning_rate": 1.4292894888579014e-05, + "loss": 1.3658, + "mean_token_accuracy": 0.6517662604649862, + "num_tokens": 1220748500.0, + "step": 7272 + }, + { + "entropy": 1.7364622453848522, + "epoch": 0.798989316415369, + "grad_norm": 0.6226632595062256, + "learning_rate": 1.4291404724910803e-05, + "loss": 1.4278, + "mean_token_accuracy": 0.6461608906586965, + "num_tokens": 1220930228.0, + "step": 7273 + }, + { + "entropy": 1.7141484121481578, + "epoch": 0.799099173326742, + "grad_norm": 0.6916367411613464, + "learning_rate": 1.428991445707111e-05, + "loss": 1.2965, + "mean_token_accuracy": 0.669938306013743, + "num_tokens": 1221053000.0, + "step": 7274 + }, + { + "entropy": 1.7483568787574768, + "epoch": 0.7992090302381148, + "grad_norm": 0.7176331877708435, + "learning_rate": 1.428842408510711e-05, + "loss": 1.2919, + "mean_token_accuracy": 0.6726898650328318, + "num_tokens": 1221164118.0, + "step": 7275 + }, + { + "entropy": 1.7342201670010884, + "epoch": 0.7993188871494878, + "grad_norm": 0.6409656405448914, + "learning_rate": 1.4286933609065967e-05, + "loss": 1.4841, + "mean_token_accuracy": 0.6403429557879766, + "num_tokens": 1221324965.0, + "step": 7276 + }, + { + "entropy": 1.6586043238639832, + "epoch": 0.7994287440608607, + "grad_norm": 0.8449923396110535, + "learning_rate": 1.4285443028994859e-05, + "loss": 1.2365, + "mean_token_accuracy": 0.6813416828711828, + "num_tokens": 1221465842.0, + "step": 7277 + }, + { + "entropy": 1.696493277947108, + "epoch": 0.7995386009722336, + "grad_norm": 0.7281336784362793, + "learning_rate": 1.4283952344940957e-05, + "loss": 1.247, + "mean_token_accuracy": 0.681158721446991, + "num_tokens": 1221591240.0, + "step": 7278 + }, + { + "entropy": 1.7007083594799042, + "epoch": 0.7996484578836066, + "grad_norm": 0.6635676622390747, + "learning_rate": 1.4282461556951445e-05, + "loss": 1.3977, + "mean_token_accuracy": 0.6511793335278829, + "num_tokens": 1221772023.0, + "step": 7279 + }, + { + "entropy": 1.7213706970214844, + "epoch": 0.7997583147949795, + "grad_norm": 0.7881171107292175, + "learning_rate": 1.4280970665073503e-05, + "loss": 1.1846, + "mean_token_accuracy": 0.6822384099165598, + "num_tokens": 1221870566.0, + "step": 7280 + }, + { + "entropy": 1.658156931400299, + "epoch": 0.7998681717063525, + "grad_norm": 0.7354137897491455, + "learning_rate": 1.4279479669354319e-05, + "loss": 1.362, + "mean_token_accuracy": 0.6667267928520838, + "num_tokens": 1222061004.0, + "step": 7281 + }, + { + "entropy": 1.7199231286843617, + "epoch": 0.7999780286177254, + "grad_norm": 0.6585229635238647, + "learning_rate": 1.4277988569841082e-05, + "loss": 1.4833, + "mean_token_accuracy": 0.6446650822957357, + "num_tokens": 1222247129.0, + "step": 7282 + }, + { + "entropy": 1.6953574518362682, + "epoch": 0.8000878855290984, + "grad_norm": 0.581791341304779, + "learning_rate": 1.4276497366580982e-05, + "loss": 1.4653, + "mean_token_accuracy": 0.6526039590438207, + "num_tokens": 1222436035.0, + "step": 7283 + }, + { + "entropy": 1.7320642570654552, + "epoch": 0.8001977424404713, + "grad_norm": 0.7845410704612732, + "learning_rate": 1.4275006059621217e-05, + "loss": 1.5006, + "mean_token_accuracy": 0.6406663705905279, + "num_tokens": 1222625280.0, + "step": 7284 + }, + { + "entropy": 1.7656051715215046, + "epoch": 0.8003075993518443, + "grad_norm": 0.8226374983787537, + "learning_rate": 1.4273514649008989e-05, + "loss": 1.3163, + "mean_token_accuracy": 0.6708792199691137, + "num_tokens": 1222743645.0, + "step": 7285 + }, + { + "entropy": 1.764061023791631, + "epoch": 0.8004174562632171, + "grad_norm": 0.6624506115913391, + "learning_rate": 1.4272023134791493e-05, + "loss": 1.4877, + "mean_token_accuracy": 0.6504695763190588, + "num_tokens": 1222947630.0, + "step": 7286 + }, + { + "entropy": 1.738725354274114, + "epoch": 0.8005273131745901, + "grad_norm": 0.827363133430481, + "learning_rate": 1.4270531517015943e-05, + "loss": 1.4878, + "mean_token_accuracy": 0.6427379300196966, + "num_tokens": 1223111073.0, + "step": 7287 + }, + { + "entropy": 1.7238669991493225, + "epoch": 0.800637170085963, + "grad_norm": 0.665775716304779, + "learning_rate": 1.426903979572954e-05, + "loss": 1.4706, + "mean_token_accuracy": 0.6543847819169363, + "num_tokens": 1223269067.0, + "step": 7288 + }, + { + "entropy": 1.659266859292984, + "epoch": 0.800747026997336, + "grad_norm": 0.6303220391273499, + "learning_rate": 1.4267547970979502e-05, + "loss": 1.2609, + "mean_token_accuracy": 0.679823304216067, + "num_tokens": 1223420407.0, + "step": 7289 + }, + { + "entropy": 1.672978659470876, + "epoch": 0.8008568839087089, + "grad_norm": 0.5509341359138489, + "learning_rate": 1.4266056042813043e-05, + "loss": 1.3959, + "mean_token_accuracy": 0.6600791364908218, + "num_tokens": 1223639283.0, + "step": 7290 + }, + { + "entropy": 1.6871724128723145, + "epoch": 0.8009667408200818, + "grad_norm": 0.6795254349708557, + "learning_rate": 1.4264564011277384e-05, + "loss": 1.3352, + "mean_token_accuracy": 0.6592608243227005, + "num_tokens": 1223817100.0, + "step": 7291 + }, + { + "entropy": 1.7118334273497264, + "epoch": 0.8010765977314548, + "grad_norm": 0.7024778723716736, + "learning_rate": 1.4263071876419744e-05, + "loss": 1.6732, + "mean_token_accuracy": 0.6494659408926964, + "num_tokens": 1224030111.0, + "step": 7292 + }, + { + "entropy": 1.6586161156495411, + "epoch": 0.8011864546428277, + "grad_norm": 0.7682591676712036, + "learning_rate": 1.4261579638287351e-05, + "loss": 1.201, + "mean_token_accuracy": 0.6980761736631393, + "num_tokens": 1224163835.0, + "step": 7293 + }, + { + "entropy": 1.7308415472507477, + "epoch": 0.8012963115542007, + "grad_norm": 0.7771059274673462, + "learning_rate": 1.4260087296927427e-05, + "loss": 1.4017, + "mean_token_accuracy": 0.6600347012281418, + "num_tokens": 1224300825.0, + "step": 7294 + }, + { + "entropy": 1.695349782705307, + "epoch": 0.8014061684655736, + "grad_norm": 13.370857238769531, + "learning_rate": 1.4258594852387213e-05, + "loss": 1.5951, + "mean_token_accuracy": 0.6378213365872701, + "num_tokens": 1224469121.0, + "step": 7295 + }, + { + "entropy": 1.7092136939366658, + "epoch": 0.8015160253769466, + "grad_norm": 0.7703883647918701, + "learning_rate": 1.425710230471394e-05, + "loss": 1.3407, + "mean_token_accuracy": 0.6662224382162094, + "num_tokens": 1224620305.0, + "step": 7296 + }, + { + "entropy": 1.7132171392440796, + "epoch": 0.8016258822883194, + "grad_norm": 0.6171491146087646, + "learning_rate": 1.4255609653954847e-05, + "loss": 1.3658, + "mean_token_accuracy": 0.6527489374081293, + "num_tokens": 1224785259.0, + "step": 7297 + }, + { + "entropy": 1.6952051520347595, + "epoch": 0.8017357391996924, + "grad_norm": 0.7840876579284668, + "learning_rate": 1.4254116900157173e-05, + "loss": 1.4597, + "mean_token_accuracy": 0.6597426682710648, + "num_tokens": 1224930212.0, + "step": 7298 + }, + { + "entropy": 1.785047431786855, + "epoch": 0.8018455961110653, + "grad_norm": 1.3836613893508911, + "learning_rate": 1.4252624043368169e-05, + "loss": 1.4614, + "mean_token_accuracy": 0.6522940744956335, + "num_tokens": 1225052106.0, + "step": 7299 + }, + { + "entropy": 1.6921504139900208, + "epoch": 0.8019554530224383, + "grad_norm": 0.6270791888237, + "learning_rate": 1.4251131083635079e-05, + "loss": 1.4451, + "mean_token_accuracy": 0.6468443423509598, + "num_tokens": 1225221791.0, + "step": 7300 + }, + { + "entropy": 1.745924452940623, + "epoch": 0.8020653099338112, + "grad_norm": 0.8254175186157227, + "learning_rate": 1.4249638021005154e-05, + "loss": 1.4149, + "mean_token_accuracy": 0.6477925777435303, + "num_tokens": 1225366690.0, + "step": 7301 + }, + { + "entropy": 1.6810812751452129, + "epoch": 0.8021751668451842, + "grad_norm": 0.6022759675979614, + "learning_rate": 1.4248144855525649e-05, + "loss": 1.2906, + "mean_token_accuracy": 0.683276375134786, + "num_tokens": 1225516260.0, + "step": 7302 + }, + { + "entropy": 1.647108296553294, + "epoch": 0.8022850237565571, + "grad_norm": 0.7280488610267639, + "learning_rate": 1.4246651587243825e-05, + "loss": 1.3632, + "mean_token_accuracy": 0.6681808729966482, + "num_tokens": 1225722689.0, + "step": 7303 + }, + { + "entropy": 1.6735007365544636, + "epoch": 0.80239488066793, + "grad_norm": 0.6589364409446716, + "learning_rate": 1.424515821620694e-05, + "loss": 1.3376, + "mean_token_accuracy": 0.6602647950251898, + "num_tokens": 1225891479.0, + "step": 7304 + }, + { + "entropy": 1.7428459525108337, + "epoch": 0.802504737579303, + "grad_norm": 0.7700157165527344, + "learning_rate": 1.424366474246226e-05, + "loss": 1.3954, + "mean_token_accuracy": 0.6708898593982061, + "num_tokens": 1226034151.0, + "step": 7305 + }, + { + "entropy": 1.7126306494077046, + "epoch": 0.8026145944906758, + "grad_norm": 0.7041934728622437, + "learning_rate": 1.4242171166057053e-05, + "loss": 1.551, + "mean_token_accuracy": 0.6539329538742701, + "num_tokens": 1226215865.0, + "step": 7306 + }, + { + "entropy": 1.6650786697864532, + "epoch": 0.8027244514020488, + "grad_norm": 0.6044019460678101, + "learning_rate": 1.4240677487038593e-05, + "loss": 1.3191, + "mean_token_accuracy": 0.6622636218865713, + "num_tokens": 1226372708.0, + "step": 7307 + }, + { + "entropy": 1.723981390396754, + "epoch": 0.8028343083134217, + "grad_norm": 0.8162484765052795, + "learning_rate": 1.4239183705454142e-05, + "loss": 1.4615, + "mean_token_accuracy": 0.6413914859294891, + "num_tokens": 1226535459.0, + "step": 7308 + }, + { + "entropy": 1.7149596611658733, + "epoch": 0.8029441652247947, + "grad_norm": 0.73653644323349, + "learning_rate": 1.4237689821350992e-05, + "loss": 1.3971, + "mean_token_accuracy": 0.6685678660869598, + "num_tokens": 1226686426.0, + "step": 7309 + }, + { + "entropy": 1.6466976702213287, + "epoch": 0.8030540221361676, + "grad_norm": 0.743812084197998, + "learning_rate": 1.4236195834776418e-05, + "loss": 1.3838, + "mean_token_accuracy": 0.6726710299650828, + "num_tokens": 1226842592.0, + "step": 7310 + }, + { + "entropy": 1.8227874239285786, + "epoch": 0.8031638790475406, + "grad_norm": 0.9089652299880981, + "learning_rate": 1.4234701745777704e-05, + "loss": 1.6301, + "mean_token_accuracy": 0.6158707390228907, + "num_tokens": 1227021023.0, + "step": 7311 + }, + { + "entropy": 1.6785000363985698, + "epoch": 0.8032737359589135, + "grad_norm": 0.6702415347099304, + "learning_rate": 1.4233207554402138e-05, + "loss": 1.4478, + "mean_token_accuracy": 0.6375938355922699, + "num_tokens": 1227242256.0, + "step": 7312 + }, + { + "entropy": 1.719922512769699, + "epoch": 0.8033835928702865, + "grad_norm": 0.8157113790512085, + "learning_rate": 1.423171326069701e-05, + "loss": 1.4502, + "mean_token_accuracy": 0.6464798400799433, + "num_tokens": 1227418743.0, + "step": 7313 + }, + { + "entropy": 1.6604024668534596, + "epoch": 0.8034934497816594, + "grad_norm": 0.7159737348556519, + "learning_rate": 1.4230218864709612e-05, + "loss": 1.4431, + "mean_token_accuracy": 0.6605499237775803, + "num_tokens": 1227635822.0, + "step": 7314 + }, + { + "entropy": 1.706661621729533, + "epoch": 0.8036033066930324, + "grad_norm": 0.6185526251792908, + "learning_rate": 1.4228724366487242e-05, + "loss": 1.3767, + "mean_token_accuracy": 0.67093226313591, + "num_tokens": 1227818074.0, + "step": 7315 + }, + { + "entropy": 1.6483195424079895, + "epoch": 0.8037131636044053, + "grad_norm": 0.6785904765129089, + "learning_rate": 1.4227229766077202e-05, + "loss": 1.3335, + "mean_token_accuracy": 0.67606753607591, + "num_tokens": 1227982171.0, + "step": 7316 + }, + { + "entropy": 1.6934054692586262, + "epoch": 0.8038230205157783, + "grad_norm": 0.6464650630950928, + "learning_rate": 1.4225735063526792e-05, + "loss": 1.3717, + "mean_token_accuracy": 0.6636403550704321, + "num_tokens": 1228125554.0, + "step": 7317 + }, + { + "entropy": 1.636114815870921, + "epoch": 0.8039328774271511, + "grad_norm": 0.6265885233879089, + "learning_rate": 1.4224240258883324e-05, + "loss": 1.2675, + "mean_token_accuracy": 0.6773168394962946, + "num_tokens": 1228305404.0, + "step": 7318 + }, + { + "entropy": 1.73605677485466, + "epoch": 0.804042734338524, + "grad_norm": 0.659631073474884, + "learning_rate": 1.4222745352194102e-05, + "loss": 1.4094, + "mean_token_accuracy": 0.6566009968519211, + "num_tokens": 1228493229.0, + "step": 7319 + }, + { + "entropy": 1.6943889657656352, + "epoch": 0.804152591249897, + "grad_norm": 0.7117233276367188, + "learning_rate": 1.4221250343506445e-05, + "loss": 1.1594, + "mean_token_accuracy": 0.6870453854401907, + "num_tokens": 1228603511.0, + "step": 7320 + }, + { + "entropy": 1.6649406949679058, + "epoch": 0.8042624481612699, + "grad_norm": 0.7140738368034363, + "learning_rate": 1.4219755232867662e-05, + "loss": 1.2535, + "mean_token_accuracy": 0.6795340776443481, + "num_tokens": 1228720218.0, + "step": 7321 + }, + { + "entropy": 1.7426091035207112, + "epoch": 0.8043723050726429, + "grad_norm": 0.6925419569015503, + "learning_rate": 1.4218260020325079e-05, + "loss": 1.3582, + "mean_token_accuracy": 0.655068372686704, + "num_tokens": 1228895991.0, + "step": 7322 + }, + { + "entropy": 1.7241126894950867, + "epoch": 0.8044821619840158, + "grad_norm": 0.6894976496696472, + "learning_rate": 1.4216764705926019e-05, + "loss": 1.3149, + "mean_token_accuracy": 0.6623408049345016, + "num_tokens": 1229066649.0, + "step": 7323 + }, + { + "entropy": 1.725894719362259, + "epoch": 0.8045920188953888, + "grad_norm": 0.669735848903656, + "learning_rate": 1.4215269289717802e-05, + "loss": 1.3299, + "mean_token_accuracy": 0.6665193190177282, + "num_tokens": 1229214694.0, + "step": 7324 + }, + { + "entropy": 1.6697326302528381, + "epoch": 0.8047018758067617, + "grad_norm": 0.9352332353591919, + "learning_rate": 1.4213773771747763e-05, + "loss": 1.3784, + "mean_token_accuracy": 0.6521992981433868, + "num_tokens": 1229407333.0, + "step": 7325 + }, + { + "entropy": 1.6702334781487782, + "epoch": 0.8048117327181347, + "grad_norm": 0.7152570486068726, + "learning_rate": 1.4212278152063228e-05, + "loss": 1.3232, + "mean_token_accuracy": 0.6703629096349081, + "num_tokens": 1229536389.0, + "step": 7326 + }, + { + "entropy": 1.714383860429128, + "epoch": 0.8049215896295076, + "grad_norm": 0.7807464599609375, + "learning_rate": 1.4210782430711541e-05, + "loss": 1.2982, + "mean_token_accuracy": 0.6688077251116434, + "num_tokens": 1229662562.0, + "step": 7327 + }, + { + "entropy": 1.699068009853363, + "epoch": 0.8050314465408805, + "grad_norm": 0.6622336506843567, + "learning_rate": 1.4209286607740036e-05, + "loss": 1.3098, + "mean_token_accuracy": 0.6640769392251968, + "num_tokens": 1229832222.0, + "step": 7328 + }, + { + "entropy": 1.7409149905045826, + "epoch": 0.8051413034522534, + "grad_norm": 0.6264408826828003, + "learning_rate": 1.4207790683196056e-05, + "loss": 1.3233, + "mean_token_accuracy": 0.6729765981435776, + "num_tokens": 1229979504.0, + "step": 7329 + }, + { + "entropy": 1.696464866399765, + "epoch": 0.8052511603636264, + "grad_norm": 0.6128476858139038, + "learning_rate": 1.4206294657126944e-05, + "loss": 1.3835, + "mean_token_accuracy": 0.6473323603471121, + "num_tokens": 1230154418.0, + "step": 7330 + }, + { + "entropy": 1.7256540358066559, + "epoch": 0.8053610172749993, + "grad_norm": 0.6184810400009155, + "learning_rate": 1.4204798529580055e-05, + "loss": 1.424, + "mean_token_accuracy": 0.6607218682765961, + "num_tokens": 1230355844.0, + "step": 7331 + }, + { + "entropy": 1.6991771360238392, + "epoch": 0.8054708741863722, + "grad_norm": 0.6512514352798462, + "learning_rate": 1.4203302300602735e-05, + "loss": 1.3036, + "mean_token_accuracy": 0.6798295130332311, + "num_tokens": 1230493084.0, + "step": 7332 + }, + { + "entropy": 1.6824211478233337, + "epoch": 0.8055807310977452, + "grad_norm": 0.6564586758613586, + "learning_rate": 1.420180597024234e-05, + "loss": 1.3671, + "mean_token_accuracy": 0.6743087867895762, + "num_tokens": 1230629324.0, + "step": 7333 + }, + { + "entropy": 1.691230148077011, + "epoch": 0.8056905880091181, + "grad_norm": 0.6371413469314575, + "learning_rate": 1.420030953854623e-05, + "loss": 1.3799, + "mean_token_accuracy": 0.649422844250997, + "num_tokens": 1230812422.0, + "step": 7334 + }, + { + "entropy": 1.6784875591595967, + "epoch": 0.8058004449204911, + "grad_norm": 0.8957354426383972, + "learning_rate": 1.4198813005561765e-05, + "loss": 1.4552, + "mean_token_accuracy": 0.6490340381860733, + "num_tokens": 1231007507.0, + "step": 7335 + }, + { + "entropy": 1.7226787110169728, + "epoch": 0.805910301831864, + "grad_norm": 0.7002930641174316, + "learning_rate": 1.4197316371336307e-05, + "loss": 1.3037, + "mean_token_accuracy": 0.6610642572244009, + "num_tokens": 1231148630.0, + "step": 7336 + }, + { + "entropy": 1.6683934728304546, + "epoch": 0.806020158743237, + "grad_norm": 0.6079908013343811, + "learning_rate": 1.419581963591723e-05, + "loss": 1.3604, + "mean_token_accuracy": 0.6693431635697683, + "num_tokens": 1231354998.0, + "step": 7337 + }, + { + "entropy": 1.7245097557703655, + "epoch": 0.8061300156546098, + "grad_norm": 0.5906463265419006, + "learning_rate": 1.41943227993519e-05, + "loss": 1.4765, + "mean_token_accuracy": 0.6449787418047587, + "num_tokens": 1231548058.0, + "step": 7338 + }, + { + "entropy": 1.757252832253774, + "epoch": 0.8062398725659828, + "grad_norm": 0.6691707968711853, + "learning_rate": 1.4192825861687694e-05, + "loss": 1.2934, + "mean_token_accuracy": 0.6613676349322001, + "num_tokens": 1231674981.0, + "step": 7339 + }, + { + "entropy": 1.7177879710992177, + "epoch": 0.8063497294773557, + "grad_norm": 0.7445343136787415, + "learning_rate": 1.4191328822971988e-05, + "loss": 1.2699, + "mean_token_accuracy": 0.6711633503437042, + "num_tokens": 1231810236.0, + "step": 7340 + }, + { + "entropy": 1.7406736811002095, + "epoch": 0.8064595863887287, + "grad_norm": 0.5835939645767212, + "learning_rate": 1.4189831683252162e-05, + "loss": 1.3592, + "mean_token_accuracy": 0.6513313700755438, + "num_tokens": 1231967375.0, + "step": 7341 + }, + { + "entropy": 1.714879075686137, + "epoch": 0.8065694433001016, + "grad_norm": 0.723407506942749, + "learning_rate": 1.41883344425756e-05, + "loss": 1.4315, + "mean_token_accuracy": 0.6485263953606287, + "num_tokens": 1232124514.0, + "step": 7342 + }, + { + "entropy": 1.7313550611337025, + "epoch": 0.8066793002114746, + "grad_norm": 0.671328067779541, + "learning_rate": 1.4186837100989693e-05, + "loss": 1.4934, + "mean_token_accuracy": 0.6444969574610392, + "num_tokens": 1232311836.0, + "step": 7343 + }, + { + "entropy": 1.7535496056079865, + "epoch": 0.8067891571228475, + "grad_norm": 0.7109101414680481, + "learning_rate": 1.4185339658541824e-05, + "loss": 1.5029, + "mean_token_accuracy": 0.6288647800683975, + "num_tokens": 1232511838.0, + "step": 7344 + }, + { + "entropy": 1.6992291112740834, + "epoch": 0.8068990140342204, + "grad_norm": 0.6878111958503723, + "learning_rate": 1.4183842115279391e-05, + "loss": 1.3329, + "mean_token_accuracy": 0.6704870462417603, + "num_tokens": 1232683222.0, + "step": 7345 + }, + { + "entropy": 1.664991666873296, + "epoch": 0.8070088709455934, + "grad_norm": 0.6333096623420715, + "learning_rate": 1.4182344471249789e-05, + "loss": 1.4144, + "mean_token_accuracy": 0.6521175851424535, + "num_tokens": 1232883037.0, + "step": 7346 + }, + { + "entropy": 1.7921419044335682, + "epoch": 0.8071187278569663, + "grad_norm": 0.7302199602127075, + "learning_rate": 1.4180846726500422e-05, + "loss": 1.417, + "mean_token_accuracy": 0.6555136690537134, + "num_tokens": 1233037865.0, + "step": 7347 + }, + { + "entropy": 1.7335894107818604, + "epoch": 0.8072285847683393, + "grad_norm": 0.6250348687171936, + "learning_rate": 1.4179348881078687e-05, + "loss": 1.4014, + "mean_token_accuracy": 0.6503031303485235, + "num_tokens": 1233208316.0, + "step": 7348 + }, + { + "entropy": 1.7149858474731445, + "epoch": 0.8073384416797121, + "grad_norm": 0.6429965496063232, + "learning_rate": 1.4177850935031991e-05, + "loss": 1.4741, + "mean_token_accuracy": 0.6448526183764139, + "num_tokens": 1233386088.0, + "step": 7349 + }, + { + "entropy": 1.7311444580554962, + "epoch": 0.8074482985910851, + "grad_norm": 0.6727426052093506, + "learning_rate": 1.4176352888407748e-05, + "loss": 1.4227, + "mean_token_accuracy": 0.6663500418265661, + "num_tokens": 1233510643.0, + "step": 7350 + }, + { + "entropy": 1.7784233887990315, + "epoch": 0.807558155502458, + "grad_norm": 0.7476586699485779, + "learning_rate": 1.4174854741253368e-05, + "loss": 1.3135, + "mean_token_accuracy": 0.6639601538578669, + "num_tokens": 1233678361.0, + "step": 7351 + }, + { + "entropy": 1.6666496098041534, + "epoch": 0.807668012413831, + "grad_norm": 0.7939883470535278, + "learning_rate": 1.417335649361626e-05, + "loss": 1.3468, + "mean_token_accuracy": 0.6685838301976522, + "num_tokens": 1233827347.0, + "step": 7352 + }, + { + "entropy": 1.6903888583183289, + "epoch": 0.8077778693252039, + "grad_norm": 0.6927181482315063, + "learning_rate": 1.4171858145543856e-05, + "loss": 1.4467, + "mean_token_accuracy": 0.6569238354762396, + "num_tokens": 1233951478.0, + "step": 7353 + }, + { + "entropy": 1.6132766505082448, + "epoch": 0.8078877262365769, + "grad_norm": 0.6065834164619446, + "learning_rate": 1.4170359697083564e-05, + "loss": 1.378, + "mean_token_accuracy": 0.6582773874203364, + "num_tokens": 1234135918.0, + "step": 7354 + }, + { + "entropy": 1.667845626672109, + "epoch": 0.8079975831479498, + "grad_norm": 0.5912481546401978, + "learning_rate": 1.416886114828282e-05, + "loss": 1.3259, + "mean_token_accuracy": 0.6613271286090215, + "num_tokens": 1234318765.0, + "step": 7355 + }, + { + "entropy": 1.7094935675462086, + "epoch": 0.8081074400593228, + "grad_norm": 0.6756147742271423, + "learning_rate": 1.416736249918905e-05, + "loss": 1.3621, + "mean_token_accuracy": 0.6520873159170151, + "num_tokens": 1234459417.0, + "step": 7356 + }, + { + "entropy": 1.7112588385740917, + "epoch": 0.8082172969706957, + "grad_norm": 0.703292965888977, + "learning_rate": 1.4165863749849684e-05, + "loss": 1.4601, + "mean_token_accuracy": 0.6652803619702657, + "num_tokens": 1234622325.0, + "step": 7357 + }, + { + "entropy": 1.733866771062215, + "epoch": 0.8083271538820687, + "grad_norm": 0.6714462637901306, + "learning_rate": 1.4164364900312152e-05, + "loss": 1.429, + "mean_token_accuracy": 0.6481720258792242, + "num_tokens": 1234820943.0, + "step": 7358 + }, + { + "entropy": 1.6990663806597393, + "epoch": 0.8084370107934415, + "grad_norm": 0.7425878643989563, + "learning_rate": 1.4162865950623903e-05, + "loss": 1.4332, + "mean_token_accuracy": 0.6520752906799316, + "num_tokens": 1234974817.0, + "step": 7359 + }, + { + "entropy": 1.6746714909871419, + "epoch": 0.8085468677048144, + "grad_norm": 0.7591057419776917, + "learning_rate": 1.416136690083237e-05, + "loss": 1.4892, + "mean_token_accuracy": 0.6340252707401911, + "num_tokens": 1235191091.0, + "step": 7360 + }, + { + "entropy": 1.7674211462338765, + "epoch": 0.8086567246161874, + "grad_norm": 0.7490597367286682, + "learning_rate": 1.4159867750984998e-05, + "loss": 1.523, + "mean_token_accuracy": 0.6239955872297287, + "num_tokens": 1235418947.0, + "step": 7361 + }, + { + "entropy": 1.758747826019923, + "epoch": 0.8087665815275603, + "grad_norm": 0.7145038843154907, + "learning_rate": 1.4158368501129234e-05, + "loss": 1.3235, + "mean_token_accuracy": 0.6654741416374842, + "num_tokens": 1235562850.0, + "step": 7362 + }, + { + "entropy": 1.6892236868540447, + "epoch": 0.8088764384389333, + "grad_norm": 0.6746348142623901, + "learning_rate": 1.4156869151312536e-05, + "loss": 1.3841, + "mean_token_accuracy": 0.6534732679526011, + "num_tokens": 1235704318.0, + "step": 7363 + }, + { + "entropy": 1.7318583031495411, + "epoch": 0.8089862953503062, + "grad_norm": 0.7424976825714111, + "learning_rate": 1.4155369701582344e-05, + "loss": 1.5392, + "mean_token_accuracy": 0.6394655803839365, + "num_tokens": 1235921680.0, + "step": 7364 + }, + { + "entropy": 1.7026897370815277, + "epoch": 0.8090961522616792, + "grad_norm": 0.6459017395973206, + "learning_rate": 1.4153870151986127e-05, + "loss": 1.4672, + "mean_token_accuracy": 0.6501971036195755, + "num_tokens": 1236149444.0, + "step": 7365 + }, + { + "entropy": 1.7260268131891887, + "epoch": 0.8092060091730521, + "grad_norm": 0.6344119906425476, + "learning_rate": 1.4152370502571343e-05, + "loss": 1.3406, + "mean_token_accuracy": 0.6541385352611542, + "num_tokens": 1236288713.0, + "step": 7366 + }, + { + "entropy": 1.6736855705579121, + "epoch": 0.8093158660844251, + "grad_norm": 0.7475732564926147, + "learning_rate": 1.415087075338545e-05, + "loss": 1.3069, + "mean_token_accuracy": 0.6776044766108195, + "num_tokens": 1236442000.0, + "step": 7367 + }, + { + "entropy": 1.6757760147253673, + "epoch": 0.809425722995798, + "grad_norm": 0.6088912487030029, + "learning_rate": 1.4149370904475916e-05, + "loss": 1.4567, + "mean_token_accuracy": 0.6608283271392187, + "num_tokens": 1236620361.0, + "step": 7368 + }, + { + "entropy": 1.635620504617691, + "epoch": 0.809535579907171, + "grad_norm": 1.5183919668197632, + "learning_rate": 1.4147870955890217e-05, + "loss": 1.3127, + "mean_token_accuracy": 0.6584126055240631, + "num_tokens": 1236884490.0, + "step": 7369 + }, + { + "entropy": 1.6411939958731334, + "epoch": 0.8096454368185438, + "grad_norm": 0.6386780142784119, + "learning_rate": 1.4146370907675816e-05, + "loss": 1.3192, + "mean_token_accuracy": 0.6725454529126486, + "num_tokens": 1237042264.0, + "step": 7370 + }, + { + "entropy": 1.7076645493507385, + "epoch": 0.8097552937299168, + "grad_norm": 0.7984034419059753, + "learning_rate": 1.4144870759880196e-05, + "loss": 1.5304, + "mean_token_accuracy": 0.633898084362348, + "num_tokens": 1237194352.0, + "step": 7371 + }, + { + "entropy": 1.756166120370229, + "epoch": 0.8098651506412897, + "grad_norm": 0.7380567789077759, + "learning_rate": 1.4143370512550831e-05, + "loss": 1.4426, + "mean_token_accuracy": 0.6379890193541845, + "num_tokens": 1237324596.0, + "step": 7372 + }, + { + "entropy": 1.7159120738506317, + "epoch": 0.8099750075526626, + "grad_norm": 0.6627910733222961, + "learning_rate": 1.414187016573521e-05, + "loss": 1.3909, + "mean_token_accuracy": 0.6649558494488398, + "num_tokens": 1237483038.0, + "step": 7373 + }, + { + "entropy": 1.7520929177602131, + "epoch": 0.8100848644640356, + "grad_norm": 0.7027316093444824, + "learning_rate": 1.4140369719480812e-05, + "loss": 1.665, + "mean_token_accuracy": 0.6250251233577728, + "num_tokens": 1237657133.0, + "step": 7374 + }, + { + "entropy": 1.700265755256017, + "epoch": 0.8101947213754085, + "grad_norm": 0.6161781549453735, + "learning_rate": 1.4138869173835128e-05, + "loss": 1.4111, + "mean_token_accuracy": 0.661478283504645, + "num_tokens": 1237839093.0, + "step": 7375 + }, + { + "entropy": 1.666181892156601, + "epoch": 0.8103045782867815, + "grad_norm": 0.7483380436897278, + "learning_rate": 1.4137368528845648e-05, + "loss": 1.3713, + "mean_token_accuracy": 0.6747563034296036, + "num_tokens": 1237973528.0, + "step": 7376 + }, + { + "entropy": 1.7163873811562855, + "epoch": 0.8104144351981544, + "grad_norm": 0.6816152930259705, + "learning_rate": 1.4135867784559867e-05, + "loss": 1.4682, + "mean_token_accuracy": 0.645609254638354, + "num_tokens": 1238112424.0, + "step": 7377 + }, + { + "entropy": 1.6794928908348083, + "epoch": 0.8105242921095274, + "grad_norm": 0.7110916376113892, + "learning_rate": 1.4134366941025283e-05, + "loss": 1.4644, + "mean_token_accuracy": 0.6717881063620249, + "num_tokens": 1238254177.0, + "step": 7378 + }, + { + "entropy": 1.6711277862389882, + "epoch": 0.8106341490209003, + "grad_norm": 0.6988908648490906, + "learning_rate": 1.4132865998289402e-05, + "loss": 1.3469, + "mean_token_accuracy": 0.652699887752533, + "num_tokens": 1238443003.0, + "step": 7379 + }, + { + "entropy": 1.679229776064555, + "epoch": 0.8107440059322732, + "grad_norm": 0.6412124037742615, + "learning_rate": 1.413136495639972e-05, + "loss": 1.4704, + "mean_token_accuracy": 0.6530571530262629, + "num_tokens": 1238623416.0, + "step": 7380 + }, + { + "entropy": 1.6649984618028004, + "epoch": 0.8108538628436461, + "grad_norm": 0.6781518459320068, + "learning_rate": 1.412986381540375e-05, + "loss": 1.2339, + "mean_token_accuracy": 0.6802275578180949, + "num_tokens": 1238797372.0, + "step": 7381 + }, + { + "entropy": 1.739145537217458, + "epoch": 0.8109637197550191, + "grad_norm": 0.7031247019767761, + "learning_rate": 1.4128362575349e-05, + "loss": 1.2742, + "mean_token_accuracy": 0.6698156992594401, + "num_tokens": 1238931134.0, + "step": 7382 + }, + { + "entropy": 1.6609701414903004, + "epoch": 0.811073576666392, + "grad_norm": 0.6347212791442871, + "learning_rate": 1.4126861236282985e-05, + "loss": 1.3583, + "mean_token_accuracy": 0.6621866772572199, + "num_tokens": 1239089834.0, + "step": 7383 + }, + { + "entropy": 1.6577509045600891, + "epoch": 0.811183433577765, + "grad_norm": 1.0662569999694824, + "learning_rate": 1.412535979825322e-05, + "loss": 1.3994, + "mean_token_accuracy": 0.6656260589758555, + "num_tokens": 1239250886.0, + "step": 7384 + }, + { + "entropy": 1.73176771402359, + "epoch": 0.8112932904891379, + "grad_norm": 0.8225398659706116, + "learning_rate": 1.4123858261307227e-05, + "loss": 1.2648, + "mean_token_accuracy": 0.6719925204912821, + "num_tokens": 1239372148.0, + "step": 7385 + }, + { + "entropy": 1.6916013360023499, + "epoch": 0.8114031474005108, + "grad_norm": 0.7312498092651367, + "learning_rate": 1.4122356625492524e-05, + "loss": 1.4389, + "mean_token_accuracy": 0.6483626465002695, + "num_tokens": 1239549194.0, + "step": 7386 + }, + { + "entropy": 1.715633491675059, + "epoch": 0.8115130043118838, + "grad_norm": 0.7113534808158875, + "learning_rate": 1.4120854890856643e-05, + "loss": 1.4785, + "mean_token_accuracy": 0.6499229570229849, + "num_tokens": 1239715171.0, + "step": 7387 + }, + { + "entropy": 1.6997787555058796, + "epoch": 0.8116228612232567, + "grad_norm": 0.7462812662124634, + "learning_rate": 1.4119353057447112e-05, + "loss": 1.2767, + "mean_token_accuracy": 0.677174707253774, + "num_tokens": 1239820052.0, + "step": 7388 + }, + { + "entropy": 1.7284424602985382, + "epoch": 0.8117327181346297, + "grad_norm": 0.6025354862213135, + "learning_rate": 1.4117851125311462e-05, + "loss": 1.4262, + "mean_token_accuracy": 0.6419772803783417, + "num_tokens": 1239981429.0, + "step": 7389 + }, + { + "entropy": 1.7345656553904216, + "epoch": 0.8118425750460025, + "grad_norm": 0.6548384428024292, + "learning_rate": 1.4116349094497228e-05, + "loss": 1.3457, + "mean_token_accuracy": 0.6590167085329691, + "num_tokens": 1240144145.0, + "step": 7390 + }, + { + "entropy": 1.7846961518128712, + "epoch": 0.8119524319573755, + "grad_norm": 0.7144470810890198, + "learning_rate": 1.4114846965051952e-05, + "loss": 1.5711, + "mean_token_accuracy": 0.643833170334498, + "num_tokens": 1240324650.0, + "step": 7391 + }, + { + "entropy": 1.7715802987416585, + "epoch": 0.8120622888687484, + "grad_norm": 0.8290162682533264, + "learning_rate": 1.4113344737023167e-05, + "loss": 1.2688, + "mean_token_accuracy": 0.6774558126926422, + "num_tokens": 1240451300.0, + "step": 7392 + }, + { + "entropy": 1.658871442079544, + "epoch": 0.8121721457801214, + "grad_norm": 0.6699681878089905, + "learning_rate": 1.411184241045843e-05, + "loss": 1.4638, + "mean_token_accuracy": 0.6388088216384252, + "num_tokens": 1240657822.0, + "step": 7393 + }, + { + "entropy": 1.733129362265269, + "epoch": 0.8122820026914943, + "grad_norm": 0.7511906623840332, + "learning_rate": 1.411033998540528e-05, + "loss": 1.3968, + "mean_token_accuracy": 0.657474105556806, + "num_tokens": 1240814705.0, + "step": 7394 + }, + { + "entropy": 1.7295902868111928, + "epoch": 0.8123918596028673, + "grad_norm": 0.7865959405899048, + "learning_rate": 1.4108837461911273e-05, + "loss": 1.477, + "mean_token_accuracy": 0.6377530843019485, + "num_tokens": 1240969971.0, + "step": 7395 + }, + { + "entropy": 1.695414235194524, + "epoch": 0.8125017165142402, + "grad_norm": 0.6054057478904724, + "learning_rate": 1.410733484002396e-05, + "loss": 1.4033, + "mean_token_accuracy": 0.6436779797077179, + "num_tokens": 1241149681.0, + "step": 7396 + }, + { + "entropy": 1.6878659625848134, + "epoch": 0.8126115734256132, + "grad_norm": 0.6600765585899353, + "learning_rate": 1.4105832119790898e-05, + "loss": 1.2499, + "mean_token_accuracy": 0.68401571114858, + "num_tokens": 1241269068.0, + "step": 7397 + }, + { + "entropy": 1.68837175766627, + "epoch": 0.8127214303369861, + "grad_norm": 0.788350522518158, + "learning_rate": 1.4104329301259652e-05, + "loss": 1.447, + "mean_token_accuracy": 0.6587880005439123, + "num_tokens": 1241425698.0, + "step": 7398 + }, + { + "entropy": 1.7728902697563171, + "epoch": 0.812831287248359, + "grad_norm": 0.839996337890625, + "learning_rate": 1.4102826384477782e-05, + "loss": 1.4142, + "mean_token_accuracy": 0.6632864475250244, + "num_tokens": 1241542424.0, + "step": 7399 + }, + { + "entropy": 1.679862250884374, + "epoch": 0.812941144159732, + "grad_norm": 0.7468640804290771, + "learning_rate": 1.4101323369492854e-05, + "loss": 1.3749, + "mean_token_accuracy": 0.6650107949972153, + "num_tokens": 1241699390.0, + "step": 7400 + }, + { + "entropy": 1.7201584080855052, + "epoch": 0.8130510010711048, + "grad_norm": 0.633568525314331, + "learning_rate": 1.4099820256352436e-05, + "loss": 1.4956, + "mean_token_accuracy": 0.6278330336014429, + "num_tokens": 1241877531.0, + "step": 7401 + }, + { + "entropy": 1.6532767017682393, + "epoch": 0.8131608579824778, + "grad_norm": 1.0218884944915771, + "learning_rate": 1.4098317045104106e-05, + "loss": 1.47, + "mean_token_accuracy": 0.6716248542070389, + "num_tokens": 1241999071.0, + "step": 7402 + }, + { + "entropy": 1.7018433213233948, + "epoch": 0.8132707148938507, + "grad_norm": 0.7204262018203735, + "learning_rate": 1.4096813735795443e-05, + "loss": 1.3745, + "mean_token_accuracy": 0.6768209586540858, + "num_tokens": 1242125538.0, + "step": 7403 + }, + { + "entropy": 1.6862289508183796, + "epoch": 0.8133805718052237, + "grad_norm": 0.6623913049697876, + "learning_rate": 1.4095310328474015e-05, + "loss": 1.4319, + "mean_token_accuracy": 0.6565775275230408, + "num_tokens": 1242318229.0, + "step": 7404 + }, + { + "entropy": 1.6885381937026978, + "epoch": 0.8134904287165966, + "grad_norm": 0.7589840888977051, + "learning_rate": 1.4093806823187408e-05, + "loss": 1.3628, + "mean_token_accuracy": 0.6577220807472864, + "num_tokens": 1242448793.0, + "step": 7405 + }, + { + "entropy": 1.6763391296068828, + "epoch": 0.8136002856279696, + "grad_norm": 0.8346347212791443, + "learning_rate": 1.4092303219983215e-05, + "loss": 1.3303, + "mean_token_accuracy": 0.6764725148677826, + "num_tokens": 1242553987.0, + "step": 7406 + }, + { + "entropy": 1.657248059908549, + "epoch": 0.8137101425393425, + "grad_norm": 0.7094236612319946, + "learning_rate": 1.4090799518909015e-05, + "loss": 1.3725, + "mean_token_accuracy": 0.6620489905277888, + "num_tokens": 1242749089.0, + "step": 7407 + }, + { + "entropy": 1.7001424332459767, + "epoch": 0.8138199994507155, + "grad_norm": 0.7831171751022339, + "learning_rate": 1.4089295720012402e-05, + "loss": 1.3146, + "mean_token_accuracy": 0.6626470337311426, + "num_tokens": 1242871762.0, + "step": 7408 + }, + { + "entropy": 1.7205629646778107, + "epoch": 0.8139298563620884, + "grad_norm": 0.744288980960846, + "learning_rate": 1.4087791823340975e-05, + "loss": 1.4054, + "mean_token_accuracy": 0.6607886006434759, + "num_tokens": 1243041832.0, + "step": 7409 + }, + { + "entropy": 1.688971887032191, + "epoch": 0.8140397132734614, + "grad_norm": 0.6575775742530823, + "learning_rate": 1.4086287828942326e-05, + "loss": 1.5004, + "mean_token_accuracy": 0.6478760689496994, + "num_tokens": 1243182302.0, + "step": 7410 + }, + { + "entropy": 1.6879553596178691, + "epoch": 0.8141495701848342, + "grad_norm": 0.6473801136016846, + "learning_rate": 1.4084783736864055e-05, + "loss": 1.273, + "mean_token_accuracy": 0.6795784085988998, + "num_tokens": 1243333276.0, + "step": 7411 + }, + { + "entropy": 1.771759420633316, + "epoch": 0.8142594270962072, + "grad_norm": 0.8150290250778198, + "learning_rate": 1.4083279547153774e-05, + "loss": 1.5647, + "mean_token_accuracy": 0.6433297594388326, + "num_tokens": 1243491601.0, + "step": 7412 + }, + { + "entropy": 1.6955374677975972, + "epoch": 0.8143692840075801, + "grad_norm": 0.5659390091896057, + "learning_rate": 1.4081775259859083e-05, + "loss": 1.5195, + "mean_token_accuracy": 0.6323159287373225, + "num_tokens": 1243669031.0, + "step": 7413 + }, + { + "entropy": 1.6806012392044067, + "epoch": 0.814479140918953, + "grad_norm": 0.7607001066207886, + "learning_rate": 1.408027087502759e-05, + "loss": 1.3539, + "mean_token_accuracy": 0.6575177957614263, + "num_tokens": 1243784885.0, + "step": 7414 + }, + { + "entropy": 1.735370695590973, + "epoch": 0.814588997830326, + "grad_norm": 0.6500069499015808, + "learning_rate": 1.4078766392706919e-05, + "loss": 1.3712, + "mean_token_accuracy": 0.6685160199801127, + "num_tokens": 1243908604.0, + "step": 7415 + }, + { + "entropy": 1.7190530995527904, + "epoch": 0.8146988547416989, + "grad_norm": 0.6073886156082153, + "learning_rate": 1.4077261812944675e-05, + "loss": 1.4499, + "mean_token_accuracy": 0.6379290819168091, + "num_tokens": 1244149442.0, + "step": 7416 + }, + { + "entropy": 1.7028957704703014, + "epoch": 0.8148087116530719, + "grad_norm": 0.7936030626296997, + "learning_rate": 1.4075757135788481e-05, + "loss": 1.3337, + "mean_token_accuracy": 0.663459782799085, + "num_tokens": 1244299370.0, + "step": 7417 + }, + { + "entropy": 1.6539308826128643, + "epoch": 0.8149185685644448, + "grad_norm": 0.6710909605026245, + "learning_rate": 1.4074252361285961e-05, + "loss": 1.3521, + "mean_token_accuracy": 0.6716783146063486, + "num_tokens": 1244465781.0, + "step": 7418 + }, + { + "entropy": 1.7069568435351055, + "epoch": 0.8150284254758178, + "grad_norm": 0.8002110123634338, + "learning_rate": 1.4072747489484736e-05, + "loss": 1.2018, + "mean_token_accuracy": 0.6825656940539678, + "num_tokens": 1244599995.0, + "step": 7419 + }, + { + "entropy": 1.663703719774882, + "epoch": 0.8151382823871907, + "grad_norm": 0.7054318785667419, + "learning_rate": 1.407124252043244e-05, + "loss": 1.3063, + "mean_token_accuracy": 0.6676364541053772, + "num_tokens": 1244739287.0, + "step": 7420 + }, + { + "entropy": 1.7169875999291737, + "epoch": 0.8152481392985637, + "grad_norm": 0.7952367067337036, + "learning_rate": 1.4069737454176704e-05, + "loss": 1.5895, + "mean_token_accuracy": 0.6387151479721069, + "num_tokens": 1244927614.0, + "step": 7421 + }, + { + "entropy": 1.7815453708171844, + "epoch": 0.8153579962099365, + "grad_norm": 0.959420919418335, + "learning_rate": 1.4068232290765158e-05, + "loss": 1.3701, + "mean_token_accuracy": 0.6549298316240311, + "num_tokens": 1245061586.0, + "step": 7422 + }, + { + "entropy": 1.681764543056488, + "epoch": 0.8154678531213095, + "grad_norm": 0.6878554224967957, + "learning_rate": 1.4066727030245442e-05, + "loss": 1.384, + "mean_token_accuracy": 0.6570885529120764, + "num_tokens": 1245214907.0, + "step": 7423 + }, + { + "entropy": 1.7041710217793782, + "epoch": 0.8155777100326824, + "grad_norm": 0.8578292727470398, + "learning_rate": 1.4065221672665199e-05, + "loss": 1.4803, + "mean_token_accuracy": 0.6547748496135076, + "num_tokens": 1245366976.0, + "step": 7424 + }, + { + "entropy": 1.7255032062530518, + "epoch": 0.8156875669440554, + "grad_norm": 0.6222707033157349, + "learning_rate": 1.4063716218072072e-05, + "loss": 1.3239, + "mean_token_accuracy": 0.6556070099274317, + "num_tokens": 1245497096.0, + "step": 7425 + }, + { + "entropy": 1.721158226331075, + "epoch": 0.8157974238554283, + "grad_norm": 0.7585559487342834, + "learning_rate": 1.4062210666513705e-05, + "loss": 1.55, + "mean_token_accuracy": 0.6495650957028071, + "num_tokens": 1245700324.0, + "step": 7426 + }, + { + "entropy": 1.6588152348995209, + "epoch": 0.8159072807668012, + "grad_norm": 0.6037783622741699, + "learning_rate": 1.4060705018037752e-05, + "loss": 1.2474, + "mean_token_accuracy": 0.6797231733798981, + "num_tokens": 1245829906.0, + "step": 7427 + }, + { + "entropy": 1.7529727220535278, + "epoch": 0.8160171376781742, + "grad_norm": 0.6627152562141418, + "learning_rate": 1.4059199272691864e-05, + "loss": 1.274, + "mean_token_accuracy": 0.6731006652116776, + "num_tokens": 1245965202.0, + "step": 7428 + }, + { + "entropy": 1.7085862557093303, + "epoch": 0.8161269945895471, + "grad_norm": 0.7288098931312561, + "learning_rate": 1.4057693430523696e-05, + "loss": 1.405, + "mean_token_accuracy": 0.6680357307195663, + "num_tokens": 1246151767.0, + "step": 7429 + }, + { + "entropy": 1.690618246793747, + "epoch": 0.8162368515009201, + "grad_norm": 0.8623577952384949, + "learning_rate": 1.4056187491580911e-05, + "loss": 1.4846, + "mean_token_accuracy": 0.6495842784643173, + "num_tokens": 1246308268.0, + "step": 7430 + }, + { + "entropy": 1.7508464058240254, + "epoch": 0.816346708412293, + "grad_norm": 0.716978132724762, + "learning_rate": 1.4054681455911168e-05, + "loss": 1.337, + "mean_token_accuracy": 0.6598278482755026, + "num_tokens": 1246427848.0, + "step": 7431 + }, + { + "entropy": 1.6642480889956157, + "epoch": 0.816456565323666, + "grad_norm": 0.632388174533844, + "learning_rate": 1.4053175323562132e-05, + "loss": 1.3534, + "mean_token_accuracy": 0.6599838187297186, + "num_tokens": 1246615964.0, + "step": 7432 + }, + { + "entropy": 1.6865338583787282, + "epoch": 0.8165664222350388, + "grad_norm": 0.7921580672264099, + "learning_rate": 1.4051669094581478e-05, + "loss": 1.4469, + "mean_token_accuracy": 0.6657046775023142, + "num_tokens": 1246761618.0, + "step": 7433 + }, + { + "entropy": 1.7106922467549641, + "epoch": 0.8166762791464118, + "grad_norm": 0.6639615893363953, + "learning_rate": 1.4050162769016867e-05, + "loss": 1.3718, + "mean_token_accuracy": 0.6516933192809423, + "num_tokens": 1246914824.0, + "step": 7434 + }, + { + "entropy": 1.6909891565640767, + "epoch": 0.8167861360577847, + "grad_norm": 0.6576387882232666, + "learning_rate": 1.4048656346915984e-05, + "loss": 1.4314, + "mean_token_accuracy": 0.6656709363063177, + "num_tokens": 1247129112.0, + "step": 7435 + }, + { + "entropy": 1.7483769953250885, + "epoch": 0.8168959929691577, + "grad_norm": 0.9580649733543396, + "learning_rate": 1.4047149828326491e-05, + "loss": 1.4487, + "mean_token_accuracy": 0.648655946056048, + "num_tokens": 1247272520.0, + "step": 7436 + }, + { + "entropy": 1.76641050974528, + "epoch": 0.8170058498805306, + "grad_norm": 0.7349089980125427, + "learning_rate": 1.404564321329609e-05, + "loss": 1.5273, + "mean_token_accuracy": 0.6450001696745554, + "num_tokens": 1247422308.0, + "step": 7437 + }, + { + "entropy": 1.7038010954856873, + "epoch": 0.8171157067919036, + "grad_norm": 0.612480878829956, + "learning_rate": 1.4044136501872447e-05, + "loss": 1.4251, + "mean_token_accuracy": 0.6478384385506312, + "num_tokens": 1247602434.0, + "step": 7438 + }, + { + "entropy": 1.7296733955542247, + "epoch": 0.8172255637032765, + "grad_norm": 0.6623514294624329, + "learning_rate": 1.4042629694103259e-05, + "loss": 1.4849, + "mean_token_accuracy": 0.6379480262597402, + "num_tokens": 1247765918.0, + "step": 7439 + }, + { + "entropy": 1.7219422558943431, + "epoch": 0.8173354206146494, + "grad_norm": 0.756436288356781, + "learning_rate": 1.404112279003621e-05, + "loss": 1.3645, + "mean_token_accuracy": 0.6643207172552744, + "num_tokens": 1247913353.0, + "step": 7440 + }, + { + "entropy": 1.6891942123572032, + "epoch": 0.8174452775260224, + "grad_norm": 0.7340204119682312, + "learning_rate": 1.4039615789719e-05, + "loss": 1.4962, + "mean_token_accuracy": 0.6556605597337087, + "num_tokens": 1248126159.0, + "step": 7441 + }, + { + "entropy": 1.7491665681203206, + "epoch": 0.8175551344373952, + "grad_norm": 0.7347027659416199, + "learning_rate": 1.4038108693199313e-05, + "loss": 1.3362, + "mean_token_accuracy": 0.6711843659480413, + "num_tokens": 1248263141.0, + "step": 7442 + }, + { + "entropy": 1.705857555071513, + "epoch": 0.8176649913487682, + "grad_norm": 0.8173395991325378, + "learning_rate": 1.4036601500524858e-05, + "loss": 1.5044, + "mean_token_accuracy": 0.6398077656825384, + "num_tokens": 1248453485.0, + "step": 7443 + }, + { + "entropy": 1.7155713438987732, + "epoch": 0.8177748482601411, + "grad_norm": 0.6897460222244263, + "learning_rate": 1.4035094211743335e-05, + "loss": 1.4384, + "mean_token_accuracy": 0.6557023028532664, + "num_tokens": 1248605375.0, + "step": 7444 + }, + { + "entropy": 1.7846886813640594, + "epoch": 0.8178847051715141, + "grad_norm": 0.7669224739074707, + "learning_rate": 1.4033586826902446e-05, + "loss": 1.4515, + "mean_token_accuracy": 0.6553824543952942, + "num_tokens": 1248829634.0, + "step": 7445 + }, + { + "entropy": 1.7502902050813038, + "epoch": 0.817994562082887, + "grad_norm": 0.9004881978034973, + "learning_rate": 1.40320793460499e-05, + "loss": 1.4205, + "mean_token_accuracy": 0.6514177868763605, + "num_tokens": 1249052664.0, + "step": 7446 + }, + { + "entropy": 1.7154946823914845, + "epoch": 0.81810441899426, + "grad_norm": 0.7069644927978516, + "learning_rate": 1.4030571769233411e-05, + "loss": 1.3711, + "mean_token_accuracy": 0.6522306303183237, + "num_tokens": 1249220107.0, + "step": 7447 + }, + { + "entropy": 1.7312849462032318, + "epoch": 0.8182142759056329, + "grad_norm": 0.7484097480773926, + "learning_rate": 1.4029064096500689e-05, + "loss": 1.2812, + "mean_token_accuracy": 0.6704812347888947, + "num_tokens": 1249347873.0, + "step": 7448 + }, + { + "entropy": 1.7800405323505402, + "epoch": 0.8183241328170059, + "grad_norm": 0.635948657989502, + "learning_rate": 1.4027556327899456e-05, + "loss": 1.3237, + "mean_token_accuracy": 0.6623754402001699, + "num_tokens": 1249477584.0, + "step": 7449 + }, + { + "entropy": 1.7426054279009502, + "epoch": 0.8184339897283788, + "grad_norm": 0.6542919874191284, + "learning_rate": 1.402604846347743e-05, + "loss": 1.5097, + "mean_token_accuracy": 0.6329874048630396, + "num_tokens": 1249683768.0, + "step": 7450 + }, + { + "entropy": 1.7369756003220875, + "epoch": 0.8185438466397518, + "grad_norm": 0.7994239330291748, + "learning_rate": 1.402454050328233e-05, + "loss": 1.4815, + "mean_token_accuracy": 0.6522703667481741, + "num_tokens": 1249832625.0, + "step": 7451 + }, + { + "entropy": 1.6854838530222576, + "epoch": 0.8186537035511247, + "grad_norm": 0.6902982592582703, + "learning_rate": 1.4023032447361888e-05, + "loss": 1.3764, + "mean_token_accuracy": 0.6690236181020737, + "num_tokens": 1249979361.0, + "step": 7452 + }, + { + "entropy": 1.717117150624593, + "epoch": 0.8187635604624975, + "grad_norm": 0.7572975158691406, + "learning_rate": 1.4021524295763832e-05, + "loss": 1.5046, + "mean_token_accuracy": 0.6426510065793991, + "num_tokens": 1250141814.0, + "step": 7453 + }, + { + "entropy": 1.6960802574952443, + "epoch": 0.8188734173738705, + "grad_norm": 0.6628535985946655, + "learning_rate": 1.4020016048535894e-05, + "loss": 1.4133, + "mean_token_accuracy": 0.6609309216340383, + "num_tokens": 1250311504.0, + "step": 7454 + }, + { + "entropy": 1.7238394518693287, + "epoch": 0.8189832742852434, + "grad_norm": 0.6855142712593079, + "learning_rate": 1.401850770572581e-05, + "loss": 1.3652, + "mean_token_accuracy": 0.6540437589089075, + "num_tokens": 1250445247.0, + "step": 7455 + }, + { + "entropy": 1.7303275068600972, + "epoch": 0.8190931311966164, + "grad_norm": 0.6414943933486938, + "learning_rate": 1.4016999267381312e-05, + "loss": 1.4394, + "mean_token_accuracy": 0.6512100994586945, + "num_tokens": 1250642583.0, + "step": 7456 + }, + { + "entropy": 1.6663293739159901, + "epoch": 0.8192029881079893, + "grad_norm": 0.6580475568771362, + "learning_rate": 1.401549073355015e-05, + "loss": 1.3234, + "mean_token_accuracy": 0.6663972685734431, + "num_tokens": 1250818999.0, + "step": 7457 + }, + { + "entropy": 1.7176962395509083, + "epoch": 0.8193128450193623, + "grad_norm": 0.5929837822914124, + "learning_rate": 1.4013982104280063e-05, + "loss": 1.5187, + "mean_token_accuracy": 0.6359325100978216, + "num_tokens": 1251021224.0, + "step": 7458 + }, + { + "entropy": 1.7315525313218434, + "epoch": 0.8194227019307352, + "grad_norm": 0.7245919108390808, + "learning_rate": 1.4012473379618804e-05, + "loss": 1.4878, + "mean_token_accuracy": 0.6452689071496328, + "num_tokens": 1251185042.0, + "step": 7459 + }, + { + "entropy": 1.7084216177463531, + "epoch": 0.8195325588421082, + "grad_norm": 0.713031530380249, + "learning_rate": 1.4010964559614118e-05, + "loss": 1.492, + "mean_token_accuracy": 0.6423763384421667, + "num_tokens": 1251383890.0, + "step": 7460 + }, + { + "entropy": 1.7116627792517345, + "epoch": 0.8196424157534811, + "grad_norm": 0.7990723848342896, + "learning_rate": 1.4009455644313764e-05, + "loss": 1.462, + "mean_token_accuracy": 0.6500266542037328, + "num_tokens": 1251560097.0, + "step": 7461 + }, + { + "entropy": 1.659277429183324, + "epoch": 0.8197522726648541, + "grad_norm": 0.6893109083175659, + "learning_rate": 1.400794663376549e-05, + "loss": 1.4211, + "mean_token_accuracy": 0.6517892877260844, + "num_tokens": 1251745264.0, + "step": 7462 + }, + { + "entropy": 1.6505893170833588, + "epoch": 0.819862129576227, + "grad_norm": 0.806117057800293, + "learning_rate": 1.4006437528017063e-05, + "loss": 1.4743, + "mean_token_accuracy": 0.6712833146254221, + "num_tokens": 1251939883.0, + "step": 7463 + }, + { + "entropy": 1.6510307888189952, + "epoch": 0.8199719864876, + "grad_norm": 0.6745132803916931, + "learning_rate": 1.400492832711624e-05, + "loss": 1.3437, + "mean_token_accuracy": 0.6679824143648148, + "num_tokens": 1252128793.0, + "step": 7464 + }, + { + "entropy": 1.706537942091624, + "epoch": 0.8200818433989728, + "grad_norm": 0.6287968158721924, + "learning_rate": 1.4003419031110794e-05, + "loss": 1.3509, + "mean_token_accuracy": 0.6736375490824381, + "num_tokens": 1252315132.0, + "step": 7465 + }, + { + "entropy": 1.6979309916496277, + "epoch": 0.8201917003103458, + "grad_norm": 0.7194148302078247, + "learning_rate": 1.4001909640048485e-05, + "loss": 1.2779, + "mean_token_accuracy": 0.6610195636749268, + "num_tokens": 1252483018.0, + "step": 7466 + }, + { + "entropy": 1.6625094612439473, + "epoch": 0.8203015572217187, + "grad_norm": 0.7005951404571533, + "learning_rate": 1.4000400153977092e-05, + "loss": 1.2451, + "mean_token_accuracy": 0.6797448396682739, + "num_tokens": 1252621050.0, + "step": 7467 + }, + { + "entropy": 1.635416607062022, + "epoch": 0.8204114141330916, + "grad_norm": 0.5949429273605347, + "learning_rate": 1.3998890572944383e-05, + "loss": 1.3522, + "mean_token_accuracy": 0.6722677995761236, + "num_tokens": 1252787245.0, + "step": 7468 + }, + { + "entropy": 1.6553764442602794, + "epoch": 0.8205212710444646, + "grad_norm": 0.7361176013946533, + "learning_rate": 1.3997380896998141e-05, + "loss": 1.4988, + "mean_token_accuracy": 0.6553371498982111, + "num_tokens": 1252949574.0, + "step": 7469 + }, + { + "entropy": 1.6532206336657207, + "epoch": 0.8206311279558375, + "grad_norm": 0.7868068814277649, + "learning_rate": 1.3995871126186142e-05, + "loss": 1.3155, + "mean_token_accuracy": 0.6697768618663152, + "num_tokens": 1253093237.0, + "step": 7470 + }, + { + "entropy": 1.6919066905975342, + "epoch": 0.8207409848672105, + "grad_norm": 0.7258913516998291, + "learning_rate": 1.3994361260556175e-05, + "loss": 1.4628, + "mean_token_accuracy": 0.6440849602222443, + "num_tokens": 1253248303.0, + "step": 7471 + }, + { + "entropy": 1.7257492740948994, + "epoch": 0.8208508417785834, + "grad_norm": 0.6986476182937622, + "learning_rate": 1.3992851300156024e-05, + "loss": 1.3161, + "mean_token_accuracy": 0.6773944149414698, + "num_tokens": 1253458286.0, + "step": 7472 + }, + { + "entropy": 1.7134167353312175, + "epoch": 0.8209606986899564, + "grad_norm": 0.6301187872886658, + "learning_rate": 1.3991341245033474e-05, + "loss": 1.2863, + "mean_token_accuracy": 0.6769869873921076, + "num_tokens": 1253628175.0, + "step": 7473 + }, + { + "entropy": 1.7503137389818828, + "epoch": 0.8210705556013292, + "grad_norm": 0.706662654876709, + "learning_rate": 1.3989831095236321e-05, + "loss": 1.4009, + "mean_token_accuracy": 0.639786938826243, + "num_tokens": 1253775675.0, + "step": 7474 + }, + { + "entropy": 1.678730736176173, + "epoch": 0.8211804125127022, + "grad_norm": 0.6688277721405029, + "learning_rate": 1.3988320850812367e-05, + "loss": 1.2569, + "mean_token_accuracy": 0.6795259167750677, + "num_tokens": 1253917761.0, + "step": 7475 + }, + { + "entropy": 1.6687143941720326, + "epoch": 0.8212902694240751, + "grad_norm": 0.7596714496612549, + "learning_rate": 1.3986810511809396e-05, + "loss": 1.2865, + "mean_token_accuracy": 0.679057906071345, + "num_tokens": 1254076456.0, + "step": 7476 + }, + { + "entropy": 1.7445678611596425, + "epoch": 0.8214001263354481, + "grad_norm": 0.7357975244522095, + "learning_rate": 1.3985300078275226e-05, + "loss": 1.3714, + "mean_token_accuracy": 0.6583857784668604, + "num_tokens": 1254202651.0, + "step": 7477 + }, + { + "entropy": 1.7018981575965881, + "epoch": 0.821509983246821, + "grad_norm": 0.7750345468521118, + "learning_rate": 1.398378955025765e-05, + "loss": 1.3008, + "mean_token_accuracy": 0.6796058019002279, + "num_tokens": 1254359741.0, + "step": 7478 + }, + { + "entropy": 1.8074349462985992, + "epoch": 0.821619840158194, + "grad_norm": 0.9239285588264465, + "learning_rate": 1.398227892780448e-05, + "loss": 1.4871, + "mean_token_accuracy": 0.6412616769472758, + "num_tokens": 1254504105.0, + "step": 7479 + }, + { + "entropy": 1.8056731621424358, + "epoch": 0.8217296970695669, + "grad_norm": 0.8136707544326782, + "learning_rate": 1.3980768210963524e-05, + "loss": 1.4746, + "mean_token_accuracy": 0.6449335664510727, + "num_tokens": 1254705954.0, + "step": 7480 + }, + { + "entropy": 1.6646238962809246, + "epoch": 0.8218395539809398, + "grad_norm": 0.6531665921211243, + "learning_rate": 1.3979257399782603e-05, + "loss": 1.4227, + "mean_token_accuracy": 0.6651426901419958, + "num_tokens": 1254843961.0, + "step": 7481 + }, + { + "entropy": 1.7323362529277802, + "epoch": 0.8219494108923128, + "grad_norm": 0.9482001662254333, + "learning_rate": 1.3977746494309521e-05, + "loss": 1.3124, + "mean_token_accuracy": 0.6612970530986786, + "num_tokens": 1254957021.0, + "step": 7482 + }, + { + "entropy": 1.7369599243005116, + "epoch": 0.8220592678036857, + "grad_norm": 0.7881601452827454, + "learning_rate": 1.3976235494592107e-05, + "loss": 1.3393, + "mean_token_accuracy": 0.6736653447151184, + "num_tokens": 1255084387.0, + "step": 7483 + }, + { + "entropy": 1.7011582553386688, + "epoch": 0.8221691247150587, + "grad_norm": 0.6895524859428406, + "learning_rate": 1.3974724400678183e-05, + "loss": 1.4111, + "mean_token_accuracy": 0.6459124386310577, + "num_tokens": 1255274711.0, + "step": 7484 + }, + { + "entropy": 1.767829418182373, + "epoch": 0.8222789816264315, + "grad_norm": 0.6690332293510437, + "learning_rate": 1.3973213212615569e-05, + "loss": 1.5124, + "mean_token_accuracy": 0.6429779479900996, + "num_tokens": 1255410443.0, + "step": 7485 + }, + { + "entropy": 1.756785641113917, + "epoch": 0.8223888385378045, + "grad_norm": 0.6052453517913818, + "learning_rate": 1.3971701930452097e-05, + "loss": 1.4162, + "mean_token_accuracy": 0.6490946312745413, + "num_tokens": 1255596887.0, + "step": 7486 + }, + { + "entropy": 1.6892687578996022, + "epoch": 0.8224986954491774, + "grad_norm": 0.7892059683799744, + "learning_rate": 1.39701905542356e-05, + "loss": 1.2558, + "mean_token_accuracy": 0.6721896727879842, + "num_tokens": 1255744213.0, + "step": 7487 + }, + { + "entropy": 1.7740124762058258, + "epoch": 0.8226085523605504, + "grad_norm": 0.6680493950843811, + "learning_rate": 1.3968679084013905e-05, + "loss": 1.3391, + "mean_token_accuracy": 0.6599594056606293, + "num_tokens": 1255886708.0, + "step": 7488 + }, + { + "entropy": 1.6903804937998455, + "epoch": 0.8227184092719233, + "grad_norm": 0.6269848346710205, + "learning_rate": 1.396716751983486e-05, + "loss": 1.4869, + "mean_token_accuracy": 0.6487141301234564, + "num_tokens": 1256060601.0, + "step": 7489 + }, + { + "entropy": 1.74964839220047, + "epoch": 0.8228282661832963, + "grad_norm": 0.6463726758956909, + "learning_rate": 1.3965655861746302e-05, + "loss": 1.3761, + "mean_token_accuracy": 0.6572670241196951, + "num_tokens": 1256223609.0, + "step": 7490 + }, + { + "entropy": 1.6653203467528026, + "epoch": 0.8229381230946692, + "grad_norm": 0.701028048992157, + "learning_rate": 1.3964144109796067e-05, + "loss": 1.3633, + "mean_token_accuracy": 0.6773179620504379, + "num_tokens": 1256361202.0, + "step": 7491 + }, + { + "entropy": 1.7040152450402577, + "epoch": 0.8230479800060422, + "grad_norm": 0.7073589563369751, + "learning_rate": 1.396263226403201e-05, + "loss": 1.4705, + "mean_token_accuracy": 0.664089247584343, + "num_tokens": 1256517961.0, + "step": 7492 + }, + { + "entropy": 1.6980151931444805, + "epoch": 0.8231578369174151, + "grad_norm": 0.5834183692932129, + "learning_rate": 1.3961120324501978e-05, + "loss": 1.4236, + "mean_token_accuracy": 0.6516111840804418, + "num_tokens": 1256713181.0, + "step": 7493 + }, + { + "entropy": 1.6546641091505687, + "epoch": 0.823267693828788, + "grad_norm": 0.686537504196167, + "learning_rate": 1.3959608291253815e-05, + "loss": 1.2607, + "mean_token_accuracy": 0.6846804320812225, + "num_tokens": 1256923364.0, + "step": 7494 + }, + { + "entropy": 1.7012316783269246, + "epoch": 0.823377550740161, + "grad_norm": 0.6142180562019348, + "learning_rate": 1.3958096164335391e-05, + "loss": 1.4143, + "mean_token_accuracy": 0.6444648404916128, + "num_tokens": 1257106231.0, + "step": 7495 + }, + { + "entropy": 1.707035501797994, + "epoch": 0.8234874076515338, + "grad_norm": 0.7143438458442688, + "learning_rate": 1.395658394379455e-05, + "loss": 1.4001, + "mean_token_accuracy": 0.6408938119808832, + "num_tokens": 1257276313.0, + "step": 7496 + }, + { + "entropy": 1.7746020754178364, + "epoch": 0.8235972645629068, + "grad_norm": 0.7095411419868469, + "learning_rate": 1.3955071629679164e-05, + "loss": 1.4486, + "mean_token_accuracy": 0.6568672706683477, + "num_tokens": 1257442953.0, + "step": 7497 + }, + { + "entropy": 1.683940976858139, + "epoch": 0.8237071214742797, + "grad_norm": 0.7230114340782166, + "learning_rate": 1.395355922203709e-05, + "loss": 1.3716, + "mean_token_accuracy": 0.6584843943516413, + "num_tokens": 1257589278.0, + "step": 7498 + }, + { + "entropy": 1.7272027730941772, + "epoch": 0.8238169783856527, + "grad_norm": 0.8744773268699646, + "learning_rate": 1.39520467209162e-05, + "loss": 1.4112, + "mean_token_accuracy": 0.6432114988565445, + "num_tokens": 1257732387.0, + "step": 7499 + }, + { + "entropy": 1.7044003407160442, + "epoch": 0.8239268352970256, + "grad_norm": 0.659590482711792, + "learning_rate": 1.395053412636436e-05, + "loss": 1.4211, + "mean_token_accuracy": 0.652682383855184, + "num_tokens": 1257916824.0, + "step": 7500 + }, + { + "entropy": 1.7109976410865784, + "epoch": 0.8240366922083986, + "grad_norm": 0.6942122578620911, + "learning_rate": 1.3949021438429445e-05, + "loss": 1.4207, + "mean_token_accuracy": 0.6466724226872126, + "num_tokens": 1258102470.0, + "step": 7501 + }, + { + "entropy": 1.6839348375797272, + "epoch": 0.8241465491197715, + "grad_norm": 0.6320016980171204, + "learning_rate": 1.3947508657159328e-05, + "loss": 1.3188, + "mean_token_accuracy": 0.671158974369367, + "num_tokens": 1258260131.0, + "step": 7502 + }, + { + "entropy": 1.7140422860781352, + "epoch": 0.8242564060311445, + "grad_norm": 0.9178858995437622, + "learning_rate": 1.3945995782601893e-05, + "loss": 1.3831, + "mean_token_accuracy": 0.6665874371925989, + "num_tokens": 1258411004.0, + "step": 7503 + }, + { + "entropy": 1.682992508014043, + "epoch": 0.8243662629425174, + "grad_norm": 0.6360436081886292, + "learning_rate": 1.3944482814805018e-05, + "loss": 1.2866, + "mean_token_accuracy": 0.675972451766332, + "num_tokens": 1258560249.0, + "step": 7504 + }, + { + "entropy": 1.7603969275951385, + "epoch": 0.8244761198538904, + "grad_norm": 0.6871092915534973, + "learning_rate": 1.3942969753816589e-05, + "loss": 1.295, + "mean_token_accuracy": 0.6693955262502035, + "num_tokens": 1258691545.0, + "step": 7505 + }, + { + "entropy": 1.7000905573368073, + "epoch": 0.8245859767652632, + "grad_norm": 0.6371673941612244, + "learning_rate": 1.3941456599684493e-05, + "loss": 1.4154, + "mean_token_accuracy": 0.644666830698649, + "num_tokens": 1258870586.0, + "step": 7506 + }, + { + "entropy": 1.6680241823196411, + "epoch": 0.8246958336766362, + "grad_norm": 0.6976217031478882, + "learning_rate": 1.3939943352456623e-05, + "loss": 1.3481, + "mean_token_accuracy": 0.6589303016662598, + "num_tokens": 1259046324.0, + "step": 7507 + }, + { + "entropy": 1.6960498293240864, + "epoch": 0.8248056905880091, + "grad_norm": 0.7053700685501099, + "learning_rate": 1.3938430012180868e-05, + "loss": 1.4329, + "mean_token_accuracy": 0.6613789896170298, + "num_tokens": 1259217515.0, + "step": 7508 + }, + { + "entropy": 1.712292194366455, + "epoch": 0.824915547499382, + "grad_norm": 0.6611363291740417, + "learning_rate": 1.393691657890513e-05, + "loss": 1.4571, + "mean_token_accuracy": 0.6506155629952749, + "num_tokens": 1259405409.0, + "step": 7509 + }, + { + "entropy": 1.698621819416682, + "epoch": 0.825025404410755, + "grad_norm": 0.7005683183670044, + "learning_rate": 1.39354030526773e-05, + "loss": 1.3047, + "mean_token_accuracy": 0.6643372923135757, + "num_tokens": 1259524807.0, + "step": 7510 + }, + { + "entropy": 1.612492283185323, + "epoch": 0.8251352613221279, + "grad_norm": 0.6348177194595337, + "learning_rate": 1.3933889433545292e-05, + "loss": 1.3432, + "mean_token_accuracy": 0.6589316080013911, + "num_tokens": 1259689565.0, + "step": 7511 + }, + { + "entropy": 1.6569429437319438, + "epoch": 0.8252451182335009, + "grad_norm": 0.6412181258201599, + "learning_rate": 1.3932375721557004e-05, + "loss": 1.3133, + "mean_token_accuracy": 0.6661182244618734, + "num_tokens": 1259862375.0, + "step": 7512 + }, + { + "entropy": 1.7173986732959747, + "epoch": 0.8253549751448738, + "grad_norm": 0.673636257648468, + "learning_rate": 1.3930861916760343e-05, + "loss": 1.5394, + "mean_token_accuracy": 0.6368442674477895, + "num_tokens": 1260043863.0, + "step": 7513 + }, + { + "entropy": 1.7598693370819092, + "epoch": 0.8254648320562468, + "grad_norm": 0.7226663827896118, + "learning_rate": 1.3929348019203223e-05, + "loss": 1.4727, + "mean_token_accuracy": 0.6370566139618555, + "num_tokens": 1260163201.0, + "step": 7514 + }, + { + "entropy": 1.7205652197202046, + "epoch": 0.8255746889676197, + "grad_norm": 1.0138691663742065, + "learning_rate": 1.3927834028933565e-05, + "loss": 1.2895, + "mean_token_accuracy": 0.6636170645554861, + "num_tokens": 1260282440.0, + "step": 7515 + }, + { + "entropy": 1.700068513552348, + "epoch": 0.8256845458789926, + "grad_norm": 0.7420987486839294, + "learning_rate": 1.3926319945999272e-05, + "loss": 1.4072, + "mean_token_accuracy": 0.6525601297616959, + "num_tokens": 1260443719.0, + "step": 7516 + }, + { + "entropy": 1.69573841492335, + "epoch": 0.8257944027903655, + "grad_norm": 0.5588994026184082, + "learning_rate": 1.3924805770448275e-05, + "loss": 1.3833, + "mean_token_accuracy": 0.6473828703165054, + "num_tokens": 1260653846.0, + "step": 7517 + }, + { + "entropy": 1.7026591698328655, + "epoch": 0.8259042597017385, + "grad_norm": 0.6840011477470398, + "learning_rate": 1.3923291502328493e-05, + "loss": 1.3299, + "mean_token_accuracy": 0.6716416925191879, + "num_tokens": 1260834311.0, + "step": 7518 + }, + { + "entropy": 1.7089744905630748, + "epoch": 0.8260141166131114, + "grad_norm": 0.8017948865890503, + "learning_rate": 1.3921777141687851e-05, + "loss": 1.3692, + "mean_token_accuracy": 0.6594879478216171, + "num_tokens": 1260978152.0, + "step": 7519 + }, + { + "entropy": 1.717678815126419, + "epoch": 0.8261239735244844, + "grad_norm": 0.6748639941215515, + "learning_rate": 1.392026268857428e-05, + "loss": 1.5332, + "mean_token_accuracy": 0.6416458636522293, + "num_tokens": 1261161530.0, + "step": 7520 + }, + { + "entropy": 1.7014433046181996, + "epoch": 0.8262338304358573, + "grad_norm": 0.5452734231948853, + "learning_rate": 1.3918748143035712e-05, + "loss": 1.4928, + "mean_token_accuracy": 0.6255062818527222, + "num_tokens": 1261392764.0, + "step": 7521 + }, + { + "entropy": 1.6911316414674122, + "epoch": 0.8263436873472302, + "grad_norm": 0.694558322429657, + "learning_rate": 1.3917233505120073e-05, + "loss": 1.4584, + "mean_token_accuracy": 0.6488548169533411, + "num_tokens": 1261521560.0, + "step": 7522 + }, + { + "entropy": 1.6891414125760396, + "epoch": 0.8264535442586032, + "grad_norm": 0.6259676814079285, + "learning_rate": 1.3915718774875317e-05, + "loss": 1.4898, + "mean_token_accuracy": 0.632416253288587, + "num_tokens": 1261714723.0, + "step": 7523 + }, + { + "entropy": 1.7080417772134144, + "epoch": 0.8265634011699761, + "grad_norm": 0.7242565751075745, + "learning_rate": 1.3914203952349374e-05, + "loss": 1.3847, + "mean_token_accuracy": 0.6655266831318537, + "num_tokens": 1261850914.0, + "step": 7524 + }, + { + "entropy": 1.6633085012435913, + "epoch": 0.8266732580813491, + "grad_norm": 0.7950205206871033, + "learning_rate": 1.3912689037590189e-05, + "loss": 1.5695, + "mean_token_accuracy": 0.6606140186389288, + "num_tokens": 1261993356.0, + "step": 7525 + }, + { + "entropy": 1.7180237174034119, + "epoch": 0.826783114992722, + "grad_norm": 0.6543428301811218, + "learning_rate": 1.3911174030645705e-05, + "loss": 1.4543, + "mean_token_accuracy": 0.660189817349116, + "num_tokens": 1262170004.0, + "step": 7526 + }, + { + "entropy": 1.6681218047936757, + "epoch": 0.8268929719040949, + "grad_norm": 0.6747716069221497, + "learning_rate": 1.390965893156388e-05, + "loss": 1.3851, + "mean_token_accuracy": 0.650188018878301, + "num_tokens": 1262325407.0, + "step": 7527 + }, + { + "entropy": 1.69178702433904, + "epoch": 0.8270028288154678, + "grad_norm": 0.7375442385673523, + "learning_rate": 1.3908143740392657e-05, + "loss": 1.4235, + "mean_token_accuracy": 0.6644584635893503, + "num_tokens": 1262479364.0, + "step": 7528 + }, + { + "entropy": 1.7722974220911663, + "epoch": 0.8271126857268408, + "grad_norm": 0.720845103263855, + "learning_rate": 1.3906628457179994e-05, + "loss": 1.3381, + "mean_token_accuracy": 0.66179092725118, + "num_tokens": 1262577801.0, + "step": 7529 + }, + { + "entropy": 1.7099088231722515, + "epoch": 0.8272225426382137, + "grad_norm": 0.8065167665481567, + "learning_rate": 1.3905113081973854e-05, + "loss": 1.3695, + "mean_token_accuracy": 0.656641498208046, + "num_tokens": 1262761100.0, + "step": 7530 + }, + { + "entropy": 1.665325830380122, + "epoch": 0.8273323995495867, + "grad_norm": 0.7184627056121826, + "learning_rate": 1.390359761482219e-05, + "loss": 1.3796, + "mean_token_accuracy": 0.6679128209749857, + "num_tokens": 1262899915.0, + "step": 7531 + }, + { + "entropy": 1.7003116806348164, + "epoch": 0.8274422564609596, + "grad_norm": 0.7257794141769409, + "learning_rate": 1.390208205577297e-05, + "loss": 1.3098, + "mean_token_accuracy": 0.6723635445038477, + "num_tokens": 1263025397.0, + "step": 7532 + }, + { + "entropy": 1.6903511186440785, + "epoch": 0.8275521133723326, + "grad_norm": 0.7771725058555603, + "learning_rate": 1.3900566404874165e-05, + "loss": 1.3125, + "mean_token_accuracy": 0.6623782813549042, + "num_tokens": 1263172104.0, + "step": 7533 + }, + { + "entropy": 1.738791823387146, + "epoch": 0.8276619702837055, + "grad_norm": 0.6048402786254883, + "learning_rate": 1.3899050662173736e-05, + "loss": 1.3476, + "mean_token_accuracy": 0.6545044680436453, + "num_tokens": 1263335359.0, + "step": 7534 + }, + { + "entropy": 1.7750772138436635, + "epoch": 0.8277718271950784, + "grad_norm": 0.7775545120239258, + "learning_rate": 1.3897534827719663e-05, + "loss": 1.6391, + "mean_token_accuracy": 0.6482603698968887, + "num_tokens": 1263472462.0, + "step": 7535 + }, + { + "entropy": 1.7368451058864594, + "epoch": 0.8278816841064514, + "grad_norm": 0.7810788750648499, + "learning_rate": 1.3896018901559915e-05, + "loss": 1.5206, + "mean_token_accuracy": 0.6479092240333557, + "num_tokens": 1263615854.0, + "step": 7536 + }, + { + "entropy": 1.7175723016262054, + "epoch": 0.8279915410178242, + "grad_norm": 0.675439178943634, + "learning_rate": 1.389450288374248e-05, + "loss": 1.4315, + "mean_token_accuracy": 0.6542692532142004, + "num_tokens": 1263767613.0, + "step": 7537 + }, + { + "entropy": 1.6717278858025868, + "epoch": 0.8281013979291972, + "grad_norm": 0.8180225491523743, + "learning_rate": 1.3892986774315325e-05, + "loss": 1.2958, + "mean_token_accuracy": 0.6851775397857031, + "num_tokens": 1263942570.0, + "step": 7538 + }, + { + "entropy": 1.7367408871650696, + "epoch": 0.8282112548405701, + "grad_norm": 0.7362589240074158, + "learning_rate": 1.3891470573326446e-05, + "loss": 1.5338, + "mean_token_accuracy": 0.6389251748720804, + "num_tokens": 1264115819.0, + "step": 7539 + }, + { + "entropy": 1.7383471826712291, + "epoch": 0.8283211117519431, + "grad_norm": 0.6163108348846436, + "learning_rate": 1.3889954280823828e-05, + "loss": 1.4486, + "mean_token_accuracy": 0.6470398008823395, + "num_tokens": 1264290334.0, + "step": 7540 + }, + { + "entropy": 1.697384923696518, + "epoch": 0.828430968663316, + "grad_norm": 0.6724095344543457, + "learning_rate": 1.3888437896855456e-05, + "loss": 1.2777, + "mean_token_accuracy": 0.6817297836144766, + "num_tokens": 1264414382.0, + "step": 7541 + }, + { + "entropy": 1.6797985633214314, + "epoch": 0.828540825574689, + "grad_norm": 0.6427711844444275, + "learning_rate": 1.3886921421469329e-05, + "loss": 1.2919, + "mean_token_accuracy": 0.6724950323502222, + "num_tokens": 1264550399.0, + "step": 7542 + }, + { + "entropy": 1.7129569550355275, + "epoch": 0.8286506824860619, + "grad_norm": 0.6985865831375122, + "learning_rate": 1.3885404854713437e-05, + "loss": 1.2449, + "mean_token_accuracy": 0.6730537166198095, + "num_tokens": 1264690996.0, + "step": 7543 + }, + { + "entropy": 1.7042374809583027, + "epoch": 0.8287605393974349, + "grad_norm": 0.6594078540802002, + "learning_rate": 1.3883888196635785e-05, + "loss": 1.4256, + "mean_token_accuracy": 0.643621101975441, + "num_tokens": 1264927827.0, + "step": 7544 + }, + { + "entropy": 1.7009220818678539, + "epoch": 0.8288703963088078, + "grad_norm": 0.8057886362075806, + "learning_rate": 1.3882371447284369e-05, + "loss": 1.4477, + "mean_token_accuracy": 0.6611939668655396, + "num_tokens": 1265053926.0, + "step": 7545 + }, + { + "entropy": 1.6498080094655354, + "epoch": 0.8289802532201808, + "grad_norm": 0.6930307149887085, + "learning_rate": 1.3880854606707195e-05, + "loss": 1.4983, + "mean_token_accuracy": 0.6612697939078013, + "num_tokens": 1265206127.0, + "step": 7546 + }, + { + "entropy": 1.694092224041621, + "epoch": 0.8290901101315536, + "grad_norm": 0.6370811462402344, + "learning_rate": 1.3879337674952274e-05, + "loss": 1.3453, + "mean_token_accuracy": 0.6520874202251434, + "num_tokens": 1265360777.0, + "step": 7547 + }, + { + "entropy": 1.7223928372065227, + "epoch": 0.8291999670429265, + "grad_norm": 0.6785632967948914, + "learning_rate": 1.3877820652067609e-05, + "loss": 1.4052, + "mean_token_accuracy": 0.6524649461110433, + "num_tokens": 1265517050.0, + "step": 7548 + }, + { + "entropy": 1.7133225500583649, + "epoch": 0.8293098239542995, + "grad_norm": 0.6474944353103638, + "learning_rate": 1.3876303538101218e-05, + "loss": 1.3002, + "mean_token_accuracy": 0.6698809812466303, + "num_tokens": 1265665927.0, + "step": 7549 + }, + { + "entropy": 1.7734433313210805, + "epoch": 0.8294196808656724, + "grad_norm": 0.7001741528511047, + "learning_rate": 1.3874786333101117e-05, + "loss": 1.3274, + "mean_token_accuracy": 0.6608901371558508, + "num_tokens": 1265774674.0, + "step": 7550 + }, + { + "entropy": 1.633638968070348, + "epoch": 0.8295295377770454, + "grad_norm": 0.6309921741485596, + "learning_rate": 1.3873269037115325e-05, + "loss": 1.4088, + "mean_token_accuracy": 0.6554737389087677, + "num_tokens": 1265997082.0, + "step": 7551 + }, + { + "entropy": 1.677928477525711, + "epoch": 0.8296393946884183, + "grad_norm": 2.2038040161132812, + "learning_rate": 1.3871751650191861e-05, + "loss": 1.1266, + "mean_token_accuracy": 0.6781335373719534, + "num_tokens": 1266160041.0, + "step": 7552 + }, + { + "entropy": 1.7169209221998851, + "epoch": 0.8297492515997913, + "grad_norm": 0.6631376147270203, + "learning_rate": 1.387023417237875e-05, + "loss": 1.4185, + "mean_token_accuracy": 0.6488533268372217, + "num_tokens": 1266332720.0, + "step": 7553 + }, + { + "entropy": 1.698218435049057, + "epoch": 0.8298591085111642, + "grad_norm": 0.6376110911369324, + "learning_rate": 1.3868716603724024e-05, + "loss": 1.4032, + "mean_token_accuracy": 0.6554437925418218, + "num_tokens": 1266512357.0, + "step": 7554 + }, + { + "entropy": 1.7614865104357402, + "epoch": 0.8299689654225372, + "grad_norm": 0.6769449710845947, + "learning_rate": 1.386719894427571e-05, + "loss": 1.457, + "mean_token_accuracy": 0.640682727098465, + "num_tokens": 1266656391.0, + "step": 7555 + }, + { + "entropy": 1.6298018097877502, + "epoch": 0.8300788223339101, + "grad_norm": 0.6778908371925354, + "learning_rate": 1.386568119408184e-05, + "loss": 1.3012, + "mean_token_accuracy": 0.6756581912438074, + "num_tokens": 1266798263.0, + "step": 7556 + }, + { + "entropy": 1.725428541501363, + "epoch": 0.8301886792452831, + "grad_norm": 0.6709782481193542, + "learning_rate": 1.3864163353190453e-05, + "loss": 1.3906, + "mean_token_accuracy": 0.66632479429245, + "num_tokens": 1266924812.0, + "step": 7557 + }, + { + "entropy": 1.6883962154388428, + "epoch": 0.8302985361566559, + "grad_norm": 0.68055260181427, + "learning_rate": 1.3862645421649582e-05, + "loss": 1.3803, + "mean_token_accuracy": 0.6522450596094131, + "num_tokens": 1267073930.0, + "step": 7558 + }, + { + "entropy": 1.7249091962973278, + "epoch": 0.8304083930680289, + "grad_norm": 0.669360339641571, + "learning_rate": 1.386112739950728e-05, + "loss": 1.3908, + "mean_token_accuracy": 0.6687298119068146, + "num_tokens": 1267263523.0, + "step": 7559 + }, + { + "entropy": 1.7766032218933105, + "epoch": 0.8305182499794018, + "grad_norm": 0.8653247952461243, + "learning_rate": 1.3859609286811576e-05, + "loss": 1.4976, + "mean_token_accuracy": 0.6408978551626205, + "num_tokens": 1267408031.0, + "step": 7560 + }, + { + "entropy": 1.7056670884291332, + "epoch": 0.8306281068907748, + "grad_norm": 0.7807538509368896, + "learning_rate": 1.3858091083610537e-05, + "loss": 1.4544, + "mean_token_accuracy": 0.658196692665418, + "num_tokens": 1267607695.0, + "step": 7561 + }, + { + "entropy": 1.7037642896175385, + "epoch": 0.8307379638021477, + "grad_norm": 0.6015084981918335, + "learning_rate": 1.3856572789952197e-05, + "loss": 1.4094, + "mean_token_accuracy": 0.6588554928700129, + "num_tokens": 1267778360.0, + "step": 7562 + }, + { + "entropy": 1.677244524161021, + "epoch": 0.8308478207135206, + "grad_norm": 0.6811591982841492, + "learning_rate": 1.3855054405884619e-05, + "loss": 1.3449, + "mean_token_accuracy": 0.6656729827324549, + "num_tokens": 1267937117.0, + "step": 7563 + }, + { + "entropy": 1.6949976682662964, + "epoch": 0.8309576776248936, + "grad_norm": 0.8715330362319946, + "learning_rate": 1.385353593145585e-05, + "loss": 1.4748, + "mean_token_accuracy": 0.6535067111253738, + "num_tokens": 1268150134.0, + "step": 7564 + }, + { + "entropy": 1.742262860139211, + "epoch": 0.8310675345362665, + "grad_norm": 0.7902666330337524, + "learning_rate": 1.3852017366713962e-05, + "loss": 1.6062, + "mean_token_accuracy": 0.642037237683932, + "num_tokens": 1268318445.0, + "step": 7565 + }, + { + "entropy": 1.707626740137736, + "epoch": 0.8311773914476395, + "grad_norm": 0.7191417813301086, + "learning_rate": 1.3850498711707001e-05, + "loss": 1.3055, + "mean_token_accuracy": 0.6673828760782877, + "num_tokens": 1268422008.0, + "step": 7566 + }, + { + "entropy": 1.689712017774582, + "epoch": 0.8312872483590124, + "grad_norm": 0.6316739916801453, + "learning_rate": 1.3848979966483048e-05, + "loss": 1.5149, + "mean_token_accuracy": 0.6431446621815363, + "num_tokens": 1268691031.0, + "step": 7567 + }, + { + "entropy": 1.6952175498008728, + "epoch": 0.8313971052703854, + "grad_norm": 0.6542701125144958, + "learning_rate": 1.3847461131090159e-05, + "loss": 1.3766, + "mean_token_accuracy": 0.6535580505927404, + "num_tokens": 1268869666.0, + "step": 7568 + }, + { + "entropy": 1.6646797557671864, + "epoch": 0.8315069621817582, + "grad_norm": 0.7026156187057495, + "learning_rate": 1.3845942205576408e-05, + "loss": 1.3871, + "mean_token_accuracy": 0.6553316861391068, + "num_tokens": 1269053305.0, + "step": 7569 + }, + { + "entropy": 1.6775665481885274, + "epoch": 0.8316168190931312, + "grad_norm": 0.7035835981369019, + "learning_rate": 1.3844423189989868e-05, + "loss": 1.2915, + "mean_token_accuracy": 0.6790016442537308, + "num_tokens": 1269181271.0, + "step": 7570 + }, + { + "entropy": 1.6944467822710674, + "epoch": 0.8317266760045041, + "grad_norm": 0.6992266178131104, + "learning_rate": 1.3842904084378619e-05, + "loss": 1.2448, + "mean_token_accuracy": 0.6773143957058588, + "num_tokens": 1269291180.0, + "step": 7571 + }, + { + "entropy": 1.7388213972250621, + "epoch": 0.8318365329158771, + "grad_norm": 0.7684722542762756, + "learning_rate": 1.3841384888790734e-05, + "loss": 1.5707, + "mean_token_accuracy": 0.6360517491896948, + "num_tokens": 1269452126.0, + "step": 7572 + }, + { + "entropy": 1.749897877375285, + "epoch": 0.83194638982725, + "grad_norm": 0.6144039630889893, + "learning_rate": 1.38398656032743e-05, + "loss": 1.4208, + "mean_token_accuracy": 0.6549445589383444, + "num_tokens": 1269616125.0, + "step": 7573 + }, + { + "entropy": 1.7409840126832326, + "epoch": 0.832056246738623, + "grad_norm": 0.687271773815155, + "learning_rate": 1.3838346227877398e-05, + "loss": 1.3635, + "mean_token_accuracy": 0.6573161135117213, + "num_tokens": 1269735340.0, + "step": 7574 + }, + { + "entropy": 1.650204559167226, + "epoch": 0.8321661036499959, + "grad_norm": 0.6359143257141113, + "learning_rate": 1.3836826762648117e-05, + "loss": 1.3908, + "mean_token_accuracy": 0.6477119276920954, + "num_tokens": 1269901539.0, + "step": 7575 + }, + { + "entropy": 1.7002926965554555, + "epoch": 0.8322759605613688, + "grad_norm": 0.6047689914703369, + "learning_rate": 1.3835307207634545e-05, + "loss": 1.3467, + "mean_token_accuracy": 0.6575921724239985, + "num_tokens": 1270036658.0, + "step": 7576 + }, + { + "entropy": 1.699533224105835, + "epoch": 0.8323858174727418, + "grad_norm": 0.690969705581665, + "learning_rate": 1.3833787562884784e-05, + "loss": 1.3226, + "mean_token_accuracy": 0.6634237319231033, + "num_tokens": 1270178148.0, + "step": 7577 + }, + { + "entropy": 1.664102743069331, + "epoch": 0.8324956743841146, + "grad_norm": 0.663180947303772, + "learning_rate": 1.3832267828446914e-05, + "loss": 1.2973, + "mean_token_accuracy": 0.6731946070988973, + "num_tokens": 1270323103.0, + "step": 7578 + }, + { + "entropy": 1.6971265574296315, + "epoch": 0.8326055312954876, + "grad_norm": 0.9087331295013428, + "learning_rate": 1.383074800436905e-05, + "loss": 1.4288, + "mean_token_accuracy": 0.6556025594472885, + "num_tokens": 1270488548.0, + "step": 7579 + }, + { + "entropy": 1.6653367976347606, + "epoch": 0.8327153882068605, + "grad_norm": 0.6120437979698181, + "learning_rate": 1.3829228090699286e-05, + "loss": 1.4193, + "mean_token_accuracy": 0.6514114042123159, + "num_tokens": 1270694474.0, + "step": 7580 + }, + { + "entropy": 1.7614048818747203, + "epoch": 0.8328252451182335, + "grad_norm": 0.6478108763694763, + "learning_rate": 1.3827708087485727e-05, + "loss": 1.5128, + "mean_token_accuracy": 0.6446801622708639, + "num_tokens": 1270918826.0, + "step": 7581 + }, + { + "entropy": 1.7623259325822194, + "epoch": 0.8329351020296064, + "grad_norm": 0.6244803667068481, + "learning_rate": 1.3826187994776484e-05, + "loss": 1.3796, + "mean_token_accuracy": 0.6532119462887446, + "num_tokens": 1271092214.0, + "step": 7582 + }, + { + "entropy": 1.6713003118832905, + "epoch": 0.8330449589409794, + "grad_norm": 0.8507984280586243, + "learning_rate": 1.382466781261966e-05, + "loss": 1.3142, + "mean_token_accuracy": 0.6736680517594019, + "num_tokens": 1271207600.0, + "step": 7583 + }, + { + "entropy": 1.7317315141359966, + "epoch": 0.8331548158523523, + "grad_norm": 0.6434891819953918, + "learning_rate": 1.3823147541063376e-05, + "loss": 1.5426, + "mean_token_accuracy": 0.6401833097139994, + "num_tokens": 1271386603.0, + "step": 7584 + }, + { + "entropy": 1.7154659231503804, + "epoch": 0.8332646727637253, + "grad_norm": 0.6992514133453369, + "learning_rate": 1.3821627180155743e-05, + "loss": 1.4521, + "mean_token_accuracy": 0.6397911409536997, + "num_tokens": 1271565473.0, + "step": 7585 + }, + { + "entropy": 1.6124655902385712, + "epoch": 0.8333745296750982, + "grad_norm": 0.5322008728981018, + "learning_rate": 1.3820106729944882e-05, + "loss": 1.411, + "mean_token_accuracy": 0.6569860825935999, + "num_tokens": 1271800335.0, + "step": 7586 + }, + { + "entropy": 1.693232387304306, + "epoch": 0.8334843865864712, + "grad_norm": 0.6390825510025024, + "learning_rate": 1.3818586190478916e-05, + "loss": 1.5123, + "mean_token_accuracy": 0.6511318882306417, + "num_tokens": 1271963559.0, + "step": 7587 + }, + { + "entropy": 1.7264248430728912, + "epoch": 0.8335942434978441, + "grad_norm": 0.8136751651763916, + "learning_rate": 1.3817065561805962e-05, + "loss": 1.3956, + "mean_token_accuracy": 0.6575086663166682, + "num_tokens": 1272120985.0, + "step": 7588 + }, + { + "entropy": 1.7141720652580261, + "epoch": 0.8337041004092169, + "grad_norm": 0.8287689089775085, + "learning_rate": 1.3815544843974156e-05, + "loss": 1.4675, + "mean_token_accuracy": 0.661083827416102, + "num_tokens": 1272246029.0, + "step": 7589 + }, + { + "entropy": 1.712748219569524, + "epoch": 0.8338139573205899, + "grad_norm": 0.6662017703056335, + "learning_rate": 1.3814024037031624e-05, + "loss": 1.4793, + "mean_token_accuracy": 0.6412945588429769, + "num_tokens": 1272428836.0, + "step": 7590 + }, + { + "entropy": 1.694316158692042, + "epoch": 0.8339238142319628, + "grad_norm": 0.7065073251724243, + "learning_rate": 1.3812503141026497e-05, + "loss": 1.4966, + "mean_token_accuracy": 0.6386247078577677, + "num_tokens": 1272662052.0, + "step": 7591 + }, + { + "entropy": 1.6363686819871266, + "epoch": 0.8340336711433358, + "grad_norm": 0.7142933011054993, + "learning_rate": 1.3810982156006914e-05, + "loss": 1.2562, + "mean_token_accuracy": 0.6731893370548884, + "num_tokens": 1272810815.0, + "step": 7592 + }, + { + "entropy": 1.705398678779602, + "epoch": 0.8341435280547087, + "grad_norm": 0.7638614773750305, + "learning_rate": 1.3809461082021015e-05, + "loss": 1.3403, + "mean_token_accuracy": 0.6670798907677332, + "num_tokens": 1272950592.0, + "step": 7593 + }, + { + "entropy": 1.7166258593400319, + "epoch": 0.8342533849660817, + "grad_norm": 0.7492454648017883, + "learning_rate": 1.3807939919116935e-05, + "loss": 1.4619, + "mean_token_accuracy": 0.6594860553741455, + "num_tokens": 1273083671.0, + "step": 7594 + }, + { + "entropy": 1.7036760747432709, + "epoch": 0.8343632418774546, + "grad_norm": 0.6306270360946655, + "learning_rate": 1.3806418667342825e-05, + "loss": 1.4087, + "mean_token_accuracy": 0.6539557129144669, + "num_tokens": 1273249788.0, + "step": 7595 + }, + { + "entropy": 1.7021053830782573, + "epoch": 0.8344730987888276, + "grad_norm": 0.7657412886619568, + "learning_rate": 1.3804897326746826e-05, + "loss": 1.2697, + "mean_token_accuracy": 0.664386381705602, + "num_tokens": 1273375149.0, + "step": 7596 + }, + { + "entropy": 1.7494820058345795, + "epoch": 0.8345829557002005, + "grad_norm": 0.6962859630584717, + "learning_rate": 1.3803375897377091e-05, + "loss": 1.3636, + "mean_token_accuracy": 0.6591188112894694, + "num_tokens": 1273491406.0, + "step": 7597 + }, + { + "entropy": 1.727548082669576, + "epoch": 0.8346928126115735, + "grad_norm": 0.5763877034187317, + "learning_rate": 1.3801854379281772e-05, + "loss": 1.4143, + "mean_token_accuracy": 0.6519613862037659, + "num_tokens": 1273691352.0, + "step": 7598 + }, + { + "entropy": 1.7307646075884502, + "epoch": 0.8348026695229464, + "grad_norm": 0.6695159077644348, + "learning_rate": 1.3800332772509028e-05, + "loss": 1.3111, + "mean_token_accuracy": 0.6668645044167837, + "num_tokens": 1273843898.0, + "step": 7599 + }, + { + "entropy": 1.7181670566399891, + "epoch": 0.8349125264343193, + "grad_norm": 0.675504207611084, + "learning_rate": 1.3798811077107008e-05, + "loss": 1.3693, + "mean_token_accuracy": 0.6546612332264582, + "num_tokens": 1273988852.0, + "step": 7600 + }, + { + "entropy": 1.7171874046325684, + "epoch": 0.8350223833456922, + "grad_norm": 0.8511648774147034, + "learning_rate": 1.3797289293123884e-05, + "loss": 1.4898, + "mean_token_accuracy": 0.6442197859287262, + "num_tokens": 1274195068.0, + "step": 7601 + }, + { + "entropy": 1.7084606885910034, + "epoch": 0.8351322402570651, + "grad_norm": 0.6916182637214661, + "learning_rate": 1.379576742060781e-05, + "loss": 1.5412, + "mean_token_accuracy": 0.6330763747294744, + "num_tokens": 1274428299.0, + "step": 7602 + }, + { + "entropy": 1.7000287473201752, + "epoch": 0.8352420971684381, + "grad_norm": 0.7107706069946289, + "learning_rate": 1.379424545960696e-05, + "loss": 1.2805, + "mean_token_accuracy": 0.6748431076606115, + "num_tokens": 1274598715.0, + "step": 7603 + }, + { + "entropy": 1.6960639754931133, + "epoch": 0.835351954079811, + "grad_norm": 0.6091739535331726, + "learning_rate": 1.3792723410169498e-05, + "loss": 1.4367, + "mean_token_accuracy": 0.6490481595198313, + "num_tokens": 1274769825.0, + "step": 7604 + }, + { + "entropy": 1.6969983875751495, + "epoch": 0.835461810991184, + "grad_norm": 0.6014200448989868, + "learning_rate": 1.3791201272343602e-05, + "loss": 1.4189, + "mean_token_accuracy": 0.657557855049769, + "num_tokens": 1274963060.0, + "step": 7605 + }, + { + "entropy": 1.7252587974071503, + "epoch": 0.8355716679025569, + "grad_norm": 0.6048182249069214, + "learning_rate": 1.3789679046177438e-05, + "loss": 1.3069, + "mean_token_accuracy": 0.6655952880779902, + "num_tokens": 1275104494.0, + "step": 7606 + }, + { + "entropy": 1.7161117593447368, + "epoch": 0.8356815248139299, + "grad_norm": 0.7150284647941589, + "learning_rate": 1.3788156731719196e-05, + "loss": 1.3054, + "mean_token_accuracy": 0.6586286971966425, + "num_tokens": 1275219815.0, + "step": 7607 + }, + { + "entropy": 1.7322356899579365, + "epoch": 0.8357913817253028, + "grad_norm": 0.719291627407074, + "learning_rate": 1.3786634329017044e-05, + "loss": 1.4993, + "mean_token_accuracy": 0.6435506194829941, + "num_tokens": 1275361651.0, + "step": 7608 + }, + { + "entropy": 1.7530653476715088, + "epoch": 0.8359012386366758, + "grad_norm": 0.6821619868278503, + "learning_rate": 1.3785111838119174e-05, + "loss": 1.4983, + "mean_token_accuracy": 0.6553416550159454, + "num_tokens": 1275492520.0, + "step": 7609 + }, + { + "entropy": 1.691613495349884, + "epoch": 0.8360110955480486, + "grad_norm": 0.6883498430252075, + "learning_rate": 1.3783589259073766e-05, + "loss": 1.3471, + "mean_token_accuracy": 0.6675893068313599, + "num_tokens": 1275630876.0, + "step": 7610 + }, + { + "entropy": 1.7405516107877095, + "epoch": 0.8361209524594216, + "grad_norm": 0.7199444770812988, + "learning_rate": 1.3782066591929017e-05, + "loss": 1.2501, + "mean_token_accuracy": 0.6688971618811289, + "num_tokens": 1275742939.0, + "step": 7611 + }, + { + "entropy": 1.6530592640240986, + "epoch": 0.8362308093707945, + "grad_norm": 0.7949721813201904, + "learning_rate": 1.3780543836733112e-05, + "loss": 1.4701, + "mean_token_accuracy": 0.6404084165891012, + "num_tokens": 1275941141.0, + "step": 7612 + }, + { + "entropy": 1.7587747077147167, + "epoch": 0.8363406662821675, + "grad_norm": 0.7122784852981567, + "learning_rate": 1.3779020993534249e-05, + "loss": 1.3668, + "mean_token_accuracy": 0.6711858014265696, + "num_tokens": 1276087415.0, + "step": 7613 + }, + { + "entropy": 1.6879003842671711, + "epoch": 0.8364505231935404, + "grad_norm": 0.7349809408187866, + "learning_rate": 1.3777498062380622e-05, + "loss": 1.4567, + "mean_token_accuracy": 0.661365215977033, + "num_tokens": 1276239252.0, + "step": 7614 + }, + { + "entropy": 1.6400221586227417, + "epoch": 0.8365603801049134, + "grad_norm": 0.7023922204971313, + "learning_rate": 1.3775975043320433e-05, + "loss": 1.2416, + "mean_token_accuracy": 0.6837769548098246, + "num_tokens": 1276403009.0, + "step": 7615 + }, + { + "entropy": 1.7130279938379924, + "epoch": 0.8366702370162863, + "grad_norm": 0.7748481631278992, + "learning_rate": 1.3774451936401882e-05, + "loss": 1.4081, + "mean_token_accuracy": 0.670517255862554, + "num_tokens": 1276574324.0, + "step": 7616 + }, + { + "entropy": 1.762073000272115, + "epoch": 0.8367800939276592, + "grad_norm": 0.7048318386077881, + "learning_rate": 1.3772928741673184e-05, + "loss": 1.5452, + "mean_token_accuracy": 0.6333042333523432, + "num_tokens": 1276765168.0, + "step": 7617 + }, + { + "entropy": 1.6817299922307332, + "epoch": 0.8368899508390322, + "grad_norm": 0.6088959574699402, + "learning_rate": 1.3771405459182536e-05, + "loss": 1.3579, + "mean_token_accuracy": 0.6705379237731298, + "num_tokens": 1276951991.0, + "step": 7618 + }, + { + "entropy": 1.6863965789477031, + "epoch": 0.8369998077504051, + "grad_norm": 0.7418268918991089, + "learning_rate": 1.3769882088978154e-05, + "loss": 1.2244, + "mean_token_accuracy": 0.6815223594506582, + "num_tokens": 1277059043.0, + "step": 7619 + }, + { + "entropy": 1.6903445621331532, + "epoch": 0.837109664661778, + "grad_norm": 0.6564303636550903, + "learning_rate": 1.3768358631108254e-05, + "loss": 1.4957, + "mean_token_accuracy": 0.6512309859196345, + "num_tokens": 1277229644.0, + "step": 7620 + }, + { + "entropy": 1.7503305276234944, + "epoch": 0.8372195215731509, + "grad_norm": 0.6145588159561157, + "learning_rate": 1.376683508562105e-05, + "loss": 1.3732, + "mean_token_accuracy": 0.6656107902526855, + "num_tokens": 1277367439.0, + "step": 7621 + }, + { + "entropy": 1.6609105666478474, + "epoch": 0.8373293784845239, + "grad_norm": 0.635491669178009, + "learning_rate": 1.376531145256476e-05, + "loss": 1.3981, + "mean_token_accuracy": 0.6671904375155767, + "num_tokens": 1277528410.0, + "step": 7622 + }, + { + "entropy": 1.6996264060338337, + "epoch": 0.8374392353958968, + "grad_norm": 0.6683711409568787, + "learning_rate": 1.3763787731987614e-05, + "loss": 1.3574, + "mean_token_accuracy": 0.6552396714687347, + "num_tokens": 1277707936.0, + "step": 7623 + }, + { + "entropy": 1.7298993468284607, + "epoch": 0.8375490923072698, + "grad_norm": 0.7171658873558044, + "learning_rate": 1.3762263923937829e-05, + "loss": 1.3435, + "mean_token_accuracy": 0.6661288539568583, + "num_tokens": 1277857298.0, + "step": 7624 + }, + { + "entropy": 1.6921402116616566, + "epoch": 0.8376589492186427, + "grad_norm": 0.6446428894996643, + "learning_rate": 1.3760740028463632e-05, + "loss": 1.3402, + "mean_token_accuracy": 0.6615449984868368, + "num_tokens": 1277989822.0, + "step": 7625 + }, + { + "entropy": 1.6879879732926686, + "epoch": 0.8377688061300157, + "grad_norm": 0.6671029925346375, + "learning_rate": 1.3759216045613262e-05, + "loss": 1.3044, + "mean_token_accuracy": 0.6745457847913107, + "num_tokens": 1278178157.0, + "step": 7626 + }, + { + "entropy": 1.6387710173924763, + "epoch": 0.8378786630413886, + "grad_norm": 0.5973528027534485, + "learning_rate": 1.3757691975434949e-05, + "loss": 1.4271, + "mean_token_accuracy": 0.6483223338921865, + "num_tokens": 1278350974.0, + "step": 7627 + }, + { + "entropy": 1.726877639691035, + "epoch": 0.8379885199527616, + "grad_norm": 0.7956101894378662, + "learning_rate": 1.375616781797692e-05, + "loss": 1.3057, + "mean_token_accuracy": 0.6639814128478368, + "num_tokens": 1278481024.0, + "step": 7628 + }, + { + "entropy": 1.6962241232395172, + "epoch": 0.8380983768641345, + "grad_norm": 0.7772151827812195, + "learning_rate": 1.3754643573287428e-05, + "loss": 1.3155, + "mean_token_accuracy": 0.6763526697953542, + "num_tokens": 1278635659.0, + "step": 7629 + }, + { + "entropy": 1.7717609802881877, + "epoch": 0.8382082337755073, + "grad_norm": 0.6711469292640686, + "learning_rate": 1.3753119241414706e-05, + "loss": 1.3777, + "mean_token_accuracy": 0.6482406208912531, + "num_tokens": 1278798975.0, + "step": 7630 + }, + { + "entropy": 1.7128020922342937, + "epoch": 0.8383180906868803, + "grad_norm": 0.6757957339286804, + "learning_rate": 1.3751594822407e-05, + "loss": 1.3464, + "mean_token_accuracy": 0.6667918612559637, + "num_tokens": 1278927300.0, + "step": 7631 + }, + { + "entropy": 1.7163095275561016, + "epoch": 0.8384279475982532, + "grad_norm": 0.6230423450469971, + "learning_rate": 1.3750070316312559e-05, + "loss": 1.4484, + "mean_token_accuracy": 0.6352711419264475, + "num_tokens": 1279138435.0, + "step": 7632 + }, + { + "entropy": 1.7445255815982819, + "epoch": 0.8385378045096262, + "grad_norm": 0.6208248734474182, + "learning_rate": 1.374854572317963e-05, + "loss": 1.594, + "mean_token_accuracy": 0.6226391047239304, + "num_tokens": 1279332164.0, + "step": 7633 + }, + { + "entropy": 1.6468991041183472, + "epoch": 0.8386476614209991, + "grad_norm": 0.7001860737800598, + "learning_rate": 1.3747021043056468e-05, + "loss": 1.4056, + "mean_token_accuracy": 0.674930676817894, + "num_tokens": 1279498135.0, + "step": 7634 + }, + { + "entropy": 1.7082558274269104, + "epoch": 0.8387575183323721, + "grad_norm": 0.6932383179664612, + "learning_rate": 1.3745496275991328e-05, + "loss": 1.329, + "mean_token_accuracy": 0.6606937795877457, + "num_tokens": 1279642711.0, + "step": 7635 + }, + { + "entropy": 1.7408236265182495, + "epoch": 0.838867375243745, + "grad_norm": 0.6592848896980286, + "learning_rate": 1.374397142203247e-05, + "loss": 1.4983, + "mean_token_accuracy": 0.6435133467117945, + "num_tokens": 1279837041.0, + "step": 7636 + }, + { + "entropy": 1.6659258703390758, + "epoch": 0.838977232155118, + "grad_norm": 0.7573028802871704, + "learning_rate": 1.3742446481228149e-05, + "loss": 1.5325, + "mean_token_accuracy": 0.6281823118527731, + "num_tokens": 1280043404.0, + "step": 7637 + }, + { + "entropy": 1.7001129885514576, + "epoch": 0.8390870890664909, + "grad_norm": 0.7068085670471191, + "learning_rate": 1.3740921453626635e-05, + "loss": 1.4459, + "mean_token_accuracy": 0.6530873229106268, + "num_tokens": 1280220340.0, + "step": 7638 + }, + { + "entropy": 1.7399956981341045, + "epoch": 0.8391969459778639, + "grad_norm": 0.7076330184936523, + "learning_rate": 1.3739396339276194e-05, + "loss": 1.5227, + "mean_token_accuracy": 0.6424537748098373, + "num_tokens": 1280364296.0, + "step": 7639 + }, + { + "entropy": 1.653020828962326, + "epoch": 0.8393068028892368, + "grad_norm": 0.7728797793388367, + "learning_rate": 1.373787113822509e-05, + "loss": 1.3846, + "mean_token_accuracy": 0.6617040187120438, + "num_tokens": 1280503851.0, + "step": 7640 + }, + { + "entropy": 1.7436749835809071, + "epoch": 0.8394166598006098, + "grad_norm": 0.7593557238578796, + "learning_rate": 1.3736345850521602e-05, + "loss": 1.4094, + "mean_token_accuracy": 0.6662583450476328, + "num_tokens": 1280648876.0, + "step": 7641 + }, + { + "entropy": 1.7310488323370616, + "epoch": 0.8395265167119826, + "grad_norm": 0.6699831485748291, + "learning_rate": 1.3734820476213997e-05, + "loss": 1.3641, + "mean_token_accuracy": 0.6698733866214752, + "num_tokens": 1280785864.0, + "step": 7642 + }, + { + "entropy": 1.6108634571234386, + "epoch": 0.8396363736233555, + "grad_norm": 0.667095959186554, + "learning_rate": 1.3733295015350557e-05, + "loss": 1.2481, + "mean_token_accuracy": 0.6830354034900665, + "num_tokens": 1280910220.0, + "step": 7643 + }, + { + "entropy": 1.8118035594622295, + "epoch": 0.8397462305347285, + "grad_norm": 0.7681687474250793, + "learning_rate": 1.373176946797956e-05, + "loss": 1.476, + "mean_token_accuracy": 0.6538631469011307, + "num_tokens": 1281025428.0, + "step": 7644 + }, + { + "entropy": 1.7167300780614216, + "epoch": 0.8398560874461014, + "grad_norm": 0.5978860259056091, + "learning_rate": 1.3730243834149295e-05, + "loss": 1.5872, + "mean_token_accuracy": 0.6373479117949804, + "num_tokens": 1281203179.0, + "step": 7645 + }, + { + "entropy": 1.613272448380788, + "epoch": 0.8399659443574744, + "grad_norm": 0.657454252243042, + "learning_rate": 1.3728718113908039e-05, + "loss": 1.3732, + "mean_token_accuracy": 0.666471059123675, + "num_tokens": 1281375107.0, + "step": 7646 + }, + { + "entropy": 1.699168860912323, + "epoch": 0.8400758012688473, + "grad_norm": 0.6724218726158142, + "learning_rate": 1.3727192307304085e-05, + "loss": 1.3107, + "mean_token_accuracy": 0.6698317726453146, + "num_tokens": 1281502914.0, + "step": 7647 + }, + { + "entropy": 1.6800562342007954, + "epoch": 0.8401856581802203, + "grad_norm": 0.6762789487838745, + "learning_rate": 1.3725666414385723e-05, + "loss": 1.3332, + "mean_token_accuracy": 0.6533271272977194, + "num_tokens": 1281663636.0, + "step": 7648 + }, + { + "entropy": 1.774364064137141, + "epoch": 0.8402955150915932, + "grad_norm": 0.7857850193977356, + "learning_rate": 1.372414043520125e-05, + "loss": 1.4153, + "mean_token_accuracy": 0.6500428368647894, + "num_tokens": 1281789745.0, + "step": 7649 + }, + { + "entropy": 1.7148446440696716, + "epoch": 0.8404053720029662, + "grad_norm": 0.650869607925415, + "learning_rate": 1.3722614369798957e-05, + "loss": 1.439, + "mean_token_accuracy": 0.6369368185599645, + "num_tokens": 1282005721.0, + "step": 7650 + }, + { + "entropy": 1.6923074920972188, + "epoch": 0.840515228914339, + "grad_norm": 0.7095004916191101, + "learning_rate": 1.3721088218227148e-05, + "loss": 1.3425, + "mean_token_accuracy": 0.6514080464839935, + "num_tokens": 1282166997.0, + "step": 7651 + }, + { + "entropy": 1.6772983868916829, + "epoch": 0.840625085825712, + "grad_norm": 0.6236726641654968, + "learning_rate": 1.3719561980534122e-05, + "loss": 1.4042, + "mean_token_accuracy": 0.6637339144945145, + "num_tokens": 1282356185.0, + "step": 7652 + }, + { + "entropy": 1.7177359561125438, + "epoch": 0.8407349427370849, + "grad_norm": 0.7458381652832031, + "learning_rate": 1.3718035656768182e-05, + "loss": 1.4507, + "mean_token_accuracy": 0.6659137606620789, + "num_tokens": 1282520253.0, + "step": 7653 + }, + { + "entropy": 1.6357039312521617, + "epoch": 0.8408447996484579, + "grad_norm": 0.5765164494514465, + "learning_rate": 1.3716509246977643e-05, + "loss": 1.4195, + "mean_token_accuracy": 0.6570479621489843, + "num_tokens": 1282709467.0, + "step": 7654 + }, + { + "entropy": 1.7260218759377797, + "epoch": 0.8409546565598308, + "grad_norm": 0.7507497668266296, + "learning_rate": 1.3714982751210808e-05, + "loss": 1.314, + "mean_token_accuracy": 0.6629079331954321, + "num_tokens": 1282831662.0, + "step": 7655 + }, + { + "entropy": 1.675975243250529, + "epoch": 0.8410645134712038, + "grad_norm": 0.7367669343948364, + "learning_rate": 1.371345616951599e-05, + "loss": 1.2233, + "mean_token_accuracy": 0.6800348659356436, + "num_tokens": 1282976248.0, + "step": 7656 + }, + { + "entropy": 1.7002997398376465, + "epoch": 0.8411743703825767, + "grad_norm": 0.6870225071907043, + "learning_rate": 1.3711929501941512e-05, + "loss": 1.3712, + "mean_token_accuracy": 0.6616632044315338, + "num_tokens": 1283105621.0, + "step": 7657 + }, + { + "entropy": 1.710231105486552, + "epoch": 0.8412842272939496, + "grad_norm": 0.6416940093040466, + "learning_rate": 1.3710402748535688e-05, + "loss": 1.3102, + "mean_token_accuracy": 0.6693031589190165, + "num_tokens": 1283251344.0, + "step": 7658 + }, + { + "entropy": 1.6835933824380238, + "epoch": 0.8413940842053226, + "grad_norm": 0.6878907680511475, + "learning_rate": 1.3708875909346832e-05, + "loss": 1.4185, + "mean_token_accuracy": 0.6552811364332835, + "num_tokens": 1283435304.0, + "step": 7659 + }, + { + "entropy": 1.671914945046107, + "epoch": 0.8415039411166955, + "grad_norm": 0.6930204033851624, + "learning_rate": 1.3707348984423277e-05, + "loss": 1.3017, + "mean_token_accuracy": 0.6702569822470347, + "num_tokens": 1283566399.0, + "step": 7660 + }, + { + "entropy": 1.6549534698327382, + "epoch": 0.8416137980280685, + "grad_norm": 0.6953391432762146, + "learning_rate": 1.3705821973813352e-05, + "loss": 1.4282, + "mean_token_accuracy": 0.6581354439258575, + "num_tokens": 1283720803.0, + "step": 7661 + }, + { + "entropy": 1.735606461763382, + "epoch": 0.8417236549394413, + "grad_norm": 0.8534516096115112, + "learning_rate": 1.3704294877565372e-05, + "loss": 1.3774, + "mean_token_accuracy": 0.6662740260362625, + "num_tokens": 1283849961.0, + "step": 7662 + }, + { + "entropy": 1.7239322364330292, + "epoch": 0.8418335118508143, + "grad_norm": 0.6426288485527039, + "learning_rate": 1.3702767695727684e-05, + "loss": 1.4996, + "mean_token_accuracy": 0.6409449676672617, + "num_tokens": 1284040809.0, + "step": 7663 + }, + { + "entropy": 1.6683675050735474, + "epoch": 0.8419433687621872, + "grad_norm": 0.7720414400100708, + "learning_rate": 1.3701240428348612e-05, + "loss": 1.482, + "mean_token_accuracy": 0.6555820008118948, + "num_tokens": 1284206147.0, + "step": 7664 + }, + { + "entropy": 1.7035513420899708, + "epoch": 0.8420532256735602, + "grad_norm": 0.5820039510726929, + "learning_rate": 1.36997130754765e-05, + "loss": 1.421, + "mean_token_accuracy": 0.6452493071556091, + "num_tokens": 1284370393.0, + "step": 7665 + }, + { + "entropy": 1.681229054927826, + "epoch": 0.8421630825849331, + "grad_norm": 0.7429522275924683, + "learning_rate": 1.3698185637159682e-05, + "loss": 1.235, + "mean_token_accuracy": 0.6775188346703848, + "num_tokens": 1284493127.0, + "step": 7666 + }, + { + "entropy": 1.7138707240422566, + "epoch": 0.8422729394963061, + "grad_norm": 0.5457119345664978, + "learning_rate": 1.369665811344651e-05, + "loss": 1.4761, + "mean_token_accuracy": 0.6490335464477539, + "num_tokens": 1284771528.0, + "step": 7667 + }, + { + "entropy": 1.6924820840358734, + "epoch": 0.842382796407679, + "grad_norm": 0.6924734115600586, + "learning_rate": 1.369513050438532e-05, + "loss": 1.3606, + "mean_token_accuracy": 0.6642278035481771, + "num_tokens": 1284923425.0, + "step": 7668 + }, + { + "entropy": 1.6925741334756215, + "epoch": 0.842492653319052, + "grad_norm": 0.6529973745346069, + "learning_rate": 1.3693602810024466e-05, + "loss": 1.2482, + "mean_token_accuracy": 0.6726948221524557, + "num_tokens": 1285060828.0, + "step": 7669 + }, + { + "entropy": 1.6858268876870472, + "epoch": 0.8426025102304249, + "grad_norm": 0.649381160736084, + "learning_rate": 1.3692075030412295e-05, + "loss": 1.462, + "mean_token_accuracy": 0.6515221893787384, + "num_tokens": 1285247826.0, + "step": 7670 + }, + { + "entropy": 1.7777485251426697, + "epoch": 0.8427123671417978, + "grad_norm": 0.713453471660614, + "learning_rate": 1.3690547165597166e-05, + "loss": 1.4854, + "mean_token_accuracy": 0.643087034424146, + "num_tokens": 1285378746.0, + "step": 7671 + }, + { + "entropy": 1.6306644082069397, + "epoch": 0.8428222240531708, + "grad_norm": 0.6652552485466003, + "learning_rate": 1.3689019215627428e-05, + "loss": 1.3156, + "mean_token_accuracy": 0.671681821346283, + "num_tokens": 1285560412.0, + "step": 7672 + }, + { + "entropy": 1.7075538237889607, + "epoch": 0.8429320809645436, + "grad_norm": 0.7357656359672546, + "learning_rate": 1.3687491180551447e-05, + "loss": 1.4037, + "mean_token_accuracy": 0.6523735970258713, + "num_tokens": 1285702229.0, + "step": 7673 + }, + { + "entropy": 1.7711325983206432, + "epoch": 0.8430419378759166, + "grad_norm": 0.686625599861145, + "learning_rate": 1.3685963060417576e-05, + "loss": 1.4509, + "mean_token_accuracy": 0.6421075165271759, + "num_tokens": 1285900255.0, + "step": 7674 + }, + { + "entropy": 1.6846754550933838, + "epoch": 0.8431517947872895, + "grad_norm": 0.7092203497886658, + "learning_rate": 1.3684434855274189e-05, + "loss": 1.2795, + "mean_token_accuracy": 0.6742515216271082, + "num_tokens": 1286027859.0, + "step": 7675 + }, + { + "entropy": 1.6374373237291973, + "epoch": 0.8432616516986625, + "grad_norm": 0.6417721509933472, + "learning_rate": 1.3682906565169646e-05, + "loss": 1.3225, + "mean_token_accuracy": 0.6675249536832174, + "num_tokens": 1286181159.0, + "step": 7676 + }, + { + "entropy": 1.6539149185021718, + "epoch": 0.8433715086100354, + "grad_norm": 0.6134423017501831, + "learning_rate": 1.3681378190152321e-05, + "loss": 1.4416, + "mean_token_accuracy": 0.6583320200443268, + "num_tokens": 1286380359.0, + "step": 7677 + }, + { + "entropy": 1.7651469906171162, + "epoch": 0.8434813655214084, + "grad_norm": 0.6425126791000366, + "learning_rate": 1.3679849730270582e-05, + "loss": 1.4183, + "mean_token_accuracy": 0.6462546785672506, + "num_tokens": 1286545480.0, + "step": 7678 + }, + { + "entropy": 1.6872264842192333, + "epoch": 0.8435912224327813, + "grad_norm": 0.6594815254211426, + "learning_rate": 1.367832118557281e-05, + "loss": 1.3546, + "mean_token_accuracy": 0.6645703117052714, + "num_tokens": 1286686590.0, + "step": 7679 + }, + { + "entropy": 1.7343334058920543, + "epoch": 0.8437010793441543, + "grad_norm": 0.7362040877342224, + "learning_rate": 1.3676792556107376e-05, + "loss": 1.3422, + "mean_token_accuracy": 0.667659322420756, + "num_tokens": 1286859906.0, + "step": 7680 + }, + { + "entropy": 1.7099198997020721, + "epoch": 0.8438109362555272, + "grad_norm": 0.6804381608963013, + "learning_rate": 1.3675263841922665e-05, + "loss": 1.643, + "mean_token_accuracy": 0.6239674588044485, + "num_tokens": 1287079553.0, + "step": 7681 + }, + { + "entropy": 1.7008231182893117, + "epoch": 0.8439207931669002, + "grad_norm": 0.7834773063659668, + "learning_rate": 1.367373504306706e-05, + "loss": 1.3961, + "mean_token_accuracy": 0.6471186677614847, + "num_tokens": 1287215979.0, + "step": 7682 + }, + { + "entropy": 1.7217775185902913, + "epoch": 0.844030650078273, + "grad_norm": 0.6311613917350769, + "learning_rate": 1.3672206159588945e-05, + "loss": 1.4119, + "mean_token_accuracy": 0.6476258685191473, + "num_tokens": 1287372294.0, + "step": 7683 + }, + { + "entropy": 1.7093331813812256, + "epoch": 0.8441405069896459, + "grad_norm": 2.1464595794677734, + "learning_rate": 1.3670677191536707e-05, + "loss": 1.2492, + "mean_token_accuracy": 0.664307658871015, + "num_tokens": 1287584672.0, + "step": 7684 + }, + { + "entropy": 1.7259198725223541, + "epoch": 0.8442503639010189, + "grad_norm": 0.6909459829330444, + "learning_rate": 1.3669148138958744e-05, + "loss": 1.4728, + "mean_token_accuracy": 0.6467719525098801, + "num_tokens": 1287755964.0, + "step": 7685 + }, + { + "entropy": 1.72645503282547, + "epoch": 0.8443602208123918, + "grad_norm": 0.6276677846908569, + "learning_rate": 1.3667619001903442e-05, + "loss": 1.4365, + "mean_token_accuracy": 0.6519429683685303, + "num_tokens": 1287962476.0, + "step": 7686 + }, + { + "entropy": 1.7283104161421459, + "epoch": 0.8444700777237648, + "grad_norm": 0.7658132314682007, + "learning_rate": 1.3666089780419201e-05, + "loss": 1.497, + "mean_token_accuracy": 0.6409247318903605, + "num_tokens": 1288111416.0, + "step": 7687 + }, + { + "entropy": 1.6543226341406505, + "epoch": 0.8445799346351377, + "grad_norm": 0.686872124671936, + "learning_rate": 1.3664560474554419e-05, + "loss": 1.4009, + "mean_token_accuracy": 0.655271073182424, + "num_tokens": 1288308554.0, + "step": 7688 + }, + { + "entropy": 1.6488666733105977, + "epoch": 0.8446897915465107, + "grad_norm": 0.6725640296936035, + "learning_rate": 1.3663031084357501e-05, + "loss": 1.3845, + "mean_token_accuracy": 0.658675899108251, + "num_tokens": 1288486606.0, + "step": 7689 + }, + { + "entropy": 1.7219010492165883, + "epoch": 0.8447996484578836, + "grad_norm": 0.6540157794952393, + "learning_rate": 1.3661501609876847e-05, + "loss": 1.3808, + "mean_token_accuracy": 0.6512012432018915, + "num_tokens": 1288658497.0, + "step": 7690 + }, + { + "entropy": 1.7627086639404297, + "epoch": 0.8449095053692566, + "grad_norm": 0.7566828727722168, + "learning_rate": 1.3659972051160868e-05, + "loss": 1.4124, + "mean_token_accuracy": 0.6677108506361643, + "num_tokens": 1288811315.0, + "step": 7691 + }, + { + "entropy": 1.7626505196094513, + "epoch": 0.8450193622806295, + "grad_norm": 0.6381642818450928, + "learning_rate": 1.3658442408257972e-05, + "loss": 1.4573, + "mean_token_accuracy": 0.6391281684239706, + "num_tokens": 1289035418.0, + "step": 7692 + }, + { + "entropy": 1.731861154238383, + "epoch": 0.8451292191920025, + "grad_norm": 0.6614934802055359, + "learning_rate": 1.365691268121657e-05, + "loss": 1.4221, + "mean_token_accuracy": 0.6379889895518621, + "num_tokens": 1289211468.0, + "step": 7693 + }, + { + "entropy": 1.737843285004298, + "epoch": 0.8452390761033753, + "grad_norm": 0.6590113043785095, + "learning_rate": 1.3655382870085078e-05, + "loss": 1.4666, + "mean_token_accuracy": 0.6468397031227747, + "num_tokens": 1289389121.0, + "step": 7694 + }, + { + "entropy": 1.723410467306773, + "epoch": 0.8453489330147483, + "grad_norm": 0.7802287936210632, + "learning_rate": 1.3653852974911919e-05, + "loss": 1.4251, + "mean_token_accuracy": 0.6476560135682424, + "num_tokens": 1289559256.0, + "step": 7695 + }, + { + "entropy": 1.7518675525983174, + "epoch": 0.8454587899261212, + "grad_norm": 0.7318578958511353, + "learning_rate": 1.3652322995745504e-05, + "loss": 1.2606, + "mean_token_accuracy": 0.6652724295854568, + "num_tokens": 1289658783.0, + "step": 7696 + }, + { + "entropy": 1.749257892370224, + "epoch": 0.8455686468374941, + "grad_norm": 0.7955240607261658, + "learning_rate": 1.3650792932634268e-05, + "loss": 1.2613, + "mean_token_accuracy": 0.6822487364212672, + "num_tokens": 1289795148.0, + "step": 7697 + }, + { + "entropy": 1.7356181144714355, + "epoch": 0.8456785037488671, + "grad_norm": 0.7357754707336426, + "learning_rate": 1.3649262785626624e-05, + "loss": 1.5575, + "mean_token_accuracy": 0.6548448453346888, + "num_tokens": 1289948148.0, + "step": 7698 + }, + { + "entropy": 1.7077325284481049, + "epoch": 0.84578836066024, + "grad_norm": 0.7098826169967651, + "learning_rate": 1.3647732554771009e-05, + "loss": 1.502, + "mean_token_accuracy": 0.6426471124092737, + "num_tokens": 1290138416.0, + "step": 7699 + }, + { + "entropy": 1.735207627216975, + "epoch": 0.845898217571613, + "grad_norm": 0.6340279579162598, + "learning_rate": 1.3646202240115852e-05, + "loss": 1.3897, + "mean_token_accuracy": 0.6579069246848425, + "num_tokens": 1290340173.0, + "step": 7700 + }, + { + "entropy": 1.6785156230131786, + "epoch": 0.8460080744829859, + "grad_norm": 0.8312824964523315, + "learning_rate": 1.3644671841709586e-05, + "loss": 1.2704, + "mean_token_accuracy": 0.6747389038403829, + "num_tokens": 1290456610.0, + "step": 7701 + }, + { + "entropy": 1.6480493446191151, + "epoch": 0.8461179313943589, + "grad_norm": 0.6698850989341736, + "learning_rate": 1.3643141359600647e-05, + "loss": 1.267, + "mean_token_accuracy": 0.6753464639186859, + "num_tokens": 1290626267.0, + "step": 7702 + }, + { + "entropy": 1.7196992834409077, + "epoch": 0.8462277883057318, + "grad_norm": 0.7034914493560791, + "learning_rate": 1.3641610793837478e-05, + "loss": 1.4121, + "mean_token_accuracy": 0.6590729554494222, + "num_tokens": 1290772213.0, + "step": 7703 + }, + { + "entropy": 1.7197033961613972, + "epoch": 0.8463376452171048, + "grad_norm": 0.6949339509010315, + "learning_rate": 1.3640080144468515e-05, + "loss": 1.3447, + "mean_token_accuracy": 0.6591121902068456, + "num_tokens": 1290899259.0, + "step": 7704 + }, + { + "entropy": 1.7233955065409343, + "epoch": 0.8464475021284776, + "grad_norm": 0.7986577749252319, + "learning_rate": 1.3638549411542205e-05, + "loss": 1.4605, + "mean_token_accuracy": 0.660988504687945, + "num_tokens": 1291113194.0, + "step": 7705 + }, + { + "entropy": 1.6643975575764973, + "epoch": 0.8465573590398506, + "grad_norm": 0.6427856683731079, + "learning_rate": 1.3637018595106996e-05, + "loss": 1.4165, + "mean_token_accuracy": 0.6534582326809565, + "num_tokens": 1291263837.0, + "step": 7706 + }, + { + "entropy": 1.7111331125100453, + "epoch": 0.8466672159512235, + "grad_norm": 0.7272197008132935, + "learning_rate": 1.3635487695211337e-05, + "loss": 1.3851, + "mean_token_accuracy": 0.6659311503171921, + "num_tokens": 1291412707.0, + "step": 7707 + }, + { + "entropy": 1.7255754868189495, + "epoch": 0.8467770728625965, + "grad_norm": 0.6733617782592773, + "learning_rate": 1.3633956711903682e-05, + "loss": 1.4219, + "mean_token_accuracy": 0.6492577840884527, + "num_tokens": 1291572306.0, + "step": 7708 + }, + { + "entropy": 1.7116366227467854, + "epoch": 0.8468869297739694, + "grad_norm": 0.6349593997001648, + "learning_rate": 1.363242564523248e-05, + "loss": 1.4648, + "mean_token_accuracy": 0.6612848242123922, + "num_tokens": 1291738907.0, + "step": 7709 + }, + { + "entropy": 1.782869964838028, + "epoch": 0.8469967866853424, + "grad_norm": 0.8094156384468079, + "learning_rate": 1.3630894495246194e-05, + "loss": 1.3299, + "mean_token_accuracy": 0.6685720980167389, + "num_tokens": 1291866880.0, + "step": 7710 + }, + { + "entropy": 1.6088589231173198, + "epoch": 0.8471066435967153, + "grad_norm": 1.1662250757217407, + "learning_rate": 1.3629363261993285e-05, + "loss": 1.2702, + "mean_token_accuracy": 0.6650574405988058, + "num_tokens": 1292039473.0, + "step": 7711 + }, + { + "entropy": 1.7531055708726246, + "epoch": 0.8472165005080882, + "grad_norm": 0.680927038192749, + "learning_rate": 1.362783194552221e-05, + "loss": 1.4834, + "mean_token_accuracy": 0.6375831713279089, + "num_tokens": 1292229376.0, + "step": 7712 + }, + { + "entropy": 1.694258709748586, + "epoch": 0.8473263574194612, + "grad_norm": 0.7578868865966797, + "learning_rate": 1.3626300545881442e-05, + "loss": 1.2226, + "mean_token_accuracy": 0.6796438743670782, + "num_tokens": 1292349842.0, + "step": 7713 + }, + { + "entropy": 1.6762607991695404, + "epoch": 0.847436214330834, + "grad_norm": 0.6741758584976196, + "learning_rate": 1.362476906311944e-05, + "loss": 1.4122, + "mean_token_accuracy": 0.6557339429855347, + "num_tokens": 1292545481.0, + "step": 7714 + }, + { + "entropy": 1.7098297476768494, + "epoch": 0.847546071242207, + "grad_norm": 0.6798667907714844, + "learning_rate": 1.3623237497284683e-05, + "loss": 1.4471, + "mean_token_accuracy": 0.6456852555274963, + "num_tokens": 1292734742.0, + "step": 7715 + }, + { + "entropy": 1.7433710793654125, + "epoch": 0.8476559281535799, + "grad_norm": 0.7019221186637878, + "learning_rate": 1.3621705848425641e-05, + "loss": 1.4745, + "mean_token_accuracy": 0.6478450198968252, + "num_tokens": 1292893188.0, + "step": 7716 + }, + { + "entropy": 1.731772820154826, + "epoch": 0.8477657850649529, + "grad_norm": 0.7611411213874817, + "learning_rate": 1.3620174116590791e-05, + "loss": 1.3669, + "mean_token_accuracy": 0.6490048070748647, + "num_tokens": 1293055710.0, + "step": 7717 + }, + { + "entropy": 1.7181940376758575, + "epoch": 0.8478756419763258, + "grad_norm": 0.8934732675552368, + "learning_rate": 1.361864230182861e-05, + "loss": 1.5196, + "mean_token_accuracy": 0.6444053202867508, + "num_tokens": 1293253973.0, + "step": 7718 + }, + { + "entropy": 1.685392697652181, + "epoch": 0.8479854988876988, + "grad_norm": 0.6858906149864197, + "learning_rate": 1.361711040418758e-05, + "loss": 1.2893, + "mean_token_accuracy": 0.6633900205294291, + "num_tokens": 1293387605.0, + "step": 7719 + }, + { + "entropy": 1.713352640469869, + "epoch": 0.8480953557990717, + "grad_norm": 0.6775051355361938, + "learning_rate": 1.3615578423716187e-05, + "loss": 1.513, + "mean_token_accuracy": 0.6740926851828893, + "num_tokens": 1293564698.0, + "step": 7720 + }, + { + "entropy": 1.767835130294164, + "epoch": 0.8482052127104447, + "grad_norm": 0.6613144278526306, + "learning_rate": 1.3614046360462912e-05, + "loss": 1.5051, + "mean_token_accuracy": 0.6474483261505762, + "num_tokens": 1293727382.0, + "step": 7721 + }, + { + "entropy": 1.7436497310797374, + "epoch": 0.8483150696218176, + "grad_norm": 0.6383576989173889, + "learning_rate": 1.3612514214476249e-05, + "loss": 1.2954, + "mean_token_accuracy": 0.6786330391963323, + "num_tokens": 1293878593.0, + "step": 7722 + }, + { + "entropy": 1.7230326632658641, + "epoch": 0.8484249265331906, + "grad_norm": 0.7083463668823242, + "learning_rate": 1.361098198580469e-05, + "loss": 1.3808, + "mean_token_accuracy": 0.6482375711202621, + "num_tokens": 1294025781.0, + "step": 7723 + }, + { + "entropy": 1.7545043329397838, + "epoch": 0.8485347834445635, + "grad_norm": 0.5937564969062805, + "learning_rate": 1.3609449674496726e-05, + "loss": 1.501, + "mean_token_accuracy": 0.6370914578437805, + "num_tokens": 1294227008.0, + "step": 7724 + }, + { + "entropy": 1.700508326292038, + "epoch": 0.8486446403559363, + "grad_norm": 0.8746032118797302, + "learning_rate": 1.3607917280600855e-05, + "loss": 1.3259, + "mean_token_accuracy": 0.6724284738302231, + "num_tokens": 1294393254.0, + "step": 7725 + }, + { + "entropy": 1.661549021800359, + "epoch": 0.8487544972673093, + "grad_norm": 0.7372915744781494, + "learning_rate": 1.360638480416558e-05, + "loss": 1.4659, + "mean_token_accuracy": 0.6515500744183859, + "num_tokens": 1294571064.0, + "step": 7726 + }, + { + "entropy": 1.7442650695641835, + "epoch": 0.8488643541786822, + "grad_norm": 0.6306323409080505, + "learning_rate": 1.3604852245239397e-05, + "loss": 1.5477, + "mean_token_accuracy": 0.6327639867862066, + "num_tokens": 1294744598.0, + "step": 7727 + }, + { + "entropy": 1.712537129720052, + "epoch": 0.8489742110900552, + "grad_norm": 0.7366087436676025, + "learning_rate": 1.3603319603870818e-05, + "loss": 1.3154, + "mean_token_accuracy": 0.67984339594841, + "num_tokens": 1294886218.0, + "step": 7728 + }, + { + "entropy": 1.7154980301856995, + "epoch": 0.8490840680014281, + "grad_norm": 0.8025618195533752, + "learning_rate": 1.3601786880108343e-05, + "loss": 1.5105, + "mean_token_accuracy": 0.6657672872145971, + "num_tokens": 1295017541.0, + "step": 7729 + }, + { + "entropy": 1.711490790049235, + "epoch": 0.8491939249128011, + "grad_norm": 0.6536463499069214, + "learning_rate": 1.3600254074000488e-05, + "loss": 1.4221, + "mean_token_accuracy": 0.6637669056653976, + "num_tokens": 1295227688.0, + "step": 7730 + }, + { + "entropy": 1.6762659549713135, + "epoch": 0.849303781824174, + "grad_norm": 0.7155306339263916, + "learning_rate": 1.359872118559576e-05, + "loss": 1.3234, + "mean_token_accuracy": 0.6701004455486933, + "num_tokens": 1295379203.0, + "step": 7731 + }, + { + "entropy": 1.6928722262382507, + "epoch": 0.849413638735547, + "grad_norm": 0.7518654465675354, + "learning_rate": 1.359718821494268e-05, + "loss": 1.5937, + "mean_token_accuracy": 0.6596247951189677, + "num_tokens": 1295542003.0, + "step": 7732 + }, + { + "entropy": 1.6721904973189037, + "epoch": 0.8495234956469199, + "grad_norm": 0.7368571758270264, + "learning_rate": 1.3595655162089763e-05, + "loss": 1.4228, + "mean_token_accuracy": 0.6764175544182459, + "num_tokens": 1295695454.0, + "step": 7733 + }, + { + "entropy": 1.7590387463569641, + "epoch": 0.8496333525582929, + "grad_norm": 0.7963206768035889, + "learning_rate": 1.359412202708553e-05, + "loss": 1.4675, + "mean_token_accuracy": 0.647629976272583, + "num_tokens": 1295827314.0, + "step": 7734 + }, + { + "entropy": 1.731603890657425, + "epoch": 0.8497432094696658, + "grad_norm": 0.6758211851119995, + "learning_rate": 1.3592588809978506e-05, + "loss": 1.3838, + "mean_token_accuracy": 0.659120092789332, + "num_tokens": 1295996986.0, + "step": 7735 + }, + { + "entropy": 1.6921556492646534, + "epoch": 0.8498530663810387, + "grad_norm": 0.5895377993583679, + "learning_rate": 1.3591055510817213e-05, + "loss": 1.3931, + "mean_token_accuracy": 0.652939553062121, + "num_tokens": 1296164645.0, + "step": 7736 + }, + { + "entropy": 1.6915649970372517, + "epoch": 0.8499629232924116, + "grad_norm": 0.7645225524902344, + "learning_rate": 1.358952212965018e-05, + "loss": 1.3265, + "mean_token_accuracy": 0.6632872621218363, + "num_tokens": 1296327910.0, + "step": 7737 + }, + { + "entropy": 1.7159571647644043, + "epoch": 0.8500727802037845, + "grad_norm": 0.7446976900100708, + "learning_rate": 1.3587988666525935e-05, + "loss": 1.4285, + "mean_token_accuracy": 0.6510045429070791, + "num_tokens": 1296466186.0, + "step": 7738 + }, + { + "entropy": 1.6824923356374104, + "epoch": 0.8501826371151575, + "grad_norm": 0.6190294027328491, + "learning_rate": 1.358645512149302e-05, + "loss": 1.4375, + "mean_token_accuracy": 0.6470278948545456, + "num_tokens": 1296637794.0, + "step": 7739 + }, + { + "entropy": 1.7388030588626862, + "epoch": 0.8502924940265304, + "grad_norm": 0.7785733938217163, + "learning_rate": 1.3584921494599963e-05, + "loss": 1.3741, + "mean_token_accuracy": 0.657695472240448, + "num_tokens": 1296760052.0, + "step": 7740 + }, + { + "entropy": 1.7327676912148793, + "epoch": 0.8504023509379034, + "grad_norm": 0.7766647338867188, + "learning_rate": 1.3583387785895307e-05, + "loss": 1.2975, + "mean_token_accuracy": 0.673372263709704, + "num_tokens": 1296869236.0, + "step": 7741 + }, + { + "entropy": 1.718622773885727, + "epoch": 0.8505122078492763, + "grad_norm": 0.690539538860321, + "learning_rate": 1.3581853995427591e-05, + "loss": 1.3776, + "mean_token_accuracy": 0.6660670936107635, + "num_tokens": 1297011769.0, + "step": 7742 + }, + { + "entropy": 1.6976758639017742, + "epoch": 0.8506220647606493, + "grad_norm": 0.6688826084136963, + "learning_rate": 1.3580320123245361e-05, + "loss": 1.3505, + "mean_token_accuracy": 0.6525122026602427, + "num_tokens": 1297178554.0, + "step": 7743 + }, + { + "entropy": 1.691978245973587, + "epoch": 0.8507319216720222, + "grad_norm": 0.6021746397018433, + "learning_rate": 1.3578786169397158e-05, + "loss": 1.3599, + "mean_token_accuracy": 0.6527169843514761, + "num_tokens": 1297364819.0, + "step": 7744 + }, + { + "entropy": 1.6742048561573029, + "epoch": 0.8508417785833952, + "grad_norm": 0.8681425452232361, + "learning_rate": 1.357725213393154e-05, + "loss": 1.2843, + "mean_token_accuracy": 0.6764674683411916, + "num_tokens": 1297516573.0, + "step": 7745 + }, + { + "entropy": 1.7442771196365356, + "epoch": 0.850951635494768, + "grad_norm": 0.6213224530220032, + "learning_rate": 1.3575718016897046e-05, + "loss": 1.4259, + "mean_token_accuracy": 0.6503079384565353, + "num_tokens": 1297672956.0, + "step": 7746 + }, + { + "entropy": 1.7606963614622753, + "epoch": 0.851061492406141, + "grad_norm": 0.7436356544494629, + "learning_rate": 1.3574183818342245e-05, + "loss": 1.3349, + "mean_token_accuracy": 0.6708455085754395, + "num_tokens": 1297818809.0, + "step": 7747 + }, + { + "entropy": 1.672513614098231, + "epoch": 0.8511713493175139, + "grad_norm": 0.73287034034729, + "learning_rate": 1.3572649538315683e-05, + "loss": 1.3939, + "mean_token_accuracy": 0.6704998711744944, + "num_tokens": 1297980662.0, + "step": 7748 + }, + { + "entropy": 1.7294066945711772, + "epoch": 0.8512812062288869, + "grad_norm": 0.6251292824745178, + "learning_rate": 1.3571115176865923e-05, + "loss": 1.542, + "mean_token_accuracy": 0.643743579586347, + "num_tokens": 1298143653.0, + "step": 7749 + }, + { + "entropy": 1.6557518442471821, + "epoch": 0.8513910631402598, + "grad_norm": 0.6958547830581665, + "learning_rate": 1.3569580734041524e-05, + "loss": 1.3905, + "mean_token_accuracy": 0.6798481444517771, + "num_tokens": 1298331907.0, + "step": 7750 + }, + { + "entropy": 1.6850103636582692, + "epoch": 0.8515009200516328, + "grad_norm": 0.7102126479148865, + "learning_rate": 1.3568046209891055e-05, + "loss": 1.2097, + "mean_token_accuracy": 0.6820806463559469, + "num_tokens": 1298488338.0, + "step": 7751 + }, + { + "entropy": 1.7033019761244457, + "epoch": 0.8516107769630057, + "grad_norm": 0.639173686504364, + "learning_rate": 1.356651160446308e-05, + "loss": 1.4144, + "mean_token_accuracy": 0.6469381103912989, + "num_tokens": 1298684159.0, + "step": 7752 + }, + { + "entropy": 1.7016167442003887, + "epoch": 0.8517206338743786, + "grad_norm": 0.9110562801361084, + "learning_rate": 1.356497691780617e-05, + "loss": 1.7517, + "mean_token_accuracy": 0.6370747834444046, + "num_tokens": 1298844311.0, + "step": 7753 + }, + { + "entropy": 1.6738309760888417, + "epoch": 0.8518304907857516, + "grad_norm": 0.7459472417831421, + "learning_rate": 1.3563442149968896e-05, + "loss": 1.3617, + "mean_token_accuracy": 0.6658263305823008, + "num_tokens": 1298991771.0, + "step": 7754 + }, + { + "entropy": 1.6745908459027607, + "epoch": 0.8519403476971245, + "grad_norm": 0.8173218369483948, + "learning_rate": 1.356190730099983e-05, + "loss": 1.3996, + "mean_token_accuracy": 0.6685936997334162, + "num_tokens": 1299145840.0, + "step": 7755 + }, + { + "entropy": 1.7424539625644684, + "epoch": 0.8520502046084975, + "grad_norm": 0.6466085314750671, + "learning_rate": 1.3560372370947557e-05, + "loss": 1.3801, + "mean_token_accuracy": 0.6733126441637675, + "num_tokens": 1299315556.0, + "step": 7756 + }, + { + "entropy": 1.6781065960725148, + "epoch": 0.8521600615198703, + "grad_norm": 0.6531357765197754, + "learning_rate": 1.3558837359860651e-05, + "loss": 1.3956, + "mean_token_accuracy": 0.6521165718634924, + "num_tokens": 1299491636.0, + "step": 7757 + }, + { + "entropy": 1.7538845141728718, + "epoch": 0.8522699184312433, + "grad_norm": 0.6810640692710876, + "learning_rate": 1.3557302267787691e-05, + "loss": 1.5143, + "mean_token_accuracy": 0.6484838575124741, + "num_tokens": 1299682671.0, + "step": 7758 + }, + { + "entropy": 1.734057645003001, + "epoch": 0.8523797753426162, + "grad_norm": 0.6916408538818359, + "learning_rate": 1.3555767094777272e-05, + "loss": 1.3975, + "mean_token_accuracy": 0.6555085331201553, + "num_tokens": 1299884491.0, + "step": 7759 + }, + { + "entropy": 1.7141314844290416, + "epoch": 0.8524896322539892, + "grad_norm": 0.6095522046089172, + "learning_rate": 1.3554231840877973e-05, + "loss": 1.3404, + "mean_token_accuracy": 0.653201217452685, + "num_tokens": 1300048240.0, + "step": 7760 + }, + { + "entropy": 1.73670361439387, + "epoch": 0.8525994891653621, + "grad_norm": 0.615277111530304, + "learning_rate": 1.355269650613839e-05, + "loss": 1.3983, + "mean_token_accuracy": 0.653611977895101, + "num_tokens": 1300238035.0, + "step": 7761 + }, + { + "entropy": 1.7244941194852192, + "epoch": 0.8527093460767351, + "grad_norm": 0.689967155456543, + "learning_rate": 1.3551161090607113e-05, + "loss": 1.3408, + "mean_token_accuracy": 0.668989489475886, + "num_tokens": 1300408112.0, + "step": 7762 + }, + { + "entropy": 1.7187703053156536, + "epoch": 0.852819202988108, + "grad_norm": 0.7365146279335022, + "learning_rate": 1.3549625594332734e-05, + "loss": 1.4606, + "mean_token_accuracy": 0.6453435768683752, + "num_tokens": 1300635927.0, + "step": 7763 + }, + { + "entropy": 1.7673336962858837, + "epoch": 0.852929059899481, + "grad_norm": 0.7960333824157715, + "learning_rate": 1.3548090017363853e-05, + "loss": 1.3389, + "mean_token_accuracy": 0.658059557278951, + "num_tokens": 1300768798.0, + "step": 7764 + }, + { + "entropy": 1.7308455010255177, + "epoch": 0.8530389168108539, + "grad_norm": 0.6843191385269165, + "learning_rate": 1.3546554359749078e-05, + "loss": 1.3368, + "mean_token_accuracy": 0.6665947139263153, + "num_tokens": 1300927812.0, + "step": 7765 + }, + { + "entropy": 1.708117683728536, + "epoch": 0.8531487737222267, + "grad_norm": 0.7319220900535583, + "learning_rate": 1.3545018621537e-05, + "loss": 1.4025, + "mean_token_accuracy": 0.6578193108240763, + "num_tokens": 1301110237.0, + "step": 7766 + }, + { + "entropy": 1.7092965046564739, + "epoch": 0.8532586306335997, + "grad_norm": 0.7056390047073364, + "learning_rate": 1.354348280277623e-05, + "loss": 1.4762, + "mean_token_accuracy": 0.645611047744751, + "num_tokens": 1301280209.0, + "step": 7767 + }, + { + "entropy": 1.6814217766125996, + "epoch": 0.8533684875449726, + "grad_norm": 0.6106694936752319, + "learning_rate": 1.3541946903515373e-05, + "loss": 1.4101, + "mean_token_accuracy": 0.6508079369862875, + "num_tokens": 1301471786.0, + "step": 7768 + }, + { + "entropy": 1.7410283883412678, + "epoch": 0.8534783444563456, + "grad_norm": 0.5932704210281372, + "learning_rate": 1.3540410923803047e-05, + "loss": 1.322, + "mean_token_accuracy": 0.65825983385245, + "num_tokens": 1301608122.0, + "step": 7769 + }, + { + "entropy": 1.654642830292384, + "epoch": 0.8535882013677185, + "grad_norm": 0.6340963840484619, + "learning_rate": 1.3538874863687857e-05, + "loss": 1.3904, + "mean_token_accuracy": 0.6750276188055674, + "num_tokens": 1301774068.0, + "step": 7770 + }, + { + "entropy": 1.8286733229955037, + "epoch": 0.8536980582790915, + "grad_norm": 0.8947479128837585, + "learning_rate": 1.353733872321842e-05, + "loss": 1.4883, + "mean_token_accuracy": 0.6447364389896393, + "num_tokens": 1301942077.0, + "step": 7771 + }, + { + "entropy": 1.734379122654597, + "epoch": 0.8538079151904644, + "grad_norm": 0.7082586884498596, + "learning_rate": 1.3535802502443358e-05, + "loss": 1.424, + "mean_token_accuracy": 0.6567316949367523, + "num_tokens": 1302111046.0, + "step": 7772 + }, + { + "entropy": 1.7233747939268749, + "epoch": 0.8539177721018374, + "grad_norm": 0.7988469004631042, + "learning_rate": 1.353426620141129e-05, + "loss": 1.2831, + "mean_token_accuracy": 0.6733775039513906, + "num_tokens": 1302216832.0, + "step": 7773 + }, + { + "entropy": 1.6604902148246765, + "epoch": 0.8540276290132103, + "grad_norm": 0.780096173286438, + "learning_rate": 1.3532729820170835e-05, + "loss": 1.2723, + "mean_token_accuracy": 0.6720686207214991, + "num_tokens": 1302337836.0, + "step": 7774 + }, + { + "entropy": 1.6743205388387044, + "epoch": 0.8541374859245833, + "grad_norm": 0.9057300090789795, + "learning_rate": 1.353119335877063e-05, + "loss": 1.4191, + "mean_token_accuracy": 0.6672864605983099, + "num_tokens": 1302489417.0, + "step": 7775 + }, + { + "entropy": 1.6898978352546692, + "epoch": 0.8542473428359562, + "grad_norm": 0.7002508044242859, + "learning_rate": 1.3529656817259287e-05, + "loss": 1.4303, + "mean_token_accuracy": 0.6622383644183477, + "num_tokens": 1302686661.0, + "step": 7776 + }, + { + "entropy": 1.6237229605515797, + "epoch": 0.8543571997473292, + "grad_norm": 0.7121983170509338, + "learning_rate": 1.3528120195685451e-05, + "loss": 1.3871, + "mean_token_accuracy": 0.6706487536430359, + "num_tokens": 1302863425.0, + "step": 7777 + }, + { + "entropy": 1.7381801307201385, + "epoch": 0.854467056658702, + "grad_norm": 0.7030956149101257, + "learning_rate": 1.3526583494097749e-05, + "loss": 1.3846, + "mean_token_accuracy": 0.6603255172570547, + "num_tokens": 1303005875.0, + "step": 7778 + }, + { + "entropy": 1.7050376236438751, + "epoch": 0.8545769135700749, + "grad_norm": 0.737881600856781, + "learning_rate": 1.3525046712544818e-05, + "loss": 1.4434, + "mean_token_accuracy": 0.650563602646192, + "num_tokens": 1303199958.0, + "step": 7779 + }, + { + "entropy": 1.65973166624705, + "epoch": 0.8546867704814479, + "grad_norm": 0.8631945848464966, + "learning_rate": 1.3523509851075293e-05, + "loss": 1.3929, + "mean_token_accuracy": 0.6690235982338587, + "num_tokens": 1303349109.0, + "step": 7780 + }, + { + "entropy": 1.7056522568066914, + "epoch": 0.8547966273928208, + "grad_norm": 0.6538403630256653, + "learning_rate": 1.3521972909737824e-05, + "loss": 1.4684, + "mean_token_accuracy": 0.6520558893680573, + "num_tokens": 1303526369.0, + "step": 7781 + }, + { + "entropy": 1.664311518271764, + "epoch": 0.8549064843041938, + "grad_norm": 0.5570957064628601, + "learning_rate": 1.3520435888581044e-05, + "loss": 1.3669, + "mean_token_accuracy": 0.6615254829327265, + "num_tokens": 1303777805.0, + "step": 7782 + }, + { + "entropy": 1.7221255699793498, + "epoch": 0.8550163412155667, + "grad_norm": 0.6258386969566345, + "learning_rate": 1.351889878765361e-05, + "loss": 1.3535, + "mean_token_accuracy": 0.6568758289019266, + "num_tokens": 1303942723.0, + "step": 7783 + }, + { + "entropy": 1.716312845547994, + "epoch": 0.8551261981269397, + "grad_norm": 0.6711044907569885, + "learning_rate": 1.3517361607004158e-05, + "loss": 1.3727, + "mean_token_accuracy": 0.6513389696677526, + "num_tokens": 1304117546.0, + "step": 7784 + }, + { + "entropy": 1.7806439300378163, + "epoch": 0.8552360550383126, + "grad_norm": 0.7212101817131042, + "learning_rate": 1.3515824346681348e-05, + "loss": 1.4373, + "mean_token_accuracy": 0.6462565610806147, + "num_tokens": 1304261647.0, + "step": 7785 + }, + { + "entropy": 1.5951051115989685, + "epoch": 0.8553459119496856, + "grad_norm": 0.6845982074737549, + "learning_rate": 1.351428700673383e-05, + "loss": 1.2047, + "mean_token_accuracy": 0.6924866537253062, + "num_tokens": 1304411452.0, + "step": 7786 + }, + { + "entropy": 1.709506740172704, + "epoch": 0.8554557688610585, + "grad_norm": 0.5833786129951477, + "learning_rate": 1.3512749587210264e-05, + "loss": 1.3802, + "mean_token_accuracy": 0.6559178431828817, + "num_tokens": 1304591192.0, + "step": 7787 + }, + { + "entropy": 1.6945497194925945, + "epoch": 0.8555656257724314, + "grad_norm": 0.7337885499000549, + "learning_rate": 1.3511212088159302e-05, + "loss": 1.426, + "mean_token_accuracy": 0.6539691934982935, + "num_tokens": 1304787587.0, + "step": 7788 + }, + { + "entropy": 1.7732653816541035, + "epoch": 0.8556754826838043, + "grad_norm": 0.7243953347206116, + "learning_rate": 1.3509674509629612e-05, + "loss": 1.5344, + "mean_token_accuracy": 0.6324852307637533, + "num_tokens": 1304969243.0, + "step": 7789 + }, + { + "entropy": 1.7157978514830272, + "epoch": 0.8557853395951773, + "grad_norm": 0.6849737763404846, + "learning_rate": 1.3508136851669853e-05, + "loss": 1.3162, + "mean_token_accuracy": 0.6636256823937098, + "num_tokens": 1305132204.0, + "step": 7790 + }, + { + "entropy": 1.6959392031033833, + "epoch": 0.8558951965065502, + "grad_norm": 0.65585857629776, + "learning_rate": 1.3506599114328695e-05, + "loss": 1.262, + "mean_token_accuracy": 0.6834282577037811, + "num_tokens": 1305276090.0, + "step": 7791 + }, + { + "entropy": 1.6683409810066223, + "epoch": 0.8560050534179231, + "grad_norm": 0.7357686758041382, + "learning_rate": 1.35050612976548e-05, + "loss": 1.5049, + "mean_token_accuracy": 0.6626105507214864, + "num_tokens": 1305470955.0, + "step": 7792 + }, + { + "entropy": 1.6899990141391754, + "epoch": 0.8561149103292961, + "grad_norm": 0.7392531633377075, + "learning_rate": 1.3503523401696849e-05, + "loss": 1.4496, + "mean_token_accuracy": 0.6497561434904734, + "num_tokens": 1305631140.0, + "step": 7793 + }, + { + "entropy": 1.6974543333053589, + "epoch": 0.856224767240669, + "grad_norm": 0.6308239102363586, + "learning_rate": 1.3501985426503508e-05, + "loss": 1.3712, + "mean_token_accuracy": 0.6657331734895706, + "num_tokens": 1305789255.0, + "step": 7794 + }, + { + "entropy": 1.6489758292833965, + "epoch": 0.856334624152042, + "grad_norm": 0.5670278668403625, + "learning_rate": 1.3500447372123455e-05, + "loss": 1.4481, + "mean_token_accuracy": 0.6598306248585383, + "num_tokens": 1305971505.0, + "step": 7795 + }, + { + "entropy": 1.640490214029948, + "epoch": 0.8564444810634149, + "grad_norm": 0.7669674754142761, + "learning_rate": 1.3498909238605371e-05, + "loss": 1.2363, + "mean_token_accuracy": 0.68501316010952, + "num_tokens": 1306085158.0, + "step": 7796 + }, + { + "entropy": 1.697544554869334, + "epoch": 0.8565543379747879, + "grad_norm": 0.7241058349609375, + "learning_rate": 1.3497371025997938e-05, + "loss": 1.3508, + "mean_token_accuracy": 0.6685928404331207, + "num_tokens": 1306257026.0, + "step": 7797 + }, + { + "entropy": 1.694200575351715, + "epoch": 0.8566641948861607, + "grad_norm": 0.7969628572463989, + "learning_rate": 1.3495832734349831e-05, + "loss": 1.3376, + "mean_token_accuracy": 0.6603265057007471, + "num_tokens": 1306376894.0, + "step": 7798 + }, + { + "entropy": 1.7276227374871571, + "epoch": 0.8567740517975337, + "grad_norm": 0.7449766397476196, + "learning_rate": 1.3494294363709746e-05, + "loss": 1.4243, + "mean_token_accuracy": 0.6657720059156418, + "num_tokens": 1306503412.0, + "step": 7799 + }, + { + "entropy": 1.6635934511820476, + "epoch": 0.8568839087089066, + "grad_norm": 0.7158152461051941, + "learning_rate": 1.349275591412637e-05, + "loss": 1.3208, + "mean_token_accuracy": 0.6728008190790812, + "num_tokens": 1306649874.0, + "step": 7800 + }, + { + "entropy": 1.7274696131547291, + "epoch": 0.8569937656202796, + "grad_norm": 0.7230932116508484, + "learning_rate": 1.3491217385648392e-05, + "loss": 1.5066, + "mean_token_accuracy": 0.6431404302517573, + "num_tokens": 1306837527.0, + "step": 7801 + }, + { + "entropy": 1.7124834557374318, + "epoch": 0.8571036225316525, + "grad_norm": 0.6679414510726929, + "learning_rate": 1.3489678778324501e-05, + "loss": 1.343, + "mean_token_accuracy": 0.660656655828158, + "num_tokens": 1306992011.0, + "step": 7802 + }, + { + "entropy": 1.730410397052765, + "epoch": 0.8572134794430255, + "grad_norm": 0.676726758480072, + "learning_rate": 1.3488140092203405e-05, + "loss": 1.4971, + "mean_token_accuracy": 0.6543196365237236, + "num_tokens": 1307160616.0, + "step": 7803 + }, + { + "entropy": 1.6937486827373505, + "epoch": 0.8573233363543984, + "grad_norm": 0.8226978182792664, + "learning_rate": 1.3486601327333795e-05, + "loss": 1.3941, + "mean_token_accuracy": 0.6554071108500162, + "num_tokens": 1307301087.0, + "step": 7804 + }, + { + "entropy": 1.6323599517345428, + "epoch": 0.8574331932657714, + "grad_norm": 0.7059171199798584, + "learning_rate": 1.3485062483764372e-05, + "loss": 1.3001, + "mean_token_accuracy": 0.6734863370656967, + "num_tokens": 1307429854.0, + "step": 7805 + }, + { + "entropy": 1.6826396882534027, + "epoch": 0.8575430501771443, + "grad_norm": 0.6876824498176575, + "learning_rate": 1.3483523561543842e-05, + "loss": 1.4278, + "mean_token_accuracy": 0.6498329937458038, + "num_tokens": 1307587225.0, + "step": 7806 + }, + { + "entropy": 1.7240065733591716, + "epoch": 0.8576529070885172, + "grad_norm": 0.7715162634849548, + "learning_rate": 1.348198456072091e-05, + "loss": 1.4212, + "mean_token_accuracy": 0.6564840972423553, + "num_tokens": 1307748629.0, + "step": 7807 + }, + { + "entropy": 1.7205755809942882, + "epoch": 0.8577627639998902, + "grad_norm": 0.68276047706604, + "learning_rate": 1.3480445481344282e-05, + "loss": 1.467, + "mean_token_accuracy": 0.6505002329746882, + "num_tokens": 1307912570.0, + "step": 7808 + }, + { + "entropy": 1.754823088645935, + "epoch": 0.857872620911263, + "grad_norm": 0.6763650178909302, + "learning_rate": 1.3478906323462677e-05, + "loss": 1.5182, + "mean_token_accuracy": 0.6330088227987289, + "num_tokens": 1308180580.0, + "step": 7809 + }, + { + "entropy": 1.722066303094228, + "epoch": 0.857982477822636, + "grad_norm": 0.7227879166603088, + "learning_rate": 1.3477367087124801e-05, + "loss": 1.3975, + "mean_token_accuracy": 0.6556669274965922, + "num_tokens": 1308325653.0, + "step": 7810 + }, + { + "entropy": 1.6812674701213837, + "epoch": 0.8580923347340089, + "grad_norm": 0.5938608050346375, + "learning_rate": 1.3475827772379374e-05, + "loss": 1.421, + "mean_token_accuracy": 0.6548441002766291, + "num_tokens": 1308530259.0, + "step": 7811 + }, + { + "entropy": 1.7212364772955577, + "epoch": 0.8582021916453819, + "grad_norm": 0.7832656502723694, + "learning_rate": 1.3474288379275116e-05, + "loss": 1.375, + "mean_token_accuracy": 0.6673020124435425, + "num_tokens": 1308688364.0, + "step": 7812 + }, + { + "entropy": 1.6957217554251354, + "epoch": 0.8583120485567548, + "grad_norm": 0.8879761695861816, + "learning_rate": 1.3472748907860745e-05, + "loss": 1.3826, + "mean_token_accuracy": 0.6552250782648722, + "num_tokens": 1308826808.0, + "step": 7813 + }, + { + "entropy": 1.6911301414171855, + "epoch": 0.8584219054681278, + "grad_norm": 0.7592836022377014, + "learning_rate": 1.347120935818498e-05, + "loss": 1.3259, + "mean_token_accuracy": 0.6610475679238638, + "num_tokens": 1308965132.0, + "step": 7814 + }, + { + "entropy": 1.6509084304173787, + "epoch": 0.8585317623795007, + "grad_norm": 0.6882309317588806, + "learning_rate": 1.3469669730296558e-05, + "loss": 1.5456, + "mean_token_accuracy": 0.6430366585652033, + "num_tokens": 1309185918.0, + "step": 7815 + }, + { + "entropy": 1.6446500718593597, + "epoch": 0.8586416192908737, + "grad_norm": 0.7358518838882446, + "learning_rate": 1.34681300242442e-05, + "loss": 1.277, + "mean_token_accuracy": 0.6738730818033218, + "num_tokens": 1309312422.0, + "step": 7816 + }, + { + "entropy": 1.7559974590937297, + "epoch": 0.8587514762022466, + "grad_norm": 0.682422399520874, + "learning_rate": 1.346659024007664e-05, + "loss": 1.3794, + "mean_token_accuracy": 0.6634985208511353, + "num_tokens": 1309441403.0, + "step": 7817 + }, + { + "entropy": 1.663654625415802, + "epoch": 0.8588613331136196, + "grad_norm": 0.6666421890258789, + "learning_rate": 1.3465050377842608e-05, + "loss": 1.4686, + "mean_token_accuracy": 0.659173255165418, + "num_tokens": 1309669132.0, + "step": 7818 + }, + { + "entropy": 1.6507892608642578, + "epoch": 0.8589711900249924, + "grad_norm": 0.6008228659629822, + "learning_rate": 1.3463510437590846e-05, + "loss": 1.4239, + "mean_token_accuracy": 0.6523040185372034, + "num_tokens": 1309852500.0, + "step": 7819 + }, + { + "entropy": 1.7078345616658528, + "epoch": 0.8590810469363653, + "grad_norm": 0.6348268985748291, + "learning_rate": 1.3461970419370083e-05, + "loss": 1.4027, + "mean_token_accuracy": 0.6630667001008987, + "num_tokens": 1310008578.0, + "step": 7820 + }, + { + "entropy": 1.7023847003777821, + "epoch": 0.8591909038477383, + "grad_norm": 0.8338757157325745, + "learning_rate": 1.3460430323229071e-05, + "loss": 1.3093, + "mean_token_accuracy": 0.6611761053403219, + "num_tokens": 1310151323.0, + "step": 7821 + }, + { + "entropy": 1.6935268541177113, + "epoch": 0.8593007607591112, + "grad_norm": 0.7700740098953247, + "learning_rate": 1.3458890149216546e-05, + "loss": 1.4202, + "mean_token_accuracy": 0.6526497304439545, + "num_tokens": 1310313628.0, + "step": 7822 + }, + { + "entropy": 1.7246541380882263, + "epoch": 0.8594106176704842, + "grad_norm": 0.6157558560371399, + "learning_rate": 1.3457349897381256e-05, + "loss": 1.2788, + "mean_token_accuracy": 0.6698776682217916, + "num_tokens": 1310453120.0, + "step": 7823 + }, + { + "entropy": 1.7128772636254628, + "epoch": 0.8595204745818571, + "grad_norm": 0.8066511750221252, + "learning_rate": 1.345580956777195e-05, + "loss": 1.3368, + "mean_token_accuracy": 0.6596921930710474, + "num_tokens": 1310607539.0, + "step": 7824 + }, + { + "entropy": 1.7081545094648998, + "epoch": 0.8596303314932301, + "grad_norm": 0.754356324672699, + "learning_rate": 1.3454269160437377e-05, + "loss": 1.4662, + "mean_token_accuracy": 0.638428787390391, + "num_tokens": 1310768768.0, + "step": 7825 + }, + { + "entropy": 1.6387466490268707, + "epoch": 0.859740188404603, + "grad_norm": 0.6813954710960388, + "learning_rate": 1.345272867542629e-05, + "loss": 1.2403, + "mean_token_accuracy": 0.6748148997624716, + "num_tokens": 1310885892.0, + "step": 7826 + }, + { + "entropy": 1.7912492752075195, + "epoch": 0.859850045315976, + "grad_norm": 0.7757691144943237, + "learning_rate": 1.3451188112787446e-05, + "loss": 1.3154, + "mean_token_accuracy": 0.6780353983243307, + "num_tokens": 1311037679.0, + "step": 7827 + }, + { + "entropy": 1.6761020123958588, + "epoch": 0.8599599022273489, + "grad_norm": 0.8084965348243713, + "learning_rate": 1.3449647472569603e-05, + "loss": 1.3014, + "mean_token_accuracy": 0.6743810077508291, + "num_tokens": 1311198055.0, + "step": 7828 + }, + { + "entropy": 1.708401362101237, + "epoch": 0.8600697591387219, + "grad_norm": 0.6399450898170471, + "learning_rate": 1.344810675482152e-05, + "loss": 1.2311, + "mean_token_accuracy": 0.6786706000566483, + "num_tokens": 1311322715.0, + "step": 7829 + }, + { + "entropy": 1.662269651889801, + "epoch": 0.8601796160500947, + "grad_norm": 0.6289361715316772, + "learning_rate": 1.3446565959591963e-05, + "loss": 1.2845, + "mean_token_accuracy": 0.6679496963818868, + "num_tokens": 1311461506.0, + "step": 7830 + }, + { + "entropy": 1.682697872320811, + "epoch": 0.8602894729614677, + "grad_norm": 0.613720715045929, + "learning_rate": 1.3445025086929698e-05, + "loss": 1.4083, + "mean_token_accuracy": 0.65741033355395, + "num_tokens": 1311626696.0, + "step": 7831 + }, + { + "entropy": 1.7680631478627522, + "epoch": 0.8603993298728406, + "grad_norm": 0.7231320142745972, + "learning_rate": 1.3443484136883486e-05, + "loss": 1.3911, + "mean_token_accuracy": 0.6568551162878672, + "num_tokens": 1311757726.0, + "step": 7832 + }, + { + "entropy": 1.722759485244751, + "epoch": 0.8605091867842135, + "grad_norm": 0.8545400500297546, + "learning_rate": 1.3441943109502105e-05, + "loss": 1.2789, + "mean_token_accuracy": 0.6633422871430715, + "num_tokens": 1311870074.0, + "step": 7833 + }, + { + "entropy": 1.7299232880274455, + "epoch": 0.8606190436955865, + "grad_norm": 0.765442430973053, + "learning_rate": 1.3440402004834323e-05, + "loss": 1.5995, + "mean_token_accuracy": 0.633381262421608, + "num_tokens": 1312052301.0, + "step": 7834 + }, + { + "entropy": 1.7414843638737996, + "epoch": 0.8607289006069594, + "grad_norm": 0.72737717628479, + "learning_rate": 1.343886082292892e-05, + "loss": 1.4679, + "mean_token_accuracy": 0.6508069137732188, + "num_tokens": 1312225655.0, + "step": 7835 + }, + { + "entropy": 1.7260840733846028, + "epoch": 0.8608387575183324, + "grad_norm": 0.7150377035140991, + "learning_rate": 1.343731956383467e-05, + "loss": 1.5002, + "mean_token_accuracy": 0.6485694497823715, + "num_tokens": 1312371137.0, + "step": 7836 + }, + { + "entropy": 1.7633921603361766, + "epoch": 0.8609486144297053, + "grad_norm": 0.7518701553344727, + "learning_rate": 1.3435778227600354e-05, + "loss": 1.4145, + "mean_token_accuracy": 0.6541777650515238, + "num_tokens": 1312486083.0, + "step": 7837 + }, + { + "entropy": 1.7039452989896138, + "epoch": 0.8610584713410783, + "grad_norm": 0.744445264339447, + "learning_rate": 1.3434236814274752e-05, + "loss": 1.3822, + "mean_token_accuracy": 0.6768287618954977, + "num_tokens": 1312646037.0, + "step": 7838 + }, + { + "entropy": 1.691188375155131, + "epoch": 0.8611683282524512, + "grad_norm": 0.6668843030929565, + "learning_rate": 1.3432695323906657e-05, + "loss": 1.3382, + "mean_token_accuracy": 0.66710098584493, + "num_tokens": 1312772973.0, + "step": 7839 + }, + { + "entropy": 1.7686751286188762, + "epoch": 0.8612781851638242, + "grad_norm": 0.8265035152435303, + "learning_rate": 1.3431153756544849e-05, + "loss": 1.4093, + "mean_token_accuracy": 0.654551645119985, + "num_tokens": 1312935895.0, + "step": 7840 + }, + { + "entropy": 1.7481578489144642, + "epoch": 0.861388042075197, + "grad_norm": 0.6910483241081238, + "learning_rate": 1.3429612112238119e-05, + "loss": 1.4032, + "mean_token_accuracy": 0.6643613328536352, + "num_tokens": 1313100695.0, + "step": 7841 + }, + { + "entropy": 1.7362829943497975, + "epoch": 0.86149789898657, + "grad_norm": 0.7029606699943542, + "learning_rate": 1.342807039103526e-05, + "loss": 1.4679, + "mean_token_accuracy": 0.6628567526737849, + "num_tokens": 1313272040.0, + "step": 7842 + }, + { + "entropy": 1.756723831097285, + "epoch": 0.8616077558979429, + "grad_norm": 3.727766513824463, + "learning_rate": 1.3426528592985068e-05, + "loss": 1.2346, + "mean_token_accuracy": 0.6736096292734146, + "num_tokens": 1313460210.0, + "step": 7843 + }, + { + "entropy": 1.678599238395691, + "epoch": 0.8617176128093159, + "grad_norm": 0.5941556692123413, + "learning_rate": 1.342498671813634e-05, + "loss": 1.423, + "mean_token_accuracy": 0.6411783198515574, + "num_tokens": 1313628236.0, + "step": 7844 + }, + { + "entropy": 1.6739432116349537, + "epoch": 0.8618274697206888, + "grad_norm": 0.6052295565605164, + "learning_rate": 1.3423444766537874e-05, + "loss": 1.3497, + "mean_token_accuracy": 0.6628371526797613, + "num_tokens": 1313794912.0, + "step": 7845 + }, + { + "entropy": 1.7657102247079213, + "epoch": 0.8619373266320617, + "grad_norm": 0.6851087212562561, + "learning_rate": 1.3421902738238473e-05, + "loss": 1.5192, + "mean_token_accuracy": 0.6398663818836212, + "num_tokens": 1313957418.0, + "step": 7846 + }, + { + "entropy": 1.7496927479902904, + "epoch": 0.8620471835434347, + "grad_norm": 0.675603449344635, + "learning_rate": 1.3420360633286944e-05, + "loss": 1.3619, + "mean_token_accuracy": 0.6570636580387751, + "num_tokens": 1314089818.0, + "step": 7847 + }, + { + "entropy": 1.7162054479122162, + "epoch": 0.8621570404548076, + "grad_norm": 0.8953336477279663, + "learning_rate": 1.3418818451732087e-05, + "loss": 1.4036, + "mean_token_accuracy": 0.6674930676817894, + "num_tokens": 1314213824.0, + "step": 7848 + }, + { + "entropy": 1.6898426910241444, + "epoch": 0.8622668973661806, + "grad_norm": 0.721627414226532, + "learning_rate": 1.3417276193622721e-05, + "loss": 1.5284, + "mean_token_accuracy": 0.6341640055179596, + "num_tokens": 1314457495.0, + "step": 7849 + }, + { + "entropy": 1.7005638281504314, + "epoch": 0.8623767542775534, + "grad_norm": 0.8751857876777649, + "learning_rate": 1.3415733859007652e-05, + "loss": 1.1988, + "mean_token_accuracy": 0.6919720123211542, + "num_tokens": 1314566939.0, + "step": 7850 + }, + { + "entropy": 1.7131429314613342, + "epoch": 0.8624866111889264, + "grad_norm": 0.7577322125434875, + "learning_rate": 1.3414191447935695e-05, + "loss": 1.3949, + "mean_token_accuracy": 0.6738038708766302, + "num_tokens": 1314734429.0, + "step": 7851 + }, + { + "entropy": 1.6478227376937866, + "epoch": 0.8625964681002993, + "grad_norm": 0.6248055696487427, + "learning_rate": 1.341264896045566e-05, + "loss": 1.4491, + "mean_token_accuracy": 0.6442533234755198, + "num_tokens": 1314953465.0, + "step": 7852 + }, + { + "entropy": 1.7321637471516926, + "epoch": 0.8627063250116723, + "grad_norm": 0.7030457258224487, + "learning_rate": 1.3411106396616382e-05, + "loss": 1.3662, + "mean_token_accuracy": 0.6582097162803014, + "num_tokens": 1315141404.0, + "step": 7853 + }, + { + "entropy": 1.7090543111165364, + "epoch": 0.8628161819230452, + "grad_norm": 0.6939349174499512, + "learning_rate": 1.3409563756466667e-05, + "loss": 1.4836, + "mean_token_accuracy": 0.63978943725427, + "num_tokens": 1315296574.0, + "step": 7854 + }, + { + "entropy": 1.7191306352615356, + "epoch": 0.8629260388344182, + "grad_norm": 0.654860258102417, + "learning_rate": 1.3408021040055348e-05, + "loss": 1.2846, + "mean_token_accuracy": 0.6714579413334528, + "num_tokens": 1315419882.0, + "step": 7855 + }, + { + "entropy": 1.6892946660518646, + "epoch": 0.8630358957457911, + "grad_norm": 0.7134132385253906, + "learning_rate": 1.3406478247431246e-05, + "loss": 1.4268, + "mean_token_accuracy": 0.6599131226539612, + "num_tokens": 1315635983.0, + "step": 7856 + }, + { + "entropy": 1.7296896080176036, + "epoch": 0.8631457526571641, + "grad_norm": 0.7645989656448364, + "learning_rate": 1.340493537864319e-05, + "loss": 1.3553, + "mean_token_accuracy": 0.6631773064533869, + "num_tokens": 1315842488.0, + "step": 7857 + }, + { + "entropy": 1.7573328018188477, + "epoch": 0.863255609568537, + "grad_norm": 0.6863840222358704, + "learning_rate": 1.3403392433740017e-05, + "loss": 1.406, + "mean_token_accuracy": 0.6623306075731913, + "num_tokens": 1316019789.0, + "step": 7858 + }, + { + "entropy": 1.680842439333598, + "epoch": 0.86336546647991, + "grad_norm": 0.6738454699516296, + "learning_rate": 1.3401849412770556e-05, + "loss": 1.3635, + "mean_token_accuracy": 0.6600955078999201, + "num_tokens": 1316184981.0, + "step": 7859 + }, + { + "entropy": 1.7413969735304515, + "epoch": 0.8634753233912829, + "grad_norm": 0.7007496953010559, + "learning_rate": 1.3400306315783641e-05, + "loss": 1.4063, + "mean_token_accuracy": 0.6600519170363744, + "num_tokens": 1316341745.0, + "step": 7860 + }, + { + "entropy": 1.6591811577479045, + "epoch": 0.8635851803026557, + "grad_norm": 0.711081326007843, + "learning_rate": 1.3398763142828115e-05, + "loss": 1.33, + "mean_token_accuracy": 0.6715270678202311, + "num_tokens": 1316453593.0, + "step": 7861 + }, + { + "entropy": 1.7344895700613658, + "epoch": 0.8636950372140287, + "grad_norm": 0.6897302865982056, + "learning_rate": 1.3397219893952816e-05, + "loss": 1.3221, + "mean_token_accuracy": 0.6635162134965261, + "num_tokens": 1316620759.0, + "step": 7862 + }, + { + "entropy": 1.7110367218653362, + "epoch": 0.8638048941254016, + "grad_norm": 0.7375456690788269, + "learning_rate": 1.3395676569206587e-05, + "loss": 1.3048, + "mean_token_accuracy": 0.6686635613441467, + "num_tokens": 1316744902.0, + "step": 7863 + }, + { + "entropy": 1.7058011094729106, + "epoch": 0.8639147510367746, + "grad_norm": 0.7151663303375244, + "learning_rate": 1.3394133168638274e-05, + "loss": 1.3693, + "mean_token_accuracy": 0.6559457530577978, + "num_tokens": 1316872149.0, + "step": 7864 + }, + { + "entropy": 1.6610862414042156, + "epoch": 0.8640246079481475, + "grad_norm": 0.660163402557373, + "learning_rate": 1.3392589692296727e-05, + "loss": 1.3023, + "mean_token_accuracy": 0.6779667536417643, + "num_tokens": 1317040405.0, + "step": 7865 + }, + { + "entropy": 1.7064382135868073, + "epoch": 0.8641344648595205, + "grad_norm": 0.7105300426483154, + "learning_rate": 1.3391046140230792e-05, + "loss": 1.2392, + "mean_token_accuracy": 0.6803303956985474, + "num_tokens": 1317174030.0, + "step": 7866 + }, + { + "entropy": 1.7627345124880474, + "epoch": 0.8642443217708934, + "grad_norm": 0.7074387073516846, + "learning_rate": 1.3389502512489326e-05, + "loss": 1.5413, + "mean_token_accuracy": 0.6326200217008591, + "num_tokens": 1317350387.0, + "step": 7867 + }, + { + "entropy": 1.7101947367191315, + "epoch": 0.8643541786822664, + "grad_norm": 0.6427745819091797, + "learning_rate": 1.3387958809121177e-05, + "loss": 1.3858, + "mean_token_accuracy": 0.6644566704829534, + "num_tokens": 1317502704.0, + "step": 7868 + }, + { + "entropy": 1.6640800833702087, + "epoch": 0.8644640355936393, + "grad_norm": 0.618799090385437, + "learning_rate": 1.3386415030175212e-05, + "loss": 1.4887, + "mean_token_accuracy": 0.6454523503780365, + "num_tokens": 1317730335.0, + "step": 7869 + }, + { + "entropy": 1.7683025399843852, + "epoch": 0.8645738925050123, + "grad_norm": 0.6431897282600403, + "learning_rate": 1.3384871175700287e-05, + "loss": 1.4189, + "mean_token_accuracy": 0.6522246897220612, + "num_tokens": 1317876056.0, + "step": 7870 + }, + { + "entropy": 1.6496396660804749, + "epoch": 0.8646837494163852, + "grad_norm": 0.6853657364845276, + "learning_rate": 1.3383327245745266e-05, + "loss": 1.3756, + "mean_token_accuracy": 0.6567689329385757, + "num_tokens": 1318062538.0, + "step": 7871 + }, + { + "entropy": 1.6585274438063304, + "epoch": 0.8647936063277581, + "grad_norm": 0.5792921185493469, + "learning_rate": 1.3381783240359007e-05, + "loss": 1.427, + "mean_token_accuracy": 0.6579019178946813, + "num_tokens": 1318242979.0, + "step": 7872 + }, + { + "entropy": 1.7485062181949615, + "epoch": 0.864903463239131, + "grad_norm": 1.0194803476333618, + "learning_rate": 1.3380239159590385e-05, + "loss": 1.7003, + "mean_token_accuracy": 0.648332287867864, + "num_tokens": 1318393747.0, + "step": 7873 + }, + { + "entropy": 1.6931703289349873, + "epoch": 0.8650133201505039, + "grad_norm": 0.665524423122406, + "learning_rate": 1.3378695003488264e-05, + "loss": 1.3505, + "mean_token_accuracy": 0.6695401221513748, + "num_tokens": 1318562851.0, + "step": 7874 + }, + { + "entropy": 1.7432339489459991, + "epoch": 0.8651231770618769, + "grad_norm": 0.70815509557724, + "learning_rate": 1.3377150772101517e-05, + "loss": 1.4095, + "mean_token_accuracy": 0.6534250229597092, + "num_tokens": 1318737443.0, + "step": 7875 + }, + { + "entropy": 1.6439895927906036, + "epoch": 0.8652330339732498, + "grad_norm": 0.6177237629890442, + "learning_rate": 1.3375606465479024e-05, + "loss": 1.3875, + "mean_token_accuracy": 0.6528783192237219, + "num_tokens": 1318906562.0, + "step": 7876 + }, + { + "entropy": 1.672441154718399, + "epoch": 0.8653428908846228, + "grad_norm": 0.6379650235176086, + "learning_rate": 1.3374062083669653e-05, + "loss": 1.3043, + "mean_token_accuracy": 0.6697641412417094, + "num_tokens": 1319050646.0, + "step": 7877 + }, + { + "entropy": 1.668338378270467, + "epoch": 0.8654527477959957, + "grad_norm": 0.7436346411705017, + "learning_rate": 1.3372517626722288e-05, + "loss": 1.3871, + "mean_token_accuracy": 0.6683712204297384, + "num_tokens": 1319219239.0, + "step": 7878 + }, + { + "entropy": 1.642588605483373, + "epoch": 0.8655626047073687, + "grad_norm": 0.7161867022514343, + "learning_rate": 1.3370973094685809e-05, + "loss": 1.4992, + "mean_token_accuracy": 0.6446088055769602, + "num_tokens": 1319395389.0, + "step": 7879 + }, + { + "entropy": 1.639831284681956, + "epoch": 0.8656724616187416, + "grad_norm": 0.6735728979110718, + "learning_rate": 1.33694284876091e-05, + "loss": 1.3289, + "mean_token_accuracy": 0.6666930864254633, + "num_tokens": 1319561485.0, + "step": 7880 + }, + { + "entropy": 1.6760977109273274, + "epoch": 0.8657823185301146, + "grad_norm": 0.5775339007377625, + "learning_rate": 1.3367883805541048e-05, + "loss": 1.4283, + "mean_token_accuracy": 0.6341107288996378, + "num_tokens": 1319788535.0, + "step": 7881 + }, + { + "entropy": 1.6838768422603607, + "epoch": 0.8658921754414874, + "grad_norm": 0.8022451400756836, + "learning_rate": 1.3366339048530537e-05, + "loss": 1.6205, + "mean_token_accuracy": 0.6297398805618286, + "num_tokens": 1320001016.0, + "step": 7882 + }, + { + "entropy": 1.636765331029892, + "epoch": 0.8660020323528604, + "grad_norm": 0.6422226428985596, + "learning_rate": 1.3364794216626467e-05, + "loss": 1.3239, + "mean_token_accuracy": 0.6695540249347687, + "num_tokens": 1320153921.0, + "step": 7883 + }, + { + "entropy": 1.7224363684654236, + "epoch": 0.8661118892642333, + "grad_norm": 0.6930742859840393, + "learning_rate": 1.3363249309877719e-05, + "loss": 1.373, + "mean_token_accuracy": 0.6712667942047119, + "num_tokens": 1320321110.0, + "step": 7884 + }, + { + "entropy": 1.6932853261629741, + "epoch": 0.8662217461756063, + "grad_norm": 0.771900475025177, + "learning_rate": 1.3361704328333198e-05, + "loss": 1.4568, + "mean_token_accuracy": 0.6484440217415491, + "num_tokens": 1320473455.0, + "step": 7885 + }, + { + "entropy": 1.7034966945648193, + "epoch": 0.8663316030869792, + "grad_norm": 0.6940920352935791, + "learning_rate": 1.3360159272041801e-05, + "loss": 1.394, + "mean_token_accuracy": 0.6586278776327769, + "num_tokens": 1320651894.0, + "step": 7886 + }, + { + "entropy": 1.701320121685664, + "epoch": 0.8664414599983521, + "grad_norm": 0.614683985710144, + "learning_rate": 1.3358614141052429e-05, + "loss": 1.4261, + "mean_token_accuracy": 0.6411514133214951, + "num_tokens": 1320841977.0, + "step": 7887 + }, + { + "entropy": 1.6840573747952778, + "epoch": 0.8665513169097251, + "grad_norm": 0.7097548842430115, + "learning_rate": 1.3357068935413975e-05, + "loss": 1.4517, + "mean_token_accuracy": 0.6532600124677023, + "num_tokens": 1321017646.0, + "step": 7888 + }, + { + "entropy": 1.7255522906780243, + "epoch": 0.866661173821098, + "grad_norm": 0.7355175614356995, + "learning_rate": 1.3355523655175357e-05, + "loss": 1.2392, + "mean_token_accuracy": 0.682395468155543, + "num_tokens": 1321169484.0, + "step": 7889 + }, + { + "entropy": 1.6489202578862507, + "epoch": 0.866771030732471, + "grad_norm": 0.6251848936080933, + "learning_rate": 1.3353978300385472e-05, + "loss": 1.2982, + "mean_token_accuracy": 0.6671447803576788, + "num_tokens": 1321319826.0, + "step": 7890 + }, + { + "entropy": 1.6924934685230255, + "epoch": 0.8668808876438439, + "grad_norm": 0.6230257749557495, + "learning_rate": 1.3352432871093239e-05, + "loss": 1.3966, + "mean_token_accuracy": 0.6617010831832886, + "num_tokens": 1321482330.0, + "step": 7891 + }, + { + "entropy": 1.7230217059453328, + "epoch": 0.8669907445552169, + "grad_norm": 0.6096069812774658, + "learning_rate": 1.3350887367347565e-05, + "loss": 1.5194, + "mean_token_accuracy": 0.6552851547797521, + "num_tokens": 1321660671.0, + "step": 7892 + }, + { + "entropy": 1.6864555577437084, + "epoch": 0.8671006014665897, + "grad_norm": 0.604369044303894, + "learning_rate": 1.3349341789197365e-05, + "loss": 1.3665, + "mean_token_accuracy": 0.6564571112394333, + "num_tokens": 1321849446.0, + "step": 7893 + }, + { + "entropy": 1.7439285119374592, + "epoch": 0.8672104583779627, + "grad_norm": 0.802845299243927, + "learning_rate": 1.3347796136691553e-05, + "loss": 1.4116, + "mean_token_accuracy": 0.6643748581409454, + "num_tokens": 1322003294.0, + "step": 7894 + }, + { + "entropy": 1.6520594159762065, + "epoch": 0.8673203152893356, + "grad_norm": 1.0191586017608643, + "learning_rate": 1.3346250409879056e-05, + "loss": 1.4608, + "mean_token_accuracy": 0.6599269956350327, + "num_tokens": 1322132928.0, + "step": 7895 + }, + { + "entropy": 1.6756224830945332, + "epoch": 0.8674301722007086, + "grad_norm": 0.5737661123275757, + "learning_rate": 1.3344704608808787e-05, + "loss": 0.9707, + "mean_token_accuracy": 0.696823646624883, + "num_tokens": 1322293646.0, + "step": 7896 + }, + { + "entropy": 1.720400442679723, + "epoch": 0.8675400291120815, + "grad_norm": 0.7352355122566223, + "learning_rate": 1.3343158733529673e-05, + "loss": 1.3443, + "mean_token_accuracy": 0.6619482586781184, + "num_tokens": 1322472406.0, + "step": 7897 + }, + { + "entropy": 1.726184109846751, + "epoch": 0.8676498860234545, + "grad_norm": 0.7784338593482971, + "learning_rate": 1.3341612784090643e-05, + "loss": 1.2061, + "mean_token_accuracy": 0.683728352189064, + "num_tokens": 1322582080.0, + "step": 7898 + }, + { + "entropy": 1.754507710536321, + "epoch": 0.8677597429348274, + "grad_norm": 0.8482814431190491, + "learning_rate": 1.3340066760540624e-05, + "loss": 1.4338, + "mean_token_accuracy": 0.6534205625454584, + "num_tokens": 1322768188.0, + "step": 7899 + }, + { + "entropy": 1.7251827617486317, + "epoch": 0.8678695998462004, + "grad_norm": 0.6221253871917725, + "learning_rate": 1.3338520662928545e-05, + "loss": 1.525, + "mean_token_accuracy": 0.6393624295790991, + "num_tokens": 1322947115.0, + "step": 7900 + }, + { + "entropy": 1.673731615146001, + "epoch": 0.8679794567575733, + "grad_norm": 0.748742938041687, + "learning_rate": 1.3336974491303343e-05, + "loss": 1.2827, + "mean_token_accuracy": 0.6780743896961212, + "num_tokens": 1323119216.0, + "step": 7901 + }, + { + "entropy": 1.6825636823972066, + "epoch": 0.8680893136689461, + "grad_norm": 0.5840120911598206, + "learning_rate": 1.3335428245713949e-05, + "loss": 1.2393, + "mean_token_accuracy": 0.6743812263011932, + "num_tokens": 1323274126.0, + "step": 7902 + }, + { + "entropy": 1.7099784115950267, + "epoch": 0.8681991705803191, + "grad_norm": 0.6230765581130981, + "learning_rate": 1.3333881926209304e-05, + "loss": 1.5362, + "mean_token_accuracy": 0.6441142161687216, + "num_tokens": 1323465046.0, + "step": 7903 + }, + { + "entropy": 1.7780559460322063, + "epoch": 0.868309027491692, + "grad_norm": 0.778548538684845, + "learning_rate": 1.3332335532838347e-05, + "loss": 1.2601, + "mean_token_accuracy": 0.6791570882002512, + "num_tokens": 1323603989.0, + "step": 7904 + }, + { + "entropy": 1.6974503993988037, + "epoch": 0.868418884403065, + "grad_norm": 0.771642804145813, + "learning_rate": 1.3330789065650025e-05, + "loss": 1.4817, + "mean_token_accuracy": 0.6580607742071152, + "num_tokens": 1323792589.0, + "step": 7905 + }, + { + "entropy": 1.7237968544165294, + "epoch": 0.8685287413144379, + "grad_norm": 0.674707293510437, + "learning_rate": 1.3329242524693278e-05, + "loss": 1.3211, + "mean_token_accuracy": 0.657488743464152, + "num_tokens": 1323964176.0, + "step": 7906 + }, + { + "entropy": 1.68796439965566, + "epoch": 0.8686385982258109, + "grad_norm": 0.6906165480613708, + "learning_rate": 1.3327695910017051e-05, + "loss": 1.3029, + "mean_token_accuracy": 0.6650421073039373, + "num_tokens": 1324089399.0, + "step": 7907 + }, + { + "entropy": 1.6761383811632793, + "epoch": 0.8687484551371838, + "grad_norm": 0.6780588626861572, + "learning_rate": 1.33261492216703e-05, + "loss": 1.3755, + "mean_token_accuracy": 0.6589695960283279, + "num_tokens": 1324248925.0, + "step": 7908 + }, + { + "entropy": 1.6557303369045258, + "epoch": 0.8688583120485568, + "grad_norm": 0.6261764764785767, + "learning_rate": 1.3324602459701973e-05, + "loss": 1.3346, + "mean_token_accuracy": 0.6587485869725546, + "num_tokens": 1324440175.0, + "step": 7909 + }, + { + "entropy": 1.700795332590739, + "epoch": 0.8689681689599297, + "grad_norm": 0.6909230351448059, + "learning_rate": 1.332305562416103e-05, + "loss": 1.3137, + "mean_token_accuracy": 0.6629070142904917, + "num_tokens": 1324596366.0, + "step": 7910 + }, + { + "entropy": 1.7026448547840118, + "epoch": 0.8690780258713027, + "grad_norm": 0.6621904373168945, + "learning_rate": 1.3321508715096418e-05, + "loss": 1.3886, + "mean_token_accuracy": 0.6658767014741898, + "num_tokens": 1324743986.0, + "step": 7911 + }, + { + "entropy": 1.7210921943187714, + "epoch": 0.8691878827826756, + "grad_norm": 0.660092294216156, + "learning_rate": 1.3319961732557105e-05, + "loss": 1.3884, + "mean_token_accuracy": 0.6560011406739553, + "num_tokens": 1324875317.0, + "step": 7912 + }, + { + "entropy": 1.6727829774220784, + "epoch": 0.8692977396940486, + "grad_norm": 0.6397646069526672, + "learning_rate": 1.3318414676592047e-05, + "loss": 1.3876, + "mean_token_accuracy": 0.6527626812458038, + "num_tokens": 1325061131.0, + "step": 7913 + }, + { + "entropy": 1.671280860900879, + "epoch": 0.8694075966054214, + "grad_norm": 0.6057349443435669, + "learning_rate": 1.3316867547250207e-05, + "loss": 1.3217, + "mean_token_accuracy": 0.6597979366779327, + "num_tokens": 1325235395.0, + "step": 7914 + }, + { + "entropy": 1.7142386734485626, + "epoch": 0.8695174535167943, + "grad_norm": 0.6590214967727661, + "learning_rate": 1.3315320344580556e-05, + "loss": 1.3788, + "mean_token_accuracy": 0.6602257490158081, + "num_tokens": 1325394926.0, + "step": 7915 + }, + { + "entropy": 1.687685916821162, + "epoch": 0.8696273104281673, + "grad_norm": 0.8642633557319641, + "learning_rate": 1.3313773068632058e-05, + "loss": 1.2531, + "mean_token_accuracy": 0.6789219677448273, + "num_tokens": 1325582463.0, + "step": 7916 + }, + { + "entropy": 1.6729052861531575, + "epoch": 0.8697371673395402, + "grad_norm": 0.6398195028305054, + "learning_rate": 1.3312225719453688e-05, + "loss": 1.386, + "mean_token_accuracy": 0.6651468873023987, + "num_tokens": 1325770243.0, + "step": 7917 + }, + { + "entropy": 1.6908937791983287, + "epoch": 0.8698470242509132, + "grad_norm": 0.5825358033180237, + "learning_rate": 1.3310678297094412e-05, + "loss": 1.4217, + "mean_token_accuracy": 0.658950557311376, + "num_tokens": 1325994900.0, + "step": 7918 + }, + { + "entropy": 1.717313547929128, + "epoch": 0.8699568811622861, + "grad_norm": 0.6195710897445679, + "learning_rate": 1.3309130801603209e-05, + "loss": 1.4411, + "mean_token_accuracy": 0.651375080148379, + "num_tokens": 1326175599.0, + "step": 7919 + }, + { + "entropy": 1.7254052360852559, + "epoch": 0.8700667380736591, + "grad_norm": 0.6985616087913513, + "learning_rate": 1.330758323302906e-05, + "loss": 1.5164, + "mean_token_accuracy": 0.6510032018025717, + "num_tokens": 1326384995.0, + "step": 7920 + }, + { + "entropy": 1.7191411058108013, + "epoch": 0.870176594985032, + "grad_norm": 0.7221682667732239, + "learning_rate": 1.330603559142094e-05, + "loss": 1.593, + "mean_token_accuracy": 0.6362573454777399, + "num_tokens": 1326565564.0, + "step": 7921 + }, + { + "entropy": 1.6968744397163391, + "epoch": 0.870286451896405, + "grad_norm": 0.6443734765052795, + "learning_rate": 1.3304487876827831e-05, + "loss": 1.418, + "mean_token_accuracy": 0.6536405632893244, + "num_tokens": 1326738893.0, + "step": 7922 + }, + { + "entropy": 1.71379288037618, + "epoch": 0.8703963088077779, + "grad_norm": 0.6184552311897278, + "learning_rate": 1.3302940089298722e-05, + "loss": 1.3953, + "mean_token_accuracy": 0.661503846446673, + "num_tokens": 1326894910.0, + "step": 7923 + }, + { + "entropy": 1.7957302431265514, + "epoch": 0.8705061657191508, + "grad_norm": 0.6591717600822449, + "learning_rate": 1.3301392228882598e-05, + "loss": 1.5458, + "mean_token_accuracy": 0.6335903803507487, + "num_tokens": 1327137563.0, + "step": 7924 + }, + { + "entropy": 1.6927340527375538, + "epoch": 0.8706160226305237, + "grad_norm": 0.7157540917396545, + "learning_rate": 1.3299844295628442e-05, + "loss": 1.4384, + "mean_token_accuracy": 0.6635022660096487, + "num_tokens": 1327287689.0, + "step": 7925 + }, + { + "entropy": 1.6452916463216145, + "epoch": 0.8707258795418967, + "grad_norm": 1.110607624053955, + "learning_rate": 1.3298296289585254e-05, + "loss": 1.2155, + "mean_token_accuracy": 0.6726205994685491, + "num_tokens": 1327496091.0, + "step": 7926 + }, + { + "entropy": 1.6707601845264435, + "epoch": 0.8708357364532696, + "grad_norm": 1.3014352321624756, + "learning_rate": 1.3296748210802022e-05, + "loss": 1.361, + "mean_token_accuracy": 0.6518186579147974, + "num_tokens": 1327691641.0, + "step": 7927 + }, + { + "entropy": 1.6890023251374562, + "epoch": 0.8709455933646425, + "grad_norm": 0.759329080581665, + "learning_rate": 1.3295200059327744e-05, + "loss": 1.2898, + "mean_token_accuracy": 0.6639624188343684, + "num_tokens": 1327858295.0, + "step": 7928 + }, + { + "entropy": 1.765195220708847, + "epoch": 0.8710554502760155, + "grad_norm": 0.8240900039672852, + "learning_rate": 1.329365183521142e-05, + "loss": 1.6451, + "mean_token_accuracy": 0.6218532472848892, + "num_tokens": 1328056850.0, + "step": 7929 + }, + { + "entropy": 1.7044878403345745, + "epoch": 0.8711653071873884, + "grad_norm": 0.7272043824195862, + "learning_rate": 1.3292103538502048e-05, + "loss": 1.4744, + "mean_token_accuracy": 0.6585461994012197, + "num_tokens": 1328221991.0, + "step": 7930 + }, + { + "entropy": 1.6718167662620544, + "epoch": 0.8712751640987614, + "grad_norm": 0.7868055701255798, + "learning_rate": 1.3290555169248631e-05, + "loss": 1.2644, + "mean_token_accuracy": 0.683634286125501, + "num_tokens": 1328351299.0, + "step": 7931 + }, + { + "entropy": 1.7744904160499573, + "epoch": 0.8713850210101343, + "grad_norm": 1.1038771867752075, + "learning_rate": 1.3289006727500179e-05, + "loss": 1.5812, + "mean_token_accuracy": 0.6369357059399287, + "num_tokens": 1328547843.0, + "step": 7932 + }, + { + "entropy": 1.731363942225774, + "epoch": 0.8714948779215073, + "grad_norm": 0.6608824729919434, + "learning_rate": 1.3287458213305693e-05, + "loss": 1.3958, + "mean_token_accuracy": 0.6688510825236639, + "num_tokens": 1328733249.0, + "step": 7933 + }, + { + "entropy": 1.6438096066315968, + "epoch": 0.8716047348328801, + "grad_norm": 0.6863547563552856, + "learning_rate": 1.3285909626714184e-05, + "loss": 1.394, + "mean_token_accuracy": 0.6732639074325562, + "num_tokens": 1328944621.0, + "step": 7934 + }, + { + "entropy": 1.752169926961263, + "epoch": 0.8717145917442531, + "grad_norm": 0.7345601320266724, + "learning_rate": 1.3284360967774668e-05, + "loss": 1.5279, + "mean_token_accuracy": 0.639055406053861, + "num_tokens": 1329117591.0, + "step": 7935 + }, + { + "entropy": 1.7430291771888733, + "epoch": 0.871824448655626, + "grad_norm": 0.684219241142273, + "learning_rate": 1.3282812236536153e-05, + "loss": 1.2381, + "mean_token_accuracy": 0.6767359425624212, + "num_tokens": 1329257421.0, + "step": 7936 + }, + { + "entropy": 1.7502950926621754, + "epoch": 0.871934305566999, + "grad_norm": 0.6932651996612549, + "learning_rate": 1.328126343304766e-05, + "loss": 1.4658, + "mean_token_accuracy": 0.6520059059063593, + "num_tokens": 1329399517.0, + "step": 7937 + }, + { + "entropy": 1.6875435908635457, + "epoch": 0.8720441624783719, + "grad_norm": 0.7229261994361877, + "learning_rate": 1.3279714557358207e-05, + "loss": 1.2595, + "mean_token_accuracy": 0.678600956996282, + "num_tokens": 1329537962.0, + "step": 7938 + }, + { + "entropy": 1.6786061922709148, + "epoch": 0.8721540193897449, + "grad_norm": 0.707375168800354, + "learning_rate": 1.327816560951682e-05, + "loss": 1.3806, + "mean_token_accuracy": 0.679164374868075, + "num_tokens": 1329678550.0, + "step": 7939 + }, + { + "entropy": 1.695389598608017, + "epoch": 0.8722638763011178, + "grad_norm": 0.7458509206771851, + "learning_rate": 1.3276616589572516e-05, + "loss": 1.3762, + "mean_token_accuracy": 0.646119033296903, + "num_tokens": 1329866491.0, + "step": 7940 + }, + { + "entropy": 1.7952686448891957, + "epoch": 0.8723737332124907, + "grad_norm": 0.9156729578971863, + "learning_rate": 1.3275067497574323e-05, + "loss": 1.4037, + "mean_token_accuracy": 0.6545374641815821, + "num_tokens": 1330008220.0, + "step": 7941 + }, + { + "entropy": 1.704797516266505, + "epoch": 0.8724835901238637, + "grad_norm": 0.7488060593605042, + "learning_rate": 1.3273518333571267e-05, + "loss": 1.4559, + "mean_token_accuracy": 0.6578322052955627, + "num_tokens": 1330177595.0, + "step": 7942 + }, + { + "entropy": 1.7436169187227886, + "epoch": 0.8725934470352366, + "grad_norm": 0.6806449294090271, + "learning_rate": 1.3271969097612381e-05, + "loss": 1.5533, + "mean_token_accuracy": 0.6280863881111145, + "num_tokens": 1330355286.0, + "step": 7943 + }, + { + "entropy": 1.7313541571299236, + "epoch": 0.8727033039466096, + "grad_norm": 0.7117105722427368, + "learning_rate": 1.3270419789746696e-05, + "loss": 1.4541, + "mean_token_accuracy": 0.6451993534962336, + "num_tokens": 1330539558.0, + "step": 7944 + }, + { + "entropy": 1.8234418034553528, + "epoch": 0.8728131608579824, + "grad_norm": 0.8532108068466187, + "learning_rate": 1.326887041002325e-05, + "loss": 1.5098, + "mean_token_accuracy": 0.6385622421900431, + "num_tokens": 1330714292.0, + "step": 7945 + }, + { + "entropy": 1.7574726343154907, + "epoch": 0.8729230177693554, + "grad_norm": 0.7637962102890015, + "learning_rate": 1.3267320958491078e-05, + "loss": 1.299, + "mean_token_accuracy": 0.6635063340266546, + "num_tokens": 1330836527.0, + "step": 7946 + }, + { + "entropy": 1.7145869135856628, + "epoch": 0.8730328746807283, + "grad_norm": 0.6418426036834717, + "learning_rate": 1.3265771435199214e-05, + "loss": 1.3584, + "mean_token_accuracy": 0.6642368783553442, + "num_tokens": 1330995034.0, + "step": 7947 + }, + { + "entropy": 1.7334729234377544, + "epoch": 0.8731427315921013, + "grad_norm": 0.7519394159317017, + "learning_rate": 1.3264221840196712e-05, + "loss": 1.3698, + "mean_token_accuracy": 0.6525163898865382, + "num_tokens": 1331131359.0, + "step": 7948 + }, + { + "entropy": 1.720908761024475, + "epoch": 0.8732525885034742, + "grad_norm": 0.7024477124214172, + "learning_rate": 1.3262672173532607e-05, + "loss": 1.3505, + "mean_token_accuracy": 0.6521359930435816, + "num_tokens": 1331265467.0, + "step": 7949 + }, + { + "entropy": 1.6730608840783436, + "epoch": 0.8733624454148472, + "grad_norm": 0.6420222520828247, + "learning_rate": 1.3261122435255946e-05, + "loss": 1.338, + "mean_token_accuracy": 0.6632480919361115, + "num_tokens": 1331421892.0, + "step": 7950 + }, + { + "entropy": 1.7371685206890106, + "epoch": 0.8734723023262201, + "grad_norm": 0.7718887329101562, + "learning_rate": 1.3259572625415778e-05, + "loss": 1.3936, + "mean_token_accuracy": 0.65486179292202, + "num_tokens": 1331550141.0, + "step": 7951 + }, + { + "entropy": 1.6655554672082264, + "epoch": 0.8735821592375931, + "grad_norm": 0.646058201789856, + "learning_rate": 1.3258022744061157e-05, + "loss": 1.3398, + "mean_token_accuracy": 0.6610534985860189, + "num_tokens": 1331802988.0, + "step": 7952 + }, + { + "entropy": 1.7078120807806652, + "epoch": 0.873692016148966, + "grad_norm": 0.8628278374671936, + "learning_rate": 1.3256472791241131e-05, + "loss": 1.4658, + "mean_token_accuracy": 0.6470666378736496, + "num_tokens": 1331979281.0, + "step": 7953 + }, + { + "entropy": 1.7339671850204468, + "epoch": 0.873801873060339, + "grad_norm": 0.8818228244781494, + "learning_rate": 1.3254922767004759e-05, + "loss": 1.2851, + "mean_token_accuracy": 0.6744043976068497, + "num_tokens": 1332118565.0, + "step": 7954 + }, + { + "entropy": 1.6598787903785706, + "epoch": 0.8739117299717118, + "grad_norm": 0.7136387228965759, + "learning_rate": 1.3253372671401099e-05, + "loss": 1.3051, + "mean_token_accuracy": 0.6756090174118677, + "num_tokens": 1332233805.0, + "step": 7955 + }, + { + "entropy": 1.7392461498578389, + "epoch": 0.8740215868830847, + "grad_norm": 0.6263501644134521, + "learning_rate": 1.3251822504479207e-05, + "loss": 1.3962, + "mean_token_accuracy": 0.6539953947067261, + "num_tokens": 1332404169.0, + "step": 7956 + }, + { + "entropy": 1.7147627174854279, + "epoch": 0.8741314437944577, + "grad_norm": 0.7884214520454407, + "learning_rate": 1.3250272266288149e-05, + "loss": 1.577, + "mean_token_accuracy": 0.6330409099658331, + "num_tokens": 1332570136.0, + "step": 7957 + }, + { + "entropy": 1.7000277042388916, + "epoch": 0.8742413007058306, + "grad_norm": 0.9027677178382874, + "learning_rate": 1.324872195687699e-05, + "loss": 1.3856, + "mean_token_accuracy": 0.6580591748158137, + "num_tokens": 1332706178.0, + "step": 7958 + }, + { + "entropy": 1.7484715183575947, + "epoch": 0.8743511576172036, + "grad_norm": 0.8269484639167786, + "learning_rate": 1.3247171576294791e-05, + "loss": 1.4001, + "mean_token_accuracy": 0.6555562863747278, + "num_tokens": 1332886733.0, + "step": 7959 + }, + { + "entropy": 1.698500504096349, + "epoch": 0.8744610145285765, + "grad_norm": 0.7188910245895386, + "learning_rate": 1.3245621124590625e-05, + "loss": 1.4849, + "mean_token_accuracy": 0.6570742378632227, + "num_tokens": 1333091324.0, + "step": 7960 + }, + { + "entropy": 1.691120167573293, + "epoch": 0.8745708714399495, + "grad_norm": 0.6104452610015869, + "learning_rate": 1.3244070601813564e-05, + "loss": 1.4192, + "mean_token_accuracy": 0.6446485817432404, + "num_tokens": 1333253616.0, + "step": 7961 + }, + { + "entropy": 1.6805418034394581, + "epoch": 0.8746807283513224, + "grad_norm": 1.033692717552185, + "learning_rate": 1.3242520008012676e-05, + "loss": 1.5897, + "mean_token_accuracy": 0.6430460214614868, + "num_tokens": 1333449657.0, + "step": 7962 + }, + { + "entropy": 1.650507350762685, + "epoch": 0.8747905852626954, + "grad_norm": 0.6361247301101685, + "learning_rate": 1.3240969343237042e-05, + "loss": 1.2264, + "mean_token_accuracy": 0.6698981175820032, + "num_tokens": 1333679417.0, + "step": 7963 + }, + { + "entropy": 1.6824390292167664, + "epoch": 0.8749004421740683, + "grad_norm": 0.7842257618904114, + "learning_rate": 1.3239418607535737e-05, + "loss": 1.2012, + "mean_token_accuracy": 0.6809289256731669, + "num_tokens": 1333808944.0, + "step": 7964 + }, + { + "entropy": 1.6636536419391632, + "epoch": 0.8750102990854413, + "grad_norm": 0.6498377919197083, + "learning_rate": 1.3237867800957843e-05, + "loss": 1.2811, + "mean_token_accuracy": 0.6815899511178335, + "num_tokens": 1334018959.0, + "step": 7965 + }, + { + "entropy": 1.7166621784369152, + "epoch": 0.8751201559968141, + "grad_norm": 0.678538978099823, + "learning_rate": 1.3236316923552443e-05, + "loss": 1.4701, + "mean_token_accuracy": 0.6503576040267944, + "num_tokens": 1334162080.0, + "step": 7966 + }, + { + "entropy": 1.6881644229094188, + "epoch": 0.8752300129081871, + "grad_norm": 0.8077633380889893, + "learning_rate": 1.3234765975368622e-05, + "loss": 1.515, + "mean_token_accuracy": 0.6426454931497574, + "num_tokens": 1334344960.0, + "step": 7967 + }, + { + "entropy": 1.7116271654764812, + "epoch": 0.87533986981956, + "grad_norm": 0.6946110725402832, + "learning_rate": 1.3233214956455461e-05, + "loss": 1.3035, + "mean_token_accuracy": 0.6665644148985544, + "num_tokens": 1334564958.0, + "step": 7968 + }, + { + "entropy": 1.699356774489085, + "epoch": 0.8754497267309329, + "grad_norm": 0.7314174175262451, + "learning_rate": 1.3231663866862052e-05, + "loss": 1.4564, + "mean_token_accuracy": 0.6603659292062124, + "num_tokens": 1334742349.0, + "step": 7969 + }, + { + "entropy": 1.7104867994785309, + "epoch": 0.8755595836423059, + "grad_norm": 0.6361663341522217, + "learning_rate": 1.323011270663749e-05, + "loss": 1.546, + "mean_token_accuracy": 0.6363288114468256, + "num_tokens": 1334924134.0, + "step": 7970 + }, + { + "entropy": 1.703242838382721, + "epoch": 0.8756694405536788, + "grad_norm": 0.6855658888816833, + "learning_rate": 1.3228561475830866e-05, + "loss": 1.256, + "mean_token_accuracy": 0.6767214983701706, + "num_tokens": 1335041877.0, + "step": 7971 + }, + { + "entropy": 1.7060719430446625, + "epoch": 0.8757792974650518, + "grad_norm": 0.6562889218330383, + "learning_rate": 1.3227010174491272e-05, + "loss": 1.5151, + "mean_token_accuracy": 0.6551367143789927, + "num_tokens": 1335195526.0, + "step": 7972 + }, + { + "entropy": 1.6942510604858398, + "epoch": 0.8758891543764247, + "grad_norm": 0.680708646774292, + "learning_rate": 1.3225458802667814e-05, + "loss": 1.3885, + "mean_token_accuracy": 0.6581521232922872, + "num_tokens": 1335342529.0, + "step": 7973 + }, + { + "entropy": 1.6612178683280945, + "epoch": 0.8759990112877977, + "grad_norm": 0.641156017780304, + "learning_rate": 1.3223907360409585e-05, + "loss": 1.4741, + "mean_token_accuracy": 0.6471780588229498, + "num_tokens": 1335590113.0, + "step": 7974 + }, + { + "entropy": 1.7183683415253956, + "epoch": 0.8761088681991706, + "grad_norm": 0.72384113073349, + "learning_rate": 1.3222355847765691e-05, + "loss": 1.2578, + "mean_token_accuracy": 0.6796244730552038, + "num_tokens": 1335743664.0, + "step": 7975 + }, + { + "entropy": 1.6416078209877014, + "epoch": 0.8762187251105436, + "grad_norm": 0.6154013276100159, + "learning_rate": 1.3220804264785233e-05, + "loss": 1.4331, + "mean_token_accuracy": 0.6597521007061005, + "num_tokens": 1335926381.0, + "step": 7976 + }, + { + "entropy": 1.7263556321461995, + "epoch": 0.8763285820219164, + "grad_norm": 0.6627217531204224, + "learning_rate": 1.3219252611517326e-05, + "loss": 1.3985, + "mean_token_accuracy": 0.6437351852655411, + "num_tokens": 1336093870.0, + "step": 7977 + }, + { + "entropy": 1.7286728123823802, + "epoch": 0.8764384389332894, + "grad_norm": 0.7835181355476379, + "learning_rate": 1.3217700888011072e-05, + "loss": 1.3356, + "mean_token_accuracy": 0.6737496505180994, + "num_tokens": 1336228546.0, + "step": 7978 + }, + { + "entropy": 1.6983933846155803, + "epoch": 0.8765482958446623, + "grad_norm": 0.8094499707221985, + "learning_rate": 1.3216149094315585e-05, + "loss": 1.3539, + "mean_token_accuracy": 0.6737185815970103, + "num_tokens": 1336384984.0, + "step": 7979 + }, + { + "entropy": 1.6982168157895405, + "epoch": 0.8766581527560353, + "grad_norm": 0.7010941505432129, + "learning_rate": 1.3214597230479973e-05, + "loss": 1.4708, + "mean_token_accuracy": 0.6522993097702662, + "num_tokens": 1336542390.0, + "step": 7980 + }, + { + "entropy": 1.6157186627388, + "epoch": 0.8767680096674082, + "grad_norm": 0.6266676187515259, + "learning_rate": 1.321304529655336e-05, + "loss": 1.2673, + "mean_token_accuracy": 0.6801381210486094, + "num_tokens": 1336752692.0, + "step": 7981 + }, + { + "entropy": 1.671266903479894, + "epoch": 0.8768778665787811, + "grad_norm": 0.7309592962265015, + "learning_rate": 1.3211493292584861e-05, + "loss": 1.3975, + "mean_token_accuracy": 0.6700055301189423, + "num_tokens": 1336903638.0, + "step": 7982 + }, + { + "entropy": 1.677875409523646, + "epoch": 0.8769877234901541, + "grad_norm": 0.7530861496925354, + "learning_rate": 1.3209941218623594e-05, + "loss": 1.4529, + "mean_token_accuracy": 0.6435393045345942, + "num_tokens": 1337079040.0, + "step": 7983 + }, + { + "entropy": 1.695750226577123, + "epoch": 0.877097580401527, + "grad_norm": 9.608741760253906, + "learning_rate": 1.3208389074718686e-05, + "loss": 1.3589, + "mean_token_accuracy": 0.6756584992011389, + "num_tokens": 1337261289.0, + "step": 7984 + }, + { + "entropy": 1.7192882398764293, + "epoch": 0.8772074373129, + "grad_norm": 0.6504213213920593, + "learning_rate": 1.3206836860919258e-05, + "loss": 1.4516, + "mean_token_accuracy": 0.6443294088045756, + "num_tokens": 1337445749.0, + "step": 7985 + }, + { + "entropy": 1.6834936439990997, + "epoch": 0.8773172942242728, + "grad_norm": 0.6355934739112854, + "learning_rate": 1.3205284577274438e-05, + "loss": 1.365, + "mean_token_accuracy": 0.6607343057791392, + "num_tokens": 1337641026.0, + "step": 7986 + }, + { + "entropy": 1.6866546074549358, + "epoch": 0.8774271511356458, + "grad_norm": 0.5846768021583557, + "learning_rate": 1.3203732223833352e-05, + "loss": 1.4117, + "mean_token_accuracy": 0.6505334079265594, + "num_tokens": 1337845504.0, + "step": 7987 + }, + { + "entropy": 1.6987277269363403, + "epoch": 0.8775370080470187, + "grad_norm": 0.6802120804786682, + "learning_rate": 1.3202179800645137e-05, + "loss": 1.3893, + "mean_token_accuracy": 0.6545873979727427, + "num_tokens": 1338027155.0, + "step": 7988 + }, + { + "entropy": 1.8231212794780731, + "epoch": 0.8776468649583917, + "grad_norm": 0.7370648980140686, + "learning_rate": 1.3200627307758922e-05, + "loss": 1.5404, + "mean_token_accuracy": 0.6349399189154307, + "num_tokens": 1338177966.0, + "step": 7989 + }, + { + "entropy": 1.7170870800813038, + "epoch": 0.8777567218697646, + "grad_norm": 0.6052808165550232, + "learning_rate": 1.3199074745223849e-05, + "loss": 1.3193, + "mean_token_accuracy": 0.6577565719683965, + "num_tokens": 1338338646.0, + "step": 7990 + }, + { + "entropy": 1.6922166148821514, + "epoch": 0.8778665787811376, + "grad_norm": 0.5780960917472839, + "learning_rate": 1.3197522113089045e-05, + "loss": 1.3872, + "mean_token_accuracy": 0.6465161889791489, + "num_tokens": 1338520517.0, + "step": 7991 + }, + { + "entropy": 1.7829012076059978, + "epoch": 0.8779764356925105, + "grad_norm": 0.6546869277954102, + "learning_rate": 1.3195969411403657e-05, + "loss": 1.5343, + "mean_token_accuracy": 0.6399548500776291, + "num_tokens": 1338703551.0, + "step": 7992 + }, + { + "entropy": 1.7429544230302174, + "epoch": 0.8780862926038835, + "grad_norm": 0.6422476172447205, + "learning_rate": 1.319441664021683e-05, + "loss": 1.421, + "mean_token_accuracy": 0.6439397037029266, + "num_tokens": 1338876260.0, + "step": 7993 + }, + { + "entropy": 1.7216396530469258, + "epoch": 0.8781961495152564, + "grad_norm": 0.7357894778251648, + "learning_rate": 1.3192863799577702e-05, + "loss": 1.5756, + "mean_token_accuracy": 0.6361222863197327, + "num_tokens": 1339067416.0, + "step": 7994 + }, + { + "entropy": 1.712314208348592, + "epoch": 0.8783060064266293, + "grad_norm": 0.6917596459388733, + "learning_rate": 1.3191310889535425e-05, + "loss": 1.4794, + "mean_token_accuracy": 0.6355889936288198, + "num_tokens": 1339228303.0, + "step": 7995 + }, + { + "entropy": 1.7267231245835621, + "epoch": 0.8784158633380023, + "grad_norm": 0.7089694738388062, + "learning_rate": 1.3189757910139144e-05, + "loss": 1.4635, + "mean_token_accuracy": 0.6534610986709595, + "num_tokens": 1339364859.0, + "step": 7996 + }, + { + "entropy": 1.7212253610293071, + "epoch": 0.8785257202493751, + "grad_norm": 0.782818078994751, + "learning_rate": 1.3188204861438014e-05, + "loss": 1.3417, + "mean_token_accuracy": 0.6555100381374359, + "num_tokens": 1339513208.0, + "step": 7997 + }, + { + "entropy": 1.7673552135626476, + "epoch": 0.8786355771607481, + "grad_norm": 0.6749922633171082, + "learning_rate": 1.3186651743481185e-05, + "loss": 1.366, + "mean_token_accuracy": 0.6576452553272247, + "num_tokens": 1339679215.0, + "step": 7998 + }, + { + "entropy": 1.7615666389465332, + "epoch": 0.878745434072121, + "grad_norm": 0.7322090268135071, + "learning_rate": 1.3185098556317814e-05, + "loss": 1.5279, + "mean_token_accuracy": 0.644319606324037, + "num_tokens": 1339867935.0, + "step": 7999 + }, + { + "entropy": 1.7344570557276409, + "epoch": 0.878855290983494, + "grad_norm": 0.6841682195663452, + "learning_rate": 1.3183545299997059e-05, + "loss": 1.465, + "mean_token_accuracy": 0.6409310499827067, + "num_tokens": 1340072053.0, + "step": 8000 + }, + { + "entropy": 1.6895175874233246, + "epoch": 0.8789651478948669, + "grad_norm": 0.622738242149353, + "learning_rate": 1.3181991974568078e-05, + "loss": 1.371, + "mean_token_accuracy": 0.6549272984266281, + "num_tokens": 1340258625.0, + "step": 8001 + }, + { + "entropy": 1.7024616301059723, + "epoch": 0.8790750048062399, + "grad_norm": 0.6197877526283264, + "learning_rate": 1.3180438580080035e-05, + "loss": 1.3511, + "mean_token_accuracy": 0.6649558196465174, + "num_tokens": 1340444952.0, + "step": 8002 + }, + { + "entropy": 1.6654754877090454, + "epoch": 0.8791848617176128, + "grad_norm": 1.9219154119491577, + "learning_rate": 1.3178885116582092e-05, + "loss": 1.2438, + "mean_token_accuracy": 0.6720800052086512, + "num_tokens": 1340670642.0, + "step": 8003 + }, + { + "entropy": 1.7420108218987782, + "epoch": 0.8792947186289858, + "grad_norm": 0.6763916015625, + "learning_rate": 1.3177331584123415e-05, + "loss": 1.3347, + "mean_token_accuracy": 0.6596690714359283, + "num_tokens": 1340838356.0, + "step": 8004 + }, + { + "entropy": 1.7406767507394154, + "epoch": 0.8794045755403587, + "grad_norm": 0.6398903727531433, + "learning_rate": 1.3175777982753181e-05, + "loss": 1.4552, + "mean_token_accuracy": 0.6480192442735037, + "num_tokens": 1340984070.0, + "step": 8005 + }, + { + "entropy": 1.6723910371462505, + "epoch": 0.8795144324517317, + "grad_norm": 0.6995456218719482, + "learning_rate": 1.317422431252055e-05, + "loss": 1.2322, + "mean_token_accuracy": 0.684855322043101, + "num_tokens": 1341178363.0, + "step": 8006 + }, + { + "entropy": 1.656785657008489, + "epoch": 0.8796242893631046, + "grad_norm": 0.6289657354354858, + "learning_rate": 1.3172670573474702e-05, + "loss": 1.3367, + "mean_token_accuracy": 0.6745680520931879, + "num_tokens": 1341321368.0, + "step": 8007 + }, + { + "entropy": 1.6708788673082988, + "epoch": 0.8797341462744775, + "grad_norm": 0.6338051557540894, + "learning_rate": 1.3171116765664806e-05, + "loss": 1.5384, + "mean_token_accuracy": 0.6278170545895895, + "num_tokens": 1341560129.0, + "step": 8008 + }, + { + "entropy": 1.6958947479724884, + "epoch": 0.8798440031858504, + "grad_norm": 0.8811983466148376, + "learning_rate": 1.3169562889140044e-05, + "loss": 1.3398, + "mean_token_accuracy": 0.6620439837376276, + "num_tokens": 1341696606.0, + "step": 8009 + }, + { + "entropy": 1.7053867677847545, + "epoch": 0.8799538600972233, + "grad_norm": 1.529905915260315, + "learning_rate": 1.3168008943949595e-05, + "loss": 1.3059, + "mean_token_accuracy": 0.6612508594989777, + "num_tokens": 1341886733.0, + "step": 8010 + }, + { + "entropy": 1.6900759637355804, + "epoch": 0.8800637170085963, + "grad_norm": 0.6369670629501343, + "learning_rate": 1.3166454930142638e-05, + "loss": 1.4512, + "mean_token_accuracy": 0.6452312916517258, + "num_tokens": 1342084201.0, + "step": 8011 + }, + { + "entropy": 1.6731635133425395, + "epoch": 0.8801735739199692, + "grad_norm": 0.772038996219635, + "learning_rate": 1.316490084776836e-05, + "loss": 1.2925, + "mean_token_accuracy": 0.6838527768850327, + "num_tokens": 1342258577.0, + "step": 8012 + }, + { + "entropy": 1.7299912571907043, + "epoch": 0.8802834308313422, + "grad_norm": 0.6461442708969116, + "learning_rate": 1.3163346696875948e-05, + "loss": 1.4096, + "mean_token_accuracy": 0.6662061562140783, + "num_tokens": 1342489971.0, + "step": 8013 + }, + { + "entropy": 1.6888580024242401, + "epoch": 0.8803932877427151, + "grad_norm": 0.6851293444633484, + "learning_rate": 1.3161792477514581e-05, + "loss": 1.5714, + "mean_token_accuracy": 0.648188849290212, + "num_tokens": 1342667835.0, + "step": 8014 + }, + { + "entropy": 1.7008213798205059, + "epoch": 0.8805031446540881, + "grad_norm": 0.5513099431991577, + "learning_rate": 1.3160238189733461e-05, + "loss": 1.4225, + "mean_token_accuracy": 0.6414727667967478, + "num_tokens": 1342901022.0, + "step": 8015 + }, + { + "entropy": 1.6794636050860088, + "epoch": 0.880613001565461, + "grad_norm": 0.6192083358764648, + "learning_rate": 1.3158683833581776e-05, + "loss": 1.3251, + "mean_token_accuracy": 0.664734274148941, + "num_tokens": 1343050141.0, + "step": 8016 + }, + { + "entropy": 1.6627166867256165, + "epoch": 0.880722858476834, + "grad_norm": 0.7238250970840454, + "learning_rate": 1.315712940910872e-05, + "loss": 1.3705, + "mean_token_accuracy": 0.6606160700321198, + "num_tokens": 1343257130.0, + "step": 8017 + }, + { + "entropy": 1.6578301588694255, + "epoch": 0.8808327153882068, + "grad_norm": 0.644854724407196, + "learning_rate": 1.3155574916363489e-05, + "loss": 1.4382, + "mean_token_accuracy": 0.6555332243442535, + "num_tokens": 1343435487.0, + "step": 8018 + }, + { + "entropy": 1.662944386402766, + "epoch": 0.8809425722995798, + "grad_norm": 0.7258864641189575, + "learning_rate": 1.3154020355395285e-05, + "loss": 1.352, + "mean_token_accuracy": 0.6717381527026495, + "num_tokens": 1343561950.0, + "step": 8019 + }, + { + "entropy": 1.701594094435374, + "epoch": 0.8810524292109527, + "grad_norm": 0.7201105952262878, + "learning_rate": 1.3152465726253307e-05, + "loss": 1.3787, + "mean_token_accuracy": 0.6671847403049469, + "num_tokens": 1343705589.0, + "step": 8020 + }, + { + "entropy": 1.704057554403941, + "epoch": 0.8811622861223257, + "grad_norm": 0.6907951831817627, + "learning_rate": 1.3150911028986756e-05, + "loss": 1.2673, + "mean_token_accuracy": 0.6720318496227264, + "num_tokens": 1343840657.0, + "step": 8021 + }, + { + "entropy": 1.6859318315982819, + "epoch": 0.8812721430336986, + "grad_norm": 0.7433066368103027, + "learning_rate": 1.3149356263644844e-05, + "loss": 1.386, + "mean_token_accuracy": 0.6604318271080653, + "num_tokens": 1343965621.0, + "step": 8022 + }, + { + "entropy": 1.730026255051295, + "epoch": 0.8813819999450715, + "grad_norm": 0.9503698348999023, + "learning_rate": 1.3147801430276771e-05, + "loss": 1.4897, + "mean_token_accuracy": 0.6525371472040812, + "num_tokens": 1344127435.0, + "step": 8023 + }, + { + "entropy": 1.6777367393175762, + "epoch": 0.8814918568564445, + "grad_norm": 0.6132991909980774, + "learning_rate": 1.3146246528931757e-05, + "loss": 1.4217, + "mean_token_accuracy": 0.6546296526988348, + "num_tokens": 1344296015.0, + "step": 8024 + }, + { + "entropy": 1.6886170705159504, + "epoch": 0.8816017137678174, + "grad_norm": 0.6643725633621216, + "learning_rate": 1.3144691559659e-05, + "loss": 1.4503, + "mean_token_accuracy": 0.6464731891949972, + "num_tokens": 1344487057.0, + "step": 8025 + }, + { + "entropy": 1.694665402173996, + "epoch": 0.8817115706791904, + "grad_norm": 0.8616427183151245, + "learning_rate": 1.3143136522507727e-05, + "loss": 1.3552, + "mean_token_accuracy": 0.6591685314973196, + "num_tokens": 1344651179.0, + "step": 8026 + }, + { + "entropy": 1.705785721540451, + "epoch": 0.8818214275905633, + "grad_norm": 0.6112991571426392, + "learning_rate": 1.3141581417527142e-05, + "loss": 1.4192, + "mean_token_accuracy": 0.6488917469978333, + "num_tokens": 1344852823.0, + "step": 8027 + }, + { + "entropy": 1.6365499794483185, + "epoch": 0.8819312845019363, + "grad_norm": 0.6156476736068726, + "learning_rate": 1.3140026244766474e-05, + "loss": 1.407, + "mean_token_accuracy": 0.6569693684577942, + "num_tokens": 1345024193.0, + "step": 8028 + }, + { + "entropy": 1.7127653062343597, + "epoch": 0.8820411414133091, + "grad_norm": 0.6506058573722839, + "learning_rate": 1.3138471004274942e-05, + "loss": 1.3516, + "mean_token_accuracy": 0.6794911821683248, + "num_tokens": 1345221187.0, + "step": 8029 + }, + { + "entropy": 1.709245463212331, + "epoch": 0.8821509983246821, + "grad_norm": 0.7298224568367004, + "learning_rate": 1.3136915696101768e-05, + "loss": 1.5062, + "mean_token_accuracy": 0.6443512787421545, + "num_tokens": 1345408682.0, + "step": 8030 + }, + { + "entropy": 1.753902445236842, + "epoch": 0.882260855236055, + "grad_norm": 0.6582000255584717, + "learning_rate": 1.3135360320296172e-05, + "loss": 1.2243, + "mean_token_accuracy": 0.6782107502222061, + "num_tokens": 1345548857.0, + "step": 8031 + }, + { + "entropy": 1.689902792374293, + "epoch": 0.882370712147428, + "grad_norm": 0.6262725591659546, + "learning_rate": 1.3133804876907381e-05, + "loss": 1.4091, + "mean_token_accuracy": 0.669882799188296, + "num_tokens": 1345724951.0, + "step": 8032 + }, + { + "entropy": 1.7034416993459065, + "epoch": 0.8824805690588009, + "grad_norm": 0.6859605312347412, + "learning_rate": 1.313224936598463e-05, + "loss": 1.4119, + "mean_token_accuracy": 0.6529233107964197, + "num_tokens": 1345847774.0, + "step": 8033 + }, + { + "entropy": 1.6871616741021473, + "epoch": 0.8825904259701739, + "grad_norm": 0.7056890726089478, + "learning_rate": 1.3130693787577149e-05, + "loss": 1.4314, + "mean_token_accuracy": 0.6448503235975901, + "num_tokens": 1346052041.0, + "step": 8034 + }, + { + "entropy": 1.6991152167320251, + "epoch": 0.8827002828815468, + "grad_norm": 0.7343994975090027, + "learning_rate": 1.312913814173417e-05, + "loss": 1.465, + "mean_token_accuracy": 0.636786495645841, + "num_tokens": 1346230977.0, + "step": 8035 + }, + { + "entropy": 1.740164339542389, + "epoch": 0.8828101397929197, + "grad_norm": 0.6870355606079102, + "learning_rate": 1.3127582428504924e-05, + "loss": 1.3112, + "mean_token_accuracy": 0.6617578764756521, + "num_tokens": 1346377875.0, + "step": 8036 + }, + { + "entropy": 1.6937303443749745, + "epoch": 0.8829199967042927, + "grad_norm": 0.6445454359054565, + "learning_rate": 1.3126026647938656e-05, + "loss": 1.4059, + "mean_token_accuracy": 0.6537498732407888, + "num_tokens": 1346547610.0, + "step": 8037 + }, + { + "entropy": 1.7122799456119537, + "epoch": 0.8830298536156655, + "grad_norm": 0.6448954343795776, + "learning_rate": 1.3124470800084602e-05, + "loss": 1.4434, + "mean_token_accuracy": 0.6424995213747025, + "num_tokens": 1346735778.0, + "step": 8038 + }, + { + "entropy": 1.7327117224534352, + "epoch": 0.8831397105270385, + "grad_norm": 0.6899316310882568, + "learning_rate": 1.3122914884992001e-05, + "loss": 1.3337, + "mean_token_accuracy": 0.6604535380999247, + "num_tokens": 1346857791.0, + "step": 8039 + }, + { + "entropy": 1.6908073723316193, + "epoch": 0.8832495674384114, + "grad_norm": 0.6724409461021423, + "learning_rate": 1.3121358902710106e-05, + "loss": 1.3755, + "mean_token_accuracy": 0.6658162524302801, + "num_tokens": 1347035627.0, + "step": 8040 + }, + { + "entropy": 1.6952118575572968, + "epoch": 0.8833594243497844, + "grad_norm": 1.6206833124160767, + "learning_rate": 1.3119802853288157e-05, + "loss": 1.176, + "mean_token_accuracy": 0.6623029261827469, + "num_tokens": 1347231666.0, + "step": 8041 + }, + { + "entropy": 1.6843286454677582, + "epoch": 0.8834692812611573, + "grad_norm": 0.6928609609603882, + "learning_rate": 1.31182467367754e-05, + "loss": 1.3903, + "mean_token_accuracy": 0.6516619374354681, + "num_tokens": 1347406884.0, + "step": 8042 + }, + { + "entropy": 1.6303186118602753, + "epoch": 0.8835791381725303, + "grad_norm": 0.6562328934669495, + "learning_rate": 1.311669055322109e-05, + "loss": 1.3083, + "mean_token_accuracy": 0.6617482751607895, + "num_tokens": 1347573182.0, + "step": 8043 + }, + { + "entropy": 1.7110784550507863, + "epoch": 0.8836889950839032, + "grad_norm": 0.6911236643791199, + "learning_rate": 1.3115134302674476e-05, + "loss": 1.3642, + "mean_token_accuracy": 0.6580260396003723, + "num_tokens": 1347729826.0, + "step": 8044 + }, + { + "entropy": 1.7337822914123535, + "epoch": 0.8837988519952762, + "grad_norm": 0.840054988861084, + "learning_rate": 1.3113577985184815e-05, + "loss": 1.3266, + "mean_token_accuracy": 0.6658614228169123, + "num_tokens": 1347900052.0, + "step": 8045 + }, + { + "entropy": 1.7084493140379589, + "epoch": 0.8839087089066491, + "grad_norm": 0.749947726726532, + "learning_rate": 1.3112021600801367e-05, + "loss": 1.474, + "mean_token_accuracy": 0.6458921432495117, + "num_tokens": 1348105613.0, + "step": 8046 + }, + { + "entropy": 1.620888243118922, + "epoch": 0.8840185658180221, + "grad_norm": 0.6167489290237427, + "learning_rate": 1.3110465149573384e-05, + "loss": 1.398, + "mean_token_accuracy": 0.6520049870014191, + "num_tokens": 1348336198.0, + "step": 8047 + }, + { + "entropy": 1.7000917494297028, + "epoch": 0.884128422729395, + "grad_norm": 0.6212296485900879, + "learning_rate": 1.3108908631550128e-05, + "loss": 1.491, + "mean_token_accuracy": 0.6476211200157801, + "num_tokens": 1348499904.0, + "step": 8048 + }, + { + "entropy": 1.6660768489042919, + "epoch": 0.884238279640768, + "grad_norm": 0.8931158781051636, + "learning_rate": 1.3107352046780865e-05, + "loss": 1.0585, + "mean_token_accuracy": 0.6905455191930135, + "num_tokens": 1348668149.0, + "step": 8049 + }, + { + "entropy": 1.7104520897070568, + "epoch": 0.8843481365521408, + "grad_norm": 0.7820631861686707, + "learning_rate": 1.3105795395314863e-05, + "loss": 1.3984, + "mean_token_accuracy": 0.6557039568821589, + "num_tokens": 1348803873.0, + "step": 8050 + }, + { + "entropy": 1.7667625844478607, + "epoch": 0.8844579934635137, + "grad_norm": 0.6352094411849976, + "learning_rate": 1.3104238677201382e-05, + "loss": 1.3466, + "mean_token_accuracy": 0.6517351716756821, + "num_tokens": 1348947762.0, + "step": 8051 + }, + { + "entropy": 1.6937001744906108, + "epoch": 0.8845678503748867, + "grad_norm": 0.700639009475708, + "learning_rate": 1.3102681892489698e-05, + "loss": 1.4141, + "mean_token_accuracy": 0.6588475555181503, + "num_tokens": 1349135921.0, + "step": 8052 + }, + { + "entropy": 1.7913443545500438, + "epoch": 0.8846777072862596, + "grad_norm": 0.719153881072998, + "learning_rate": 1.3101125041229077e-05, + "loss": 1.3666, + "mean_token_accuracy": 0.654579242070516, + "num_tokens": 1349279170.0, + "step": 8053 + }, + { + "entropy": 1.7504811882972717, + "epoch": 0.8847875641976326, + "grad_norm": 0.7166516184806824, + "learning_rate": 1.3099568123468796e-05, + "loss": 1.6457, + "mean_token_accuracy": 0.6370598326126734, + "num_tokens": 1349463834.0, + "step": 8054 + }, + { + "entropy": 1.7077268064022064, + "epoch": 0.8848974211090055, + "grad_norm": 0.6966634392738342, + "learning_rate": 1.309801113925813e-05, + "loss": 1.3574, + "mean_token_accuracy": 0.6640727718671163, + "num_tokens": 1349625453.0, + "step": 8055 + }, + { + "entropy": 1.718926727771759, + "epoch": 0.8850072780203785, + "grad_norm": 0.6505473852157593, + "learning_rate": 1.3096454088646355e-05, + "loss": 1.2966, + "mean_token_accuracy": 0.6705836703379949, + "num_tokens": 1349759088.0, + "step": 8056 + }, + { + "entropy": 1.659464915593465, + "epoch": 0.8851171349317514, + "grad_norm": 0.5969595909118652, + "learning_rate": 1.3094896971682756e-05, + "loss": 1.3489, + "mean_token_accuracy": 0.6619338194529215, + "num_tokens": 1349958910.0, + "step": 8057 + }, + { + "entropy": 1.7118416329224904, + "epoch": 0.8852269918431244, + "grad_norm": 0.7195928692817688, + "learning_rate": 1.3093339788416611e-05, + "loss": 1.3853, + "mean_token_accuracy": 0.6494811822970709, + "num_tokens": 1350096253.0, + "step": 8058 + }, + { + "entropy": 1.7227883338928223, + "epoch": 0.8853368487544973, + "grad_norm": 0.7288689613342285, + "learning_rate": 1.3091782538897204e-05, + "loss": 1.5027, + "mean_token_accuracy": 0.6574197262525558, + "num_tokens": 1350239891.0, + "step": 8059 + }, + { + "entropy": 1.6495947043100994, + "epoch": 0.8854467056658702, + "grad_norm": 0.6621578335762024, + "learning_rate": 1.3090225223173822e-05, + "loss": 1.463, + "mean_token_accuracy": 0.6442839155594507, + "num_tokens": 1350454281.0, + "step": 8060 + }, + { + "entropy": 1.7141701777776082, + "epoch": 0.8855565625772431, + "grad_norm": 0.7444977164268494, + "learning_rate": 1.3088667841295755e-05, + "loss": 1.3837, + "mean_token_accuracy": 0.6679119020700455, + "num_tokens": 1350628019.0, + "step": 8061 + }, + { + "entropy": 1.7080074946085613, + "epoch": 0.8856664194886161, + "grad_norm": 0.6999690532684326, + "learning_rate": 1.308711039331229e-05, + "loss": 1.3976, + "mean_token_accuracy": 0.6628955900669098, + "num_tokens": 1350818224.0, + "step": 8062 + }, + { + "entropy": 1.7163402338822682, + "epoch": 0.885776276399989, + "grad_norm": 0.7445178627967834, + "learning_rate": 1.3085552879272723e-05, + "loss": 1.5502, + "mean_token_accuracy": 0.6500421464443207, + "num_tokens": 1350961323.0, + "step": 8063 + }, + { + "entropy": 1.6648909350236256, + "epoch": 0.8858861333113619, + "grad_norm": 0.6955971717834473, + "learning_rate": 1.3083995299226349e-05, + "loss": 1.222, + "mean_token_accuracy": 0.6757313311100006, + "num_tokens": 1351090851.0, + "step": 8064 + }, + { + "entropy": 1.7097779909769695, + "epoch": 0.8859959902227349, + "grad_norm": 0.7316083908081055, + "learning_rate": 1.308243765322246e-05, + "loss": 1.4142, + "mean_token_accuracy": 0.6512270569801331, + "num_tokens": 1351338598.0, + "step": 8065 + }, + { + "entropy": 1.7168799837430317, + "epoch": 0.8861058471341078, + "grad_norm": 0.8173125386238098, + "learning_rate": 1.3080879941310357e-05, + "loss": 1.431, + "mean_token_accuracy": 0.6548661192258199, + "num_tokens": 1351471043.0, + "step": 8066 + }, + { + "entropy": 1.7480222483476002, + "epoch": 0.8862157040454808, + "grad_norm": 3.3873794078826904, + "learning_rate": 1.3079322163539343e-05, + "loss": 1.1677, + "mean_token_accuracy": 0.677946095665296, + "num_tokens": 1351661275.0, + "step": 8067 + }, + { + "entropy": 1.75862056016922, + "epoch": 0.8863255609568537, + "grad_norm": 0.7313094735145569, + "learning_rate": 1.307776431995872e-05, + "loss": 1.5896, + "mean_token_accuracy": 0.6547629435857137, + "num_tokens": 1351818032.0, + "step": 8068 + }, + { + "entropy": 1.736459106206894, + "epoch": 0.8864354178682267, + "grad_norm": 0.6745466589927673, + "learning_rate": 1.3076206410617792e-05, + "loss": 1.3345, + "mean_token_accuracy": 0.6557362129290899, + "num_tokens": 1351960721.0, + "step": 8069 + }, + { + "entropy": 1.6622845729192097, + "epoch": 0.8865452747795995, + "grad_norm": 0.6415925621986389, + "learning_rate": 1.3074648435565866e-05, + "loss": 1.4075, + "mean_token_accuracy": 0.6481207013130188, + "num_tokens": 1352103987.0, + "step": 8070 + }, + { + "entropy": 1.760219156742096, + "epoch": 0.8866551316909725, + "grad_norm": 0.6308138370513916, + "learning_rate": 1.3073090394852253e-05, + "loss": 1.3635, + "mean_token_accuracy": 0.6509590496619543, + "num_tokens": 1352236836.0, + "step": 8071 + }, + { + "entropy": 1.665820409854253, + "epoch": 0.8867649886023454, + "grad_norm": 0.7212702035903931, + "learning_rate": 1.307153228852626e-05, + "loss": 1.3673, + "mean_token_accuracy": 0.6518625418345133, + "num_tokens": 1352428604.0, + "step": 8072 + }, + { + "entropy": 1.7342944145202637, + "epoch": 0.8868748455137184, + "grad_norm": 0.7079007029533386, + "learning_rate": 1.3069974116637207e-05, + "loss": 1.2633, + "mean_token_accuracy": 0.6728782703479131, + "num_tokens": 1352562805.0, + "step": 8073 + }, + { + "entropy": 1.728011429309845, + "epoch": 0.8869847024250913, + "grad_norm": 0.6237488389015198, + "learning_rate": 1.3068415879234409e-05, + "loss": 1.389, + "mean_token_accuracy": 0.6614675124486288, + "num_tokens": 1352798623.0, + "step": 8074 + }, + { + "entropy": 1.6492702066898346, + "epoch": 0.8870945593364643, + "grad_norm": 0.744462251663208, + "learning_rate": 1.3066857576367173e-05, + "loss": 1.4776, + "mean_token_accuracy": 0.6488187313079834, + "num_tokens": 1352998143.0, + "step": 8075 + }, + { + "entropy": 1.7421591877937317, + "epoch": 0.8872044162478372, + "grad_norm": 0.6387677788734436, + "learning_rate": 1.306529920808483e-05, + "loss": 1.5294, + "mean_token_accuracy": 0.6411878218253454, + "num_tokens": 1353150077.0, + "step": 8076 + }, + { + "entropy": 1.7408295770486195, + "epoch": 0.8873142731592101, + "grad_norm": 0.7120410799980164, + "learning_rate": 1.3063740774436699e-05, + "loss": 1.3272, + "mean_token_accuracy": 0.6632676968971888, + "num_tokens": 1353315861.0, + "step": 8077 + }, + { + "entropy": 1.6731611490249634, + "epoch": 0.8874241300705831, + "grad_norm": 0.706117570400238, + "learning_rate": 1.3062182275472097e-05, + "loss": 1.316, + "mean_token_accuracy": 0.6600356449683508, + "num_tokens": 1353424738.0, + "step": 8078 + }, + { + "entropy": 1.6679266492525737, + "epoch": 0.887533986981956, + "grad_norm": 0.7776505351066589, + "learning_rate": 1.3060623711240362e-05, + "loss": 1.3721, + "mean_token_accuracy": 0.669564555088679, + "num_tokens": 1353592283.0, + "step": 8079 + }, + { + "entropy": 1.6982711652914684, + "epoch": 0.887643843893329, + "grad_norm": 0.7552779316902161, + "learning_rate": 1.3059065081790814e-05, + "loss": 1.4374, + "mean_token_accuracy": 0.6687319328387579, + "num_tokens": 1353771761.0, + "step": 8080 + }, + { + "entropy": 1.638779918352763, + "epoch": 0.8877537008047018, + "grad_norm": 0.5680516362190247, + "learning_rate": 1.305750638717278e-05, + "loss": 1.3348, + "mean_token_accuracy": 0.6778454432884852, + "num_tokens": 1353939394.0, + "step": 8081 + }, + { + "entropy": 1.6758286853631337, + "epoch": 0.8878635577160748, + "grad_norm": 0.6949761509895325, + "learning_rate": 1.3055947627435597e-05, + "loss": 1.3651, + "mean_token_accuracy": 0.6731551140546799, + "num_tokens": 1354080326.0, + "step": 8082 + }, + { + "entropy": 1.688368280728658, + "epoch": 0.8879734146274477, + "grad_norm": 0.6399317979812622, + "learning_rate": 1.30543888026286e-05, + "loss": 1.357, + "mean_token_accuracy": 0.6511105100313822, + "num_tokens": 1354217646.0, + "step": 8083 + }, + { + "entropy": 1.691909670829773, + "epoch": 0.8880832715388207, + "grad_norm": 0.6731947660446167, + "learning_rate": 1.3052829912801121e-05, + "loss": 1.5381, + "mean_token_accuracy": 0.6489096581935883, + "num_tokens": 1354406834.0, + "step": 8084 + }, + { + "entropy": 1.6758221685886383, + "epoch": 0.8881931284501936, + "grad_norm": 0.6500033140182495, + "learning_rate": 1.3051270958002503e-05, + "loss": 1.3698, + "mean_token_accuracy": 0.6664744565884272, + "num_tokens": 1354556751.0, + "step": 8085 + }, + { + "entropy": 1.6546663045883179, + "epoch": 0.8883029853615666, + "grad_norm": 0.6402091979980469, + "learning_rate": 1.3049711938282084e-05, + "loss": 1.35, + "mean_token_accuracy": 0.6814102729161581, + "num_tokens": 1354717236.0, + "step": 8086 + }, + { + "entropy": 1.6440533498922985, + "epoch": 0.8884128422729395, + "grad_norm": 0.7296947836875916, + "learning_rate": 1.3048152853689202e-05, + "loss": 1.3896, + "mean_token_accuracy": 0.6694160799185435, + "num_tokens": 1354912353.0, + "step": 8087 + }, + { + "entropy": 1.7287603914737701, + "epoch": 0.8885226991843125, + "grad_norm": 0.8662500977516174, + "learning_rate": 1.3046593704273205e-05, + "loss": 1.2404, + "mean_token_accuracy": 0.686885267496109, + "num_tokens": 1355040756.0, + "step": 8088 + }, + { + "entropy": 1.741082489490509, + "epoch": 0.8886325560956854, + "grad_norm": 0.7242109775543213, + "learning_rate": 1.3045034490083442e-05, + "loss": 1.4916, + "mean_token_accuracy": 0.6341162770986557, + "num_tokens": 1355257903.0, + "step": 8089 + }, + { + "entropy": 1.7150556246439617, + "epoch": 0.8887424130070583, + "grad_norm": 0.67889803647995, + "learning_rate": 1.3043475211169257e-05, + "loss": 1.4755, + "mean_token_accuracy": 0.6414446582396826, + "num_tokens": 1355428402.0, + "step": 8090 + }, + { + "entropy": 1.6610381305217743, + "epoch": 0.8888522699184312, + "grad_norm": 0.6266405582427979, + "learning_rate": 1.3041915867580004e-05, + "loss": 1.3219, + "mean_token_accuracy": 0.6615271915992101, + "num_tokens": 1355587479.0, + "step": 8091 + }, + { + "entropy": 1.7193395793437958, + "epoch": 0.8889621268298041, + "grad_norm": 0.6784216165542603, + "learning_rate": 1.3040356459365035e-05, + "loss": 1.3691, + "mean_token_accuracy": 0.6642310122648875, + "num_tokens": 1355710721.0, + "step": 8092 + }, + { + "entropy": 1.671286831299464, + "epoch": 0.8890719837411771, + "grad_norm": 0.6728245615959167, + "learning_rate": 1.30387969865737e-05, + "loss": 1.3102, + "mean_token_accuracy": 0.666391134262085, + "num_tokens": 1355856077.0, + "step": 8093 + }, + { + "entropy": 1.6652600566546123, + "epoch": 0.88918184065255, + "grad_norm": 0.8366493582725525, + "learning_rate": 1.3037237449255363e-05, + "loss": 1.1922, + "mean_token_accuracy": 0.6832515945037206, + "num_tokens": 1356007976.0, + "step": 8094 + }, + { + "entropy": 1.7026143074035645, + "epoch": 0.889291697563923, + "grad_norm": 0.7122969031333923, + "learning_rate": 1.3035677847459376e-05, + "loss": 1.3912, + "mean_token_accuracy": 0.6449934641520182, + "num_tokens": 1356202823.0, + "step": 8095 + }, + { + "entropy": 1.712339609861374, + "epoch": 0.8894015544752959, + "grad_norm": 0.6235902309417725, + "learning_rate": 1.3034118181235103e-05, + "loss": 1.3057, + "mean_token_accuracy": 0.6651495695114136, + "num_tokens": 1356362943.0, + "step": 8096 + }, + { + "entropy": 1.6995584865411122, + "epoch": 0.8895114113866689, + "grad_norm": 0.6809194684028625, + "learning_rate": 1.3032558450631905e-05, + "loss": 1.4245, + "mean_token_accuracy": 0.6478584508101145, + "num_tokens": 1356528585.0, + "step": 8097 + }, + { + "entropy": 1.6661972502867382, + "epoch": 0.8896212682980418, + "grad_norm": 0.7444778680801392, + "learning_rate": 1.3030998655699152e-05, + "loss": 1.4135, + "mean_token_accuracy": 0.6626001720627149, + "num_tokens": 1356696607.0, + "step": 8098 + }, + { + "entropy": 1.666017969449361, + "epoch": 0.8897311252094148, + "grad_norm": 0.6592122912406921, + "learning_rate": 1.3029438796486205e-05, + "loss": 1.4551, + "mean_token_accuracy": 0.6414574682712555, + "num_tokens": 1356874909.0, + "step": 8099 + }, + { + "entropy": 1.7365792989730835, + "epoch": 0.8898409821207877, + "grad_norm": 0.7328019738197327, + "learning_rate": 1.3027878873042431e-05, + "loss": 1.4733, + "mean_token_accuracy": 0.6426637371381124, + "num_tokens": 1357089568.0, + "step": 8100 + }, + { + "entropy": 1.7083971202373505, + "epoch": 0.8899508390321607, + "grad_norm": 0.6374284625053406, + "learning_rate": 1.3026318885417208e-05, + "loss": 1.2617, + "mean_token_accuracy": 0.6822344164053599, + "num_tokens": 1357196677.0, + "step": 8101 + }, + { + "entropy": 1.7009641925493877, + "epoch": 0.8900606959435335, + "grad_norm": 0.6936139464378357, + "learning_rate": 1.3024758833659906e-05, + "loss": 1.4522, + "mean_token_accuracy": 0.654137596487999, + "num_tokens": 1357358345.0, + "step": 8102 + }, + { + "entropy": 1.6674350996812184, + "epoch": 0.8901705528549065, + "grad_norm": 0.6887747049331665, + "learning_rate": 1.3023198717819896e-05, + "loss": 1.2265, + "mean_token_accuracy": 0.675381526350975, + "num_tokens": 1357473542.0, + "step": 8103 + }, + { + "entropy": 1.7373623251914978, + "epoch": 0.8902804097662794, + "grad_norm": 0.8778982162475586, + "learning_rate": 1.3021638537946562e-05, + "loss": 1.4434, + "mean_token_accuracy": 0.6643926252921423, + "num_tokens": 1357646759.0, + "step": 8104 + }, + { + "entropy": 1.7153649926185608, + "epoch": 0.8903902666776523, + "grad_norm": 0.6064153909683228, + "learning_rate": 1.3020078294089276e-05, + "loss": 1.379, + "mean_token_accuracy": 0.6452821493148804, + "num_tokens": 1357837941.0, + "step": 8105 + }, + { + "entropy": 1.7316668430964153, + "epoch": 0.8905001235890253, + "grad_norm": 0.7943192720413208, + "learning_rate": 1.3018517986297423e-05, + "loss": 1.3593, + "mean_token_accuracy": 0.6662193487087885, + "num_tokens": 1358041225.0, + "step": 8106 + }, + { + "entropy": 1.7015369435151417, + "epoch": 0.8906099805003982, + "grad_norm": 0.6925376057624817, + "learning_rate": 1.3016957614620385e-05, + "loss": 1.4367, + "mean_token_accuracy": 0.6497325003147125, + "num_tokens": 1358238077.0, + "step": 8107 + }, + { + "entropy": 1.7462484240531921, + "epoch": 0.8907198374117712, + "grad_norm": 0.6663040518760681, + "learning_rate": 1.301539717910755e-05, + "loss": 1.4275, + "mean_token_accuracy": 0.663551022609075, + "num_tokens": 1358404375.0, + "step": 8108 + }, + { + "entropy": 1.7458167274792988, + "epoch": 0.8908296943231441, + "grad_norm": 0.7102859020233154, + "learning_rate": 1.3013836679808299e-05, + "loss": 1.4161, + "mean_token_accuracy": 0.6619683603445689, + "num_tokens": 1358590647.0, + "step": 8109 + }, + { + "entropy": 1.6818938553333282, + "epoch": 0.8909395512345171, + "grad_norm": 0.6789277195930481, + "learning_rate": 1.3012276116772027e-05, + "loss": 1.2391, + "mean_token_accuracy": 0.6869035313526789, + "num_tokens": 1358735171.0, + "step": 8110 + }, + { + "entropy": 1.7530201375484467, + "epoch": 0.89104940814589, + "grad_norm": 0.849226713180542, + "learning_rate": 1.301071549004812e-05, + "loss": 1.4003, + "mean_token_accuracy": 0.6840375413497289, + "num_tokens": 1358913878.0, + "step": 8111 + }, + { + "entropy": 1.766763836145401, + "epoch": 0.891159265057263, + "grad_norm": 0.6677963733673096, + "learning_rate": 1.3009154799685977e-05, + "loss": 1.4964, + "mean_token_accuracy": 0.6554523011048635, + "num_tokens": 1359064147.0, + "step": 8112 + }, + { + "entropy": 1.7489437560240428, + "epoch": 0.8912691219686358, + "grad_norm": 0.7236900329589844, + "learning_rate": 1.3007594045734986e-05, + "loss": 1.439, + "mean_token_accuracy": 0.6437687029441198, + "num_tokens": 1359239467.0, + "step": 8113 + }, + { + "entropy": 1.7261870900789897, + "epoch": 0.8913789788800088, + "grad_norm": 0.6887776851654053, + "learning_rate": 1.3006033228244551e-05, + "loss": 1.4056, + "mean_token_accuracy": 0.6591099550326666, + "num_tokens": 1359378786.0, + "step": 8114 + }, + { + "entropy": 1.6371654470761616, + "epoch": 0.8914888357913817, + "grad_norm": 0.8251991868019104, + "learning_rate": 1.300447234726407e-05, + "loss": 1.3108, + "mean_token_accuracy": 0.68764096001784, + "num_tokens": 1359534184.0, + "step": 8115 + }, + { + "entropy": 1.6558915674686432, + "epoch": 0.8915986927027547, + "grad_norm": 0.7391266822814941, + "learning_rate": 1.3002911402842941e-05, + "loss": 1.3898, + "mean_token_accuracy": 0.6650058180093765, + "num_tokens": 1359685889.0, + "step": 8116 + }, + { + "entropy": 1.7229611972967784, + "epoch": 0.8917085496141276, + "grad_norm": 0.8075942993164062, + "learning_rate": 1.3001350395030568e-05, + "loss": 1.3773, + "mean_token_accuracy": 0.6721263627211252, + "num_tokens": 1359898074.0, + "step": 8117 + }, + { + "entropy": 1.6780574719111125, + "epoch": 0.8918184065255005, + "grad_norm": 0.6064956784248352, + "learning_rate": 1.2999789323876355e-05, + "loss": 1.3074, + "mean_token_accuracy": 0.6772323052088419, + "num_tokens": 1360031925.0, + "step": 8118 + }, + { + "entropy": 1.7141193449497223, + "epoch": 0.8919282634368735, + "grad_norm": 0.7515255808830261, + "learning_rate": 1.2998228189429713e-05, + "loss": 1.411, + "mean_token_accuracy": 0.6621668885151545, + "num_tokens": 1360194172.0, + "step": 8119 + }, + { + "entropy": 1.7168916761875153, + "epoch": 0.8920381203482464, + "grad_norm": 0.6676003932952881, + "learning_rate": 1.299666699174005e-05, + "loss": 1.4152, + "mean_token_accuracy": 0.6431319614251455, + "num_tokens": 1360408050.0, + "step": 8120 + }, + { + "entropy": 1.7273538609345753, + "epoch": 0.8921479772596194, + "grad_norm": 0.7413110136985779, + "learning_rate": 1.2995105730856774e-05, + "loss": 1.3913, + "mean_token_accuracy": 0.6430693517128626, + "num_tokens": 1360572401.0, + "step": 8121 + }, + { + "entropy": 1.6615086793899536, + "epoch": 0.8922578341709922, + "grad_norm": 0.7433538436889648, + "learning_rate": 1.2993544406829303e-05, + "loss": 1.473, + "mean_token_accuracy": 0.6483894636233648, + "num_tokens": 1360765523.0, + "step": 8122 + }, + { + "entropy": 1.708322823047638, + "epoch": 0.8923676910823652, + "grad_norm": 0.6540583372116089, + "learning_rate": 1.299198301970705e-05, + "loss": 1.3412, + "mean_token_accuracy": 0.6675945669412613, + "num_tokens": 1360935268.0, + "step": 8123 + }, + { + "entropy": 1.6495771209398906, + "epoch": 0.8924775479937381, + "grad_norm": 0.6546026468276978, + "learning_rate": 1.2990421569539429e-05, + "loss": 1.4705, + "mean_token_accuracy": 0.6375894794861475, + "num_tokens": 1361161749.0, + "step": 8124 + }, + { + "entropy": 1.6485347251097362, + "epoch": 0.8925874049051111, + "grad_norm": 0.6614772081375122, + "learning_rate": 1.2988860056375864e-05, + "loss": 1.4092, + "mean_token_accuracy": 0.660191277662913, + "num_tokens": 1361334599.0, + "step": 8125 + }, + { + "entropy": 1.7235966821511586, + "epoch": 0.892697261816484, + "grad_norm": 0.6788547039031982, + "learning_rate": 1.2987298480265775e-05, + "loss": 1.5024, + "mean_token_accuracy": 0.6518261929353079, + "num_tokens": 1361511927.0, + "step": 8126 + }, + { + "entropy": 1.7020771602789562, + "epoch": 0.892807118727857, + "grad_norm": 0.7183151841163635, + "learning_rate": 1.2985736841258585e-05, + "loss": 1.4419, + "mean_token_accuracy": 0.6394909024238586, + "num_tokens": 1361690858.0, + "step": 8127 + }, + { + "entropy": 1.68422997991244, + "epoch": 0.8929169756392299, + "grad_norm": 0.6331420540809631, + "learning_rate": 1.2984175139403719e-05, + "loss": 1.3114, + "mean_token_accuracy": 0.6713191568851471, + "num_tokens": 1361842250.0, + "step": 8128 + }, + { + "entropy": 1.728828767935435, + "epoch": 0.8930268325506029, + "grad_norm": 0.7083820700645447, + "learning_rate": 1.29826133747506e-05, + "loss": 1.6225, + "mean_token_accuracy": 0.6279341727495193, + "num_tokens": 1362061841.0, + "step": 8129 + }, + { + "entropy": 1.718500663836797, + "epoch": 0.8931366894619758, + "grad_norm": 0.6595919132232666, + "learning_rate": 1.2981051547348667e-05, + "loss": 1.5593, + "mean_token_accuracy": 0.6272151817878088, + "num_tokens": 1362318836.0, + "step": 8130 + }, + { + "entropy": 1.7139343520005543, + "epoch": 0.8932465463733487, + "grad_norm": 0.7923753261566162, + "learning_rate": 1.297948965724734e-05, + "loss": 1.534, + "mean_token_accuracy": 0.6324710150559744, + "num_tokens": 1362525771.0, + "step": 8131 + }, + { + "entropy": 1.722178190946579, + "epoch": 0.8933564032847217, + "grad_norm": 0.6958953142166138, + "learning_rate": 1.2977927704496063e-05, + "loss": 1.4105, + "mean_token_accuracy": 0.6546609650055567, + "num_tokens": 1362697980.0, + "step": 8132 + }, + { + "entropy": 1.7218119998772938, + "epoch": 0.8934662601960945, + "grad_norm": 0.720320463180542, + "learning_rate": 1.2976365689144262e-05, + "loss": 1.3803, + "mean_token_accuracy": 0.6600370605786642, + "num_tokens": 1362841672.0, + "step": 8133 + }, + { + "entropy": 1.6862787107626598, + "epoch": 0.8935761171074675, + "grad_norm": 0.6662365794181824, + "learning_rate": 1.2974803611241375e-05, + "loss": 1.5426, + "mean_token_accuracy": 0.6379824380079905, + "num_tokens": 1363031283.0, + "step": 8134 + }, + { + "entropy": 1.7841876844565074, + "epoch": 0.8936859740188404, + "grad_norm": 0.6615442633628845, + "learning_rate": 1.2973241470836844e-05, + "loss": 1.3344, + "mean_token_accuracy": 0.6595032413800558, + "num_tokens": 1363152561.0, + "step": 8135 + }, + { + "entropy": 1.7041483422120411, + "epoch": 0.8937958309302134, + "grad_norm": 0.7564711570739746, + "learning_rate": 1.2971679267980115e-05, + "loss": 1.3051, + "mean_token_accuracy": 0.6767140378554662, + "num_tokens": 1363281399.0, + "step": 8136 + }, + { + "entropy": 1.6851592659950256, + "epoch": 0.8939056878415863, + "grad_norm": 0.6683154702186584, + "learning_rate": 1.2970117002720619e-05, + "loss": 1.4669, + "mean_token_accuracy": 0.6413289060195287, + "num_tokens": 1363470077.0, + "step": 8137 + }, + { + "entropy": 1.7018636564413707, + "epoch": 0.8940155447529593, + "grad_norm": 0.6740677356719971, + "learning_rate": 1.2968554675107811e-05, + "loss": 1.4038, + "mean_token_accuracy": 0.6529013961553574, + "num_tokens": 1363608254.0, + "step": 8138 + }, + { + "entropy": 1.6942344903945923, + "epoch": 0.8941254016643322, + "grad_norm": 0.6417088508605957, + "learning_rate": 1.2966992285191136e-05, + "loss": 1.3952, + "mean_token_accuracy": 0.6635211457808813, + "num_tokens": 1363799347.0, + "step": 8139 + }, + { + "entropy": 1.7088079651196797, + "epoch": 0.8942352585757052, + "grad_norm": 0.6937258243560791, + "learning_rate": 1.296542983302004e-05, + "loss": 1.4418, + "mean_token_accuracy": 0.6464910159508387, + "num_tokens": 1363948205.0, + "step": 8140 + }, + { + "entropy": 1.7441412607828777, + "epoch": 0.8943451154870781, + "grad_norm": 0.6750649809837341, + "learning_rate": 1.2963867318643977e-05, + "loss": 1.3852, + "mean_token_accuracy": 0.6618605355421702, + "num_tokens": 1364070807.0, + "step": 8141 + }, + { + "entropy": 1.7100872000058491, + "epoch": 0.8944549723984511, + "grad_norm": 0.6978124976158142, + "learning_rate": 1.2962304742112398e-05, + "loss": 1.3172, + "mean_token_accuracy": 0.6667628437280655, + "num_tokens": 1364214371.0, + "step": 8142 + }, + { + "entropy": 1.7487525542577107, + "epoch": 0.894564829309824, + "grad_norm": 0.746597945690155, + "learning_rate": 1.2960742103474752e-05, + "loss": 1.3387, + "mean_token_accuracy": 0.6738084952036539, + "num_tokens": 1364319641.0, + "step": 8143 + }, + { + "entropy": 1.7394179999828339, + "epoch": 0.894674686221197, + "grad_norm": 0.676131546497345, + "learning_rate": 1.2959179402780508e-05, + "loss": 1.4124, + "mean_token_accuracy": 0.6497220148642858, + "num_tokens": 1364460921.0, + "step": 8144 + }, + { + "entropy": 1.7177764972050984, + "epoch": 0.8947845431325698, + "grad_norm": 0.9208407402038574, + "learning_rate": 1.2957616640079118e-05, + "loss": 1.515, + "mean_token_accuracy": 0.6533168703317642, + "num_tokens": 1364628805.0, + "step": 8145 + }, + { + "entropy": 1.7679544786612194, + "epoch": 0.8948944000439427, + "grad_norm": 0.6939182281494141, + "learning_rate": 1.2956053815420044e-05, + "loss": 1.3703, + "mean_token_accuracy": 0.6553449034690857, + "num_tokens": 1364794603.0, + "step": 8146 + }, + { + "entropy": 1.687473217646281, + "epoch": 0.8950042569553157, + "grad_norm": 0.7051041722297668, + "learning_rate": 1.2954490928852746e-05, + "loss": 1.4056, + "mean_token_accuracy": 0.6483729779720306, + "num_tokens": 1365029285.0, + "step": 8147 + }, + { + "entropy": 1.7038420736789703, + "epoch": 0.8951141138666886, + "grad_norm": 0.8011882901191711, + "learning_rate": 1.2952927980426696e-05, + "loss": 1.3818, + "mean_token_accuracy": 0.6608738501866659, + "num_tokens": 1365145593.0, + "step": 8148 + }, + { + "entropy": 1.6690380970637004, + "epoch": 0.8952239707780616, + "grad_norm": 0.6747339963912964, + "learning_rate": 1.2951364970191347e-05, + "loss": 1.4883, + "mean_token_accuracy": 0.6363983005285263, + "num_tokens": 1365353607.0, + "step": 8149 + }, + { + "entropy": 1.7097953756650288, + "epoch": 0.8953338276894345, + "grad_norm": 0.7147229909896851, + "learning_rate": 1.2949801898196182e-05, + "loss": 1.4018, + "mean_token_accuracy": 0.6539677331844965, + "num_tokens": 1365499388.0, + "step": 8150 + }, + { + "entropy": 1.7601737678050995, + "epoch": 0.8954436846008075, + "grad_norm": 0.6297820806503296, + "learning_rate": 1.2948238764490664e-05, + "loss": 1.5235, + "mean_token_accuracy": 0.6257789582014084, + "num_tokens": 1365714612.0, + "step": 8151 + }, + { + "entropy": 1.6906922459602356, + "epoch": 0.8955535415121804, + "grad_norm": 0.837054431438446, + "learning_rate": 1.2946675569124266e-05, + "loss": 1.3727, + "mean_token_accuracy": 0.6722191870212555, + "num_tokens": 1365845342.0, + "step": 8152 + }, + { + "entropy": 1.7662516037623088, + "epoch": 0.8956633984235534, + "grad_norm": 0.6832341551780701, + "learning_rate": 1.2945112312146464e-05, + "loss": 1.4975, + "mean_token_accuracy": 0.6488762050867081, + "num_tokens": 1365980955.0, + "step": 8153 + }, + { + "entropy": 1.6681772371133168, + "epoch": 0.8957732553349262, + "grad_norm": 0.5820019245147705, + "learning_rate": 1.2943548993606736e-05, + "loss": 1.3885, + "mean_token_accuracy": 0.6611274381478628, + "num_tokens": 1366185654.0, + "step": 8154 + }, + { + "entropy": 1.773647129535675, + "epoch": 0.8958831122462992, + "grad_norm": 0.6890908479690552, + "learning_rate": 1.2941985613554558e-05, + "loss": 1.4015, + "mean_token_accuracy": 0.6438634345928828, + "num_tokens": 1366332030.0, + "step": 8155 + }, + { + "entropy": 1.7235575517018635, + "epoch": 0.8959929691576721, + "grad_norm": 0.7061694264411926, + "learning_rate": 1.294042217203941e-05, + "loss": 1.4915, + "mean_token_accuracy": 0.6430180122454962, + "num_tokens": 1366525544.0, + "step": 8156 + }, + { + "entropy": 1.670232355594635, + "epoch": 0.8961028260690451, + "grad_norm": 0.5871284604072571, + "learning_rate": 1.293885866911077e-05, + "loss": 1.3359, + "mean_token_accuracy": 0.6684425920248032, + "num_tokens": 1366765534.0, + "step": 8157 + }, + { + "entropy": 1.7217474579811096, + "epoch": 0.896212682980418, + "grad_norm": 0.6503912806510925, + "learning_rate": 1.293729510481813e-05, + "loss": 1.2649, + "mean_token_accuracy": 0.6772298713525137, + "num_tokens": 1366890588.0, + "step": 8158 + }, + { + "entropy": 1.686649481455485, + "epoch": 0.8963225398917909, + "grad_norm": 0.8139302730560303, + "learning_rate": 1.293573147921097e-05, + "loss": 1.3724, + "mean_token_accuracy": 0.6652147769927979, + "num_tokens": 1367071167.0, + "step": 8159 + }, + { + "entropy": 1.7288777728875477, + "epoch": 0.8964323968031639, + "grad_norm": 0.6936602592468262, + "learning_rate": 1.2934167792338788e-05, + "loss": 1.5265, + "mean_token_accuracy": 0.6469365855058035, + "num_tokens": 1367276133.0, + "step": 8160 + }, + { + "entropy": 1.689979334672292, + "epoch": 0.8965422537145368, + "grad_norm": 0.7575037479400635, + "learning_rate": 1.2932604044251063e-05, + "loss": 1.5474, + "mean_token_accuracy": 0.6251169790824255, + "num_tokens": 1367539260.0, + "step": 8161 + }, + { + "entropy": 1.6986994842688243, + "epoch": 0.8966521106259098, + "grad_norm": 0.6705021858215332, + "learning_rate": 1.293104023499729e-05, + "loss": 1.4051, + "mean_token_accuracy": 0.6545246789852778, + "num_tokens": 1367699048.0, + "step": 8162 + }, + { + "entropy": 1.685945173104604, + "epoch": 0.8967619675372827, + "grad_norm": 0.7093241214752197, + "learning_rate": 1.2929476364626965e-05, + "loss": 1.362, + "mean_token_accuracy": 0.6544206788142523, + "num_tokens": 1367871465.0, + "step": 8163 + }, + { + "entropy": 1.7687697807947795, + "epoch": 0.8968718244486557, + "grad_norm": 0.6797177195549011, + "learning_rate": 1.2927912433189583e-05, + "loss": 1.523, + "mean_token_accuracy": 0.6424828718105952, + "num_tokens": 1368046329.0, + "step": 8164 + }, + { + "entropy": 1.7541530827681224, + "epoch": 0.8969816813600285, + "grad_norm": 0.6629700660705566, + "learning_rate": 1.2926348440734637e-05, + "loss": 1.369, + "mean_token_accuracy": 0.6573603600263596, + "num_tokens": 1368231299.0, + "step": 8165 + }, + { + "entropy": 1.7120748162269592, + "epoch": 0.8970915382714015, + "grad_norm": 0.6065205335617065, + "learning_rate": 1.2924784387311638e-05, + "loss": 1.5206, + "mean_token_accuracy": 0.6434388856093088, + "num_tokens": 1368425603.0, + "step": 8166 + }, + { + "entropy": 1.6880040764808655, + "epoch": 0.8972013951827744, + "grad_norm": 0.6545516848564148, + "learning_rate": 1.2923220272970074e-05, + "loss": 1.5294, + "mean_token_accuracy": 0.6539272020260493, + "num_tokens": 1368582508.0, + "step": 8167 + }, + { + "entropy": 1.7425115207831066, + "epoch": 0.8973112520941474, + "grad_norm": 0.8306770920753479, + "learning_rate": 1.2921656097759459e-05, + "loss": 1.4239, + "mean_token_accuracy": 0.6608427713314692, + "num_tokens": 1368707001.0, + "step": 8168 + }, + { + "entropy": 1.6598342955112457, + "epoch": 0.8974211090055203, + "grad_norm": 0.5864236950874329, + "learning_rate": 1.2920091861729291e-05, + "loss": 1.3178, + "mean_token_accuracy": 0.6688061058521271, + "num_tokens": 1368866831.0, + "step": 8169 + }, + { + "entropy": 1.7307129402955372, + "epoch": 0.8975309659168933, + "grad_norm": 0.7228249907493591, + "learning_rate": 1.2918527564929084e-05, + "loss": 1.4023, + "mean_token_accuracy": 0.6562465329964956, + "num_tokens": 1369012025.0, + "step": 8170 + }, + { + "entropy": 1.6857167681058247, + "epoch": 0.8976408228282662, + "grad_norm": 0.7698543071746826, + "learning_rate": 1.2916963207408339e-05, + "loss": 1.3763, + "mean_token_accuracy": 0.6605077634255091, + "num_tokens": 1369220961.0, + "step": 8171 + }, + { + "entropy": 1.6822535892327626, + "epoch": 0.8977506797396391, + "grad_norm": 0.6768351197242737, + "learning_rate": 1.291539878921658e-05, + "loss": 1.1921, + "mean_token_accuracy": 0.6810629268487295, + "num_tokens": 1369395347.0, + "step": 8172 + }, + { + "entropy": 1.6895845532417297, + "epoch": 0.8978605366510121, + "grad_norm": 0.591072678565979, + "learning_rate": 1.2913834310403309e-05, + "loss": 1.3772, + "mean_token_accuracy": 0.6566944519678751, + "num_tokens": 1369551940.0, + "step": 8173 + }, + { + "entropy": 1.6599336862564087, + "epoch": 0.897970393562385, + "grad_norm": 0.5289608836174011, + "learning_rate": 1.2912269771018042e-05, + "loss": 1.4677, + "mean_token_accuracy": 0.6440421094497045, + "num_tokens": 1369765218.0, + "step": 8174 + }, + { + "entropy": 1.7176273266474407, + "epoch": 0.898080250473758, + "grad_norm": 0.9386351108551025, + "learning_rate": 1.29107051711103e-05, + "loss": 1.4368, + "mean_token_accuracy": 0.6671102990706762, + "num_tokens": 1369906301.0, + "step": 8175 + }, + { + "entropy": 1.674050102631251, + "epoch": 0.8981901073851308, + "grad_norm": 0.659980833530426, + "learning_rate": 1.2909140510729602e-05, + "loss": 1.4214, + "mean_token_accuracy": 0.6602593511343002, + "num_tokens": 1370085506.0, + "step": 8176 + }, + { + "entropy": 1.6822616755962372, + "epoch": 0.8982999642965038, + "grad_norm": 0.7302301526069641, + "learning_rate": 1.2907575789925464e-05, + "loss": 1.2561, + "mean_token_accuracy": 0.6732948124408722, + "num_tokens": 1370218678.0, + "step": 8177 + }, + { + "entropy": 1.6883311371008556, + "epoch": 0.8984098212078767, + "grad_norm": 0.679049551486969, + "learning_rate": 1.2906011008747416e-05, + "loss": 1.533, + "mean_token_accuracy": 0.6458878070116043, + "num_tokens": 1370432098.0, + "step": 8178 + }, + { + "entropy": 1.7072526613871257, + "epoch": 0.8985196781192497, + "grad_norm": 0.6430801153182983, + "learning_rate": 1.2904446167244975e-05, + "loss": 1.246, + "mean_token_accuracy": 0.6781556854645411, + "num_tokens": 1370555624.0, + "step": 8179 + }, + { + "entropy": 1.7674343287944794, + "epoch": 0.8986295350306226, + "grad_norm": 0.6533283591270447, + "learning_rate": 1.2902881265467672e-05, + "loss": 1.44, + "mean_token_accuracy": 0.6452242086331049, + "num_tokens": 1370727544.0, + "step": 8180 + }, + { + "entropy": 1.737773100535075, + "epoch": 0.8987393919419956, + "grad_norm": 0.6868041157722473, + "learning_rate": 1.2901316303465034e-05, + "loss": 1.3688, + "mean_token_accuracy": 0.6619212180376053, + "num_tokens": 1370861919.0, + "step": 8181 + }, + { + "entropy": 1.6641343732674916, + "epoch": 0.8988492488533685, + "grad_norm": 1.105683445930481, + "learning_rate": 1.2899751281286595e-05, + "loss": 1.4682, + "mean_token_accuracy": 0.6598964184522629, + "num_tokens": 1371063236.0, + "step": 8182 + }, + { + "entropy": 1.756370743115743, + "epoch": 0.8989591057647415, + "grad_norm": 0.854651927947998, + "learning_rate": 1.289818619898188e-05, + "loss": 1.5702, + "mean_token_accuracy": 0.6351617326339086, + "num_tokens": 1371203907.0, + "step": 8183 + }, + { + "entropy": 1.7830684284369152, + "epoch": 0.8990689626761144, + "grad_norm": 0.6945520639419556, + "learning_rate": 1.2896621056600429e-05, + "loss": 1.4435, + "mean_token_accuracy": 0.644214446345965, + "num_tokens": 1371354131.0, + "step": 8184 + }, + { + "entropy": 1.672329713900884, + "epoch": 0.8991788195874872, + "grad_norm": 0.6429153084754944, + "learning_rate": 1.2895055854191776e-05, + "loss": 1.2707, + "mean_token_accuracy": 0.6696565896272659, + "num_tokens": 1371497398.0, + "step": 8185 + }, + { + "entropy": 1.6738866865634918, + "epoch": 0.8992886764988602, + "grad_norm": 0.676530122756958, + "learning_rate": 1.2893490591805458e-05, + "loss": 1.3334, + "mean_token_accuracy": 0.6663380612929662, + "num_tokens": 1371708217.0, + "step": 8186 + }, + { + "entropy": 1.704796036084493, + "epoch": 0.8993985334102331, + "grad_norm": 0.7546509504318237, + "learning_rate": 1.2891925269491018e-05, + "loss": 1.2849, + "mean_token_accuracy": 0.6688467363516489, + "num_tokens": 1371894274.0, + "step": 8187 + }, + { + "entropy": 1.7164516548315685, + "epoch": 0.8995083903216061, + "grad_norm": 0.6668331027030945, + "learning_rate": 1.2890359887297996e-05, + "loss": 1.5352, + "mean_token_accuracy": 0.6357905914386114, + "num_tokens": 1372102280.0, + "step": 8188 + }, + { + "entropy": 1.7005331714948018, + "epoch": 0.899618247232979, + "grad_norm": 0.6698519587516785, + "learning_rate": 1.2888794445275931e-05, + "loss": 1.2379, + "mean_token_accuracy": 0.6748972535133362, + "num_tokens": 1372240684.0, + "step": 8189 + }, + { + "entropy": 1.7654169201850891, + "epoch": 0.899728104144352, + "grad_norm": 0.8310354948043823, + "learning_rate": 1.2887228943474376e-05, + "loss": 1.4978, + "mean_token_accuracy": 0.6487491776545843, + "num_tokens": 1372380490.0, + "step": 8190 + }, + { + "entropy": 1.7207094430923462, + "epoch": 0.8998379610557249, + "grad_norm": 0.6810332536697388, + "learning_rate": 1.2885663381942877e-05, + "loss": 1.506, + "mean_token_accuracy": 0.6450515190760294, + "num_tokens": 1372567918.0, + "step": 8191 + }, + { + "entropy": 1.6392219463984172, + "epoch": 0.8999478179670979, + "grad_norm": 0.6061927676200867, + "learning_rate": 1.288409776073098e-05, + "loss": 1.4508, + "mean_token_accuracy": 0.6553038557370504, + "num_tokens": 1372767417.0, + "step": 8192 + }, + { + "entropy": 1.6560143133004506, + "epoch": 0.9000576748784708, + "grad_norm": 0.7693495750427246, + "learning_rate": 1.2882532079888234e-05, + "loss": 1.2824, + "mean_token_accuracy": 0.6741581360499064, + "num_tokens": 1372908182.0, + "step": 8193 + }, + { + "entropy": 1.7727676530679066, + "epoch": 0.9001675317898438, + "grad_norm": 0.7517789602279663, + "learning_rate": 1.2880966339464203e-05, + "loss": 1.3932, + "mean_token_accuracy": 0.6643371681372324, + "num_tokens": 1373084118.0, + "step": 8194 + }, + { + "entropy": 1.6854754785696666, + "epoch": 0.9002773887012167, + "grad_norm": 0.7738332152366638, + "learning_rate": 1.2879400539508431e-05, + "loss": 1.3219, + "mean_token_accuracy": 0.6660276005665461, + "num_tokens": 1373209452.0, + "step": 8195 + }, + { + "entropy": 1.6973048547903697, + "epoch": 0.9003872456125896, + "grad_norm": 0.6942301392555237, + "learning_rate": 1.287783468007048e-05, + "loss": 1.294, + "mean_token_accuracy": 0.6708059559265772, + "num_tokens": 1373356174.0, + "step": 8196 + }, + { + "entropy": 1.6481784184773762, + "epoch": 0.9004971025239625, + "grad_norm": 0.6286212801933289, + "learning_rate": 1.2876268761199905e-05, + "loss": 1.3796, + "mean_token_accuracy": 0.6566238403320312, + "num_tokens": 1373512786.0, + "step": 8197 + }, + { + "entropy": 1.657067855199178, + "epoch": 0.9006069594353355, + "grad_norm": 0.6861938238143921, + "learning_rate": 1.2874702782946273e-05, + "loss": 1.3017, + "mean_token_accuracy": 0.6696779529253641, + "num_tokens": 1373631016.0, + "step": 8198 + }, + { + "entropy": 1.6833013196786244, + "epoch": 0.9007168163467084, + "grad_norm": 0.6281445026397705, + "learning_rate": 1.2873136745359138e-05, + "loss": 1.4777, + "mean_token_accuracy": 0.6407529513041178, + "num_tokens": 1373814798.0, + "step": 8199 + }, + { + "entropy": 1.7312106589476268, + "epoch": 0.9008266732580813, + "grad_norm": 0.6315418481826782, + "learning_rate": 1.2871570648488074e-05, + "loss": 1.409, + "mean_token_accuracy": 0.6527961442867914, + "num_tokens": 1373963715.0, + "step": 8200 + }, + { + "entropy": 1.636249562104543, + "epoch": 0.9009365301694543, + "grad_norm": 0.614751935005188, + "learning_rate": 1.2870004492382639e-05, + "loss": 1.464, + "mean_token_accuracy": 0.6453719884157181, + "num_tokens": 1374167035.0, + "step": 8201 + }, + { + "entropy": 1.7170550723870595, + "epoch": 0.9010463870808272, + "grad_norm": 0.7123568058013916, + "learning_rate": 1.2868438277092408e-05, + "loss": 1.355, + "mean_token_accuracy": 0.6694580415884653, + "num_tokens": 1374299248.0, + "step": 8202 + }, + { + "entropy": 1.8127153019110362, + "epoch": 0.9011562439922002, + "grad_norm": 0.6031003594398499, + "learning_rate": 1.2866872002666943e-05, + "loss": 1.4627, + "mean_token_accuracy": 0.6451161354780197, + "num_tokens": 1374494985.0, + "step": 8203 + }, + { + "entropy": 1.7356827060381572, + "epoch": 0.9012661009035731, + "grad_norm": 0.8380510807037354, + "learning_rate": 1.2865305669155822e-05, + "loss": 1.4517, + "mean_token_accuracy": 0.6483780195315679, + "num_tokens": 1374700001.0, + "step": 8204 + }, + { + "entropy": 1.7188110053539276, + "epoch": 0.9013759578149461, + "grad_norm": 0.6875895261764526, + "learning_rate": 1.2863739276608618e-05, + "loss": 1.3714, + "mean_token_accuracy": 0.6474323074022929, + "num_tokens": 1374861646.0, + "step": 8205 + }, + { + "entropy": 1.6347064077854156, + "epoch": 0.901485814726319, + "grad_norm": 0.6865116953849792, + "learning_rate": 1.2862172825074906e-05, + "loss": 1.5043, + "mean_token_accuracy": 0.6302382349967957, + "num_tokens": 1375086655.0, + "step": 8206 + }, + { + "entropy": 1.664256900548935, + "epoch": 0.9015956716376919, + "grad_norm": 0.6529141664505005, + "learning_rate": 1.2860606314604262e-05, + "loss": 1.4591, + "mean_token_accuracy": 0.6522035201390585, + "num_tokens": 1375273760.0, + "step": 8207 + }, + { + "entropy": 1.783077895641327, + "epoch": 0.9017055285490648, + "grad_norm": 0.7015360593795776, + "learning_rate": 1.2859039745246267e-05, + "loss": 1.3206, + "mean_token_accuracy": 0.6642241428295771, + "num_tokens": 1375397296.0, + "step": 8208 + }, + { + "entropy": 1.677474598089854, + "epoch": 0.9018153854604378, + "grad_norm": 0.6711921095848083, + "learning_rate": 1.28574731170505e-05, + "loss": 1.5791, + "mean_token_accuracy": 0.6554554551839828, + "num_tokens": 1375570200.0, + "step": 8209 + }, + { + "entropy": 1.7532115777333577, + "epoch": 0.9019252423718107, + "grad_norm": 0.7770174741744995, + "learning_rate": 1.2855906430066552e-05, + "loss": 1.4795, + "mean_token_accuracy": 0.6456655959288279, + "num_tokens": 1375729714.0, + "step": 8210 + }, + { + "entropy": 1.7668093641599019, + "epoch": 0.9020350992831837, + "grad_norm": 0.6325072050094604, + "learning_rate": 1.2854339684343993e-05, + "loss": 1.5966, + "mean_token_accuracy": 0.6250222822030386, + "num_tokens": 1375963527.0, + "step": 8211 + }, + { + "entropy": 1.6405591766039531, + "epoch": 0.9021449561945566, + "grad_norm": 0.6002046465873718, + "learning_rate": 1.2852772879932425e-05, + "loss": 1.3697, + "mean_token_accuracy": 0.6608653912941614, + "num_tokens": 1376118353.0, + "step": 8212 + }, + { + "entropy": 1.7306797703107197, + "epoch": 0.9022548131059295, + "grad_norm": 0.8113459944725037, + "learning_rate": 1.285120601688143e-05, + "loss": 1.4127, + "mean_token_accuracy": 0.6562537600596746, + "num_tokens": 1376286365.0, + "step": 8213 + }, + { + "entropy": 1.6529719134171803, + "epoch": 0.9023646700173025, + "grad_norm": 0.6751854419708252, + "learning_rate": 1.2849639095240596e-05, + "loss": 1.3637, + "mean_token_accuracy": 0.6638490408658981, + "num_tokens": 1376444760.0, + "step": 8214 + }, + { + "entropy": 1.6793536742528279, + "epoch": 0.9024745269286754, + "grad_norm": 0.6569497585296631, + "learning_rate": 1.284807211505952e-05, + "loss": 1.4346, + "mean_token_accuracy": 0.643970454732577, + "num_tokens": 1376679188.0, + "step": 8215 + }, + { + "entropy": 1.65372101465861, + "epoch": 0.9025843838400484, + "grad_norm": 0.7767542004585266, + "learning_rate": 1.2846505076387794e-05, + "loss": 1.3946, + "mean_token_accuracy": 0.6664103666941324, + "num_tokens": 1376842688.0, + "step": 8216 + }, + { + "entropy": 1.682920217514038, + "epoch": 0.9026942407514212, + "grad_norm": 0.6500419974327087, + "learning_rate": 1.284493797927501e-05, + "loss": 1.488, + "mean_token_accuracy": 0.641838863492012, + "num_tokens": 1377060077.0, + "step": 8217 + }, + { + "entropy": 1.6799386739730835, + "epoch": 0.9028040976627942, + "grad_norm": 0.7521069049835205, + "learning_rate": 1.2843370823770776e-05, + "loss": 1.4117, + "mean_token_accuracy": 0.6611147572596868, + "num_tokens": 1377218796.0, + "step": 8218 + }, + { + "entropy": 1.6649762590726216, + "epoch": 0.9029139545741671, + "grad_norm": 0.8042888641357422, + "learning_rate": 1.2841803609924684e-05, + "loss": 1.4148, + "mean_token_accuracy": 0.6492630541324615, + "num_tokens": 1377444988.0, + "step": 8219 + }, + { + "entropy": 1.6619918942451477, + "epoch": 0.9030238114855401, + "grad_norm": 0.6244032382965088, + "learning_rate": 1.284023633778634e-05, + "loss": 1.2809, + "mean_token_accuracy": 0.6721317023038864, + "num_tokens": 1377588973.0, + "step": 8220 + }, + { + "entropy": 1.7610027194023132, + "epoch": 0.903133668396913, + "grad_norm": 0.7928817272186279, + "learning_rate": 1.2838669007405343e-05, + "loss": 1.4796, + "mean_token_accuracy": 0.6414338201284409, + "num_tokens": 1377760628.0, + "step": 8221 + }, + { + "entropy": 1.666873186826706, + "epoch": 0.903243525308286, + "grad_norm": 0.7573736310005188, + "learning_rate": 1.2837101618831298e-05, + "loss": 1.2919, + "mean_token_accuracy": 0.6718289206425349, + "num_tokens": 1377896394.0, + "step": 8222 + }, + { + "entropy": 1.659334381421407, + "epoch": 0.9033533822196589, + "grad_norm": 0.6099801659584045, + "learning_rate": 1.2835534172113818e-05, + "loss": 1.3683, + "mean_token_accuracy": 0.6715717862049738, + "num_tokens": 1378074075.0, + "step": 8223 + }, + { + "entropy": 1.7113625705242157, + "epoch": 0.9034632391310319, + "grad_norm": 0.8202866911888123, + "learning_rate": 1.2833966667302507e-05, + "loss": 1.3022, + "mean_token_accuracy": 0.6690480063358942, + "num_tokens": 1378201013.0, + "step": 8224 + }, + { + "entropy": 1.6659850974877675, + "epoch": 0.9035730960424048, + "grad_norm": 0.6271844506263733, + "learning_rate": 1.283239910444698e-05, + "loss": 1.3199, + "mean_token_accuracy": 0.659883846839269, + "num_tokens": 1378393507.0, + "step": 8225 + }, + { + "entropy": 1.6229958931605022, + "epoch": 0.9036829529537777, + "grad_norm": 0.6993024945259094, + "learning_rate": 1.2830831483596843e-05, + "loss": 1.3524, + "mean_token_accuracy": 0.6596012363831202, + "num_tokens": 1378561370.0, + "step": 8226 + }, + { + "entropy": 1.7114948133627574, + "epoch": 0.9037928098651506, + "grad_norm": 0.7609866857528687, + "learning_rate": 1.2829263804801717e-05, + "loss": 1.2955, + "mean_token_accuracy": 0.6641099601984024, + "num_tokens": 1378671487.0, + "step": 8227 + }, + { + "entropy": 1.6923895478248596, + "epoch": 0.9039026667765235, + "grad_norm": 0.6936889290809631, + "learning_rate": 1.2827696068111215e-05, + "loss": 1.3362, + "mean_token_accuracy": 0.6670081863800684, + "num_tokens": 1378865187.0, + "step": 8228 + }, + { + "entropy": 1.6573287546634674, + "epoch": 0.9040125236878965, + "grad_norm": 0.768925666809082, + "learning_rate": 1.2826128273574956e-05, + "loss": 1.2579, + "mean_token_accuracy": 0.6781423836946487, + "num_tokens": 1378998735.0, + "step": 8229 + }, + { + "entropy": 1.7605082790056865, + "epoch": 0.9041223805992694, + "grad_norm": 0.8298249244689941, + "learning_rate": 1.2824560421242561e-05, + "loss": 1.3718, + "mean_token_accuracy": 0.6664842814207077, + "num_tokens": 1379115376.0, + "step": 8230 + }, + { + "entropy": 1.7072460353374481, + "epoch": 0.9042322375106424, + "grad_norm": 0.6020426154136658, + "learning_rate": 1.282299251116365e-05, + "loss": 1.4243, + "mean_token_accuracy": 0.6609440296888351, + "num_tokens": 1379263910.0, + "step": 8231 + }, + { + "entropy": 1.684839407602946, + "epoch": 0.9043420944220153, + "grad_norm": 0.6894782185554504, + "learning_rate": 1.2821424543387847e-05, + "loss": 1.2821, + "mean_token_accuracy": 0.6726427723964056, + "num_tokens": 1379417480.0, + "step": 8232 + }, + { + "entropy": 1.6503340899944305, + "epoch": 0.9044519513333883, + "grad_norm": 0.5822688937187195, + "learning_rate": 1.281985651796478e-05, + "loss": 1.4296, + "mean_token_accuracy": 0.649740070104599, + "num_tokens": 1379665136.0, + "step": 8233 + }, + { + "entropy": 1.6690570612748463, + "epoch": 0.9045618082447612, + "grad_norm": 0.6308638453483582, + "learning_rate": 1.2818288434944072e-05, + "loss": 1.3888, + "mean_token_accuracy": 0.6529064277807871, + "num_tokens": 1379883176.0, + "step": 8234 + }, + { + "entropy": 1.7349829475084941, + "epoch": 0.9046716651561342, + "grad_norm": 0.6335077881813049, + "learning_rate": 1.2816720294375356e-05, + "loss": 1.5175, + "mean_token_accuracy": 0.6491954425970713, + "num_tokens": 1380137225.0, + "step": 8235 + }, + { + "entropy": 1.754395325978597, + "epoch": 0.9047815220675071, + "grad_norm": 0.5818184018135071, + "learning_rate": 1.281515209630826e-05, + "loss": 1.4973, + "mean_token_accuracy": 0.6262113898992538, + "num_tokens": 1380356095.0, + "step": 8236 + }, + { + "entropy": 1.7043922344843547, + "epoch": 0.9048913789788801, + "grad_norm": 0.6699923276901245, + "learning_rate": 1.281358384079242e-05, + "loss": 1.4521, + "mean_token_accuracy": 0.6527075817187628, + "num_tokens": 1380514717.0, + "step": 8237 + }, + { + "entropy": 1.690778245528539, + "epoch": 0.9050012358902529, + "grad_norm": 0.6731590628623962, + "learning_rate": 1.2812015527877468e-05, + "loss": 1.3295, + "mean_token_accuracy": 0.6692969848712286, + "num_tokens": 1380638078.0, + "step": 8238 + }, + { + "entropy": 1.7137603163719177, + "epoch": 0.9051110928016258, + "grad_norm": 0.9683634042739868, + "learning_rate": 1.281044715761304e-05, + "loss": 1.0955, + "mean_token_accuracy": 0.6890260974566141, + "num_tokens": 1380809601.0, + "step": 8239 + }, + { + "entropy": 1.709503750006358, + "epoch": 0.9052209497129988, + "grad_norm": 0.6515042781829834, + "learning_rate": 1.2808878730048776e-05, + "loss": 1.3787, + "mean_token_accuracy": 0.6661019821961721, + "num_tokens": 1380961836.0, + "step": 8240 + }, + { + "entropy": 1.7124955157438915, + "epoch": 0.9053308066243717, + "grad_norm": 0.6768700480461121, + "learning_rate": 1.2807310245234315e-05, + "loss": 1.3232, + "mean_token_accuracy": 0.6700140833854675, + "num_tokens": 1381120067.0, + "step": 8241 + }, + { + "entropy": 1.7447414994239807, + "epoch": 0.9054406635357447, + "grad_norm": 0.6183107495307922, + "learning_rate": 1.2805741703219298e-05, + "loss": 1.394, + "mean_token_accuracy": 0.6493855814139048, + "num_tokens": 1381295366.0, + "step": 8242 + }, + { + "entropy": 1.6751858790715535, + "epoch": 0.9055505204471176, + "grad_norm": 0.7140679359436035, + "learning_rate": 1.280417310405337e-05, + "loss": 1.2097, + "mean_token_accuracy": 0.678791751464208, + "num_tokens": 1381414709.0, + "step": 8243 + }, + { + "entropy": 1.7946178317070007, + "epoch": 0.9056603773584906, + "grad_norm": 0.7667945027351379, + "learning_rate": 1.280260444778618e-05, + "loss": 1.6662, + "mean_token_accuracy": 0.6264889935652415, + "num_tokens": 1381601348.0, + "step": 8244 + }, + { + "entropy": 1.733892410993576, + "epoch": 0.9057702342698635, + "grad_norm": 5.535741806030273, + "learning_rate": 1.2801035734467367e-05, + "loss": 1.5474, + "mean_token_accuracy": 0.6549767504135767, + "num_tokens": 1381785359.0, + "step": 8245 + }, + { + "entropy": 1.6751560469468434, + "epoch": 0.9058800911812365, + "grad_norm": 0.6716073751449585, + "learning_rate": 1.2799466964146588e-05, + "loss": 1.2718, + "mean_token_accuracy": 0.6745875130097071, + "num_tokens": 1381912751.0, + "step": 8246 + }, + { + "entropy": 1.6946379244327545, + "epoch": 0.9059899480926094, + "grad_norm": 0.7089009284973145, + "learning_rate": 1.2797898136873488e-05, + "loss": 1.3159, + "mean_token_accuracy": 0.6636594186226527, + "num_tokens": 1382073482.0, + "step": 8247 + }, + { + "entropy": 1.6982887486616771, + "epoch": 0.9060998050039824, + "grad_norm": 0.7040889263153076, + "learning_rate": 1.2796329252697723e-05, + "loss": 1.4122, + "mean_token_accuracy": 0.6548338035742441, + "num_tokens": 1382261132.0, + "step": 8248 + }, + { + "entropy": 1.7026695410410564, + "epoch": 0.9062096619153552, + "grad_norm": 0.6209987998008728, + "learning_rate": 1.2794760311668946e-05, + "loss": 1.4881, + "mean_token_accuracy": 0.6293542782465616, + "num_tokens": 1382448970.0, + "step": 8249 + }, + { + "entropy": 1.7288841704527538, + "epoch": 0.9063195188267282, + "grad_norm": 0.6209704875946045, + "learning_rate": 1.2793191313836815e-05, + "loss": 1.3709, + "mean_token_accuracy": 0.6561005115509033, + "num_tokens": 1382661028.0, + "step": 8250 + }, + { + "entropy": 1.6416561702887218, + "epoch": 0.9064293757381011, + "grad_norm": 0.8076834678649902, + "learning_rate": 1.2791622259250986e-05, + "loss": 1.2379, + "mean_token_accuracy": 0.6792215506235758, + "num_tokens": 1382780205.0, + "step": 8251 + }, + { + "entropy": 1.6922811170419056, + "epoch": 0.9065392326494741, + "grad_norm": 0.5980085730552673, + "learning_rate": 1.2790053147961119e-05, + "loss": 1.3521, + "mean_token_accuracy": 0.6647952993710836, + "num_tokens": 1382952918.0, + "step": 8252 + }, + { + "entropy": 1.6755750874678295, + "epoch": 0.906649089560847, + "grad_norm": 0.7372617721557617, + "learning_rate": 1.2788483980016878e-05, + "loss": 1.367, + "mean_token_accuracy": 0.6559625367323557, + "num_tokens": 1383126488.0, + "step": 8253 + }, + { + "entropy": 1.7112425963083904, + "epoch": 0.9067589464722199, + "grad_norm": 0.634032666683197, + "learning_rate": 1.2786914755467924e-05, + "loss": 1.4346, + "mean_token_accuracy": 0.6412641257047653, + "num_tokens": 1383322709.0, + "step": 8254 + }, + { + "entropy": 1.6903300682703655, + "epoch": 0.9068688033835929, + "grad_norm": 0.6346539855003357, + "learning_rate": 1.2785345474363922e-05, + "loss": 1.457, + "mean_token_accuracy": 0.6424979070822397, + "num_tokens": 1383520924.0, + "step": 8255 + }, + { + "entropy": 1.7225427826245625, + "epoch": 0.9069786602949658, + "grad_norm": 0.8833540678024292, + "learning_rate": 1.2783776136754544e-05, + "loss": 1.1687, + "mean_token_accuracy": 0.690707857410113, + "num_tokens": 1383642044.0, + "step": 8256 + }, + { + "entropy": 1.660976231098175, + "epoch": 0.9070885172063388, + "grad_norm": 0.6287848353385925, + "learning_rate": 1.2782206742689453e-05, + "loss": 1.434, + "mean_token_accuracy": 0.6516825159390768, + "num_tokens": 1383861090.0, + "step": 8257 + }, + { + "entropy": 1.784266173839569, + "epoch": 0.9071983741177116, + "grad_norm": 0.7297434210777283, + "learning_rate": 1.278063729221832e-05, + "loss": 1.4181, + "mean_token_accuracy": 0.6553197354078293, + "num_tokens": 1384011860.0, + "step": 8258 + }, + { + "entropy": 1.704436033964157, + "epoch": 0.9073082310290846, + "grad_norm": 0.7265962362289429, + "learning_rate": 1.2779067785390822e-05, + "loss": 1.5012, + "mean_token_accuracy": 0.6423581590255102, + "num_tokens": 1384221197.0, + "step": 8259 + }, + { + "entropy": 1.7151046693325043, + "epoch": 0.9074180879404575, + "grad_norm": 0.6915941834449768, + "learning_rate": 1.277749822225663e-05, + "loss": 1.2919, + "mean_token_accuracy": 0.6690166046222051, + "num_tokens": 1384407824.0, + "step": 8260 + }, + { + "entropy": 1.71536985039711, + "epoch": 0.9075279448518305, + "grad_norm": 0.6239339709281921, + "learning_rate": 1.2775928602865418e-05, + "loss": 1.4164, + "mean_token_accuracy": 0.646524965763092, + "num_tokens": 1384587139.0, + "step": 8261 + }, + { + "entropy": 1.7479670147101085, + "epoch": 0.9076378017632034, + "grad_norm": 0.6657982468605042, + "learning_rate": 1.2774358927266869e-05, + "loss": 1.399, + "mean_token_accuracy": 0.6403181304534277, + "num_tokens": 1384776964.0, + "step": 8262 + }, + { + "entropy": 1.6458578010400136, + "epoch": 0.9077476586745764, + "grad_norm": 0.6543890833854675, + "learning_rate": 1.2772789195510658e-05, + "loss": 1.382, + "mean_token_accuracy": 0.6550763497749964, + "num_tokens": 1384967491.0, + "step": 8263 + }, + { + "entropy": 1.7436320980389912, + "epoch": 0.9078575155859493, + "grad_norm": 0.683189332485199, + "learning_rate": 1.2771219407646465e-05, + "loss": 1.4977, + "mean_token_accuracy": 0.6435463974873225, + "num_tokens": 1385186079.0, + "step": 8264 + }, + { + "entropy": 1.6882221698760986, + "epoch": 0.9079673724973223, + "grad_norm": 0.9108843803405762, + "learning_rate": 1.2769649563723979e-05, + "loss": 1.3097, + "mean_token_accuracy": 0.6750803043444952, + "num_tokens": 1385311026.0, + "step": 8265 + }, + { + "entropy": 1.7177577217419941, + "epoch": 0.9080772294086952, + "grad_norm": 0.681003987789154, + "learning_rate": 1.276807966379288e-05, + "loss": 1.2956, + "mean_token_accuracy": 0.6633460720380148, + "num_tokens": 1385507602.0, + "step": 8266 + }, + { + "entropy": 1.7337321539719899, + "epoch": 0.9081870863200681, + "grad_norm": 27.118099212646484, + "learning_rate": 1.2766509707902856e-05, + "loss": 1.382, + "mean_token_accuracy": 0.6552288780609766, + "num_tokens": 1385687771.0, + "step": 8267 + }, + { + "entropy": 1.7292255461215973, + "epoch": 0.9082969432314411, + "grad_norm": 0.722960352897644, + "learning_rate": 1.27649396961036e-05, + "loss": 1.4267, + "mean_token_accuracy": 0.6583873132864634, + "num_tokens": 1385851761.0, + "step": 8268 + }, + { + "entropy": 1.6824017763137817, + "epoch": 0.9084068001428139, + "grad_norm": 0.74224454164505, + "learning_rate": 1.2763369628444793e-05, + "loss": 1.4162, + "mean_token_accuracy": 0.6561558942000071, + "num_tokens": 1385995094.0, + "step": 8269 + }, + { + "entropy": 1.774072657028834, + "epoch": 0.9085166570541869, + "grad_norm": 0.6898522973060608, + "learning_rate": 1.2761799504976133e-05, + "loss": 1.6522, + "mean_token_accuracy": 0.6376588419079781, + "num_tokens": 1386198262.0, + "step": 8270 + }, + { + "entropy": 1.7188159724076588, + "epoch": 0.9086265139655598, + "grad_norm": 0.6473353505134583, + "learning_rate": 1.2760229325747316e-05, + "loss": 1.3344, + "mean_token_accuracy": 0.6641741444667181, + "num_tokens": 1386372553.0, + "step": 8271 + }, + { + "entropy": 1.6474638481934865, + "epoch": 0.9087363708769328, + "grad_norm": 0.6398204565048218, + "learning_rate": 1.2758659090808032e-05, + "loss": 1.4653, + "mean_token_accuracy": 0.6392107456922531, + "num_tokens": 1386547723.0, + "step": 8272 + }, + { + "entropy": 1.7262985209623973, + "epoch": 0.9088462277883057, + "grad_norm": 0.7074971199035645, + "learning_rate": 1.2757088800207977e-05, + "loss": 1.4144, + "mean_token_accuracy": 0.6551804691553116, + "num_tokens": 1386692276.0, + "step": 8273 + }, + { + "entropy": 1.7006110846996307, + "epoch": 0.9089560846996787, + "grad_norm": 0.7677414417266846, + "learning_rate": 1.275551845399686e-05, + "loss": 1.3142, + "mean_token_accuracy": 0.6622739533583323, + "num_tokens": 1386841086.0, + "step": 8274 + }, + { + "entropy": 1.727352688709895, + "epoch": 0.9090659416110516, + "grad_norm": 0.660779595375061, + "learning_rate": 1.275394805222437e-05, + "loss": 1.3278, + "mean_token_accuracy": 0.6708957056204478, + "num_tokens": 1386990369.0, + "step": 8275 + }, + { + "entropy": 1.6799138486385345, + "epoch": 0.9091757985224246, + "grad_norm": 0.6254904270172119, + "learning_rate": 1.2752377594940215e-05, + "loss": 1.337, + "mean_token_accuracy": 0.6697245140870413, + "num_tokens": 1387149561.0, + "step": 8276 + }, + { + "entropy": 1.7286913692951202, + "epoch": 0.9092856554337975, + "grad_norm": 0.6610144376754761, + "learning_rate": 1.27508070821941e-05, + "loss": 1.3642, + "mean_token_accuracy": 0.6627133886019388, + "num_tokens": 1387344643.0, + "step": 8277 + }, + { + "entropy": 1.764141748348872, + "epoch": 0.9093955123451705, + "grad_norm": 0.7539360523223877, + "learning_rate": 1.2749236514035727e-05, + "loss": 1.3591, + "mean_token_accuracy": 0.6495264520247778, + "num_tokens": 1387478751.0, + "step": 8278 + }, + { + "entropy": 1.7027659912904103, + "epoch": 0.9095053692565434, + "grad_norm": 0.6264234185218811, + "learning_rate": 1.2747665890514808e-05, + "loss": 1.3784, + "mean_token_accuracy": 0.6494586914777756, + "num_tokens": 1387632284.0, + "step": 8279 + }, + { + "entropy": 1.7547483344872792, + "epoch": 0.9096152261679162, + "grad_norm": 0.8345460295677185, + "learning_rate": 1.2746095211681053e-05, + "loss": 1.3302, + "mean_token_accuracy": 0.6681412657101949, + "num_tokens": 1387757487.0, + "step": 8280 + }, + { + "entropy": 1.6923251152038574, + "epoch": 0.9097250830792892, + "grad_norm": 0.6497990489006042, + "learning_rate": 1.2744524477584171e-05, + "loss": 1.4038, + "mean_token_accuracy": 0.6624845961729685, + "num_tokens": 1387931911.0, + "step": 8281 + }, + { + "entropy": 1.682264655828476, + "epoch": 0.9098349399906621, + "grad_norm": 1.6480847597122192, + "learning_rate": 1.2742953688273877e-05, + "loss": 1.2411, + "mean_token_accuracy": 0.6727783133586248, + "num_tokens": 1388125678.0, + "step": 8282 + }, + { + "entropy": 1.6904015044371288, + "epoch": 0.9099447969020351, + "grad_norm": 0.67786705493927, + "learning_rate": 1.2741382843799879e-05, + "loss": 1.3375, + "mean_token_accuracy": 0.6675763030846914, + "num_tokens": 1388267967.0, + "step": 8283 + }, + { + "entropy": 1.7203630308310192, + "epoch": 0.910054653813408, + "grad_norm": 0.7089915871620178, + "learning_rate": 1.2739811944211902e-05, + "loss": 1.5562, + "mean_token_accuracy": 0.6400385747353236, + "num_tokens": 1388461097.0, + "step": 8284 + }, + { + "entropy": 1.6640840868155162, + "epoch": 0.910164510724781, + "grad_norm": 0.7004643678665161, + "learning_rate": 1.273824098955966e-05, + "loss": 1.2948, + "mean_token_accuracy": 0.6744209975004196, + "num_tokens": 1388587063.0, + "step": 8285 + }, + { + "entropy": 1.7536252836386363, + "epoch": 0.9102743676361539, + "grad_norm": 0.7096135020256042, + "learning_rate": 1.2736669979892874e-05, + "loss": 1.5139, + "mean_token_accuracy": 0.6507594784100851, + "num_tokens": 1388788368.0, + "step": 8286 + }, + { + "entropy": 1.7375941177209218, + "epoch": 0.9103842245475269, + "grad_norm": 0.6821257472038269, + "learning_rate": 1.2735098915261264e-05, + "loss": 1.348, + "mean_token_accuracy": 0.6525353888670603, + "num_tokens": 1388976824.0, + "step": 8287 + }, + { + "entropy": 1.7107720772425334, + "epoch": 0.9104940814588998, + "grad_norm": 0.717570960521698, + "learning_rate": 1.2733527795714558e-05, + "loss": 1.4769, + "mean_token_accuracy": 0.6409533818562826, + "num_tokens": 1389192824.0, + "step": 8288 + }, + { + "entropy": 1.7379231850306194, + "epoch": 0.9106039383702728, + "grad_norm": 0.8190452456474304, + "learning_rate": 1.2731956621302477e-05, + "loss": 1.4556, + "mean_token_accuracy": 0.6462369511524836, + "num_tokens": 1389373761.0, + "step": 8289 + }, + { + "entropy": 1.7158324718475342, + "epoch": 0.9107137952816456, + "grad_norm": 0.7872406840324402, + "learning_rate": 1.2730385392074751e-05, + "loss": 1.63, + "mean_token_accuracy": 0.6238044649362564, + "num_tokens": 1389557573.0, + "step": 8290 + }, + { + "entropy": 1.7453622718652089, + "epoch": 0.9108236521930186, + "grad_norm": 0.7404714226722717, + "learning_rate": 1.2728814108081105e-05, + "loss": 1.2521, + "mean_token_accuracy": 0.6764761656522751, + "num_tokens": 1389670922.0, + "step": 8291 + }, + { + "entropy": 1.7301382223765056, + "epoch": 0.9109335091043915, + "grad_norm": 0.6327300071716309, + "learning_rate": 1.272724276937127e-05, + "loss": 1.3361, + "mean_token_accuracy": 0.6668089230855306, + "num_tokens": 1389851239.0, + "step": 8292 + }, + { + "entropy": 1.719651500384013, + "epoch": 0.9110433660157645, + "grad_norm": 0.564914882183075, + "learning_rate": 1.2725671375994984e-05, + "loss": 1.4253, + "mean_token_accuracy": 0.6573879073063532, + "num_tokens": 1390084857.0, + "step": 8293 + }, + { + "entropy": 1.695727248986562, + "epoch": 0.9111532229271374, + "grad_norm": 0.6507130861282349, + "learning_rate": 1.2724099928001977e-05, + "loss": 1.3804, + "mean_token_accuracy": 0.6489260047674179, + "num_tokens": 1390242291.0, + "step": 8294 + }, + { + "entropy": 1.7323060234387715, + "epoch": 0.9112630798385103, + "grad_norm": 0.6349548101425171, + "learning_rate": 1.2722528425441978e-05, + "loss": 1.4971, + "mean_token_accuracy": 0.6366077115138372, + "num_tokens": 1390456782.0, + "step": 8295 + }, + { + "entropy": 1.6607350210348766, + "epoch": 0.9113729367498833, + "grad_norm": 0.6189599633216858, + "learning_rate": 1.2720956868364737e-05, + "loss": 1.3644, + "mean_token_accuracy": 0.6537392934163412, + "num_tokens": 1390624854.0, + "step": 8296 + }, + { + "entropy": 1.676759531100591, + "epoch": 0.9114827936612562, + "grad_norm": 0.682950496673584, + "learning_rate": 1.2719385256819983e-05, + "loss": 1.2863, + "mean_token_accuracy": 0.6645957181851069, + "num_tokens": 1390765067.0, + "step": 8297 + }, + { + "entropy": 1.7330308457215626, + "epoch": 0.9115926505726292, + "grad_norm": 0.7536049485206604, + "learning_rate": 1.2717813590857462e-05, + "loss": 1.4905, + "mean_token_accuracy": 0.6402916212876638, + "num_tokens": 1390972898.0, + "step": 8298 + }, + { + "entropy": 1.706084320942561, + "epoch": 0.9117025074840021, + "grad_norm": 0.61712247133255, + "learning_rate": 1.2716241870526913e-05, + "loss": 1.412, + "mean_token_accuracy": 0.6601520677407583, + "num_tokens": 1391173481.0, + "step": 8299 + }, + { + "entropy": 1.7238514224688213, + "epoch": 0.911812364395375, + "grad_norm": 0.7173047661781311, + "learning_rate": 1.2714670095878085e-05, + "loss": 1.3898, + "mean_token_accuracy": 0.6515658646821976, + "num_tokens": 1391342776.0, + "step": 8300 + }, + { + "entropy": 1.6747208436330159, + "epoch": 0.9119222213067479, + "grad_norm": 0.6746057868003845, + "learning_rate": 1.2713098266960717e-05, + "loss": 1.4247, + "mean_token_accuracy": 0.6679030358791351, + "num_tokens": 1391480315.0, + "step": 8301 + }, + { + "entropy": 1.70952441294988, + "epoch": 0.9120320782181209, + "grad_norm": 0.6282344460487366, + "learning_rate": 1.2711526383824567e-05, + "loss": 1.3414, + "mean_token_accuracy": 0.6575956245263418, + "num_tokens": 1391643523.0, + "step": 8302 + }, + { + "entropy": 1.7618720829486847, + "epoch": 0.9121419351294938, + "grad_norm": 0.6925609707832336, + "learning_rate": 1.2709954446519372e-05, + "loss": 1.3791, + "mean_token_accuracy": 0.6538346409797668, + "num_tokens": 1391809693.0, + "step": 8303 + }, + { + "entropy": 1.7483843763669331, + "epoch": 0.9122517920408668, + "grad_norm": 0.807517945766449, + "learning_rate": 1.2708382455094893e-05, + "loss": 1.4168, + "mean_token_accuracy": 0.6449888050556183, + "num_tokens": 1391968941.0, + "step": 8304 + }, + { + "entropy": 1.6495538353919983, + "epoch": 0.9123616489522397, + "grad_norm": 0.5905596017837524, + "learning_rate": 1.2706810409600877e-05, + "loss": 1.4587, + "mean_token_accuracy": 0.6530623485644659, + "num_tokens": 1392185953.0, + "step": 8305 + }, + { + "entropy": 1.638418326775233, + "epoch": 0.9124715058636127, + "grad_norm": 0.7588011622428894, + "learning_rate": 1.2705238310087082e-05, + "loss": 1.4184, + "mean_token_accuracy": 0.659953753153483, + "num_tokens": 1392375019.0, + "step": 8306 + }, + { + "entropy": 1.7082325716813405, + "epoch": 0.9125813627749856, + "grad_norm": 0.586554229259491, + "learning_rate": 1.270366615660326e-05, + "loss": 1.4373, + "mean_token_accuracy": 0.6499194204807281, + "num_tokens": 1392615284.0, + "step": 8307 + }, + { + "entropy": 1.7017434040705364, + "epoch": 0.9126912196863585, + "grad_norm": 0.7278909683227539, + "learning_rate": 1.2702093949199177e-05, + "loss": 1.4675, + "mean_token_accuracy": 0.6527448892593384, + "num_tokens": 1392790159.0, + "step": 8308 + }, + { + "entropy": 1.6655905544757843, + "epoch": 0.9128010765977315, + "grad_norm": 1.0901970863342285, + "learning_rate": 1.2700521687924583e-05, + "loss": 1.3658, + "mean_token_accuracy": 0.6630384723345438, + "num_tokens": 1392957262.0, + "step": 8309 + }, + { + "entropy": 1.7326476275920868, + "epoch": 0.9129109335091043, + "grad_norm": 0.6388905048370361, + "learning_rate": 1.2698949372829248e-05, + "loss": 1.3687, + "mean_token_accuracy": 0.6470983376105627, + "num_tokens": 1393116396.0, + "step": 8310 + }, + { + "entropy": 1.6326172451178234, + "epoch": 0.9130207904204773, + "grad_norm": 0.7294114828109741, + "learning_rate": 1.2697377003962925e-05, + "loss": 1.3318, + "mean_token_accuracy": 0.6786777973175049, + "num_tokens": 1393326810.0, + "step": 8311 + }, + { + "entropy": 1.6858830153942108, + "epoch": 0.9131306473318502, + "grad_norm": 0.7388503551483154, + "learning_rate": 1.269580458137539e-05, + "loss": 1.4234, + "mean_token_accuracy": 0.6717520157496134, + "num_tokens": 1393522621.0, + "step": 8312 + }, + { + "entropy": 1.7482396562894185, + "epoch": 0.9132405042432232, + "grad_norm": 0.9013400077819824, + "learning_rate": 1.2694232105116401e-05, + "loss": 1.5036, + "mean_token_accuracy": 0.6550756047169367, + "num_tokens": 1393690191.0, + "step": 8313 + }, + { + "entropy": 1.7518030802408855, + "epoch": 0.9133503611545961, + "grad_norm": 0.6710026264190674, + "learning_rate": 1.269265957523573e-05, + "loss": 1.6336, + "mean_token_accuracy": 0.629910779496034, + "num_tokens": 1393926208.0, + "step": 8314 + }, + { + "entropy": 1.7153498927752178, + "epoch": 0.9134602180659691, + "grad_norm": 0.6707825064659119, + "learning_rate": 1.2691086991783147e-05, + "loss": 1.6033, + "mean_token_accuracy": 0.6374608427286148, + "num_tokens": 1394147685.0, + "step": 8315 + }, + { + "entropy": 1.7012030879656475, + "epoch": 0.913570074977342, + "grad_norm": 0.6023955345153809, + "learning_rate": 1.2689514354808425e-05, + "loss": 1.4334, + "mean_token_accuracy": 0.6374744673569998, + "num_tokens": 1394333296.0, + "step": 8316 + }, + { + "entropy": 1.7281060020128887, + "epoch": 0.913679931888715, + "grad_norm": 0.714945912361145, + "learning_rate": 1.268794166436133e-05, + "loss": 1.5645, + "mean_token_accuracy": 0.6323638061682383, + "num_tokens": 1394552789.0, + "step": 8317 + }, + { + "entropy": 1.6209270258744557, + "epoch": 0.9137897888000879, + "grad_norm": 0.6666091084480286, + "learning_rate": 1.2686368920491648e-05, + "loss": 1.2327, + "mean_token_accuracy": 0.6780135631561279, + "num_tokens": 1394674019.0, + "step": 8318 + }, + { + "entropy": 1.736515998840332, + "epoch": 0.9138996457114609, + "grad_norm": 0.7664490938186646, + "learning_rate": 1.2684796123249145e-05, + "loss": 1.3157, + "mean_token_accuracy": 0.6623199184735616, + "num_tokens": 1394823043.0, + "step": 8319 + }, + { + "entropy": 1.6709323624769847, + "epoch": 0.9140095026228338, + "grad_norm": 0.6247878670692444, + "learning_rate": 1.2683223272683604e-05, + "loss": 1.3596, + "mean_token_accuracy": 0.6630469262599945, + "num_tokens": 1395012780.0, + "step": 8320 + }, + { + "entropy": 1.7542273203531902, + "epoch": 0.9141193595342066, + "grad_norm": 0.6201014518737793, + "learning_rate": 1.2681650368844804e-05, + "loss": 1.5317, + "mean_token_accuracy": 0.6245150317748388, + "num_tokens": 1395180061.0, + "step": 8321 + }, + { + "entropy": 1.7659225364526112, + "epoch": 0.9142292164455796, + "grad_norm": 0.8412492275238037, + "learning_rate": 1.2680077411782533e-05, + "loss": 1.4629, + "mean_token_accuracy": 0.6547584036986033, + "num_tokens": 1395322290.0, + "step": 8322 + }, + { + "entropy": 1.7514819204807281, + "epoch": 0.9143390733569525, + "grad_norm": 0.6405079960823059, + "learning_rate": 1.2678504401546563e-05, + "loss": 1.4131, + "mean_token_accuracy": 0.6564811915159225, + "num_tokens": 1395491553.0, + "step": 8323 + }, + { + "entropy": 1.692853420972824, + "epoch": 0.9144489302683255, + "grad_norm": 0.7179692387580872, + "learning_rate": 1.2676931338186688e-05, + "loss": 1.3103, + "mean_token_accuracy": 0.6628242333730062, + "num_tokens": 1395638800.0, + "step": 8324 + }, + { + "entropy": 1.7319222788016002, + "epoch": 0.9145587871796984, + "grad_norm": 0.621341347694397, + "learning_rate": 1.2675358221752691e-05, + "loss": 1.3553, + "mean_token_accuracy": 0.6674741456906, + "num_tokens": 1395825432.0, + "step": 8325 + }, + { + "entropy": 1.7467030783494313, + "epoch": 0.9146686440910714, + "grad_norm": 0.712310791015625, + "learning_rate": 1.2673785052294364e-05, + "loss": 1.4939, + "mean_token_accuracy": 0.6631985902786255, + "num_tokens": 1395995766.0, + "step": 8326 + }, + { + "entropy": 1.6573481957117717, + "epoch": 0.9147785010024443, + "grad_norm": 0.6593137979507446, + "learning_rate": 1.267221182986149e-05, + "loss": 1.3545, + "mean_token_accuracy": 0.6750344733397166, + "num_tokens": 1396168861.0, + "step": 8327 + }, + { + "entropy": 1.6669391791025798, + "epoch": 0.9148883579138173, + "grad_norm": 0.58598792552948, + "learning_rate": 1.2670638554503867e-05, + "loss": 1.3189, + "mean_token_accuracy": 0.6607841104269028, + "num_tokens": 1396310664.0, + "step": 8328 + }, + { + "entropy": 1.7750846942265828, + "epoch": 0.9149982148251902, + "grad_norm": 0.6623988747596741, + "learning_rate": 1.2669065226271284e-05, + "loss": 1.4966, + "mean_token_accuracy": 0.6353505253791809, + "num_tokens": 1396515959.0, + "step": 8329 + }, + { + "entropy": 1.6843581199645996, + "epoch": 0.9151080717365632, + "grad_norm": 0.6799822449684143, + "learning_rate": 1.2667491845213545e-05, + "loss": 1.4068, + "mean_token_accuracy": 0.6600462645292282, + "num_tokens": 1396680133.0, + "step": 8330 + }, + { + "entropy": 1.6392850279808044, + "epoch": 0.915217928647936, + "grad_norm": 0.5755462050437927, + "learning_rate": 1.2665918411380434e-05, + "loss": 1.38, + "mean_token_accuracy": 0.6573647956053416, + "num_tokens": 1396852839.0, + "step": 8331 + }, + { + "entropy": 1.6700959205627441, + "epoch": 0.915327785559309, + "grad_norm": 0.6844523549079895, + "learning_rate": 1.2664344924821758e-05, + "loss": 1.4268, + "mean_token_accuracy": 0.6432247956593832, + "num_tokens": 1397067247.0, + "step": 8332 + }, + { + "entropy": 1.8182924290498097, + "epoch": 0.9154376424706819, + "grad_norm": 0.6894211173057556, + "learning_rate": 1.2662771385587317e-05, + "loss": 1.3992, + "mean_token_accuracy": 0.649507686495781, + "num_tokens": 1397187287.0, + "step": 8333 + }, + { + "entropy": 1.7245789070924122, + "epoch": 0.9155474993820548, + "grad_norm": 0.7190035581588745, + "learning_rate": 1.266119779372691e-05, + "loss": 1.5355, + "mean_token_accuracy": 0.6330128163099289, + "num_tokens": 1397345435.0, + "step": 8334 + }, + { + "entropy": 1.71918981273969, + "epoch": 0.9156573562934278, + "grad_norm": 0.782472550868988, + "learning_rate": 1.2659624149290337e-05, + "loss": 1.4884, + "mean_token_accuracy": 0.6548773149649302, + "num_tokens": 1397524068.0, + "step": 8335 + }, + { + "entropy": 1.7045840620994568, + "epoch": 0.9157672132048007, + "grad_norm": 0.6515925526618958, + "learning_rate": 1.2658050452327415e-05, + "loss": 1.3979, + "mean_token_accuracy": 0.6536910384893417, + "num_tokens": 1397679482.0, + "step": 8336 + }, + { + "entropy": 1.6657692591349285, + "epoch": 0.9158770701161737, + "grad_norm": 0.6342126131057739, + "learning_rate": 1.2656476702887939e-05, + "loss": 1.3015, + "mean_token_accuracy": 0.6769297222296397, + "num_tokens": 1397858550.0, + "step": 8337 + }, + { + "entropy": 1.7451824148495991, + "epoch": 0.9159869270275466, + "grad_norm": 0.7401023507118225, + "learning_rate": 1.2654902901021725e-05, + "loss": 1.2909, + "mean_token_accuracy": 0.6768914808829626, + "num_tokens": 1397963039.0, + "step": 8338 + }, + { + "entropy": 1.7285672624905903, + "epoch": 0.9160967839389196, + "grad_norm": 0.727428138256073, + "learning_rate": 1.2653329046778576e-05, + "loss": 1.4568, + "mean_token_accuracy": 0.665610060095787, + "num_tokens": 1398148299.0, + "step": 8339 + }, + { + "entropy": 1.7707529664039612, + "epoch": 0.9162066408502925, + "grad_norm": 0.8654743432998657, + "learning_rate": 1.265175514020831e-05, + "loss": 1.3726, + "mean_token_accuracy": 0.6515340109666189, + "num_tokens": 1398290632.0, + "step": 8340 + }, + { + "entropy": 1.7380510866641998, + "epoch": 0.9163164977616655, + "grad_norm": 0.6711382269859314, + "learning_rate": 1.2650181181360734e-05, + "loss": 1.3634, + "mean_token_accuracy": 0.6581040819485983, + "num_tokens": 1398430925.0, + "step": 8341 + }, + { + "entropy": 1.734023739894231, + "epoch": 0.9164263546730383, + "grad_norm": 0.6661319732666016, + "learning_rate": 1.2648607170285671e-05, + "loss": 1.3642, + "mean_token_accuracy": 0.659072607755661, + "num_tokens": 1398612350.0, + "step": 8342 + }, + { + "entropy": 1.70571368932724, + "epoch": 0.9165362115844113, + "grad_norm": 0.6126469373703003, + "learning_rate": 1.2647033107032936e-05, + "loss": 1.5186, + "mean_token_accuracy": 0.6387844830751419, + "num_tokens": 1398842688.0, + "step": 8343 + }, + { + "entropy": 1.6804417272408803, + "epoch": 0.9166460684957842, + "grad_norm": 0.5771108865737915, + "learning_rate": 1.2645458991652342e-05, + "loss": 1.2489, + "mean_token_accuracy": 0.6804888198773066, + "num_tokens": 1398998228.0, + "step": 8344 + }, + { + "entropy": 1.741776704788208, + "epoch": 0.9167559254071572, + "grad_norm": 0.6767376065254211, + "learning_rate": 1.264388482419371e-05, + "loss": 1.3365, + "mean_token_accuracy": 0.6583043287197748, + "num_tokens": 1399121579.0, + "step": 8345 + }, + { + "entropy": 1.6522979438304901, + "epoch": 0.9168657823185301, + "grad_norm": 0.7042422294616699, + "learning_rate": 1.2642310604706868e-05, + "loss": 1.2792, + "mean_token_accuracy": 0.6725463817516962, + "num_tokens": 1399288987.0, + "step": 8346 + }, + { + "entropy": 1.7250507672627766, + "epoch": 0.9169756392299031, + "grad_norm": 0.5657103657722473, + "learning_rate": 1.2640736333241634e-05, + "loss": 1.4876, + "mean_token_accuracy": 0.6186676770448685, + "num_tokens": 1399528479.0, + "step": 8347 + }, + { + "entropy": 1.7386068999767303, + "epoch": 0.917085496141276, + "grad_norm": 0.638064444065094, + "learning_rate": 1.2639162009847836e-05, + "loss": 1.4766, + "mean_token_accuracy": 0.6609574755032858, + "num_tokens": 1399691992.0, + "step": 8348 + }, + { + "entropy": 1.7201377550760906, + "epoch": 0.9171953530526489, + "grad_norm": 0.7226953506469727, + "learning_rate": 1.2637587634575297e-05, + "loss": 1.4676, + "mean_token_accuracy": 0.6478322297334671, + "num_tokens": 1399958317.0, + "step": 8349 + }, + { + "entropy": 1.6978058218955994, + "epoch": 0.9173052099640219, + "grad_norm": 0.5818222761154175, + "learning_rate": 1.2636013207473849e-05, + "loss": 1.4523, + "mean_token_accuracy": 0.6427704046169916, + "num_tokens": 1400192788.0, + "step": 8350 + }, + { + "entropy": 1.6792748073736827, + "epoch": 0.9174150668753948, + "grad_norm": 0.5890861749649048, + "learning_rate": 1.2634438728593319e-05, + "loss": 1.3267, + "mean_token_accuracy": 0.6607906967401505, + "num_tokens": 1400378162.0, + "step": 8351 + }, + { + "entropy": 1.7638347347577412, + "epoch": 0.9175249237867678, + "grad_norm": 0.7139198184013367, + "learning_rate": 1.263286419798354e-05, + "loss": 1.3686, + "mean_token_accuracy": 0.6581098288297653, + "num_tokens": 1400548924.0, + "step": 8352 + }, + { + "entropy": 1.7151235540707905, + "epoch": 0.9176347806981406, + "grad_norm": 0.6000536680221558, + "learning_rate": 1.2631289615694347e-05, + "loss": 1.4582, + "mean_token_accuracy": 0.6504177699486414, + "num_tokens": 1400749113.0, + "step": 8353 + }, + { + "entropy": 1.727899005015691, + "epoch": 0.9177446376095136, + "grad_norm": 0.8415667414665222, + "learning_rate": 1.262971498177557e-05, + "loss": 1.3531, + "mean_token_accuracy": 0.6797670821348826, + "num_tokens": 1400926683.0, + "step": 8354 + }, + { + "entropy": 1.73043093085289, + "epoch": 0.9178544945208865, + "grad_norm": 0.7654650211334229, + "learning_rate": 1.2628140296277049e-05, + "loss": 1.5241, + "mean_token_accuracy": 0.6519339581330618, + "num_tokens": 1401120570.0, + "step": 8355 + }, + { + "entropy": 1.7113446791966755, + "epoch": 0.9179643514322595, + "grad_norm": 0.7105302810668945, + "learning_rate": 1.2626565559248622e-05, + "loss": 1.3434, + "mean_token_accuracy": 0.6649382462104162, + "num_tokens": 1401267560.0, + "step": 8356 + }, + { + "entropy": 1.7008299330870311, + "epoch": 0.9180742083436324, + "grad_norm": 0.5741628408432007, + "learning_rate": 1.2624990770740123e-05, + "loss": 1.3885, + "mean_token_accuracy": 0.6473373621702194, + "num_tokens": 1401462451.0, + "step": 8357 + }, + { + "entropy": 1.7151657938957214, + "epoch": 0.9181840652550054, + "grad_norm": 0.6269608736038208, + "learning_rate": 1.2623415930801405e-05, + "loss": 1.5253, + "mean_token_accuracy": 0.6376299858093262, + "num_tokens": 1401646187.0, + "step": 8358 + }, + { + "entropy": 1.7247498830159504, + "epoch": 0.9182939221663783, + "grad_norm": 0.7287866473197937, + "learning_rate": 1.2621841039482303e-05, + "loss": 1.2987, + "mean_token_accuracy": 0.6703514059384664, + "num_tokens": 1401771042.0, + "step": 8359 + }, + { + "entropy": 1.6538776357968648, + "epoch": 0.9184037790777513, + "grad_norm": 0.6443969011306763, + "learning_rate": 1.2620266096832663e-05, + "loss": 1.3982, + "mean_token_accuracy": 0.6559311151504517, + "num_tokens": 1401999604.0, + "step": 8360 + }, + { + "entropy": 1.7296237647533417, + "epoch": 0.9185136359891242, + "grad_norm": 0.8798031210899353, + "learning_rate": 1.261869110290233e-05, + "loss": 1.4636, + "mean_token_accuracy": 0.6591440141201019, + "num_tokens": 1402164177.0, + "step": 8361 + }, + { + "entropy": 1.712800492842992, + "epoch": 0.918623492900497, + "grad_norm": 0.7003832459449768, + "learning_rate": 1.2617116057741152e-05, + "loss": 1.3247, + "mean_token_accuracy": 0.6562914500633875, + "num_tokens": 1402297927.0, + "step": 8362 + }, + { + "entropy": 1.6861789226531982, + "epoch": 0.91873334981187, + "grad_norm": 0.6626542806625366, + "learning_rate": 1.261554096139898e-05, + "loss": 1.5779, + "mean_token_accuracy": 0.6270742913087209, + "num_tokens": 1402514465.0, + "step": 8363 + }, + { + "entropy": 1.7360760072867076, + "epoch": 0.9188432067232429, + "grad_norm": 0.8311833739280701, + "learning_rate": 1.2613965813925666e-05, + "loss": 1.3172, + "mean_token_accuracy": 0.6613505631685257, + "num_tokens": 1402618003.0, + "step": 8364 + }, + { + "entropy": 1.7023787597815196, + "epoch": 0.9189530636346159, + "grad_norm": 0.6417757868766785, + "learning_rate": 1.261239061537106e-05, + "loss": 1.3442, + "mean_token_accuracy": 0.6616579592227936, + "num_tokens": 1402818781.0, + "step": 8365 + }, + { + "entropy": 1.732607791821162, + "epoch": 0.9190629205459888, + "grad_norm": 0.6801357865333557, + "learning_rate": 1.261081536578502e-05, + "loss": 1.3579, + "mean_token_accuracy": 0.6601632038752238, + "num_tokens": 1402973852.0, + "step": 8366 + }, + { + "entropy": 1.6984934012095134, + "epoch": 0.9191727774573618, + "grad_norm": 0.6982681751251221, + "learning_rate": 1.2609240065217396e-05, + "loss": 1.4012, + "mean_token_accuracy": 0.6618342697620392, + "num_tokens": 1403131167.0, + "step": 8367 + }, + { + "entropy": 1.6751105586687725, + "epoch": 0.9192826343687347, + "grad_norm": 0.7063464522361755, + "learning_rate": 1.260766471371805e-05, + "loss": 1.256, + "mean_token_accuracy": 0.6785000711679459, + "num_tokens": 1403267829.0, + "step": 8368 + }, + { + "entropy": 1.7225077946980794, + "epoch": 0.9193924912801077, + "grad_norm": 0.6618905067443848, + "learning_rate": 1.260608931133684e-05, + "loss": 1.4058, + "mean_token_accuracy": 0.6538095970948538, + "num_tokens": 1403431242.0, + "step": 8369 + }, + { + "entropy": 1.6800328195095062, + "epoch": 0.9195023481914806, + "grad_norm": 0.6762068867683411, + "learning_rate": 1.2604513858123629e-05, + "loss": 1.4314, + "mean_token_accuracy": 0.6620854735374451, + "num_tokens": 1403605335.0, + "step": 8370 + }, + { + "entropy": 1.700233409802119, + "epoch": 0.9196122051028536, + "grad_norm": 0.6993805170059204, + "learning_rate": 1.2602938354128276e-05, + "loss": 1.5316, + "mean_token_accuracy": 0.6513668298721313, + "num_tokens": 1403787016.0, + "step": 8371 + }, + { + "entropy": 1.6806213955084484, + "epoch": 0.9197220620142265, + "grad_norm": 0.9308298826217651, + "learning_rate": 1.2601362799400648e-05, + "loss": 1.3191, + "mean_token_accuracy": 0.6591364145278931, + "num_tokens": 1403994148.0, + "step": 8372 + }, + { + "entropy": 1.6756692230701447, + "epoch": 0.9198319189255995, + "grad_norm": 0.6425023674964905, + "learning_rate": 1.2599787193990605e-05, + "loss": 1.2729, + "mean_token_accuracy": 0.6760301639636358, + "num_tokens": 1404117376.0, + "step": 8373 + }, + { + "entropy": 1.6797867914040883, + "epoch": 0.9199417758369723, + "grad_norm": 0.6142690777778625, + "learning_rate": 1.2598211537948022e-05, + "loss": 1.3994, + "mean_token_accuracy": 0.6605397313833237, + "num_tokens": 1404310009.0, + "step": 8374 + }, + { + "entropy": 1.6622991561889648, + "epoch": 0.9200516327483452, + "grad_norm": 0.7478998899459839, + "learning_rate": 1.2596635831322761e-05, + "loss": 1.5679, + "mean_token_accuracy": 0.6572754432757696, + "num_tokens": 1404484363.0, + "step": 8375 + }, + { + "entropy": 1.6799915730953217, + "epoch": 0.9201614896597182, + "grad_norm": 0.6949113607406616, + "learning_rate": 1.2595060074164698e-05, + "loss": 1.2796, + "mean_token_accuracy": 0.6649729063113531, + "num_tokens": 1404661640.0, + "step": 8376 + }, + { + "entropy": 1.7269630233446758, + "epoch": 0.9202713465710911, + "grad_norm": 0.7026051878929138, + "learning_rate": 1.2593484266523701e-05, + "loss": 1.3694, + "mean_token_accuracy": 0.6497256408135096, + "num_tokens": 1404821227.0, + "step": 8377 + }, + { + "entropy": 1.7466710011164348, + "epoch": 0.9203812034824641, + "grad_norm": 0.9665287733078003, + "learning_rate": 1.2591908408449647e-05, + "loss": 1.3825, + "mean_token_accuracy": 0.6634558041890463, + "num_tokens": 1404954222.0, + "step": 8378 + }, + { + "entropy": 1.6914934416611989, + "epoch": 0.920491060393837, + "grad_norm": 0.7472662925720215, + "learning_rate": 1.2590332499992406e-05, + "loss": 1.286, + "mean_token_accuracy": 0.672187735637029, + "num_tokens": 1405065197.0, + "step": 8379 + }, + { + "entropy": 1.6020830969015758, + "epoch": 0.92060091730521, + "grad_norm": 0.635012149810791, + "learning_rate": 1.2588756541201861e-05, + "loss": 1.2781, + "mean_token_accuracy": 0.6772213528553644, + "num_tokens": 1405219589.0, + "step": 8380 + }, + { + "entropy": 1.7316095232963562, + "epoch": 0.9207107742165829, + "grad_norm": 0.7221272587776184, + "learning_rate": 1.2587180532127886e-05, + "loss": 1.6001, + "mean_token_accuracy": 0.6357069065173467, + "num_tokens": 1405416476.0, + "step": 8381 + }, + { + "entropy": 1.7943975925445557, + "epoch": 0.9208206311279559, + "grad_norm": 0.77412348985672, + "learning_rate": 1.258560447282036e-05, + "loss": 1.5053, + "mean_token_accuracy": 0.6331639190514883, + "num_tokens": 1405601890.0, + "step": 8382 + }, + { + "entropy": 1.7066338161627452, + "epoch": 0.9209304880393288, + "grad_norm": 0.6691288352012634, + "learning_rate": 1.2584028363329172e-05, + "loss": 1.3199, + "mean_token_accuracy": 0.6606237838665644, + "num_tokens": 1405752844.0, + "step": 8383 + }, + { + "entropy": 1.713592956463496, + "epoch": 0.9210403449507018, + "grad_norm": 0.6735867857933044, + "learning_rate": 1.2582452203704196e-05, + "loss": 1.3706, + "mean_token_accuracy": 0.6515240619579951, + "num_tokens": 1405978966.0, + "step": 8384 + }, + { + "entropy": 1.6773227254549663, + "epoch": 0.9211502018620746, + "grad_norm": 0.6591025590896606, + "learning_rate": 1.2580875993995324e-05, + "loss": 1.3694, + "mean_token_accuracy": 0.665874645113945, + "num_tokens": 1406131263.0, + "step": 8385 + }, + { + "entropy": 1.7191874583562214, + "epoch": 0.9212600587734476, + "grad_norm": 0.6563531160354614, + "learning_rate": 1.2579299734252435e-05, + "loss": 1.6292, + "mean_token_accuracy": 0.6292891552050909, + "num_tokens": 1406323110.0, + "step": 8386 + }, + { + "entropy": 1.7289370795090993, + "epoch": 0.9213699156848205, + "grad_norm": 0.659126341342926, + "learning_rate": 1.2577723424525425e-05, + "loss": 1.2939, + "mean_token_accuracy": 0.6766804109017054, + "num_tokens": 1406466231.0, + "step": 8387 + }, + { + "entropy": 1.690926472345988, + "epoch": 0.9214797725961934, + "grad_norm": 0.7042950391769409, + "learning_rate": 1.2576147064864177e-05, + "loss": 1.4388, + "mean_token_accuracy": 0.6502556999524435, + "num_tokens": 1406626762.0, + "step": 8388 + }, + { + "entropy": 1.6028278172016144, + "epoch": 0.9215896295075664, + "grad_norm": 0.6516503691673279, + "learning_rate": 1.2574570655318586e-05, + "loss": 1.2367, + "mean_token_accuracy": 0.6695101857185364, + "num_tokens": 1406811606.0, + "step": 8389 + }, + { + "entropy": 1.642892986536026, + "epoch": 0.9216994864189393, + "grad_norm": 0.6159811615943909, + "learning_rate": 1.2572994195938543e-05, + "loss": 1.4283, + "mean_token_accuracy": 0.6557525595029196, + "num_tokens": 1406972657.0, + "step": 8390 + }, + { + "entropy": 1.7573666175206502, + "epoch": 0.9218093433303123, + "grad_norm": 0.6652973890304565, + "learning_rate": 1.2571417686773942e-05, + "loss": 1.354, + "mean_token_accuracy": 0.6559064388275146, + "num_tokens": 1407093499.0, + "step": 8391 + }, + { + "entropy": 1.7429068982601166, + "epoch": 0.9219192002416852, + "grad_norm": 0.7107627987861633, + "learning_rate": 1.256984112787468e-05, + "loss": 1.4118, + "mean_token_accuracy": 0.6419812937577566, + "num_tokens": 1407253457.0, + "step": 8392 + }, + { + "entropy": 1.7409753501415253, + "epoch": 0.9220290571530582, + "grad_norm": 0.7587690949440002, + "learning_rate": 1.2568264519290654e-05, + "loss": 1.3945, + "mean_token_accuracy": 0.65499414006869, + "num_tokens": 1407450974.0, + "step": 8393 + }, + { + "entropy": 1.7279663681983948, + "epoch": 0.922138914064431, + "grad_norm": 0.6342840194702148, + "learning_rate": 1.2566687861071762e-05, + "loss": 1.3915, + "mean_token_accuracy": 0.660114253560702, + "num_tokens": 1407606742.0, + "step": 8394 + }, + { + "entropy": 1.6986714601516724, + "epoch": 0.922248770975804, + "grad_norm": 0.6908047795295715, + "learning_rate": 1.2565111153267904e-05, + "loss": 1.5053, + "mean_token_accuracy": 0.6442679464817047, + "num_tokens": 1407769641.0, + "step": 8395 + }, + { + "entropy": 1.7808184325695038, + "epoch": 0.9223586278871769, + "grad_norm": 0.6698216199874878, + "learning_rate": 1.2563534395928987e-05, + "loss": 1.5286, + "mean_token_accuracy": 0.642250527938207, + "num_tokens": 1407933465.0, + "step": 8396 + }, + { + "entropy": 1.733227163553238, + "epoch": 0.9224684847985499, + "grad_norm": 0.7015756964683533, + "learning_rate": 1.2561957589104908e-05, + "loss": 1.2428, + "mean_token_accuracy": 0.6744599491357803, + "num_tokens": 1408037394.0, + "step": 8397 + }, + { + "entropy": 1.7421282827854156, + "epoch": 0.9225783417099228, + "grad_norm": 0.6361960768699646, + "learning_rate": 1.2560380732845577e-05, + "loss": 1.2879, + "mean_token_accuracy": 0.6739509999752045, + "num_tokens": 1408162511.0, + "step": 8398 + }, + { + "entropy": 1.7272718846797943, + "epoch": 0.9226881986212958, + "grad_norm": 0.8491674661636353, + "learning_rate": 1.2558803827200896e-05, + "loss": 1.4679, + "mean_token_accuracy": 0.6472969353199005, + "num_tokens": 1408337595.0, + "step": 8399 + }, + { + "entropy": 1.724097619454066, + "epoch": 0.9227980555326687, + "grad_norm": 0.6793266534805298, + "learning_rate": 1.255722687222078e-05, + "loss": 1.4947, + "mean_token_accuracy": 0.6494862536589304, + "num_tokens": 1408563108.0, + "step": 8400 + }, + { + "entropy": 1.734597235918045, + "epoch": 0.9229079124440417, + "grad_norm": 0.695158064365387, + "learning_rate": 1.2555649867955128e-05, + "loss": 1.4444, + "mean_token_accuracy": 0.6569420943657557, + "num_tokens": 1408721968.0, + "step": 8401 + }, + { + "entropy": 1.715299169222514, + "epoch": 0.9230177693554146, + "grad_norm": 0.7081811428070068, + "learning_rate": 1.2554072814453865e-05, + "loss": 1.4605, + "mean_token_accuracy": 0.6608149409294128, + "num_tokens": 1408873539.0, + "step": 8402 + }, + { + "entropy": 1.6958219408988953, + "epoch": 0.9231276262667875, + "grad_norm": 0.7010018825531006, + "learning_rate": 1.2552495711766897e-05, + "loss": 1.2639, + "mean_token_accuracy": 0.6718494196732839, + "num_tokens": 1409024445.0, + "step": 8403 + }, + { + "entropy": 1.6861001054445903, + "epoch": 0.9232374831781605, + "grad_norm": 0.8214607834815979, + "learning_rate": 1.2550918559944138e-05, + "loss": 1.3134, + "mean_token_accuracy": 0.6580562740564346, + "num_tokens": 1409185013.0, + "step": 8404 + }, + { + "entropy": 1.6605658928553264, + "epoch": 0.9233473400895333, + "grad_norm": 0.6449036598205566, + "learning_rate": 1.2549341359035507e-05, + "loss": 1.4082, + "mean_token_accuracy": 0.6663586348295212, + "num_tokens": 1409335022.0, + "step": 8405 + }, + { + "entropy": 1.763859748840332, + "epoch": 0.9234571970009063, + "grad_norm": 0.7659093141555786, + "learning_rate": 1.254776410909092e-05, + "loss": 1.3732, + "mean_token_accuracy": 0.6532298525174459, + "num_tokens": 1409453428.0, + "step": 8406 + }, + { + "entropy": 1.696772535641988, + "epoch": 0.9235670539122792, + "grad_norm": 0.7647340893745422, + "learning_rate": 1.2546186810160294e-05, + "loss": 1.3433, + "mean_token_accuracy": 0.667681892712911, + "num_tokens": 1409572864.0, + "step": 8407 + }, + { + "entropy": 1.7054913242657979, + "epoch": 0.9236769108236522, + "grad_norm": 0.6106885075569153, + "learning_rate": 1.2544609462293555e-05, + "loss": 1.4098, + "mean_token_accuracy": 0.6489782730738322, + "num_tokens": 1409739736.0, + "step": 8408 + }, + { + "entropy": 1.7002596755822499, + "epoch": 0.9237867677350251, + "grad_norm": 0.599602997303009, + "learning_rate": 1.2543032065540622e-05, + "loss": 1.343, + "mean_token_accuracy": 0.6687337110439936, + "num_tokens": 1409932692.0, + "step": 8409 + }, + { + "entropy": 1.6986660559972127, + "epoch": 0.9238966246463981, + "grad_norm": 0.6248365640640259, + "learning_rate": 1.2541454619951416e-05, + "loss": 1.391, + "mean_token_accuracy": 0.6515335639317831, + "num_tokens": 1410120104.0, + "step": 8410 + }, + { + "entropy": 1.686285485823949, + "epoch": 0.924006481557771, + "grad_norm": 0.7016609311103821, + "learning_rate": 1.253987712557587e-05, + "loss": 1.358, + "mean_token_accuracy": 0.6637587447961172, + "num_tokens": 1410281992.0, + "step": 8411 + }, + { + "entropy": 1.740788499514262, + "epoch": 0.924116338469144, + "grad_norm": 0.7246219515800476, + "learning_rate": 1.2538299582463906e-05, + "loss": 1.4015, + "mean_token_accuracy": 0.6567526757717133, + "num_tokens": 1410440576.0, + "step": 8412 + }, + { + "entropy": 1.7054814994335175, + "epoch": 0.9242261953805169, + "grad_norm": 0.7429696917533875, + "learning_rate": 1.253672199066545e-05, + "loss": 1.4615, + "mean_token_accuracy": 0.6404254684845606, + "num_tokens": 1410634470.0, + "step": 8413 + }, + { + "entropy": 1.6572390894095104, + "epoch": 0.9243360522918899, + "grad_norm": 0.713073194026947, + "learning_rate": 1.2535144350230441e-05, + "loss": 1.345, + "mean_token_accuracy": 0.6785429567098618, + "num_tokens": 1410752529.0, + "step": 8414 + }, + { + "entropy": 1.6576418578624725, + "epoch": 0.9244459092032628, + "grad_norm": 0.5704638361930847, + "learning_rate": 1.2533566661208803e-05, + "loss": 1.3617, + "mean_token_accuracy": 0.6624558568000793, + "num_tokens": 1410940518.0, + "step": 8415 + }, + { + "entropy": 1.6944686770439148, + "epoch": 0.9245557661146356, + "grad_norm": 0.7132385969161987, + "learning_rate": 1.2531988923650469e-05, + "loss": 1.3251, + "mean_token_accuracy": 0.6656810740629832, + "num_tokens": 1411091487.0, + "step": 8416 + }, + { + "entropy": 1.6068796714146931, + "epoch": 0.9246656230260086, + "grad_norm": 0.5596727132797241, + "learning_rate": 1.2530411137605376e-05, + "loss": 1.3988, + "mean_token_accuracy": 0.651663064956665, + "num_tokens": 1411349098.0, + "step": 8417 + }, + { + "entropy": 1.6926358838876088, + "epoch": 0.9247754799373815, + "grad_norm": 0.6509303450584412, + "learning_rate": 1.2528833303123464e-05, + "loss": 1.4295, + "mean_token_accuracy": 0.6421109537283579, + "num_tokens": 1411553106.0, + "step": 8418 + }, + { + "entropy": 1.6756529609362285, + "epoch": 0.9248853368487545, + "grad_norm": 0.7943989038467407, + "learning_rate": 1.2527255420254663e-05, + "loss": 1.4821, + "mean_token_accuracy": 0.6490869422753652, + "num_tokens": 1411745801.0, + "step": 8419 + }, + { + "entropy": 1.6821991205215454, + "epoch": 0.9249951937601274, + "grad_norm": 0.6343573331832886, + "learning_rate": 1.2525677489048919e-05, + "loss": 1.3901, + "mean_token_accuracy": 0.6549980839093527, + "num_tokens": 1411971218.0, + "step": 8420 + }, + { + "entropy": 1.7167380253473918, + "epoch": 0.9251050506715004, + "grad_norm": 0.6473302841186523, + "learning_rate": 1.252409950955617e-05, + "loss": 1.2781, + "mean_token_accuracy": 0.6656585186719894, + "num_tokens": 1412093664.0, + "step": 8421 + }, + { + "entropy": 1.6299481391906738, + "epoch": 0.9252149075828733, + "grad_norm": 0.5957537293434143, + "learning_rate": 1.2522521481826355e-05, + "loss": 1.5059, + "mean_token_accuracy": 0.6492985685666403, + "num_tokens": 1412312361.0, + "step": 8422 + }, + { + "entropy": 1.6707092622915904, + "epoch": 0.9253247644942463, + "grad_norm": 0.6398028135299683, + "learning_rate": 1.2520943405909423e-05, + "loss": 1.3445, + "mean_token_accuracy": 0.6681879659493765, + "num_tokens": 1412486757.0, + "step": 8423 + }, + { + "entropy": 1.6841243704160054, + "epoch": 0.9254346214056192, + "grad_norm": 0.6823419332504272, + "learning_rate": 1.251936528185532e-05, + "loss": 1.4074, + "mean_token_accuracy": 0.6703370014826456, + "num_tokens": 1412624400.0, + "step": 8424 + }, + { + "entropy": 1.7717651029427846, + "epoch": 0.9255444783169922, + "grad_norm": 0.699639618396759, + "learning_rate": 1.2517787109713986e-05, + "loss": 1.4845, + "mean_token_accuracy": 0.6359593768914541, + "num_tokens": 1412807209.0, + "step": 8425 + }, + { + "entropy": 1.7056340873241425, + "epoch": 0.925654335228365, + "grad_norm": 0.7243174314498901, + "learning_rate": 1.2516208889535377e-05, + "loss": 1.4097, + "mean_token_accuracy": 0.668630967537562, + "num_tokens": 1412947855.0, + "step": 8426 + }, + { + "entropy": 1.73410764336586, + "epoch": 0.925764192139738, + "grad_norm": 0.764216959476471, + "learning_rate": 1.2514630621369437e-05, + "loss": 1.368, + "mean_token_accuracy": 0.670150876045227, + "num_tokens": 1413110693.0, + "step": 8427 + }, + { + "entropy": 1.7153330743312836, + "epoch": 0.9258740490511109, + "grad_norm": 0.6585131287574768, + "learning_rate": 1.2513052305266123e-05, + "loss": 1.2796, + "mean_token_accuracy": 0.6731938471396764, + "num_tokens": 1413242626.0, + "step": 8428 + }, + { + "entropy": 1.7023490965366364, + "epoch": 0.9259839059624838, + "grad_norm": 0.6882517337799072, + "learning_rate": 1.2511473941275385e-05, + "loss": 1.2992, + "mean_token_accuracy": 0.6625126004219055, + "num_tokens": 1413400164.0, + "step": 8429 + }, + { + "entropy": 1.705989311138789, + "epoch": 0.9260937628738568, + "grad_norm": 0.8120949864387512, + "learning_rate": 1.2509895529447178e-05, + "loss": 1.3778, + "mean_token_accuracy": 0.6520531823237737, + "num_tokens": 1413522941.0, + "step": 8430 + }, + { + "entropy": 1.7116830845673878, + "epoch": 0.9262036197852297, + "grad_norm": 0.7705047130584717, + "learning_rate": 1.250831706983146e-05, + "loss": 1.2424, + "mean_token_accuracy": 0.681115910410881, + "num_tokens": 1413645220.0, + "step": 8431 + }, + { + "entropy": 1.7691873808701832, + "epoch": 0.9263134766966027, + "grad_norm": 0.7861184477806091, + "learning_rate": 1.250673856247818e-05, + "loss": 1.4448, + "mean_token_accuracy": 0.6577897220849991, + "num_tokens": 1413789562.0, + "step": 8432 + }, + { + "entropy": 1.7770712574323018, + "epoch": 0.9264233336079756, + "grad_norm": 0.6887810826301575, + "learning_rate": 1.2505160007437309e-05, + "loss": 1.4101, + "mean_token_accuracy": 0.6699302395184835, + "num_tokens": 1413947460.0, + "step": 8433 + }, + { + "entropy": 1.6620681683222454, + "epoch": 0.9265331905193486, + "grad_norm": 0.6765469908714294, + "learning_rate": 1.25035814047588e-05, + "loss": 1.382, + "mean_token_accuracy": 0.6652428507804871, + "num_tokens": 1414106635.0, + "step": 8434 + }, + { + "entropy": 1.6912939846515656, + "epoch": 0.9266430474307215, + "grad_norm": 0.6659488081932068, + "learning_rate": 1.2502002754492614e-05, + "loss": 1.3485, + "mean_token_accuracy": 0.6623717993497849, + "num_tokens": 1414283009.0, + "step": 8435 + }, + { + "entropy": 1.7182398637135823, + "epoch": 0.9267529043420945, + "grad_norm": 0.6735560894012451, + "learning_rate": 1.2500424056688722e-05, + "loss": 1.3758, + "mean_token_accuracy": 0.6657535483439764, + "num_tokens": 1414423522.0, + "step": 8436 + }, + { + "entropy": 1.6874533692995708, + "epoch": 0.9268627612534673, + "grad_norm": 0.5966618061065674, + "learning_rate": 1.2498845311397083e-05, + "loss": 1.5429, + "mean_token_accuracy": 0.6355781530340513, + "num_tokens": 1414625227.0, + "step": 8437 + }, + { + "entropy": 1.6916421949863434, + "epoch": 0.9269726181648403, + "grad_norm": 0.6825304627418518, + "learning_rate": 1.2497266518667667e-05, + "loss": 1.4306, + "mean_token_accuracy": 0.6556826333204905, + "num_tokens": 1414808666.0, + "step": 8438 + }, + { + "entropy": 1.7351886530717213, + "epoch": 0.9270824750762132, + "grad_norm": 0.8129728436470032, + "learning_rate": 1.249568767855044e-05, + "loss": 1.3367, + "mean_token_accuracy": 0.6633518934249878, + "num_tokens": 1414917866.0, + "step": 8439 + }, + { + "entropy": 1.6401391724745433, + "epoch": 0.9271923319875862, + "grad_norm": 0.7099363207817078, + "learning_rate": 1.2494108791095372e-05, + "loss": 1.3524, + "mean_token_accuracy": 0.6650873670975367, + "num_tokens": 1415083152.0, + "step": 8440 + }, + { + "entropy": 1.69183216492335, + "epoch": 0.9273021888989591, + "grad_norm": 0.637658953666687, + "learning_rate": 1.2492529856352431e-05, + "loss": 1.436, + "mean_token_accuracy": 0.6460116654634476, + "num_tokens": 1415302375.0, + "step": 8441 + }, + { + "entropy": 1.6242200930913289, + "epoch": 0.9274120458103321, + "grad_norm": 0.6604406237602234, + "learning_rate": 1.2490950874371594e-05, + "loss": 1.2643, + "mean_token_accuracy": 0.6734424084424973, + "num_tokens": 1415474824.0, + "step": 8442 + }, + { + "entropy": 1.6695034007231395, + "epoch": 0.927521902721705, + "grad_norm": 0.7424845695495605, + "learning_rate": 1.2489371845202836e-05, + "loss": 1.3691, + "mean_token_accuracy": 0.6602280388275782, + "num_tokens": 1415661133.0, + "step": 8443 + }, + { + "entropy": 1.7420236865679424, + "epoch": 0.9276317596330779, + "grad_norm": 0.8569214940071106, + "learning_rate": 1.2487792768896127e-05, + "loss": 1.5441, + "mean_token_accuracy": 0.6413151572148005, + "num_tokens": 1415805898.0, + "step": 8444 + }, + { + "entropy": 1.686678260564804, + "epoch": 0.9277416165444509, + "grad_norm": 0.760896623134613, + "learning_rate": 1.248621364550145e-05, + "loss": 1.3362, + "mean_token_accuracy": 0.667745237549146, + "num_tokens": 1415925173.0, + "step": 8445 + }, + { + "entropy": 1.694368968407313, + "epoch": 0.9278514734558237, + "grad_norm": 0.8224613666534424, + "learning_rate": 1.2484634475068781e-05, + "loss": 1.4879, + "mean_token_accuracy": 0.6645511214931806, + "num_tokens": 1416074408.0, + "step": 8446 + }, + { + "entropy": 1.6848430434862773, + "epoch": 0.9279613303671967, + "grad_norm": 0.6078700423240662, + "learning_rate": 1.2483055257648098e-05, + "loss": 1.3897, + "mean_token_accuracy": 0.6586156040430069, + "num_tokens": 1416257726.0, + "step": 8447 + }, + { + "entropy": 1.6660497585932414, + "epoch": 0.9280711872785696, + "grad_norm": 0.7581548094749451, + "learning_rate": 1.2481475993289385e-05, + "loss": 1.4777, + "mean_token_accuracy": 0.6510738035043081, + "num_tokens": 1416421922.0, + "step": 8448 + }, + { + "entropy": 1.7178461253643036, + "epoch": 0.9281810441899426, + "grad_norm": 0.706211507320404, + "learning_rate": 1.2479896682042625e-05, + "loss": 1.3219, + "mean_token_accuracy": 0.6680372059345245, + "num_tokens": 1416593459.0, + "step": 8449 + }, + { + "entropy": 1.684712787469228, + "epoch": 0.9282909011013155, + "grad_norm": 0.6404665112495422, + "learning_rate": 1.24783173239578e-05, + "loss": 1.3882, + "mean_token_accuracy": 0.6511215766270956, + "num_tokens": 1416740179.0, + "step": 8450 + }, + { + "entropy": 1.6592655877272289, + "epoch": 0.9284007580126885, + "grad_norm": 0.6992117762565613, + "learning_rate": 1.2476737919084898e-05, + "loss": 1.3854, + "mean_token_accuracy": 0.653921420375506, + "num_tokens": 1416916822.0, + "step": 8451 + }, + { + "entropy": 1.698452393213908, + "epoch": 0.9285106149240614, + "grad_norm": 0.6352612972259521, + "learning_rate": 1.2475158467473911e-05, + "loss": 1.4618, + "mean_token_accuracy": 0.6499315698941549, + "num_tokens": 1417107907.0, + "step": 8452 + }, + { + "entropy": 1.7192625999450684, + "epoch": 0.9286204718354344, + "grad_norm": 0.7419793605804443, + "learning_rate": 1.2473578969174817e-05, + "loss": 1.478, + "mean_token_accuracy": 0.6512725353240967, + "num_tokens": 1417291830.0, + "step": 8453 + }, + { + "entropy": 1.684527148803075, + "epoch": 0.9287303287468073, + "grad_norm": 0.6986659169197083, + "learning_rate": 1.2471999424237615e-05, + "loss": 1.3782, + "mean_token_accuracy": 0.658308207988739, + "num_tokens": 1417446239.0, + "step": 8454 + }, + { + "entropy": 1.6121302247047424, + "epoch": 0.9288401856581803, + "grad_norm": 0.7182373404502869, + "learning_rate": 1.2470419832712295e-05, + "loss": 1.3144, + "mean_token_accuracy": 0.6786510099967321, + "num_tokens": 1417581558.0, + "step": 8455 + }, + { + "entropy": 1.6011373102664948, + "epoch": 0.9289500425695532, + "grad_norm": 0.7618659138679504, + "learning_rate": 1.246884019464885e-05, + "loss": 1.2432, + "mean_token_accuracy": 0.6746308306852976, + "num_tokens": 1417763013.0, + "step": 8456 + }, + { + "entropy": 1.662847876548767, + "epoch": 0.929059899480926, + "grad_norm": 0.6191786527633667, + "learning_rate": 1.2467260510097275e-05, + "loss": 1.4174, + "mean_token_accuracy": 0.6469977349042892, + "num_tokens": 1418005665.0, + "step": 8457 + }, + { + "entropy": 1.713506688674291, + "epoch": 0.929169756392299, + "grad_norm": 0.7612412571907043, + "learning_rate": 1.2465680779107564e-05, + "loss": 1.5276, + "mean_token_accuracy": 0.6622031579415003, + "num_tokens": 1418151137.0, + "step": 8458 + }, + { + "entropy": 1.7254696488380432, + "epoch": 0.9292796133036719, + "grad_norm": 0.6934106945991516, + "learning_rate": 1.246410100172972e-05, + "loss": 1.3767, + "mean_token_accuracy": 0.6779943505922953, + "num_tokens": 1418282753.0, + "step": 8459 + }, + { + "entropy": 1.7169914940992992, + "epoch": 0.9293894702150449, + "grad_norm": 0.6493316292762756, + "learning_rate": 1.2462521178013736e-05, + "loss": 1.3592, + "mean_token_accuracy": 0.6591555128494898, + "num_tokens": 1418439711.0, + "step": 8460 + }, + { + "entropy": 1.6799816985925038, + "epoch": 0.9294993271264178, + "grad_norm": 0.8348252773284912, + "learning_rate": 1.2460941308009615e-05, + "loss": 1.2885, + "mean_token_accuracy": 0.6671053916215897, + "num_tokens": 1418565133.0, + "step": 8461 + }, + { + "entropy": 1.6939114530881245, + "epoch": 0.9296091840377908, + "grad_norm": 0.8337905406951904, + "learning_rate": 1.2459361391767366e-05, + "loss": 1.5956, + "mean_token_accuracy": 0.6368262519439062, + "num_tokens": 1418758270.0, + "step": 8462 + }, + { + "entropy": 1.6878935396671295, + "epoch": 0.9297190409491637, + "grad_norm": 0.7053253054618835, + "learning_rate": 1.245778142933698e-05, + "loss": 1.3431, + "mean_token_accuracy": 0.6594057977199554, + "num_tokens": 1418943386.0, + "step": 8463 + }, + { + "entropy": 1.7375274399916332, + "epoch": 0.9298288978605367, + "grad_norm": 0.7320691347122192, + "learning_rate": 1.2456201420768472e-05, + "loss": 1.4361, + "mean_token_accuracy": 0.6541687101125717, + "num_tokens": 1419098061.0, + "step": 8464 + }, + { + "entropy": 1.6313360234101613, + "epoch": 0.9299387547719096, + "grad_norm": 0.5476987957954407, + "learning_rate": 1.2454621366111843e-05, + "loss": 1.436, + "mean_token_accuracy": 0.6428209195534388, + "num_tokens": 1419319095.0, + "step": 8465 + }, + { + "entropy": 1.713592936595281, + "epoch": 0.9300486116832826, + "grad_norm": 0.7491887807846069, + "learning_rate": 1.2453041265417105e-05, + "loss": 1.4515, + "mean_token_accuracy": 0.6389258007208506, + "num_tokens": 1419463995.0, + "step": 8466 + }, + { + "entropy": 1.6619866987069447, + "epoch": 0.9301584685946555, + "grad_norm": 0.5975055694580078, + "learning_rate": 1.2451461118734267e-05, + "loss": 1.4631, + "mean_token_accuracy": 0.6633708626031876, + "num_tokens": 1419650890.0, + "step": 8467 + }, + { + "entropy": 1.6842903196811676, + "epoch": 0.9302683255060284, + "grad_norm": 0.7102713584899902, + "learning_rate": 1.2449880926113339e-05, + "loss": 1.552, + "mean_token_accuracy": 0.6366306593020757, + "num_tokens": 1419817492.0, + "step": 8468 + }, + { + "entropy": 1.6764464179674785, + "epoch": 0.9303781824174013, + "grad_norm": 0.648377001285553, + "learning_rate": 1.2448300687604327e-05, + "loss": 1.5442, + "mean_token_accuracy": 0.6397795329491297, + "num_tokens": 1420015035.0, + "step": 8469 + }, + { + "entropy": 1.736095021168391, + "epoch": 0.9304880393287742, + "grad_norm": 0.7206259965896606, + "learning_rate": 1.2446720403257255e-05, + "loss": 1.4128, + "mean_token_accuracy": 0.6519557138284048, + "num_tokens": 1420202744.0, + "step": 8470 + }, + { + "entropy": 1.671694815158844, + "epoch": 0.9305978962401472, + "grad_norm": 0.7011840343475342, + "learning_rate": 1.2445140073122135e-05, + "loss": 1.452, + "mean_token_accuracy": 0.6552790006001791, + "num_tokens": 1420346561.0, + "step": 8471 + }, + { + "entropy": 1.649581750233968, + "epoch": 0.9307077531515201, + "grad_norm": 0.7680811285972595, + "learning_rate": 1.244355969724898e-05, + "loss": 1.2785, + "mean_token_accuracy": 0.6835348854462305, + "num_tokens": 1420520670.0, + "step": 8472 + }, + { + "entropy": 1.6625533699989319, + "epoch": 0.9308176100628931, + "grad_norm": 0.7054336667060852, + "learning_rate": 1.2441979275687813e-05, + "loss": 1.3931, + "mean_token_accuracy": 0.6612143218517303, + "num_tokens": 1420685237.0, + "step": 8473 + }, + { + "entropy": 1.692489633957545, + "epoch": 0.930927466974266, + "grad_norm": 0.6488029360771179, + "learning_rate": 1.2440398808488654e-05, + "loss": 1.3779, + "mean_token_accuracy": 0.679279754559199, + "num_tokens": 1420901387.0, + "step": 8474 + }, + { + "entropy": 1.7313298384348552, + "epoch": 0.931037323885639, + "grad_norm": 0.7841524481773376, + "learning_rate": 1.2438818295701515e-05, + "loss": 1.3249, + "mean_token_accuracy": 0.6554968257745107, + "num_tokens": 1421099246.0, + "step": 8475 + }, + { + "entropy": 1.715612788995107, + "epoch": 0.9311471807970119, + "grad_norm": 0.7147516012191772, + "learning_rate": 1.2437237737376431e-05, + "loss": 1.3202, + "mean_token_accuracy": 0.6654301732778549, + "num_tokens": 1421270600.0, + "step": 8476 + }, + { + "entropy": 1.6151216328144073, + "epoch": 0.9312570377083849, + "grad_norm": 0.6839093565940857, + "learning_rate": 1.2435657133563419e-05, + "loss": 1.3194, + "mean_token_accuracy": 0.6828833172718684, + "num_tokens": 1421443338.0, + "step": 8477 + }, + { + "entropy": 1.681669036547343, + "epoch": 0.9313668946197577, + "grad_norm": 0.6480600237846375, + "learning_rate": 1.2434076484312507e-05, + "loss": 1.3436, + "mean_token_accuracy": 0.6785709311564764, + "num_tokens": 1421703636.0, + "step": 8478 + }, + { + "entropy": 1.7277530829111736, + "epoch": 0.9314767515311307, + "grad_norm": 0.7484097480773926, + "learning_rate": 1.2432495789673717e-05, + "loss": 1.3296, + "mean_token_accuracy": 0.669383779168129, + "num_tokens": 1421860755.0, + "step": 8479 + }, + { + "entropy": 1.694298009077708, + "epoch": 0.9315866084425036, + "grad_norm": 0.6591439247131348, + "learning_rate": 1.2430915049697086e-05, + "loss": 1.3729, + "mean_token_accuracy": 0.6702051361401876, + "num_tokens": 1422006872.0, + "step": 8480 + }, + { + "entropy": 1.654652992884318, + "epoch": 0.9316964653538766, + "grad_norm": 0.6514762043952942, + "learning_rate": 1.2429334264432632e-05, + "loss": 1.2161, + "mean_token_accuracy": 0.6821036289135615, + "num_tokens": 1422135857.0, + "step": 8481 + }, + { + "entropy": 1.7302643954753876, + "epoch": 0.9318063222652495, + "grad_norm": 0.6303740739822388, + "learning_rate": 1.2427753433930398e-05, + "loss": 1.3769, + "mean_token_accuracy": 0.6573646614948908, + "num_tokens": 1422283990.0, + "step": 8482 + }, + { + "entropy": 1.7143929402033489, + "epoch": 0.9319161791766224, + "grad_norm": 0.7011592984199524, + "learning_rate": 1.2426172558240408e-05, + "loss": 1.6449, + "mean_token_accuracy": 0.626821535329024, + "num_tokens": 1422527659.0, + "step": 8483 + }, + { + "entropy": 1.6688818732897441, + "epoch": 0.9320260360879954, + "grad_norm": 0.707831621170044, + "learning_rate": 1.24245916374127e-05, + "loss": 1.4702, + "mean_token_accuracy": 0.6559558510780334, + "num_tokens": 1422709784.0, + "step": 8484 + }, + { + "entropy": 1.7655527591705322, + "epoch": 0.9321358929993683, + "grad_norm": 0.7450686693191528, + "learning_rate": 1.2423010671497309e-05, + "loss": 1.363, + "mean_token_accuracy": 0.6644783665736517, + "num_tokens": 1422827575.0, + "step": 8485 + }, + { + "entropy": 1.7146152754624684, + "epoch": 0.9322457499107413, + "grad_norm": 0.7006492614746094, + "learning_rate": 1.2421429660544274e-05, + "loss": 1.2541, + "mean_token_accuracy": 0.6772527098655701, + "num_tokens": 1422967424.0, + "step": 8486 + }, + { + "entropy": 1.6500997145970662, + "epoch": 0.9323556068221142, + "grad_norm": 152.7211151123047, + "learning_rate": 1.2419848604603624e-05, + "loss": 1.3616, + "mean_token_accuracy": 0.6649878074725469, + "num_tokens": 1423124348.0, + "step": 8487 + }, + { + "entropy": 1.6900402406851451, + "epoch": 0.9324654637334872, + "grad_norm": 0.6380817294120789, + "learning_rate": 1.2418267503725409e-05, + "loss": 1.5177, + "mean_token_accuracy": 0.6421713878711065, + "num_tokens": 1423284404.0, + "step": 8488 + }, + { + "entropy": 1.6533196369806926, + "epoch": 0.93257532064486, + "grad_norm": 0.9251922965049744, + "learning_rate": 1.2416686357959668e-05, + "loss": 1.1989, + "mean_token_accuracy": 0.6831783403952917, + "num_tokens": 1423413353.0, + "step": 8489 + }, + { + "entropy": 1.6796255509058635, + "epoch": 0.932685177556233, + "grad_norm": 0.6227523684501648, + "learning_rate": 1.2415105167356442e-05, + "loss": 1.4729, + "mean_token_accuracy": 0.6428764114777247, + "num_tokens": 1423591192.0, + "step": 8490 + }, + { + "entropy": 1.7086295584837596, + "epoch": 0.9327950344676059, + "grad_norm": 0.6465691924095154, + "learning_rate": 1.2413523931965775e-05, + "loss": 1.4636, + "mean_token_accuracy": 0.6566512435674667, + "num_tokens": 1423756354.0, + "step": 8491 + }, + { + "entropy": 1.7025008797645569, + "epoch": 0.9329048913789789, + "grad_norm": 0.6250014901161194, + "learning_rate": 1.2411942651837712e-05, + "loss": 1.3253, + "mean_token_accuracy": 0.6653555085261663, + "num_tokens": 1423920837.0, + "step": 8492 + }, + { + "entropy": 1.7105981409549713, + "epoch": 0.9330147482903518, + "grad_norm": 0.7055547833442688, + "learning_rate": 1.24103613270223e-05, + "loss": 1.4082, + "mean_token_accuracy": 0.6560359050830206, + "num_tokens": 1424106336.0, + "step": 8493 + }, + { + "entropy": 1.7495214740435283, + "epoch": 0.9331246052017248, + "grad_norm": 0.7928077578544617, + "learning_rate": 1.2408779957569586e-05, + "loss": 1.3455, + "mean_token_accuracy": 0.6681791841983795, + "num_tokens": 1424236467.0, + "step": 8494 + }, + { + "entropy": 1.6718592544396718, + "epoch": 0.9332344621130977, + "grad_norm": 1.34169602394104, + "learning_rate": 1.2407198543529624e-05, + "loss": 1.4863, + "mean_token_accuracy": 0.6318272079030672, + "num_tokens": 1424400906.0, + "step": 8495 + }, + { + "entropy": 1.8002726435661316, + "epoch": 0.9333443190244707, + "grad_norm": 0.8875370025634766, + "learning_rate": 1.2405617084952461e-05, + "loss": 1.4144, + "mean_token_accuracy": 0.6550916383663813, + "num_tokens": 1424538191.0, + "step": 8496 + }, + { + "entropy": 1.6871886054674785, + "epoch": 0.9334541759358436, + "grad_norm": 0.6395008563995361, + "learning_rate": 1.2404035581888149e-05, + "loss": 1.4748, + "mean_token_accuracy": 0.6380327840646108, + "num_tokens": 1424765006.0, + "step": 8497 + }, + { + "entropy": 1.750147004922231, + "epoch": 0.9335640328472165, + "grad_norm": 0.7137428522109985, + "learning_rate": 1.2402454034386747e-05, + "loss": 1.485, + "mean_token_accuracy": 0.6527098168929418, + "num_tokens": 1424918563.0, + "step": 8498 + }, + { + "entropy": 1.7322723865509033, + "epoch": 0.9336738897585894, + "grad_norm": 0.7444048523902893, + "learning_rate": 1.2400872442498306e-05, + "loss": 1.4352, + "mean_token_accuracy": 0.6530717114607493, + "num_tokens": 1425078392.0, + "step": 8499 + }, + { + "entropy": 1.7369141379992168, + "epoch": 0.9337837466699623, + "grad_norm": 0.6727978587150574, + "learning_rate": 1.239929080627288e-05, + "loss": 1.4268, + "mean_token_accuracy": 0.654167448480924, + "num_tokens": 1425227908.0, + "step": 8500 + }, + { + "entropy": 1.7379600306351979, + "epoch": 0.9338936035813353, + "grad_norm": 0.8184369802474976, + "learning_rate": 1.2397709125760533e-05, + "loss": 1.3786, + "mean_token_accuracy": 0.6590213775634766, + "num_tokens": 1425352745.0, + "step": 8501 + }, + { + "entropy": 1.674992948770523, + "epoch": 0.9340034604927082, + "grad_norm": 0.6965845823287964, + "learning_rate": 1.2396127401011324e-05, + "loss": 1.2521, + "mean_token_accuracy": 0.6741303652524948, + "num_tokens": 1425458702.0, + "step": 8502 + }, + { + "entropy": 1.7390521963437398, + "epoch": 0.9341133174040812, + "grad_norm": 0.6826538443565369, + "learning_rate": 1.2394545632075305e-05, + "loss": 1.2356, + "mean_token_accuracy": 0.6743427018324534, + "num_tokens": 1425560277.0, + "step": 8503 + }, + { + "entropy": 1.7002219259738922, + "epoch": 0.9342231743154541, + "grad_norm": 0.5916033387184143, + "learning_rate": 1.2392963819002555e-05, + "loss": 1.3097, + "mean_token_accuracy": 0.6765924940506617, + "num_tokens": 1425736774.0, + "step": 8504 + }, + { + "entropy": 1.7539168000221252, + "epoch": 0.9343330312268271, + "grad_norm": 0.7708821296691895, + "learning_rate": 1.2391381961843121e-05, + "loss": 1.2834, + "mean_token_accuracy": 0.6643802175919215, + "num_tokens": 1425859969.0, + "step": 8505 + }, + { + "entropy": 1.7002765933672588, + "epoch": 0.9344428881382, + "grad_norm": 0.8281891345977783, + "learning_rate": 1.2389800060647077e-05, + "loss": 1.3764, + "mean_token_accuracy": 0.6619481245676676, + "num_tokens": 1425999977.0, + "step": 8506 + }, + { + "entropy": 1.7185891668001811, + "epoch": 0.934552745049573, + "grad_norm": 0.8112635612487793, + "learning_rate": 1.2388218115464486e-05, + "loss": 1.4361, + "mean_token_accuracy": 0.6457992345094681, + "num_tokens": 1426174270.0, + "step": 8507 + }, + { + "entropy": 1.7229107817014058, + "epoch": 0.9346626019609459, + "grad_norm": 0.6610147953033447, + "learning_rate": 1.238663612634542e-05, + "loss": 1.415, + "mean_token_accuracy": 0.641492078701655, + "num_tokens": 1426351638.0, + "step": 8508 + }, + { + "entropy": 1.7340005536874135, + "epoch": 0.9347724588723189, + "grad_norm": 0.729792058467865, + "learning_rate": 1.2385054093339941e-05, + "loss": 1.2905, + "mean_token_accuracy": 0.6576346158981323, + "num_tokens": 1426491965.0, + "step": 8509 + }, + { + "entropy": 1.703254113594691, + "epoch": 0.9348823157836917, + "grad_norm": 0.7107172012329102, + "learning_rate": 1.2383472016498128e-05, + "loss": 1.423, + "mean_token_accuracy": 0.6514757623275121, + "num_tokens": 1426659259.0, + "step": 8510 + }, + { + "entropy": 1.652273913224538, + "epoch": 0.9349921726950646, + "grad_norm": 0.6343944072723389, + "learning_rate": 1.2381889895870047e-05, + "loss": 1.4148, + "mean_token_accuracy": 0.6528402169545492, + "num_tokens": 1426855817.0, + "step": 8511 + }, + { + "entropy": 1.6305622259775798, + "epoch": 0.9351020296064376, + "grad_norm": 0.7010462880134583, + "learning_rate": 1.2380307731505774e-05, + "loss": 1.415, + "mean_token_accuracy": 0.6715733309586843, + "num_tokens": 1427018351.0, + "step": 8512 + }, + { + "entropy": 1.698022296031316, + "epoch": 0.9352118865178105, + "grad_norm": 0.7755676507949829, + "learning_rate": 1.2378725523455385e-05, + "loss": 1.4059, + "mean_token_accuracy": 0.6574622690677643, + "num_tokens": 1427181253.0, + "step": 8513 + }, + { + "entropy": 1.6600361963113148, + "epoch": 0.9353217434291835, + "grad_norm": 0.6646712422370911, + "learning_rate": 1.2377143271768952e-05, + "loss": 1.2889, + "mean_token_accuracy": 0.6721018751462301, + "num_tokens": 1427336298.0, + "step": 8514 + }, + { + "entropy": 1.7082114418347676, + "epoch": 0.9354316003405564, + "grad_norm": 0.681930422782898, + "learning_rate": 1.2375560976496552e-05, + "loss": 1.376, + "mean_token_accuracy": 0.653716524442037, + "num_tokens": 1427502299.0, + "step": 8515 + }, + { + "entropy": 1.7119992474714916, + "epoch": 0.9355414572519294, + "grad_norm": 0.6305302977561951, + "learning_rate": 1.2373978637688273e-05, + "loss": 1.3506, + "mean_token_accuracy": 0.6574372202157974, + "num_tokens": 1427667365.0, + "step": 8516 + }, + { + "entropy": 1.6821909447511036, + "epoch": 0.9356513141633023, + "grad_norm": 0.6896127462387085, + "learning_rate": 1.2372396255394187e-05, + "loss": 1.4327, + "mean_token_accuracy": 0.6596807638804117, + "num_tokens": 1427798152.0, + "step": 8517 + }, + { + "entropy": 1.6889863014221191, + "epoch": 0.9357611710746753, + "grad_norm": 0.6460300087928772, + "learning_rate": 1.2370813829664378e-05, + "loss": 1.3724, + "mean_token_accuracy": 0.6577446510394415, + "num_tokens": 1427962124.0, + "step": 8518 + }, + { + "entropy": 1.7353461384773254, + "epoch": 0.9358710279860482, + "grad_norm": 0.7331773638725281, + "learning_rate": 1.236923136054893e-05, + "loss": 1.3649, + "mean_token_accuracy": 0.6644917080799738, + "num_tokens": 1428118360.0, + "step": 8519 + }, + { + "entropy": 1.698060820500056, + "epoch": 0.9359808848974212, + "grad_norm": 0.6143467426300049, + "learning_rate": 1.2367648848097926e-05, + "loss": 1.3148, + "mean_token_accuracy": 0.6635189453760783, + "num_tokens": 1428275558.0, + "step": 8520 + }, + { + "entropy": 1.7490674356619518, + "epoch": 0.936090741808794, + "grad_norm": 0.7750940918922424, + "learning_rate": 1.2366066292361452e-05, + "loss": 1.6152, + "mean_token_accuracy": 0.6309548219045004, + "num_tokens": 1428429123.0, + "step": 8521 + }, + { + "entropy": 1.7576595544815063, + "epoch": 0.936200598720167, + "grad_norm": 0.7605588436126709, + "learning_rate": 1.2364483693389595e-05, + "loss": 1.3305, + "mean_token_accuracy": 0.660569633046786, + "num_tokens": 1428583904.0, + "step": 8522 + }, + { + "entropy": 1.6813994944095612, + "epoch": 0.9363104556315399, + "grad_norm": 0.6593738794326782, + "learning_rate": 1.2362901051232443e-05, + "loss": 1.424, + "mean_token_accuracy": 0.6580062558253607, + "num_tokens": 1428758635.0, + "step": 8523 + }, + { + "entropy": 1.6494923929373424, + "epoch": 0.9364203125429128, + "grad_norm": 0.6516050696372986, + "learning_rate": 1.236131836594009e-05, + "loss": 1.2905, + "mean_token_accuracy": 0.6730307787656784, + "num_tokens": 1428897399.0, + "step": 8524 + }, + { + "entropy": 1.626155565182368, + "epoch": 0.9365301694542858, + "grad_norm": 0.7014347314834595, + "learning_rate": 1.235973563756262e-05, + "loss": 1.2858, + "mean_token_accuracy": 0.6724262833595276, + "num_tokens": 1429039444.0, + "step": 8525 + }, + { + "entropy": 1.709865580002467, + "epoch": 0.9366400263656587, + "grad_norm": 0.61009281873703, + "learning_rate": 1.2358152866150132e-05, + "loss": 1.3338, + "mean_token_accuracy": 0.6688801348209381, + "num_tokens": 1429165058.0, + "step": 8526 + }, + { + "entropy": 1.6634798149267833, + "epoch": 0.9367498832770317, + "grad_norm": 0.7199569344520569, + "learning_rate": 1.235657005175272e-05, + "loss": 1.4218, + "mean_token_accuracy": 0.6555696477492651, + "num_tokens": 1429330762.0, + "step": 8527 + }, + { + "entropy": 1.7542400260766347, + "epoch": 0.9368597401884046, + "grad_norm": 0.6616681814193726, + "learning_rate": 1.235498719442047e-05, + "loss": 1.5701, + "mean_token_accuracy": 0.6374204456806183, + "num_tokens": 1429540856.0, + "step": 8528 + }, + { + "entropy": 1.7321417133013408, + "epoch": 0.9369695970997776, + "grad_norm": 0.9613889455795288, + "learning_rate": 1.2353404294203493e-05, + "loss": 1.4868, + "mean_token_accuracy": 0.6607649475336075, + "num_tokens": 1429656849.0, + "step": 8529 + }, + { + "entropy": 1.694995254278183, + "epoch": 0.9370794540111504, + "grad_norm": 0.6711973547935486, + "learning_rate": 1.2351821351151877e-05, + "loss": 1.4466, + "mean_token_accuracy": 0.6372744043668112, + "num_tokens": 1429878375.0, + "step": 8530 + }, + { + "entropy": 1.605004479487737, + "epoch": 0.9371893109225234, + "grad_norm": 0.7244718074798584, + "learning_rate": 1.2350238365315725e-05, + "loss": 1.1491, + "mean_token_accuracy": 0.6727927128473917, + "num_tokens": 1430059211.0, + "step": 8531 + }, + { + "entropy": 1.6997300287087758, + "epoch": 0.9372991678338963, + "grad_norm": 0.669244110584259, + "learning_rate": 1.2348655336745139e-05, + "loss": 1.597, + "mean_token_accuracy": 0.6324973752101263, + "num_tokens": 1430300402.0, + "step": 8532 + }, + { + "entropy": 1.7309175928433735, + "epoch": 0.9374090247452693, + "grad_norm": 0.6466419696807861, + "learning_rate": 1.2347072265490217e-05, + "loss": 1.3845, + "mean_token_accuracy": 0.6553378701210022, + "num_tokens": 1430441031.0, + "step": 8533 + }, + { + "entropy": 1.71918390194575, + "epoch": 0.9375188816566422, + "grad_norm": 0.7030944228172302, + "learning_rate": 1.2345489151601065e-05, + "loss": 1.5114, + "mean_token_accuracy": 0.6413251161575317, + "num_tokens": 1430640995.0, + "step": 8534 + }, + { + "entropy": 1.735251506169637, + "epoch": 0.9376287385680152, + "grad_norm": 0.619408369064331, + "learning_rate": 1.2343905995127787e-05, + "loss": 1.4836, + "mean_token_accuracy": 0.6480836818615595, + "num_tokens": 1430801643.0, + "step": 8535 + }, + { + "entropy": 1.7108490367730458, + "epoch": 0.9377385954793881, + "grad_norm": 0.6266258358955383, + "learning_rate": 1.2342322796120494e-05, + "loss": 1.4076, + "mean_token_accuracy": 0.6552125016848246, + "num_tokens": 1430964465.0, + "step": 8536 + }, + { + "entropy": 1.7700796524683635, + "epoch": 0.9378484523907611, + "grad_norm": 0.6809885501861572, + "learning_rate": 1.2340739554629285e-05, + "loss": 1.4017, + "mean_token_accuracy": 0.647578035791715, + "num_tokens": 1431106838.0, + "step": 8537 + }, + { + "entropy": 1.722182273864746, + "epoch": 0.937958309302134, + "grad_norm": 0.6057953834533691, + "learning_rate": 1.2339156270704273e-05, + "loss": 1.3765, + "mean_token_accuracy": 0.6555658529202143, + "num_tokens": 1431241799.0, + "step": 8538 + }, + { + "entropy": 1.7782465716203053, + "epoch": 0.9380681662135069, + "grad_norm": 0.7593392729759216, + "learning_rate": 1.233757294439557e-05, + "loss": 1.5713, + "mean_token_accuracy": 0.6409921248753866, + "num_tokens": 1431375532.0, + "step": 8539 + }, + { + "entropy": 1.6467249592145283, + "epoch": 0.9381780231248799, + "grad_norm": 0.6114002466201782, + "learning_rate": 1.2335989575753287e-05, + "loss": 1.3017, + "mean_token_accuracy": 0.666813870271047, + "num_tokens": 1431556355.0, + "step": 8540 + }, + { + "entropy": 1.7343460321426392, + "epoch": 0.9382878800362527, + "grad_norm": 0.673979640007019, + "learning_rate": 1.2334406164827532e-05, + "loss": 1.2794, + "mean_token_accuracy": 0.6679557810227076, + "num_tokens": 1431733091.0, + "step": 8541 + }, + { + "entropy": 1.6641011436780293, + "epoch": 0.9383977369476257, + "grad_norm": 0.7670729160308838, + "learning_rate": 1.2332822711668429e-05, + "loss": 1.3856, + "mean_token_accuracy": 0.6573190341393153, + "num_tokens": 1431889217.0, + "step": 8542 + }, + { + "entropy": 1.7148659825325012, + "epoch": 0.9385075938589986, + "grad_norm": 0.7064855098724365, + "learning_rate": 1.233123921632608e-05, + "loss": 1.3523, + "mean_token_accuracy": 0.665443574388822, + "num_tokens": 1432021404.0, + "step": 8543 + }, + { + "entropy": 1.6672392785549164, + "epoch": 0.9386174507703716, + "grad_norm": 0.6361745595932007, + "learning_rate": 1.2329655678850619e-05, + "loss": 1.412, + "mean_token_accuracy": 0.6467155714829763, + "num_tokens": 1432204022.0, + "step": 8544 + }, + { + "entropy": 1.601523111263911, + "epoch": 0.9387273076817445, + "grad_norm": 0.5796836018562317, + "learning_rate": 1.2328072099292148e-05, + "loss": 1.3599, + "mean_token_accuracy": 0.6564453194538752, + "num_tokens": 1432403878.0, + "step": 8545 + }, + { + "entropy": 1.699288825194041, + "epoch": 0.9388371645931175, + "grad_norm": 0.7255278825759888, + "learning_rate": 1.2326488477700795e-05, + "loss": 1.5015, + "mean_token_accuracy": 0.6359589795271555, + "num_tokens": 1432602079.0, + "step": 8546 + }, + { + "entropy": 1.6769676804542542, + "epoch": 0.9389470215044904, + "grad_norm": 0.7669406533241272, + "learning_rate": 1.2324904814126682e-05, + "loss": 1.4365, + "mean_token_accuracy": 0.645541230837504, + "num_tokens": 1432795695.0, + "step": 8547 + }, + { + "entropy": 1.7266658147176106, + "epoch": 0.9390568784158634, + "grad_norm": 0.6003899574279785, + "learning_rate": 1.2323321108619927e-05, + "loss": 1.5048, + "mean_token_accuracy": 0.6312548617521921, + "num_tokens": 1433015547.0, + "step": 8548 + }, + { + "entropy": 1.694272110859553, + "epoch": 0.9391667353272363, + "grad_norm": 0.597334623336792, + "learning_rate": 1.2321737361230657e-05, + "loss": 1.4965, + "mean_token_accuracy": 0.6420956204334894, + "num_tokens": 1433216205.0, + "step": 8549 + }, + { + "entropy": 1.750846117734909, + "epoch": 0.9392765922386093, + "grad_norm": 0.6369536519050598, + "learning_rate": 1.232015357200899e-05, + "loss": 1.4068, + "mean_token_accuracy": 0.6545988370974859, + "num_tokens": 1433365930.0, + "step": 8550 + }, + { + "entropy": 1.7660410205523174, + "epoch": 0.9393864491499822, + "grad_norm": 0.7357500195503235, + "learning_rate": 1.231856974100506e-05, + "loss": 1.2884, + "mean_token_accuracy": 0.6664379785458246, + "num_tokens": 1433474906.0, + "step": 8551 + }, + { + "entropy": 1.717143605152766, + "epoch": 0.939496306061355, + "grad_norm": 0.7086150050163269, + "learning_rate": 1.2316985868268996e-05, + "loss": 1.2126, + "mean_token_accuracy": 0.6740925163030624, + "num_tokens": 1433578842.0, + "step": 8552 + }, + { + "entropy": 1.689423680305481, + "epoch": 0.939606162972728, + "grad_norm": 0.7993782758712769, + "learning_rate": 1.2315401953850915e-05, + "loss": 1.3022, + "mean_token_accuracy": 0.6719008336464564, + "num_tokens": 1433704238.0, + "step": 8553 + }, + { + "entropy": 1.7148550947507222, + "epoch": 0.9397160198841009, + "grad_norm": 0.6006395220756531, + "learning_rate": 1.2313817997800963e-05, + "loss": 1.3841, + "mean_token_accuracy": 0.6570403923590978, + "num_tokens": 1433895168.0, + "step": 8554 + }, + { + "entropy": 1.7745637098948162, + "epoch": 0.9398258767954739, + "grad_norm": 0.6105915904045105, + "learning_rate": 1.231223400016926e-05, + "loss": 1.3853, + "mean_token_accuracy": 0.6493383248647054, + "num_tokens": 1434091545.0, + "step": 8555 + }, + { + "entropy": 1.6534299850463867, + "epoch": 0.9399357337068468, + "grad_norm": 0.630155622959137, + "learning_rate": 1.2310649961005937e-05, + "loss": 1.3915, + "mean_token_accuracy": 0.6596393237511317, + "num_tokens": 1434295654.0, + "step": 8556 + }, + { + "entropy": 1.6820517877737682, + "epoch": 0.9400455906182198, + "grad_norm": 0.6753036975860596, + "learning_rate": 1.2309065880361139e-05, + "loss": 1.3637, + "mean_token_accuracy": 0.6623709599177042, + "num_tokens": 1434465834.0, + "step": 8557 + }, + { + "entropy": 1.7782000700632732, + "epoch": 0.9401554475295927, + "grad_norm": 0.7968646883964539, + "learning_rate": 1.2307481758284996e-05, + "loss": 1.3908, + "mean_token_accuracy": 0.6523198982079824, + "num_tokens": 1434654878.0, + "step": 8558 + }, + { + "entropy": 1.7569174667199452, + "epoch": 0.9402653044409657, + "grad_norm": 0.8099093437194824, + "learning_rate": 1.2305897594827642e-05, + "loss": 1.4227, + "mean_token_accuracy": 0.654918392499288, + "num_tokens": 1434786625.0, + "step": 8559 + }, + { + "entropy": 1.6676458517710369, + "epoch": 0.9403751613523386, + "grad_norm": 0.7002986669540405, + "learning_rate": 1.230431339003922e-05, + "loss": 1.3799, + "mean_token_accuracy": 0.6608104457457861, + "num_tokens": 1434961024.0, + "step": 8560 + }, + { + "entropy": 1.7227271993954976, + "epoch": 0.9404850182637116, + "grad_norm": 0.6729583144187927, + "learning_rate": 1.2302729143969864e-05, + "loss": 1.4044, + "mean_token_accuracy": 0.6510558873414993, + "num_tokens": 1435139239.0, + "step": 8561 + }, + { + "entropy": 1.7713795006275177, + "epoch": 0.9405948751750844, + "grad_norm": 0.6517491340637207, + "learning_rate": 1.230114485666972e-05, + "loss": 1.3088, + "mean_token_accuracy": 0.6703696896632513, + "num_tokens": 1435306595.0, + "step": 8562 + }, + { + "entropy": 1.7000613113244374, + "epoch": 0.9407047320864574, + "grad_norm": 0.7172781229019165, + "learning_rate": 1.2299560528188928e-05, + "loss": 1.4559, + "mean_token_accuracy": 0.6527749449014664, + "num_tokens": 1435512818.0, + "step": 8563 + }, + { + "entropy": 1.6688750584920247, + "epoch": 0.9408145889978303, + "grad_norm": 0.6413210034370422, + "learning_rate": 1.2297976158577632e-05, + "loss": 1.4387, + "mean_token_accuracy": 0.655961866180102, + "num_tokens": 1435687791.0, + "step": 8564 + }, + { + "entropy": 1.7471512258052826, + "epoch": 0.9409244459092032, + "grad_norm": 0.7935196161270142, + "learning_rate": 1.2296391747885969e-05, + "loss": 1.4428, + "mean_token_accuracy": 0.652156800031662, + "num_tokens": 1435831770.0, + "step": 8565 + }, + { + "entropy": 1.645731806755066, + "epoch": 0.9410343028205762, + "grad_norm": 0.6245310306549072, + "learning_rate": 1.22948072961641e-05, + "loss": 1.3675, + "mean_token_accuracy": 0.6590336362520853, + "num_tokens": 1436053698.0, + "step": 8566 + }, + { + "entropy": 1.7025697429974873, + "epoch": 0.9411441597319491, + "grad_norm": 0.6795003414154053, + "learning_rate": 1.2293222803462157e-05, + "loss": 1.3575, + "mean_token_accuracy": 0.6611177573601404, + "num_tokens": 1436232464.0, + "step": 8567 + }, + { + "entropy": 1.6784963309764862, + "epoch": 0.9412540166433221, + "grad_norm": 0.6474406719207764, + "learning_rate": 1.2291638269830296e-05, + "loss": 1.5794, + "mean_token_accuracy": 0.6418151060740153, + "num_tokens": 1436431748.0, + "step": 8568 + }, + { + "entropy": 1.6753207445144653, + "epoch": 0.941363873554695, + "grad_norm": 0.7023776769638062, + "learning_rate": 1.2290053695318666e-05, + "loss": 1.2703, + "mean_token_accuracy": 0.6644528806209564, + "num_tokens": 1436594419.0, + "step": 8569 + }, + { + "entropy": 1.693762997786204, + "epoch": 0.941473730466068, + "grad_norm": 0.7446289658546448, + "learning_rate": 1.2288469079977423e-05, + "loss": 1.4171, + "mean_token_accuracy": 0.6514114439487457, + "num_tokens": 1436802609.0, + "step": 8570 + }, + { + "entropy": 1.7322840690612793, + "epoch": 0.9415835873774409, + "grad_norm": 0.7329487204551697, + "learning_rate": 1.2286884423856707e-05, + "loss": 1.3476, + "mean_token_accuracy": 0.6640298316876093, + "num_tokens": 1436947973.0, + "step": 8571 + }, + { + "entropy": 1.7172012130419414, + "epoch": 0.9416934442888139, + "grad_norm": 0.7047858238220215, + "learning_rate": 1.2285299727006681e-05, + "loss": 1.2538, + "mean_token_accuracy": 0.6799323062102, + "num_tokens": 1437075212.0, + "step": 8572 + }, + { + "entropy": 1.7134579122066498, + "epoch": 0.9418033012001867, + "grad_norm": 0.7685977816581726, + "learning_rate": 1.22837149894775e-05, + "loss": 1.3434, + "mean_token_accuracy": 0.6664036015669504, + "num_tokens": 1437204294.0, + "step": 8573 + }, + { + "entropy": 1.6684084435304005, + "epoch": 0.9419131581115597, + "grad_norm": 0.6283167004585266, + "learning_rate": 1.2282130211319317e-05, + "loss": 1.4298, + "mean_token_accuracy": 0.6495349705219269, + "num_tokens": 1437409005.0, + "step": 8574 + }, + { + "entropy": 1.7082226773103077, + "epoch": 0.9420230150229326, + "grad_norm": 0.710364043712616, + "learning_rate": 1.228054539258229e-05, + "loss": 1.3381, + "mean_token_accuracy": 0.6665283391873041, + "num_tokens": 1437559170.0, + "step": 8575 + }, + { + "entropy": 1.6750965019067128, + "epoch": 0.9421328719343056, + "grad_norm": 0.7653444409370422, + "learning_rate": 1.227896053331658e-05, + "loss": 1.2684, + "mean_token_accuracy": 0.6685964713493983, + "num_tokens": 1437700695.0, + "step": 8576 + }, + { + "entropy": 1.740403562784195, + "epoch": 0.9422427288456785, + "grad_norm": 0.6980507373809814, + "learning_rate": 1.2277375633572342e-05, + "loss": 1.4079, + "mean_token_accuracy": 0.6575213720401129, + "num_tokens": 1437824155.0, + "step": 8577 + }, + { + "entropy": 1.697914143403371, + "epoch": 0.9423525857570514, + "grad_norm": 1.6362974643707275, + "learning_rate": 1.2275790693399742e-05, + "loss": 1.1908, + "mean_token_accuracy": 0.6685192883014679, + "num_tokens": 1438043105.0, + "step": 8578 + }, + { + "entropy": 1.7175638178984325, + "epoch": 0.9424624426684244, + "grad_norm": 0.7739365696907043, + "learning_rate": 1.2274205712848946e-05, + "loss": 1.2582, + "mean_token_accuracy": 0.6746840725342432, + "num_tokens": 1438171351.0, + "step": 8579 + }, + { + "entropy": 1.7105300724506378, + "epoch": 0.9425722995797973, + "grad_norm": 0.6245592832565308, + "learning_rate": 1.227262069197011e-05, + "loss": 1.4033, + "mean_token_accuracy": 0.6511557598908743, + "num_tokens": 1438333063.0, + "step": 8580 + }, + { + "entropy": 1.7435839176177979, + "epoch": 0.9426821564911703, + "grad_norm": 1.099191665649414, + "learning_rate": 1.2271035630813399e-05, + "loss": 1.6261, + "mean_token_accuracy": 0.6620939721663793, + "num_tokens": 1438480683.0, + "step": 8581 + }, + { + "entropy": 1.6721567908922832, + "epoch": 0.9427920134025431, + "grad_norm": 0.7055137753486633, + "learning_rate": 1.2269450529428987e-05, + "loss": 1.3201, + "mean_token_accuracy": 0.6689964085817337, + "num_tokens": 1438677861.0, + "step": 8582 + }, + { + "entropy": 1.7083663443724315, + "epoch": 0.9429018703139161, + "grad_norm": 0.5708084106445312, + "learning_rate": 1.2267865387867038e-05, + "loss": 1.3933, + "mean_token_accuracy": 0.656741683681806, + "num_tokens": 1438877416.0, + "step": 8583 + }, + { + "entropy": 1.706307937701543, + "epoch": 0.943011727225289, + "grad_norm": 0.7216012477874756, + "learning_rate": 1.2266280206177718e-05, + "loss": 1.4801, + "mean_token_accuracy": 0.6545088092486063, + "num_tokens": 1439042236.0, + "step": 8584 + }, + { + "entropy": 1.70126873254776, + "epoch": 0.943121584136662, + "grad_norm": 0.7581015229225159, + "learning_rate": 1.2264694984411203e-05, + "loss": 1.5482, + "mean_token_accuracy": 0.6490120142698288, + "num_tokens": 1439220469.0, + "step": 8585 + }, + { + "entropy": 1.651670257250468, + "epoch": 0.9432314410480349, + "grad_norm": 0.6032636761665344, + "learning_rate": 1.226310972261766e-05, + "loss": 1.5364, + "mean_token_accuracy": 0.6319657365481058, + "num_tokens": 1439465630.0, + "step": 8586 + }, + { + "entropy": 1.6893612047036488, + "epoch": 0.9433412979594079, + "grad_norm": 0.5619993805885315, + "learning_rate": 1.2261524420847265e-05, + "loss": 1.0479, + "mean_token_accuracy": 0.6818432062864304, + "num_tokens": 1439603786.0, + "step": 8587 + }, + { + "entropy": 1.7372606893380482, + "epoch": 0.9434511548707808, + "grad_norm": 0.8364537954330444, + "learning_rate": 1.225993907915019e-05, + "loss": 1.433, + "mean_token_accuracy": 0.6543504744768143, + "num_tokens": 1439802054.0, + "step": 8588 + }, + { + "entropy": 1.6576914886633556, + "epoch": 0.9435610117821538, + "grad_norm": 0.6688291430473328, + "learning_rate": 1.225835369757661e-05, + "loss": 1.3298, + "mean_token_accuracy": 0.6816399743159612, + "num_tokens": 1439958813.0, + "step": 8589 + }, + { + "entropy": 1.6907293200492859, + "epoch": 0.9436708686935267, + "grad_norm": 0.657168984413147, + "learning_rate": 1.2256768276176702e-05, + "loss": 1.3813, + "mean_token_accuracy": 0.6570898244778315, + "num_tokens": 1440161016.0, + "step": 8590 + }, + { + "entropy": 1.6189829607804616, + "epoch": 0.9437807256048997, + "grad_norm": 0.7953099012374878, + "learning_rate": 1.2255182815000646e-05, + "loss": 1.4897, + "mean_token_accuracy": 0.6546467443307241, + "num_tokens": 1440331282.0, + "step": 8591 + }, + { + "entropy": 1.751821796099345, + "epoch": 0.9438905825162726, + "grad_norm": 0.8301904797554016, + "learning_rate": 1.225359731409862e-05, + "loss": 1.5474, + "mean_token_accuracy": 0.6519018063942591, + "num_tokens": 1440558471.0, + "step": 8592 + }, + { + "entropy": 1.741612325112025, + "epoch": 0.9440004394276454, + "grad_norm": 0.7195350527763367, + "learning_rate": 1.22520117735208e-05, + "loss": 1.4176, + "mean_token_accuracy": 0.6544721672932307, + "num_tokens": 1440739587.0, + "step": 8593 + }, + { + "entropy": 1.6872974336147308, + "epoch": 0.9441102963390184, + "grad_norm": 0.6328041553497314, + "learning_rate": 1.2250426193317376e-05, + "loss": 1.5012, + "mean_token_accuracy": 0.645188053448995, + "num_tokens": 1440907130.0, + "step": 8594 + }, + { + "entropy": 1.7358746826648712, + "epoch": 0.9442201532503913, + "grad_norm": 0.6817638278007507, + "learning_rate": 1.2248840573538522e-05, + "loss": 1.43, + "mean_token_accuracy": 0.6595302472511927, + "num_tokens": 1441080715.0, + "step": 8595 + }, + { + "entropy": 1.6565217673778534, + "epoch": 0.9443300101617643, + "grad_norm": 0.6565890312194824, + "learning_rate": 1.224725491423443e-05, + "loss": 1.5008, + "mean_token_accuracy": 0.6534999509652456, + "num_tokens": 1441287677.0, + "step": 8596 + }, + { + "entropy": 1.7144455512364705, + "epoch": 0.9444398670731372, + "grad_norm": 0.6671959757804871, + "learning_rate": 1.224566921545528e-05, + "loss": 1.4134, + "mean_token_accuracy": 0.6520157555739085, + "num_tokens": 1441433876.0, + "step": 8597 + }, + { + "entropy": 1.690605749686559, + "epoch": 0.9445497239845102, + "grad_norm": 0.723557710647583, + "learning_rate": 1.2244083477251265e-05, + "loss": 1.5496, + "mean_token_accuracy": 0.6480442037185034, + "num_tokens": 1441611098.0, + "step": 8598 + }, + { + "entropy": 1.7220774292945862, + "epoch": 0.9446595808958831, + "grad_norm": 0.6669568419456482, + "learning_rate": 1.2242497699672562e-05, + "loss": 1.547, + "mean_token_accuracy": 0.6334994534651438, + "num_tokens": 1441771369.0, + "step": 8599 + }, + { + "entropy": 1.7486573259035747, + "epoch": 0.9447694378072561, + "grad_norm": 0.6781217455863953, + "learning_rate": 1.2240911882769372e-05, + "loss": 1.4171, + "mean_token_accuracy": 0.645118405421575, + "num_tokens": 1441916756.0, + "step": 8600 + }, + { + "entropy": 1.6590690712134044, + "epoch": 0.944879294718629, + "grad_norm": 0.6555703282356262, + "learning_rate": 1.2239326026591877e-05, + "loss": 1.2832, + "mean_token_accuracy": 0.6796758274237314, + "num_tokens": 1442041756.0, + "step": 8601 + }, + { + "entropy": 1.7096993426481883, + "epoch": 0.944989151630002, + "grad_norm": 0.690627932548523, + "learning_rate": 1.2237740131190275e-05, + "loss": 1.3739, + "mean_token_accuracy": 0.66547991335392, + "num_tokens": 1442198769.0, + "step": 8602 + }, + { + "entropy": 1.728843520085017, + "epoch": 0.9450990085413749, + "grad_norm": 0.7249277830123901, + "learning_rate": 1.2236154196614754e-05, + "loss": 1.5124, + "mean_token_accuracy": 0.6484755227963129, + "num_tokens": 1442371906.0, + "step": 8603 + }, + { + "entropy": 1.7191427449385326, + "epoch": 0.9452088654527478, + "grad_norm": 0.6279011368751526, + "learning_rate": 1.2234568222915511e-05, + "loss": 1.5187, + "mean_token_accuracy": 0.6500843664010366, + "num_tokens": 1442549837.0, + "step": 8604 + }, + { + "entropy": 1.7306394279003143, + "epoch": 0.9453187223641207, + "grad_norm": 0.7033337950706482, + "learning_rate": 1.2232982210142734e-05, + "loss": 1.4231, + "mean_token_accuracy": 0.649728591243426, + "num_tokens": 1442708077.0, + "step": 8605 + }, + { + "entropy": 1.6703736782073975, + "epoch": 0.9454285792754936, + "grad_norm": 0.745704174041748, + "learning_rate": 1.2231396158346631e-05, + "loss": 1.4547, + "mean_token_accuracy": 0.6601613610982895, + "num_tokens": 1442846251.0, + "step": 8606 + }, + { + "entropy": 1.6758897602558136, + "epoch": 0.9455384361868666, + "grad_norm": 0.6928910613059998, + "learning_rate": 1.2229810067577395e-05, + "loss": 1.5418, + "mean_token_accuracy": 0.6258559823036194, + "num_tokens": 1443052093.0, + "step": 8607 + }, + { + "entropy": 1.685420682032903, + "epoch": 0.9456482930982395, + "grad_norm": 0.6015498042106628, + "learning_rate": 1.2228223937885222e-05, + "loss": 1.3963, + "mean_token_accuracy": 0.6562847743431727, + "num_tokens": 1443234288.0, + "step": 8608 + }, + { + "entropy": 1.7362493971983592, + "epoch": 0.9457581500096125, + "grad_norm": 0.7113436460494995, + "learning_rate": 1.2226637769320316e-05, + "loss": 1.3335, + "mean_token_accuracy": 0.6664293905099233, + "num_tokens": 1443373399.0, + "step": 8609 + }, + { + "entropy": 1.7087362408638, + "epoch": 0.9458680069209854, + "grad_norm": 0.6327268481254578, + "learning_rate": 1.2225051561932877e-05, + "loss": 1.4998, + "mean_token_accuracy": 0.6323538819948832, + "num_tokens": 1443518849.0, + "step": 8610 + }, + { + "entropy": 1.673716555039088, + "epoch": 0.9459778638323584, + "grad_norm": 0.6509339213371277, + "learning_rate": 1.2223465315773109e-05, + "loss": 1.3558, + "mean_token_accuracy": 0.6670020073652267, + "num_tokens": 1443642314.0, + "step": 8611 + }, + { + "entropy": 1.6992060740788777, + "epoch": 0.9460877207437313, + "grad_norm": 0.8136547207832336, + "learning_rate": 1.2221879030891214e-05, + "loss": 1.5503, + "mean_token_accuracy": 0.6302010516325632, + "num_tokens": 1443856184.0, + "step": 8612 + }, + { + "entropy": 1.7098911603291829, + "epoch": 0.9461975776551043, + "grad_norm": 0.7616743445396423, + "learning_rate": 1.2220292707337396e-05, + "loss": 1.4039, + "mean_token_accuracy": 0.650201790034771, + "num_tokens": 1443988081.0, + "step": 8613 + }, + { + "entropy": 1.679803490638733, + "epoch": 0.9463074345664771, + "grad_norm": 0.8417178988456726, + "learning_rate": 1.2218706345161869e-05, + "loss": 1.3653, + "mean_token_accuracy": 0.6717022359371185, + "num_tokens": 1444134135.0, + "step": 8614 + }, + { + "entropy": 1.7369107902050018, + "epoch": 0.9464172914778501, + "grad_norm": 0.7196716070175171, + "learning_rate": 1.221711994441483e-05, + "loss": 1.259, + "mean_token_accuracy": 0.6838698089122772, + "num_tokens": 1444243607.0, + "step": 8615 + }, + { + "entropy": 1.6895853380362194, + "epoch": 0.946527148389223, + "grad_norm": 0.6455492973327637, + "learning_rate": 1.2215533505146498e-05, + "loss": 1.5036, + "mean_token_accuracy": 0.638408382733663, + "num_tokens": 1444393001.0, + "step": 8616 + }, + { + "entropy": 1.7575147449970245, + "epoch": 0.946637005300596, + "grad_norm": 0.7868098020553589, + "learning_rate": 1.2213947027407074e-05, + "loss": 1.4593, + "mean_token_accuracy": 0.6475796749194463, + "num_tokens": 1444581539.0, + "step": 8617 + }, + { + "entropy": 1.6824683447678883, + "epoch": 0.9467468622119689, + "grad_norm": 0.7493737936019897, + "learning_rate": 1.2212360511246775e-05, + "loss": 1.2181, + "mean_token_accuracy": 0.6755462735891342, + "num_tokens": 1444696551.0, + "step": 8618 + }, + { + "entropy": 1.685210108757019, + "epoch": 0.9468567191233418, + "grad_norm": 0.6668775081634521, + "learning_rate": 1.221077395671581e-05, + "loss": 1.3536, + "mean_token_accuracy": 0.6589836031198502, + "num_tokens": 1444842322.0, + "step": 8619 + }, + { + "entropy": 1.695833792289098, + "epoch": 0.9469665760347148, + "grad_norm": 0.6757495403289795, + "learning_rate": 1.2209187363864403e-05, + "loss": 1.4149, + "mean_token_accuracy": 0.6574927568435669, + "num_tokens": 1444978361.0, + "step": 8620 + }, + { + "entropy": 1.6860649983088176, + "epoch": 0.9470764329460877, + "grad_norm": 0.8738111853599548, + "learning_rate": 1.2207600732742753e-05, + "loss": 1.4086, + "mean_token_accuracy": 0.672735000650088, + "num_tokens": 1445114765.0, + "step": 8621 + }, + { + "entropy": 1.7755548655986786, + "epoch": 0.9471862898574607, + "grad_norm": 0.7084314227104187, + "learning_rate": 1.2206014063401088e-05, + "loss": 1.4593, + "mean_token_accuracy": 0.6394910415013632, + "num_tokens": 1445280099.0, + "step": 8622 + }, + { + "entropy": 1.6687957346439362, + "epoch": 0.9472961467688336, + "grad_norm": 0.6261263489723206, + "learning_rate": 1.2204427355889619e-05, + "loss": 1.4939, + "mean_token_accuracy": 0.6568110336860021, + "num_tokens": 1445470949.0, + "step": 8623 + }, + { + "entropy": 1.6220212280750275, + "epoch": 0.9474060036802066, + "grad_norm": 0.6310474276542664, + "learning_rate": 1.2202840610258567e-05, + "loss": 1.4776, + "mean_token_accuracy": 0.6486604412396749, + "num_tokens": 1445669405.0, + "step": 8624 + }, + { + "entropy": 1.7591717044512432, + "epoch": 0.9475158605915794, + "grad_norm": 0.6975616812705994, + "learning_rate": 1.2201253826558151e-05, + "loss": 1.4629, + "mean_token_accuracy": 0.6384722590446472, + "num_tokens": 1445848452.0, + "step": 8625 + }, + { + "entropy": 1.7158987323443096, + "epoch": 0.9476257175029524, + "grad_norm": 0.7575097680091858, + "learning_rate": 1.2199667004838595e-05, + "loss": 1.5049, + "mean_token_accuracy": 0.6423617899417877, + "num_tokens": 1446025783.0, + "step": 8626 + }, + { + "entropy": 1.7229747573534648, + "epoch": 0.9477355744143253, + "grad_norm": 0.7464864253997803, + "learning_rate": 1.2198080145150115e-05, + "loss": 1.5275, + "mean_token_accuracy": 0.6419810652732849, + "num_tokens": 1446233918.0, + "step": 8627 + }, + { + "entropy": 1.6983900268872578, + "epoch": 0.9478454313256983, + "grad_norm": 0.7038105130195618, + "learning_rate": 1.2196493247542945e-05, + "loss": 1.4288, + "mean_token_accuracy": 0.6560607403516769, + "num_tokens": 1446397858.0, + "step": 8628 + }, + { + "entropy": 1.6872251629829407, + "epoch": 0.9479552882370712, + "grad_norm": 0.6693107485771179, + "learning_rate": 1.2194906312067298e-05, + "loss": 1.4111, + "mean_token_accuracy": 0.6616791983445486, + "num_tokens": 1446572029.0, + "step": 8629 + }, + { + "entropy": 1.734555850426356, + "epoch": 0.9480651451484442, + "grad_norm": 0.8772332668304443, + "learning_rate": 1.2193319338773407e-05, + "loss": 1.2269, + "mean_token_accuracy": 0.675347904364268, + "num_tokens": 1446687371.0, + "step": 8630 + }, + { + "entropy": 1.7342069049676259, + "epoch": 0.9481750020598171, + "grad_norm": 0.7794189453125, + "learning_rate": 1.2191732327711494e-05, + "loss": 1.4603, + "mean_token_accuracy": 0.6412904510895411, + "num_tokens": 1446848084.0, + "step": 8631 + }, + { + "entropy": 1.6514850755532582, + "epoch": 0.94828485897119, + "grad_norm": 0.660847544670105, + "learning_rate": 1.219014527893179e-05, + "loss": 1.3589, + "mean_token_accuracy": 0.6614196399847666, + "num_tokens": 1447020659.0, + "step": 8632 + }, + { + "entropy": 1.752412219842275, + "epoch": 0.948394715882563, + "grad_norm": 0.7109209895133972, + "learning_rate": 1.2188558192484524e-05, + "loss": 1.476, + "mean_token_accuracy": 0.6500055193901062, + "num_tokens": 1447193145.0, + "step": 8633 + }, + { + "entropy": 1.7354917923609416, + "epoch": 0.9485045727939359, + "grad_norm": 0.7011961936950684, + "learning_rate": 1.2186971068419933e-05, + "loss": 1.405, + "mean_token_accuracy": 0.6602567285299301, + "num_tokens": 1447336250.0, + "step": 8634 + }, + { + "entropy": 1.7502802014350891, + "epoch": 0.9486144297053088, + "grad_norm": 0.5763677358627319, + "learning_rate": 1.2185383906788235e-05, + "loss": 1.3808, + "mean_token_accuracy": 0.6556936403115591, + "num_tokens": 1447545615.0, + "step": 8635 + }, + { + "entropy": 1.6883324980735779, + "epoch": 0.9487242866166817, + "grad_norm": 0.6211029887199402, + "learning_rate": 1.2183796707639672e-05, + "loss": 1.412, + "mean_token_accuracy": 0.6450077096621195, + "num_tokens": 1447732873.0, + "step": 8636 + }, + { + "entropy": 1.7086971402168274, + "epoch": 0.9488341435280547, + "grad_norm": 0.7448198199272156, + "learning_rate": 1.2182209471024478e-05, + "loss": 1.2632, + "mean_token_accuracy": 0.6758876889944077, + "num_tokens": 1447854972.0, + "step": 8637 + }, + { + "entropy": 1.7209039429823558, + "epoch": 0.9489440004394276, + "grad_norm": 0.770972490310669, + "learning_rate": 1.2180622196992889e-05, + "loss": 1.4801, + "mean_token_accuracy": 0.6674757947524389, + "num_tokens": 1447990717.0, + "step": 8638 + }, + { + "entropy": 1.6779508491357167, + "epoch": 0.9490538573508006, + "grad_norm": 0.6987808346748352, + "learning_rate": 1.2179034885595133e-05, + "loss": 1.4486, + "mean_token_accuracy": 0.6476506143808365, + "num_tokens": 1448146980.0, + "step": 8639 + }, + { + "entropy": 1.70108496149381, + "epoch": 0.9491637142621735, + "grad_norm": 0.5300113558769226, + "learning_rate": 1.217744753688146e-05, + "loss": 1.4492, + "mean_token_accuracy": 0.6409799307584763, + "num_tokens": 1448398932.0, + "step": 8640 + }, + { + "entropy": 1.7490708430608113, + "epoch": 0.9492735711735465, + "grad_norm": 0.7466815710067749, + "learning_rate": 1.2175860150902103e-05, + "loss": 1.3808, + "mean_token_accuracy": 0.6627901097138723, + "num_tokens": 1448517817.0, + "step": 8641 + }, + { + "entropy": 1.8031253119309743, + "epoch": 0.9493834280849194, + "grad_norm": 0.7149515748023987, + "learning_rate": 1.21742727277073e-05, + "loss": 1.4257, + "mean_token_accuracy": 0.6576060056686401, + "num_tokens": 1448680244.0, + "step": 8642 + }, + { + "entropy": 1.6629578669865925, + "epoch": 0.9494932849962924, + "grad_norm": 0.8295400738716125, + "learning_rate": 1.2172685267347293e-05, + "loss": 1.3317, + "mean_token_accuracy": 0.6723531931638718, + "num_tokens": 1448837554.0, + "step": 8643 + }, + { + "entropy": 1.6949988305568695, + "epoch": 0.9496031419076653, + "grad_norm": 0.5842585563659668, + "learning_rate": 1.2171097769872331e-05, + "loss": 1.4098, + "mean_token_accuracy": 0.6408194800217947, + "num_tokens": 1449029085.0, + "step": 8644 + }, + { + "entropy": 1.6800251007080078, + "epoch": 0.9497129988190383, + "grad_norm": 0.6801996231079102, + "learning_rate": 1.216951023533265e-05, + "loss": 1.2726, + "mean_token_accuracy": 0.6702685306469599, + "num_tokens": 1449167357.0, + "step": 8645 + }, + { + "entropy": 1.7152654727300007, + "epoch": 0.9498228557304111, + "grad_norm": 0.6115834712982178, + "learning_rate": 1.2167922663778493e-05, + "loss": 1.4586, + "mean_token_accuracy": 0.6457581520080566, + "num_tokens": 1449365920.0, + "step": 8646 + }, + { + "entropy": 1.6668970982233684, + "epoch": 0.949932712641784, + "grad_norm": 0.6367796063423157, + "learning_rate": 1.2166335055260112e-05, + "loss": 1.544, + "mean_token_accuracy": 0.6396810958782831, + "num_tokens": 1449592927.0, + "step": 8647 + }, + { + "entropy": 1.6488535205523174, + "epoch": 0.950042569553157, + "grad_norm": 0.6653515100479126, + "learning_rate": 1.2164747409827755e-05, + "loss": 1.3773, + "mean_token_accuracy": 0.6532280345757803, + "num_tokens": 1449757980.0, + "step": 8648 + }, + { + "entropy": 1.7187113364537556, + "epoch": 0.9501524264645299, + "grad_norm": 0.6801130771636963, + "learning_rate": 1.2163159727531664e-05, + "loss": 1.4093, + "mean_token_accuracy": 0.6681001136700312, + "num_tokens": 1449900483.0, + "step": 8649 + }, + { + "entropy": 1.6937756339708965, + "epoch": 0.9502622833759029, + "grad_norm": 0.6868298053741455, + "learning_rate": 1.2161572008422093e-05, + "loss": 1.4056, + "mean_token_accuracy": 0.6516019354263941, + "num_tokens": 1450048324.0, + "step": 8650 + }, + { + "entropy": 1.7173837820688884, + "epoch": 0.9503721402872758, + "grad_norm": 0.6891928315162659, + "learning_rate": 1.215998425254929e-05, + "loss": 1.3734, + "mean_token_accuracy": 0.6473551144202551, + "num_tokens": 1450207740.0, + "step": 8651 + }, + { + "entropy": 1.68080539504687, + "epoch": 0.9504819971986488, + "grad_norm": 0.7287924289703369, + "learning_rate": 1.2158396459963511e-05, + "loss": 1.4223, + "mean_token_accuracy": 0.6610532452662786, + "num_tokens": 1450370382.0, + "step": 8652 + }, + { + "entropy": 1.7321538031101227, + "epoch": 0.9505918541100217, + "grad_norm": 0.8169899582862854, + "learning_rate": 1.2156808630715004e-05, + "loss": 1.5197, + "mean_token_accuracy": 0.6509786198536555, + "num_tokens": 1450555161.0, + "step": 8653 + }, + { + "entropy": 1.7048485080401103, + "epoch": 0.9507017110213947, + "grad_norm": 0.6728548407554626, + "learning_rate": 1.2155220764854027e-05, + "loss": 1.2885, + "mean_token_accuracy": 0.6768156687418619, + "num_tokens": 1450701110.0, + "step": 8654 + }, + { + "entropy": 1.7684779067834218, + "epoch": 0.9508115679327676, + "grad_norm": 0.7296202778816223, + "learning_rate": 1.2153632862430828e-05, + "loss": 1.4785, + "mean_token_accuracy": 0.6470775653918585, + "num_tokens": 1450896700.0, + "step": 8655 + }, + { + "entropy": 1.7234237790107727, + "epoch": 0.9509214248441406, + "grad_norm": 0.691101610660553, + "learning_rate": 1.2152044923495676e-05, + "loss": 1.4179, + "mean_token_accuracy": 0.6461136788129807, + "num_tokens": 1451079096.0, + "step": 8656 + }, + { + "entropy": 1.6851498285929363, + "epoch": 0.9510312817555134, + "grad_norm": 0.6487606763839722, + "learning_rate": 1.215045694809882e-05, + "loss": 1.3169, + "mean_token_accuracy": 0.6667506843805313, + "num_tokens": 1451221118.0, + "step": 8657 + }, + { + "entropy": 1.709127922852834, + "epoch": 0.9511411386668864, + "grad_norm": 0.7377097606658936, + "learning_rate": 1.2148868936290515e-05, + "loss": 1.4872, + "mean_token_accuracy": 0.655120978752772, + "num_tokens": 1451396693.0, + "step": 8658 + }, + { + "entropy": 1.6430183351039886, + "epoch": 0.9512509955782593, + "grad_norm": 0.6040515899658203, + "learning_rate": 1.2147280888121026e-05, + "loss": 1.5599, + "mean_token_accuracy": 0.6429479469855627, + "num_tokens": 1451618865.0, + "step": 8659 + }, + { + "entropy": 1.7497617801030476, + "epoch": 0.9513608524896322, + "grad_norm": 0.613490104675293, + "learning_rate": 1.2145692803640621e-05, + "loss": 1.3799, + "mean_token_accuracy": 0.6475595831871033, + "num_tokens": 1451744744.0, + "step": 8660 + }, + { + "entropy": 1.7006073792775471, + "epoch": 0.9514707094010052, + "grad_norm": 0.6991299986839294, + "learning_rate": 1.2144104682899548e-05, + "loss": 1.5495, + "mean_token_accuracy": 0.6511161873737971, + "num_tokens": 1451894113.0, + "step": 8661 + }, + { + "entropy": 1.6935534576574962, + "epoch": 0.9515805663123781, + "grad_norm": 0.7065954208374023, + "learning_rate": 1.2142516525948083e-05, + "loss": 1.3041, + "mean_token_accuracy": 0.6719731688499451, + "num_tokens": 1452005836.0, + "step": 8662 + }, + { + "entropy": 1.6703713536262512, + "epoch": 0.9516904232237511, + "grad_norm": 0.6174436211585999, + "learning_rate": 1.214092833283648e-05, + "loss": 1.3075, + "mean_token_accuracy": 0.6695892562468847, + "num_tokens": 1452146942.0, + "step": 8663 + }, + { + "entropy": 1.7062116861343384, + "epoch": 0.951800280135124, + "grad_norm": 0.6128714084625244, + "learning_rate": 1.2139340103615011e-05, + "loss": 1.398, + "mean_token_accuracy": 0.6536041001478831, + "num_tokens": 1452297995.0, + "step": 8664 + }, + { + "entropy": 1.691074013710022, + "epoch": 0.951910137046497, + "grad_norm": 0.618613600730896, + "learning_rate": 1.2137751838333943e-05, + "loss": 1.4527, + "mean_token_accuracy": 0.6470177272955576, + "num_tokens": 1452490687.0, + "step": 8665 + }, + { + "entropy": 1.7151458064715068, + "epoch": 0.9520199939578698, + "grad_norm": 0.6557570695877075, + "learning_rate": 1.213616353704354e-05, + "loss": 1.4584, + "mean_token_accuracy": 0.6519462615251541, + "num_tokens": 1452657863.0, + "step": 8666 + }, + { + "entropy": 1.7122306029001872, + "epoch": 0.9521298508692428, + "grad_norm": 0.675037682056427, + "learning_rate": 1.2134575199794072e-05, + "loss": 1.4007, + "mean_token_accuracy": 0.6560028443733851, + "num_tokens": 1452824357.0, + "step": 8667 + }, + { + "entropy": 1.7048703233400981, + "epoch": 0.9522397077806157, + "grad_norm": 0.7334290146827698, + "learning_rate": 1.213298682663581e-05, + "loss": 1.3043, + "mean_token_accuracy": 0.6710883726676306, + "num_tokens": 1452982914.0, + "step": 8668 + }, + { + "entropy": 1.7539819777011871, + "epoch": 0.9523495646919887, + "grad_norm": 0.7398406267166138, + "learning_rate": 1.2131398417619029e-05, + "loss": 1.3738, + "mean_token_accuracy": 0.6580022970835367, + "num_tokens": 1453109599.0, + "step": 8669 + }, + { + "entropy": 1.6519952714443207, + "epoch": 0.9524594216033616, + "grad_norm": 0.7986524701118469, + "learning_rate": 1.2129809972793997e-05, + "loss": 1.413, + "mean_token_accuracy": 0.658644050359726, + "num_tokens": 1453319483.0, + "step": 8670 + }, + { + "entropy": 1.7309541801611583, + "epoch": 0.9525692785147346, + "grad_norm": 0.7906885147094727, + "learning_rate": 1.2128221492210986e-05, + "loss": 1.3046, + "mean_token_accuracy": 0.6700306981801987, + "num_tokens": 1453490803.0, + "step": 8671 + }, + { + "entropy": 1.6974034408728282, + "epoch": 0.9526791354261075, + "grad_norm": 0.7683852314949036, + "learning_rate": 1.2126632975920277e-05, + "loss": 1.3621, + "mean_token_accuracy": 0.6743018825848898, + "num_tokens": 1453617883.0, + "step": 8672 + }, + { + "entropy": 1.7786914706230164, + "epoch": 0.9527889923374804, + "grad_norm": 0.7997028231620789, + "learning_rate": 1.2125044423972139e-05, + "loss": 1.588, + "mean_token_accuracy": 0.6399167478084564, + "num_tokens": 1453781186.0, + "step": 8673 + }, + { + "entropy": 1.7436818778514862, + "epoch": 0.9528988492488534, + "grad_norm": 0.702833890914917, + "learning_rate": 1.2123455836416852e-05, + "loss": 1.2924, + "mean_token_accuracy": 0.6721568206946055, + "num_tokens": 1453888795.0, + "step": 8674 + }, + { + "entropy": 1.7169397870699565, + "epoch": 0.9530087061602263, + "grad_norm": 0.7126211524009705, + "learning_rate": 1.2121867213304692e-05, + "loss": 1.4358, + "mean_token_accuracy": 0.6485247810681661, + "num_tokens": 1454039464.0, + "step": 8675 + }, + { + "entropy": 1.628256380558014, + "epoch": 0.9531185630715993, + "grad_norm": 0.6888135671615601, + "learning_rate": 1.2120278554685944e-05, + "loss": 1.3604, + "mean_token_accuracy": 0.6754744102557501, + "num_tokens": 1454188174.0, + "step": 8676 + }, + { + "entropy": 1.7532523274421692, + "epoch": 0.9532284199829721, + "grad_norm": 0.6963343620300293, + "learning_rate": 1.2118689860610882e-05, + "loss": 1.3401, + "mean_token_accuracy": 0.6503902872403463, + "num_tokens": 1454357110.0, + "step": 8677 + }, + { + "entropy": 1.7357207636038463, + "epoch": 0.9533382768943451, + "grad_norm": 0.6894516348838806, + "learning_rate": 1.2117101131129793e-05, + "loss": 1.3727, + "mean_token_accuracy": 0.661634643872579, + "num_tokens": 1454486942.0, + "step": 8678 + }, + { + "entropy": 1.674231469631195, + "epoch": 0.953448133805718, + "grad_norm": 0.5685495734214783, + "learning_rate": 1.2115512366292954e-05, + "loss": 1.3578, + "mean_token_accuracy": 0.6616425861914953, + "num_tokens": 1454666674.0, + "step": 8679 + }, + { + "entropy": 1.765285313129425, + "epoch": 0.953557990717091, + "grad_norm": 0.7109258770942688, + "learning_rate": 1.2113923566150651e-05, + "loss": 1.3287, + "mean_token_accuracy": 0.6567708303531011, + "num_tokens": 1454802351.0, + "step": 8680 + }, + { + "entropy": 1.709367722272873, + "epoch": 0.9536678476284639, + "grad_norm": 0.6101370453834534, + "learning_rate": 1.211233473075317e-05, + "loss": 1.3885, + "mean_token_accuracy": 0.6554233133792877, + "num_tokens": 1454989959.0, + "step": 8681 + }, + { + "entropy": 1.7657522161801655, + "epoch": 0.9537777045398369, + "grad_norm": 0.6212213039398193, + "learning_rate": 1.2110745860150798e-05, + "loss": 1.4819, + "mean_token_accuracy": 0.6521278421084086, + "num_tokens": 1455172258.0, + "step": 8682 + }, + { + "entropy": 1.7684936622778575, + "epoch": 0.9538875614512098, + "grad_norm": 0.8916065692901611, + "learning_rate": 1.2109156954393815e-05, + "loss": 1.516, + "mean_token_accuracy": 0.6521298487981161, + "num_tokens": 1455316700.0, + "step": 8683 + }, + { + "entropy": 1.7200209399064381, + "epoch": 0.9539974183625828, + "grad_norm": 0.6917714476585388, + "learning_rate": 1.210756801353252e-05, + "loss": 1.5067, + "mean_token_accuracy": 0.6368722418944041, + "num_tokens": 1455501678.0, + "step": 8684 + }, + { + "entropy": 1.7058296203613281, + "epoch": 0.9541072752739557, + "grad_norm": 0.6779616475105286, + "learning_rate": 1.2105979037617196e-05, + "loss": 1.4618, + "mean_token_accuracy": 0.6522943874200186, + "num_tokens": 1455659558.0, + "step": 8685 + }, + { + "entropy": 1.648918906847636, + "epoch": 0.9542171321853287, + "grad_norm": 0.5767722725868225, + "learning_rate": 1.210439002669813e-05, + "loss": 1.4618, + "mean_token_accuracy": 0.6379017184178034, + "num_tokens": 1455914506.0, + "step": 8686 + }, + { + "entropy": 1.7035534083843231, + "epoch": 0.9543269890967016, + "grad_norm": 0.7029200792312622, + "learning_rate": 1.2102800980825617e-05, + "loss": 1.3144, + "mean_token_accuracy": 0.6641533325115839, + "num_tokens": 1456036680.0, + "step": 8687 + }, + { + "entropy": 1.68134809533755, + "epoch": 0.9544368460080744, + "grad_norm": 0.9198618531227112, + "learning_rate": 1.2101211900049954e-05, + "loss": 1.3963, + "mean_token_accuracy": 0.6574839899937311, + "num_tokens": 1456182571.0, + "step": 8688 + }, + { + "entropy": 1.7450095514456432, + "epoch": 0.9545467029194474, + "grad_norm": 0.7267429232597351, + "learning_rate": 1.2099622784421426e-05, + "loss": 1.4871, + "mean_token_accuracy": 0.6336076408624649, + "num_tokens": 1456363555.0, + "step": 8689 + }, + { + "entropy": 1.749456803003947, + "epoch": 0.9546565598308203, + "grad_norm": 4.669123649597168, + "learning_rate": 1.2098033633990336e-05, + "loss": 1.02, + "mean_token_accuracy": 0.6833541542291641, + "num_tokens": 1456514456.0, + "step": 8690 + }, + { + "entropy": 1.7409884134928386, + "epoch": 0.9547664167421933, + "grad_norm": 0.7867989540100098, + "learning_rate": 1.2096444448806977e-05, + "loss": 1.4597, + "mean_token_accuracy": 0.6389687110980352, + "num_tokens": 1456692736.0, + "step": 8691 + }, + { + "entropy": 1.701568841934204, + "epoch": 0.9548762736535662, + "grad_norm": 0.6494891047477722, + "learning_rate": 1.209485522892164e-05, + "loss": 1.3752, + "mean_token_accuracy": 0.6650643845399221, + "num_tokens": 1456880796.0, + "step": 8692 + }, + { + "entropy": 1.7152188817660015, + "epoch": 0.9549861305649392, + "grad_norm": 0.7680609226226807, + "learning_rate": 1.2093265974384631e-05, + "loss": 1.3529, + "mean_token_accuracy": 0.6595882922410965, + "num_tokens": 1457008533.0, + "step": 8693 + }, + { + "entropy": 1.722943127155304, + "epoch": 0.9550959874763121, + "grad_norm": 0.6621650457382202, + "learning_rate": 1.2091676685246252e-05, + "loss": 1.5738, + "mean_token_accuracy": 0.6255774199962616, + "num_tokens": 1457229767.0, + "step": 8694 + }, + { + "entropy": 1.702676256497701, + "epoch": 0.9552058443876851, + "grad_norm": 0.5807628631591797, + "learning_rate": 1.209008736155679e-05, + "loss": 1.4362, + "mean_token_accuracy": 0.657213474313418, + "num_tokens": 1457448218.0, + "step": 8695 + }, + { + "entropy": 1.7524159948031108, + "epoch": 0.955315701299058, + "grad_norm": 0.9052096605300903, + "learning_rate": 1.208849800336656e-05, + "loss": 1.5353, + "mean_token_accuracy": 0.6409603903690974, + "num_tokens": 1457652077.0, + "step": 8696 + }, + { + "entropy": 1.6945769389470418, + "epoch": 0.955425558210431, + "grad_norm": 0.6669119596481323, + "learning_rate": 1.2086908610725854e-05, + "loss": 1.5198, + "mean_token_accuracy": 0.6341730306545893, + "num_tokens": 1457881605.0, + "step": 8697 + }, + { + "entropy": 1.757227510213852, + "epoch": 0.9555354151218038, + "grad_norm": 0.6839233636856079, + "learning_rate": 1.2085319183684981e-05, + "loss": 1.4284, + "mean_token_accuracy": 0.6508975972731909, + "num_tokens": 1458082960.0, + "step": 8698 + }, + { + "entropy": 1.7042691508928935, + "epoch": 0.9556452720331768, + "grad_norm": 0.6574342846870422, + "learning_rate": 1.2083729722294246e-05, + "loss": 1.5346, + "mean_token_accuracy": 0.6502645313739777, + "num_tokens": 1458281946.0, + "step": 8699 + }, + { + "entropy": 1.6815843482812245, + "epoch": 0.9557551289445497, + "grad_norm": 0.7146515846252441, + "learning_rate": 1.2082140226603955e-05, + "loss": 1.3785, + "mean_token_accuracy": 0.6631735612948736, + "num_tokens": 1458433277.0, + "step": 8700 + }, + { + "entropy": 1.6828182240327199, + "epoch": 0.9558649858559226, + "grad_norm": 0.6447663307189941, + "learning_rate": 1.2080550696664413e-05, + "loss": 1.2576, + "mean_token_accuracy": 0.6774131655693054, + "num_tokens": 1458590309.0, + "step": 8701 + }, + { + "entropy": 1.6610161860783894, + "epoch": 0.9559748427672956, + "grad_norm": 0.5871066451072693, + "learning_rate": 1.2078961132525929e-05, + "loss": 1.325, + "mean_token_accuracy": 0.666997030377388, + "num_tokens": 1458767372.0, + "step": 8702 + }, + { + "entropy": 1.6988926430543263, + "epoch": 0.9560846996786685, + "grad_norm": 0.6767246127128601, + "learning_rate": 1.2077371534238809e-05, + "loss": 1.4668, + "mean_token_accuracy": 0.6547530144453049, + "num_tokens": 1458929290.0, + "step": 8703 + }, + { + "entropy": 1.7858167787392933, + "epoch": 0.9561945565900415, + "grad_norm": 0.6854000091552734, + "learning_rate": 1.2075781901853367e-05, + "loss": 1.3713, + "mean_token_accuracy": 0.658269797762235, + "num_tokens": 1459077793.0, + "step": 8704 + }, + { + "entropy": 1.6693780521551769, + "epoch": 0.9563044135014144, + "grad_norm": 0.6797814965248108, + "learning_rate": 1.2074192235419908e-05, + "loss": 1.2583, + "mean_token_accuracy": 0.6774491270383199, + "num_tokens": 1459203208.0, + "step": 8705 + }, + { + "entropy": 1.7343849937121074, + "epoch": 0.9564142704127874, + "grad_norm": 0.588331401348114, + "learning_rate": 1.2072602534988756e-05, + "loss": 1.4504, + "mean_token_accuracy": 0.643997256954511, + "num_tokens": 1459385818.0, + "step": 8706 + }, + { + "entropy": 1.6691008905569713, + "epoch": 0.9565241273241603, + "grad_norm": 0.7460022568702698, + "learning_rate": 1.2071012800610214e-05, + "loss": 1.3452, + "mean_token_accuracy": 0.6772498339414597, + "num_tokens": 1459524006.0, + "step": 8707 + }, + { + "entropy": 1.6213213801383972, + "epoch": 0.9566339842355333, + "grad_norm": 0.6637667417526245, + "learning_rate": 1.2069423032334598e-05, + "loss": 1.3413, + "mean_token_accuracy": 0.6586452474196752, + "num_tokens": 1459689191.0, + "step": 8708 + }, + { + "entropy": 1.6628740727901459, + "epoch": 0.9567438411469061, + "grad_norm": 0.6381793022155762, + "learning_rate": 1.2067833230212225e-05, + "loss": 1.4917, + "mean_token_accuracy": 0.6472151229778925, + "num_tokens": 1459903129.0, + "step": 8709 + }, + { + "entropy": 1.734858940045039, + "epoch": 0.9568536980582791, + "grad_norm": 0.7690825462341309, + "learning_rate": 1.2066243394293412e-05, + "loss": 1.4222, + "mean_token_accuracy": 0.6538131634394327, + "num_tokens": 1460058233.0, + "step": 8710 + }, + { + "entropy": 1.7033085723718007, + "epoch": 0.956963554969652, + "grad_norm": 0.6803534626960754, + "learning_rate": 1.2064653524628478e-05, + "loss": 1.3746, + "mean_token_accuracy": 0.6563105036815008, + "num_tokens": 1460257861.0, + "step": 8711 + }, + { + "entropy": 1.6802352865537007, + "epoch": 0.957073411881025, + "grad_norm": 0.6258231401443481, + "learning_rate": 1.2063063621267738e-05, + "loss": 1.3747, + "mean_token_accuracy": 0.6545542577902476, + "num_tokens": 1460441507.0, + "step": 8712 + }, + { + "entropy": 1.7096125185489655, + "epoch": 0.9571832687923979, + "grad_norm": 0.7180109024047852, + "learning_rate": 1.2061473684261513e-05, + "loss": 1.3745, + "mean_token_accuracy": 0.6694450577100118, + "num_tokens": 1460625014.0, + "step": 8713 + }, + { + "entropy": 1.6804834107557933, + "epoch": 0.9572931257037708, + "grad_norm": 0.8495198488235474, + "learning_rate": 1.2059883713660125e-05, + "loss": 1.3337, + "mean_token_accuracy": 0.656502236922582, + "num_tokens": 1460792252.0, + "step": 8714 + }, + { + "entropy": 1.6971332728862762, + "epoch": 0.9574029826151438, + "grad_norm": 0.6549186706542969, + "learning_rate": 1.2058293709513896e-05, + "loss": 1.406, + "mean_token_accuracy": 0.6549234290917715, + "num_tokens": 1460980358.0, + "step": 8715 + }, + { + "entropy": 1.6848892569541931, + "epoch": 0.9575128395265167, + "grad_norm": 0.6425775289535522, + "learning_rate": 1.2056703671873148e-05, + "loss": 1.3264, + "mean_token_accuracy": 0.6857404808203379, + "num_tokens": 1461152259.0, + "step": 8716 + }, + { + "entropy": 1.7091786166032155, + "epoch": 0.9576226964378897, + "grad_norm": 8.731441497802734, + "learning_rate": 1.2055113600788202e-05, + "loss": 1.2535, + "mean_token_accuracy": 0.6799486676851908, + "num_tokens": 1461321663.0, + "step": 8717 + }, + { + "entropy": 1.748598317305247, + "epoch": 0.9577325533492625, + "grad_norm": 0.6042277812957764, + "learning_rate": 1.205352349630939e-05, + "loss": 1.4481, + "mean_token_accuracy": 0.6515509237845739, + "num_tokens": 1461516392.0, + "step": 8718 + }, + { + "entropy": 1.786596695582072, + "epoch": 0.9578424102606355, + "grad_norm": 0.636871337890625, + "learning_rate": 1.2051933358487031e-05, + "loss": 1.6215, + "mean_token_accuracy": 0.6109706809123358, + "num_tokens": 1461727681.0, + "step": 8719 + }, + { + "entropy": 1.7093546092510223, + "epoch": 0.9579522671720084, + "grad_norm": 0.6685346364974976, + "learning_rate": 1.2050343187371457e-05, + "loss": 1.3936, + "mean_token_accuracy": 0.6527099361022314, + "num_tokens": 1461888422.0, + "step": 8720 + }, + { + "entropy": 1.7236407697200775, + "epoch": 0.9580621240833814, + "grad_norm": 0.750136137008667, + "learning_rate": 1.2048752983012992e-05, + "loss": 1.3902, + "mean_token_accuracy": 0.661915456255277, + "num_tokens": 1462058752.0, + "step": 8721 + }, + { + "entropy": 1.6986550291379292, + "epoch": 0.9581719809947543, + "grad_norm": 0.7931959629058838, + "learning_rate": 1.2047162745461974e-05, + "loss": 1.255, + "mean_token_accuracy": 0.6735956718524297, + "num_tokens": 1462196633.0, + "step": 8722 + }, + { + "entropy": 1.6549886465072632, + "epoch": 0.9582818379061273, + "grad_norm": 0.6466114521026611, + "learning_rate": 1.2045572474768718e-05, + "loss": 1.4337, + "mean_token_accuracy": 0.671045849720637, + "num_tokens": 1462409134.0, + "step": 8723 + }, + { + "entropy": 1.7527413566907246, + "epoch": 0.9583916948175002, + "grad_norm": 0.8396289348602295, + "learning_rate": 1.2043982170983568e-05, + "loss": 1.3088, + "mean_token_accuracy": 0.679228276014328, + "num_tokens": 1462541782.0, + "step": 8724 + }, + { + "entropy": 1.735308289527893, + "epoch": 0.9585015517288732, + "grad_norm": 0.6846469044685364, + "learning_rate": 1.2042391834156854e-05, + "loss": 1.4979, + "mean_token_accuracy": 0.652966578801473, + "num_tokens": 1462722456.0, + "step": 8725 + }, + { + "entropy": 1.6945312122503917, + "epoch": 0.9586114086402461, + "grad_norm": 0.7038013339042664, + "learning_rate": 1.2040801464338907e-05, + "loss": 1.3764, + "mean_token_accuracy": 0.6591214487950007, + "num_tokens": 1462916582.0, + "step": 8726 + }, + { + "entropy": 1.7100801467895508, + "epoch": 0.958721265551619, + "grad_norm": 0.7070258855819702, + "learning_rate": 1.2039211061580063e-05, + "loss": 1.5168, + "mean_token_accuracy": 0.6492930054664612, + "num_tokens": 1463100418.0, + "step": 8727 + }, + { + "entropy": 1.745457837978999, + "epoch": 0.958831122462992, + "grad_norm": 0.7946493625640869, + "learning_rate": 1.2037620625930659e-05, + "loss": 1.6822, + "mean_token_accuracy": 0.6400948514540991, + "num_tokens": 1463263777.0, + "step": 8728 + }, + { + "entropy": 1.762039452791214, + "epoch": 0.9589409793743648, + "grad_norm": 0.8285095691680908, + "learning_rate": 1.2036030157441026e-05, + "loss": 1.3384, + "mean_token_accuracy": 0.658534953991572, + "num_tokens": 1463407883.0, + "step": 8729 + }, + { + "entropy": 1.7134557962417603, + "epoch": 0.9590508362857378, + "grad_norm": 0.6590238809585571, + "learning_rate": 1.2034439656161509e-05, + "loss": 1.5242, + "mean_token_accuracy": 0.6427919020255407, + "num_tokens": 1463608060.0, + "step": 8730 + }, + { + "entropy": 1.6973025898138683, + "epoch": 0.9591606931971107, + "grad_norm": 0.6781467199325562, + "learning_rate": 1.203284912214244e-05, + "loss": 1.4759, + "mean_token_accuracy": 0.6487634430329005, + "num_tokens": 1463783712.0, + "step": 8731 + }, + { + "entropy": 1.676356424887975, + "epoch": 0.9592705501084837, + "grad_norm": 0.5379504561424255, + "learning_rate": 1.2031258555434164e-05, + "loss": 1.3997, + "mean_token_accuracy": 0.651170089840889, + "num_tokens": 1464018941.0, + "step": 8732 + }, + { + "entropy": 1.6823839048544567, + "epoch": 0.9593804070198566, + "grad_norm": 0.6782552003860474, + "learning_rate": 1.2029667956087017e-05, + "loss": 1.3481, + "mean_token_accuracy": 0.6624757548173269, + "num_tokens": 1464167232.0, + "step": 8733 + }, + { + "entropy": 1.7918393512566884, + "epoch": 0.9594902639312296, + "grad_norm": 0.7031469345092773, + "learning_rate": 1.2028077324151347e-05, + "loss": 1.4965, + "mean_token_accuracy": 0.6428997168938319, + "num_tokens": 1464340879.0, + "step": 8734 + }, + { + "entropy": 1.6191656390825908, + "epoch": 0.9596001208426025, + "grad_norm": 0.7140489816665649, + "learning_rate": 1.202648665967749e-05, + "loss": 1.1656, + "mean_token_accuracy": 0.688547745347023, + "num_tokens": 1464452058.0, + "step": 8735 + }, + { + "entropy": 1.6774865587552388, + "epoch": 0.9597099777539755, + "grad_norm": 0.6400611996650696, + "learning_rate": 1.2024895962715795e-05, + "loss": 1.476, + "mean_token_accuracy": 0.6562142173449198, + "num_tokens": 1464656449.0, + "step": 8736 + }, + { + "entropy": 1.7151943445205688, + "epoch": 0.9598198346653484, + "grad_norm": 0.7991637587547302, + "learning_rate": 1.2023305233316602e-05, + "loss": 1.3804, + "mean_token_accuracy": 0.6708137293656667, + "num_tokens": 1464806740.0, + "step": 8737 + }, + { + "entropy": 1.7741727034250896, + "epoch": 0.9599296915767214, + "grad_norm": 0.7432534694671631, + "learning_rate": 1.2021714471530262e-05, + "loss": 1.4837, + "mean_token_accuracy": 0.6382344514131546, + "num_tokens": 1464971963.0, + "step": 8738 + }, + { + "entropy": 1.701940377553304, + "epoch": 0.9600395484880943, + "grad_norm": 0.6169398427009583, + "learning_rate": 1.2020123677407113e-05, + "loss": 1.4066, + "mean_token_accuracy": 0.6589889874060949, + "num_tokens": 1465162928.0, + "step": 8739 + }, + { + "entropy": 1.6859253843625386, + "epoch": 0.9601494053994672, + "grad_norm": 0.7215724587440491, + "learning_rate": 1.2018532850997518e-05, + "loss": 1.3828, + "mean_token_accuracy": 0.6706574161847433, + "num_tokens": 1465300333.0, + "step": 8740 + }, + { + "entropy": 1.760906199614207, + "epoch": 0.9602592623108401, + "grad_norm": 0.6548580527305603, + "learning_rate": 1.2016941992351811e-05, + "loss": 1.5413, + "mean_token_accuracy": 0.6272419343392054, + "num_tokens": 1465507133.0, + "step": 8741 + }, + { + "entropy": 1.7261857688426971, + "epoch": 0.960369119222213, + "grad_norm": 0.6962506771087646, + "learning_rate": 1.2015351101520354e-05, + "loss": 1.4314, + "mean_token_accuracy": 0.6458855321009954, + "num_tokens": 1465673287.0, + "step": 8742 + }, + { + "entropy": 1.7670903007189434, + "epoch": 0.960478976133586, + "grad_norm": 0.6517634987831116, + "learning_rate": 1.2013760178553487e-05, + "loss": 1.2565, + "mean_token_accuracy": 0.6761279304822286, + "num_tokens": 1465809133.0, + "step": 8743 + }, + { + "entropy": 1.718291014432907, + "epoch": 0.9605888330449589, + "grad_norm": 0.7264907956123352, + "learning_rate": 1.2012169223501568e-05, + "loss": 1.3405, + "mean_token_accuracy": 0.6658484935760498, + "num_tokens": 1465965362.0, + "step": 8744 + }, + { + "entropy": 1.6515865127245586, + "epoch": 0.9606986899563319, + "grad_norm": 0.7858138680458069, + "learning_rate": 1.2010578236414949e-05, + "loss": 1.4631, + "mean_token_accuracy": 0.6571001460154852, + "num_tokens": 1466178520.0, + "step": 8745 + }, + { + "entropy": 1.6674973865350087, + "epoch": 0.9608085468677048, + "grad_norm": 0.6715067625045776, + "learning_rate": 1.2008987217343986e-05, + "loss": 1.3843, + "mean_token_accuracy": 0.6489299088716507, + "num_tokens": 1466362772.0, + "step": 8746 + }, + { + "entropy": 1.6680465439955394, + "epoch": 0.9609184037790778, + "grad_norm": 0.6776669025421143, + "learning_rate": 1.2007396166339035e-05, + "loss": 1.402, + "mean_token_accuracy": 0.649745578567187, + "num_tokens": 1466526197.0, + "step": 8747 + }, + { + "entropy": 1.6848007043202717, + "epoch": 0.9610282606904507, + "grad_norm": 0.654761016368866, + "learning_rate": 1.2005805083450443e-05, + "loss": 1.3582, + "mean_token_accuracy": 0.6620151400566101, + "num_tokens": 1466659274.0, + "step": 8748 + }, + { + "entropy": 1.6908719142278035, + "epoch": 0.9611381176018237, + "grad_norm": 0.7975518703460693, + "learning_rate": 1.2004213968728575e-05, + "loss": 1.3319, + "mean_token_accuracy": 0.6631641636292139, + "num_tokens": 1466793078.0, + "step": 8749 + }, + { + "entropy": 1.69454359014829, + "epoch": 0.9612479745131965, + "grad_norm": 0.6039302349090576, + "learning_rate": 1.200262282222379e-05, + "loss": 1.4186, + "mean_token_accuracy": 0.6558008641004562, + "num_tokens": 1466937877.0, + "step": 8750 + }, + { + "entropy": 1.7083939115206401, + "epoch": 0.9613578314245695, + "grad_norm": 0.6897109746932983, + "learning_rate": 1.200103164398644e-05, + "loss": 1.3845, + "mean_token_accuracy": 0.661809429526329, + "num_tokens": 1467157146.0, + "step": 8751 + }, + { + "entropy": 1.6259233554204304, + "epoch": 0.9614676883359424, + "grad_norm": 0.6461367607116699, + "learning_rate": 1.1999440434066896e-05, + "loss": 1.4192, + "mean_token_accuracy": 0.6577809949715933, + "num_tokens": 1467315246.0, + "step": 8752 + }, + { + "entropy": 1.7572091619173686, + "epoch": 0.9615775452473154, + "grad_norm": 0.8533002734184265, + "learning_rate": 1.199784919251551e-05, + "loss": 1.6754, + "mean_token_accuracy": 0.6236068258682886, + "num_tokens": 1467482267.0, + "step": 8753 + }, + { + "entropy": 1.770795226097107, + "epoch": 0.9616874021586883, + "grad_norm": 0.7685062885284424, + "learning_rate": 1.1996257919382646e-05, + "loss": 1.6123, + "mean_token_accuracy": 0.6360293204585711, + "num_tokens": 1467666169.0, + "step": 8754 + }, + { + "entropy": 1.7153681516647339, + "epoch": 0.9617972590700612, + "grad_norm": 0.695287823677063, + "learning_rate": 1.1994666614718667e-05, + "loss": 1.3786, + "mean_token_accuracy": 0.6639162302017212, + "num_tokens": 1467807005.0, + "step": 8755 + }, + { + "entropy": 1.7088010211785634, + "epoch": 0.9619071159814342, + "grad_norm": 0.701137900352478, + "learning_rate": 1.1993075278573938e-05, + "loss": 1.5254, + "mean_token_accuracy": 0.6290678034226099, + "num_tokens": 1468014033.0, + "step": 8756 + }, + { + "entropy": 1.6555834611256917, + "epoch": 0.9620169728928071, + "grad_norm": 0.6522089838981628, + "learning_rate": 1.1991483910998823e-05, + "loss": 1.6107, + "mean_token_accuracy": 0.6368126993378004, + "num_tokens": 1468194921.0, + "step": 8757 + }, + { + "entropy": 1.7156870265801747, + "epoch": 0.9621268298041801, + "grad_norm": 0.6632962226867676, + "learning_rate": 1.1989892512043693e-05, + "loss": 1.379, + "mean_token_accuracy": 0.6591578175624212, + "num_tokens": 1468353097.0, + "step": 8758 + }, + { + "entropy": 1.7234142522017162, + "epoch": 0.962236686715553, + "grad_norm": 0.6855953335762024, + "learning_rate": 1.1988301081758908e-05, + "loss": 1.4963, + "mean_token_accuracy": 0.6408663143714269, + "num_tokens": 1468565163.0, + "step": 8759 + }, + { + "entropy": 1.6909594734509785, + "epoch": 0.962346543626926, + "grad_norm": 0.6290444731712341, + "learning_rate": 1.1986709620194837e-05, + "loss": 1.3152, + "mean_token_accuracy": 0.6744314332803091, + "num_tokens": 1468717687.0, + "step": 8760 + }, + { + "entropy": 1.7224095662434895, + "epoch": 0.9624564005382988, + "grad_norm": 0.7756173014640808, + "learning_rate": 1.1985118127401854e-05, + "loss": 1.401, + "mean_token_accuracy": 0.6540177861849467, + "num_tokens": 1468867338.0, + "step": 8761 + }, + { + "entropy": 1.6869498590628307, + "epoch": 0.9625662574496718, + "grad_norm": 0.8160791397094727, + "learning_rate": 1.1983526603430328e-05, + "loss": 1.4635, + "mean_token_accuracy": 0.6533959607283274, + "num_tokens": 1469062599.0, + "step": 8762 + }, + { + "entropy": 1.7486900488535564, + "epoch": 0.9626761143610447, + "grad_norm": 0.6580154895782471, + "learning_rate": 1.1981935048330625e-05, + "loss": 1.3756, + "mean_token_accuracy": 0.648088201880455, + "num_tokens": 1469193764.0, + "step": 8763 + }, + { + "entropy": 1.7289330164591472, + "epoch": 0.9627859712724177, + "grad_norm": 0.8532882928848267, + "learning_rate": 1.1980343462153121e-05, + "loss": 1.2638, + "mean_token_accuracy": 0.6606898903846741, + "num_tokens": 1469364686.0, + "step": 8764 + }, + { + "entropy": 1.6669448614120483, + "epoch": 0.9628958281837906, + "grad_norm": 0.6776072978973389, + "learning_rate": 1.1978751844948188e-05, + "loss": 1.2815, + "mean_token_accuracy": 0.6700325111548106, + "num_tokens": 1469538171.0, + "step": 8765 + }, + { + "entropy": 1.7350335617860158, + "epoch": 0.9630056850951636, + "grad_norm": 0.6142683029174805, + "learning_rate": 1.1977160196766203e-05, + "loss": 1.4018, + "mean_token_accuracy": 0.6594241609176, + "num_tokens": 1469706808.0, + "step": 8766 + }, + { + "entropy": 1.6608028213183086, + "epoch": 0.9631155420065365, + "grad_norm": 0.6166122555732727, + "learning_rate": 1.1975568517657532e-05, + "loss": 1.5136, + "mean_token_accuracy": 0.6444362699985504, + "num_tokens": 1469920924.0, + "step": 8767 + }, + { + "entropy": 1.6671899060408275, + "epoch": 0.9632253989179094, + "grad_norm": 0.6786704659461975, + "learning_rate": 1.1973976807672563e-05, + "loss": 1.5595, + "mean_token_accuracy": 0.6488665342330933, + "num_tokens": 1470140541.0, + "step": 8768 + }, + { + "entropy": 1.6928558846314747, + "epoch": 0.9633352558292824, + "grad_norm": 0.6339307427406311, + "learning_rate": 1.1972385066861665e-05, + "loss": 1.3924, + "mean_token_accuracy": 0.6605489750703176, + "num_tokens": 1470311335.0, + "step": 8769 + }, + { + "entropy": 1.7137080430984497, + "epoch": 0.9634451127406553, + "grad_norm": 0.6629822850227356, + "learning_rate": 1.1970793295275216e-05, + "loss": 1.5686, + "mean_token_accuracy": 0.6393506328264872, + "num_tokens": 1470533352.0, + "step": 8770 + }, + { + "entropy": 1.731887976328532, + "epoch": 0.9635549696520282, + "grad_norm": 0.6983689665794373, + "learning_rate": 1.1969201492963599e-05, + "loss": 1.3827, + "mean_token_accuracy": 0.664002334078153, + "num_tokens": 1470669337.0, + "step": 8771 + }, + { + "entropy": 1.6695733070373535, + "epoch": 0.9636648265634011, + "grad_norm": 0.6725772023200989, + "learning_rate": 1.1967609659977188e-05, + "loss": 1.3551, + "mean_token_accuracy": 0.6574635605017344, + "num_tokens": 1470818033.0, + "step": 8772 + }, + { + "entropy": 1.688662052154541, + "epoch": 0.9637746834747741, + "grad_norm": 0.646634042263031, + "learning_rate": 1.1966017796366372e-05, + "loss": 1.5005, + "mean_token_accuracy": 0.652362714211146, + "num_tokens": 1470969505.0, + "step": 8773 + }, + { + "entropy": 1.7582306861877441, + "epoch": 0.963884540386147, + "grad_norm": 0.7543734908103943, + "learning_rate": 1.1964425902181526e-05, + "loss": 1.4083, + "mean_token_accuracy": 0.66182312866052, + "num_tokens": 1471138254.0, + "step": 8774 + }, + { + "entropy": 1.6540814240773518, + "epoch": 0.96399439729752, + "grad_norm": 0.7629412412643433, + "learning_rate": 1.1962833977473035e-05, + "loss": 1.4068, + "mean_token_accuracy": 0.6505512396494547, + "num_tokens": 1471313832.0, + "step": 8775 + }, + { + "entropy": 1.7056627968947093, + "epoch": 0.9641042542088929, + "grad_norm": 0.7697269320487976, + "learning_rate": 1.1961242022291281e-05, + "loss": 1.3822, + "mean_token_accuracy": 0.6581480453411738, + "num_tokens": 1471490485.0, + "step": 8776 + }, + { + "entropy": 1.7259169320265453, + "epoch": 0.9642141111202659, + "grad_norm": 0.6654592156410217, + "learning_rate": 1.1959650036686652e-05, + "loss": 1.3186, + "mean_token_accuracy": 0.659534772237142, + "num_tokens": 1471647049.0, + "step": 8777 + }, + { + "entropy": 1.7011475265026093, + "epoch": 0.9643239680316388, + "grad_norm": 0.7446539402008057, + "learning_rate": 1.195805802070953e-05, + "loss": 1.4798, + "mean_token_accuracy": 0.6635573208332062, + "num_tokens": 1471799236.0, + "step": 8778 + }, + { + "entropy": 1.7411305209000905, + "epoch": 0.9644338249430118, + "grad_norm": 0.6938983201980591, + "learning_rate": 1.1956465974410305e-05, + "loss": 1.3509, + "mean_token_accuracy": 0.6578657031059265, + "num_tokens": 1471915469.0, + "step": 8779 + }, + { + "entropy": 1.732979655265808, + "epoch": 0.9645436818543847, + "grad_norm": 0.7012693881988525, + "learning_rate": 1.1954873897839363e-05, + "loss": 1.2764, + "mean_token_accuracy": 0.6760233988364538, + "num_tokens": 1472040376.0, + "step": 8780 + }, + { + "entropy": 1.707295298576355, + "epoch": 0.9646535387657575, + "grad_norm": 0.7303659319877625, + "learning_rate": 1.1953281791047091e-05, + "loss": 1.413, + "mean_token_accuracy": 0.67343603571256, + "num_tokens": 1472152558.0, + "step": 8781 + }, + { + "entropy": 1.6815251310666401, + "epoch": 0.9647633956771305, + "grad_norm": 0.5562863945960999, + "learning_rate": 1.1951689654083883e-05, + "loss": 1.3436, + "mean_token_accuracy": 0.6479224115610123, + "num_tokens": 1472325563.0, + "step": 8782 + }, + { + "entropy": 1.6789535880088806, + "epoch": 0.9648732525885034, + "grad_norm": 0.7839949131011963, + "learning_rate": 1.195009748700012e-05, + "loss": 1.3481, + "mean_token_accuracy": 0.6642525096734365, + "num_tokens": 1472504702.0, + "step": 8783 + }, + { + "entropy": 1.715220332145691, + "epoch": 0.9649831094998764, + "grad_norm": 0.7262733578681946, + "learning_rate": 1.1948505289846205e-05, + "loss": 1.421, + "mean_token_accuracy": 0.6491985072692236, + "num_tokens": 1472646861.0, + "step": 8784 + }, + { + "entropy": 1.7468859950701396, + "epoch": 0.9650929664112493, + "grad_norm": 0.6974389553070068, + "learning_rate": 1.194691306267252e-05, + "loss": 1.4079, + "mean_token_accuracy": 0.6441583534081777, + "num_tokens": 1472806006.0, + "step": 8785 + }, + { + "entropy": 1.6872367163499196, + "epoch": 0.9652028233226223, + "grad_norm": 0.6552119851112366, + "learning_rate": 1.194532080552947e-05, + "loss": 1.4084, + "mean_token_accuracy": 0.6529111266136169, + "num_tokens": 1472994372.0, + "step": 8786 + }, + { + "entropy": 1.6701744496822357, + "epoch": 0.9653126802339952, + "grad_norm": 0.7151638269424438, + "learning_rate": 1.1943728518467441e-05, + "loss": 1.2341, + "mean_token_accuracy": 0.6837707708279291, + "num_tokens": 1473126393.0, + "step": 8787 + }, + { + "entropy": 1.7374973396460216, + "epoch": 0.9654225371453682, + "grad_norm": 0.619019091129303, + "learning_rate": 1.1942136201536827e-05, + "loss": 1.4518, + "mean_token_accuracy": 0.6381538957357407, + "num_tokens": 1473343494.0, + "step": 8788 + }, + { + "entropy": 1.6976648370424907, + "epoch": 0.9655323940567411, + "grad_norm": 0.7506789565086365, + "learning_rate": 1.1940543854788026e-05, + "loss": 1.2836, + "mean_token_accuracy": 0.67548568546772, + "num_tokens": 1473474533.0, + "step": 8789 + }, + { + "entropy": 1.698626885811488, + "epoch": 0.9656422509681141, + "grad_norm": 0.6703222990036011, + "learning_rate": 1.193895147827144e-05, + "loss": 1.3658, + "mean_token_accuracy": 0.6542994330326716, + "num_tokens": 1473651782.0, + "step": 8790 + }, + { + "entropy": 1.7644267777601879, + "epoch": 0.965752107879487, + "grad_norm": 0.6878910064697266, + "learning_rate": 1.1937359072037458e-05, + "loss": 1.5286, + "mean_token_accuracy": 0.6460073043902715, + "num_tokens": 1473864651.0, + "step": 8791 + }, + { + "entropy": 1.6762928366661072, + "epoch": 0.96586196479086, + "grad_norm": 0.7150443196296692, + "learning_rate": 1.1935766636136487e-05, + "loss": 1.3087, + "mean_token_accuracy": 0.6731075594822565, + "num_tokens": 1474006770.0, + "step": 8792 + }, + { + "entropy": 1.689261128505071, + "epoch": 0.9659718217022328, + "grad_norm": 0.6254614591598511, + "learning_rate": 1.1934174170618921e-05, + "loss": 1.4649, + "mean_token_accuracy": 0.6447116434574127, + "num_tokens": 1474190445.0, + "step": 8793 + }, + { + "entropy": 1.6668556829293568, + "epoch": 0.9660816786136058, + "grad_norm": 0.7564082741737366, + "learning_rate": 1.1932581675535167e-05, + "loss": 1.3167, + "mean_token_accuracy": 0.6619139909744263, + "num_tokens": 1474344961.0, + "step": 8794 + }, + { + "entropy": 1.7443881531556447, + "epoch": 0.9661915355249787, + "grad_norm": 0.7837411761283875, + "learning_rate": 1.193098915093562e-05, + "loss": 1.4101, + "mean_token_accuracy": 0.6615714579820633, + "num_tokens": 1474529968.0, + "step": 8795 + }, + { + "entropy": 1.7280802925427754, + "epoch": 0.9663013924363516, + "grad_norm": 0.8796549439430237, + "learning_rate": 1.1929396596870688e-05, + "loss": 1.5307, + "mean_token_accuracy": 0.6449608951807022, + "num_tokens": 1474654137.0, + "step": 8796 + }, + { + "entropy": 1.714556525150935, + "epoch": 0.9664112493477246, + "grad_norm": 0.6233653426170349, + "learning_rate": 1.1927804013390771e-05, + "loss": 1.292, + "mean_token_accuracy": 0.6737811714410782, + "num_tokens": 1474821466.0, + "step": 8797 + }, + { + "entropy": 1.7329127391179402, + "epoch": 0.9665211062590975, + "grad_norm": 0.6161532402038574, + "learning_rate": 1.1926211400546276e-05, + "loss": 1.341, + "mean_token_accuracy": 0.6685069849093755, + "num_tokens": 1474978261.0, + "step": 8798 + }, + { + "entropy": 1.730038086573283, + "epoch": 0.9666309631704705, + "grad_norm": 0.7773633003234863, + "learning_rate": 1.1924618758387607e-05, + "loss": 1.3345, + "mean_token_accuracy": 0.6633727848529816, + "num_tokens": 1475124005.0, + "step": 8799 + }, + { + "entropy": 1.7478882769743602, + "epoch": 0.9667408200818434, + "grad_norm": 0.6906635165214539, + "learning_rate": 1.1923026086965171e-05, + "loss": 1.4519, + "mean_token_accuracy": 0.651598185300827, + "num_tokens": 1475288346.0, + "step": 8800 + }, + { + "entropy": 1.749391367038091, + "epoch": 0.9668506769932164, + "grad_norm": 0.7163913249969482, + "learning_rate": 1.1921433386329375e-05, + "loss": 1.3796, + "mean_token_accuracy": 0.6630617678165436, + "num_tokens": 1475422303.0, + "step": 8801 + }, + { + "entropy": 1.6840445597966511, + "epoch": 0.9669605339045892, + "grad_norm": 0.763927161693573, + "learning_rate": 1.191984065653063e-05, + "loss": 1.4901, + "mean_token_accuracy": 0.6456265101830164, + "num_tokens": 1475617488.0, + "step": 8802 + }, + { + "entropy": 1.7242066264152527, + "epoch": 0.9670703908159622, + "grad_norm": 0.6778439283370972, + "learning_rate": 1.191824789761934e-05, + "loss": 1.2706, + "mean_token_accuracy": 0.6764429658651352, + "num_tokens": 1475727115.0, + "step": 8803 + }, + { + "entropy": 1.69703604777654, + "epoch": 0.9671802477273351, + "grad_norm": 0.6198210120201111, + "learning_rate": 1.1916655109645919e-05, + "loss": 1.2735, + "mean_token_accuracy": 0.6707077473402023, + "num_tokens": 1475872434.0, + "step": 8804 + }, + { + "entropy": 1.7205248872439067, + "epoch": 0.9672901046387081, + "grad_norm": 0.8538046479225159, + "learning_rate": 1.1915062292660774e-05, + "loss": 1.3696, + "mean_token_accuracy": 0.6752174297968546, + "num_tokens": 1476017139.0, + "step": 8805 + }, + { + "entropy": 1.656046062707901, + "epoch": 0.967399961550081, + "grad_norm": 0.7897881865501404, + "learning_rate": 1.1913469446714323e-05, + "loss": 1.266, + "mean_token_accuracy": 0.670650397737821, + "num_tokens": 1476142714.0, + "step": 8806 + }, + { + "entropy": 1.7231378157933552, + "epoch": 0.967509818461454, + "grad_norm": 0.6750791668891907, + "learning_rate": 1.1911876571856975e-05, + "loss": 1.2979, + "mean_token_accuracy": 0.6721773644288381, + "num_tokens": 1476278163.0, + "step": 8807 + }, + { + "entropy": 1.720835566520691, + "epoch": 0.9676196753728269, + "grad_norm": 0.780296266078949, + "learning_rate": 1.1910283668139147e-05, + "loss": 1.4398, + "mean_token_accuracy": 0.6561943292617798, + "num_tokens": 1476432943.0, + "step": 8808 + }, + { + "entropy": 1.6540588239828746, + "epoch": 0.9677295322841998, + "grad_norm": 0.6764923334121704, + "learning_rate": 1.1908690735611246e-05, + "loss": 1.2985, + "mean_token_accuracy": 0.6647897958755493, + "num_tokens": 1476568567.0, + "step": 8809 + }, + { + "entropy": 1.6742003659407299, + "epoch": 0.9678393891955728, + "grad_norm": 0.6901260018348694, + "learning_rate": 1.1907097774323693e-05, + "loss": 1.4862, + "mean_token_accuracy": 0.6387346585591634, + "num_tokens": 1476792318.0, + "step": 8810 + }, + { + "entropy": 1.673385351896286, + "epoch": 0.9679492461069457, + "grad_norm": 0.7130966782569885, + "learning_rate": 1.1905504784326907e-05, + "loss": 1.2702, + "mean_token_accuracy": 0.6630944808324178, + "num_tokens": 1476905292.0, + "step": 8811 + }, + { + "entropy": 1.6886204878489177, + "epoch": 0.9680591030183187, + "grad_norm": 0.7114747166633606, + "learning_rate": 1.19039117656713e-05, + "loss": 1.2468, + "mean_token_accuracy": 0.6765016714731852, + "num_tokens": 1477053287.0, + "step": 8812 + }, + { + "entropy": 1.708604981501897, + "epoch": 0.9681689599296915, + "grad_norm": 0.6665516495704651, + "learning_rate": 1.1902318718407295e-05, + "loss": 1.3936, + "mean_token_accuracy": 0.6632998138666153, + "num_tokens": 1477269370.0, + "step": 8813 + }, + { + "entropy": 1.7254629333813984, + "epoch": 0.9682788168410645, + "grad_norm": 0.8017979860305786, + "learning_rate": 1.190072564258531e-05, + "loss": 1.3117, + "mean_token_accuracy": 0.6620668768882751, + "num_tokens": 1477459504.0, + "step": 8814 + }, + { + "entropy": 1.709552268187205, + "epoch": 0.9683886737524374, + "grad_norm": 0.8370741009712219, + "learning_rate": 1.1899132538255764e-05, + "loss": 1.3385, + "mean_token_accuracy": 0.6709966957569122, + "num_tokens": 1477580950.0, + "step": 8815 + }, + { + "entropy": 1.75144029657046, + "epoch": 0.9684985306638104, + "grad_norm": 0.7077965140342712, + "learning_rate": 1.1897539405469079e-05, + "loss": 1.5018, + "mean_token_accuracy": 0.642456571261088, + "num_tokens": 1477766228.0, + "step": 8816 + }, + { + "entropy": 1.6804983814557393, + "epoch": 0.9686083875751833, + "grad_norm": 0.6244013905525208, + "learning_rate": 1.189594624427567e-05, + "loss": 1.4332, + "mean_token_accuracy": 0.6555044750372568, + "num_tokens": 1477975630.0, + "step": 8817 + }, + { + "entropy": 1.700407882531484, + "epoch": 0.9687182444865563, + "grad_norm": 0.5928204655647278, + "learning_rate": 1.1894353054725976e-05, + "loss": 1.3642, + "mean_token_accuracy": 0.6752937485774358, + "num_tokens": 1478182533.0, + "step": 8818 + }, + { + "entropy": 1.7318575084209442, + "epoch": 0.9688281013979292, + "grad_norm": 0.7394362092018127, + "learning_rate": 1.1892759836870402e-05, + "loss": 1.4435, + "mean_token_accuracy": 0.6566926191250483, + "num_tokens": 1478342831.0, + "step": 8819 + }, + { + "entropy": 1.7132834196090698, + "epoch": 0.9689379583093022, + "grad_norm": 0.6716835498809814, + "learning_rate": 1.1891166590759386e-05, + "loss": 1.4787, + "mean_token_accuracy": 0.6364479611317316, + "num_tokens": 1478555411.0, + "step": 8820 + }, + { + "entropy": 1.66799263159434, + "epoch": 0.9690478152206751, + "grad_norm": 0.6324994564056396, + "learning_rate": 1.1889573316443349e-05, + "loss": 1.4213, + "mean_token_accuracy": 0.6493504792451859, + "num_tokens": 1478775775.0, + "step": 8821 + }, + { + "entropy": 1.7549628714720409, + "epoch": 0.969157672132048, + "grad_norm": 0.6879369616508484, + "learning_rate": 1.1887980013972715e-05, + "loss": 1.4407, + "mean_token_accuracy": 0.6554758697748184, + "num_tokens": 1478969172.0, + "step": 8822 + }, + { + "entropy": 1.626541276772817, + "epoch": 0.969267529043421, + "grad_norm": 0.7112913727760315, + "learning_rate": 1.1886386683397917e-05, + "loss": 1.4272, + "mean_token_accuracy": 0.6542681405941645, + "num_tokens": 1479172290.0, + "step": 8823 + }, + { + "entropy": 1.7104832927385967, + "epoch": 0.9693773859547938, + "grad_norm": 0.7072854042053223, + "learning_rate": 1.1884793324769379e-05, + "loss": 1.3244, + "mean_token_accuracy": 0.6610027849674225, + "num_tokens": 1479371313.0, + "step": 8824 + }, + { + "entropy": 1.621906081835429, + "epoch": 0.9694872428661668, + "grad_norm": 0.6670812368392944, + "learning_rate": 1.1883199938137528e-05, + "loss": 1.4952, + "mean_token_accuracy": 0.6568761318922043, + "num_tokens": 1479538947.0, + "step": 8825 + }, + { + "entropy": 1.6460838218530018, + "epoch": 0.9695970997775397, + "grad_norm": 0.6370675563812256, + "learning_rate": 1.18816065235528e-05, + "loss": 1.2052, + "mean_token_accuracy": 0.6849732001622518, + "num_tokens": 1479663363.0, + "step": 8826 + }, + { + "entropy": 1.7442928353945415, + "epoch": 0.9697069566889127, + "grad_norm": 0.7854894995689392, + "learning_rate": 1.188001308106562e-05, + "loss": 1.2911, + "mean_token_accuracy": 0.6643334925174713, + "num_tokens": 1479797528.0, + "step": 8827 + }, + { + "entropy": 1.7473202149073284, + "epoch": 0.9698168136002856, + "grad_norm": 0.7597824931144714, + "learning_rate": 1.1878419610726423e-05, + "loss": 1.3298, + "mean_token_accuracy": 0.662042036652565, + "num_tokens": 1479958379.0, + "step": 8828 + }, + { + "entropy": 1.729845941066742, + "epoch": 0.9699266705116586, + "grad_norm": 0.7279675602912903, + "learning_rate": 1.1876826112585645e-05, + "loss": 1.5532, + "mean_token_accuracy": 0.6339255919059118, + "num_tokens": 1480118370.0, + "step": 8829 + }, + { + "entropy": 1.6920412480831146, + "epoch": 0.9700365274230315, + "grad_norm": 0.6832351684570312, + "learning_rate": 1.1875232586693712e-05, + "loss": 1.4446, + "mean_token_accuracy": 0.653128465016683, + "num_tokens": 1480289535.0, + "step": 8830 + }, + { + "entropy": 1.634282539288203, + "epoch": 0.9701463843344045, + "grad_norm": 0.7434066534042358, + "learning_rate": 1.1873639033101066e-05, + "loss": 1.2848, + "mean_token_accuracy": 0.6711482803026835, + "num_tokens": 1480427662.0, + "step": 8831 + }, + { + "entropy": 1.7319086988766987, + "epoch": 0.9702562412457774, + "grad_norm": 0.6085994839668274, + "learning_rate": 1.1872045451858132e-05, + "loss": 1.3227, + "mean_token_accuracy": 0.6564686596393585, + "num_tokens": 1480575924.0, + "step": 8832 + }, + { + "entropy": 1.6918261349201202, + "epoch": 0.9703660981571504, + "grad_norm": 0.6591848731040955, + "learning_rate": 1.1870451843015357e-05, + "loss": 1.2071, + "mean_token_accuracy": 0.6775870273510615, + "num_tokens": 1480693963.0, + "step": 8833 + }, + { + "entropy": 1.6758748193581898, + "epoch": 0.9704759550685232, + "grad_norm": 0.7486820816993713, + "learning_rate": 1.186885820662317e-05, + "loss": 1.3314, + "mean_token_accuracy": 0.6618000318606695, + "num_tokens": 1480855735.0, + "step": 8834 + }, + { + "entropy": 1.7517311076323192, + "epoch": 0.9705858119798962, + "grad_norm": 0.6901195049285889, + "learning_rate": 1.1867264542732013e-05, + "loss": 1.4552, + "mean_token_accuracy": 0.6589073191086451, + "num_tokens": 1480986317.0, + "step": 8835 + }, + { + "entropy": 1.6750148733456929, + "epoch": 0.9706956688912691, + "grad_norm": 0.6650272011756897, + "learning_rate": 1.186567085139233e-05, + "loss": 1.3836, + "mean_token_accuracy": 0.6477328389883041, + "num_tokens": 1481124442.0, + "step": 8836 + }, + { + "entropy": 1.6886054178078969, + "epoch": 0.970805525802642, + "grad_norm": 0.7245535850524902, + "learning_rate": 1.1864077132654547e-05, + "loss": 1.246, + "mean_token_accuracy": 0.6800411989291509, + "num_tokens": 1481244157.0, + "step": 8837 + }, + { + "entropy": 1.755658229192098, + "epoch": 0.970915382714015, + "grad_norm": 0.6730127334594727, + "learning_rate": 1.1862483386569116e-05, + "loss": 1.3982, + "mean_token_accuracy": 0.64553735156854, + "num_tokens": 1481399744.0, + "step": 8838 + }, + { + "entropy": 1.726265827814738, + "epoch": 0.9710252396253879, + "grad_norm": 0.6427361369132996, + "learning_rate": 1.1860889613186473e-05, + "loss": 1.449, + "mean_token_accuracy": 0.6438863674799601, + "num_tokens": 1481578751.0, + "step": 8839 + }, + { + "entropy": 1.7021092474460602, + "epoch": 0.9711350965367609, + "grad_norm": 0.6495327949523926, + "learning_rate": 1.1859295812557063e-05, + "loss": 1.3721, + "mean_token_accuracy": 0.6563242822885513, + "num_tokens": 1481747669.0, + "step": 8840 + }, + { + "entropy": 1.7147242824236553, + "epoch": 0.9712449534481338, + "grad_norm": 0.6548023819923401, + "learning_rate": 1.1857701984731327e-05, + "loss": 1.5216, + "mean_token_accuracy": 0.6520404716332754, + "num_tokens": 1481960498.0, + "step": 8841 + }, + { + "entropy": 1.7369143664836884, + "epoch": 0.9713548103595068, + "grad_norm": 1.022463083267212, + "learning_rate": 1.185610812975971e-05, + "loss": 0.9632, + "mean_token_accuracy": 0.6833280275265375, + "num_tokens": 1482092594.0, + "step": 8842 + }, + { + "entropy": 1.693136473496755, + "epoch": 0.9714646672708797, + "grad_norm": 0.7875451445579529, + "learning_rate": 1.1854514247692654e-05, + "loss": 1.4815, + "mean_token_accuracy": 0.6493118107318878, + "num_tokens": 1482278952.0, + "step": 8843 + }, + { + "entropy": 1.6291471024354298, + "epoch": 0.9715745241822527, + "grad_norm": 0.5732967257499695, + "learning_rate": 1.1852920338580612e-05, + "loss": 1.3202, + "mean_token_accuracy": 0.6629867007335027, + "num_tokens": 1482496222.0, + "step": 8844 + }, + { + "entropy": 1.762572060028712, + "epoch": 0.9716843810936255, + "grad_norm": 0.7325165867805481, + "learning_rate": 1.1851326402474021e-05, + "loss": 1.4403, + "mean_token_accuracy": 0.647186944882075, + "num_tokens": 1482638902.0, + "step": 8845 + }, + { + "entropy": 1.7113976279894512, + "epoch": 0.9717942380049985, + "grad_norm": 0.5798704028129578, + "learning_rate": 1.1849732439423336e-05, + "loss": 1.2823, + "mean_token_accuracy": 0.6795357465744019, + "num_tokens": 1482850659.0, + "step": 8846 + }, + { + "entropy": 1.6824340323607128, + "epoch": 0.9719040949163714, + "grad_norm": 0.6129104495048523, + "learning_rate": 1.1848138449479e-05, + "loss": 1.3034, + "mean_token_accuracy": 0.6713794569174448, + "num_tokens": 1482991261.0, + "step": 8847 + }, + { + "entropy": 1.7262922724088032, + "epoch": 0.9720139518277444, + "grad_norm": 0.6804250478744507, + "learning_rate": 1.1846544432691466e-05, + "loss": 1.4445, + "mean_token_accuracy": 0.6579902023077011, + "num_tokens": 1483175630.0, + "step": 8848 + }, + { + "entropy": 1.6732697486877441, + "epoch": 0.9721238087391173, + "grad_norm": 0.8014521598815918, + "learning_rate": 1.1844950389111182e-05, + "loss": 1.4432, + "mean_token_accuracy": 0.6527867317199707, + "num_tokens": 1483337929.0, + "step": 8849 + }, + { + "entropy": 1.7497017979621887, + "epoch": 0.9722336656504902, + "grad_norm": 0.7554667592048645, + "learning_rate": 1.1843356318788597e-05, + "loss": 1.5084, + "mean_token_accuracy": 0.6426639705896378, + "num_tokens": 1483472141.0, + "step": 8850 + }, + { + "entropy": 1.6782042880853016, + "epoch": 0.9723435225618632, + "grad_norm": 0.6667650938034058, + "learning_rate": 1.1841762221774166e-05, + "loss": 1.3261, + "mean_token_accuracy": 0.6682502627372742, + "num_tokens": 1483655083.0, + "step": 8851 + }, + { + "entropy": 1.7021657625834148, + "epoch": 0.9724533794732361, + "grad_norm": 0.7699475288391113, + "learning_rate": 1.1840168098118341e-05, + "loss": 1.2373, + "mean_token_accuracy": 0.6808784703413645, + "num_tokens": 1483770294.0, + "step": 8852 + }, + { + "entropy": 1.6511710683504741, + "epoch": 0.9725632363846091, + "grad_norm": 0.6833035349845886, + "learning_rate": 1.1838573947871572e-05, + "loss": 1.3962, + "mean_token_accuracy": 0.6551846663157145, + "num_tokens": 1484013632.0, + "step": 8853 + }, + { + "entropy": 1.6699829399585724, + "epoch": 0.972673093295982, + "grad_norm": 0.6805309057235718, + "learning_rate": 1.1836979771084319e-05, + "loss": 1.4132, + "mean_token_accuracy": 0.6588405172030131, + "num_tokens": 1484203655.0, + "step": 8854 + }, + { + "entropy": 1.7369339366753895, + "epoch": 0.972782950207355, + "grad_norm": 0.7582236528396606, + "learning_rate": 1.183538556780703e-05, + "loss": 1.3637, + "mean_token_accuracy": 0.6499685148398081, + "num_tokens": 1484339577.0, + "step": 8855 + }, + { + "entropy": 1.6894344786802928, + "epoch": 0.9728928071187278, + "grad_norm": 0.6298988461494446, + "learning_rate": 1.1833791338090164e-05, + "loss": 1.4281, + "mean_token_accuracy": 0.6555620779593786, + "num_tokens": 1484533014.0, + "step": 8856 + }, + { + "entropy": 1.738153209288915, + "epoch": 0.9730026640301008, + "grad_norm": 0.8281303644180298, + "learning_rate": 1.1832197081984178e-05, + "loss": 1.4179, + "mean_token_accuracy": 0.6543222516775131, + "num_tokens": 1484703160.0, + "step": 8857 + }, + { + "entropy": 1.7207493782043457, + "epoch": 0.9731125209414737, + "grad_norm": 0.6813370585441589, + "learning_rate": 1.1830602799539532e-05, + "loss": 1.3754, + "mean_token_accuracy": 0.6627134084701538, + "num_tokens": 1484840728.0, + "step": 8858 + }, + { + "entropy": 1.6518064538637798, + "epoch": 0.9732223778528467, + "grad_norm": 0.6134771108627319, + "learning_rate": 1.1829008490806682e-05, + "loss": 1.3939, + "mean_token_accuracy": 0.6519312014182409, + "num_tokens": 1485038123.0, + "step": 8859 + }, + { + "entropy": 1.7453482647736867, + "epoch": 0.9733322347642196, + "grad_norm": 0.6196027398109436, + "learning_rate": 1.1827414155836083e-05, + "loss": 1.3782, + "mean_token_accuracy": 0.6594426184892654, + "num_tokens": 1485288940.0, + "step": 8860 + }, + { + "entropy": 1.6767084399859111, + "epoch": 0.9734420916755926, + "grad_norm": 0.7031689286231995, + "learning_rate": 1.1825819794678201e-05, + "loss": 1.3408, + "mean_token_accuracy": 0.6554951965808868, + "num_tokens": 1485453230.0, + "step": 8861 + }, + { + "entropy": 1.7447557151317596, + "epoch": 0.9735519485869655, + "grad_norm": 0.658467710018158, + "learning_rate": 1.1824225407383494e-05, + "loss": 1.4039, + "mean_token_accuracy": 0.6553630034128824, + "num_tokens": 1485639366.0, + "step": 8862 + }, + { + "entropy": 1.7522308230400085, + "epoch": 0.9736618054983384, + "grad_norm": 0.6083486080169678, + "learning_rate": 1.1822630994002425e-05, + "loss": 1.4194, + "mean_token_accuracy": 0.6455821990966797, + "num_tokens": 1485812666.0, + "step": 8863 + }, + { + "entropy": 1.6696155369281769, + "epoch": 0.9737716624097114, + "grad_norm": 0.6207438111305237, + "learning_rate": 1.1821036554585457e-05, + "loss": 1.4398, + "mean_token_accuracy": 0.6514710088570913, + "num_tokens": 1485967627.0, + "step": 8864 + }, + { + "entropy": 1.7358433306217194, + "epoch": 0.9738815193210842, + "grad_norm": 0.8131921291351318, + "learning_rate": 1.1819442089183051e-05, + "loss": 1.4363, + "mean_token_accuracy": 0.6631392339865366, + "num_tokens": 1486133922.0, + "step": 8865 + }, + { + "entropy": 1.711951583623886, + "epoch": 0.9739913762324572, + "grad_norm": 0.7035873532295227, + "learning_rate": 1.181784759784567e-05, + "loss": 1.403, + "mean_token_accuracy": 0.6459818432728449, + "num_tokens": 1486288914.0, + "step": 8866 + }, + { + "entropy": 1.7438312371571858, + "epoch": 0.9741012331438301, + "grad_norm": 0.810245156288147, + "learning_rate": 1.1816253080623783e-05, + "loss": 1.2218, + "mean_token_accuracy": 0.6892569859822592, + "num_tokens": 1486416868.0, + "step": 8867 + }, + { + "entropy": 1.7063380181789398, + "epoch": 0.9742110900552031, + "grad_norm": 0.7329069375991821, + "learning_rate": 1.1814658537567851e-05, + "loss": 1.3848, + "mean_token_accuracy": 0.6666723837455114, + "num_tokens": 1486561189.0, + "step": 8868 + }, + { + "entropy": 1.707608977953593, + "epoch": 0.974320946966576, + "grad_norm": 0.8022003173828125, + "learning_rate": 1.1813063968728347e-05, + "loss": 1.3259, + "mean_token_accuracy": 0.6613700141509374, + "num_tokens": 1486705794.0, + "step": 8869 + }, + { + "entropy": 1.7382404804229736, + "epoch": 0.974430803877949, + "grad_norm": 0.8011891841888428, + "learning_rate": 1.1811469374155736e-05, + "loss": 1.2486, + "mean_token_accuracy": 0.6719160179297129, + "num_tokens": 1486836957.0, + "step": 8870 + }, + { + "entropy": 1.7135269542535145, + "epoch": 0.9745406607893219, + "grad_norm": 0.7246976494789124, + "learning_rate": 1.1809874753900481e-05, + "loss": 1.4138, + "mean_token_accuracy": 0.6471135467290878, + "num_tokens": 1487021960.0, + "step": 8871 + }, + { + "entropy": 1.7171033422152202, + "epoch": 0.9746505177006949, + "grad_norm": 0.7895018458366394, + "learning_rate": 1.1808280108013056e-05, + "loss": 1.4596, + "mean_token_accuracy": 0.6534997771183649, + "num_tokens": 1487169618.0, + "step": 8872 + }, + { + "entropy": 1.6360890467961628, + "epoch": 0.9747603746120678, + "grad_norm": 0.638076901435852, + "learning_rate": 1.1806685436543929e-05, + "loss": 1.3767, + "mean_token_accuracy": 0.6550846695899963, + "num_tokens": 1487335153.0, + "step": 8873 + }, + { + "entropy": 1.7775618036588032, + "epoch": 0.9748702315234408, + "grad_norm": 0.7299431562423706, + "learning_rate": 1.1805090739543574e-05, + "loss": 1.4693, + "mean_token_accuracy": 0.6526975681384405, + "num_tokens": 1487516942.0, + "step": 8874 + }, + { + "entropy": 1.717639684677124, + "epoch": 0.9749800884348137, + "grad_norm": 0.6354700326919556, + "learning_rate": 1.1803496017062458e-05, + "loss": 1.3929, + "mean_token_accuracy": 0.6608896454175314, + "num_tokens": 1487720150.0, + "step": 8875 + }, + { + "entropy": 1.7499834895133972, + "epoch": 0.9750899453461865, + "grad_norm": 0.6952952742576599, + "learning_rate": 1.1801901269151057e-05, + "loss": 1.5107, + "mean_token_accuracy": 0.6322333912054697, + "num_tokens": 1487928726.0, + "step": 8876 + }, + { + "entropy": 1.7447476585706074, + "epoch": 0.9751998022575595, + "grad_norm": 0.7346185445785522, + "learning_rate": 1.180030649585984e-05, + "loss": 1.3801, + "mean_token_accuracy": 0.6633228411277136, + "num_tokens": 1488088706.0, + "step": 8877 + }, + { + "entropy": 1.7237921754519145, + "epoch": 0.9753096591689324, + "grad_norm": 0.6742929816246033, + "learning_rate": 1.1798711697239281e-05, + "loss": 1.542, + "mean_token_accuracy": 0.6433573961257935, + "num_tokens": 1488277908.0, + "step": 8878 + }, + { + "entropy": 1.7318544387817383, + "epoch": 0.9754195160803054, + "grad_norm": 0.7747882604598999, + "learning_rate": 1.1797116873339862e-05, + "loss": 1.449, + "mean_token_accuracy": 0.6521434336900711, + "num_tokens": 1488421687.0, + "step": 8879 + }, + { + "entropy": 1.7487476070721943, + "epoch": 0.9755293729916783, + "grad_norm": 0.67233806848526, + "learning_rate": 1.1795522024212052e-05, + "loss": 1.4115, + "mean_token_accuracy": 0.6510422080755234, + "num_tokens": 1488578364.0, + "step": 8880 + }, + { + "entropy": 1.7285469969113667, + "epoch": 0.9756392299030513, + "grad_norm": 0.714249312877655, + "learning_rate": 1.1793927149906329e-05, + "loss": 1.5166, + "mean_token_accuracy": 0.6440831869840622, + "num_tokens": 1488760116.0, + "step": 8881 + }, + { + "entropy": 1.7103685835997264, + "epoch": 0.9757490868144242, + "grad_norm": 0.6406890749931335, + "learning_rate": 1.1792332250473167e-05, + "loss": 1.3169, + "mean_token_accuracy": 0.6635753909746805, + "num_tokens": 1488954184.0, + "step": 8882 + }, + { + "entropy": 1.7467718720436096, + "epoch": 0.9758589437257972, + "grad_norm": 0.713141918182373, + "learning_rate": 1.1790737325963047e-05, + "loss": 1.4421, + "mean_token_accuracy": 0.6507671574751536, + "num_tokens": 1489118857.0, + "step": 8883 + }, + { + "entropy": 1.7769187291463215, + "epoch": 0.9759688006371701, + "grad_norm": 0.7039201855659485, + "learning_rate": 1.1789142376426446e-05, + "loss": 1.4417, + "mean_token_accuracy": 0.6531537423531214, + "num_tokens": 1489241521.0, + "step": 8884 + }, + { + "entropy": 1.6950764159361522, + "epoch": 0.9760786575485431, + "grad_norm": 0.623028039932251, + "learning_rate": 1.1787547401913844e-05, + "loss": 1.4874, + "mean_token_accuracy": 0.6512324412663778, + "num_tokens": 1489416640.0, + "step": 8885 + }, + { + "entropy": 1.739354799191157, + "epoch": 0.976188514459916, + "grad_norm": 0.6581858396530151, + "learning_rate": 1.1785952402475722e-05, + "loss": 1.4426, + "mean_token_accuracy": 0.6493807832400004, + "num_tokens": 1489591572.0, + "step": 8886 + }, + { + "entropy": 1.7338965435822804, + "epoch": 0.9762983713712889, + "grad_norm": 0.7526776790618896, + "learning_rate": 1.1784357378162563e-05, + "loss": 1.4659, + "mean_token_accuracy": 0.6593673924605051, + "num_tokens": 1489730151.0, + "step": 8887 + }, + { + "entropy": 1.7149750391642253, + "epoch": 0.9764082282826618, + "grad_norm": 0.6592543125152588, + "learning_rate": 1.1782762329024844e-05, + "loss": 1.4636, + "mean_token_accuracy": 0.6369660447041193, + "num_tokens": 1489951203.0, + "step": 8888 + }, + { + "entropy": 1.6767615675926208, + "epoch": 0.9765180851940348, + "grad_norm": 0.6977933049201965, + "learning_rate": 1.178116725511305e-05, + "loss": 1.3287, + "mean_token_accuracy": 0.6671111832062403, + "num_tokens": 1490090010.0, + "step": 8889 + }, + { + "entropy": 1.7671857078870137, + "epoch": 0.9766279421054077, + "grad_norm": 0.6696137189865112, + "learning_rate": 1.1779572156477668e-05, + "loss": 1.4625, + "mean_token_accuracy": 0.6420988490184149, + "num_tokens": 1490251825.0, + "step": 8890 + }, + { + "entropy": 1.7096356054147084, + "epoch": 0.9767377990167806, + "grad_norm": 0.7088844776153564, + "learning_rate": 1.1777977033169172e-05, + "loss": 1.4098, + "mean_token_accuracy": 0.6524170140425364, + "num_tokens": 1490465675.0, + "step": 8891 + }, + { + "entropy": 1.6997392972310383, + "epoch": 0.9768476559281536, + "grad_norm": 0.656985342502594, + "learning_rate": 1.1776381885238061e-05, + "loss": 1.2332, + "mean_token_accuracy": 0.6762174765268961, + "num_tokens": 1490560957.0, + "step": 8892 + }, + { + "entropy": 1.7471089363098145, + "epoch": 0.9769575128395265, + "grad_norm": 0.7269816994667053, + "learning_rate": 1.1774786712734809e-05, + "loss": 1.4095, + "mean_token_accuracy": 0.6508002032836279, + "num_tokens": 1490726338.0, + "step": 8893 + }, + { + "entropy": 1.7028346260388691, + "epoch": 0.9770673697508995, + "grad_norm": 0.6202099919319153, + "learning_rate": 1.1773191515709906e-05, + "loss": 1.4608, + "mean_token_accuracy": 0.6557352344195048, + "num_tokens": 1490880225.0, + "step": 8894 + }, + { + "entropy": 1.7463387648264568, + "epoch": 0.9771772266622724, + "grad_norm": 0.7726168632507324, + "learning_rate": 1.1771596294213843e-05, + "loss": 1.3347, + "mean_token_accuracy": 0.6617359022299448, + "num_tokens": 1490998821.0, + "step": 8895 + }, + { + "entropy": 1.7535992066065471, + "epoch": 0.9772870835736454, + "grad_norm": 0.7872083187103271, + "learning_rate": 1.1770001048297102e-05, + "loss": 1.4472, + "mean_token_accuracy": 0.6451671719551086, + "num_tokens": 1491127949.0, + "step": 8896 + }, + { + "entropy": 1.6302488346894581, + "epoch": 0.9773969404850182, + "grad_norm": 0.6466932892799377, + "learning_rate": 1.1768405778010175e-05, + "loss": 1.5216, + "mean_token_accuracy": 0.6445588419834772, + "num_tokens": 1491366456.0, + "step": 8897 + }, + { + "entropy": 1.703003813823064, + "epoch": 0.9775067973963912, + "grad_norm": 0.6098012328147888, + "learning_rate": 1.1766810483403554e-05, + "loss": 1.3419, + "mean_token_accuracy": 0.6562597801287969, + "num_tokens": 1491508523.0, + "step": 8898 + }, + { + "entropy": 1.7005487581094105, + "epoch": 0.9776166543077641, + "grad_norm": 0.6172084212303162, + "learning_rate": 1.1765215164527724e-05, + "loss": 1.3601, + "mean_token_accuracy": 0.6567008445660273, + "num_tokens": 1491676475.0, + "step": 8899 + }, + { + "entropy": 1.6621931393941243, + "epoch": 0.9777265112191371, + "grad_norm": 0.6719179153442383, + "learning_rate": 1.176361982143318e-05, + "loss": 1.4336, + "mean_token_accuracy": 0.6611567785342535, + "num_tokens": 1491840435.0, + "step": 8900 + }, + { + "entropy": 1.6821343700091045, + "epoch": 0.97783636813051, + "grad_norm": 0.7708391547203064, + "learning_rate": 1.176202445417041e-05, + "loss": 1.2368, + "mean_token_accuracy": 0.6823245485623678, + "num_tokens": 1491954495.0, + "step": 8901 + }, + { + "entropy": 1.7113755146662395, + "epoch": 0.977946225041883, + "grad_norm": 0.7206063866615295, + "learning_rate": 1.1760429062789913e-05, + "loss": 1.3775, + "mean_token_accuracy": 0.6614246865113577, + "num_tokens": 1492099534.0, + "step": 8902 + }, + { + "entropy": 1.7529734373092651, + "epoch": 0.9780560819532559, + "grad_norm": 0.699995219707489, + "learning_rate": 1.1758833647342176e-05, + "loss": 1.4319, + "mean_token_accuracy": 0.6588613192240397, + "num_tokens": 1492259682.0, + "step": 8903 + }, + { + "entropy": 1.7423115372657776, + "epoch": 0.9781659388646288, + "grad_norm": 0.6755629777908325, + "learning_rate": 1.1757238207877702e-05, + "loss": 1.4738, + "mean_token_accuracy": 0.6427132934331894, + "num_tokens": 1492394241.0, + "step": 8904 + }, + { + "entropy": 1.6748673518498738, + "epoch": 0.9782757957760018, + "grad_norm": 0.6021566987037659, + "learning_rate": 1.1755642744446976e-05, + "loss": 1.4454, + "mean_token_accuracy": 0.6756737381219864, + "num_tokens": 1492593369.0, + "step": 8905 + }, + { + "entropy": 1.688041518131892, + "epoch": 0.9783856526873747, + "grad_norm": 0.6667623519897461, + "learning_rate": 1.1754047257100496e-05, + "loss": 1.403, + "mean_token_accuracy": 0.6555336664120356, + "num_tokens": 1492797945.0, + "step": 8906 + }, + { + "entropy": 1.7394044895966847, + "epoch": 0.9784955095987476, + "grad_norm": 0.6365262269973755, + "learning_rate": 1.175245174588876e-05, + "loss": 0.9444, + "mean_token_accuracy": 0.6888117839892706, + "num_tokens": 1492929891.0, + "step": 8907 + }, + { + "entropy": 1.7516534825166066, + "epoch": 0.9786053665101205, + "grad_norm": 0.6223315596580505, + "learning_rate": 1.1750856210862267e-05, + "loss": 1.3887, + "mean_token_accuracy": 0.6591986964146296, + "num_tokens": 1493095539.0, + "step": 8908 + }, + { + "entropy": 1.7105275094509125, + "epoch": 0.9787152234214935, + "grad_norm": 0.8252950310707092, + "learning_rate": 1.1749260652071513e-05, + "loss": 1.2241, + "mean_token_accuracy": 0.6796365777651469, + "num_tokens": 1493206041.0, + "step": 8909 + }, + { + "entropy": 1.7503976225852966, + "epoch": 0.9788250803328664, + "grad_norm": 0.7552759051322937, + "learning_rate": 1.1747665069566998e-05, + "loss": 1.433, + "mean_token_accuracy": 0.652017816901207, + "num_tokens": 1493376417.0, + "step": 8910 + }, + { + "entropy": 1.7101737360159557, + "epoch": 0.9789349372442394, + "grad_norm": 0.636199414730072, + "learning_rate": 1.174606946339922e-05, + "loss": 1.363, + "mean_token_accuracy": 0.6580263326565424, + "num_tokens": 1493515476.0, + "step": 8911 + }, + { + "entropy": 1.7023467222849529, + "epoch": 0.9790447941556123, + "grad_norm": 0.6993868947029114, + "learning_rate": 1.174447383361868e-05, + "loss": 1.4091, + "mean_token_accuracy": 0.6553865273793539, + "num_tokens": 1493688665.0, + "step": 8912 + }, + { + "entropy": 1.689314067363739, + "epoch": 0.9791546510669853, + "grad_norm": 0.602799117565155, + "learning_rate": 1.1742878180275876e-05, + "loss": 1.3516, + "mean_token_accuracy": 0.6648583362499872, + "num_tokens": 1493896130.0, + "step": 8913 + }, + { + "entropy": 1.6816753149032593, + "epoch": 0.9792645079783582, + "grad_norm": 0.7375824451446533, + "learning_rate": 1.1741282503421314e-05, + "loss": 1.5003, + "mean_token_accuracy": 0.6500919560591379, + "num_tokens": 1494039815.0, + "step": 8914 + }, + { + "entropy": 1.7708214024702709, + "epoch": 0.9793743648897312, + "grad_norm": 0.643165647983551, + "learning_rate": 1.1739686803105497e-05, + "loss": 1.3811, + "mean_token_accuracy": 0.6472144474585851, + "num_tokens": 1494220521.0, + "step": 8915 + }, + { + "entropy": 1.7270447909832, + "epoch": 0.9794842218011041, + "grad_norm": 0.7264362573623657, + "learning_rate": 1.1738091079378924e-05, + "loss": 1.3741, + "mean_token_accuracy": 0.6521051526069641, + "num_tokens": 1494408989.0, + "step": 8916 + }, + { + "entropy": 1.7155033648014069, + "epoch": 0.979594078712477, + "grad_norm": 0.7391374111175537, + "learning_rate": 1.1736495332292099e-05, + "loss": 1.5278, + "mean_token_accuracy": 0.6366018503904343, + "num_tokens": 1494615976.0, + "step": 8917 + }, + { + "entropy": 1.7143846253554027, + "epoch": 0.9797039356238499, + "grad_norm": 0.702869713306427, + "learning_rate": 1.1734899561895532e-05, + "loss": 1.2975, + "mean_token_accuracy": 0.6688247273365656, + "num_tokens": 1494746252.0, + "step": 8918 + }, + { + "entropy": 1.7029621005058289, + "epoch": 0.9798137925352228, + "grad_norm": 0.6561051607131958, + "learning_rate": 1.1733303768239721e-05, + "loss": 1.3749, + "mean_token_accuracy": 0.6583746274312338, + "num_tokens": 1494902545.0, + "step": 8919 + }, + { + "entropy": 1.7064663370450337, + "epoch": 0.9799236494465958, + "grad_norm": 0.6154294013977051, + "learning_rate": 1.173170795137518e-05, + "loss": 1.5231, + "mean_token_accuracy": 0.6378661692142487, + "num_tokens": 1495137727.0, + "step": 8920 + }, + { + "entropy": 1.7173560659090679, + "epoch": 0.9800335063579687, + "grad_norm": 0.6912945508956909, + "learning_rate": 1.1730112111352412e-05, + "loss": 1.5634, + "mean_token_accuracy": 0.6340258419513702, + "num_tokens": 1495314972.0, + "step": 8921 + }, + { + "entropy": 1.7897585928440094, + "epoch": 0.9801433632693417, + "grad_norm": 0.6243012547492981, + "learning_rate": 1.1728516248221921e-05, + "loss": 1.4356, + "mean_token_accuracy": 0.642118309934934, + "num_tokens": 1495453177.0, + "step": 8922 + }, + { + "entropy": 1.726568082968394, + "epoch": 0.9802532201807146, + "grad_norm": 0.7456775903701782, + "learning_rate": 1.1726920362034222e-05, + "loss": 1.2715, + "mean_token_accuracy": 0.6740024735530218, + "num_tokens": 1495558366.0, + "step": 8923 + }, + { + "entropy": 1.661215364933014, + "epoch": 0.9803630770920876, + "grad_norm": 0.6240798830986023, + "learning_rate": 1.172532445283982e-05, + "loss": 1.4022, + "mean_token_accuracy": 0.6461461385091146, + "num_tokens": 1495729811.0, + "step": 8924 + }, + { + "entropy": 1.645288070042928, + "epoch": 0.9804729340034605, + "grad_norm": 0.6724650263786316, + "learning_rate": 1.1723728520689226e-05, + "loss": 1.4172, + "mean_token_accuracy": 0.660889113942782, + "num_tokens": 1495907965.0, + "step": 8925 + }, + { + "entropy": 1.6819191972414653, + "epoch": 0.9805827909148335, + "grad_norm": 0.8175992965698242, + "learning_rate": 1.172213256563295e-05, + "loss": 1.5096, + "mean_token_accuracy": 0.6531357516845068, + "num_tokens": 1496076569.0, + "step": 8926 + }, + { + "entropy": 1.691595862309138, + "epoch": 0.9806926478262064, + "grad_norm": 0.6539618372917175, + "learning_rate": 1.1720536587721506e-05, + "loss": 1.4512, + "mean_token_accuracy": 0.6612804333368937, + "num_tokens": 1496228306.0, + "step": 8927 + }, + { + "entropy": 1.6252157092094421, + "epoch": 0.9808025047375794, + "grad_norm": 0.6574737429618835, + "learning_rate": 1.1718940587005403e-05, + "loss": 1.3293, + "mean_token_accuracy": 0.6706244150797526, + "num_tokens": 1496384607.0, + "step": 8928 + }, + { + "entropy": 1.6941309372584026, + "epoch": 0.9809123616489522, + "grad_norm": 0.7366401553153992, + "learning_rate": 1.171734456353515e-05, + "loss": 1.3279, + "mean_token_accuracy": 0.6707786669333776, + "num_tokens": 1496503504.0, + "step": 8929 + }, + { + "entropy": 1.7239616513252258, + "epoch": 0.9810222185603251, + "grad_norm": 0.814553439617157, + "learning_rate": 1.171574851736127e-05, + "loss": 1.3939, + "mean_token_accuracy": 0.651360089580218, + "num_tokens": 1496619109.0, + "step": 8930 + }, + { + "entropy": 1.7386276920636494, + "epoch": 0.9811320754716981, + "grad_norm": 0.7365756034851074, + "learning_rate": 1.171415244853427e-05, + "loss": 1.2141, + "mean_token_accuracy": 0.677468384305636, + "num_tokens": 1496749789.0, + "step": 8931 + }, + { + "entropy": 1.6668656865755718, + "epoch": 0.981241932383071, + "grad_norm": 0.6614105105400085, + "learning_rate": 1.1712556357104669e-05, + "loss": 1.3721, + "mean_token_accuracy": 0.66343554854393, + "num_tokens": 1496896836.0, + "step": 8932 + }, + { + "entropy": 1.5968853334585826, + "epoch": 0.981351789294444, + "grad_norm": 0.624069094657898, + "learning_rate": 1.1710960243122978e-05, + "loss": 1.2155, + "mean_token_accuracy": 0.681601325670878, + "num_tokens": 1497046633.0, + "step": 8933 + }, + { + "entropy": 1.6975778539975483, + "epoch": 0.9814616462058169, + "grad_norm": 0.6668051481246948, + "learning_rate": 1.1709364106639715e-05, + "loss": 1.4791, + "mean_token_accuracy": 0.6465711345275243, + "num_tokens": 1497216701.0, + "step": 8934 + }, + { + "entropy": 1.6943465371926625, + "epoch": 0.9815715031171899, + "grad_norm": 0.8045838475227356, + "learning_rate": 1.17077679477054e-05, + "loss": 1.2983, + "mean_token_accuracy": 0.6777238150437673, + "num_tokens": 1497334152.0, + "step": 8935 + }, + { + "entropy": 1.6488543053468068, + "epoch": 0.9816813600285628, + "grad_norm": 0.6241557598114014, + "learning_rate": 1.1706171766370546e-05, + "loss": 1.3189, + "mean_token_accuracy": 0.678161104520162, + "num_tokens": 1497515242.0, + "step": 8936 + }, + { + "entropy": 1.7082229157288868, + "epoch": 0.9817912169399358, + "grad_norm": 0.7065755724906921, + "learning_rate": 1.1704575562685674e-05, + "loss": 1.3357, + "mean_token_accuracy": 0.6597681244214376, + "num_tokens": 1497716579.0, + "step": 8937 + }, + { + "entropy": 1.7012514372666676, + "epoch": 0.9819010738513086, + "grad_norm": 0.7709197402000427, + "learning_rate": 1.1702979336701306e-05, + "loss": 1.4173, + "mean_token_accuracy": 0.6548969050248464, + "num_tokens": 1497869693.0, + "step": 8938 + }, + { + "entropy": 1.6415949165821075, + "epoch": 0.9820109307626816, + "grad_norm": 0.5932096242904663, + "learning_rate": 1.1701383088467958e-05, + "loss": 1.4232, + "mean_token_accuracy": 0.6544928352038065, + "num_tokens": 1498106239.0, + "step": 8939 + }, + { + "entropy": 1.814208447933197, + "epoch": 0.9821207876740545, + "grad_norm": 0.6313380002975464, + "learning_rate": 1.169978681803615e-05, + "loss": 1.418, + "mean_token_accuracy": 0.6428090532620748, + "num_tokens": 1498265625.0, + "step": 8940 + }, + { + "entropy": 1.6680020491282146, + "epoch": 0.9822306445854275, + "grad_norm": 0.5956284403800964, + "learning_rate": 1.1698190525456403e-05, + "loss": 1.3458, + "mean_token_accuracy": 0.671937977274259, + "num_tokens": 1498434182.0, + "step": 8941 + }, + { + "entropy": 1.7206951081752777, + "epoch": 0.9823405014968004, + "grad_norm": 0.715988278388977, + "learning_rate": 1.1696594210779242e-05, + "loss": 1.2887, + "mean_token_accuracy": 0.6786340028047562, + "num_tokens": 1498582319.0, + "step": 8942 + }, + { + "entropy": 1.7263545493284862, + "epoch": 0.9824503584081734, + "grad_norm": 0.6619638800621033, + "learning_rate": 1.169499787405519e-05, + "loss": 1.2756, + "mean_token_accuracy": 0.6720621436834335, + "num_tokens": 1498690732.0, + "step": 8943 + }, + { + "entropy": 1.644078363974889, + "epoch": 0.9825602153195463, + "grad_norm": 0.6541385650634766, + "learning_rate": 1.1693401515334767e-05, + "loss": 1.4228, + "mean_token_accuracy": 0.6626478185256323, + "num_tokens": 1498865749.0, + "step": 8944 + }, + { + "entropy": 1.7186622321605682, + "epoch": 0.9826700722309192, + "grad_norm": 0.7101624608039856, + "learning_rate": 1.1691805134668497e-05, + "loss": 1.563, + "mean_token_accuracy": 0.6366095294555029, + "num_tokens": 1499039558.0, + "step": 8945 + }, + { + "entropy": 1.679495245218277, + "epoch": 0.9827799291422922, + "grad_norm": 0.6966460943222046, + "learning_rate": 1.169020873210691e-05, + "loss": 1.4945, + "mean_token_accuracy": 0.6450996845960617, + "num_tokens": 1499197333.0, + "step": 8946 + }, + { + "entropy": 1.722711722056071, + "epoch": 0.9828897860536651, + "grad_norm": 0.7616344094276428, + "learning_rate": 1.1688612307700522e-05, + "loss": 1.419, + "mean_token_accuracy": 0.649802620212237, + "num_tokens": 1499406096.0, + "step": 8947 + }, + { + "entropy": 1.641068955262502, + "epoch": 0.9829996429650381, + "grad_norm": 0.7064522504806519, + "learning_rate": 1.1687015861499866e-05, + "loss": 1.3548, + "mean_token_accuracy": 0.6700154940287272, + "num_tokens": 1499582395.0, + "step": 8948 + }, + { + "entropy": 1.709856649239858, + "epoch": 0.9831094998764109, + "grad_norm": 0.6546580195426941, + "learning_rate": 1.1685419393555474e-05, + "loss": 1.4311, + "mean_token_accuracy": 0.6408520142237345, + "num_tokens": 1499777610.0, + "step": 8949 + }, + { + "entropy": 1.6791809399922688, + "epoch": 0.9832193567877839, + "grad_norm": 0.7274378538131714, + "learning_rate": 1.168382290391786e-05, + "loss": 1.357, + "mean_token_accuracy": 0.668040469288826, + "num_tokens": 1499945571.0, + "step": 8950 + }, + { + "entropy": 1.7598425050576527, + "epoch": 0.9833292136991568, + "grad_norm": 1.086031198501587, + "learning_rate": 1.1682226392637561e-05, + "loss": 1.4119, + "mean_token_accuracy": 0.6470048973957697, + "num_tokens": 1500087592.0, + "step": 8951 + }, + { + "entropy": 1.682246168454488, + "epoch": 0.9834390706105298, + "grad_norm": 0.6415425539016724, + "learning_rate": 1.1680629859765107e-05, + "loss": 1.4473, + "mean_token_accuracy": 0.6519815276066462, + "num_tokens": 1500248684.0, + "step": 8952 + }, + { + "entropy": 1.646356741587321, + "epoch": 0.9835489275219027, + "grad_norm": 0.6637715697288513, + "learning_rate": 1.167903330535102e-05, + "loss": 1.4991, + "mean_token_accuracy": 0.6322367091973623, + "num_tokens": 1500476242.0, + "step": 8953 + }, + { + "entropy": 1.6522825956344604, + "epoch": 0.9836587844332757, + "grad_norm": 0.6415530443191528, + "learning_rate": 1.1677436729445837e-05, + "loss": 1.4045, + "mean_token_accuracy": 0.6576189547777176, + "num_tokens": 1500640691.0, + "step": 8954 + }, + { + "entropy": 1.7172890106836955, + "epoch": 0.9837686413446486, + "grad_norm": 0.6979526877403259, + "learning_rate": 1.167584013210009e-05, + "loss": 1.4555, + "mean_token_accuracy": 0.6452741970618566, + "num_tokens": 1500825536.0, + "step": 8955 + }, + { + "entropy": 1.726652721563975, + "epoch": 0.9838784982560216, + "grad_norm": 0.7327434420585632, + "learning_rate": 1.1674243513364303e-05, + "loss": 1.3525, + "mean_token_accuracy": 0.6641982247432073, + "num_tokens": 1500971763.0, + "step": 8956 + }, + { + "entropy": 1.6420506338278453, + "epoch": 0.9839883551673945, + "grad_norm": 0.6714190244674683, + "learning_rate": 1.1672646873289014e-05, + "loss": 1.4045, + "mean_token_accuracy": 0.6655755738417307, + "num_tokens": 1501155131.0, + "step": 8957 + }, + { + "entropy": 1.676787108182907, + "epoch": 0.9840982120787674, + "grad_norm": 0.6548340916633606, + "learning_rate": 1.1671050211924752e-05, + "loss": 1.3451, + "mean_token_accuracy": 0.6712605257829031, + "num_tokens": 1501282420.0, + "step": 8958 + }, + { + "entropy": 1.6804224650065105, + "epoch": 0.9842080689901404, + "grad_norm": 0.698941171169281, + "learning_rate": 1.1669453529322056e-05, + "loss": 1.2366, + "mean_token_accuracy": 0.6790623267491659, + "num_tokens": 1501402213.0, + "step": 8959 + }, + { + "entropy": 1.6702880064646404, + "epoch": 0.9843179259015132, + "grad_norm": 0.7242985963821411, + "learning_rate": 1.1667856825531458e-05, + "loss": 1.4095, + "mean_token_accuracy": 0.6546028355757395, + "num_tokens": 1501565497.0, + "step": 8960 + }, + { + "entropy": 1.736267864704132, + "epoch": 0.9844277828128862, + "grad_norm": 0.7420823574066162, + "learning_rate": 1.1666260100603493e-05, + "loss": 1.3009, + "mean_token_accuracy": 0.6798698008060455, + "num_tokens": 1501742697.0, + "step": 8961 + }, + { + "entropy": 1.7749883234500885, + "epoch": 0.9845376397242591, + "grad_norm": 0.8388869166374207, + "learning_rate": 1.1664663354588694e-05, + "loss": 1.6187, + "mean_token_accuracy": 0.6209886992971102, + "num_tokens": 1501930271.0, + "step": 8962 + }, + { + "entropy": 1.672831416130066, + "epoch": 0.9846474966356321, + "grad_norm": 0.6645883321762085, + "learning_rate": 1.16630665875376e-05, + "loss": 1.4393, + "mean_token_accuracy": 0.6532560338576635, + "num_tokens": 1502163533.0, + "step": 8963 + }, + { + "entropy": 1.7371017535527546, + "epoch": 0.984757353547005, + "grad_norm": 0.871916651725769, + "learning_rate": 1.1661469799500747e-05, + "loss": 1.4171, + "mean_token_accuracy": 0.6553890208403269, + "num_tokens": 1502320064.0, + "step": 8964 + }, + { + "entropy": 1.680338740348816, + "epoch": 0.984867210458378, + "grad_norm": 0.6550365686416626, + "learning_rate": 1.1659872990528674e-05, + "loss": 1.3864, + "mean_token_accuracy": 0.6682237784067789, + "num_tokens": 1502525140.0, + "step": 8965 + }, + { + "entropy": 1.7634477416674297, + "epoch": 0.9849770673697509, + "grad_norm": 0.6921877861022949, + "learning_rate": 1.1658276160671915e-05, + "loss": 1.4945, + "mean_token_accuracy": 0.6510110149780909, + "num_tokens": 1502673658.0, + "step": 8966 + }, + { + "entropy": 1.7780840198198955, + "epoch": 0.9850869242811239, + "grad_norm": 0.632499635219574, + "learning_rate": 1.1656679309981017e-05, + "loss": 1.5241, + "mean_token_accuracy": 0.6321147382259369, + "num_tokens": 1502864453.0, + "step": 8967 + }, + { + "entropy": 1.762084702650706, + "epoch": 0.9851967811924968, + "grad_norm": 0.6069890260696411, + "learning_rate": 1.1655082438506511e-05, + "loss": 1.4161, + "mean_token_accuracy": 0.6416159570217133, + "num_tokens": 1503015146.0, + "step": 8968 + }, + { + "entropy": 1.6979095737139385, + "epoch": 0.9853066381038698, + "grad_norm": 0.6491718888282776, + "learning_rate": 1.1653485546298941e-05, + "loss": 1.381, + "mean_token_accuracy": 0.6543610692024231, + "num_tokens": 1503180409.0, + "step": 8969 + }, + { + "entropy": 1.681455820798874, + "epoch": 0.9854164950152426, + "grad_norm": 0.8329312801361084, + "learning_rate": 1.1651888633408853e-05, + "loss": 1.301, + "mean_token_accuracy": 0.6834056129058202, + "num_tokens": 1503343958.0, + "step": 8970 + }, + { + "entropy": 1.7064136465390523, + "epoch": 0.9855263519266155, + "grad_norm": 0.7739664316177368, + "learning_rate": 1.1650291699886778e-05, + "loss": 1.4141, + "mean_token_accuracy": 0.6504683097203573, + "num_tokens": 1503500968.0, + "step": 8971 + }, + { + "entropy": 1.6865461766719818, + "epoch": 0.9856362088379885, + "grad_norm": 0.7742812633514404, + "learning_rate": 1.1648694745783265e-05, + "loss": 1.3123, + "mean_token_accuracy": 0.6681303232908249, + "num_tokens": 1503641625.0, + "step": 8972 + }, + { + "entropy": 1.708566923936208, + "epoch": 0.9857460657493614, + "grad_norm": 0.7213424444198608, + "learning_rate": 1.1647097771148857e-05, + "loss": 1.4823, + "mean_token_accuracy": 0.6548313399155935, + "num_tokens": 1503804043.0, + "step": 8973 + }, + { + "entropy": 1.7305469711621602, + "epoch": 0.9858559226607344, + "grad_norm": 0.7062889933586121, + "learning_rate": 1.1645500776034096e-05, + "loss": 1.3272, + "mean_token_accuracy": 0.6624001910289129, + "num_tokens": 1503908419.0, + "step": 8974 + }, + { + "entropy": 1.6859267055988312, + "epoch": 0.9859657795721073, + "grad_norm": 0.7024558186531067, + "learning_rate": 1.1643903760489523e-05, + "loss": 1.4326, + "mean_token_accuracy": 0.654796913266182, + "num_tokens": 1504084648.0, + "step": 8975 + }, + { + "entropy": 1.7165729403495789, + "epoch": 0.9860756364834803, + "grad_norm": 0.7313957214355469, + "learning_rate": 1.1642306724565688e-05, + "loss": 1.4266, + "mean_token_accuracy": 0.6520107636849085, + "num_tokens": 1504244529.0, + "step": 8976 + }, + { + "entropy": 1.6938693324724834, + "epoch": 0.9861854933948532, + "grad_norm": 0.7270261645317078, + "learning_rate": 1.1640709668313137e-05, + "loss": 1.2063, + "mean_token_accuracy": 0.6774415969848633, + "num_tokens": 1504440015.0, + "step": 8977 + }, + { + "entropy": 1.6900843977928162, + "epoch": 0.9862953503062262, + "grad_norm": 0.797430157661438, + "learning_rate": 1.1639112591782413e-05, + "loss": 1.3578, + "mean_token_accuracy": 0.6580925136804581, + "num_tokens": 1504572367.0, + "step": 8978 + }, + { + "entropy": 1.6583419442176819, + "epoch": 0.9864052072175991, + "grad_norm": 0.711826741695404, + "learning_rate": 1.1637515495024062e-05, + "loss": 1.4362, + "mean_token_accuracy": 0.6491169184446335, + "num_tokens": 1504721064.0, + "step": 8979 + }, + { + "entropy": 1.6519073247909546, + "epoch": 0.986515064128972, + "grad_norm": 0.6117995381355286, + "learning_rate": 1.163591837808863e-05, + "loss": 1.2617, + "mean_token_accuracy": 0.6822675367196401, + "num_tokens": 1504875817.0, + "step": 8980 + }, + { + "entropy": 1.742538849512736, + "epoch": 0.9866249210403449, + "grad_norm": 0.7913803458213806, + "learning_rate": 1.1634321241026671e-05, + "loss": 1.5257, + "mean_token_accuracy": 0.6468542764584223, + "num_tokens": 1505024315.0, + "step": 8981 + }, + { + "entropy": 1.7452322244644165, + "epoch": 0.9867347779517179, + "grad_norm": 0.8122254014015198, + "learning_rate": 1.163272408388873e-05, + "loss": 1.3508, + "mean_token_accuracy": 0.6734697222709656, + "num_tokens": 1505160858.0, + "step": 8982 + }, + { + "entropy": 1.6993557115395863, + "epoch": 0.9868446348630908, + "grad_norm": 0.6506452560424805, + "learning_rate": 1.163112690672536e-05, + "loss": 1.4257, + "mean_token_accuracy": 0.6508930325508118, + "num_tokens": 1505359516.0, + "step": 8983 + }, + { + "entropy": 1.7169030010700226, + "epoch": 0.9869544917744638, + "grad_norm": 0.8766597509384155, + "learning_rate": 1.1629529709587103e-05, + "loss": 1.3419, + "mean_token_accuracy": 0.6719989031553268, + "num_tokens": 1505479668.0, + "step": 8984 + }, + { + "entropy": 1.7049663464228313, + "epoch": 0.9870643486858367, + "grad_norm": 0.751346230506897, + "learning_rate": 1.1627932492524519e-05, + "loss": 1.4392, + "mean_token_accuracy": 0.6511711478233337, + "num_tokens": 1505669915.0, + "step": 8985 + }, + { + "entropy": 1.7046051720778148, + "epoch": 0.9871742055972096, + "grad_norm": 0.6194396615028381, + "learning_rate": 1.1626335255588153e-05, + "loss": 1.486, + "mean_token_accuracy": 0.6346323589483897, + "num_tokens": 1505848159.0, + "step": 8986 + }, + { + "entropy": 1.70768607656161, + "epoch": 0.9872840625085826, + "grad_norm": 0.5602802038192749, + "learning_rate": 1.1624737998828556e-05, + "loss": 1.414, + "mean_token_accuracy": 0.6471713682015737, + "num_tokens": 1506088376.0, + "step": 8987 + }, + { + "entropy": 1.62164506316185, + "epoch": 0.9873939194199555, + "grad_norm": 0.7139067649841309, + "learning_rate": 1.1623140722296285e-05, + "loss": 1.5303, + "mean_token_accuracy": 0.6505985458691915, + "num_tokens": 1506254740.0, + "step": 8988 + }, + { + "entropy": 1.7491505940755208, + "epoch": 0.9875037763313285, + "grad_norm": 0.7321544885635376, + "learning_rate": 1.162154342604189e-05, + "loss": 1.3151, + "mean_token_accuracy": 0.6853879491488138, + "num_tokens": 1506417451.0, + "step": 8989 + }, + { + "entropy": 1.6918767988681793, + "epoch": 0.9876136332427013, + "grad_norm": 0.6921842098236084, + "learning_rate": 1.1619946110115928e-05, + "loss": 1.3741, + "mean_token_accuracy": 0.6635448783636093, + "num_tokens": 1506573897.0, + "step": 8990 + }, + { + "entropy": 1.6910037795702617, + "epoch": 0.9877234901540743, + "grad_norm": 0.6403983235359192, + "learning_rate": 1.1618348774568946e-05, + "loss": 1.4347, + "mean_token_accuracy": 0.6572843343019485, + "num_tokens": 1506763992.0, + "step": 8991 + }, + { + "entropy": 1.724970320860545, + "epoch": 0.9878333470654472, + "grad_norm": 0.70880126953125, + "learning_rate": 1.1616751419451506e-05, + "loss": 1.3969, + "mean_token_accuracy": 0.6549189041058222, + "num_tokens": 1506902471.0, + "step": 8992 + }, + { + "entropy": 1.7925910154978435, + "epoch": 0.9879432039768202, + "grad_norm": 0.6777533292770386, + "learning_rate": 1.1615154044814163e-05, + "loss": 1.3396, + "mean_token_accuracy": 0.6721122364203135, + "num_tokens": 1507045033.0, + "step": 8993 + }, + { + "entropy": 1.5604525705178578, + "epoch": 0.9880530608881931, + "grad_norm": 0.6724399328231812, + "learning_rate": 1.1613556650707474e-05, + "loss": 1.2766, + "mean_token_accuracy": 0.6662530352671941, + "num_tokens": 1507227734.0, + "step": 8994 + }, + { + "entropy": 1.7361294726530712, + "epoch": 0.9881629177995661, + "grad_norm": 0.7844634652137756, + "learning_rate": 1.1611959237181991e-05, + "loss": 1.2964, + "mean_token_accuracy": 0.6645766844352087, + "num_tokens": 1507366369.0, + "step": 8995 + }, + { + "entropy": 1.7280752658843994, + "epoch": 0.988272774710939, + "grad_norm": 0.721837043762207, + "learning_rate": 1.1610361804288273e-05, + "loss": 1.3637, + "mean_token_accuracy": 0.6476641943057379, + "num_tokens": 1507549321.0, + "step": 8996 + }, + { + "entropy": 1.701300948858261, + "epoch": 0.988382631622312, + "grad_norm": 0.6382768750190735, + "learning_rate": 1.1608764352076878e-05, + "loss": 1.4257, + "mean_token_accuracy": 0.6443998465935389, + "num_tokens": 1507731929.0, + "step": 8997 + }, + { + "entropy": 1.703882485628128, + "epoch": 0.9884924885336849, + "grad_norm": 0.7071985006332397, + "learning_rate": 1.1607166880598366e-05, + "loss": 1.4985, + "mean_token_accuracy": 0.6458214769760767, + "num_tokens": 1507918984.0, + "step": 8998 + }, + { + "entropy": 1.7094257573286693, + "epoch": 0.9886023454450578, + "grad_norm": 0.8170492053031921, + "learning_rate": 1.1605569389903297e-05, + "loss": 1.3406, + "mean_token_accuracy": 0.6753592838843664, + "num_tokens": 1508052713.0, + "step": 8999 + }, + { + "entropy": 1.7019550204277039, + "epoch": 0.9887122023564308, + "grad_norm": 0.7124339938163757, + "learning_rate": 1.1603971880042228e-05, + "loss": 1.3914, + "mean_token_accuracy": 0.654958705107371, + "num_tokens": 1508206066.0, + "step": 9000 + }, + { + "entropy": 1.7603452901045482, + "epoch": 0.9888220592678036, + "grad_norm": 0.8602905869483948, + "learning_rate": 1.1602374351065725e-05, + "loss": 1.5332, + "mean_token_accuracy": 0.6360108802715937, + "num_tokens": 1508375603.0, + "step": 9001 + }, + { + "entropy": 1.7130256096522014, + "epoch": 0.9889319161791766, + "grad_norm": 0.7287706136703491, + "learning_rate": 1.1600776803024344e-05, + "loss": 1.4632, + "mean_token_accuracy": 0.6579320232073466, + "num_tokens": 1508573102.0, + "step": 9002 + }, + { + "entropy": 1.689418117205302, + "epoch": 0.9890417730905495, + "grad_norm": 0.6394194960594177, + "learning_rate": 1.1599179235968646e-05, + "loss": 1.4721, + "mean_token_accuracy": 0.6430060019095739, + "num_tokens": 1508748381.0, + "step": 9003 + }, + { + "entropy": 1.635381430387497, + "epoch": 0.9891516300019225, + "grad_norm": 0.741367757320404, + "learning_rate": 1.1597581649949194e-05, + "loss": 1.3015, + "mean_token_accuracy": 0.6785789032777151, + "num_tokens": 1508941115.0, + "step": 9004 + }, + { + "entropy": 1.6817569931348164, + "epoch": 0.9892614869132954, + "grad_norm": 0.6391214728355408, + "learning_rate": 1.1595984045016557e-05, + "loss": 1.4592, + "mean_token_accuracy": 0.6553023606538773, + "num_tokens": 1509128859.0, + "step": 9005 + }, + { + "entropy": 1.65160737435023, + "epoch": 0.9893713438246684, + "grad_norm": 0.723601222038269, + "learning_rate": 1.1594386421221289e-05, + "loss": 1.3859, + "mean_token_accuracy": 0.6749422947565714, + "num_tokens": 1509267167.0, + "step": 9006 + }, + { + "entropy": 1.6834927201271057, + "epoch": 0.9894812007360413, + "grad_norm": 0.6616725325584412, + "learning_rate": 1.1592788778613962e-05, + "loss": 1.4887, + "mean_token_accuracy": 0.624958798289299, + "num_tokens": 1509557371.0, + "step": 9007 + }, + { + "entropy": 1.7665310402711232, + "epoch": 0.9895910576474143, + "grad_norm": 0.7587254047393799, + "learning_rate": 1.1591191117245134e-05, + "loss": 1.4524, + "mean_token_accuracy": 0.6576088120539983, + "num_tokens": 1509705073.0, + "step": 9008 + }, + { + "entropy": 1.6970743139584858, + "epoch": 0.9897009145587872, + "grad_norm": 0.6766437888145447, + "learning_rate": 1.1589593437165377e-05, + "loss": 1.413, + "mean_token_accuracy": 0.6557514518499374, + "num_tokens": 1509877904.0, + "step": 9009 + }, + { + "entropy": 1.6982737878958385, + "epoch": 0.9898107714701602, + "grad_norm": 0.7253543138504028, + "learning_rate": 1.1587995738425249e-05, + "loss": 1.3851, + "mean_token_accuracy": 0.6553531636794409, + "num_tokens": 1510029228.0, + "step": 9010 + }, + { + "entropy": 1.6637147863705952, + "epoch": 0.989920628381533, + "grad_norm": 0.6019961833953857, + "learning_rate": 1.1586398021075324e-05, + "loss": 1.4421, + "mean_token_accuracy": 0.6378505776325861, + "num_tokens": 1510279098.0, + "step": 9011 + }, + { + "entropy": 1.6774966319402058, + "epoch": 0.9900304852929059, + "grad_norm": 0.655983567237854, + "learning_rate": 1.1584800285166164e-05, + "loss": 1.3409, + "mean_token_accuracy": 0.650844136873881, + "num_tokens": 1510440886.0, + "step": 9012 + }, + { + "entropy": 1.6945745448271434, + "epoch": 0.9901403422042789, + "grad_norm": 0.62674480676651, + "learning_rate": 1.1583202530748341e-05, + "loss": 1.3902, + "mean_token_accuracy": 0.6639150381088257, + "num_tokens": 1510656532.0, + "step": 9013 + }, + { + "entropy": 1.7257001300652821, + "epoch": 0.9902501991156518, + "grad_norm": 0.7610213160514832, + "learning_rate": 1.1581604757872418e-05, + "loss": 1.4418, + "mean_token_accuracy": 0.642704447110494, + "num_tokens": 1510830217.0, + "step": 9014 + }, + { + "entropy": 1.6689273913701375, + "epoch": 0.9903600560270248, + "grad_norm": 0.7523655891418457, + "learning_rate": 1.1580006966588968e-05, + "loss": 1.2532, + "mean_token_accuracy": 0.6769355684518814, + "num_tokens": 1510954384.0, + "step": 9015 + }, + { + "entropy": 1.720543771982193, + "epoch": 0.9904699129383977, + "grad_norm": 0.7777354121208191, + "learning_rate": 1.1578409156948558e-05, + "loss": 1.5624, + "mean_token_accuracy": 0.6435587803522745, + "num_tokens": 1511108028.0, + "step": 9016 + }, + { + "entropy": 1.6359238624572754, + "epoch": 0.9905797698497707, + "grad_norm": 0.8014277219772339, + "learning_rate": 1.157681132900176e-05, + "loss": 1.2437, + "mean_token_accuracy": 0.6720960934956869, + "num_tokens": 1511244389.0, + "step": 9017 + }, + { + "entropy": 1.6672624746958415, + "epoch": 0.9906896267611436, + "grad_norm": 0.7083525061607361, + "learning_rate": 1.1575213482799144e-05, + "loss": 1.4483, + "mean_token_accuracy": 0.6651191810766856, + "num_tokens": 1511417628.0, + "step": 9018 + }, + { + "entropy": 1.6869849264621735, + "epoch": 0.9907994836725166, + "grad_norm": 0.6426869630813599, + "learning_rate": 1.1573615618391279e-05, + "loss": 1.498, + "mean_token_accuracy": 0.6442697743574778, + "num_tokens": 1511656313.0, + "step": 9019 + }, + { + "entropy": 1.703253189722697, + "epoch": 0.9909093405838895, + "grad_norm": 0.6750736832618713, + "learning_rate": 1.1572017735828738e-05, + "loss": 1.3201, + "mean_token_accuracy": 0.6629547973473867, + "num_tokens": 1511768183.0, + "step": 9020 + }, + { + "entropy": 1.7203664779663086, + "epoch": 0.9910191974952625, + "grad_norm": 0.6736758947372437, + "learning_rate": 1.1570419835162093e-05, + "loss": 1.38, + "mean_token_accuracy": 0.6648065795501074, + "num_tokens": 1511913663.0, + "step": 9021 + }, + { + "entropy": 1.7273483872413635, + "epoch": 0.9911290544066353, + "grad_norm": 0.6232172846794128, + "learning_rate": 1.1568821916441916e-05, + "loss": 1.4793, + "mean_token_accuracy": 0.6454748759667078, + "num_tokens": 1512131107.0, + "step": 9022 + }, + { + "entropy": 1.7320756713549297, + "epoch": 0.9912389113180083, + "grad_norm": 0.927398145198822, + "learning_rate": 1.1567223979718786e-05, + "loss": 1.3629, + "mean_token_accuracy": 0.6540378282467524, + "num_tokens": 1512294920.0, + "step": 9023 + }, + { + "entropy": 1.7156847814718883, + "epoch": 0.9913487682293812, + "grad_norm": 0.6778035759925842, + "learning_rate": 1.156562602504327e-05, + "loss": 1.6711, + "mean_token_accuracy": 0.6444185674190521, + "num_tokens": 1512497502.0, + "step": 9024 + }, + { + "entropy": 1.7153818408648174, + "epoch": 0.9914586251407541, + "grad_norm": 0.6155846118927002, + "learning_rate": 1.1564028052465945e-05, + "loss": 1.3561, + "mean_token_accuracy": 0.6623079578081766, + "num_tokens": 1512629995.0, + "step": 9025 + }, + { + "entropy": 1.6826336582501729, + "epoch": 0.9915684820521271, + "grad_norm": 0.6194709539413452, + "learning_rate": 1.156243006203739e-05, + "loss": 1.532, + "mean_token_accuracy": 0.6453428119421005, + "num_tokens": 1512822020.0, + "step": 9026 + }, + { + "entropy": 1.6776429613431294, + "epoch": 0.9916783389635, + "grad_norm": 0.6859866976737976, + "learning_rate": 1.1560832053808172e-05, + "loss": 1.2609, + "mean_token_accuracy": 0.6728976418574651, + "num_tokens": 1512946589.0, + "step": 9027 + }, + { + "entropy": 1.6960046589374542, + "epoch": 0.991788195874873, + "grad_norm": 0.6310602426528931, + "learning_rate": 1.1559234027828872e-05, + "loss": 1.4226, + "mean_token_accuracy": 0.6640596588452657, + "num_tokens": 1513133048.0, + "step": 9028 + }, + { + "entropy": 1.7602422833442688, + "epoch": 0.9918980527862459, + "grad_norm": 0.7083531618118286, + "learning_rate": 1.155763598415007e-05, + "loss": 1.5015, + "mean_token_accuracy": 0.6424345870812734, + "num_tokens": 1513332237.0, + "step": 9029 + }, + { + "entropy": 1.6640680531660716, + "epoch": 0.9920079096976189, + "grad_norm": 0.5896248817443848, + "learning_rate": 1.155603792282234e-05, + "loss": 1.4749, + "mean_token_accuracy": 0.6422792822122574, + "num_tokens": 1513545562.0, + "step": 9030 + }, + { + "entropy": 1.732280304034551, + "epoch": 0.9921177666089918, + "grad_norm": 0.6422677040100098, + "learning_rate": 1.1554439843896261e-05, + "loss": 1.5216, + "mean_token_accuracy": 0.6359787285327911, + "num_tokens": 1513726745.0, + "step": 9031 + }, + { + "entropy": 1.6377032995224, + "epoch": 0.9922276235203648, + "grad_norm": 0.6851502060890198, + "learning_rate": 1.1552841747422409e-05, + "loss": 1.2825, + "mean_token_accuracy": 0.6651904483636221, + "num_tokens": 1513892498.0, + "step": 9032 + }, + { + "entropy": 1.7014533579349518, + "epoch": 0.9923374804317376, + "grad_norm": 0.7634040117263794, + "learning_rate": 1.1551243633451365e-05, + "loss": 1.3062, + "mean_token_accuracy": 0.6721664518117905, + "num_tokens": 1514062306.0, + "step": 9033 + }, + { + "entropy": 1.7060857713222504, + "epoch": 0.9924473373431106, + "grad_norm": 0.6701831221580505, + "learning_rate": 1.1549645502033709e-05, + "loss": 1.4121, + "mean_token_accuracy": 0.6439545353253683, + "num_tokens": 1514266388.0, + "step": 9034 + }, + { + "entropy": 1.6986914575099945, + "epoch": 0.9925571942544835, + "grad_norm": 0.678210437297821, + "learning_rate": 1.154804735322002e-05, + "loss": 1.3727, + "mean_token_accuracy": 0.6620455632607142, + "num_tokens": 1514432185.0, + "step": 9035 + }, + { + "entropy": 1.5856430729230244, + "epoch": 0.9926670511658565, + "grad_norm": 0.735240638256073, + "learning_rate": 1.154644918706088e-05, + "loss": 1.3048, + "mean_token_accuracy": 0.6784227043390274, + "num_tokens": 1514616077.0, + "step": 9036 + }, + { + "entropy": 1.7611307700475056, + "epoch": 0.9927769080772294, + "grad_norm": 0.7041000127792358, + "learning_rate": 1.1544851003606867e-05, + "loss": 1.2482, + "mean_token_accuracy": 0.6712858428557714, + "num_tokens": 1514722578.0, + "step": 9037 + }, + { + "entropy": 1.7410919765631359, + "epoch": 0.9928867649886024, + "grad_norm": 0.9079409837722778, + "learning_rate": 1.1543252802908569e-05, + "loss": 1.4019, + "mean_token_accuracy": 0.6431263387203217, + "num_tokens": 1514926211.0, + "step": 9038 + }, + { + "entropy": 1.7162447571754456, + "epoch": 0.9929966218999753, + "grad_norm": 0.8317917585372925, + "learning_rate": 1.1541654585016564e-05, + "loss": 1.5269, + "mean_token_accuracy": 0.6505968123674393, + "num_tokens": 1515076490.0, + "step": 9039 + }, + { + "entropy": 1.7665583193302155, + "epoch": 0.9931064788113482, + "grad_norm": 0.675299882888794, + "learning_rate": 1.154005634998143e-05, + "loss": 1.4275, + "mean_token_accuracy": 0.6545489778121313, + "num_tokens": 1515243623.0, + "step": 9040 + }, + { + "entropy": 1.6715355316797893, + "epoch": 0.9932163357227212, + "grad_norm": 0.6317136883735657, + "learning_rate": 1.1538458097853764e-05, + "loss": 1.5146, + "mean_token_accuracy": 0.6456638177235922, + "num_tokens": 1515452950.0, + "step": 9041 + }, + { + "entropy": 1.7157903412977855, + "epoch": 0.993326192634094, + "grad_norm": 0.7072700262069702, + "learning_rate": 1.1536859828684134e-05, + "loss": 1.318, + "mean_token_accuracy": 0.6620603998502096, + "num_tokens": 1515563514.0, + "step": 9042 + }, + { + "entropy": 1.7313476900259654, + "epoch": 0.993436049545467, + "grad_norm": 0.836708664894104, + "learning_rate": 1.1535261542523137e-05, + "loss": 1.2703, + "mean_token_accuracy": 0.6730028490225474, + "num_tokens": 1515686917.0, + "step": 9043 + }, + { + "entropy": 1.7311189671357472, + "epoch": 0.9935459064568399, + "grad_norm": 0.6318951845169067, + "learning_rate": 1.1533663239421354e-05, + "loss": 1.4732, + "mean_token_accuracy": 0.6417555063962936, + "num_tokens": 1515860537.0, + "step": 9044 + }, + { + "entropy": 1.7036487360795338, + "epoch": 0.9936557633682129, + "grad_norm": 0.7849493026733398, + "learning_rate": 1.1532064919429369e-05, + "loss": 1.4322, + "mean_token_accuracy": 0.6610707342624664, + "num_tokens": 1516011139.0, + "step": 9045 + }, + { + "entropy": 1.7278256515661876, + "epoch": 0.9937656202795858, + "grad_norm": 0.6660558581352234, + "learning_rate": 1.1530466582597766e-05, + "loss": 1.3744, + "mean_token_accuracy": 0.6509429017702738, + "num_tokens": 1516205732.0, + "step": 9046 + }, + { + "entropy": 1.723042756319046, + "epoch": 0.9938754771909588, + "grad_norm": 0.7152264714241028, + "learning_rate": 1.152886822897714e-05, + "loss": 1.5193, + "mean_token_accuracy": 0.6414504299561182, + "num_tokens": 1516391979.0, + "step": 9047 + }, + { + "entropy": 1.6957969069480896, + "epoch": 0.9939853341023317, + "grad_norm": 0.7662060856819153, + "learning_rate": 1.152726985861807e-05, + "loss": 1.3979, + "mean_token_accuracy": 0.6510555545488993, + "num_tokens": 1516550510.0, + "step": 9048 + }, + { + "entropy": 1.6899131039778392, + "epoch": 0.9940951910137047, + "grad_norm": 0.7555585503578186, + "learning_rate": 1.1525671471571148e-05, + "loss": 1.2768, + "mean_token_accuracy": 0.6714038848876953, + "num_tokens": 1516701671.0, + "step": 9049 + }, + { + "entropy": 1.7336824933687847, + "epoch": 0.9942050479250776, + "grad_norm": 0.7775252461433411, + "learning_rate": 1.1524073067886958e-05, + "loss": 1.3336, + "mean_token_accuracy": 0.6652177522579829, + "num_tokens": 1516815568.0, + "step": 9050 + }, + { + "entropy": 1.672436664501826, + "epoch": 0.9943149048364506, + "grad_norm": 0.6073688268661499, + "learning_rate": 1.1522474647616095e-05, + "loss": 1.3894, + "mean_token_accuracy": 0.658691331744194, + "num_tokens": 1516980970.0, + "step": 9051 + }, + { + "entropy": 1.7328505516052246, + "epoch": 0.9944247617478235, + "grad_norm": 0.6700348854064941, + "learning_rate": 1.1520876210809143e-05, + "loss": 1.4246, + "mean_token_accuracy": 0.6411223659912745, + "num_tokens": 1517164909.0, + "step": 9052 + }, + { + "entropy": 1.7129102945327759, + "epoch": 0.9945346186591963, + "grad_norm": 0.7768635749816895, + "learning_rate": 1.1519277757516693e-05, + "loss": 1.3948, + "mean_token_accuracy": 0.6586629996697108, + "num_tokens": 1517309049.0, + "step": 9053 + }, + { + "entropy": 1.6529461741447449, + "epoch": 0.9946444755705693, + "grad_norm": 0.593262255191803, + "learning_rate": 1.1517679287789335e-05, + "loss": 1.4001, + "mean_token_accuracy": 0.6509411931037903, + "num_tokens": 1517506632.0, + "step": 9054 + }, + { + "entropy": 1.6614431242148082, + "epoch": 0.9947543324819422, + "grad_norm": 0.6814653277397156, + "learning_rate": 1.1516080801677662e-05, + "loss": 1.4424, + "mean_token_accuracy": 0.6618533333142599, + "num_tokens": 1517654638.0, + "step": 9055 + }, + { + "entropy": 1.6925993263721466, + "epoch": 0.9948641893933152, + "grad_norm": 0.6455976963043213, + "learning_rate": 1.1514482299232266e-05, + "loss": 1.3165, + "mean_token_accuracy": 0.6803247978289922, + "num_tokens": 1517787177.0, + "step": 9056 + }, + { + "entropy": 1.693973034620285, + "epoch": 0.9949740463046881, + "grad_norm": 0.8472748398780823, + "learning_rate": 1.1512883780503737e-05, + "loss": 1.4849, + "mean_token_accuracy": 0.6628153622150421, + "num_tokens": 1517914092.0, + "step": 9057 + }, + { + "entropy": 1.6954384346803029, + "epoch": 0.9950839032160611, + "grad_norm": 0.6717925667762756, + "learning_rate": 1.1511285245542663e-05, + "loss": 1.3713, + "mean_token_accuracy": 0.6591513852278391, + "num_tokens": 1518076723.0, + "step": 9058 + }, + { + "entropy": 1.6373221576213837, + "epoch": 0.995193760127434, + "grad_norm": 0.7058557271957397, + "learning_rate": 1.1509686694399647e-05, + "loss": 1.2549, + "mean_token_accuracy": 0.6811738759279251, + "num_tokens": 1518196658.0, + "step": 9059 + }, + { + "entropy": 1.7418764730294545, + "epoch": 0.995303617038807, + "grad_norm": 0.642804741859436, + "learning_rate": 1.1508088127125274e-05, + "loss": 1.4527, + "mean_token_accuracy": 0.6466216047604879, + "num_tokens": 1518384038.0, + "step": 9060 + }, + { + "entropy": 1.6919652024904888, + "epoch": 0.9954134739501799, + "grad_norm": 2.170675039291382, + "learning_rate": 1.150648954377014e-05, + "loss": 1.4643, + "mean_token_accuracy": 0.6461801479260126, + "num_tokens": 1518542211.0, + "step": 9061 + }, + { + "entropy": 1.7008472084999084, + "epoch": 0.9955233308615529, + "grad_norm": 0.6365600228309631, + "learning_rate": 1.150489094438484e-05, + "loss": 1.482, + "mean_token_accuracy": 0.6452033768097559, + "num_tokens": 1518761319.0, + "step": 9062 + }, + { + "entropy": 1.753299355506897, + "epoch": 0.9956331877729258, + "grad_norm": 0.6528931260108948, + "learning_rate": 1.1503292329019972e-05, + "loss": 1.4625, + "mean_token_accuracy": 0.6458161721626917, + "num_tokens": 1518936829.0, + "step": 9063 + }, + { + "entropy": 1.6954053243001301, + "epoch": 0.9957430446842988, + "grad_norm": 0.788202166557312, + "learning_rate": 1.1501693697726126e-05, + "loss": 1.4025, + "mean_token_accuracy": 0.6617314616839091, + "num_tokens": 1519101793.0, + "step": 9064 + }, + { + "entropy": 1.6784932613372803, + "epoch": 0.9958529015956716, + "grad_norm": 0.5653716325759888, + "learning_rate": 1.1500095050553901e-05, + "loss": 1.4645, + "mean_token_accuracy": 0.6363365004460017, + "num_tokens": 1519305956.0, + "step": 9065 + }, + { + "entropy": 1.6899797419706981, + "epoch": 0.9959627585070445, + "grad_norm": 0.6082383990287781, + "learning_rate": 1.1498496387553892e-05, + "loss": 1.4347, + "mean_token_accuracy": 0.6532058666149775, + "num_tokens": 1519531519.0, + "step": 9066 + }, + { + "entropy": 1.676472971836726, + "epoch": 0.9960726154184175, + "grad_norm": 0.6178333759307861, + "learning_rate": 1.1496897708776703e-05, + "loss": 1.3604, + "mean_token_accuracy": 0.6715668042500814, + "num_tokens": 1519708888.0, + "step": 9067 + }, + { + "entropy": 1.7519252399603527, + "epoch": 0.9961824723297904, + "grad_norm": 0.8261198401451111, + "learning_rate": 1.1495299014272916e-05, + "loss": 1.6367, + "mean_token_accuracy": 0.6203742722670237, + "num_tokens": 1519896103.0, + "step": 9068 + }, + { + "entropy": 1.6828113396962483, + "epoch": 0.9962923292411634, + "grad_norm": 0.6337330937385559, + "learning_rate": 1.1493700304093146e-05, + "loss": 1.4801, + "mean_token_accuracy": 0.6396347731351852, + "num_tokens": 1520084614.0, + "step": 9069 + }, + { + "entropy": 1.724067787329356, + "epoch": 0.9964021861525363, + "grad_norm": 0.5894656181335449, + "learning_rate": 1.149210157828798e-05, + "loss": 1.4314, + "mean_token_accuracy": 0.6514023790756861, + "num_tokens": 1520279341.0, + "step": 9070 + }, + { + "entropy": 1.6811665495236714, + "epoch": 0.9965120430639093, + "grad_norm": 0.6790025234222412, + "learning_rate": 1.1490502836908022e-05, + "loss": 1.3181, + "mean_token_accuracy": 0.6632300714651743, + "num_tokens": 1520407513.0, + "step": 9071 + }, + { + "entropy": 1.6569512685139973, + "epoch": 0.9966218999752822, + "grad_norm": 0.6046280264854431, + "learning_rate": 1.1488904080003868e-05, + "loss": 1.3662, + "mean_token_accuracy": 0.661707783738772, + "num_tokens": 1520561581.0, + "step": 9072 + }, + { + "entropy": 1.7094461222489674, + "epoch": 0.9967317568866552, + "grad_norm": 0.6875105500221252, + "learning_rate": 1.1487305307626125e-05, + "loss": 1.267, + "mean_token_accuracy": 0.667315478126208, + "num_tokens": 1520702575.0, + "step": 9073 + }, + { + "entropy": 1.737904926141103, + "epoch": 0.996841613798028, + "grad_norm": 0.7653663158416748, + "learning_rate": 1.1485706519825384e-05, + "loss": 1.4828, + "mean_token_accuracy": 0.6492100208997726, + "num_tokens": 1520851927.0, + "step": 9074 + }, + { + "entropy": 1.6922054886817932, + "epoch": 0.996951470709401, + "grad_norm": 0.7423431277275085, + "learning_rate": 1.1484107716652256e-05, + "loss": 1.4411, + "mean_token_accuracy": 0.6581322699785233, + "num_tokens": 1520996024.0, + "step": 9075 + }, + { + "entropy": 1.7260395387808483, + "epoch": 0.9970613276207739, + "grad_norm": 0.6210670471191406, + "learning_rate": 1.148250889815733e-05, + "loss": 1.5119, + "mean_token_accuracy": 0.6369271477063497, + "num_tokens": 1521257653.0, + "step": 9076 + }, + { + "entropy": 1.7174350917339325, + "epoch": 0.9971711845321469, + "grad_norm": 0.69709712266922, + "learning_rate": 1.148091006439122e-05, + "loss": 1.3194, + "mean_token_accuracy": 0.6692462513844172, + "num_tokens": 1521373358.0, + "step": 9077 + }, + { + "entropy": 1.6322135925292969, + "epoch": 0.9972810414435198, + "grad_norm": 0.6395667791366577, + "learning_rate": 1.1479311215404518e-05, + "loss": 1.4847, + "mean_token_accuracy": 0.6553497264782587, + "num_tokens": 1521538626.0, + "step": 9078 + }, + { + "entropy": 1.657319446404775, + "epoch": 0.9973908983548928, + "grad_norm": 0.6808715462684631, + "learning_rate": 1.1477712351247839e-05, + "loss": 1.3141, + "mean_token_accuracy": 0.6693860242764155, + "num_tokens": 1521697083.0, + "step": 9079 + }, + { + "entropy": 1.6426800390084584, + "epoch": 0.9975007552662657, + "grad_norm": 0.5870410203933716, + "learning_rate": 1.1476113471971773e-05, + "loss": 1.3934, + "mean_token_accuracy": 0.6434798091650009, + "num_tokens": 1521871698.0, + "step": 9080 + }, + { + "entropy": 1.7074416776498158, + "epoch": 0.9976106121776386, + "grad_norm": 0.6354272961616516, + "learning_rate": 1.1474514577626934e-05, + "loss": 1.437, + "mean_token_accuracy": 0.6434929817914963, + "num_tokens": 1522047384.0, + "step": 9081 + }, + { + "entropy": 1.749830315510432, + "epoch": 0.9977204690890116, + "grad_norm": 0.7293029427528381, + "learning_rate": 1.147291566826392e-05, + "loss": 1.3922, + "mean_token_accuracy": 0.6497508933146795, + "num_tokens": 1522218336.0, + "step": 9082 + }, + { + "entropy": 1.717184990644455, + "epoch": 0.9978303260003845, + "grad_norm": 0.6962136626243591, + "learning_rate": 1.1471316743933339e-05, + "loss": 1.341, + "mean_token_accuracy": 0.662678599357605, + "num_tokens": 1522347386.0, + "step": 9083 + }, + { + "entropy": 1.7010120153427124, + "epoch": 0.9979401829117575, + "grad_norm": 0.6533600091934204, + "learning_rate": 1.1469717804685795e-05, + "loss": 1.142, + "mean_token_accuracy": 0.6681769291559855, + "num_tokens": 1522515121.0, + "step": 9084 + }, + { + "entropy": 1.7459450960159302, + "epoch": 0.9980500398231303, + "grad_norm": 0.655548632144928, + "learning_rate": 1.1468118850571899e-05, + "loss": 1.3486, + "mean_token_accuracy": 0.6624786804119746, + "num_tokens": 1522668009.0, + "step": 9085 + }, + { + "entropy": 1.7290511826674144, + "epoch": 0.9981598967345033, + "grad_norm": 0.7485929131507874, + "learning_rate": 1.1466519881642246e-05, + "loss": 1.3893, + "mean_token_accuracy": 0.6591125130653381, + "num_tokens": 1522843719.0, + "step": 9086 + }, + { + "entropy": 1.6899711390336354, + "epoch": 0.9982697536458762, + "grad_norm": 0.6839129328727722, + "learning_rate": 1.146492089794745e-05, + "loss": 1.3313, + "mean_token_accuracy": 0.673391396800677, + "num_tokens": 1522991970.0, + "step": 9087 + }, + { + "entropy": 1.7311389843622844, + "epoch": 0.9983796105572492, + "grad_norm": 0.6621695756912231, + "learning_rate": 1.1463321899538117e-05, + "loss": 1.4378, + "mean_token_accuracy": 0.6520956506331762, + "num_tokens": 1523146080.0, + "step": 9088 + }, + { + "entropy": 1.7134305437405903, + "epoch": 0.9984894674686221, + "grad_norm": 0.5507893562316895, + "learning_rate": 1.1461722886464856e-05, + "loss": 1.5207, + "mean_token_accuracy": 0.6198769162098566, + "num_tokens": 1523399266.0, + "step": 9089 + }, + { + "entropy": 1.7272109687328339, + "epoch": 0.9985993243799951, + "grad_norm": 0.9652552008628845, + "learning_rate": 1.1460123858778276e-05, + "loss": 1.5345, + "mean_token_accuracy": 0.6346574972073237, + "num_tokens": 1523564638.0, + "step": 9090 + }, + { + "entropy": 1.7457146843274434, + "epoch": 0.998709181291368, + "grad_norm": 0.7757859230041504, + "learning_rate": 1.1458524816528981e-05, + "loss": 1.5039, + "mean_token_accuracy": 0.6465511868397394, + "num_tokens": 1523748062.0, + "step": 9091 + }, + { + "entropy": 1.6893859306971233, + "epoch": 0.998819038202741, + "grad_norm": 0.697685718536377, + "learning_rate": 1.1456925759767582e-05, + "loss": 1.4123, + "mean_token_accuracy": 0.6497365186611811, + "num_tokens": 1523895192.0, + "step": 9092 + }, + { + "entropy": 1.6923041641712189, + "epoch": 0.9989288951141139, + "grad_norm": 0.7961398363113403, + "learning_rate": 1.1455326688544688e-05, + "loss": 1.3345, + "mean_token_accuracy": 0.6674275547266006, + "num_tokens": 1524050097.0, + "step": 9093 + }, + { + "entropy": 1.6818099617958069, + "epoch": 0.9990387520254868, + "grad_norm": 0.6849196553230286, + "learning_rate": 1.1453727602910909e-05, + "loss": 1.3496, + "mean_token_accuracy": 0.6572927534580231, + "num_tokens": 1524209256.0, + "step": 9094 + }, + { + "entropy": 1.6935460070768993, + "epoch": 0.9991486089368598, + "grad_norm": 0.6596961617469788, + "learning_rate": 1.145212850291686e-05, + "loss": 1.2704, + "mean_token_accuracy": 0.6796439737081528, + "num_tokens": 1524398541.0, + "step": 9095 + }, + { + "entropy": 1.6655145784219105, + "epoch": 0.9992584658482326, + "grad_norm": 0.7738831043243408, + "learning_rate": 1.1450529388613144e-05, + "loss": 1.4704, + "mean_token_accuracy": 0.6514392644166946, + "num_tokens": 1524586535.0, + "step": 9096 + }, + { + "entropy": 1.635209560394287, + "epoch": 0.9993683227596056, + "grad_norm": 0.6728395223617554, + "learning_rate": 1.1448930260050375e-05, + "loss": 1.2365, + "mean_token_accuracy": 0.6900093406438828, + "num_tokens": 1524762897.0, + "step": 9097 + }, + { + "entropy": 1.71544353167216, + "epoch": 0.9994781796709785, + "grad_norm": 0.8286552429199219, + "learning_rate": 1.1447331117279168e-05, + "loss": 1.3268, + "mean_token_accuracy": 0.6621815711259842, + "num_tokens": 1524990809.0, + "step": 9098 + }, + { + "entropy": 1.6993728578090668, + "epoch": 0.9995880365823515, + "grad_norm": 0.6080448031425476, + "learning_rate": 1.144573196035013e-05, + "loss": 1.4415, + "mean_token_accuracy": 0.6439933578173319, + "num_tokens": 1525244055.0, + "step": 9099 + }, + { + "entropy": 1.7140068113803864, + "epoch": 0.9996978934937244, + "grad_norm": 0.7794548869132996, + "learning_rate": 1.144413278931388e-05, + "loss": 1.3395, + "mean_token_accuracy": 0.6707132905721664, + "num_tokens": 1525433832.0, + "step": 9100 + }, + { + "entropy": 1.800976832707723, + "epoch": 0.9998077504050974, + "grad_norm": 0.6397004723548889, + "learning_rate": 1.1442533604221025e-05, + "loss": 1.4958, + "mean_token_accuracy": 0.6379488656918207, + "num_tokens": 1525669327.0, + "step": 9101 + }, + { + "entropy": 1.7914798657099407, + "epoch": 0.9999176073164703, + "grad_norm": 0.7445523738861084, + "learning_rate": 1.144093440512218e-05, + "loss": 1.3949, + "mean_token_accuracy": 0.6527659147977829, + "num_tokens": 1525797142.0, + "step": 9102 + }, + { + "entropy": 1.7685750590430365, + "epoch": 1.0, + "grad_norm": 0.8221209049224854, + "learning_rate": 1.1439335192067961e-05, + "loss": 1.3113, + "mean_token_accuracy": 0.6720441844728258, + "num_tokens": 1525864289.0, + "step": 9103 + }, + { + "entropy": 1.7487789193789165, + "epoch": 1.0001098569113729, + "grad_norm": 0.6469790935516357, + "learning_rate": 1.1437735965108982e-05, + "loss": 1.3983, + "mean_token_accuracy": 0.6585537244876226, + "num_tokens": 1526037039.0, + "step": 9104 + }, + { + "entropy": 1.7090687155723572, + "epoch": 1.0002197138227458, + "grad_norm": 0.6354356408119202, + "learning_rate": 1.1436136724295855e-05, + "loss": 1.3507, + "mean_token_accuracy": 0.6568548729022344, + "num_tokens": 1526183317.0, + "step": 9105 + }, + { + "entropy": 1.778639147679011, + "epoch": 1.0003295707341189, + "grad_norm": 0.6832847595214844, + "learning_rate": 1.1434537469679197e-05, + "loss": 1.5105, + "mean_token_accuracy": 0.6421279708544413, + "num_tokens": 1526347692.0, + "step": 9106 + }, + { + "entropy": 1.671216607093811, + "epoch": 1.0004394276454918, + "grad_norm": 0.6764916181564331, + "learning_rate": 1.1432938201309627e-05, + "loss": 1.4491, + "mean_token_accuracy": 0.6480690489212672, + "num_tokens": 1526518664.0, + "step": 9107 + }, + { + "entropy": 1.6947729587554932, + "epoch": 1.0005492845568646, + "grad_norm": 0.7800838947296143, + "learning_rate": 1.1431338919237753e-05, + "loss": 1.4244, + "mean_token_accuracy": 0.6530092557271322, + "num_tokens": 1526722405.0, + "step": 9108 + }, + { + "entropy": 1.76162455479304, + "epoch": 1.0006591414682375, + "grad_norm": 0.6866593360900879, + "learning_rate": 1.1429739623514202e-05, + "loss": 1.3098, + "mean_token_accuracy": 0.6589891264835993, + "num_tokens": 1526831942.0, + "step": 9109 + }, + { + "entropy": 1.706797569990158, + "epoch": 1.0007689983796106, + "grad_norm": 0.6604471802711487, + "learning_rate": 1.1428140314189581e-05, + "loss": 1.5449, + "mean_token_accuracy": 0.6446791191895803, + "num_tokens": 1527004873.0, + "step": 9110 + }, + { + "entropy": 1.7111522555351257, + "epoch": 1.0008788552909835, + "grad_norm": 0.7985087633132935, + "learning_rate": 1.1426540991314516e-05, + "loss": 1.2653, + "mean_token_accuracy": 0.6687728961308798, + "num_tokens": 1527118121.0, + "step": 9111 + }, + { + "entropy": 1.7522371212641399, + "epoch": 1.0009887122023564, + "grad_norm": 0.6903984546661377, + "learning_rate": 1.1424941654939619e-05, + "loss": 1.4082, + "mean_token_accuracy": 0.6500783811012903, + "num_tokens": 1527289842.0, + "step": 9112 + }, + { + "entropy": 1.6945828100045521, + "epoch": 1.0010985691137293, + "grad_norm": 0.9061050415039062, + "learning_rate": 1.1423342305115512e-05, + "loss": 1.3049, + "mean_token_accuracy": 0.669381340344747, + "num_tokens": 1527431687.0, + "step": 9113 + }, + { + "entropy": 1.6780508855978649, + "epoch": 1.0012084260251024, + "grad_norm": 0.631228506565094, + "learning_rate": 1.1421742941892808e-05, + "loss": 1.3588, + "mean_token_accuracy": 0.6521518329779307, + "num_tokens": 1527618917.0, + "step": 9114 + }, + { + "entropy": 1.707773486773173, + "epoch": 1.0013182829364753, + "grad_norm": 0.7094998359680176, + "learning_rate": 1.1420143565322132e-05, + "loss": 1.4832, + "mean_token_accuracy": 0.6415019631385803, + "num_tokens": 1527828488.0, + "step": 9115 + }, + { + "entropy": 1.6614188154538472, + "epoch": 1.0014281398478482, + "grad_norm": 0.7236858010292053, + "learning_rate": 1.1418544175454103e-05, + "loss": 1.23, + "mean_token_accuracy": 0.6863870620727539, + "num_tokens": 1527993879.0, + "step": 9116 + }, + { + "entropy": 1.6892668704191844, + "epoch": 1.001537996759221, + "grad_norm": 0.6846614480018616, + "learning_rate": 1.1416944772339335e-05, + "loss": 1.3371, + "mean_token_accuracy": 0.6594837407271067, + "num_tokens": 1528128935.0, + "step": 9117 + }, + { + "entropy": 1.7514809270699818, + "epoch": 1.001647853670594, + "grad_norm": 0.7638584971427917, + "learning_rate": 1.1415345356028458e-05, + "loss": 1.4151, + "mean_token_accuracy": 0.6582140922546387, + "num_tokens": 1528297300.0, + "step": 9118 + }, + { + "entropy": 1.6739897926648457, + "epoch": 1.001757710581967, + "grad_norm": 0.6910973787307739, + "learning_rate": 1.1413745926572086e-05, + "loss": 1.3261, + "mean_token_accuracy": 0.6593633989493052, + "num_tokens": 1528434737.0, + "step": 9119 + }, + { + "entropy": 1.6245358089605968, + "epoch": 1.00186756749334, + "grad_norm": 0.6937578916549683, + "learning_rate": 1.1412146484020841e-05, + "loss": 1.3433, + "mean_token_accuracy": 0.6818140596151352, + "num_tokens": 1528594319.0, + "step": 9120 + }, + { + "entropy": 1.780480186144511, + "epoch": 1.0019774244047128, + "grad_norm": 0.7517448663711548, + "learning_rate": 1.1410547028425345e-05, + "loss": 1.341, + "mean_token_accuracy": 0.6648537566264471, + "num_tokens": 1528698660.0, + "step": 9121 + }, + { + "entropy": 1.7274539073308308, + "epoch": 1.0020872813160857, + "grad_norm": 0.8150880336761475, + "learning_rate": 1.140894755983622e-05, + "loss": 1.4617, + "mean_token_accuracy": 0.6476200868686041, + "num_tokens": 1528869853.0, + "step": 9122 + }, + { + "entropy": 1.6871586740016937, + "epoch": 1.0021971382274588, + "grad_norm": 0.6788790225982666, + "learning_rate": 1.1407348078304094e-05, + "loss": 1.4679, + "mean_token_accuracy": 0.6567584524552027, + "num_tokens": 1529051202.0, + "step": 9123 + }, + { + "entropy": 1.6829076210657756, + "epoch": 1.0023069951388317, + "grad_norm": 0.5760919451713562, + "learning_rate": 1.1405748583879578e-05, + "loss": 1.4402, + "mean_token_accuracy": 0.65225517253081, + "num_tokens": 1529253732.0, + "step": 9124 + }, + { + "entropy": 1.6780024766921997, + "epoch": 1.0024168520502046, + "grad_norm": 0.6377182006835938, + "learning_rate": 1.1404149076613307e-05, + "loss": 1.3531, + "mean_token_accuracy": 0.6596865554650625, + "num_tokens": 1529399965.0, + "step": 9125 + }, + { + "entropy": 1.7208391726016998, + "epoch": 1.0025267089615775, + "grad_norm": 0.6458075046539307, + "learning_rate": 1.1402549556555897e-05, + "loss": 1.5011, + "mean_token_accuracy": 0.6317235877116522, + "num_tokens": 1529603450.0, + "step": 9126 + }, + { + "entropy": 1.6888208488623302, + "epoch": 1.0026365658729506, + "grad_norm": 0.9376909732818604, + "learning_rate": 1.1400950023757974e-05, + "loss": 1.4523, + "mean_token_accuracy": 0.6374652137358984, + "num_tokens": 1529778344.0, + "step": 9127 + }, + { + "entropy": 1.6820887227853139, + "epoch": 1.0027464227843235, + "grad_norm": 0.6910247206687927, + "learning_rate": 1.1399350478270169e-05, + "loss": 1.2399, + "mean_token_accuracy": 0.678116371234258, + "num_tokens": 1529879742.0, + "step": 9128 + }, + { + "entropy": 1.6896148324012756, + "epoch": 1.0028562796956964, + "grad_norm": 0.8987225890159607, + "learning_rate": 1.1397750920143096e-05, + "loss": 1.1534, + "mean_token_accuracy": 0.6927760044733683, + "num_tokens": 1530010877.0, + "step": 9129 + }, + { + "entropy": 1.6502757966518402, + "epoch": 1.0029661366070692, + "grad_norm": 0.7082890272140503, + "learning_rate": 1.1396151349427386e-05, + "loss": 1.3722, + "mean_token_accuracy": 0.6598286827405294, + "num_tokens": 1530152017.0, + "step": 9130 + }, + { + "entropy": 1.6588487525780995, + "epoch": 1.0030759935184421, + "grad_norm": 0.6771763563156128, + "learning_rate": 1.1394551766173668e-05, + "loss": 1.4409, + "mean_token_accuracy": 0.649484987060229, + "num_tokens": 1530341414.0, + "step": 9131 + }, + { + "entropy": 1.6604489584763844, + "epoch": 1.0031858504298152, + "grad_norm": 0.6896274089813232, + "learning_rate": 1.1392952170432561e-05, + "loss": 1.4544, + "mean_token_accuracy": 0.6518164028724035, + "num_tokens": 1530547366.0, + "step": 9132 + }, + { + "entropy": 1.7379337052504222, + "epoch": 1.0032957073411881, + "grad_norm": 0.7070825695991516, + "learning_rate": 1.1391352562254696e-05, + "loss": 1.4681, + "mean_token_accuracy": 0.6329749723275503, + "num_tokens": 1530710763.0, + "step": 9133 + }, + { + "entropy": 1.7117513318856556, + "epoch": 1.003405564252561, + "grad_norm": 0.752705991268158, + "learning_rate": 1.1389752941690698e-05, + "loss": 1.3212, + "mean_token_accuracy": 0.6719024926424026, + "num_tokens": 1530851989.0, + "step": 9134 + }, + { + "entropy": 1.6691398521264393, + "epoch": 1.0035154211639339, + "grad_norm": 0.6442150473594666, + "learning_rate": 1.1388153308791196e-05, + "loss": 1.2853, + "mean_token_accuracy": 0.6734344561894735, + "num_tokens": 1531026173.0, + "step": 9135 + }, + { + "entropy": 1.7129848897457123, + "epoch": 1.003625278075307, + "grad_norm": 0.643925130367279, + "learning_rate": 1.1386553663606816e-05, + "loss": 1.3813, + "mean_token_accuracy": 0.6649036655823389, + "num_tokens": 1531167599.0, + "step": 9136 + }, + { + "entropy": 1.7421828111012776, + "epoch": 1.0037351349866799, + "grad_norm": 0.7078534960746765, + "learning_rate": 1.1384954006188188e-05, + "loss": 1.4009, + "mean_token_accuracy": 0.6498052229483923, + "num_tokens": 1531284335.0, + "step": 9137 + }, + { + "entropy": 1.6620370745658875, + "epoch": 1.0038449918980528, + "grad_norm": 0.6959486603736877, + "learning_rate": 1.1383354336585939e-05, + "loss": 1.2479, + "mean_token_accuracy": 0.6867090910673141, + "num_tokens": 1531411606.0, + "step": 9138 + }, + { + "entropy": 1.7095261812210083, + "epoch": 1.0039548488094256, + "grad_norm": 0.6359619498252869, + "learning_rate": 1.1381754654850696e-05, + "loss": 1.4088, + "mean_token_accuracy": 0.6495067228873571, + "num_tokens": 1531650481.0, + "step": 9139 + }, + { + "entropy": 1.7148667971293132, + "epoch": 1.0040647057207988, + "grad_norm": 0.626611053943634, + "learning_rate": 1.1380154961033091e-05, + "loss": 1.4655, + "mean_token_accuracy": 0.6441022356351217, + "num_tokens": 1531860240.0, + "step": 9140 + }, + { + "entropy": 1.6672312021255493, + "epoch": 1.0041745626321716, + "grad_norm": 0.6994165182113647, + "learning_rate": 1.1378555255183756e-05, + "loss": 1.3451, + "mean_token_accuracy": 0.6728704025348028, + "num_tokens": 1532028027.0, + "step": 9141 + }, + { + "entropy": 1.6781473656495411, + "epoch": 1.0042844195435445, + "grad_norm": 0.6643490791320801, + "learning_rate": 1.1376955537353314e-05, + "loss": 1.3587, + "mean_token_accuracy": 0.6705238421758016, + "num_tokens": 1532185194.0, + "step": 9142 + }, + { + "entropy": 1.695909669001897, + "epoch": 1.0043942764549174, + "grad_norm": 0.7049947381019592, + "learning_rate": 1.1375355807592401e-05, + "loss": 1.2133, + "mean_token_accuracy": 0.6815102944771448, + "num_tokens": 1532292206.0, + "step": 9143 + }, + { + "entropy": 1.6645172834396362, + "epoch": 1.0045041333662903, + "grad_norm": 0.8290209174156189, + "learning_rate": 1.1373756065951645e-05, + "loss": 1.29, + "mean_token_accuracy": 0.6677450140317281, + "num_tokens": 1532417707.0, + "step": 9144 + }, + { + "entropy": 1.7276211281617482, + "epoch": 1.0046139902776634, + "grad_norm": 0.6277461051940918, + "learning_rate": 1.1372156312481676e-05, + "loss": 1.4016, + "mean_token_accuracy": 0.6529937634865443, + "num_tokens": 1532596038.0, + "step": 9145 + }, + { + "entropy": 1.6773775219917297, + "epoch": 1.0047238471890363, + "grad_norm": 0.5622502565383911, + "learning_rate": 1.1370556547233129e-05, + "loss": 1.3817, + "mean_token_accuracy": 0.6580819934606552, + "num_tokens": 1532837671.0, + "step": 9146 + }, + { + "entropy": 1.6747903128465016, + "epoch": 1.0048337041004092, + "grad_norm": 0.7736158967018127, + "learning_rate": 1.1368956770256636e-05, + "loss": 1.43, + "mean_token_accuracy": 0.6633151968320211, + "num_tokens": 1532987212.0, + "step": 9147 + }, + { + "entropy": 1.6907167434692383, + "epoch": 1.004943561011782, + "grad_norm": 0.6770092248916626, + "learning_rate": 1.1367356981602824e-05, + "loss": 1.315, + "mean_token_accuracy": 0.6748213569323221, + "num_tokens": 1533166077.0, + "step": 9148 + }, + { + "entropy": 1.630173772573471, + "epoch": 1.0050534179231552, + "grad_norm": 0.7909526824951172, + "learning_rate": 1.1365757181322332e-05, + "loss": 1.2256, + "mean_token_accuracy": 0.676344245672226, + "num_tokens": 1533285561.0, + "step": 9149 + }, + { + "entropy": 1.6531668106714885, + "epoch": 1.005163274834528, + "grad_norm": 0.7197229266166687, + "learning_rate": 1.1364157369465791e-05, + "loss": 1.3322, + "mean_token_accuracy": 0.6645158727963766, + "num_tokens": 1533446931.0, + "step": 9150 + }, + { + "entropy": 1.7534123659133911, + "epoch": 1.005273131745901, + "grad_norm": 0.7781052589416504, + "learning_rate": 1.136255754608383e-05, + "loss": 1.3887, + "mean_token_accuracy": 0.6418828169504801, + "num_tokens": 1533605019.0, + "step": 9151 + }, + { + "entropy": 1.711018721262614, + "epoch": 1.0053829886572738, + "grad_norm": 0.6242640018463135, + "learning_rate": 1.1360957711227087e-05, + "loss": 1.3331, + "mean_token_accuracy": 0.658738394578298, + "num_tokens": 1533756344.0, + "step": 9152 + }, + { + "entropy": 1.7103002866109211, + "epoch": 1.005492845568647, + "grad_norm": 0.7828008532524109, + "learning_rate": 1.1359357864946197e-05, + "loss": 1.4497, + "mean_token_accuracy": 0.6512656211853027, + "num_tokens": 1533940363.0, + "step": 9153 + }, + { + "entropy": 1.7072757482528687, + "epoch": 1.0056027024800198, + "grad_norm": 0.6864430904388428, + "learning_rate": 1.135775800729179e-05, + "loss": 1.4115, + "mean_token_accuracy": 0.6535514990488688, + "num_tokens": 1534129155.0, + "step": 9154 + }, + { + "entropy": 1.645534763733546, + "epoch": 1.0057125593913927, + "grad_norm": 0.6666370630264282, + "learning_rate": 1.1356158138314504e-05, + "loss": 1.3353, + "mean_token_accuracy": 0.6572008977333704, + "num_tokens": 1534352556.0, + "step": 9155 + }, + { + "entropy": 1.7274697025616963, + "epoch": 1.0058224163027656, + "grad_norm": 0.762096107006073, + "learning_rate": 1.1354558258064974e-05, + "loss": 1.3457, + "mean_token_accuracy": 0.6579453647136688, + "num_tokens": 1534462733.0, + "step": 9156 + }, + { + "entropy": 1.7067709763844807, + "epoch": 1.0059322732141387, + "grad_norm": 0.7625720500946045, + "learning_rate": 1.1352958366593838e-05, + "loss": 1.1324, + "mean_token_accuracy": 0.7027974327405294, + "num_tokens": 1534566983.0, + "step": 9157 + }, + { + "entropy": 1.6603530049324036, + "epoch": 1.0060421301255116, + "grad_norm": 0.6020463705062866, + "learning_rate": 1.1351358463951722e-05, + "loss": 1.5009, + "mean_token_accuracy": 0.6399320860703787, + "num_tokens": 1534784810.0, + "step": 9158 + }, + { + "entropy": 1.7036788860956829, + "epoch": 1.0061519870368845, + "grad_norm": 0.6299261450767517, + "learning_rate": 1.1349758550189276e-05, + "loss": 1.4234, + "mean_token_accuracy": 0.6528842945893606, + "num_tokens": 1534972111.0, + "step": 9159 + }, + { + "entropy": 1.69377605120341, + "epoch": 1.0062618439482574, + "grad_norm": 0.7959760427474976, + "learning_rate": 1.1348158625357125e-05, + "loss": 1.3657, + "mean_token_accuracy": 0.6739930411179861, + "num_tokens": 1535118835.0, + "step": 9160 + }, + { + "entropy": 1.7244078516960144, + "epoch": 1.0063717008596302, + "grad_norm": 0.5939955115318298, + "learning_rate": 1.1346558689505911e-05, + "loss": 1.5987, + "mean_token_accuracy": 0.6203551987806956, + "num_tokens": 1535344945.0, + "step": 9161 + }, + { + "entropy": 1.6984511812527974, + "epoch": 1.0064815577710033, + "grad_norm": 0.6517001390457153, + "learning_rate": 1.134495874268627e-05, + "loss": 1.3917, + "mean_token_accuracy": 0.6546116669972738, + "num_tokens": 1535547924.0, + "step": 9162 + }, + { + "entropy": 1.765064944823583, + "epoch": 1.0065914146823762, + "grad_norm": 0.6505841612815857, + "learning_rate": 1.1343358784948841e-05, + "loss": 1.4628, + "mean_token_accuracy": 0.6358625143766403, + "num_tokens": 1535727355.0, + "step": 9163 + }, + { + "entropy": 1.652586172024409, + "epoch": 1.0067012715937491, + "grad_norm": 0.6075358986854553, + "learning_rate": 1.1341758816344261e-05, + "loss": 1.3771, + "mean_token_accuracy": 0.657783105969429, + "num_tokens": 1535890845.0, + "step": 9164 + }, + { + "entropy": 1.7130601306756337, + "epoch": 1.006811128505122, + "grad_norm": 0.6406556963920593, + "learning_rate": 1.1340158836923169e-05, + "loss": 1.3664, + "mean_token_accuracy": 0.6631237914164861, + "num_tokens": 1536024832.0, + "step": 9165 + }, + { + "entropy": 1.6731863021850586, + "epoch": 1.006920985416495, + "grad_norm": 0.6789574027061462, + "learning_rate": 1.1338558846736203e-05, + "loss": 1.5358, + "mean_token_accuracy": 0.644293467203776, + "num_tokens": 1536181406.0, + "step": 9166 + }, + { + "entropy": 1.7349721789360046, + "epoch": 1.007030842327868, + "grad_norm": 0.754366934299469, + "learning_rate": 1.1336958845834001e-05, + "loss": 1.3995, + "mean_token_accuracy": 0.6601565976937612, + "num_tokens": 1536312544.0, + "step": 9167 + }, + { + "entropy": 1.7296982606252034, + "epoch": 1.0071406992392409, + "grad_norm": 0.7296738624572754, + "learning_rate": 1.1335358834267202e-05, + "loss": 1.4222, + "mean_token_accuracy": 0.6510292192300161, + "num_tokens": 1536477655.0, + "step": 9168 + }, + { + "entropy": 1.6839477519194286, + "epoch": 1.0072505561506138, + "grad_norm": 0.7424401044845581, + "learning_rate": 1.1333758812086455e-05, + "loss": 1.4959, + "mean_token_accuracy": 0.6491817037264506, + "num_tokens": 1536682787.0, + "step": 9169 + }, + { + "entropy": 1.7023440599441528, + "epoch": 1.0073604130619869, + "grad_norm": 0.6158515810966492, + "learning_rate": 1.1332158779342382e-05, + "loss": 1.4231, + "mean_token_accuracy": 0.649179662267367, + "num_tokens": 1536874196.0, + "step": 9170 + }, + { + "entropy": 1.6747990051905315, + "epoch": 1.0074702699733598, + "grad_norm": 0.7934166789054871, + "learning_rate": 1.1330558736085639e-05, + "loss": 1.4705, + "mean_token_accuracy": 0.6424223830302557, + "num_tokens": 1537043636.0, + "step": 9171 + }, + { + "entropy": 1.7200193206469219, + "epoch": 1.0075801268847326, + "grad_norm": 0.625928521156311, + "learning_rate": 1.132895868236686e-05, + "loss": 1.3149, + "mean_token_accuracy": 0.6662678668896357, + "num_tokens": 1537192653.0, + "step": 9172 + }, + { + "entropy": 1.6934810976187389, + "epoch": 1.0076899837961055, + "grad_norm": 0.7504294514656067, + "learning_rate": 1.1327358618236686e-05, + "loss": 1.4699, + "mean_token_accuracy": 0.6619169364372889, + "num_tokens": 1537364617.0, + "step": 9173 + }, + { + "entropy": 1.6977708041667938, + "epoch": 1.0077998407074784, + "grad_norm": 0.7154932618141174, + "learning_rate": 1.132575854374576e-05, + "loss": 1.3676, + "mean_token_accuracy": 0.6595998754103979, + "num_tokens": 1537500964.0, + "step": 9174 + }, + { + "entropy": 1.661940226952235, + "epoch": 1.0079096976188515, + "grad_norm": 0.6199997067451477, + "learning_rate": 1.1324158458944724e-05, + "loss": 1.3844, + "mean_token_accuracy": 0.6616794715325037, + "num_tokens": 1537688873.0, + "step": 9175 + }, + { + "entropy": 1.7638680438200633, + "epoch": 1.0080195545302244, + "grad_norm": 0.6562539339065552, + "learning_rate": 1.1322558363884215e-05, + "loss": 1.5134, + "mean_token_accuracy": 0.6430147786935171, + "num_tokens": 1537938462.0, + "step": 9176 + }, + { + "entropy": 1.706565539042155, + "epoch": 1.0081294114415973, + "grad_norm": 0.7883732914924622, + "learning_rate": 1.1320958258614882e-05, + "loss": 1.4292, + "mean_token_accuracy": 0.6586494793494543, + "num_tokens": 1538090142.0, + "step": 9177 + }, + { + "entropy": 1.7051230370998383, + "epoch": 1.0082392683529702, + "grad_norm": 0.635830283164978, + "learning_rate": 1.1319358143187364e-05, + "loss": 1.4372, + "mean_token_accuracy": 0.6473241200049719, + "num_tokens": 1538303825.0, + "step": 9178 + }, + { + "entropy": 1.6625941793123882, + "epoch": 1.0083491252643433, + "grad_norm": 0.6459980607032776, + "learning_rate": 1.1317758017652304e-05, + "loss": 1.3858, + "mean_token_accuracy": 0.6522909849882126, + "num_tokens": 1538515668.0, + "step": 9179 + }, + { + "entropy": 1.69016495347023, + "epoch": 1.0084589821757162, + "grad_norm": 0.8300599455833435, + "learning_rate": 1.1316157882060347e-05, + "loss": 1.4209, + "mean_token_accuracy": 0.6421416252851486, + "num_tokens": 1538721803.0, + "step": 9180 + }, + { + "entropy": 1.7026409308115642, + "epoch": 1.008568839087089, + "grad_norm": 0.7141305804252625, + "learning_rate": 1.131455773646214e-05, + "loss": 1.3456, + "mean_token_accuracy": 0.6597028175989786, + "num_tokens": 1538867063.0, + "step": 9181 + }, + { + "entropy": 1.6708631614844005, + "epoch": 1.008678695998462, + "grad_norm": 0.7779679298400879, + "learning_rate": 1.1312957580908316e-05, + "loss": 1.399, + "mean_token_accuracy": 0.6662464737892151, + "num_tokens": 1539025340.0, + "step": 9182 + }, + { + "entropy": 1.7861079672972362, + "epoch": 1.008788552909835, + "grad_norm": 0.7467420697212219, + "learning_rate": 1.1311357415449527e-05, + "loss": 1.409, + "mean_token_accuracy": 0.6420427312453588, + "num_tokens": 1539177337.0, + "step": 9183 + }, + { + "entropy": 1.6625968714555104, + "epoch": 1.008898409821208, + "grad_norm": 0.7059647440910339, + "learning_rate": 1.1309757240136416e-05, + "loss": 1.3722, + "mean_token_accuracy": 0.6587095757325491, + "num_tokens": 1539350824.0, + "step": 9184 + }, + { + "entropy": 1.6715769072373707, + "epoch": 1.0090082667325808, + "grad_norm": 0.8573365211486816, + "learning_rate": 1.130815705501963e-05, + "loss": 1.2702, + "mean_token_accuracy": 0.6697567055622736, + "num_tokens": 1539515545.0, + "step": 9185 + }, + { + "entropy": 1.6940159698327382, + "epoch": 1.0091181236439537, + "grad_norm": 0.7451865673065186, + "learning_rate": 1.1306556860149807e-05, + "loss": 1.2517, + "mean_token_accuracy": 0.6819993555545807, + "num_tokens": 1539654828.0, + "step": 9186 + }, + { + "entropy": 1.6895244518915813, + "epoch": 1.0092279805553266, + "grad_norm": 0.662607729434967, + "learning_rate": 1.1304956655577603e-05, + "loss": 1.498, + "mean_token_accuracy": 0.6299208501974741, + "num_tokens": 1539887429.0, + "step": 9187 + }, + { + "entropy": 1.7312080164750416, + "epoch": 1.0093378374666997, + "grad_norm": 0.7649029493331909, + "learning_rate": 1.1303356441353657e-05, + "loss": 1.2637, + "mean_token_accuracy": 0.6721003005901972, + "num_tokens": 1540007181.0, + "step": 9188 + }, + { + "entropy": 1.6537673771381378, + "epoch": 1.0094476943780726, + "grad_norm": 0.6279980540275574, + "learning_rate": 1.1301756217528617e-05, + "loss": 1.4753, + "mean_token_accuracy": 0.6382663249969482, + "num_tokens": 1540226678.0, + "step": 9189 + }, + { + "entropy": 1.717966636021932, + "epoch": 1.0095575512894455, + "grad_norm": 0.6371181607246399, + "learning_rate": 1.1300155984153125e-05, + "loss": 1.5536, + "mean_token_accuracy": 0.6400111019611359, + "num_tokens": 1540398417.0, + "step": 9190 + }, + { + "entropy": 1.7388447523117065, + "epoch": 1.0096674082008184, + "grad_norm": 0.6854660511016846, + "learning_rate": 1.1298555741277837e-05, + "loss": 1.4315, + "mean_token_accuracy": 0.6481965134541193, + "num_tokens": 1540574257.0, + "step": 9191 + }, + { + "entropy": 1.7373858094215393, + "epoch": 1.0097772651121915, + "grad_norm": 0.8039863705635071, + "learning_rate": 1.1296955488953385e-05, + "loss": 1.5048, + "mean_token_accuracy": 0.6509123841921488, + "num_tokens": 1540737286.0, + "step": 9192 + }, + { + "entropy": 1.733175406853358, + "epoch": 1.0098871220235643, + "grad_norm": 0.7755190134048462, + "learning_rate": 1.1295355227230434e-05, + "loss": 1.4932, + "mean_token_accuracy": 0.6399683107932409, + "num_tokens": 1540938739.0, + "step": 9193 + }, + { + "entropy": 1.7742779056231182, + "epoch": 1.0099969789349372, + "grad_norm": 0.8532186150550842, + "learning_rate": 1.1293754956159622e-05, + "loss": 1.2785, + "mean_token_accuracy": 0.681659941871961, + "num_tokens": 1541076692.0, + "step": 9194 + }, + { + "entropy": 1.6056585907936096, + "epoch": 1.0101068358463101, + "grad_norm": 0.6729063987731934, + "learning_rate": 1.1292154675791596e-05, + "loss": 1.3879, + "mean_token_accuracy": 0.6605608214934667, + "num_tokens": 1541267393.0, + "step": 9195 + }, + { + "entropy": 1.6868411600589752, + "epoch": 1.0102166927576832, + "grad_norm": 0.6951854825019836, + "learning_rate": 1.1290554386177006e-05, + "loss": 1.3287, + "mean_token_accuracy": 0.6730459630489349, + "num_tokens": 1541408773.0, + "step": 9196 + }, + { + "entropy": 1.70897176861763, + "epoch": 1.010326549669056, + "grad_norm": 0.7483338117599487, + "learning_rate": 1.1288954087366504e-05, + "loss": 1.5295, + "mean_token_accuracy": 0.6577043558160464, + "num_tokens": 1541578695.0, + "step": 9197 + }, + { + "entropy": 1.6923380196094513, + "epoch": 1.010436406580429, + "grad_norm": 0.6193323731422424, + "learning_rate": 1.128735377941073e-05, + "loss": 1.4399, + "mean_token_accuracy": 0.651967058579127, + "num_tokens": 1541828514.0, + "step": 9198 + }, + { + "entropy": 1.6712921659151714, + "epoch": 1.0105462634918019, + "grad_norm": 0.6577204465866089, + "learning_rate": 1.1285753462360343e-05, + "loss": 1.3324, + "mean_token_accuracy": 0.6647132039070129, + "num_tokens": 1541971144.0, + "step": 9199 + }, + { + "entropy": 1.6951302190621693, + "epoch": 1.0106561204031748, + "grad_norm": 0.67694091796875, + "learning_rate": 1.1284153136265986e-05, + "loss": 1.4847, + "mean_token_accuracy": 0.656242623925209, + "num_tokens": 1542162549.0, + "step": 9200 + }, + { + "entropy": 1.658402919769287, + "epoch": 1.0107659773145479, + "grad_norm": 0.6639556884765625, + "learning_rate": 1.1282552801178308e-05, + "loss": 1.2398, + "mean_token_accuracy": 0.6856526831785837, + "num_tokens": 1542331765.0, + "step": 9201 + }, + { + "entropy": 1.7573369940121968, + "epoch": 1.0108758342259208, + "grad_norm": 0.7384838461875916, + "learning_rate": 1.1280952457147964e-05, + "loss": 1.4386, + "mean_token_accuracy": 0.6591017047564188, + "num_tokens": 1542461418.0, + "step": 9202 + }, + { + "entropy": 1.7195665736993153, + "epoch": 1.0109856911372936, + "grad_norm": 0.7522571086883545, + "learning_rate": 1.1279352104225603e-05, + "loss": 1.2697, + "mean_token_accuracy": 0.6741450677315394, + "num_tokens": 1542598399.0, + "step": 9203 + }, + { + "entropy": 1.7449373702208202, + "epoch": 1.0110955480486665, + "grad_norm": 0.7479003071784973, + "learning_rate": 1.127775174246187e-05, + "loss": 1.2921, + "mean_token_accuracy": 0.6684492280085882, + "num_tokens": 1542703548.0, + "step": 9204 + }, + { + "entropy": 1.7740589280923207, + "epoch": 1.0112054049600396, + "grad_norm": 0.7149597406387329, + "learning_rate": 1.1276151371907422e-05, + "loss": 1.4712, + "mean_token_accuracy": 0.647121841708819, + "num_tokens": 1542869904.0, + "step": 9205 + }, + { + "entropy": 1.759735494852066, + "epoch": 1.0113152618714125, + "grad_norm": 0.616786003112793, + "learning_rate": 1.1274550992612905e-05, + "loss": 1.5223, + "mean_token_accuracy": 0.6346758852402369, + "num_tokens": 1543072195.0, + "step": 9206 + }, + { + "entropy": 1.6773889164129894, + "epoch": 1.0114251187827854, + "grad_norm": 0.7276062965393066, + "learning_rate": 1.1272950604628974e-05, + "loss": 1.465, + "mean_token_accuracy": 0.6493238161007563, + "num_tokens": 1543242043.0, + "step": 9207 + }, + { + "entropy": 1.7213714122772217, + "epoch": 1.0115349756941583, + "grad_norm": 0.5921209454536438, + "learning_rate": 1.1271350208006277e-05, + "loss": 1.4313, + "mean_token_accuracy": 0.6454088240861893, + "num_tokens": 1543424668.0, + "step": 9208 + }, + { + "entropy": 1.7043544550736744, + "epoch": 1.0116448326055314, + "grad_norm": 0.703683078289032, + "learning_rate": 1.1269749802795475e-05, + "loss": 1.5242, + "mean_token_accuracy": 0.639569049080213, + "num_tokens": 1543590504.0, + "step": 9209 + }, + { + "entropy": 1.7013638118902843, + "epoch": 1.0117546895169043, + "grad_norm": 0.651660144329071, + "learning_rate": 1.1268149389047207e-05, + "loss": 1.3612, + "mean_token_accuracy": 0.6684914082288742, + "num_tokens": 1543742205.0, + "step": 9210 + }, + { + "entropy": 1.7225966254870098, + "epoch": 1.0118645464282772, + "grad_norm": 0.680589497089386, + "learning_rate": 1.1266548966812136e-05, + "loss": 1.4609, + "mean_token_accuracy": 0.6508485525846481, + "num_tokens": 1543960759.0, + "step": 9211 + }, + { + "entropy": 1.690873513619105, + "epoch": 1.01197440333965, + "grad_norm": 0.6258131265640259, + "learning_rate": 1.1264948536140908e-05, + "loss": 1.2756, + "mean_token_accuracy": 0.6746162871519724, + "num_tokens": 1544079815.0, + "step": 9212 + }, + { + "entropy": 1.7661484678586323, + "epoch": 1.012084260251023, + "grad_norm": 0.7420499324798584, + "learning_rate": 1.126334809708418e-05, + "loss": 1.4235, + "mean_token_accuracy": 0.6455812205870947, + "num_tokens": 1544236306.0, + "step": 9213 + }, + { + "entropy": 1.6909300088882446, + "epoch": 1.012194117162396, + "grad_norm": 0.7056745886802673, + "learning_rate": 1.1261747649692598e-05, + "loss": 1.4225, + "mean_token_accuracy": 0.6687211891015371, + "num_tokens": 1544360026.0, + "step": 9214 + }, + { + "entropy": 1.7200450201829274, + "epoch": 1.012303974073769, + "grad_norm": 0.6737309694290161, + "learning_rate": 1.1260147194016826e-05, + "loss": 1.4383, + "mean_token_accuracy": 0.6554515163103739, + "num_tokens": 1544518333.0, + "step": 9215 + }, + { + "entropy": 1.6989065408706665, + "epoch": 1.0124138309851418, + "grad_norm": 0.7748240232467651, + "learning_rate": 1.1258546730107511e-05, + "loss": 1.3029, + "mean_token_accuracy": 0.6642332822084427, + "num_tokens": 1544633183.0, + "step": 9216 + }, + { + "entropy": 1.684309144814809, + "epoch": 1.0125236878965147, + "grad_norm": 0.7276560068130493, + "learning_rate": 1.1256946258015309e-05, + "loss": 1.3735, + "mean_token_accuracy": 0.6629331211249033, + "num_tokens": 1544766272.0, + "step": 9217 + }, + { + "entropy": 1.7121345500151317, + "epoch": 1.0126335448078878, + "grad_norm": 0.6448559761047363, + "learning_rate": 1.1255345777790874e-05, + "loss": 1.351, + "mean_token_accuracy": 0.6598079651594162, + "num_tokens": 1544978083.0, + "step": 9218 + }, + { + "entropy": 1.6897433201471965, + "epoch": 1.0127434017192607, + "grad_norm": 0.6677665114402771, + "learning_rate": 1.1253745289484858e-05, + "loss": 1.2992, + "mean_token_accuracy": 0.6672473748524984, + "num_tokens": 1545113400.0, + "step": 9219 + }, + { + "entropy": 1.730336219072342, + "epoch": 1.0128532586306336, + "grad_norm": 0.7854005694389343, + "learning_rate": 1.1252144793147919e-05, + "loss": 1.3935, + "mean_token_accuracy": 0.666775236527125, + "num_tokens": 1545256599.0, + "step": 9220 + }, + { + "entropy": 1.6706956028938293, + "epoch": 1.0129631155420065, + "grad_norm": 0.6474412083625793, + "learning_rate": 1.1250544288830712e-05, + "loss": 1.3071, + "mean_token_accuracy": 0.6717335432767868, + "num_tokens": 1545436360.0, + "step": 9221 + }, + { + "entropy": 1.670490821202596, + "epoch": 1.0130729724533796, + "grad_norm": 0.666471004486084, + "learning_rate": 1.1248943776583892e-05, + "loss": 1.3748, + "mean_token_accuracy": 0.6653418590625128, + "num_tokens": 1545607551.0, + "step": 9222 + }, + { + "entropy": 1.6969726085662842, + "epoch": 1.0131828293647525, + "grad_norm": 0.7258543968200684, + "learning_rate": 1.124734325645811e-05, + "loss": 1.3163, + "mean_token_accuracy": 0.6724565674861273, + "num_tokens": 1545796908.0, + "step": 9223 + }, + { + "entropy": 1.7342469195524852, + "epoch": 1.0132926862761253, + "grad_norm": 0.7631778717041016, + "learning_rate": 1.1245742728504028e-05, + "loss": 1.4204, + "mean_token_accuracy": 0.6610995680093765, + "num_tokens": 1545909631.0, + "step": 9224 + }, + { + "entropy": 1.7213138242562611, + "epoch": 1.0134025431874982, + "grad_norm": 0.6091188192367554, + "learning_rate": 1.1244142192772301e-05, + "loss": 1.4003, + "mean_token_accuracy": 0.6554784874121348, + "num_tokens": 1546167237.0, + "step": 9225 + }, + { + "entropy": 1.6888580818970997, + "epoch": 1.0135124000988711, + "grad_norm": 0.684411346912384, + "learning_rate": 1.1242541649313577e-05, + "loss": 1.4356, + "mean_token_accuracy": 0.6582817882299423, + "num_tokens": 1546363932.0, + "step": 9226 + }, + { + "entropy": 1.7218577762444813, + "epoch": 1.0136222570102442, + "grad_norm": 0.5998579263687134, + "learning_rate": 1.1240941098178527e-05, + "loss": 1.4542, + "mean_token_accuracy": 0.6470504850149155, + "num_tokens": 1546546664.0, + "step": 9227 + }, + { + "entropy": 1.6980265875657399, + "epoch": 1.013732113921617, + "grad_norm": 0.9041829109191895, + "learning_rate": 1.1239340539417796e-05, + "loss": 1.4107, + "mean_token_accuracy": 0.651981790860494, + "num_tokens": 1546699355.0, + "step": 9228 + }, + { + "entropy": 1.653926134109497, + "epoch": 1.01384197083299, + "grad_norm": 1.0387535095214844, + "learning_rate": 1.1237739973082045e-05, + "loss": 1.085, + "mean_token_accuracy": 0.6834785888592402, + "num_tokens": 1546888748.0, + "step": 9229 + }, + { + "entropy": 1.7255582809448242, + "epoch": 1.0139518277443629, + "grad_norm": 0.6586278676986694, + "learning_rate": 1.123613939922193e-05, + "loss": 1.3067, + "mean_token_accuracy": 0.66642597814401, + "num_tokens": 1547058192.0, + "step": 9230 + }, + { + "entropy": 1.6987995107968648, + "epoch": 1.014061684655736, + "grad_norm": 0.6259772777557373, + "learning_rate": 1.1234538817888112e-05, + "loss": 1.3602, + "mean_token_accuracy": 0.6535337815682093, + "num_tokens": 1547227451.0, + "step": 9231 + }, + { + "entropy": 1.7403975526491802, + "epoch": 1.0141715415671089, + "grad_norm": 0.6632811427116394, + "learning_rate": 1.1232938229131243e-05, + "loss": 1.3447, + "mean_token_accuracy": 0.6578138470649719, + "num_tokens": 1547376096.0, + "step": 9232 + }, + { + "entropy": 1.7203827500343323, + "epoch": 1.0142813984784818, + "grad_norm": 0.6725448369979858, + "learning_rate": 1.1231337633001987e-05, + "loss": 1.4274, + "mean_token_accuracy": 0.6475430677334467, + "num_tokens": 1547535222.0, + "step": 9233 + }, + { + "entropy": 1.7150332828362782, + "epoch": 1.0143912553898546, + "grad_norm": 0.6448984742164612, + "learning_rate": 1.1229737029550997e-05, + "loss": 1.2242, + "mean_token_accuracy": 0.6787202705939611, + "num_tokens": 1547643675.0, + "step": 9234 + }, + { + "entropy": 1.7125314672787983, + "epoch": 1.0145011123012277, + "grad_norm": 0.6407131552696228, + "learning_rate": 1.1228136418828934e-05, + "loss": 1.4369, + "mean_token_accuracy": 0.6412733842929205, + "num_tokens": 1547864654.0, + "step": 9235 + }, + { + "entropy": 1.6636697153250377, + "epoch": 1.0146109692126006, + "grad_norm": 0.7547443509101868, + "learning_rate": 1.1226535800886456e-05, + "loss": 1.2837, + "mean_token_accuracy": 0.6684808333714803, + "num_tokens": 1547998232.0, + "step": 9236 + }, + { + "entropy": 1.6625105440616608, + "epoch": 1.0147208261239735, + "grad_norm": 0.6787840723991394, + "learning_rate": 1.1224935175774225e-05, + "loss": 1.2093, + "mean_token_accuracy": 0.6760126401980718, + "num_tokens": 1548179995.0, + "step": 9237 + }, + { + "entropy": 1.7261524299780528, + "epoch": 1.0148306830353464, + "grad_norm": 0.7945687174797058, + "learning_rate": 1.1223334543542892e-05, + "loss": 1.3306, + "mean_token_accuracy": 0.6550563474496206, + "num_tokens": 1548324181.0, + "step": 9238 + }, + { + "entropy": 1.626392384370168, + "epoch": 1.0149405399467193, + "grad_norm": 0.5870576500892639, + "learning_rate": 1.1221733904243126e-05, + "loss": 1.314, + "mean_token_accuracy": 0.6767490158478419, + "num_tokens": 1548479091.0, + "step": 9239 + }, + { + "entropy": 1.62749649087588, + "epoch": 1.0150503968580924, + "grad_norm": 0.6715527772903442, + "learning_rate": 1.1220133257925581e-05, + "loss": 1.3297, + "mean_token_accuracy": 0.6707568516333898, + "num_tokens": 1548664070.0, + "step": 9240 + }, + { + "entropy": 1.7191180487473805, + "epoch": 1.0151602537694653, + "grad_norm": 0.6779627203941345, + "learning_rate": 1.1218532604640912e-05, + "loss": 1.4021, + "mean_token_accuracy": 0.6458870420853297, + "num_tokens": 1548850266.0, + "step": 9241 + }, + { + "entropy": 1.7357692917188008, + "epoch": 1.0152701106808382, + "grad_norm": 0.7277780175209045, + "learning_rate": 1.121693194443979e-05, + "loss": 1.2538, + "mean_token_accuracy": 0.6712505420049032, + "num_tokens": 1548990486.0, + "step": 9242 + }, + { + "entropy": 1.7432827452818553, + "epoch": 1.015379967592211, + "grad_norm": 0.6973701119422913, + "learning_rate": 1.1215331277372869e-05, + "loss": 1.3871, + "mean_token_accuracy": 0.6440560271342596, + "num_tokens": 1549135814.0, + "step": 9243 + }, + { + "entropy": 1.707447479168574, + "epoch": 1.0154898245035842, + "grad_norm": 0.7135317325592041, + "learning_rate": 1.1213730603490808e-05, + "loss": 1.5025, + "mean_token_accuracy": 0.6496869872013727, + "num_tokens": 1549283138.0, + "step": 9244 + }, + { + "entropy": 1.7765212555726368, + "epoch": 1.015599681414957, + "grad_norm": 0.6586817502975464, + "learning_rate": 1.1212129922844275e-05, + "loss": 1.3391, + "mean_token_accuracy": 0.6648561110099157, + "num_tokens": 1549420931.0, + "step": 9245 + }, + { + "entropy": 1.7636926869551341, + "epoch": 1.01570953832633, + "grad_norm": 0.6714787483215332, + "learning_rate": 1.1210529235483921e-05, + "loss": 1.4032, + "mean_token_accuracy": 0.6477454006671906, + "num_tokens": 1549548264.0, + "step": 9246 + }, + { + "entropy": 1.7096338669459026, + "epoch": 1.0158193952377028, + "grad_norm": 0.7031419277191162, + "learning_rate": 1.1208928541460413e-05, + "loss": 1.4511, + "mean_token_accuracy": 0.6563627272844315, + "num_tokens": 1549736242.0, + "step": 9247 + }, + { + "entropy": 1.6731836001078289, + "epoch": 1.015929252149076, + "grad_norm": 0.8989787697792053, + "learning_rate": 1.1207327840824408e-05, + "loss": 1.4856, + "mean_token_accuracy": 0.6486278722683588, + "num_tokens": 1549903701.0, + "step": 9248 + }, + { + "entropy": 1.6696616013844807, + "epoch": 1.0160391090604488, + "grad_norm": 0.6424921751022339, + "learning_rate": 1.1205727133626577e-05, + "loss": 1.3334, + "mean_token_accuracy": 0.6658105552196503, + "num_tokens": 1550064704.0, + "step": 9249 + }, + { + "entropy": 1.7057221233844757, + "epoch": 1.0161489659718217, + "grad_norm": 0.6833515763282776, + "learning_rate": 1.1204126419917567e-05, + "loss": 1.3819, + "mean_token_accuracy": 0.6675165841976801, + "num_tokens": 1550210298.0, + "step": 9250 + }, + { + "entropy": 1.6603924830754597, + "epoch": 1.0162588228831946, + "grad_norm": 0.7398085594177246, + "learning_rate": 1.1202525699748053e-05, + "loss": 1.4715, + "mean_token_accuracy": 0.6583651875456175, + "num_tokens": 1550374210.0, + "step": 9251 + }, + { + "entropy": 1.7534627715746562, + "epoch": 1.0163686797945677, + "grad_norm": 0.7214245796203613, + "learning_rate": 1.120092497316869e-05, + "loss": 1.5149, + "mean_token_accuracy": 0.6404918928941091, + "num_tokens": 1550522260.0, + "step": 9252 + }, + { + "entropy": 1.7200071314970653, + "epoch": 1.0164785367059406, + "grad_norm": 0.6468613743782043, + "learning_rate": 1.1199324240230143e-05, + "loss": 1.4526, + "mean_token_accuracy": 0.6536131302515665, + "num_tokens": 1550680157.0, + "step": 9253 + }, + { + "entropy": 1.7434161007404327, + "epoch": 1.0165883936173135, + "grad_norm": 0.6495728492736816, + "learning_rate": 1.1197723500983069e-05, + "loss": 1.3692, + "mean_token_accuracy": 0.6529008895158768, + "num_tokens": 1550834840.0, + "step": 9254 + }, + { + "entropy": 1.729017287492752, + "epoch": 1.0166982505286863, + "grad_norm": 0.6990881562232971, + "learning_rate": 1.119612275547814e-05, + "loss": 1.5721, + "mean_token_accuracy": 0.6412505855162939, + "num_tokens": 1551054269.0, + "step": 9255 + }, + { + "entropy": 1.7043547133604686, + "epoch": 1.0168081074400592, + "grad_norm": 0.6473329663276672, + "learning_rate": 1.1194522003766013e-05, + "loss": 1.315, + "mean_token_accuracy": 0.6622092028458914, + "num_tokens": 1551189361.0, + "step": 9256 + }, + { + "entropy": 1.7061095635096233, + "epoch": 1.0169179643514323, + "grad_norm": 0.7081418633460999, + "learning_rate": 1.1192921245897353e-05, + "loss": 1.4447, + "mean_token_accuracy": 0.6577693919340769, + "num_tokens": 1551368041.0, + "step": 9257 + }, + { + "entropy": 1.7616630991299946, + "epoch": 1.0170278212628052, + "grad_norm": 0.7121334075927734, + "learning_rate": 1.1191320481922823e-05, + "loss": 1.264, + "mean_token_accuracy": 0.6756730278333029, + "num_tokens": 1551481052.0, + "step": 9258 + }, + { + "entropy": 1.6953161557515461, + "epoch": 1.017137678174178, + "grad_norm": 0.6922043561935425, + "learning_rate": 1.1189719711893088e-05, + "loss": 1.3064, + "mean_token_accuracy": 0.664389913280805, + "num_tokens": 1551614812.0, + "step": 9259 + }, + { + "entropy": 1.6914753516515095, + "epoch": 1.017247535085551, + "grad_norm": 0.7497768402099609, + "learning_rate": 1.1188118935858802e-05, + "loss": 1.2366, + "mean_token_accuracy": 0.6768055210510889, + "num_tokens": 1551732083.0, + "step": 9260 + }, + { + "entropy": 1.6624971628189087, + "epoch": 1.017357391996924, + "grad_norm": 0.6488621830940247, + "learning_rate": 1.1186518153870643e-05, + "loss": 1.3451, + "mean_token_accuracy": 0.661164661248525, + "num_tokens": 1551882678.0, + "step": 9261 + }, + { + "entropy": 1.712691217660904, + "epoch": 1.017467248908297, + "grad_norm": 0.6433484554290771, + "learning_rate": 1.1184917365979267e-05, + "loss": 1.3778, + "mean_token_accuracy": 0.6611688236395518, + "num_tokens": 1552010626.0, + "step": 9262 + }, + { + "entropy": 1.6859545807043712, + "epoch": 1.0175771058196699, + "grad_norm": 0.6317065954208374, + "learning_rate": 1.118331657223534e-05, + "loss": 1.307, + "mean_token_accuracy": 0.6647968838612238, + "num_tokens": 1552155014.0, + "step": 9263 + }, + { + "entropy": 1.7237164676189423, + "epoch": 1.0176869627310428, + "grad_norm": 0.5941221117973328, + "learning_rate": 1.1181715772689524e-05, + "loss": 1.3943, + "mean_token_accuracy": 0.6520277112722397, + "num_tokens": 1552338624.0, + "step": 9264 + }, + { + "entropy": 1.7700924972693126, + "epoch": 1.0177968196424159, + "grad_norm": 0.8220915794372559, + "learning_rate": 1.1180114967392488e-05, + "loss": 1.5014, + "mean_token_accuracy": 0.6256469786167145, + "num_tokens": 1552565673.0, + "step": 9265 + }, + { + "entropy": 1.6931136548519135, + "epoch": 1.0179066765537887, + "grad_norm": 0.7540929913520813, + "learning_rate": 1.1178514156394893e-05, + "loss": 1.3396, + "mean_token_accuracy": 0.6597989400227865, + "num_tokens": 1552709040.0, + "step": 9266 + }, + { + "entropy": 1.7067534426848094, + "epoch": 1.0180165334651616, + "grad_norm": 0.720111608505249, + "learning_rate": 1.1176913339747406e-05, + "loss": 1.3746, + "mean_token_accuracy": 0.6579870879650116, + "num_tokens": 1552884550.0, + "step": 9267 + }, + { + "entropy": 1.7605009973049164, + "epoch": 1.0181263903765345, + "grad_norm": 0.77629554271698, + "learning_rate": 1.1175312517500692e-05, + "loss": 1.3439, + "mean_token_accuracy": 0.6593941003084183, + "num_tokens": 1553031590.0, + "step": 9268 + }, + { + "entropy": 1.7027594049771626, + "epoch": 1.0182362472879074, + "grad_norm": 0.5735223889350891, + "learning_rate": 1.1173711689705413e-05, + "loss": 1.5773, + "mean_token_accuracy": 0.6314045637845993, + "num_tokens": 1553236524.0, + "step": 9269 + }, + { + "entropy": 1.6763208210468292, + "epoch": 1.0183461041992805, + "grad_norm": 0.7643057107925415, + "learning_rate": 1.117211085641224e-05, + "loss": 1.2915, + "mean_token_accuracy": 0.6760772267977396, + "num_tokens": 1553392639.0, + "step": 9270 + }, + { + "entropy": 1.6933831572532654, + "epoch": 1.0184559611106534, + "grad_norm": 0.6687283515930176, + "learning_rate": 1.1170510017671836e-05, + "loss": 1.5079, + "mean_token_accuracy": 0.636181429028511, + "num_tokens": 1553611294.0, + "step": 9271 + }, + { + "entropy": 1.7319226165612538, + "epoch": 1.0185658180220263, + "grad_norm": 0.5630615949630737, + "learning_rate": 1.1168909173534866e-05, + "loss": 1.637, + "mean_token_accuracy": 0.6280501782894135, + "num_tokens": 1553812798.0, + "step": 9272 + }, + { + "entropy": 1.6394573052724202, + "epoch": 1.0186756749333992, + "grad_norm": 0.8335058093070984, + "learning_rate": 1.1167308324051998e-05, + "loss": 1.5129, + "mean_token_accuracy": 0.6522385478019714, + "num_tokens": 1553984737.0, + "step": 9273 + }, + { + "entropy": 1.6925493081410725, + "epoch": 1.0187855318447723, + "grad_norm": 0.6182948350906372, + "learning_rate": 1.1165707469273894e-05, + "loss": 1.3235, + "mean_token_accuracy": 0.6617102771997452, + "num_tokens": 1554122672.0, + "step": 9274 + }, + { + "entropy": 1.7493834793567657, + "epoch": 1.0188953887561452, + "grad_norm": 0.6503170132637024, + "learning_rate": 1.116410660925123e-05, + "loss": 1.4509, + "mean_token_accuracy": 0.6459475109974543, + "num_tokens": 1554293612.0, + "step": 9275 + }, + { + "entropy": 1.7210332651933034, + "epoch": 1.019005245667518, + "grad_norm": 0.6399514675140381, + "learning_rate": 1.1162505744034658e-05, + "loss": 1.3569, + "mean_token_accuracy": 0.6521624475717545, + "num_tokens": 1554536874.0, + "step": 9276 + }, + { + "entropy": 1.6515512764453888, + "epoch": 1.019115102578891, + "grad_norm": 0.6285982728004456, + "learning_rate": 1.1160904873674855e-05, + "loss": 1.275, + "mean_token_accuracy": 0.6685600280761719, + "num_tokens": 1554689700.0, + "step": 9277 + }, + { + "entropy": 1.7064630885918934, + "epoch": 1.019224959490264, + "grad_norm": 0.8239073753356934, + "learning_rate": 1.1159303998222484e-05, + "loss": 1.3523, + "mean_token_accuracy": 0.6559799164533615, + "num_tokens": 1554826581.0, + "step": 9278 + }, + { + "entropy": 1.7321425378322601, + "epoch": 1.019334816401637, + "grad_norm": 0.8235211968421936, + "learning_rate": 1.1157703117728216e-05, + "loss": 1.3853, + "mean_token_accuracy": 0.6528972536325455, + "num_tokens": 1554968925.0, + "step": 9279 + }, + { + "entropy": 1.6756745378176372, + "epoch": 1.0194446733130098, + "grad_norm": 0.808443009853363, + "learning_rate": 1.1156102232242714e-05, + "loss": 1.3856, + "mean_token_accuracy": 0.6638946781555811, + "num_tokens": 1555130010.0, + "step": 9280 + }, + { + "entropy": 1.7489099601904552, + "epoch": 1.0195545302243827, + "grad_norm": 0.7170142531394958, + "learning_rate": 1.1154501341816648e-05, + "loss": 1.3066, + "mean_token_accuracy": 0.6800417453050613, + "num_tokens": 1555288245.0, + "step": 9281 + }, + { + "entropy": 1.707829624414444, + "epoch": 1.0196643871357556, + "grad_norm": 0.7252710461616516, + "learning_rate": 1.115290044650068e-05, + "loss": 1.3392, + "mean_token_accuracy": 0.6582317799329758, + "num_tokens": 1555441544.0, + "step": 9282 + }, + { + "entropy": 1.6867429316043854, + "epoch": 1.0197742440471287, + "grad_norm": 0.6245051026344299, + "learning_rate": 1.1151299546345487e-05, + "loss": 1.3129, + "mean_token_accuracy": 0.6731636921564738, + "num_tokens": 1555573014.0, + "step": 9283 + }, + { + "entropy": 1.6290603975454967, + "epoch": 1.0198841009585016, + "grad_norm": 10.985240936279297, + "learning_rate": 1.1149698641401729e-05, + "loss": 1.1759, + "mean_token_accuracy": 0.6769275714953741, + "num_tokens": 1555740917.0, + "step": 9284 + }, + { + "entropy": 1.7704954346021016, + "epoch": 1.0199939578698745, + "grad_norm": 0.7116957306861877, + "learning_rate": 1.1148097731720075e-05, + "loss": 1.3332, + "mean_token_accuracy": 0.6628136684497198, + "num_tokens": 1555866851.0, + "step": 9285 + }, + { + "entropy": 1.7090757687886555, + "epoch": 1.0201038147812473, + "grad_norm": 0.7023559808731079, + "learning_rate": 1.1146496817351198e-05, + "loss": 1.378, + "mean_token_accuracy": 0.6548497478167216, + "num_tokens": 1556020336.0, + "step": 9286 + }, + { + "entropy": 1.6991868913173676, + "epoch": 1.0202136716926204, + "grad_norm": 0.6621536612510681, + "learning_rate": 1.1144895898345763e-05, + "loss": 1.4705, + "mean_token_accuracy": 0.6551912526289622, + "num_tokens": 1556223903.0, + "step": 9287 + }, + { + "entropy": 1.6916466653347015, + "epoch": 1.0203235286039933, + "grad_norm": 0.6319494843482971, + "learning_rate": 1.1143294974754432e-05, + "loss": 1.3627, + "mean_token_accuracy": 0.6651053031285604, + "num_tokens": 1556404805.0, + "step": 9288 + }, + { + "entropy": 1.7253743211428325, + "epoch": 1.0204333855153662, + "grad_norm": 0.8715941309928894, + "learning_rate": 1.1141694046627887e-05, + "loss": 1.3739, + "mean_token_accuracy": 0.6772264291842779, + "num_tokens": 1556567447.0, + "step": 9289 + }, + { + "entropy": 1.6666424969832103, + "epoch": 1.020543242426739, + "grad_norm": 0.7647740244865417, + "learning_rate": 1.1140093114016785e-05, + "loss": 1.5055, + "mean_token_accuracy": 0.6350631018479665, + "num_tokens": 1556756290.0, + "step": 9290 + }, + { + "entropy": 1.708291381597519, + "epoch": 1.0206530993381122, + "grad_norm": 0.6904531121253967, + "learning_rate": 1.11384921769718e-05, + "loss": 1.3246, + "mean_token_accuracy": 0.6716498136520386, + "num_tokens": 1556892411.0, + "step": 9291 + }, + { + "entropy": 1.7041618327299755, + "epoch": 1.020762956249485, + "grad_norm": 0.7279467582702637, + "learning_rate": 1.1136891235543602e-05, + "loss": 1.3717, + "mean_token_accuracy": 0.653698722521464, + "num_tokens": 1557050513.0, + "step": 9292 + }, + { + "entropy": 1.6928143699963887, + "epoch": 1.020872813160858, + "grad_norm": 0.6164371371269226, + "learning_rate": 1.1135290289782856e-05, + "loss": 1.2497, + "mean_token_accuracy": 0.684166838725408, + "num_tokens": 1557186695.0, + "step": 9293 + }, + { + "entropy": 1.6356267134348552, + "epoch": 1.0209826700722309, + "grad_norm": 0.6712630391120911, + "learning_rate": 1.1133689339740232e-05, + "loss": 1.2799, + "mean_token_accuracy": 0.6790573745965958, + "num_tokens": 1557342961.0, + "step": 9294 + }, + { + "entropy": 1.7505371868610382, + "epoch": 1.0210925269836038, + "grad_norm": 0.7134815454483032, + "learning_rate": 1.1132088385466404e-05, + "loss": 1.3846, + "mean_token_accuracy": 0.6622013101975123, + "num_tokens": 1557505250.0, + "step": 9295 + }, + { + "entropy": 1.730526864528656, + "epoch": 1.0212023838949769, + "grad_norm": 0.7285529971122742, + "learning_rate": 1.1130487427012035e-05, + "loss": 1.2494, + "mean_token_accuracy": 0.671364982922872, + "num_tokens": 1557626583.0, + "step": 9296 + }, + { + "entropy": 1.6375334958235424, + "epoch": 1.0213122408063497, + "grad_norm": 0.5679100155830383, + "learning_rate": 1.11288864644278e-05, + "loss": 1.5019, + "mean_token_accuracy": 0.64617853363355, + "num_tokens": 1557848562.0, + "step": 9297 + }, + { + "entropy": 1.6906728843847911, + "epoch": 1.0214220977177226, + "grad_norm": 0.7008233070373535, + "learning_rate": 1.1127285497764366e-05, + "loss": 1.4128, + "mean_token_accuracy": 0.6613838970661163, + "num_tokens": 1558008515.0, + "step": 9298 + }, + { + "entropy": 1.7250931958357494, + "epoch": 1.0215319546290955, + "grad_norm": 0.7736313343048096, + "learning_rate": 1.1125684527072403e-05, + "loss": 1.5166, + "mean_token_accuracy": 0.6381128629048666, + "num_tokens": 1558218566.0, + "step": 9299 + }, + { + "entropy": 1.7384556730588276, + "epoch": 1.0216418115404686, + "grad_norm": 0.7157011032104492, + "learning_rate": 1.1124083552402578e-05, + "loss": 1.3946, + "mean_token_accuracy": 0.6412435173988342, + "num_tokens": 1558416646.0, + "step": 9300 + }, + { + "entropy": 1.7145174245039623, + "epoch": 1.0217516684518415, + "grad_norm": 0.5953468680381775, + "learning_rate": 1.1122482573805572e-05, + "loss": 1.3498, + "mean_token_accuracy": 0.6631358712911606, + "num_tokens": 1558570368.0, + "step": 9301 + }, + { + "entropy": 1.5895135502020519, + "epoch": 1.0218615253632144, + "grad_norm": 0.6971820592880249, + "learning_rate": 1.1120881591332042e-05, + "loss": 1.3617, + "mean_token_accuracy": 0.6736765454212824, + "num_tokens": 1558745606.0, + "step": 9302 + }, + { + "entropy": 1.7481009860833485, + "epoch": 1.0219713822745873, + "grad_norm": 0.6487974524497986, + "learning_rate": 1.1119280605032667e-05, + "loss": 1.5241, + "mean_token_accuracy": 0.629365916053454, + "num_tokens": 1558940755.0, + "step": 9303 + }, + { + "entropy": 1.7577315270900726, + "epoch": 1.0220812391859604, + "grad_norm": 0.7471747398376465, + "learning_rate": 1.111767961495811e-05, + "loss": 1.5136, + "mean_token_accuracy": 0.6320231805245081, + "num_tokens": 1559112446.0, + "step": 9304 + }, + { + "entropy": 1.6948171555995941, + "epoch": 1.0221910960973333, + "grad_norm": 0.642125129699707, + "learning_rate": 1.111607862115905e-05, + "loss": 1.4423, + "mean_token_accuracy": 0.6560766796271006, + "num_tokens": 1559341272.0, + "step": 9305 + }, + { + "entropy": 1.682332714398702, + "epoch": 1.0223009530087062, + "grad_norm": 0.6562165021896362, + "learning_rate": 1.1114477623686155e-05, + "loss": 1.3956, + "mean_token_accuracy": 0.6665085901816686, + "num_tokens": 1559494474.0, + "step": 9306 + }, + { + "entropy": 1.724548449118932, + "epoch": 1.022410809920079, + "grad_norm": 0.6904338002204895, + "learning_rate": 1.1112876622590091e-05, + "loss": 1.4954, + "mean_token_accuracy": 0.656151756644249, + "num_tokens": 1559687605.0, + "step": 9307 + }, + { + "entropy": 1.7673320770263672, + "epoch": 1.022520666831452, + "grad_norm": 0.6390427350997925, + "learning_rate": 1.1111275617921538e-05, + "loss": 1.3457, + "mean_token_accuracy": 0.6566664973894755, + "num_tokens": 1559816863.0, + "step": 9308 + }, + { + "entropy": 1.7288634777069092, + "epoch": 1.022630523742825, + "grad_norm": 0.8318937420845032, + "learning_rate": 1.1109674609731158e-05, + "loss": 1.4264, + "mean_token_accuracy": 0.6647811233997345, + "num_tokens": 1559943550.0, + "step": 9309 + }, + { + "entropy": 1.6661728421847026, + "epoch": 1.022740380654198, + "grad_norm": 0.7150062918663025, + "learning_rate": 1.1108073598069624e-05, + "loss": 1.3994, + "mean_token_accuracy": 0.6527441392342249, + "num_tokens": 1560098393.0, + "step": 9310 + }, + { + "entropy": 1.6526323854923248, + "epoch": 1.0228502375655708, + "grad_norm": 0.765347421169281, + "learning_rate": 1.1106472582987615e-05, + "loss": 1.359, + "mean_token_accuracy": 0.6697281499703726, + "num_tokens": 1560239415.0, + "step": 9311 + }, + { + "entropy": 1.6831820905208588, + "epoch": 1.0229600944769437, + "grad_norm": 0.7733060121536255, + "learning_rate": 1.1104871564535792e-05, + "loss": 1.3801, + "mean_token_accuracy": 0.6575480302174886, + "num_tokens": 1560411622.0, + "step": 9312 + }, + { + "entropy": 1.7815355956554413, + "epoch": 1.0230699513883168, + "grad_norm": 0.7969313859939575, + "learning_rate": 1.1103270542764832e-05, + "loss": 1.3305, + "mean_token_accuracy": 0.665749246875445, + "num_tokens": 1560515174.0, + "step": 9313 + }, + { + "entropy": 1.7089110016822815, + "epoch": 1.0231798082996897, + "grad_norm": 0.6208791136741638, + "learning_rate": 1.1101669517725409e-05, + "loss": 1.4192, + "mean_token_accuracy": 0.6394672940174738, + "num_tokens": 1560678366.0, + "step": 9314 + }, + { + "entropy": 1.6642550726731618, + "epoch": 1.0232896652110626, + "grad_norm": 0.7100003361701965, + "learning_rate": 1.110006848946819e-05, + "loss": 1.3558, + "mean_token_accuracy": 0.672847161690394, + "num_tokens": 1560836440.0, + "step": 9315 + }, + { + "entropy": 1.6587112347284954, + "epoch": 1.0233995221224355, + "grad_norm": 0.631491482257843, + "learning_rate": 1.1098467458043844e-05, + "loss": 1.3645, + "mean_token_accuracy": 0.6515796532233556, + "num_tokens": 1561021294.0, + "step": 9316 + }, + { + "entropy": 1.7398035724957783, + "epoch": 1.0235093790338086, + "grad_norm": 0.7306511402130127, + "learning_rate": 1.1096866423503054e-05, + "loss": 1.5682, + "mean_token_accuracy": 0.6455154716968536, + "num_tokens": 1561199258.0, + "step": 9317 + }, + { + "entropy": 1.6927287181218464, + "epoch": 1.0236192359451814, + "grad_norm": 0.6335356831550598, + "learning_rate": 1.1095265385896484e-05, + "loss": 1.4913, + "mean_token_accuracy": 0.6352100173632304, + "num_tokens": 1561425004.0, + "step": 9318 + }, + { + "entropy": 1.7034152448177338, + "epoch": 1.0237290928565543, + "grad_norm": 0.6361052989959717, + "learning_rate": 1.1093664345274804e-05, + "loss": 1.5067, + "mean_token_accuracy": 0.6494300862153372, + "num_tokens": 1561637991.0, + "step": 9319 + }, + { + "entropy": 1.69016628464063, + "epoch": 1.0238389497679272, + "grad_norm": 0.6373481750488281, + "learning_rate": 1.1092063301688691e-05, + "loss": 1.4863, + "mean_token_accuracy": 0.6439661830663681, + "num_tokens": 1561838602.0, + "step": 9320 + }, + { + "entropy": 1.6587995290756226, + "epoch": 1.0239488066793, + "grad_norm": 0.8336928486824036, + "learning_rate": 1.1090462255188819e-05, + "loss": 1.2563, + "mean_token_accuracy": 0.6774145613114039, + "num_tokens": 1561963251.0, + "step": 9321 + }, + { + "entropy": 1.7257398466269176, + "epoch": 1.0240586635906732, + "grad_norm": 0.6993498802185059, + "learning_rate": 1.1088861205825853e-05, + "loss": 1.3805, + "mean_token_accuracy": 0.6668401459852854, + "num_tokens": 1562127473.0, + "step": 9322 + }, + { + "entropy": 1.7249635954697926, + "epoch": 1.024168520502046, + "grad_norm": 0.7952443957328796, + "learning_rate": 1.1087260153650474e-05, + "loss": 1.7184, + "mean_token_accuracy": 0.638072150448958, + "num_tokens": 1562317748.0, + "step": 9323 + }, + { + "entropy": 1.6867842574914296, + "epoch": 1.024278377413419, + "grad_norm": 0.749860405921936, + "learning_rate": 1.1085659098713348e-05, + "loss": 1.4717, + "mean_token_accuracy": 0.6456845154364904, + "num_tokens": 1562512974.0, + "step": 9324 + }, + { + "entropy": 1.7148866256078084, + "epoch": 1.0243882343247919, + "grad_norm": 0.8800878524780273, + "learning_rate": 1.1084058041065151e-05, + "loss": 1.4662, + "mean_token_accuracy": 0.6518640766541163, + "num_tokens": 1562660251.0, + "step": 9325 + }, + { + "entropy": 1.7287410298983257, + "epoch": 1.024498091236165, + "grad_norm": 0.7516173720359802, + "learning_rate": 1.1082456980756553e-05, + "loss": 1.4594, + "mean_token_accuracy": 0.6475322792927424, + "num_tokens": 1562818950.0, + "step": 9326 + }, + { + "entropy": 1.6619328260421753, + "epoch": 1.0246079481475379, + "grad_norm": 0.589447557926178, + "learning_rate": 1.1080855917838232e-05, + "loss": 1.3906, + "mean_token_accuracy": 0.662062461177508, + "num_tokens": 1562994934.0, + "step": 9327 + }, + { + "entropy": 1.6725085377693176, + "epoch": 1.0247178050589107, + "grad_norm": 0.5970123410224915, + "learning_rate": 1.1079254852360852e-05, + "loss": 1.4908, + "mean_token_accuracy": 0.6354695955912272, + "num_tokens": 1563198043.0, + "step": 9328 + }, + { + "entropy": 1.7263079186280568, + "epoch": 1.0248276619702836, + "grad_norm": 0.7884548902511597, + "learning_rate": 1.1077653784375098e-05, + "loss": 1.3299, + "mean_token_accuracy": 0.6538351277510325, + "num_tokens": 1563372999.0, + "step": 9329 + }, + { + "entropy": 1.755046049753825, + "epoch": 1.0249375188816567, + "grad_norm": 0.8799434900283813, + "learning_rate": 1.1076052713931633e-05, + "loss": 1.5743, + "mean_token_accuracy": 0.6435807049274445, + "num_tokens": 1563531399.0, + "step": 9330 + }, + { + "entropy": 1.6757254600524902, + "epoch": 1.0250473757930296, + "grad_norm": 0.7605611085891724, + "learning_rate": 1.1074451641081135e-05, + "loss": 1.3706, + "mean_token_accuracy": 0.6580003201961517, + "num_tokens": 1563710017.0, + "step": 9331 + }, + { + "entropy": 1.7006694972515106, + "epoch": 1.0251572327044025, + "grad_norm": 0.7495110630989075, + "learning_rate": 1.1072850565874274e-05, + "loss": 1.3559, + "mean_token_accuracy": 0.6717531283696493, + "num_tokens": 1563867033.0, + "step": 9332 + }, + { + "entropy": 1.6994576354821522, + "epoch": 1.0252670896157754, + "grad_norm": 0.7062970399856567, + "learning_rate": 1.107124948836173e-05, + "loss": 1.3514, + "mean_token_accuracy": 0.6614094525575638, + "num_tokens": 1563988943.0, + "step": 9333 + }, + { + "entropy": 1.681195815404256, + "epoch": 1.0253769465271483, + "grad_norm": 0.8650986552238464, + "learning_rate": 1.1069648408594168e-05, + "loss": 1.444, + "mean_token_accuracy": 0.6560016522804896, + "num_tokens": 1564179018.0, + "step": 9334 + }, + { + "entropy": 1.734657605489095, + "epoch": 1.0254868034385214, + "grad_norm": 0.688232958316803, + "learning_rate": 1.1068047326622269e-05, + "loss": 1.5821, + "mean_token_accuracy": 0.6293347378571829, + "num_tokens": 1564399374.0, + "step": 9335 + }, + { + "entropy": 1.7198534111181896, + "epoch": 1.0255966603498943, + "grad_norm": 0.712509036064148, + "learning_rate": 1.1066446242496697e-05, + "loss": 1.3292, + "mean_token_accuracy": 0.6663158188263575, + "num_tokens": 1564521995.0, + "step": 9336 + }, + { + "entropy": 1.752720485130946, + "epoch": 1.0257065172612672, + "grad_norm": 1.8393864631652832, + "learning_rate": 1.1064845156268135e-05, + "loss": 1.3724, + "mean_token_accuracy": 0.6571303755044937, + "num_tokens": 1564664216.0, + "step": 9337 + }, + { + "entropy": 1.6954265733559926, + "epoch": 1.02581637417264, + "grad_norm": 0.6746184825897217, + "learning_rate": 1.1063244067987253e-05, + "loss": 1.4452, + "mean_token_accuracy": 0.6532609164714813, + "num_tokens": 1564867480.0, + "step": 9338 + }, + { + "entropy": 1.7057288686434429, + "epoch": 1.0259262310840132, + "grad_norm": 0.6397607326507568, + "learning_rate": 1.1061642977704726e-05, + "loss": 1.5297, + "mean_token_accuracy": 0.6352156102657318, + "num_tokens": 1565064367.0, + "step": 9339 + }, + { + "entropy": 1.7565435369809468, + "epoch": 1.026036087995386, + "grad_norm": 0.7192218899726868, + "learning_rate": 1.1060041885471224e-05, + "loss": 1.4612, + "mean_token_accuracy": 0.6381704757610956, + "num_tokens": 1565271382.0, + "step": 9340 + }, + { + "entropy": 1.7230738202730815, + "epoch": 1.026145944906759, + "grad_norm": 0.6718366742134094, + "learning_rate": 1.1058440791337424e-05, + "loss": 1.5053, + "mean_token_accuracy": 0.6260107507308325, + "num_tokens": 1565498782.0, + "step": 9341 + }, + { + "entropy": 1.7352421184380848, + "epoch": 1.0262558018181318, + "grad_norm": 0.8205738663673401, + "learning_rate": 1.1056839695354e-05, + "loss": 1.6207, + "mean_token_accuracy": 0.6656580666700999, + "num_tokens": 1565694130.0, + "step": 9342 + }, + { + "entropy": 1.709681620200475, + "epoch": 1.026365658729505, + "grad_norm": 0.6529027223587036, + "learning_rate": 1.1055238597571627e-05, + "loss": 1.4073, + "mean_token_accuracy": 0.656768262386322, + "num_tokens": 1565886376.0, + "step": 9343 + }, + { + "entropy": 1.7247290511926014, + "epoch": 1.0264755156408778, + "grad_norm": 0.8053439855575562, + "learning_rate": 1.1053637498040972e-05, + "loss": 1.3863, + "mean_token_accuracy": 0.6635664304097494, + "num_tokens": 1566024854.0, + "step": 9344 + }, + { + "entropy": 1.7492102185885112, + "epoch": 1.0265853725522507, + "grad_norm": 0.7567391395568848, + "learning_rate": 1.105203639681272e-05, + "loss": 1.4823, + "mean_token_accuracy": 0.6328875770171484, + "num_tokens": 1566239010.0, + "step": 9345 + }, + { + "entropy": 1.6845263640085857, + "epoch": 1.0266952294636236, + "grad_norm": 0.658109724521637, + "learning_rate": 1.1050435293937535e-05, + "loss": 1.5708, + "mean_token_accuracy": 0.6598745634158453, + "num_tokens": 1566396108.0, + "step": 9346 + }, + { + "entropy": 1.7013383607069652, + "epoch": 1.0268050863749965, + "grad_norm": 0.7290819883346558, + "learning_rate": 1.10488341894661e-05, + "loss": 1.309, + "mean_token_accuracy": 0.6661685655514399, + "num_tokens": 1566506452.0, + "step": 9347 + }, + { + "entropy": 1.7153269449869792, + "epoch": 1.0269149432863696, + "grad_norm": 0.6383672952651978, + "learning_rate": 1.104723308344908e-05, + "loss": 1.5736, + "mean_token_accuracy": 0.6496073206265768, + "num_tokens": 1566669636.0, + "step": 9348 + }, + { + "entropy": 1.7498332460721333, + "epoch": 1.0270248001977424, + "grad_norm": 1.3603650331497192, + "learning_rate": 1.1045631975937162e-05, + "loss": 1.54, + "mean_token_accuracy": 0.6541606038808823, + "num_tokens": 1566846993.0, + "step": 9349 + }, + { + "entropy": 1.6750881572564442, + "epoch": 1.0271346571091153, + "grad_norm": 0.8021945357322693, + "learning_rate": 1.1044030866981003e-05, + "loss": 1.4035, + "mean_token_accuracy": 0.664098913470904, + "num_tokens": 1567016917.0, + "step": 9350 + }, + { + "entropy": 1.680233657360077, + "epoch": 1.0272445140204882, + "grad_norm": 0.6845235824584961, + "learning_rate": 1.1042429756631291e-05, + "loss": 1.4367, + "mean_token_accuracy": 0.6555875589450201, + "num_tokens": 1567176851.0, + "step": 9351 + }, + { + "entropy": 1.6933209796746571, + "epoch": 1.0273543709318613, + "grad_norm": 0.7372247576713562, + "learning_rate": 1.1040828644938697e-05, + "loss": 1.4267, + "mean_token_accuracy": 0.6577855745951334, + "num_tokens": 1567371811.0, + "step": 9352 + }, + { + "entropy": 1.7011990348498027, + "epoch": 1.0274642278432342, + "grad_norm": 0.7555881142616272, + "learning_rate": 1.1039227531953896e-05, + "loss": 1.2464, + "mean_token_accuracy": 0.6763677497704824, + "num_tokens": 1567559137.0, + "step": 9353 + }, + { + "entropy": 1.7057754397392273, + "epoch": 1.027574084754607, + "grad_norm": 0.7010595202445984, + "learning_rate": 1.1037626417727558e-05, + "loss": 1.4121, + "mean_token_accuracy": 0.6522766401370367, + "num_tokens": 1567756111.0, + "step": 9354 + }, + { + "entropy": 1.7608352601528168, + "epoch": 1.02768394166598, + "grad_norm": 0.7014980912208557, + "learning_rate": 1.1036025302310364e-05, + "loss": 1.4579, + "mean_token_accuracy": 0.6561769843101501, + "num_tokens": 1567928704.0, + "step": 9355 + }, + { + "entropy": 1.7167177398999531, + "epoch": 1.027793798577353, + "grad_norm": 0.7096652984619141, + "learning_rate": 1.1034424185752982e-05, + "loss": 1.301, + "mean_token_accuracy": 0.6627550721168518, + "num_tokens": 1568041208.0, + "step": 9356 + }, + { + "entropy": 1.706625332434972, + "epoch": 1.027903655488726, + "grad_norm": 0.7294109463691711, + "learning_rate": 1.1032823068106092e-05, + "loss": 1.4412, + "mean_token_accuracy": 0.6564443955818812, + "num_tokens": 1568214563.0, + "step": 9357 + }, + { + "entropy": 1.7395167748133342, + "epoch": 1.0280135124000989, + "grad_norm": 0.7195934653282166, + "learning_rate": 1.1031221949420368e-05, + "loss": 1.3897, + "mean_token_accuracy": 0.649207149942716, + "num_tokens": 1568397977.0, + "step": 9358 + }, + { + "entropy": 1.692352334658305, + "epoch": 1.0281233693114717, + "grad_norm": 0.6182077527046204, + "learning_rate": 1.1029620829746482e-05, + "loss": 1.4435, + "mean_token_accuracy": 0.6503968785206476, + "num_tokens": 1568599949.0, + "step": 9359 + }, + { + "entropy": 1.7287443379561107, + "epoch": 1.0282332262228449, + "grad_norm": 0.7307857871055603, + "learning_rate": 1.102801970913511e-05, + "loss": 1.3121, + "mean_token_accuracy": 0.6638143807649612, + "num_tokens": 1568741614.0, + "step": 9360 + }, + { + "entropy": 1.699091374874115, + "epoch": 1.0283430831342177, + "grad_norm": 0.6680180430412292, + "learning_rate": 1.1026418587636926e-05, + "loss": 1.4316, + "mean_token_accuracy": 0.6453281243642172, + "num_tokens": 1568903127.0, + "step": 9361 + }, + { + "entropy": 1.7088652749856312, + "epoch": 1.0284529400455906, + "grad_norm": 0.7400436997413635, + "learning_rate": 1.1024817465302604e-05, + "loss": 1.3959, + "mean_token_accuracy": 0.6520246714353561, + "num_tokens": 1569104488.0, + "step": 9362 + }, + { + "entropy": 1.6986040870348613, + "epoch": 1.0285627969569635, + "grad_norm": 0.6973623037338257, + "learning_rate": 1.1023216342182825e-05, + "loss": 1.3094, + "mean_token_accuracy": 0.6667328427235285, + "num_tokens": 1569258463.0, + "step": 9363 + }, + { + "entropy": 1.7184610863526661, + "epoch": 1.0286726538683364, + "grad_norm": 0.7891613245010376, + "learning_rate": 1.1021615218328257e-05, + "loss": 1.5252, + "mean_token_accuracy": 0.6617070535818735, + "num_tokens": 1569458294.0, + "step": 9364 + }, + { + "entropy": 1.6800651748975117, + "epoch": 1.0287825107797095, + "grad_norm": 0.6154624223709106, + "learning_rate": 1.102001409378958e-05, + "loss": 1.3427, + "mean_token_accuracy": 0.6658696780602137, + "num_tokens": 1569657703.0, + "step": 9365 + }, + { + "entropy": 1.6739746828873951, + "epoch": 1.0288923676910824, + "grad_norm": 0.5887352228164673, + "learning_rate": 1.101841296861746e-05, + "loss": 1.3319, + "mean_token_accuracy": 0.664004052678744, + "num_tokens": 1569812239.0, + "step": 9366 + }, + { + "entropy": 1.7404913504918416, + "epoch": 1.0290022246024553, + "grad_norm": 0.6177912354469299, + "learning_rate": 1.1016811842862583e-05, + "loss": 1.3795, + "mean_token_accuracy": 0.6624182611703873, + "num_tokens": 1569971143.0, + "step": 9367 + }, + { + "entropy": 1.7285182078679402, + "epoch": 1.0291120815138282, + "grad_norm": 0.6963897347450256, + "learning_rate": 1.1015210716575614e-05, + "loss": 1.2931, + "mean_token_accuracy": 0.6709794253110886, + "num_tokens": 1570122158.0, + "step": 9368 + }, + { + "entropy": 1.7291064659754436, + "epoch": 1.0292219384252013, + "grad_norm": 0.6220366358757019, + "learning_rate": 1.1013609589807237e-05, + "loss": 1.3733, + "mean_token_accuracy": 0.6541631271441778, + "num_tokens": 1570257383.0, + "step": 9369 + }, + { + "entropy": 1.6683486600716908, + "epoch": 1.0293317953365742, + "grad_norm": 0.6968728303909302, + "learning_rate": 1.1012008462608119e-05, + "loss": 1.3097, + "mean_token_accuracy": 0.6743961970011393, + "num_tokens": 1570389364.0, + "step": 9370 + }, + { + "entropy": 1.680118163426717, + "epoch": 1.029441652247947, + "grad_norm": 0.7007773518562317, + "learning_rate": 1.1010407335028944e-05, + "loss": 1.3684, + "mean_token_accuracy": 0.6575128883123398, + "num_tokens": 1570532077.0, + "step": 9371 + }, + { + "entropy": 1.697032392024994, + "epoch": 1.02955150915932, + "grad_norm": 0.6026526093482971, + "learning_rate": 1.1008806207120376e-05, + "loss": 1.3735, + "mean_token_accuracy": 0.6517840176820755, + "num_tokens": 1570684541.0, + "step": 9372 + }, + { + "entropy": 1.6714433928330739, + "epoch": 1.029661366070693, + "grad_norm": 0.6352996230125427, + "learning_rate": 1.1007205078933099e-05, + "loss": 1.3515, + "mean_token_accuracy": 0.6700502683719, + "num_tokens": 1570836377.0, + "step": 9373 + }, + { + "entropy": 1.6749180654684703, + "epoch": 1.029771222982066, + "grad_norm": 0.6775603890419006, + "learning_rate": 1.1005603950517783e-05, + "loss": 1.3308, + "mean_token_accuracy": 0.6705063283443451, + "num_tokens": 1571008501.0, + "step": 9374 + }, + { + "entropy": 1.6891617675622304, + "epoch": 1.0298810798934388, + "grad_norm": 0.6425476670265198, + "learning_rate": 1.1004002821925104e-05, + "loss": 1.3842, + "mean_token_accuracy": 0.6577885945638021, + "num_tokens": 1571178622.0, + "step": 9375 + }, + { + "entropy": 1.6456943849722545, + "epoch": 1.0299909368048117, + "grad_norm": 2.2865750789642334, + "learning_rate": 1.1002401693205738e-05, + "loss": 1.3875, + "mean_token_accuracy": 0.6783433457215627, + "num_tokens": 1571353888.0, + "step": 9376 + }, + { + "entropy": 1.6456839342912037, + "epoch": 1.0301007937161846, + "grad_norm": 0.7852012515068054, + "learning_rate": 1.1000800564410362e-05, + "loss": 1.3242, + "mean_token_accuracy": 0.6605374266703924, + "num_tokens": 1571473748.0, + "step": 9377 + }, + { + "entropy": 1.699092835187912, + "epoch": 1.0302106506275577, + "grad_norm": 0.6551658511161804, + "learning_rate": 1.0999199435589643e-05, + "loss": 1.5777, + "mean_token_accuracy": 0.6421516289313635, + "num_tokens": 1571656099.0, + "step": 9378 + }, + { + "entropy": 1.7373557190100353, + "epoch": 1.0303205075389306, + "grad_norm": 0.6427181363105774, + "learning_rate": 1.0997598306794269e-05, + "loss": 1.578, + "mean_token_accuracy": 0.641086682677269, + "num_tokens": 1571911580.0, + "step": 9379 + }, + { + "entropy": 1.6774665514628093, + "epoch": 1.0304303644503034, + "grad_norm": 0.6472753286361694, + "learning_rate": 1.09959971780749e-05, + "loss": 1.2805, + "mean_token_accuracy": 0.6775522033373514, + "num_tokens": 1572069418.0, + "step": 9380 + }, + { + "entropy": 1.7251865367094676, + "epoch": 1.0305402213616763, + "grad_norm": 0.7229856848716736, + "learning_rate": 1.0994396049482221e-05, + "loss": 1.2864, + "mean_token_accuracy": 0.6660173137982687, + "num_tokens": 1572223424.0, + "step": 9381 + }, + { + "entropy": 1.6989657084147136, + "epoch": 1.0306500782730494, + "grad_norm": 0.6592692732810974, + "learning_rate": 1.0992794921066908e-05, + "loss": 1.2797, + "mean_token_accuracy": 0.6701207856337229, + "num_tokens": 1572364920.0, + "step": 9382 + }, + { + "entropy": 1.703797886768977, + "epoch": 1.0307599351844223, + "grad_norm": 0.7170230746269226, + "learning_rate": 1.0991193792879629e-05, + "loss": 1.3009, + "mean_token_accuracy": 0.665116066733996, + "num_tokens": 1572482987.0, + "step": 9383 + }, + { + "entropy": 1.7276430229345958, + "epoch": 1.0308697920957952, + "grad_norm": 0.7376964092254639, + "learning_rate": 1.0989592664971061e-05, + "loss": 1.5201, + "mean_token_accuracy": 0.6401703308025996, + "num_tokens": 1572654914.0, + "step": 9384 + }, + { + "entropy": 1.7409641345342, + "epoch": 1.030979649007168, + "grad_norm": 0.8233011364936829, + "learning_rate": 1.0987991537391884e-05, + "loss": 1.4287, + "mean_token_accuracy": 0.6417080561319987, + "num_tokens": 1572849673.0, + "step": 9385 + }, + { + "entropy": 1.7794389128684998, + "epoch": 1.0310895059185412, + "grad_norm": 0.7736028432846069, + "learning_rate": 1.0986390410192767e-05, + "loss": 1.5405, + "mean_token_accuracy": 0.6407246440649033, + "num_tokens": 1573037823.0, + "step": 9386 + }, + { + "entropy": 1.7333606382211049, + "epoch": 1.031199362829914, + "grad_norm": 0.6458233594894409, + "learning_rate": 1.0984789283424389e-05, + "loss": 1.3172, + "mean_token_accuracy": 0.6686715831359228, + "num_tokens": 1573199853.0, + "step": 9387 + }, + { + "entropy": 1.6926281054814656, + "epoch": 1.031309219741287, + "grad_norm": 0.7213067412376404, + "learning_rate": 1.0983188157137423e-05, + "loss": 1.2975, + "mean_token_accuracy": 0.6710375199715296, + "num_tokens": 1573346257.0, + "step": 9388 + }, + { + "entropy": 1.7269733548164368, + "epoch": 1.0314190766526599, + "grad_norm": 0.6135429739952087, + "learning_rate": 1.0981587031382543e-05, + "loss": 1.3502, + "mean_token_accuracy": 0.6585359672705332, + "num_tokens": 1573502913.0, + "step": 9389 + }, + { + "entropy": 1.6842507719993591, + "epoch": 1.0315289335640327, + "grad_norm": 0.6283948421478271, + "learning_rate": 1.0979985906210424e-05, + "loss": 1.3511, + "mean_token_accuracy": 0.6634029597043991, + "num_tokens": 1573673503.0, + "step": 9390 + }, + { + "entropy": 1.6653286516666412, + "epoch": 1.0316387904754059, + "grad_norm": 0.6210644245147705, + "learning_rate": 1.0978384781671747e-05, + "loss": 1.4805, + "mean_token_accuracy": 0.648143524924914, + "num_tokens": 1573902167.0, + "step": 9391 + }, + { + "entropy": 1.6455399890740712, + "epoch": 1.0317486473867787, + "grad_norm": 0.6163156628608704, + "learning_rate": 1.0976783657817178e-05, + "loss": 1.4123, + "mean_token_accuracy": 0.6596850504477819, + "num_tokens": 1574126639.0, + "step": 9392 + }, + { + "entropy": 1.6833748817443848, + "epoch": 1.0318585042981516, + "grad_norm": 0.6697494983673096, + "learning_rate": 1.0975182534697397e-05, + "loss": 1.3195, + "mean_token_accuracy": 0.6766639103492101, + "num_tokens": 1574279133.0, + "step": 9393 + }, + { + "entropy": 1.7093348105748494, + "epoch": 1.0319683612095245, + "grad_norm": 0.7185570001602173, + "learning_rate": 1.0973581412363078e-05, + "loss": 1.381, + "mean_token_accuracy": 0.6626923183600107, + "num_tokens": 1574416811.0, + "step": 9394 + }, + { + "entropy": 1.6518169144789379, + "epoch": 1.0320782181208976, + "grad_norm": 0.6190440654754639, + "learning_rate": 1.0971980290864896e-05, + "loss": 1.3933, + "mean_token_accuracy": 0.6565068662166595, + "num_tokens": 1574590620.0, + "step": 9395 + }, + { + "entropy": 1.7660084863503773, + "epoch": 1.0321880750322705, + "grad_norm": 0.8229091167449951, + "learning_rate": 1.0970379170253523e-05, + "loss": 1.6284, + "mean_token_accuracy": 0.6307843253016472, + "num_tokens": 1574792296.0, + "step": 9396 + }, + { + "entropy": 1.674621005853017, + "epoch": 1.0322979319436434, + "grad_norm": 0.6914384961128235, + "learning_rate": 1.0968778050579638e-05, + "loss": 1.3486, + "mean_token_accuracy": 0.6684543887774149, + "num_tokens": 1574979186.0, + "step": 9397 + }, + { + "entropy": 1.700808932383855, + "epoch": 1.0324077888550163, + "grad_norm": 0.646267831325531, + "learning_rate": 1.096717693189391e-05, + "loss": 1.3497, + "mean_token_accuracy": 0.65216397245725, + "num_tokens": 1575130170.0, + "step": 9398 + }, + { + "entropy": 1.6885426342487335, + "epoch": 1.0325176457663894, + "grad_norm": 0.6223624348640442, + "learning_rate": 1.096557581424702e-05, + "loss": 1.3716, + "mean_token_accuracy": 0.6467955311139425, + "num_tokens": 1575370782.0, + "step": 9399 + }, + { + "entropy": 1.7195010880629222, + "epoch": 1.0326275026777623, + "grad_norm": 0.6038949489593506, + "learning_rate": 1.0963974697689644e-05, + "loss": 1.3991, + "mean_token_accuracy": 0.6538459608952204, + "num_tokens": 1575633582.0, + "step": 9400 + }, + { + "entropy": 1.7567541698614757, + "epoch": 1.0327373595891352, + "grad_norm": 0.770776093006134, + "learning_rate": 1.0962373582272445e-05, + "loss": 1.5789, + "mean_token_accuracy": 0.6492965420087179, + "num_tokens": 1575782782.0, + "step": 9401 + }, + { + "entropy": 1.7692484458287556, + "epoch": 1.032847216500508, + "grad_norm": 0.8557153344154358, + "learning_rate": 1.0960772468046109e-05, + "loss": 1.4258, + "mean_token_accuracy": 0.6477571874856949, + "num_tokens": 1575969960.0, + "step": 9402 + }, + { + "entropy": 1.681354542573293, + "epoch": 1.032957073411881, + "grad_norm": 0.6943827867507935, + "learning_rate": 1.095917135506131e-05, + "loss": 1.2469, + "mean_token_accuracy": 0.6881647706031799, + "num_tokens": 1576101672.0, + "step": 9403 + }, + { + "entropy": 1.7285095751285553, + "epoch": 1.033066930323254, + "grad_norm": 0.6692385673522949, + "learning_rate": 1.0957570243368711e-05, + "loss": 1.4383, + "mean_token_accuracy": 0.6471123496691386, + "num_tokens": 1576249549.0, + "step": 9404 + }, + { + "entropy": 1.74024698138237, + "epoch": 1.033176787234627, + "grad_norm": 0.9306145310401917, + "learning_rate": 1.0955969133019e-05, + "loss": 1.236, + "mean_token_accuracy": 0.6720249801874161, + "num_tokens": 1576405097.0, + "step": 9405 + }, + { + "entropy": 1.7044260899225872, + "epoch": 1.0332866441459998, + "grad_norm": 0.7469892501831055, + "learning_rate": 1.0954368024062846e-05, + "loss": 1.4853, + "mean_token_accuracy": 0.6485099146763483, + "num_tokens": 1576584110.0, + "step": 9406 + }, + { + "entropy": 1.704027235507965, + "epoch": 1.0333965010573727, + "grad_norm": 0.7305955290794373, + "learning_rate": 1.0952766916550923e-05, + "loss": 1.4969, + "mean_token_accuracy": 0.6464851995309194, + "num_tokens": 1576770943.0, + "step": 9407 + }, + { + "entropy": 1.7666937212149303, + "epoch": 1.0335063579687458, + "grad_norm": 0.6984846591949463, + "learning_rate": 1.0951165810533903e-05, + "loss": 1.3663, + "mean_token_accuracy": 0.6565556079149246, + "num_tokens": 1576907284.0, + "step": 9408 + }, + { + "entropy": 1.7006201644738514, + "epoch": 1.0336162148801187, + "grad_norm": 0.7005829215049744, + "learning_rate": 1.094956470606247e-05, + "loss": 1.4357, + "mean_token_accuracy": 0.6611707657575607, + "num_tokens": 1577102449.0, + "step": 9409 + }, + { + "entropy": 1.673817624648412, + "epoch": 1.0337260717914916, + "grad_norm": 0.7164878845214844, + "learning_rate": 1.0947963603187284e-05, + "loss": 1.436, + "mean_token_accuracy": 0.6602627138296763, + "num_tokens": 1577306757.0, + "step": 9410 + }, + { + "entropy": 1.6474266449610393, + "epoch": 1.0338359287028644, + "grad_norm": 0.723376452922821, + "learning_rate": 1.094636250195903e-05, + "loss": 1.4225, + "mean_token_accuracy": 0.6634102612733841, + "num_tokens": 1577508937.0, + "step": 9411 + }, + { + "entropy": 1.6886802514394124, + "epoch": 1.0339457856142376, + "grad_norm": 0.7515102624893188, + "learning_rate": 1.094476140242838e-05, + "loss": 1.3389, + "mean_token_accuracy": 0.6653014322121938, + "num_tokens": 1577668263.0, + "step": 9412 + }, + { + "entropy": 1.6901151835918427, + "epoch": 1.0340556425256104, + "grad_norm": 0.7329745888710022, + "learning_rate": 1.0943160304646004e-05, + "loss": 1.2741, + "mean_token_accuracy": 0.6680738429228464, + "num_tokens": 1577798964.0, + "step": 9413 + }, + { + "entropy": 1.6920847098032634, + "epoch": 1.0341654994369833, + "grad_norm": 0.6827352643013, + "learning_rate": 1.0941559208662575e-05, + "loss": 1.3344, + "mean_token_accuracy": 0.6580119580030441, + "num_tokens": 1577947634.0, + "step": 9414 + }, + { + "entropy": 1.7117530802885692, + "epoch": 1.0342753563483562, + "grad_norm": 0.6250885725021362, + "learning_rate": 1.0939958114528782e-05, + "loss": 1.294, + "mean_token_accuracy": 0.666492278377215, + "num_tokens": 1578098070.0, + "step": 9415 + }, + { + "entropy": 1.7154468695322673, + "epoch": 1.034385213259729, + "grad_norm": 0.6659483909606934, + "learning_rate": 1.0938357022295277e-05, + "loss": 1.329, + "mean_token_accuracy": 0.6750149528185526, + "num_tokens": 1578226451.0, + "step": 9416 + }, + { + "entropy": 1.7065132061640422, + "epoch": 1.0344950701711022, + "grad_norm": 0.7499461770057678, + "learning_rate": 1.0936755932012748e-05, + "loss": 1.4039, + "mean_token_accuracy": 0.6496814092000326, + "num_tokens": 1578361910.0, + "step": 9417 + }, + { + "entropy": 1.7363367676734924, + "epoch": 1.034604927082475, + "grad_norm": 0.6667954921722412, + "learning_rate": 1.0935154843731868e-05, + "loss": 1.3555, + "mean_token_accuracy": 0.6621254285176595, + "num_tokens": 1578520491.0, + "step": 9418 + }, + { + "entropy": 1.7204224864641826, + "epoch": 1.034714783993848, + "grad_norm": 0.7654107213020325, + "learning_rate": 1.0933553757503306e-05, + "loss": 1.4836, + "mean_token_accuracy": 0.6469293584426244, + "num_tokens": 1578708477.0, + "step": 9419 + }, + { + "entropy": 1.725062648455302, + "epoch": 1.0348246409052209, + "grad_norm": 0.6829729676246643, + "learning_rate": 1.0931952673377735e-05, + "loss": 1.3266, + "mean_token_accuracy": 0.6637885620196661, + "num_tokens": 1578888821.0, + "step": 9420 + }, + { + "entropy": 1.6738385657469432, + "epoch": 1.034934497816594, + "grad_norm": 0.538158655166626, + "learning_rate": 1.0930351591405836e-05, + "loss": 1.3432, + "mean_token_accuracy": 0.6584972242514292, + "num_tokens": 1579082733.0, + "step": 9421 + }, + { + "entropy": 1.7132914861043294, + "epoch": 1.0350443547279669, + "grad_norm": 0.6430938839912415, + "learning_rate": 1.0928750511638272e-05, + "loss": 1.4578, + "mean_token_accuracy": 0.6506678561369578, + "num_tokens": 1579299272.0, + "step": 9422 + }, + { + "entropy": 1.7596480747063954, + "epoch": 1.0351542116393397, + "grad_norm": 0.7233623266220093, + "learning_rate": 1.0927149434125725e-05, + "loss": 1.3634, + "mean_token_accuracy": 0.6559812525908152, + "num_tokens": 1579418957.0, + "step": 9423 + }, + { + "entropy": 1.6402805646260579, + "epoch": 1.0352640685507126, + "grad_norm": 0.7694196701049805, + "learning_rate": 1.092554835891887e-05, + "loss": 1.2518, + "mean_token_accuracy": 0.6738807211319605, + "num_tokens": 1579551997.0, + "step": 9424 + }, + { + "entropy": 1.6636198858420055, + "epoch": 1.0353739254620857, + "grad_norm": 0.7831659317016602, + "learning_rate": 1.092394728606837e-05, + "loss": 1.4856, + "mean_token_accuracy": 0.6563919832309087, + "num_tokens": 1579768552.0, + "step": 9425 + }, + { + "entropy": 1.716377208630244, + "epoch": 1.0354837823734586, + "grad_norm": 0.6124777793884277, + "learning_rate": 1.0922346215624905e-05, + "loss": 1.521, + "mean_token_accuracy": 0.6268165409564972, + "num_tokens": 1580016863.0, + "step": 9426 + }, + { + "entropy": 1.7089830438296, + "epoch": 1.0355936392848315, + "grad_norm": 0.6345846652984619, + "learning_rate": 1.092074514763915e-05, + "loss": 1.3273, + "mean_token_accuracy": 0.655559649070104, + "num_tokens": 1580174116.0, + "step": 9427 + }, + { + "entropy": 1.726669172445933, + "epoch": 1.0357034961962044, + "grad_norm": 0.7052589654922485, + "learning_rate": 1.0919144082161773e-05, + "loss": 1.3349, + "mean_token_accuracy": 0.6611567487319311, + "num_tokens": 1580325786.0, + "step": 9428 + }, + { + "entropy": 1.71493865052859, + "epoch": 1.0358133531075775, + "grad_norm": 0.6326825618743896, + "learning_rate": 1.0917543019243451e-05, + "loss": 1.3176, + "mean_token_accuracy": 0.6714158058166504, + "num_tokens": 1580469908.0, + "step": 9429 + }, + { + "entropy": 1.6959039668242137, + "epoch": 1.0359232100189504, + "grad_norm": 0.6697741746902466, + "learning_rate": 1.0915941958934855e-05, + "loss": 1.2301, + "mean_token_accuracy": 0.6766091585159302, + "num_tokens": 1580630331.0, + "step": 9430 + }, + { + "entropy": 1.672759582599004, + "epoch": 1.0360330669303233, + "grad_norm": 0.7174257636070251, + "learning_rate": 1.0914340901286657e-05, + "loss": 1.5949, + "mean_token_accuracy": 0.646736760934194, + "num_tokens": 1580830522.0, + "step": 9431 + }, + { + "entropy": 1.7258618871370952, + "epoch": 1.0361429238416962, + "grad_norm": 0.6316325664520264, + "learning_rate": 1.0912739846349529e-05, + "loss": 1.419, + "mean_token_accuracy": 0.6607301781574885, + "num_tokens": 1581014308.0, + "step": 9432 + }, + { + "entropy": 1.7705416182676952, + "epoch": 1.036252780753069, + "grad_norm": 0.6447232961654663, + "learning_rate": 1.0911138794174151e-05, + "loss": 1.3577, + "mean_token_accuracy": 0.6522252509991328, + "num_tokens": 1581141631.0, + "step": 9433 + }, + { + "entropy": 1.6815461615721385, + "epoch": 1.0363626376644421, + "grad_norm": 0.6908884644508362, + "learning_rate": 1.0909537744811186e-05, + "loss": 1.3045, + "mean_token_accuracy": 0.6645476520061493, + "num_tokens": 1581289338.0, + "step": 9434 + }, + { + "entropy": 1.6792431473731995, + "epoch": 1.036472494575815, + "grad_norm": 0.6091166734695435, + "learning_rate": 1.090793669831131e-05, + "loss": 1.4287, + "mean_token_accuracy": 0.6663202345371246, + "num_tokens": 1581475004.0, + "step": 9435 + }, + { + "entropy": 1.6384399731953938, + "epoch": 1.036582351487188, + "grad_norm": 0.6524921655654907, + "learning_rate": 1.0906335654725199e-05, + "loss": 1.443, + "mean_token_accuracy": 0.6610978494087855, + "num_tokens": 1581635537.0, + "step": 9436 + }, + { + "entropy": 1.7117190460364025, + "epoch": 1.0366922083985608, + "grad_norm": 0.5974973440170288, + "learning_rate": 1.090473461410352e-05, + "loss": 1.5023, + "mean_token_accuracy": 0.6504662285248438, + "num_tokens": 1581826389.0, + "step": 9437 + }, + { + "entropy": 1.7102805376052856, + "epoch": 1.036802065309934, + "grad_norm": 0.7070255279541016, + "learning_rate": 1.0903133576496952e-05, + "loss": 1.5332, + "mean_token_accuracy": 0.640349547068278, + "num_tokens": 1582049870.0, + "step": 9438 + }, + { + "entropy": 1.7368799050649006, + "epoch": 1.0369119222213068, + "grad_norm": 0.6297962069511414, + "learning_rate": 1.0901532541956159e-05, + "loss": 1.389, + "mean_token_accuracy": 0.64411032696565, + "num_tokens": 1582205781.0, + "step": 9439 + }, + { + "entropy": 1.7114115158716838, + "epoch": 1.0370217791326797, + "grad_norm": 0.6687158346176147, + "learning_rate": 1.0899931510531814e-05, + "loss": 1.6336, + "mean_token_accuracy": 0.6240918189287186, + "num_tokens": 1582406654.0, + "step": 9440 + }, + { + "entropy": 1.6156066060066223, + "epoch": 1.0371316360440526, + "grad_norm": 0.7124824523925781, + "learning_rate": 1.0898330482274598e-05, + "loss": 1.3993, + "mean_token_accuracy": 0.6696003576119741, + "num_tokens": 1582566705.0, + "step": 9441 + }, + { + "entropy": 1.7156145075956981, + "epoch": 1.0372414929554257, + "grad_norm": 0.6782826781272888, + "learning_rate": 1.089672945723517e-05, + "loss": 1.4816, + "mean_token_accuracy": 0.6556824495395025, + "num_tokens": 1582742277.0, + "step": 9442 + }, + { + "entropy": 1.593651960293452, + "epoch": 1.0373513498667986, + "grad_norm": 0.8009786009788513, + "learning_rate": 1.089512843546421e-05, + "loss": 1.3372, + "mean_token_accuracy": 0.6646837691466013, + "num_tokens": 1582926809.0, + "step": 9443 + }, + { + "entropy": 1.7035342653592427, + "epoch": 1.0374612067781714, + "grad_norm": 0.7360708713531494, + "learning_rate": 1.0893527417012391e-05, + "loss": 1.453, + "mean_token_accuracy": 0.6582255164782206, + "num_tokens": 1583084184.0, + "step": 9444 + }, + { + "entropy": 1.6633445918560028, + "epoch": 1.0375710636895443, + "grad_norm": 0.5686823129653931, + "learning_rate": 1.0891926401930379e-05, + "loss": 1.3507, + "mean_token_accuracy": 0.6624014725287756, + "num_tokens": 1583226432.0, + "step": 9445 + }, + { + "entropy": 1.698949267466863, + "epoch": 1.0376809206009172, + "grad_norm": 0.757599413394928, + "learning_rate": 1.0890325390268846e-05, + "loss": 1.2624, + "mean_token_accuracy": 0.674264038602511, + "num_tokens": 1583345570.0, + "step": 9446 + }, + { + "entropy": 1.6697109937667847, + "epoch": 1.0377907775122903, + "grad_norm": 0.7264062166213989, + "learning_rate": 1.088872438207847e-05, + "loss": 1.3789, + "mean_token_accuracy": 0.6585222283999125, + "num_tokens": 1583594017.0, + "step": 9447 + }, + { + "entropy": 1.664307415485382, + "epoch": 1.0379006344236632, + "grad_norm": 0.6853240728378296, + "learning_rate": 1.0887123377409911e-05, + "loss": 1.3018, + "mean_token_accuracy": 0.6689639985561371, + "num_tokens": 1583728618.0, + "step": 9448 + }, + { + "entropy": 1.6331247488657634, + "epoch": 1.038010491335036, + "grad_norm": 0.7877902984619141, + "learning_rate": 1.0885522376313848e-05, + "loss": 1.3652, + "mean_token_accuracy": 0.6695854564507803, + "num_tokens": 1583859662.0, + "step": 9449 + }, + { + "entropy": 1.788600593805313, + "epoch": 1.038120348246409, + "grad_norm": 0.7138171195983887, + "learning_rate": 1.0883921378840954e-05, + "loss": 1.3607, + "mean_token_accuracy": 0.6501663575569788, + "num_tokens": 1584058402.0, + "step": 9450 + }, + { + "entropy": 1.697994162638982, + "epoch": 1.038230205157782, + "grad_norm": 1.0916004180908203, + "learning_rate": 1.0882320385041893e-05, + "loss": 1.3157, + "mean_token_accuracy": 0.6665289948383967, + "num_tokens": 1584191366.0, + "step": 9451 + }, + { + "entropy": 1.698983242114385, + "epoch": 1.038340062069155, + "grad_norm": 0.7171838283538818, + "learning_rate": 1.0880719394967336e-05, + "loss": 1.4387, + "mean_token_accuracy": 0.6508918007214864, + "num_tokens": 1584338635.0, + "step": 9452 + }, + { + "entropy": 1.7182096342245738, + "epoch": 1.0384499189805279, + "grad_norm": 0.7586554884910583, + "learning_rate": 1.0879118408667964e-05, + "loss": 1.2876, + "mean_token_accuracy": 0.667991022268931, + "num_tokens": 1584457176.0, + "step": 9453 + }, + { + "entropy": 1.6738979419072468, + "epoch": 1.0385597758919007, + "grad_norm": 0.706785261631012, + "learning_rate": 1.0877517426194433e-05, + "loss": 1.2904, + "mean_token_accuracy": 0.6816811164220175, + "num_tokens": 1584625605.0, + "step": 9454 + }, + { + "entropy": 1.6561195055643718, + "epoch": 1.0386696328032738, + "grad_norm": 0.7076295614242554, + "learning_rate": 1.0875916447597423e-05, + "loss": 1.5181, + "mean_token_accuracy": 0.6429178069035212, + "num_tokens": 1584812931.0, + "step": 9455 + }, + { + "entropy": 1.7122123738129933, + "epoch": 1.0387794897146467, + "grad_norm": 0.7200018167495728, + "learning_rate": 1.0874315472927601e-05, + "loss": 1.4502, + "mean_token_accuracy": 0.6555789758761724, + "num_tokens": 1584970989.0, + "step": 9456 + }, + { + "entropy": 1.6943861742814381, + "epoch": 1.0388893466260196, + "grad_norm": 0.8031889796257019, + "learning_rate": 1.087271450223564e-05, + "loss": 1.5488, + "mean_token_accuracy": 0.6436296353737513, + "num_tokens": 1585140553.0, + "step": 9457 + }, + { + "entropy": 1.718834122021993, + "epoch": 1.0389992035373925, + "grad_norm": 0.5605930089950562, + "learning_rate": 1.0871113535572203e-05, + "loss": 1.4931, + "mean_token_accuracy": 0.6522552420695623, + "num_tokens": 1585352103.0, + "step": 9458 + }, + { + "entropy": 1.6309671302636464, + "epoch": 1.0391090604487654, + "grad_norm": 0.6874315142631531, + "learning_rate": 1.0869512572987971e-05, + "loss": 1.3105, + "mean_token_accuracy": 0.6766562660535177, + "num_tokens": 1585488952.0, + "step": 9459 + }, + { + "entropy": 1.7397524615128834, + "epoch": 1.0392189173601385, + "grad_norm": 0.7973026037216187, + "learning_rate": 1.0867911614533599e-05, + "loss": 1.5217, + "mean_token_accuracy": 0.6329129338264465, + "num_tokens": 1585683287.0, + "step": 9460 + }, + { + "entropy": 1.6979398131370544, + "epoch": 1.0393287742715114, + "grad_norm": 0.7028073668479919, + "learning_rate": 1.0866310660259769e-05, + "loss": 1.3715, + "mean_token_accuracy": 0.6600656112035116, + "num_tokens": 1585861769.0, + "step": 9461 + }, + { + "entropy": 1.6728238761425018, + "epoch": 1.0394386311828843, + "grad_norm": 0.6784250736236572, + "learning_rate": 1.0864709710217149e-05, + "loss": 1.4738, + "mean_token_accuracy": 0.64061538875103, + "num_tokens": 1586053141.0, + "step": 9462 + }, + { + "entropy": 1.7052525381247203, + "epoch": 1.0395484880942572, + "grad_norm": 0.6356661915779114, + "learning_rate": 1.0863108764456403e-05, + "loss": 1.3033, + "mean_token_accuracy": 0.6688580562671026, + "num_tokens": 1586187994.0, + "step": 9463 + }, + { + "entropy": 1.6879315078258514, + "epoch": 1.0396583450056303, + "grad_norm": 0.6088063716888428, + "learning_rate": 1.0861507823028201e-05, + "loss": 1.2934, + "mean_token_accuracy": 0.6732538690169653, + "num_tokens": 1586321606.0, + "step": 9464 + }, + { + "entropy": 1.6835426688194275, + "epoch": 1.0397682019170031, + "grad_norm": 0.7947443127632141, + "learning_rate": 1.0859906885983221e-05, + "loss": 1.4395, + "mean_token_accuracy": 0.6450006117423376, + "num_tokens": 1586497070.0, + "step": 9465 + }, + { + "entropy": 1.678231567144394, + "epoch": 1.039878058828376, + "grad_norm": 0.6062014102935791, + "learning_rate": 1.0858305953372117e-05, + "loss": 1.411, + "mean_token_accuracy": 0.6603821168343226, + "num_tokens": 1586704728.0, + "step": 9466 + }, + { + "entropy": 1.757646510998408, + "epoch": 1.039987915739749, + "grad_norm": 0.7607711553573608, + "learning_rate": 1.0856705025245566e-05, + "loss": 1.2984, + "mean_token_accuracy": 0.6734205285708109, + "num_tokens": 1586815108.0, + "step": 9467 + }, + { + "entropy": 1.6538205246130626, + "epoch": 1.040097772651122, + "grad_norm": 0.8425273895263672, + "learning_rate": 1.0855104101654241e-05, + "loss": 1.1927, + "mean_token_accuracy": 0.6824866682291031, + "num_tokens": 1586938071.0, + "step": 9468 + }, + { + "entropy": 1.678248147169749, + "epoch": 1.040207629562495, + "grad_norm": 0.6422735452651978, + "learning_rate": 1.0853503182648806e-05, + "loss": 1.5994, + "mean_token_accuracy": 0.6251678119103113, + "num_tokens": 1587160509.0, + "step": 9469 + }, + { + "entropy": 1.7205635011196136, + "epoch": 1.0403174864738678, + "grad_norm": 0.717113733291626, + "learning_rate": 1.0851902268279923e-05, + "loss": 1.4603, + "mean_token_accuracy": 0.6581100126107534, + "num_tokens": 1587294558.0, + "step": 9470 + }, + { + "entropy": 1.6604284048080444, + "epoch": 1.0404273433852407, + "grad_norm": 0.6005326509475708, + "learning_rate": 1.0850301358598276e-05, + "loss": 1.5035, + "mean_token_accuracy": 0.6511695633331934, + "num_tokens": 1587540147.0, + "step": 9471 + }, + { + "entropy": 1.7822232246398926, + "epoch": 1.0405372002966136, + "grad_norm": 0.6600624918937683, + "learning_rate": 1.0848700453654517e-05, + "loss": 1.3744, + "mean_token_accuracy": 0.6612835874160131, + "num_tokens": 1587676331.0, + "step": 9472 + }, + { + "entropy": 1.6617770393689473, + "epoch": 1.0406470572079867, + "grad_norm": 0.5907867550849915, + "learning_rate": 1.0847099553499321e-05, + "loss": 1.3851, + "mean_token_accuracy": 0.6616502503554026, + "num_tokens": 1587839664.0, + "step": 9473 + }, + { + "entropy": 1.7285722692807515, + "epoch": 1.0407569141193596, + "grad_norm": 0.6949423551559448, + "learning_rate": 1.0845498658183358e-05, + "loss": 1.6408, + "mean_token_accuracy": 0.6250172158082327, + "num_tokens": 1588028296.0, + "step": 9474 + }, + { + "entropy": 1.6520145336786907, + "epoch": 1.0408667710307324, + "grad_norm": 0.6275820136070251, + "learning_rate": 1.084389776775729e-05, + "loss": 1.3768, + "mean_token_accuracy": 0.6688976238171259, + "num_tokens": 1588161759.0, + "step": 9475 + }, + { + "entropy": 1.727418303489685, + "epoch": 1.0409766279421053, + "grad_norm": 0.690833330154419, + "learning_rate": 1.0842296882271785e-05, + "loss": 1.3988, + "mean_token_accuracy": 0.6522354880968729, + "num_tokens": 1588316836.0, + "step": 9476 + }, + { + "entropy": 1.7261015077431996, + "epoch": 1.0410864848534784, + "grad_norm": 0.7379058003425598, + "learning_rate": 1.0840696001777519e-05, + "loss": 1.4334, + "mean_token_accuracy": 0.6473392049471537, + "num_tokens": 1588497270.0, + "step": 9477 + }, + { + "entropy": 1.7033613324165344, + "epoch": 1.0411963417648513, + "grad_norm": 0.6721797585487366, + "learning_rate": 1.0839095126325148e-05, + "loss": 1.4466, + "mean_token_accuracy": 0.6510754525661469, + "num_tokens": 1588687763.0, + "step": 9478 + }, + { + "entropy": 1.7521330614884694, + "epoch": 1.0413061986762242, + "grad_norm": 0.7003641128540039, + "learning_rate": 1.0837494255965347e-05, + "loss": 1.4311, + "mean_token_accuracy": 0.6499675313631693, + "num_tokens": 1588893987.0, + "step": 9479 + }, + { + "entropy": 1.6936258971691132, + "epoch": 1.041416055587597, + "grad_norm": 0.6926009654998779, + "learning_rate": 1.0835893390748777e-05, + "loss": 1.3398, + "mean_token_accuracy": 0.6582322865724564, + "num_tokens": 1589149049.0, + "step": 9480 + }, + { + "entropy": 1.7185107568899791, + "epoch": 1.0415259124989702, + "grad_norm": 0.6344667077064514, + "learning_rate": 1.083429253072611e-05, + "loss": 1.3841, + "mean_token_accuracy": 0.6531223158041636, + "num_tokens": 1589317687.0, + "step": 9481 + }, + { + "entropy": 1.7045740981896718, + "epoch": 1.041635769410343, + "grad_norm": 0.6675848960876465, + "learning_rate": 1.0832691675948004e-05, + "loss": 1.3395, + "mean_token_accuracy": 0.6758049378792444, + "num_tokens": 1589477279.0, + "step": 9482 + }, + { + "entropy": 1.717594563961029, + "epoch": 1.041745626321716, + "grad_norm": 0.9027393460273743, + "learning_rate": 1.0831090826465139e-05, + "loss": 1.4093, + "mean_token_accuracy": 0.6508564899365107, + "num_tokens": 1589628011.0, + "step": 9483 + }, + { + "entropy": 1.5932787358760834, + "epoch": 1.0418554832330889, + "grad_norm": 0.5880972146987915, + "learning_rate": 1.0829489982328168e-05, + "loss": 1.3118, + "mean_token_accuracy": 0.6709119379520416, + "num_tokens": 1589802651.0, + "step": 9484 + }, + { + "entropy": 1.728572279214859, + "epoch": 1.0419653401444617, + "grad_norm": 0.629753589630127, + "learning_rate": 1.0827889143587761e-05, + "loss": 1.3639, + "mean_token_accuracy": 0.6476372530062994, + "num_tokens": 1590000495.0, + "step": 9485 + }, + { + "entropy": 1.7684464951356251, + "epoch": 1.0420751970558348, + "grad_norm": 0.7243369817733765, + "learning_rate": 1.082628831029459e-05, + "loss": 1.337, + "mean_token_accuracy": 0.6554118494192759, + "num_tokens": 1590147039.0, + "step": 9486 + }, + { + "entropy": 1.7246462404727936, + "epoch": 1.0421850539672077, + "grad_norm": 0.7119016647338867, + "learning_rate": 1.0824687482499312e-05, + "loss": 1.6279, + "mean_token_accuracy": 0.626791646083196, + "num_tokens": 1590345057.0, + "step": 9487 + }, + { + "entropy": 1.6625853478908539, + "epoch": 1.0422949108785806, + "grad_norm": 0.6488005518913269, + "learning_rate": 1.0823086660252595e-05, + "loss": 1.3786, + "mean_token_accuracy": 0.6684714208046595, + "num_tokens": 1590517394.0, + "step": 9488 + }, + { + "entropy": 1.6413832604885101, + "epoch": 1.0424047677899535, + "grad_norm": 0.6880024075508118, + "learning_rate": 1.0821485843605114e-05, + "loss": 1.3171, + "mean_token_accuracy": 0.6710259020328522, + "num_tokens": 1590676951.0, + "step": 9489 + }, + { + "entropy": 1.7233413557211559, + "epoch": 1.0425146247013266, + "grad_norm": 0.8669329881668091, + "learning_rate": 1.0819885032607516e-05, + "loss": 1.4437, + "mean_token_accuracy": 0.6574445317188898, + "num_tokens": 1590798961.0, + "step": 9490 + }, + { + "entropy": 1.7381323476632435, + "epoch": 1.0426244816126995, + "grad_norm": 0.7421051263809204, + "learning_rate": 1.0818284227310479e-05, + "loss": 1.3651, + "mean_token_accuracy": 0.6702295790115992, + "num_tokens": 1590925866.0, + "step": 9491 + }, + { + "entropy": 1.69859579205513, + "epoch": 1.0427343385240724, + "grad_norm": 0.6176729202270508, + "learning_rate": 1.0816683427764665e-05, + "loss": 1.3806, + "mean_token_accuracy": 0.6713998268047968, + "num_tokens": 1591079868.0, + "step": 9492 + }, + { + "entropy": 1.6749196946620941, + "epoch": 1.0428441954354453, + "grad_norm": 0.766416072845459, + "learning_rate": 1.0815082634020737e-05, + "loss": 1.3999, + "mean_token_accuracy": 0.6613041559855143, + "num_tokens": 1591232390.0, + "step": 9493 + }, + { + "entropy": 1.7065080106258392, + "epoch": 1.0429540523468184, + "grad_norm": 0.5942599177360535, + "learning_rate": 1.0813481846129358e-05, + "loss": 1.3596, + "mean_token_accuracy": 0.6696609854698181, + "num_tokens": 1591415037.0, + "step": 9494 + }, + { + "entropy": 1.6899320483207703, + "epoch": 1.0430639092581913, + "grad_norm": 0.6180544495582581, + "learning_rate": 1.0811881064141201e-05, + "loss": 1.4791, + "mean_token_accuracy": 0.6475685685873032, + "num_tokens": 1591633165.0, + "step": 9495 + }, + { + "entropy": 1.7490257620811462, + "epoch": 1.0431737661695641, + "grad_norm": 0.8704851269721985, + "learning_rate": 1.0810280288106918e-05, + "loss": 1.5085, + "mean_token_accuracy": 0.6373362342516581, + "num_tokens": 1591830572.0, + "step": 9496 + }, + { + "entropy": 1.7038207252820332, + "epoch": 1.043283623080937, + "grad_norm": 0.6665728092193604, + "learning_rate": 1.0808679518077178e-05, + "loss": 1.3859, + "mean_token_accuracy": 0.6533089727163315, + "num_tokens": 1592012205.0, + "step": 9497 + }, + { + "entropy": 1.6818484663963318, + "epoch": 1.04339347999231, + "grad_norm": 0.7445968985557556, + "learning_rate": 1.0807078754102649e-05, + "loss": 1.3426, + "mean_token_accuracy": 0.662312775850296, + "num_tokens": 1592147760.0, + "step": 9498 + }, + { + "entropy": 1.7490168611208599, + "epoch": 1.043503336903683, + "grad_norm": 0.6908554434776306, + "learning_rate": 1.0805477996233988e-05, + "loss": 1.409, + "mean_token_accuracy": 0.649876077969869, + "num_tokens": 1592273034.0, + "step": 9499 + }, + { + "entropy": 1.6453217168649037, + "epoch": 1.043613193815056, + "grad_norm": 0.7113070487976074, + "learning_rate": 1.0803877244521863e-05, + "loss": 1.3044, + "mean_token_accuracy": 0.6699142803748449, + "num_tokens": 1592419247.0, + "step": 9500 + }, + { + "entropy": 1.7486879030863445, + "epoch": 1.0437230507264288, + "grad_norm": 0.7228788137435913, + "learning_rate": 1.0802276499016932e-05, + "loss": 1.4574, + "mean_token_accuracy": 0.6567816833655039, + "num_tokens": 1592549064.0, + "step": 9501 + }, + { + "entropy": 1.7028538783391316, + "epoch": 1.0438329076378017, + "grad_norm": 0.7245208024978638, + "learning_rate": 1.0800675759769861e-05, + "loss": 1.4313, + "mean_token_accuracy": 0.653529609243075, + "num_tokens": 1592725321.0, + "step": 9502 + }, + { + "entropy": 1.6976373294989269, + "epoch": 1.0439427645491748, + "grad_norm": 0.5360068678855896, + "learning_rate": 1.0799075026831317e-05, + "loss": 1.4317, + "mean_token_accuracy": 0.6476470828056335, + "num_tokens": 1592938657.0, + "step": 9503 + }, + { + "entropy": 1.6830028196175892, + "epoch": 1.0440526214605477, + "grad_norm": 0.5777772665023804, + "learning_rate": 1.0797474300251952e-05, + "loss": 1.4256, + "mean_token_accuracy": 0.6514229973157247, + "num_tokens": 1593133783.0, + "step": 9504 + }, + { + "entropy": 1.6651501556237538, + "epoch": 1.0441624783719206, + "grad_norm": 0.6623477935791016, + "learning_rate": 1.0795873580082434e-05, + "loss": 1.4702, + "mean_token_accuracy": 0.6620995352665583, + "num_tokens": 1593338733.0, + "step": 9505 + }, + { + "entropy": 1.698107163111369, + "epoch": 1.0442723352832934, + "grad_norm": 0.7003724575042725, + "learning_rate": 1.0794272866373431e-05, + "loss": 1.5227, + "mean_token_accuracy": 0.6455037742853165, + "num_tokens": 1593586152.0, + "step": 9506 + }, + { + "entropy": 1.7461525400479634, + "epoch": 1.0443821921946665, + "grad_norm": 0.8416768908500671, + "learning_rate": 1.0792672159175595e-05, + "loss": 1.6376, + "mean_token_accuracy": 0.6324817140897115, + "num_tokens": 1593765469.0, + "step": 9507 + }, + { + "entropy": 1.7584339678287506, + "epoch": 1.0444920491060394, + "grad_norm": 0.7773275971412659, + "learning_rate": 1.079107145853959e-05, + "loss": 1.4037, + "mean_token_accuracy": 0.6495963931083679, + "num_tokens": 1593887086.0, + "step": 9508 + }, + { + "entropy": 1.715292066335678, + "epoch": 1.0446019060174123, + "grad_norm": 0.7239483594894409, + "learning_rate": 1.0789470764516084e-05, + "loss": 1.2976, + "mean_token_accuracy": 0.6715022226174673, + "num_tokens": 1594044689.0, + "step": 9509 + }, + { + "entropy": 1.6825797359148662, + "epoch": 1.0447117629287852, + "grad_norm": 0.7251731753349304, + "learning_rate": 1.0787870077155728e-05, + "loss": 1.2752, + "mean_token_accuracy": 0.6671033104260763, + "num_tokens": 1594163502.0, + "step": 9510 + }, + { + "entropy": 1.7215838134288788, + "epoch": 1.044821619840158, + "grad_norm": 0.7168553471565247, + "learning_rate": 1.0786269396509193e-05, + "loss": 1.2838, + "mean_token_accuracy": 0.6646192222833633, + "num_tokens": 1594317624.0, + "step": 9511 + }, + { + "entropy": 1.718077729145686, + "epoch": 1.0449314767515312, + "grad_norm": 0.6914111375808716, + "learning_rate": 1.0784668722627134e-05, + "loss": 1.3862, + "mean_token_accuracy": 0.6595685482025146, + "num_tokens": 1594465247.0, + "step": 9512 + }, + { + "entropy": 1.7051511108875275, + "epoch": 1.045041333662904, + "grad_norm": 0.7132536768913269, + "learning_rate": 1.0783068055560212e-05, + "loss": 1.3335, + "mean_token_accuracy": 0.6642508854468664, + "num_tokens": 1594646030.0, + "step": 9513 + }, + { + "entropy": 1.7331341405709584, + "epoch": 1.045151190574277, + "grad_norm": 0.760560929775238, + "learning_rate": 1.0781467395359086e-05, + "loss": 1.4119, + "mean_token_accuracy": 0.6549452046553293, + "num_tokens": 1594796111.0, + "step": 9514 + }, + { + "entropy": 1.6813835899035137, + "epoch": 1.0452610474856499, + "grad_norm": 0.6038804650306702, + "learning_rate": 1.0779866742074427e-05, + "loss": 1.3664, + "mean_token_accuracy": 0.655574768781662, + "num_tokens": 1594951516.0, + "step": 9515 + }, + { + "entropy": 1.753594805796941, + "epoch": 1.045370904397023, + "grad_norm": 0.6593688130378723, + "learning_rate": 1.0778266095756877e-05, + "loss": 1.3677, + "mean_token_accuracy": 0.64958788951238, + "num_tokens": 1595102148.0, + "step": 9516 + }, + { + "entropy": 1.696462760368983, + "epoch": 1.0454807613083958, + "grad_norm": 0.8179159164428711, + "learning_rate": 1.077666545645711e-05, + "loss": 1.342, + "mean_token_accuracy": 0.6669958929220835, + "num_tokens": 1595253275.0, + "step": 9517 + }, + { + "entropy": 1.7647280593713124, + "epoch": 1.0455906182197687, + "grad_norm": 0.7071588039398193, + "learning_rate": 1.077506482422578e-05, + "loss": 1.4662, + "mean_token_accuracy": 0.6489268441994985, + "num_tokens": 1595420875.0, + "step": 9518 + }, + { + "entropy": 1.661277323961258, + "epoch": 1.0457004751311416, + "grad_norm": 0.6176816821098328, + "learning_rate": 1.0773464199113545e-05, + "loss": 1.4074, + "mean_token_accuracy": 0.6495037625233332, + "num_tokens": 1595639530.0, + "step": 9519 + }, + { + "entropy": 1.6793854931990306, + "epoch": 1.0458103320425147, + "grad_norm": 0.6099405288696289, + "learning_rate": 1.0771863581171067e-05, + "loss": 1.4901, + "mean_token_accuracy": 0.638078898191452, + "num_tokens": 1595805557.0, + "step": 9520 + }, + { + "entropy": 1.6664031247297924, + "epoch": 1.0459201889538876, + "grad_norm": 0.7855103611946106, + "learning_rate": 1.0770262970449007e-05, + "loss": 1.2644, + "mean_token_accuracy": 0.6843874802192053, + "num_tokens": 1596024907.0, + "step": 9521 + }, + { + "entropy": 1.7892861366271973, + "epoch": 1.0460300458652605, + "grad_norm": 0.6650031208992004, + "learning_rate": 1.0768662366998017e-05, + "loss": 1.5564, + "mean_token_accuracy": 0.6340744346380234, + "num_tokens": 1596223113.0, + "step": 9522 + }, + { + "entropy": 1.6714712381362915, + "epoch": 1.0461399027766334, + "grad_norm": 0.6163212656974792, + "learning_rate": 1.0767061770868758e-05, + "loss": 1.4224, + "mean_token_accuracy": 0.6527364104986191, + "num_tokens": 1596404722.0, + "step": 9523 + }, + { + "entropy": 1.7213294704755147, + "epoch": 1.0462497596880063, + "grad_norm": 0.7242403030395508, + "learning_rate": 1.0765461182111894e-05, + "loss": 1.4046, + "mean_token_accuracy": 0.6574538151423136, + "num_tokens": 1596556498.0, + "step": 9524 + }, + { + "entropy": 1.70473250746727, + "epoch": 1.0463596165993794, + "grad_norm": 0.8077401518821716, + "learning_rate": 1.0763860600778073e-05, + "loss": 1.3111, + "mean_token_accuracy": 0.6717382967472076, + "num_tokens": 1596692410.0, + "step": 9525 + }, + { + "entropy": 1.65737051765124, + "epoch": 1.0464694735107523, + "grad_norm": 0.6047679781913757, + "learning_rate": 1.0762260026917957e-05, + "loss": 1.5421, + "mean_token_accuracy": 0.6482739100853602, + "num_tokens": 1596893524.0, + "step": 9526 + }, + { + "entropy": 1.6772952377796173, + "epoch": 1.0465793304221251, + "grad_norm": 0.6844344139099121, + "learning_rate": 1.076065946058221e-05, + "loss": 1.3097, + "mean_token_accuracy": 0.6696969568729401, + "num_tokens": 1597033357.0, + "step": 9527 + }, + { + "entropy": 1.690885325272878, + "epoch": 1.046689187333498, + "grad_norm": 0.722217857837677, + "learning_rate": 1.0759058901821478e-05, + "loss": 1.4674, + "mean_token_accuracy": 0.6511874943971634, + "num_tokens": 1597219302.0, + "step": 9528 + }, + { + "entropy": 1.7689124047756195, + "epoch": 1.0467990442448711, + "grad_norm": 0.7030705809593201, + "learning_rate": 1.0757458350686423e-05, + "loss": 1.3782, + "mean_token_accuracy": 0.6555833717187246, + "num_tokens": 1597354184.0, + "step": 9529 + }, + { + "entropy": 1.7530174255371094, + "epoch": 1.046908901156244, + "grad_norm": 0.6865312457084656, + "learning_rate": 1.0755857807227705e-05, + "loss": 1.2826, + "mean_token_accuracy": 0.6668087194363276, + "num_tokens": 1597483504.0, + "step": 9530 + }, + { + "entropy": 1.664690375328064, + "epoch": 1.047018758067617, + "grad_norm": 0.7193317413330078, + "learning_rate": 1.0754257271495976e-05, + "loss": 1.4993, + "mean_token_accuracy": 0.6674816509087881, + "num_tokens": 1597629526.0, + "step": 9531 + }, + { + "entropy": 1.6781654755274455, + "epoch": 1.0471286149789898, + "grad_norm": 0.8282174468040466, + "learning_rate": 1.0752656743541892e-05, + "loss": 1.2977, + "mean_token_accuracy": 0.6744814167420069, + "num_tokens": 1597772249.0, + "step": 9532 + }, + { + "entropy": 1.653449535369873, + "epoch": 1.047238471890363, + "grad_norm": 0.6748833060264587, + "learning_rate": 1.0751056223416116e-05, + "loss": 1.4044, + "mean_token_accuracy": 0.6702044556538264, + "num_tokens": 1597951987.0, + "step": 9533 + }, + { + "entropy": 1.6903007527192433, + "epoch": 1.0473483288017358, + "grad_norm": 0.7806357741355896, + "learning_rate": 1.074945571116929e-05, + "loss": 1.3952, + "mean_token_accuracy": 0.6694519221782684, + "num_tokens": 1598150824.0, + "step": 9534 + }, + { + "entropy": 1.681130548318227, + "epoch": 1.0474581857131087, + "grad_norm": 0.6809220314025879, + "learning_rate": 1.0747855206852083e-05, + "loss": 1.3198, + "mean_token_accuracy": 0.6696507086356481, + "num_tokens": 1598324940.0, + "step": 9535 + }, + { + "entropy": 1.7262980838616688, + "epoch": 1.0475680426244816, + "grad_norm": 0.763521671295166, + "learning_rate": 1.0746254710515148e-05, + "loss": 1.4035, + "mean_token_accuracy": 0.6502132068077723, + "num_tokens": 1598501856.0, + "step": 9536 + }, + { + "entropy": 1.738362580537796, + "epoch": 1.0476778995358544, + "grad_norm": 0.6573911309242249, + "learning_rate": 1.0744654222209132e-05, + "loss": 1.476, + "mean_token_accuracy": 0.65400230884552, + "num_tokens": 1598670504.0, + "step": 9537 + }, + { + "entropy": 1.6924460927645366, + "epoch": 1.0477877564472275, + "grad_norm": 0.8024819493293762, + "learning_rate": 1.0743053741984692e-05, + "loss": 1.3561, + "mean_token_accuracy": 0.6752625207106272, + "num_tokens": 1598838975.0, + "step": 9538 + }, + { + "entropy": 1.6892497539520264, + "epoch": 1.0478976133586004, + "grad_norm": 0.5688782930374146, + "learning_rate": 1.0741453269892495e-05, + "loss": 1.3445, + "mean_token_accuracy": 0.6590389758348465, + "num_tokens": 1599027494.0, + "step": 9539 + }, + { + "entropy": 1.6760010123252869, + "epoch": 1.0480074702699733, + "grad_norm": 0.6238588094711304, + "learning_rate": 1.0739852805983177e-05, + "loss": 1.3616, + "mean_token_accuracy": 0.6570176730553309, + "num_tokens": 1599237306.0, + "step": 9540 + }, + { + "entropy": 1.7172687649726868, + "epoch": 1.0481173271813462, + "grad_norm": 0.6968215703964233, + "learning_rate": 1.0738252350307403e-05, + "loss": 1.3149, + "mean_token_accuracy": 0.669064129392306, + "num_tokens": 1599419358.0, + "step": 9541 + }, + { + "entropy": 1.7211929361025493, + "epoch": 1.0482271840927193, + "grad_norm": 0.8353010416030884, + "learning_rate": 1.0736651902915827e-05, + "loss": 1.3156, + "mean_token_accuracy": 0.6618754814068476, + "num_tokens": 1599527770.0, + "step": 9542 + }, + { + "entropy": 1.6436882019042969, + "epoch": 1.0483370410040922, + "grad_norm": 0.6856302618980408, + "learning_rate": 1.0735051463859097e-05, + "loss": 1.2963, + "mean_token_accuracy": 0.6674405237038931, + "num_tokens": 1599705089.0, + "step": 9543 + }, + { + "entropy": 1.6677443285783131, + "epoch": 1.048446897915465, + "grad_norm": 0.6206963658332825, + "learning_rate": 1.0733451033187866e-05, + "loss": 1.3537, + "mean_token_accuracy": 0.6627790033817291, + "num_tokens": 1599908469.0, + "step": 9544 + }, + { + "entropy": 1.669652263323466, + "epoch": 1.048556754826838, + "grad_norm": 0.6658746004104614, + "learning_rate": 1.0731850610952796e-05, + "loss": 1.3356, + "mean_token_accuracy": 0.6593130081892014, + "num_tokens": 1600099601.0, + "step": 9545 + }, + { + "entropy": 1.6830444832642872, + "epoch": 1.048666611738211, + "grad_norm": 0.7324784994125366, + "learning_rate": 1.0730250197204528e-05, + "loss": 1.5043, + "mean_token_accuracy": 0.6441976577043533, + "num_tokens": 1600322089.0, + "step": 9546 + }, + { + "entropy": 1.6887101829051971, + "epoch": 1.048776468649584, + "grad_norm": 0.8754149079322815, + "learning_rate": 1.0728649791993722e-05, + "loss": 1.7007, + "mean_token_accuracy": 0.6487985526522001, + "num_tokens": 1600493611.0, + "step": 9547 + }, + { + "entropy": 1.679084559281667, + "epoch": 1.0488863255609568, + "grad_norm": 0.8017110228538513, + "learning_rate": 1.0727049395371029e-05, + "loss": 1.2833, + "mean_token_accuracy": 0.6837884237368902, + "num_tokens": 1600609476.0, + "step": 9548 + }, + { + "entropy": 1.7725566426912944, + "epoch": 1.0489961824723297, + "grad_norm": 0.7964447736740112, + "learning_rate": 1.07254490073871e-05, + "loss": 1.386, + "mean_token_accuracy": 0.6649324297904968, + "num_tokens": 1600749785.0, + "step": 9549 + }, + { + "entropy": 1.717088649670283, + "epoch": 1.0491060393837026, + "grad_norm": 0.7279574871063232, + "learning_rate": 1.072384862809258e-05, + "loss": 1.3735, + "mean_token_accuracy": 0.6538225511709849, + "num_tokens": 1600931046.0, + "step": 9550 + }, + { + "entropy": 1.7207684218883514, + "epoch": 1.0492158962950757, + "grad_norm": 0.7171173095703125, + "learning_rate": 1.0722248257538135e-05, + "loss": 1.384, + "mean_token_accuracy": 0.6570334682861964, + "num_tokens": 1601116183.0, + "step": 9551 + }, + { + "entropy": 1.6767055094242096, + "epoch": 1.0493257532064486, + "grad_norm": 0.7148609757423401, + "learning_rate": 1.0720647895774402e-05, + "loss": 1.2768, + "mean_token_accuracy": 0.674123153090477, + "num_tokens": 1601237406.0, + "step": 9552 + }, + { + "entropy": 1.7004509270191193, + "epoch": 1.0494356101178215, + "grad_norm": 0.5615423321723938, + "learning_rate": 1.0719047542852037e-05, + "loss": 1.3725, + "mean_token_accuracy": 0.6597498307625452, + "num_tokens": 1601434893.0, + "step": 9553 + }, + { + "entropy": 1.6728576719760895, + "epoch": 1.0495454670291944, + "grad_norm": 0.6771143078804016, + "learning_rate": 1.0717447198821693e-05, + "loss": 1.4928, + "mean_token_accuracy": 0.6323941498994827, + "num_tokens": 1601650254.0, + "step": 9554 + }, + { + "entropy": 1.6481030186017354, + "epoch": 1.0496553239405675, + "grad_norm": 0.6681134104728699, + "learning_rate": 1.0715846863734019e-05, + "loss": 1.4834, + "mean_token_accuracy": 0.6682325402895609, + "num_tokens": 1601806562.0, + "step": 9555 + }, + { + "entropy": 1.6618581414222717, + "epoch": 1.0497651808519404, + "grad_norm": 0.6858922839164734, + "learning_rate": 1.071424653763966e-05, + "loss": 1.3277, + "mean_token_accuracy": 0.6726219256718954, + "num_tokens": 1601992968.0, + "step": 9556 + }, + { + "entropy": 1.6976956526438396, + "epoch": 1.0498750377633133, + "grad_norm": 0.6189122200012207, + "learning_rate": 1.0712646220589274e-05, + "loss": 1.3589, + "mean_token_accuracy": 0.6756937205791473, + "num_tokens": 1602146119.0, + "step": 9557 + }, + { + "entropy": 1.7072361807028453, + "epoch": 1.0499848946746861, + "grad_norm": 0.8546844720840454, + "learning_rate": 1.07110459126335e-05, + "loss": 1.4876, + "mean_token_accuracy": 0.6624875615040461, + "num_tokens": 1602380098.0, + "step": 9558 + }, + { + "entropy": 1.6925819118817647, + "epoch": 1.0500947515860592, + "grad_norm": 0.6426894068717957, + "learning_rate": 1.0709445613822997e-05, + "loss": 1.3083, + "mean_token_accuracy": 0.664939691623052, + "num_tokens": 1602549743.0, + "step": 9559 + }, + { + "entropy": 1.731207271416982, + "epoch": 1.0502046084974321, + "grad_norm": 0.8742700219154358, + "learning_rate": 1.0707845324208407e-05, + "loss": 1.2079, + "mean_token_accuracy": 0.6823851068814596, + "num_tokens": 1602677648.0, + "step": 9560 + }, + { + "entropy": 1.7196010947227478, + "epoch": 1.050314465408805, + "grad_norm": 0.6880813241004944, + "learning_rate": 1.0706245043840381e-05, + "loss": 1.5014, + "mean_token_accuracy": 0.6513290057579676, + "num_tokens": 1602849621.0, + "step": 9561 + }, + { + "entropy": 1.6641053060690563, + "epoch": 1.050424322320178, + "grad_norm": 0.6426426768302917, + "learning_rate": 1.070464477276957e-05, + "loss": 1.5101, + "mean_token_accuracy": 0.6436965962251028, + "num_tokens": 1603025923.0, + "step": 9562 + }, + { + "entropy": 1.7633658051490784, + "epoch": 1.0505341792315508, + "grad_norm": 0.6683143973350525, + "learning_rate": 1.0703044511046617e-05, + "loss": 1.4956, + "mean_token_accuracy": 0.6683969696362814, + "num_tokens": 1603184478.0, + "step": 9563 + }, + { + "entropy": 1.6500650942325592, + "epoch": 1.050644036142924, + "grad_norm": 0.669915497303009, + "learning_rate": 1.070144425872217e-05, + "loss": 1.5075, + "mean_token_accuracy": 0.6701604525248209, + "num_tokens": 1603368813.0, + "step": 9564 + }, + { + "entropy": 1.7405575414498646, + "epoch": 1.0507538930542968, + "grad_norm": 0.683010458946228, + "learning_rate": 1.069984401584688e-05, + "loss": 1.4898, + "mean_token_accuracy": 0.6517573595046997, + "num_tokens": 1603532608.0, + "step": 9565 + }, + { + "entropy": 1.6904991964499156, + "epoch": 1.0508637499656697, + "grad_norm": 0.7113816738128662, + "learning_rate": 1.069824378247139e-05, + "loss": 1.3158, + "mean_token_accuracy": 0.6657985001802444, + "num_tokens": 1603688565.0, + "step": 9566 + }, + { + "entropy": 1.6928423345088959, + "epoch": 1.0509736068770426, + "grad_norm": 0.6442391276359558, + "learning_rate": 1.0696643558646346e-05, + "loss": 1.4616, + "mean_token_accuracy": 0.6492231587568918, + "num_tokens": 1603907279.0, + "step": 9567 + }, + { + "entropy": 1.7089947859446208, + "epoch": 1.0510834637884157, + "grad_norm": 0.698707103729248, + "learning_rate": 1.0695043344422402e-05, + "loss": 1.4134, + "mean_token_accuracy": 0.6618327150742213, + "num_tokens": 1604075541.0, + "step": 9568 + }, + { + "entropy": 1.672034611304601, + "epoch": 1.0511933206997885, + "grad_norm": 0.5879615545272827, + "learning_rate": 1.0693443139850194e-05, + "loss": 1.3688, + "mean_token_accuracy": 0.670054112871488, + "num_tokens": 1604239307.0, + "step": 9569 + }, + { + "entropy": 1.7428979178269703, + "epoch": 1.0513031776111614, + "grad_norm": 0.6346009969711304, + "learning_rate": 1.0691842944980373e-05, + "loss": 1.3989, + "mean_token_accuracy": 0.6469153513511022, + "num_tokens": 1604416450.0, + "step": 9570 + }, + { + "entropy": 1.6811433633168538, + "epoch": 1.0514130345225343, + "grad_norm": 0.6256464719772339, + "learning_rate": 1.0690242759863587e-05, + "loss": 1.367, + "mean_token_accuracy": 0.6655552933613459, + "num_tokens": 1604583184.0, + "step": 9571 + }, + { + "entropy": 1.7027230560779572, + "epoch": 1.0515228914339074, + "grad_norm": 0.6681546568870544, + "learning_rate": 1.0688642584550477e-05, + "loss": 1.3715, + "mean_token_accuracy": 0.6578748822212219, + "num_tokens": 1604805908.0, + "step": 9572 + }, + { + "entropy": 1.7179262538750966, + "epoch": 1.0516327483452803, + "grad_norm": 0.6444426774978638, + "learning_rate": 1.0687042419091688e-05, + "loss": 1.5337, + "mean_token_accuracy": 0.6471273948748907, + "num_tokens": 1605033262.0, + "step": 9573 + }, + { + "entropy": 1.7171581784884136, + "epoch": 1.0517426052566532, + "grad_norm": 0.6714052557945251, + "learning_rate": 1.0685442263537867e-05, + "loss": 1.427, + "mean_token_accuracy": 0.6538668225208918, + "num_tokens": 1605187982.0, + "step": 9574 + }, + { + "entropy": 1.698363830645879, + "epoch": 1.051852462168026, + "grad_norm": 0.6877774000167847, + "learning_rate": 1.0683842117939655e-05, + "loss": 1.3865, + "mean_token_accuracy": 0.6598330289125443, + "num_tokens": 1605342223.0, + "step": 9575 + }, + { + "entropy": 1.7184595068295796, + "epoch": 1.0519623190793992, + "grad_norm": 0.5890787243843079, + "learning_rate": 1.0682241982347697e-05, + "loss": 1.5158, + "mean_token_accuracy": 0.6271322419246038, + "num_tokens": 1605559031.0, + "step": 9576 + }, + { + "entropy": 1.7405591209729512, + "epoch": 1.052072175990772, + "grad_norm": 0.7134595513343811, + "learning_rate": 1.0680641856812638e-05, + "loss": 1.4408, + "mean_token_accuracy": 0.6583415667215983, + "num_tokens": 1605702610.0, + "step": 9577 + }, + { + "entropy": 1.6781224409739177, + "epoch": 1.052182032902145, + "grad_norm": 0.72991943359375, + "learning_rate": 1.067904174138512e-05, + "loss": 1.4711, + "mean_token_accuracy": 0.6530647377173106, + "num_tokens": 1605865046.0, + "step": 9578 + }, + { + "entropy": 1.7373049159844716, + "epoch": 1.0522918898135178, + "grad_norm": 0.9875661134719849, + "learning_rate": 1.0677441636115786e-05, + "loss": 1.4841, + "mean_token_accuracy": 0.6617722262938818, + "num_tokens": 1605989563.0, + "step": 9579 + }, + { + "entropy": 1.7485832075277965, + "epoch": 1.0524017467248907, + "grad_norm": 0.7092475295066833, + "learning_rate": 1.0675841541055283e-05, + "loss": 1.4333, + "mean_token_accuracy": 0.6555873850981394, + "num_tokens": 1606154356.0, + "step": 9580 + }, + { + "entropy": 1.6484754184881847, + "epoch": 1.0525116036362638, + "grad_norm": 0.7261121869087219, + "learning_rate": 1.0674241456254244e-05, + "loss": 1.4312, + "mean_token_accuracy": 0.6665322482585907, + "num_tokens": 1606317868.0, + "step": 9581 + }, + { + "entropy": 1.6414496501286824, + "epoch": 1.0526214605476367, + "grad_norm": 0.6463345289230347, + "learning_rate": 1.0672641381763315e-05, + "loss": 1.2341, + "mean_token_accuracy": 0.6869133959213892, + "num_tokens": 1606467019.0, + "step": 9582 + }, + { + "entropy": 1.7304282387097676, + "epoch": 1.0527313174590096, + "grad_norm": 0.6692349910736084, + "learning_rate": 1.0671041317633144e-05, + "loss": 1.2577, + "mean_token_accuracy": 0.6683614204327265, + "num_tokens": 1606592230.0, + "step": 9583 + }, + { + "entropy": 1.6495085159937541, + "epoch": 1.0528411743703825, + "grad_norm": 0.6861699223518372, + "learning_rate": 1.0669441263914364e-05, + "loss": 1.2424, + "mean_token_accuracy": 0.6809816161791483, + "num_tokens": 1606726297.0, + "step": 9584 + }, + { + "entropy": 1.6947397887706757, + "epoch": 1.0529510312817556, + "grad_norm": 0.6416590213775635, + "learning_rate": 1.066784122065762e-05, + "loss": 1.3894, + "mean_token_accuracy": 0.6536195824543635, + "num_tokens": 1606944421.0, + "step": 9585 + }, + { + "entropy": 1.7330600718657176, + "epoch": 1.0530608881931285, + "grad_norm": 0.7335972189903259, + "learning_rate": 1.0666241187913553e-05, + "loss": 1.415, + "mean_token_accuracy": 0.6541456083456675, + "num_tokens": 1607071607.0, + "step": 9586 + }, + { + "entropy": 1.7887234091758728, + "epoch": 1.0531707451045014, + "grad_norm": 0.8420895338058472, + "learning_rate": 1.06646411657328e-05, + "loss": 1.6066, + "mean_token_accuracy": 0.6439038117726644, + "num_tokens": 1607230227.0, + "step": 9587 + }, + { + "entropy": 1.7334937155246735, + "epoch": 1.0532806020158743, + "grad_norm": 0.7040529251098633, + "learning_rate": 1.0663041154166e-05, + "loss": 1.5178, + "mean_token_accuracy": 0.6528173585732778, + "num_tokens": 1607420300.0, + "step": 9588 + }, + { + "entropy": 1.7212029000123341, + "epoch": 1.0533904589272474, + "grad_norm": 0.8923618793487549, + "learning_rate": 1.0661441153263801e-05, + "loss": 1.6135, + "mean_token_accuracy": 0.6504168063402176, + "num_tokens": 1607561102.0, + "step": 9589 + }, + { + "entropy": 1.7322006324927013, + "epoch": 1.0535003158386202, + "grad_norm": 0.6851010918617249, + "learning_rate": 1.0659841163076834e-05, + "loss": 1.2743, + "mean_token_accuracy": 0.6731189688046774, + "num_tokens": 1607681433.0, + "step": 9590 + }, + { + "entropy": 1.6935873130957286, + "epoch": 1.0536101727499931, + "grad_norm": 0.6381398439407349, + "learning_rate": 1.0658241183655741e-05, + "loss": 1.3826, + "mean_token_accuracy": 0.6610639144976934, + "num_tokens": 1607883422.0, + "step": 9591 + }, + { + "entropy": 1.7271686891714733, + "epoch": 1.053720029661366, + "grad_norm": 0.6199973225593567, + "learning_rate": 1.0656641215051165e-05, + "loss": 1.5337, + "mean_token_accuracy": 0.6284688164790472, + "num_tokens": 1608114811.0, + "step": 9592 + }, + { + "entropy": 1.7459026277065277, + "epoch": 1.053829886572739, + "grad_norm": 0.9864705204963684, + "learning_rate": 1.0655041257313735e-05, + "loss": 1.3567, + "mean_token_accuracy": 0.6629860103130341, + "num_tokens": 1608251495.0, + "step": 9593 + }, + { + "entropy": 1.7121766805648804, + "epoch": 1.053939743484112, + "grad_norm": 0.7664281725883484, + "learning_rate": 1.0653441310494092e-05, + "loss": 1.287, + "mean_token_accuracy": 0.6731170068184534, + "num_tokens": 1608369641.0, + "step": 9594 + }, + { + "entropy": 1.6202069719632466, + "epoch": 1.054049600395485, + "grad_norm": 0.5870428681373596, + "learning_rate": 1.0651841374642882e-05, + "loss": 1.3897, + "mean_token_accuracy": 0.6637488653262457, + "num_tokens": 1608579940.0, + "step": 9595 + }, + { + "entropy": 1.727886547644933, + "epoch": 1.0541594573068578, + "grad_norm": 0.844563364982605, + "learning_rate": 1.065024144981073e-05, + "loss": 1.5118, + "mean_token_accuracy": 0.6562522749106089, + "num_tokens": 1608758095.0, + "step": 9596 + }, + { + "entropy": 1.7154951989650726, + "epoch": 1.0542693142182307, + "grad_norm": 0.7168102860450745, + "learning_rate": 1.064864153604828e-05, + "loss": 1.5547, + "mean_token_accuracy": 0.641726886232694, + "num_tokens": 1608920813.0, + "step": 9597 + }, + { + "entropy": 1.7120743890603383, + "epoch": 1.0543791711296038, + "grad_norm": 0.8167440891265869, + "learning_rate": 1.0647041633406168e-05, + "loss": 1.4122, + "mean_token_accuracy": 0.6548740615447363, + "num_tokens": 1609056254.0, + "step": 9598 + }, + { + "entropy": 1.6860410471757252, + "epoch": 1.0544890280409767, + "grad_norm": 0.6746973395347595, + "learning_rate": 1.0645441741935029e-05, + "loss": 1.4384, + "mean_token_accuracy": 0.6527743736902872, + "num_tokens": 1609290219.0, + "step": 9599 + }, + { + "entropy": 1.653940111398697, + "epoch": 1.0545988849523495, + "grad_norm": 0.7178821563720703, + "learning_rate": 1.0643841861685498e-05, + "loss": 1.3891, + "mean_token_accuracy": 0.6595231195290884, + "num_tokens": 1609491031.0, + "step": 9600 + }, + { + "entropy": 1.6904591818650563, + "epoch": 1.0547087418637224, + "grad_norm": 0.6442198157310486, + "learning_rate": 1.0642241992708215e-05, + "loss": 1.4274, + "mean_token_accuracy": 0.6542298197746277, + "num_tokens": 1609653550.0, + "step": 9601 + }, + { + "entropy": 1.7299334208170574, + "epoch": 1.0548185987750955, + "grad_norm": 0.7151714563369751, + "learning_rate": 1.0640642135053807e-05, + "loss": 1.354, + "mean_token_accuracy": 0.6594923585653305, + "num_tokens": 1609798123.0, + "step": 9602 + }, + { + "entropy": 1.7396175960699718, + "epoch": 1.0549284556864684, + "grad_norm": 0.7707934975624084, + "learning_rate": 1.0639042288772914e-05, + "loss": 1.3926, + "mean_token_accuracy": 0.6571768969297409, + "num_tokens": 1609941400.0, + "step": 9603 + }, + { + "entropy": 1.768477698167165, + "epoch": 1.0550383125978413, + "grad_norm": 0.6910679936408997, + "learning_rate": 1.0637442453916173e-05, + "loss": 1.5426, + "mean_token_accuracy": 0.6517347097396851, + "num_tokens": 1610105272.0, + "step": 9604 + }, + { + "entropy": 1.6502993007500966, + "epoch": 1.0551481695092142, + "grad_norm": 0.6803098320960999, + "learning_rate": 1.0635842630534215e-05, + "loss": 1.4061, + "mean_token_accuracy": 0.6640617698431015, + "num_tokens": 1610247173.0, + "step": 9605 + }, + { + "entropy": 1.6991427938143413, + "epoch": 1.055258026420587, + "grad_norm": 0.6311363577842712, + "learning_rate": 1.063424281867767e-05, + "loss": 1.4876, + "mean_token_accuracy": 0.6401338577270508, + "num_tokens": 1610423935.0, + "step": 9606 + }, + { + "entropy": 1.6480543613433838, + "epoch": 1.0553678833319602, + "grad_norm": 0.8264037370681763, + "learning_rate": 1.063264301839718e-05, + "loss": 1.5157, + "mean_token_accuracy": 0.6646409928798676, + "num_tokens": 1610616702.0, + "step": 9607 + }, + { + "entropy": 1.7201192478338878, + "epoch": 1.055477740243333, + "grad_norm": 0.7642366886138916, + "learning_rate": 1.0631043229743367e-05, + "loss": 1.3135, + "mean_token_accuracy": 0.6787395030260086, + "num_tokens": 1610783350.0, + "step": 9608 + }, + { + "entropy": 1.7671165466308594, + "epoch": 1.055587597154706, + "grad_norm": 0.737358570098877, + "learning_rate": 1.0629443452766872e-05, + "loss": 1.5055, + "mean_token_accuracy": 0.6605880657831827, + "num_tokens": 1610956661.0, + "step": 9609 + }, + { + "entropy": 1.7348363002141316, + "epoch": 1.0556974540660788, + "grad_norm": 0.7109798192977905, + "learning_rate": 1.0627843687518326e-05, + "loss": 1.4493, + "mean_token_accuracy": 0.6522191713253657, + "num_tokens": 1611087838.0, + "step": 9610 + }, + { + "entropy": 1.6798150340716045, + "epoch": 1.055807310977452, + "grad_norm": 0.6326707601547241, + "learning_rate": 1.062624393404836e-05, + "loss": 1.3727, + "mean_token_accuracy": 0.6628447274367014, + "num_tokens": 1611244410.0, + "step": 9611 + }, + { + "entropy": 1.6709414720535278, + "epoch": 1.0559171678888248, + "grad_norm": 0.7551990747451782, + "learning_rate": 1.06246441924076e-05, + "loss": 1.2086, + "mean_token_accuracy": 0.6848486711581548, + "num_tokens": 1611363852.0, + "step": 9612 + }, + { + "entropy": 1.6593234837055206, + "epoch": 1.0560270248001977, + "grad_norm": 0.7147453427314758, + "learning_rate": 1.062304446264669e-05, + "loss": 1.331, + "mean_token_accuracy": 0.6691244542598724, + "num_tokens": 1611543725.0, + "step": 9613 + }, + { + "entropy": 1.7118070423603058, + "epoch": 1.0561368817115706, + "grad_norm": 0.6040416359901428, + "learning_rate": 1.0621444744816247e-05, + "loss": 1.3854, + "mean_token_accuracy": 0.6613721499840418, + "num_tokens": 1611719789.0, + "step": 9614 + }, + { + "entropy": 1.693102478981018, + "epoch": 1.0562467386229437, + "grad_norm": 0.8449906706809998, + "learning_rate": 1.061984503896691e-05, + "loss": 1.4027, + "mean_token_accuracy": 0.684621180097262, + "num_tokens": 1611874620.0, + "step": 9615 + }, + { + "entropy": 1.6907888650894165, + "epoch": 1.0563565955343166, + "grad_norm": 0.7158522605895996, + "learning_rate": 1.0618245345149308e-05, + "loss": 1.2702, + "mean_token_accuracy": 0.672495091954867, + "num_tokens": 1611997469.0, + "step": 9616 + }, + { + "entropy": 1.7449814677238464, + "epoch": 1.0564664524456895, + "grad_norm": 0.7022985816001892, + "learning_rate": 1.0616645663414064e-05, + "loss": 1.3994, + "mean_token_accuracy": 0.6574458430210749, + "num_tokens": 1612189165.0, + "step": 9617 + }, + { + "entropy": 1.6879704495271046, + "epoch": 1.0565763093570624, + "grad_norm": 0.7382509112358093, + "learning_rate": 1.0615045993811813e-05, + "loss": 1.288, + "mean_token_accuracy": 0.6710261305173238, + "num_tokens": 1612327205.0, + "step": 9618 + }, + { + "entropy": 1.7922049760818481, + "epoch": 1.0566861662684353, + "grad_norm": 0.7976166009902954, + "learning_rate": 1.0613446336393187e-05, + "loss": 1.5642, + "mean_token_accuracy": 0.6370315402746201, + "num_tokens": 1612482869.0, + "step": 9619 + }, + { + "entropy": 1.7130910356839497, + "epoch": 1.0567960231798084, + "grad_norm": 0.6342209577560425, + "learning_rate": 1.0611846691208805e-05, + "loss": 1.3953, + "mean_token_accuracy": 0.6711895515521368, + "num_tokens": 1612653926.0, + "step": 9620 + }, + { + "entropy": 1.68388170003891, + "epoch": 1.0569058800911812, + "grad_norm": 0.6616874933242798, + "learning_rate": 1.0610247058309305e-05, + "loss": 1.3314, + "mean_token_accuracy": 0.6681681573390961, + "num_tokens": 1612792712.0, + "step": 9621 + }, + { + "entropy": 1.6615887383619945, + "epoch": 1.0570157370025541, + "grad_norm": 0.6471387147903442, + "learning_rate": 1.0608647437745308e-05, + "loss": 1.5439, + "mean_token_accuracy": 0.622841422756513, + "num_tokens": 1613069195.0, + "step": 9622 + }, + { + "entropy": 1.689245601495107, + "epoch": 1.057125593913927, + "grad_norm": 0.7069512009620667, + "learning_rate": 1.0607047829567443e-05, + "loss": 1.3479, + "mean_token_accuracy": 0.6640896399815878, + "num_tokens": 1613260754.0, + "step": 9623 + }, + { + "entropy": 1.703204204638799, + "epoch": 1.0572354508253001, + "grad_norm": 0.7228965759277344, + "learning_rate": 1.0605448233826338e-05, + "loss": 1.2755, + "mean_token_accuracy": 0.6698009570439657, + "num_tokens": 1613369068.0, + "step": 9624 + }, + { + "entropy": 1.6650657554467518, + "epoch": 1.057345307736673, + "grad_norm": 0.661736786365509, + "learning_rate": 1.0603848650572616e-05, + "loss": 1.5366, + "mean_token_accuracy": 0.6506972561279932, + "num_tokens": 1613545729.0, + "step": 9625 + }, + { + "entropy": 1.653905838727951, + "epoch": 1.057455164648046, + "grad_norm": 0.725866973400116, + "learning_rate": 1.0602249079856905e-05, + "loss": 1.4377, + "mean_token_accuracy": 0.6448950469493866, + "num_tokens": 1613757396.0, + "step": 9626 + }, + { + "entropy": 1.6793735524018605, + "epoch": 1.0575650215594188, + "grad_norm": 0.6681774854660034, + "learning_rate": 1.0600649521729836e-05, + "loss": 1.4342, + "mean_token_accuracy": 0.6574407368898392, + "num_tokens": 1613895809.0, + "step": 9627 + }, + { + "entropy": 1.7233494420846303, + "epoch": 1.057674878470792, + "grad_norm": 0.617106556892395, + "learning_rate": 1.0599049976242028e-05, + "loss": 1.3149, + "mean_token_accuracy": 0.6670562823613485, + "num_tokens": 1614054222.0, + "step": 9628 + }, + { + "entropy": 1.7311593393484752, + "epoch": 1.0577847353821648, + "grad_norm": 0.791827380657196, + "learning_rate": 1.0597450443444106e-05, + "loss": 1.3224, + "mean_token_accuracy": 0.6638512363036474, + "num_tokens": 1614246882.0, + "step": 9629 + }, + { + "entropy": 1.7481309374173482, + "epoch": 1.0578945922935377, + "grad_norm": 0.6565181016921997, + "learning_rate": 1.0595850923386699e-05, + "loss": 1.5325, + "mean_token_accuracy": 0.6478005150953928, + "num_tokens": 1614418530.0, + "step": 9630 + }, + { + "entropy": 1.7112793425718944, + "epoch": 1.0580044492049105, + "grad_norm": 0.8694975972175598, + "learning_rate": 1.0594251416120426e-05, + "loss": 1.5551, + "mean_token_accuracy": 0.6495432009299597, + "num_tokens": 1614553386.0, + "step": 9631 + }, + { + "entropy": 1.708936999241511, + "epoch": 1.0581143061162837, + "grad_norm": 0.6418928503990173, + "learning_rate": 1.0592651921695912e-05, + "loss": 1.4058, + "mean_token_accuracy": 0.658669908841451, + "num_tokens": 1614708336.0, + "step": 9632 + }, + { + "entropy": 1.6334985593954723, + "epoch": 1.0582241630276565, + "grad_norm": 0.7269279360771179, + "learning_rate": 1.0591052440163783e-05, + "loss": 1.2101, + "mean_token_accuracy": 0.6838384419679642, + "num_tokens": 1614846838.0, + "step": 9633 + }, + { + "entropy": 1.7367220322291057, + "epoch": 1.0583340199390294, + "grad_norm": 0.930919349193573, + "learning_rate": 1.0589452971574659e-05, + "loss": 1.1158, + "mean_token_accuracy": 0.6759131153424581, + "num_tokens": 1615008974.0, + "step": 9634 + }, + { + "entropy": 1.7149739066759746, + "epoch": 1.0584438768504023, + "grad_norm": 0.7244778871536255, + "learning_rate": 1.0587853515979163e-05, + "loss": 1.2775, + "mean_token_accuracy": 0.6694569289684296, + "num_tokens": 1615125425.0, + "step": 9635 + }, + { + "entropy": 1.7364940841992695, + "epoch": 1.0585537337617752, + "grad_norm": 0.7459930181503296, + "learning_rate": 1.0586254073427919e-05, + "loss": 1.5224, + "mean_token_accuracy": 0.6431458294391632, + "num_tokens": 1615289010.0, + "step": 9636 + }, + { + "entropy": 1.774601896603902, + "epoch": 1.0586635906731483, + "grad_norm": 0.6885215640068054, + "learning_rate": 1.0584654643971546e-05, + "loss": 1.5666, + "mean_token_accuracy": 0.6418042729298273, + "num_tokens": 1615450775.0, + "step": 9637 + }, + { + "entropy": 1.694488286972046, + "epoch": 1.0587734475845212, + "grad_norm": 0.6608237624168396, + "learning_rate": 1.0583055227660666e-05, + "loss": 1.3102, + "mean_token_accuracy": 0.6682019531726837, + "num_tokens": 1615623137.0, + "step": 9638 + }, + { + "entropy": 1.7278088827927907, + "epoch": 1.058883304495894, + "grad_norm": 0.697512686252594, + "learning_rate": 1.05814558245459e-05, + "loss": 1.4574, + "mean_token_accuracy": 0.6336638778448105, + "num_tokens": 1615802189.0, + "step": 9639 + }, + { + "entropy": 1.7499482830365498, + "epoch": 1.058993161407267, + "grad_norm": 0.6239208579063416, + "learning_rate": 1.057985643467787e-05, + "loss": 1.4644, + "mean_token_accuracy": 0.637439583738645, + "num_tokens": 1615995816.0, + "step": 9640 + }, + { + "entropy": 1.6262230277061462, + "epoch": 1.05910301831864, + "grad_norm": 0.8068125247955322, + "learning_rate": 1.0578257058107193e-05, + "loss": 1.269, + "mean_token_accuracy": 0.6698660800854365, + "num_tokens": 1616120440.0, + "step": 9641 + }, + { + "entropy": 1.6767154037952423, + "epoch": 1.059212875230013, + "grad_norm": 0.680270254611969, + "learning_rate": 1.0576657694884494e-05, + "loss": 1.2518, + "mean_token_accuracy": 0.6656695355971655, + "num_tokens": 1616265383.0, + "step": 9642 + }, + { + "entropy": 1.6900179982185364, + "epoch": 1.0593227321413858, + "grad_norm": 0.6536128520965576, + "learning_rate": 1.0575058345060386e-05, + "loss": 1.3902, + "mean_token_accuracy": 0.6535529990990957, + "num_tokens": 1616443547.0, + "step": 9643 + }, + { + "entropy": 1.7220917642116547, + "epoch": 1.0594325890527587, + "grad_norm": 0.6382746696472168, + "learning_rate": 1.0573459008685485e-05, + "loss": 1.5317, + "mean_token_accuracy": 0.6338553329308828, + "num_tokens": 1616648707.0, + "step": 9644 + }, + { + "entropy": 1.649655689795812, + "epoch": 1.0595424459641318, + "grad_norm": 0.6632347106933594, + "learning_rate": 1.057185968581042e-05, + "loss": 1.2947, + "mean_token_accuracy": 0.6613847464323044, + "num_tokens": 1616790214.0, + "step": 9645 + }, + { + "entropy": 1.686415046453476, + "epoch": 1.0596523028755047, + "grad_norm": 0.7371233105659485, + "learning_rate": 1.0570260376485801e-05, + "loss": 1.4037, + "mean_token_accuracy": 0.6672069480021795, + "num_tokens": 1616952135.0, + "step": 9646 + }, + { + "entropy": 1.651140828927358, + "epoch": 1.0597621597868776, + "grad_norm": 0.6713118553161621, + "learning_rate": 1.0568661080762246e-05, + "loss": 1.518, + "mean_token_accuracy": 0.6458087265491486, + "num_tokens": 1617144223.0, + "step": 9647 + }, + { + "entropy": 1.6354533036549885, + "epoch": 1.0598720166982505, + "grad_norm": 0.811916708946228, + "learning_rate": 1.0567061798690379e-05, + "loss": 1.207, + "mean_token_accuracy": 0.6824970146020254, + "num_tokens": 1617304698.0, + "step": 9648 + }, + { + "entropy": 1.7508118848005931, + "epoch": 1.0599818736096234, + "grad_norm": 0.6534659266471863, + "learning_rate": 1.0565462530320806e-05, + "loss": 1.4521, + "mean_token_accuracy": 0.6644961486260096, + "num_tokens": 1617462814.0, + "step": 9649 + }, + { + "entropy": 1.6995843648910522, + "epoch": 1.0600917305209965, + "grad_norm": 0.6688631772994995, + "learning_rate": 1.0563863275704147e-05, + "loss": 1.355, + "mean_token_accuracy": 0.6675297270218531, + "num_tokens": 1617591628.0, + "step": 9650 + }, + { + "entropy": 1.6866484085718791, + "epoch": 1.0602015874323694, + "grad_norm": 0.6986563205718994, + "learning_rate": 1.0562264034891024e-05, + "loss": 1.5377, + "mean_token_accuracy": 0.6672149201234182, + "num_tokens": 1617732897.0, + "step": 9651 + }, + { + "entropy": 1.6906297703584034, + "epoch": 1.0603114443437422, + "grad_norm": 0.7244104743003845, + "learning_rate": 1.0560664807932041e-05, + "loss": 1.6056, + "mean_token_accuracy": 0.6466974268356959, + "num_tokens": 1617916608.0, + "step": 9652 + }, + { + "entropy": 1.7492324809233348, + "epoch": 1.0604213012551151, + "grad_norm": 0.8112777471542358, + "learning_rate": 1.0559065594877822e-05, + "loss": 1.3205, + "mean_token_accuracy": 0.6659993777672449, + "num_tokens": 1618039898.0, + "step": 9653 + }, + { + "entropy": 1.704162836074829, + "epoch": 1.0605311581664882, + "grad_norm": 0.7873448729515076, + "learning_rate": 1.055746639577898e-05, + "loss": 1.1877, + "mean_token_accuracy": 0.6737140913804373, + "num_tokens": 1618223214.0, + "step": 9654 + }, + { + "entropy": 1.691829393307368, + "epoch": 1.0606410150778611, + "grad_norm": 0.9849948287010193, + "learning_rate": 1.0555867210686127e-05, + "loss": 1.3944, + "mean_token_accuracy": 0.6720232317845026, + "num_tokens": 1618379497.0, + "step": 9655 + }, + { + "entropy": 1.6794477105140686, + "epoch": 1.060750871989234, + "grad_norm": 0.6636627912521362, + "learning_rate": 1.055426803964987e-05, + "loss": 1.377, + "mean_token_accuracy": 0.6529742429653803, + "num_tokens": 1618604731.0, + "step": 9656 + }, + { + "entropy": 1.7792643805344899, + "epoch": 1.060860728900607, + "grad_norm": 0.6718061566352844, + "learning_rate": 1.0552668882720836e-05, + "loss": 1.3205, + "mean_token_accuracy": 0.6614718536535898, + "num_tokens": 1618716722.0, + "step": 9657 + }, + { + "entropy": 1.7258899211883545, + "epoch": 1.06097058581198, + "grad_norm": 0.7656777501106262, + "learning_rate": 1.0551069739949626e-05, + "loss": 1.4781, + "mean_token_accuracy": 0.6475807974735895, + "num_tokens": 1618904778.0, + "step": 9658 + }, + { + "entropy": 1.7146598994731903, + "epoch": 1.061080442723353, + "grad_norm": 0.7096765637397766, + "learning_rate": 1.0549470611386861e-05, + "loss": 1.4426, + "mean_token_accuracy": 0.6532019923130671, + "num_tokens": 1619050535.0, + "step": 9659 + }, + { + "entropy": 1.7547457814216614, + "epoch": 1.0611902996347258, + "grad_norm": 0.7451149225234985, + "learning_rate": 1.0547871497083147e-05, + "loss": 1.4693, + "mean_token_accuracy": 0.6539823710918427, + "num_tokens": 1619236759.0, + "step": 9660 + }, + { + "entropy": 1.675736625989278, + "epoch": 1.0613001565460987, + "grad_norm": 0.583570122718811, + "learning_rate": 1.0546272397089094e-05, + "loss": 1.4404, + "mean_token_accuracy": 0.6391840279102325, + "num_tokens": 1619447379.0, + "step": 9661 + }, + { + "entropy": 1.7973829209804535, + "epoch": 1.0614100134574715, + "grad_norm": 0.7883579134941101, + "learning_rate": 1.0544673311455313e-05, + "loss": 1.4437, + "mean_token_accuracy": 0.6529788474241892, + "num_tokens": 1619594379.0, + "step": 9662 + }, + { + "entropy": 1.7165298263231914, + "epoch": 1.0615198703688447, + "grad_norm": 0.7997109293937683, + "learning_rate": 1.0543074240232421e-05, + "loss": 1.6165, + "mean_token_accuracy": 0.6245667189359665, + "num_tokens": 1619800773.0, + "step": 9663 + }, + { + "entropy": 1.7447736859321594, + "epoch": 1.0616297272802175, + "grad_norm": 0.7188670039176941, + "learning_rate": 1.0541475183471022e-05, + "loss": 1.3859, + "mean_token_accuracy": 0.6584334820508957, + "num_tokens": 1619954114.0, + "step": 9664 + }, + { + "entropy": 1.6913307011127472, + "epoch": 1.0617395841915904, + "grad_norm": 0.748576283454895, + "learning_rate": 1.0539876141221726e-05, + "loss": 1.3055, + "mean_token_accuracy": 0.6666964242855707, + "num_tokens": 1620087595.0, + "step": 9665 + }, + { + "entropy": 1.6912503639856975, + "epoch": 1.0618494411029633, + "grad_norm": 0.6023520231246948, + "learning_rate": 1.0538277113535145e-05, + "loss": 1.2883, + "mean_token_accuracy": 0.6658832430839539, + "num_tokens": 1620223231.0, + "step": 9666 + }, + { + "entropy": 1.7426739136377971, + "epoch": 1.0619592980143364, + "grad_norm": 0.654004693031311, + "learning_rate": 1.0536678100461885e-05, + "loss": 1.4584, + "mean_token_accuracy": 0.6539844125509262, + "num_tokens": 1620423582.0, + "step": 9667 + }, + { + "entropy": 1.6927513976891835, + "epoch": 1.0620691549257093, + "grad_norm": 0.7091237902641296, + "learning_rate": 1.053507910205255e-05, + "loss": 1.4725, + "mean_token_accuracy": 0.6411188344160715, + "num_tokens": 1620653454.0, + "step": 9668 + }, + { + "entropy": 1.7462256650129955, + "epoch": 1.0621790118370822, + "grad_norm": 0.6891539692878723, + "learning_rate": 1.0533480118357757e-05, + "loss": 1.4739, + "mean_token_accuracy": 0.6464346051216125, + "num_tokens": 1620877012.0, + "step": 9669 + }, + { + "entropy": 1.671479304631551, + "epoch": 1.062288868748455, + "grad_norm": 0.6748672127723694, + "learning_rate": 1.0531881149428107e-05, + "loss": 1.4269, + "mean_token_accuracy": 0.6441149214903513, + "num_tokens": 1621057118.0, + "step": 9670 + }, + { + "entropy": 1.7517466147740681, + "epoch": 1.0623987256598282, + "grad_norm": 0.7266637086868286, + "learning_rate": 1.0530282195314206e-05, + "loss": 1.4868, + "mean_token_accuracy": 0.6445014526446661, + "num_tokens": 1621240078.0, + "step": 9671 + }, + { + "entropy": 1.6607401569684346, + "epoch": 1.062508582571201, + "grad_norm": 0.6144156455993652, + "learning_rate": 1.0528683256066666e-05, + "loss": 1.4003, + "mean_token_accuracy": 0.646324540177981, + "num_tokens": 1621410254.0, + "step": 9672 + }, + { + "entropy": 1.6924620866775513, + "epoch": 1.062618439482574, + "grad_norm": 0.8956554532051086, + "learning_rate": 1.0527084331736084e-05, + "loss": 1.3734, + "mean_token_accuracy": 0.6591135859489441, + "num_tokens": 1621581882.0, + "step": 9673 + }, + { + "entropy": 1.6320221523443859, + "epoch": 1.0627282963939468, + "grad_norm": 0.6442832350730896, + "learning_rate": 1.0525485422373069e-05, + "loss": 1.437, + "mean_token_accuracy": 0.6561227341492971, + "num_tokens": 1621796589.0, + "step": 9674 + }, + { + "entropy": 1.7225764592488606, + "epoch": 1.0628381533053197, + "grad_norm": 0.7406787872314453, + "learning_rate": 1.0523886528028231e-05, + "loss": 1.3413, + "mean_token_accuracy": 0.6588184783856074, + "num_tokens": 1621947807.0, + "step": 9675 + }, + { + "entropy": 1.662076324224472, + "epoch": 1.0629480102166928, + "grad_norm": 0.5989936590194702, + "learning_rate": 1.0522287648752165e-05, + "loss": 1.3468, + "mean_token_accuracy": 0.670153538386027, + "num_tokens": 1622139639.0, + "step": 9676 + }, + { + "entropy": 1.6708702047665913, + "epoch": 1.0630578671280657, + "grad_norm": 0.7162211537361145, + "learning_rate": 1.0520688784595484e-05, + "loss": 1.4332, + "mean_token_accuracy": 0.6640763978163401, + "num_tokens": 1622282259.0, + "step": 9677 + }, + { + "entropy": 1.7043314973513286, + "epoch": 1.0631677240394386, + "grad_norm": 0.6820145845413208, + "learning_rate": 1.0519089935608786e-05, + "loss": 1.4393, + "mean_token_accuracy": 0.6579158157110214, + "num_tokens": 1622477913.0, + "step": 9678 + }, + { + "entropy": 1.6165697475274403, + "epoch": 1.0632775809508115, + "grad_norm": 0.6088776588439941, + "learning_rate": 1.0517491101842672e-05, + "loss": 1.1827, + "mean_token_accuracy": 0.6940498252709707, + "num_tokens": 1622603559.0, + "step": 9679 + }, + { + "entropy": 1.7742246389389038, + "epoch": 1.0633874378621846, + "grad_norm": 0.635688841342926, + "learning_rate": 1.0515892283347752e-05, + "loss": 1.5138, + "mean_token_accuracy": 0.6343726913134257, + "num_tokens": 1622820459.0, + "step": 9680 + }, + { + "entropy": 1.6375213364760082, + "epoch": 1.0634972947735575, + "grad_norm": 0.7309091687202454, + "learning_rate": 1.051429348017462e-05, + "loss": 1.3063, + "mean_token_accuracy": 0.6767990191777548, + "num_tokens": 1623019426.0, + "step": 9681 + }, + { + "entropy": 1.6735208332538605, + "epoch": 1.0636071516849304, + "grad_norm": 0.8735835552215576, + "learning_rate": 1.051269469237388e-05, + "loss": 1.239, + "mean_token_accuracy": 0.6743840475877126, + "num_tokens": 1623154750.0, + "step": 9682 + }, + { + "entropy": 1.7003762324651082, + "epoch": 1.0637170085963032, + "grad_norm": 0.5599207878112793, + "learning_rate": 1.0511095919996135e-05, + "loss": 1.496, + "mean_token_accuracy": 0.6529860148827235, + "num_tokens": 1623356174.0, + "step": 9683 + }, + { + "entropy": 1.726138601700465, + "epoch": 1.0638268655076764, + "grad_norm": 0.6613774299621582, + "learning_rate": 1.0509497163091983e-05, + "loss": 1.3858, + "mean_token_accuracy": 0.6613100071748098, + "num_tokens": 1623511935.0, + "step": 9684 + }, + { + "entropy": 1.7090193430582683, + "epoch": 1.0639367224190492, + "grad_norm": 0.6702026724815369, + "learning_rate": 1.0507898421712023e-05, + "loss": 1.4125, + "mean_token_accuracy": 0.6465659638245901, + "num_tokens": 1623742462.0, + "step": 9685 + }, + { + "entropy": 1.7133901019891102, + "epoch": 1.0640465793304221, + "grad_norm": 0.732112467288971, + "learning_rate": 1.0506299695906859e-05, + "loss": 1.3698, + "mean_token_accuracy": 0.6699735869963964, + "num_tokens": 1623909947.0, + "step": 9686 + }, + { + "entropy": 1.7531900107860565, + "epoch": 1.064156436241795, + "grad_norm": 0.7114512324333191, + "learning_rate": 1.0504700985727087e-05, + "loss": 1.4496, + "mean_token_accuracy": 0.6486657311518987, + "num_tokens": 1624037188.0, + "step": 9687 + }, + { + "entropy": 1.6981233954429626, + "epoch": 1.064266293153168, + "grad_norm": 0.6429248452186584, + "learning_rate": 1.0503102291223302e-05, + "loss": 1.2754, + "mean_token_accuracy": 0.6718081583579382, + "num_tokens": 1624161312.0, + "step": 9688 + }, + { + "entropy": 1.7247523069381714, + "epoch": 1.064376150064541, + "grad_norm": 0.7967625856399536, + "learning_rate": 1.050150361244611e-05, + "loss": 1.317, + "mean_token_accuracy": 0.6654373556375504, + "num_tokens": 1624312357.0, + "step": 9689 + }, + { + "entropy": 1.7157942950725555, + "epoch": 1.064486006975914, + "grad_norm": 0.6844218373298645, + "learning_rate": 1.0499904949446102e-05, + "loss": 1.5019, + "mean_token_accuracy": 0.6439647078514099, + "num_tokens": 1624499884.0, + "step": 9690 + }, + { + "entropy": 1.7442339460055034, + "epoch": 1.0645958638872868, + "grad_norm": 0.8911353349685669, + "learning_rate": 1.0498306302273877e-05, + "loss": 1.4668, + "mean_token_accuracy": 0.6469593246777853, + "num_tokens": 1624631885.0, + "step": 9691 + }, + { + "entropy": 1.6330601076285045, + "epoch": 1.0647057207986597, + "grad_norm": 0.6595163345336914, + "learning_rate": 1.0496707670980032e-05, + "loss": 1.3273, + "mean_token_accuracy": 0.6642039865255356, + "num_tokens": 1624809064.0, + "step": 9692 + }, + { + "entropy": 1.677077313264211, + "epoch": 1.0648155777100328, + "grad_norm": 0.6782556176185608, + "learning_rate": 1.0495109055615162e-05, + "loss": 1.3071, + "mean_token_accuracy": 0.6632417937119802, + "num_tokens": 1624974504.0, + "step": 9693 + }, + { + "entropy": 1.6450525323549907, + "epoch": 1.0649254346214057, + "grad_norm": 0.7020920515060425, + "learning_rate": 1.0493510456229863e-05, + "loss": 1.2382, + "mean_token_accuracy": 0.6796882202227911, + "num_tokens": 1625109730.0, + "step": 9694 + }, + { + "entropy": 1.6747917830944061, + "epoch": 1.0650352915327785, + "grad_norm": 0.7329574227333069, + "learning_rate": 1.0491911872874732e-05, + "loss": 1.3547, + "mean_token_accuracy": 0.6668652594089508, + "num_tokens": 1625269037.0, + "step": 9695 + }, + { + "entropy": 1.6843490501244862, + "epoch": 1.0651451484441514, + "grad_norm": 0.75140780210495, + "learning_rate": 1.0490313305600357e-05, + "loss": 1.351, + "mean_token_accuracy": 0.6596083492040634, + "num_tokens": 1625423315.0, + "step": 9696 + }, + { + "entropy": 1.6949997544288635, + "epoch": 1.0652550053555245, + "grad_norm": 0.7336523532867432, + "learning_rate": 1.0488714754457338e-05, + "loss": 1.5667, + "mean_token_accuracy": 0.6420057465632757, + "num_tokens": 1625614264.0, + "step": 9697 + }, + { + "entropy": 1.67600150903066, + "epoch": 1.0653648622668974, + "grad_norm": 0.7469053864479065, + "learning_rate": 1.048711621949627e-05, + "loss": 1.3705, + "mean_token_accuracy": 0.6591821859280268, + "num_tokens": 1625755751.0, + "step": 9698 + }, + { + "entropy": 1.7073884507020314, + "epoch": 1.0654747191782703, + "grad_norm": 0.6153196692466736, + "learning_rate": 1.048551770076774e-05, + "loss": 1.4425, + "mean_token_accuracy": 0.6455186804135641, + "num_tokens": 1625925207.0, + "step": 9699 + }, + { + "entropy": 1.726867179075877, + "epoch": 1.0655845760896432, + "grad_norm": 0.9850101470947266, + "learning_rate": 1.048391919832234e-05, + "loss": 1.4454, + "mean_token_accuracy": 0.656451866030693, + "num_tokens": 1626065708.0, + "step": 9700 + }, + { + "entropy": 1.6206376453240712, + "epoch": 1.065694433001016, + "grad_norm": 0.6840752959251404, + "learning_rate": 1.0482320712210667e-05, + "loss": 1.2084, + "mean_token_accuracy": 0.689173142115275, + "num_tokens": 1626205586.0, + "step": 9701 + }, + { + "entropy": 1.7038061221440632, + "epoch": 1.0658042899123892, + "grad_norm": 0.7342552542686462, + "learning_rate": 1.0480722242483311e-05, + "loss": 1.2624, + "mean_token_accuracy": 0.6706964919964472, + "num_tokens": 1626338702.0, + "step": 9702 + }, + { + "entropy": 1.7553867201010387, + "epoch": 1.065914146823762, + "grad_norm": 0.6570054292678833, + "learning_rate": 1.0479123789190862e-05, + "loss": 1.3466, + "mean_token_accuracy": 0.6499860535065333, + "num_tokens": 1626493092.0, + "step": 9703 + }, + { + "entropy": 1.6693990528583527, + "epoch": 1.066024003735135, + "grad_norm": 0.5803617835044861, + "learning_rate": 1.0477525352383913e-05, + "loss": 1.3904, + "mean_token_accuracy": 0.6566586941480637, + "num_tokens": 1626717365.0, + "step": 9704 + }, + { + "entropy": 1.7204264203707378, + "epoch": 1.0661338606465078, + "grad_norm": 0.8450800776481628, + "learning_rate": 1.0475926932113048e-05, + "loss": 1.2561, + "mean_token_accuracy": 0.6803312748670578, + "num_tokens": 1626849414.0, + "step": 9705 + }, + { + "entropy": 1.6471377710501354, + "epoch": 1.066243717557881, + "grad_norm": 0.7381437420845032, + "learning_rate": 1.0474328528428857e-05, + "loss": 1.3509, + "mean_token_accuracy": 0.6623278111219406, + "num_tokens": 1626997472.0, + "step": 9706 + }, + { + "entropy": 1.648034284512202, + "epoch": 1.0663535744692538, + "grad_norm": 0.5970849394798279, + "learning_rate": 1.0472730141381934e-05, + "loss": 1.4102, + "mean_token_accuracy": 0.6520186911026636, + "num_tokens": 1627182309.0, + "step": 9707 + }, + { + "entropy": 1.6744927664597828, + "epoch": 1.0664634313806267, + "grad_norm": 0.7309461236000061, + "learning_rate": 1.0471131771022864e-05, + "loss": 1.5346, + "mean_token_accuracy": 0.6382336765527725, + "num_tokens": 1627436905.0, + "step": 9708 + }, + { + "entropy": 1.7173547446727753, + "epoch": 1.0665732882919996, + "grad_norm": 0.7076787948608398, + "learning_rate": 1.0469533417402233e-05, + "loss": 1.3205, + "mean_token_accuracy": 0.6694023460149765, + "num_tokens": 1627595498.0, + "step": 9709 + }, + { + "entropy": 1.7104172905286152, + "epoch": 1.0666831452033727, + "grad_norm": 0.7134943008422852, + "learning_rate": 1.0467935080570635e-05, + "loss": 1.4921, + "mean_token_accuracy": 0.6477473825216293, + "num_tokens": 1627769844.0, + "step": 9710 + }, + { + "entropy": 1.7438405752182007, + "epoch": 1.0667930021147456, + "grad_norm": 0.8070123791694641, + "learning_rate": 1.0466336760578651e-05, + "loss": 1.2289, + "mean_token_accuracy": 0.6676356494426727, + "num_tokens": 1627874501.0, + "step": 9711 + }, + { + "entropy": 1.6932413180669148, + "epoch": 1.0669028590261185, + "grad_norm": 0.6592340469360352, + "learning_rate": 1.0464738457476864e-05, + "loss": 1.3901, + "mean_token_accuracy": 0.6631415237983068, + "num_tokens": 1628013193.0, + "step": 9712 + }, + { + "entropy": 1.7657971382141113, + "epoch": 1.0670127159374914, + "grad_norm": 0.6301009654998779, + "learning_rate": 1.0463140171315869e-05, + "loss": 1.4882, + "mean_token_accuracy": 0.6417807191610336, + "num_tokens": 1628199079.0, + "step": 9713 + }, + { + "entropy": 1.7166667381922405, + "epoch": 1.0671225728488642, + "grad_norm": 0.6923064589500427, + "learning_rate": 1.0461541902146242e-05, + "loss": 1.4851, + "mean_token_accuracy": 0.6553737769524256, + "num_tokens": 1628345776.0, + "step": 9714 + }, + { + "entropy": 1.6853701770305634, + "epoch": 1.0672324297602374, + "grad_norm": 0.9525777697563171, + "learning_rate": 1.0459943650018571e-05, + "loss": 1.4003, + "mean_token_accuracy": 0.6418146789073944, + "num_tokens": 1628531537.0, + "step": 9715 + }, + { + "entropy": 1.6456667979558308, + "epoch": 1.0673422866716102, + "grad_norm": 0.7955992817878723, + "learning_rate": 1.0458345414983443e-05, + "loss": 1.4437, + "mean_token_accuracy": 0.6676210363705953, + "num_tokens": 1628676576.0, + "step": 9716 + }, + { + "entropy": 1.7199549674987793, + "epoch": 1.0674521435829831, + "grad_norm": 0.7301570773124695, + "learning_rate": 1.0456747197091437e-05, + "loss": 1.2948, + "mean_token_accuracy": 0.6685295353333155, + "num_tokens": 1628797611.0, + "step": 9717 + }, + { + "entropy": 1.709757685661316, + "epoch": 1.067562000494356, + "grad_norm": 0.6962376832962036, + "learning_rate": 1.0455148996393134e-05, + "loss": 1.4389, + "mean_token_accuracy": 0.6500872025887171, + "num_tokens": 1628968130.0, + "step": 9718 + }, + { + "entropy": 1.7100279132525127, + "epoch": 1.0676718574057291, + "grad_norm": 0.6749715805053711, + "learning_rate": 1.0453550812939123e-05, + "loss": 1.4132, + "mean_token_accuracy": 0.657318522532781, + "num_tokens": 1629132479.0, + "step": 9719 + }, + { + "entropy": 1.7299424409866333, + "epoch": 1.067781714317102, + "grad_norm": 0.7030600905418396, + "learning_rate": 1.0451952646779984e-05, + "loss": 1.5058, + "mean_token_accuracy": 0.6384957631429037, + "num_tokens": 1629288118.0, + "step": 9720 + }, + { + "entropy": 1.6266062955061595, + "epoch": 1.067891571228475, + "grad_norm": 0.5976377725601196, + "learning_rate": 1.0450354497966296e-05, + "loss": 1.252, + "mean_token_accuracy": 0.6704409321149191, + "num_tokens": 1629440160.0, + "step": 9721 + }, + { + "entropy": 1.7321474353472393, + "epoch": 1.0680014281398478, + "grad_norm": 0.7913817763328552, + "learning_rate": 1.044875636654864e-05, + "loss": 1.5509, + "mean_token_accuracy": 0.6350030054648718, + "num_tokens": 1629614991.0, + "step": 9722 + }, + { + "entropy": 1.7179767787456512, + "epoch": 1.0681112850512209, + "grad_norm": 0.7277995944023132, + "learning_rate": 1.0447158252577595e-05, + "loss": 1.397, + "mean_token_accuracy": 0.6543081154425939, + "num_tokens": 1629790080.0, + "step": 9723 + }, + { + "entropy": 1.7230423092842102, + "epoch": 1.0682211419625938, + "grad_norm": 0.6613367795944214, + "learning_rate": 1.0445560156103742e-05, + "loss": 1.5439, + "mean_token_accuracy": 0.6340614507595698, + "num_tokens": 1630005087.0, + "step": 9724 + }, + { + "entropy": 1.6821261048316956, + "epoch": 1.0683309988739667, + "grad_norm": 0.7301710844039917, + "learning_rate": 1.0443962077177662e-05, + "loss": 1.4798, + "mean_token_accuracy": 0.6496898879607519, + "num_tokens": 1630248651.0, + "step": 9725 + }, + { + "entropy": 1.6728065013885498, + "epoch": 1.0684408557853395, + "grad_norm": 0.6719411611557007, + "learning_rate": 1.0442364015849933e-05, + "loss": 1.5279, + "mean_token_accuracy": 0.6336749543746313, + "num_tokens": 1630450396.0, + "step": 9726 + }, + { + "entropy": 1.6456262568632762, + "epoch": 1.0685507126967124, + "grad_norm": 0.6920890808105469, + "learning_rate": 1.044076597217113e-05, + "loss": 1.4419, + "mean_token_accuracy": 0.6407264123360316, + "num_tokens": 1630661340.0, + "step": 9727 + }, + { + "entropy": 1.664272427558899, + "epoch": 1.0686605696080855, + "grad_norm": 0.8365959525108337, + "learning_rate": 1.0439167946191833e-05, + "loss": 1.2577, + "mean_token_accuracy": 0.6765812089045843, + "num_tokens": 1630830641.0, + "step": 9728 + }, + { + "entropy": 1.757549246152242, + "epoch": 1.0687704265194584, + "grad_norm": 0.8349844217300415, + "learning_rate": 1.0437569937962617e-05, + "loss": 1.4603, + "mean_token_accuracy": 0.6479651033878326, + "num_tokens": 1630990387.0, + "step": 9729 + }, + { + "entropy": 1.6663007040818532, + "epoch": 1.0688802834308313, + "grad_norm": 0.5842850208282471, + "learning_rate": 1.0435971947534056e-05, + "loss": 1.4114, + "mean_token_accuracy": 0.6464737504720688, + "num_tokens": 1631194523.0, + "step": 9730 + }, + { + "entropy": 1.7366156081358592, + "epoch": 1.0689901403422042, + "grad_norm": 0.6315268278121948, + "learning_rate": 1.0434373974956732e-05, + "loss": 1.3055, + "mean_token_accuracy": 0.6716218789418539, + "num_tokens": 1631350147.0, + "step": 9731 + }, + { + "entropy": 1.7530793845653534, + "epoch": 1.0690999972535773, + "grad_norm": 0.7210341691970825, + "learning_rate": 1.0432776020281217e-05, + "loss": 1.5249, + "mean_token_accuracy": 0.6390533894300461, + "num_tokens": 1631523960.0, + "step": 9732 + }, + { + "entropy": 1.6565876404444377, + "epoch": 1.0692098541649502, + "grad_norm": 0.5894297957420349, + "learning_rate": 1.0431178083558085e-05, + "loss": 1.5349, + "mean_token_accuracy": 0.6446276158094406, + "num_tokens": 1631694097.0, + "step": 9733 + }, + { + "entropy": 1.7121220330397289, + "epoch": 1.069319711076323, + "grad_norm": 0.8435656428337097, + "learning_rate": 1.0429580164837912e-05, + "loss": 1.5071, + "mean_token_accuracy": 0.6510050147771835, + "num_tokens": 1631875264.0, + "step": 9734 + }, + { + "entropy": 1.6705400049686432, + "epoch": 1.069429567987696, + "grad_norm": 0.7857469320297241, + "learning_rate": 1.0427982264171265e-05, + "loss": 1.4161, + "mean_token_accuracy": 0.6750262777010599, + "num_tokens": 1632003546.0, + "step": 9735 + }, + { + "entropy": 1.7056404054164886, + "epoch": 1.069539424899069, + "grad_norm": 0.7281525731086731, + "learning_rate": 1.0426384381608723e-05, + "loss": 1.3679, + "mean_token_accuracy": 0.655732790629069, + "num_tokens": 1632145433.0, + "step": 9736 + }, + { + "entropy": 1.6636238992214203, + "epoch": 1.069649281810442, + "grad_norm": 0.6364181041717529, + "learning_rate": 1.042478651720086e-05, + "loss": 1.421, + "mean_token_accuracy": 0.6475005000829697, + "num_tokens": 1632329209.0, + "step": 9737 + }, + { + "entropy": 1.724616914987564, + "epoch": 1.0697591387218148, + "grad_norm": 0.7049274444580078, + "learning_rate": 1.0423188670998243e-05, + "loss": 1.4647, + "mean_token_accuracy": 0.65199646850427, + "num_tokens": 1632496345.0, + "step": 9738 + }, + { + "entropy": 1.7012827197710674, + "epoch": 1.0698689956331877, + "grad_norm": 0.6477790474891663, + "learning_rate": 1.0421590843051443e-05, + "loss": 1.395, + "mean_token_accuracy": 0.6610339830319086, + "num_tokens": 1632699990.0, + "step": 9739 + }, + { + "entropy": 1.701954831679662, + "epoch": 1.0699788525445606, + "grad_norm": 0.7536341547966003, + "learning_rate": 1.0419993033411036e-05, + "loss": 1.4754, + "mean_token_accuracy": 0.6506841977437338, + "num_tokens": 1632878568.0, + "step": 9740 + }, + { + "entropy": 1.6800040304660797, + "epoch": 1.0700887094559337, + "grad_norm": 0.7120772004127502, + "learning_rate": 1.0418395242127586e-05, + "loss": 1.3509, + "mean_token_accuracy": 0.6788023312886556, + "num_tokens": 1633019001.0, + "step": 9741 + }, + { + "entropy": 1.6614234050114949, + "epoch": 1.0701985663673066, + "grad_norm": 0.5462971925735474, + "learning_rate": 1.0416797469251665e-05, + "loss": 1.3423, + "mean_token_accuracy": 0.6563159028689066, + "num_tokens": 1633205177.0, + "step": 9742 + }, + { + "entropy": 1.6736981471379597, + "epoch": 1.0703084232786795, + "grad_norm": 0.5824480056762695, + "learning_rate": 1.0415199714833839e-05, + "loss": 1.4978, + "mean_token_accuracy": 0.6424353569746017, + "num_tokens": 1633445240.0, + "step": 9743 + }, + { + "entropy": 1.6671138405799866, + "epoch": 1.0704182801900524, + "grad_norm": 0.6380411386489868, + "learning_rate": 1.0413601978924679e-05, + "loss": 1.2962, + "mean_token_accuracy": 0.6786693433920542, + "num_tokens": 1633617985.0, + "step": 9744 + }, + { + "entropy": 1.6733331382274628, + "epoch": 1.0705281371014255, + "grad_norm": 0.7337895035743713, + "learning_rate": 1.0412004261574756e-05, + "loss": 1.2254, + "mean_token_accuracy": 0.6779622882604599, + "num_tokens": 1633735989.0, + "step": 9745 + }, + { + "entropy": 1.7590945859750111, + "epoch": 1.0706379940127984, + "grad_norm": 0.7717983722686768, + "learning_rate": 1.041040656283463e-05, + "loss": 1.3141, + "mean_token_accuracy": 0.6691213548183441, + "num_tokens": 1633873474.0, + "step": 9746 + }, + { + "entropy": 1.7039412657419841, + "epoch": 1.0707478509241712, + "grad_norm": 0.8194169998168945, + "learning_rate": 1.040880888275487e-05, + "loss": 1.2822, + "mean_token_accuracy": 0.670879011352857, + "num_tokens": 1634023457.0, + "step": 9747 + }, + { + "entropy": 1.7141053279240925, + "epoch": 1.0708577078355441, + "grad_norm": 0.8548807501792908, + "learning_rate": 1.0407211221386045e-05, + "loss": 1.5504, + "mean_token_accuracy": 0.6367798795302709, + "num_tokens": 1634192980.0, + "step": 9748 + }, + { + "entropy": 1.7354576488335927, + "epoch": 1.0709675647469172, + "grad_norm": 0.7616019248962402, + "learning_rate": 1.0405613578778715e-05, + "loss": 1.4807, + "mean_token_accuracy": 0.6475146313508352, + "num_tokens": 1634353012.0, + "step": 9749 + }, + { + "entropy": 1.646492878595988, + "epoch": 1.0710774216582901, + "grad_norm": 0.6295143365859985, + "learning_rate": 1.0404015954983446e-05, + "loss": 1.2916, + "mean_token_accuracy": 0.6755774716536204, + "num_tokens": 1634499347.0, + "step": 9750 + }, + { + "entropy": 1.7243276337782543, + "epoch": 1.071187278569663, + "grad_norm": 0.7246981263160706, + "learning_rate": 1.0402418350050807e-05, + "loss": 1.3662, + "mean_token_accuracy": 0.6525032967329025, + "num_tokens": 1634629111.0, + "step": 9751 + }, + { + "entropy": 1.688949167728424, + "epoch": 1.071297135481036, + "grad_norm": 0.6447790265083313, + "learning_rate": 1.0400820764031359e-05, + "loss": 1.3286, + "mean_token_accuracy": 0.6776885588963827, + "num_tokens": 1634768536.0, + "step": 9752 + }, + { + "entropy": 1.6156230966250102, + "epoch": 1.0714069923924088, + "grad_norm": 0.6778178811073303, + "learning_rate": 1.039922319697566e-05, + "loss": 1.4044, + "mean_token_accuracy": 0.6686372607946396, + "num_tokens": 1634933718.0, + "step": 9753 + }, + { + "entropy": 1.7385461231072743, + "epoch": 1.0715168493037819, + "grad_norm": 0.7039634585380554, + "learning_rate": 1.0397625648934279e-05, + "loss": 1.4781, + "mean_token_accuracy": 0.6421026686827341, + "num_tokens": 1635106063.0, + "step": 9754 + }, + { + "entropy": 1.7002881566683452, + "epoch": 1.0716267062151548, + "grad_norm": 0.5606304407119751, + "learning_rate": 1.0396028119957775e-05, + "loss": 1.4618, + "mean_token_accuracy": 0.6472335507472357, + "num_tokens": 1635323066.0, + "step": 9755 + }, + { + "entropy": 1.720334788163503, + "epoch": 1.0717365631265277, + "grad_norm": 0.6186944842338562, + "learning_rate": 1.0394430610096704e-05, + "loss": 1.4535, + "mean_token_accuracy": 0.648893857995669, + "num_tokens": 1635538160.0, + "step": 9756 + }, + { + "entropy": 1.710259069999059, + "epoch": 1.0718464200379005, + "grad_norm": 0.7848376631736755, + "learning_rate": 1.0392833119401635e-05, + "loss": 1.2818, + "mean_token_accuracy": 0.6685756246248881, + "num_tokens": 1635684409.0, + "step": 9757 + }, + { + "entropy": 1.679042249917984, + "epoch": 1.0719562769492736, + "grad_norm": 0.6115646362304688, + "learning_rate": 1.0391235647923125e-05, + "loss": 1.3574, + "mean_token_accuracy": 0.6604473541180292, + "num_tokens": 1635818851.0, + "step": 9758 + }, + { + "entropy": 1.6841739316781361, + "epoch": 1.0720661338606465, + "grad_norm": 0.6064473390579224, + "learning_rate": 1.0389638195711731e-05, + "loss": 1.3239, + "mean_token_accuracy": 0.6654881288607916, + "num_tokens": 1636047560.0, + "step": 9759 + }, + { + "entropy": 1.7591745456059773, + "epoch": 1.0721759907720194, + "grad_norm": 0.7367409467697144, + "learning_rate": 1.0388040762818015e-05, + "loss": 1.4377, + "mean_token_accuracy": 0.6540845880905787, + "num_tokens": 1636161616.0, + "step": 9760 + }, + { + "entropy": 1.7117444177468617, + "epoch": 1.0722858476833923, + "grad_norm": 2.199622869491577, + "learning_rate": 1.0386443349292532e-05, + "loss": 1.2059, + "mean_token_accuracy": 0.6679257899522781, + "num_tokens": 1636339502.0, + "step": 9761 + }, + { + "entropy": 1.7388999263445537, + "epoch": 1.0723957045947654, + "grad_norm": 0.6870440244674683, + "learning_rate": 1.0384845955185838e-05, + "loss": 1.5565, + "mean_token_accuracy": 0.6418974051872889, + "num_tokens": 1636544108.0, + "step": 9762 + }, + { + "entropy": 1.6883673071861267, + "epoch": 1.0725055615061383, + "grad_norm": 0.6861622929573059, + "learning_rate": 1.0383248580548495e-05, + "loss": 1.2584, + "mean_token_accuracy": 0.6789047420024872, + "num_tokens": 1636730212.0, + "step": 9763 + }, + { + "entropy": 1.7474354803562164, + "epoch": 1.0726154184175112, + "grad_norm": 0.8235689401626587, + "learning_rate": 1.0381651225431055e-05, + "loss": 1.5772, + "mean_token_accuracy": 0.6468819305300713, + "num_tokens": 1636875441.0, + "step": 9764 + }, + { + "entropy": 1.7631232539812725, + "epoch": 1.072725275328884, + "grad_norm": 0.6376375555992126, + "learning_rate": 1.0380053889884077e-05, + "loss": 1.3514, + "mean_token_accuracy": 0.65848508477211, + "num_tokens": 1637056648.0, + "step": 9765 + }, + { + "entropy": 1.7007540861765544, + "epoch": 1.072835132240257, + "grad_norm": 0.6686075329780579, + "learning_rate": 1.0378456573958113e-05, + "loss": 1.404, + "mean_token_accuracy": 0.6725321859121323, + "num_tokens": 1637238698.0, + "step": 9766 + }, + { + "entropy": 1.7597143749396007, + "epoch": 1.07294498915163, + "grad_norm": 0.686677098274231, + "learning_rate": 1.037685927770372e-05, + "loss": 1.5594, + "mean_token_accuracy": 0.6368110875288645, + "num_tokens": 1637454987.0, + "step": 9767 + }, + { + "entropy": 1.736111968755722, + "epoch": 1.073054846063003, + "grad_norm": 0.7504826188087463, + "learning_rate": 1.0375262001171446e-05, + "loss": 1.7144, + "mean_token_accuracy": 0.6141124417384466, + "num_tokens": 1637622399.0, + "step": 9768 + }, + { + "entropy": 1.7867793242136638, + "epoch": 1.0731647029743758, + "grad_norm": 0.6543484926223755, + "learning_rate": 1.0373664744411851e-05, + "loss": 1.34, + "mean_token_accuracy": 0.667723630865415, + "num_tokens": 1637753172.0, + "step": 9769 + }, + { + "entropy": 1.6704789002736409, + "epoch": 1.0732745598857487, + "grad_norm": 0.6091136336326599, + "learning_rate": 1.0372067507475485e-05, + "loss": 1.2838, + "mean_token_accuracy": 0.6668292681376139, + "num_tokens": 1637948045.0, + "step": 9770 + }, + { + "entropy": 1.7835040887196858, + "epoch": 1.0733844167971218, + "grad_norm": 0.6930747032165527, + "learning_rate": 1.0370470290412898e-05, + "loss": 1.402, + "mean_token_accuracy": 0.6510342458883921, + "num_tokens": 1638092529.0, + "step": 9771 + }, + { + "entropy": 1.632412811120351, + "epoch": 1.0734942737084947, + "grad_norm": 0.6597646474838257, + "learning_rate": 1.0368873093274646e-05, + "loss": 1.2968, + "mean_token_accuracy": 0.6696435958147049, + "num_tokens": 1638232322.0, + "step": 9772 + }, + { + "entropy": 1.6681885520617168, + "epoch": 1.0736041306198676, + "grad_norm": 0.6119679808616638, + "learning_rate": 1.0367275916111272e-05, + "loss": 1.3269, + "mean_token_accuracy": 0.6568154295285543, + "num_tokens": 1638417721.0, + "step": 9773 + }, + { + "entropy": 1.7815796037515004, + "epoch": 1.0737139875312405, + "grad_norm": 0.6741944551467896, + "learning_rate": 1.036567875897333e-05, + "loss": 1.4554, + "mean_token_accuracy": 0.6437022139628729, + "num_tokens": 1638573018.0, + "step": 9774 + }, + { + "entropy": 1.658657729625702, + "epoch": 1.0738238444426136, + "grad_norm": 0.6922010779380798, + "learning_rate": 1.0364081621911372e-05, + "loss": 1.4375, + "mean_token_accuracy": 0.6637585858503977, + "num_tokens": 1638755544.0, + "step": 9775 + }, + { + "entropy": 1.6960961520671844, + "epoch": 1.0739337013539865, + "grad_norm": 0.7771033048629761, + "learning_rate": 1.0362484504975943e-05, + "loss": 1.3166, + "mean_token_accuracy": 0.6613381505012512, + "num_tokens": 1638893033.0, + "step": 9776 + }, + { + "entropy": 1.7312343815962474, + "epoch": 1.0740435582653594, + "grad_norm": 0.6502153873443604, + "learning_rate": 1.0360887408217592e-05, + "loss": 1.4751, + "mean_token_accuracy": 0.6359160343805949, + "num_tokens": 1639088128.0, + "step": 9777 + }, + { + "entropy": 1.7355634570121765, + "epoch": 1.0741534151767322, + "grad_norm": 0.6641053557395935, + "learning_rate": 1.0359290331686869e-05, + "loss": 1.5899, + "mean_token_accuracy": 0.6462592383225759, + "num_tokens": 1639267913.0, + "step": 9778 + }, + { + "entropy": 1.7013458808263142, + "epoch": 1.0742632720881051, + "grad_norm": 0.7097647786140442, + "learning_rate": 1.0357693275434315e-05, + "loss": 1.1678, + "mean_token_accuracy": 0.682997981707255, + "num_tokens": 1639367322.0, + "step": 9779 + }, + { + "entropy": 1.725685566663742, + "epoch": 1.0743731289994782, + "grad_norm": 0.6841909289360046, + "learning_rate": 1.0356096239510478e-05, + "loss": 1.3812, + "mean_token_accuracy": 0.6562798221906027, + "num_tokens": 1639546232.0, + "step": 9780 + }, + { + "entropy": 1.6580406824747722, + "epoch": 1.0744829859108511, + "grad_norm": 0.6318546533584595, + "learning_rate": 1.035449922396591e-05, + "loss": 1.3734, + "mean_token_accuracy": 0.668061430255572, + "num_tokens": 1639715575.0, + "step": 9781 + }, + { + "entropy": 1.750498543183009, + "epoch": 1.074592842822224, + "grad_norm": 0.7357514500617981, + "learning_rate": 1.0352902228851147e-05, + "loss": 1.2926, + "mean_token_accuracy": 0.6660207162300745, + "num_tokens": 1639837278.0, + "step": 9782 + }, + { + "entropy": 1.7398191094398499, + "epoch": 1.0747026997335969, + "grad_norm": 0.787640392780304, + "learning_rate": 1.0351305254216736e-05, + "loss": 1.3594, + "mean_token_accuracy": 0.6593456069628397, + "num_tokens": 1639989121.0, + "step": 9783 + }, + { + "entropy": 1.6935386459032695, + "epoch": 1.07481255664497, + "grad_norm": 0.6656416654586792, + "learning_rate": 1.0349708300113228e-05, + "loss": 1.3941, + "mean_token_accuracy": 0.6612067172924677, + "num_tokens": 1640152318.0, + "step": 9784 + }, + { + "entropy": 1.717629502216975, + "epoch": 1.0749224135563429, + "grad_norm": 0.7953632473945618, + "learning_rate": 1.0348111366591154e-05, + "loss": 1.5034, + "mean_token_accuracy": 0.6524255921443304, + "num_tokens": 1640280142.0, + "step": 9785 + }, + { + "entropy": 1.7047446469465892, + "epoch": 1.0750322704677158, + "grad_norm": 0.6888314485549927, + "learning_rate": 1.034651445370106e-05, + "loss": 1.4914, + "mean_token_accuracy": 0.6492563138405482, + "num_tokens": 1640440654.0, + "step": 9786 + }, + { + "entropy": 1.709777424732844, + "epoch": 1.0751421273790887, + "grad_norm": 0.5959087610244751, + "learning_rate": 1.0344917561493492e-05, + "loss": 1.4593, + "mean_token_accuracy": 0.6443605422973633, + "num_tokens": 1640638440.0, + "step": 9787 + }, + { + "entropy": 1.7329801519711812, + "epoch": 1.0752519842904618, + "grad_norm": 0.6564949750900269, + "learning_rate": 1.0343320690018988e-05, + "loss": 1.4895, + "mean_token_accuracy": 0.6578076879183451, + "num_tokens": 1640779771.0, + "step": 9788 + }, + { + "entropy": 1.722126881281535, + "epoch": 1.0753618412018346, + "grad_norm": 0.6596241593360901, + "learning_rate": 1.0341723839328086e-05, + "loss": 1.4626, + "mean_token_accuracy": 0.6395512421925863, + "num_tokens": 1640969536.0, + "step": 9789 + }, + { + "entropy": 1.6550799508889515, + "epoch": 1.0754716981132075, + "grad_norm": 0.6088923215866089, + "learning_rate": 1.0340127009471331e-05, + "loss": 1.4409, + "mean_token_accuracy": 0.6460276196400324, + "num_tokens": 1641181637.0, + "step": 9790 + }, + { + "entropy": 1.6901710430781047, + "epoch": 1.0755815550245804, + "grad_norm": 0.677692711353302, + "learning_rate": 1.0338530200499258e-05, + "loss": 1.3289, + "mean_token_accuracy": 0.670496458808581, + "num_tokens": 1641338802.0, + "step": 9791 + }, + { + "entropy": 1.7122070491313934, + "epoch": 1.0756914119359535, + "grad_norm": 0.8194560408592224, + "learning_rate": 1.0336933412462402e-05, + "loss": 1.3465, + "mean_token_accuracy": 0.6500441581010818, + "num_tokens": 1641506448.0, + "step": 9792 + }, + { + "entropy": 1.737431804339091, + "epoch": 1.0758012688473264, + "grad_norm": 0.6265955567359924, + "learning_rate": 1.0335336645411309e-05, + "loss": 1.3948, + "mean_token_accuracy": 0.6440109014511108, + "num_tokens": 1641693580.0, + "step": 9793 + }, + { + "entropy": 1.7306747833887737, + "epoch": 1.0759111257586993, + "grad_norm": 0.6415075063705444, + "learning_rate": 1.0333739899396511e-05, + "loss": 1.4351, + "mean_token_accuracy": 0.6500951796770096, + "num_tokens": 1641847952.0, + "step": 9794 + }, + { + "entropy": 1.720722109079361, + "epoch": 1.0760209826700722, + "grad_norm": 0.7167672514915466, + "learning_rate": 1.0332143174468545e-05, + "loss": 1.4079, + "mean_token_accuracy": 0.6544150163729986, + "num_tokens": 1642023634.0, + "step": 9795 + }, + { + "entropy": 1.654203087091446, + "epoch": 1.076130839581445, + "grad_norm": 0.581329345703125, + "learning_rate": 1.0330546470677946e-05, + "loss": 1.2703, + "mean_token_accuracy": 0.6772001385688782, + "num_tokens": 1642227381.0, + "step": 9796 + }, + { + "entropy": 1.7064690093199413, + "epoch": 1.0762406964928182, + "grad_norm": 0.6623792052268982, + "learning_rate": 1.0328949788075249e-05, + "loss": 1.4675, + "mean_token_accuracy": 0.6539207597573599, + "num_tokens": 1642396240.0, + "step": 9797 + }, + { + "entropy": 1.662476509809494, + "epoch": 1.076350553404191, + "grad_norm": 0.6676307320594788, + "learning_rate": 1.0327353126710988e-05, + "loss": 1.4906, + "mean_token_accuracy": 0.6464128841956457, + "num_tokens": 1642538098.0, + "step": 9798 + }, + { + "entropy": 1.671265075604121, + "epoch": 1.076460410315564, + "grad_norm": 0.6107703447341919, + "learning_rate": 1.03257564866357e-05, + "loss": 1.358, + "mean_token_accuracy": 0.6674867620070776, + "num_tokens": 1642748380.0, + "step": 9799 + }, + { + "entropy": 1.7271918257077534, + "epoch": 1.0765702672269368, + "grad_norm": 0.6320644617080688, + "learning_rate": 1.0324159867899914e-05, + "loss": 1.4831, + "mean_token_accuracy": 0.6485229134559631, + "num_tokens": 1642965144.0, + "step": 9800 + }, + { + "entropy": 1.6610159476598103, + "epoch": 1.07668012413831, + "grad_norm": 0.9616381525993347, + "learning_rate": 1.0322563270554167e-05, + "loss": 1.2259, + "mean_token_accuracy": 0.6779639472564062, + "num_tokens": 1643120751.0, + "step": 9801 + }, + { + "entropy": 1.6773190399010975, + "epoch": 1.0767899810496828, + "grad_norm": 0.7179288268089294, + "learning_rate": 1.0320966694648984e-05, + "loss": 1.2882, + "mean_token_accuracy": 0.6666442155838013, + "num_tokens": 1643266003.0, + "step": 9802 + }, + { + "entropy": 1.724582443634669, + "epoch": 1.0768998379610557, + "grad_norm": 0.6458866596221924, + "learning_rate": 1.03193701402349e-05, + "loss": 1.4392, + "mean_token_accuracy": 0.6562464485565821, + "num_tokens": 1643464964.0, + "step": 9803 + }, + { + "entropy": 1.7303006847699482, + "epoch": 1.0770096948724286, + "grad_norm": 0.7385509014129639, + "learning_rate": 1.0317773607362445e-05, + "loss": 1.5418, + "mean_token_accuracy": 0.6399680574735006, + "num_tokens": 1643676724.0, + "step": 9804 + }, + { + "entropy": 1.7020288407802582, + "epoch": 1.0771195517838017, + "grad_norm": 0.7342250347137451, + "learning_rate": 1.0316177096082142e-05, + "loss": 1.4731, + "mean_token_accuracy": 0.6379378736019135, + "num_tokens": 1643837858.0, + "step": 9805 + }, + { + "entropy": 1.6457345684369404, + "epoch": 1.0772294086951746, + "grad_norm": 0.6080856323242188, + "learning_rate": 1.0314580606444531e-05, + "loss": 1.3619, + "mean_token_accuracy": 0.6683526982863744, + "num_tokens": 1644029727.0, + "step": 9806 + }, + { + "entropy": 1.6607999900976818, + "epoch": 1.0773392656065475, + "grad_norm": 0.6070815324783325, + "learning_rate": 1.0312984138500137e-05, + "loss": 1.3415, + "mean_token_accuracy": 0.6586224585771561, + "num_tokens": 1644184892.0, + "step": 9807 + }, + { + "entropy": 1.7429456015427907, + "epoch": 1.0774491225179204, + "grad_norm": 0.6538578271865845, + "learning_rate": 1.0311387692299481e-05, + "loss": 1.4834, + "mean_token_accuracy": 0.6407992839813232, + "num_tokens": 1644349707.0, + "step": 9808 + }, + { + "entropy": 1.6649847229321797, + "epoch": 1.0775589794292935, + "grad_norm": 1.274907112121582, + "learning_rate": 1.0309791267893097e-05, + "loss": 1.2798, + "mean_token_accuracy": 0.6626059412956238, + "num_tokens": 1644577887.0, + "step": 9809 + }, + { + "entropy": 1.7314873437086742, + "epoch": 1.0776688363406663, + "grad_norm": 2.2484679222106934, + "learning_rate": 1.030819486533151e-05, + "loss": 1.0967, + "mean_token_accuracy": 0.6872533162434896, + "num_tokens": 1644760231.0, + "step": 9810 + }, + { + "entropy": 1.6759747962156932, + "epoch": 1.0777786932520392, + "grad_norm": 0.7670673131942749, + "learning_rate": 1.0306598484665237e-05, + "loss": 1.3782, + "mean_token_accuracy": 0.6530605256557465, + "num_tokens": 1644987826.0, + "step": 9811 + }, + { + "entropy": 1.702736069758733, + "epoch": 1.0778885501634121, + "grad_norm": 0.6072533130645752, + "learning_rate": 1.0305002125944815e-05, + "loss": 1.3696, + "mean_token_accuracy": 0.6654748469591141, + "num_tokens": 1645161886.0, + "step": 9812 + }, + { + "entropy": 1.6799784203370411, + "epoch": 1.077998407074785, + "grad_norm": 0.6296765804290771, + "learning_rate": 1.0303405789220762e-05, + "loss": 1.2927, + "mean_token_accuracy": 0.6687972942988077, + "num_tokens": 1645290821.0, + "step": 9813 + }, + { + "entropy": 1.7050584852695465, + "epoch": 1.078108263986158, + "grad_norm": 0.6202853918075562, + "learning_rate": 1.03018094745436e-05, + "loss": 1.3272, + "mean_token_accuracy": 0.6606413920720419, + "num_tokens": 1645434124.0, + "step": 9814 + }, + { + "entropy": 1.6356126467386882, + "epoch": 1.078218120897531, + "grad_norm": 0.6204116344451904, + "learning_rate": 1.0300213181963854e-05, + "loss": 1.4115, + "mean_token_accuracy": 0.661191796263059, + "num_tokens": 1645655256.0, + "step": 9815 + }, + { + "entropy": 1.6903501550356548, + "epoch": 1.0783279778089039, + "grad_norm": 0.8098730444908142, + "learning_rate": 1.0298616911532047e-05, + "loss": 1.3938, + "mean_token_accuracy": 0.6561371485392252, + "num_tokens": 1645794366.0, + "step": 9816 + }, + { + "entropy": 1.7309677203496296, + "epoch": 1.0784378347202768, + "grad_norm": 0.769802451133728, + "learning_rate": 1.0297020663298695e-05, + "loss": 1.3183, + "mean_token_accuracy": 0.654662013053894, + "num_tokens": 1646020983.0, + "step": 9817 + }, + { + "entropy": 1.688058316707611, + "epoch": 1.0785476916316499, + "grad_norm": 0.6309769749641418, + "learning_rate": 1.0295424437314326e-05, + "loss": 1.5407, + "mean_token_accuracy": 0.6352472951014837, + "num_tokens": 1646262332.0, + "step": 9818 + }, + { + "entropy": 1.6809982061386108, + "epoch": 1.0786575485430228, + "grad_norm": 0.7366631031036377, + "learning_rate": 1.0293828233629457e-05, + "loss": 1.3507, + "mean_token_accuracy": 0.6608734428882599, + "num_tokens": 1646417726.0, + "step": 9819 + }, + { + "entropy": 1.7148310641447704, + "epoch": 1.0787674054543956, + "grad_norm": 2.1722970008850098, + "learning_rate": 1.0292232052294603e-05, + "loss": 1.3295, + "mean_token_accuracy": 0.6597887873649597, + "num_tokens": 1646608030.0, + "step": 9820 + }, + { + "entropy": 1.6972604592641194, + "epoch": 1.0788772623657685, + "grad_norm": 0.7574262619018555, + "learning_rate": 1.0290635893360288e-05, + "loss": 1.3516, + "mean_token_accuracy": 0.6627415219942728, + "num_tokens": 1646720566.0, + "step": 9821 + }, + { + "entropy": 1.7421917816003163, + "epoch": 1.0789871192771416, + "grad_norm": 0.6693733930587769, + "learning_rate": 1.0289039756877026e-05, + "loss": 1.5256, + "mean_token_accuracy": 0.6440355281035105, + "num_tokens": 1646877724.0, + "step": 9822 + }, + { + "entropy": 1.7440255184968312, + "epoch": 1.0790969761885145, + "grad_norm": 0.8541271686553955, + "learning_rate": 1.0287443642895334e-05, + "loss": 1.6604, + "mean_token_accuracy": 0.6371288100878397, + "num_tokens": 1647087449.0, + "step": 9823 + }, + { + "entropy": 1.6293854117393494, + "epoch": 1.0792068330998874, + "grad_norm": 0.678485095500946, + "learning_rate": 1.0285847551465731e-05, + "loss": 1.3659, + "mean_token_accuracy": 0.6574168552954992, + "num_tokens": 1647232759.0, + "step": 9824 + }, + { + "entropy": 1.6905154486497243, + "epoch": 1.0793166900112603, + "grad_norm": 0.6696950197219849, + "learning_rate": 1.0284251482638731e-05, + "loss": 1.3161, + "mean_token_accuracy": 0.6695791979630789, + "num_tokens": 1647386243.0, + "step": 9825 + }, + { + "entropy": 1.7025805910428364, + "epoch": 1.0794265469226332, + "grad_norm": 0.6888556480407715, + "learning_rate": 1.028265543646485e-05, + "loss": 1.3759, + "mean_token_accuracy": 0.6593132664759954, + "num_tokens": 1647595259.0, + "step": 9826 + }, + { + "entropy": 1.7074210743109386, + "epoch": 1.0795364038340063, + "grad_norm": 0.8074763417243958, + "learning_rate": 1.02810594129946e-05, + "loss": 1.4319, + "mean_token_accuracy": 0.6571053018172582, + "num_tokens": 1647787234.0, + "step": 9827 + }, + { + "entropy": 1.6747658252716064, + "epoch": 1.0796462607453792, + "grad_norm": 0.6728916168212891, + "learning_rate": 1.0279463412278499e-05, + "loss": 1.4022, + "mean_token_accuracy": 0.6656891653935114, + "num_tokens": 1647936897.0, + "step": 9828 + }, + { + "entropy": 1.6993980407714844, + "epoch": 1.079756117656752, + "grad_norm": 0.6834884881973267, + "learning_rate": 1.0277867434367052e-05, + "loss": 1.4363, + "mean_token_accuracy": 0.6489211916923523, + "num_tokens": 1648124511.0, + "step": 9829 + }, + { + "entropy": 1.6820717453956604, + "epoch": 1.079865974568125, + "grad_norm": 0.7455261945724487, + "learning_rate": 1.0276271479310775e-05, + "loss": 1.1957, + "mean_token_accuracy": 0.6947454114754995, + "num_tokens": 1648246242.0, + "step": 9830 + }, + { + "entropy": 1.7340157429377239, + "epoch": 1.079975831479498, + "grad_norm": 0.7091799974441528, + "learning_rate": 1.0274675547160184e-05, + "loss": 1.3353, + "mean_token_accuracy": 0.6555547267198563, + "num_tokens": 1648388192.0, + "step": 9831 + }, + { + "entropy": 1.7246295909086864, + "epoch": 1.080085688390871, + "grad_norm": 0.6388477683067322, + "learning_rate": 1.0273079637965782e-05, + "loss": 1.5495, + "mean_token_accuracy": 0.6348314036925634, + "num_tokens": 1648607042.0, + "step": 9832 + }, + { + "entropy": 1.7295263310273488, + "epoch": 1.0801955453022438, + "grad_norm": 0.7163142561912537, + "learning_rate": 1.0271483751778082e-05, + "loss": 1.3953, + "mean_token_accuracy": 0.6609020779530207, + "num_tokens": 1648770793.0, + "step": 9833 + }, + { + "entropy": 1.6535666485627492, + "epoch": 1.0803054022136167, + "grad_norm": 0.6457258462905884, + "learning_rate": 1.0269887888647594e-05, + "loss": 1.2601, + "mean_token_accuracy": 0.6786867479483286, + "num_tokens": 1648918233.0, + "step": 9834 + }, + { + "entropy": 1.6825013260046642, + "epoch": 1.0804152591249898, + "grad_norm": 0.6680422425270081, + "learning_rate": 1.0268292048624825e-05, + "loss": 1.4099, + "mean_token_accuracy": 0.6632524182399114, + "num_tokens": 1649110520.0, + "step": 9835 + }, + { + "entropy": 1.700180431207021, + "epoch": 1.0805251160363627, + "grad_norm": 0.6605114340782166, + "learning_rate": 1.026669623176028e-05, + "loss": 1.4498, + "mean_token_accuracy": 0.6626182099183401, + "num_tokens": 1649282053.0, + "step": 9836 + }, + { + "entropy": 1.6609856685002644, + "epoch": 1.0806349729477356, + "grad_norm": 0.7333995699882507, + "learning_rate": 1.0265100438104474e-05, + "loss": 1.2677, + "mean_token_accuracy": 0.6766239404678345, + "num_tokens": 1649457935.0, + "step": 9837 + }, + { + "entropy": 1.693364332119624, + "epoch": 1.0807448298591085, + "grad_norm": 0.7352896928787231, + "learning_rate": 1.0263504667707904e-05, + "loss": 1.4006, + "mean_token_accuracy": 0.6677973767121633, + "num_tokens": 1649597344.0, + "step": 9838 + }, + { + "entropy": 1.6948012510935466, + "epoch": 1.0808546867704814, + "grad_norm": 0.7008348107337952, + "learning_rate": 1.026190892062108e-05, + "loss": 1.404, + "mean_token_accuracy": 0.6567817181348801, + "num_tokens": 1649740110.0, + "step": 9839 + }, + { + "entropy": 1.7075625856717427, + "epoch": 1.0809645436818545, + "grad_norm": 0.752145528793335, + "learning_rate": 1.0260313196894509e-05, + "loss": 1.3352, + "mean_token_accuracy": 0.6571847250064214, + "num_tokens": 1649877492.0, + "step": 9840 + }, + { + "entropy": 1.6945746143658955, + "epoch": 1.0810744005932273, + "grad_norm": 0.9085291624069214, + "learning_rate": 1.025871749657869e-05, + "loss": 1.3813, + "mean_token_accuracy": 0.6665191451708475, + "num_tokens": 1650073878.0, + "step": 9841 + }, + { + "entropy": 1.7060600022474925, + "epoch": 1.0811842575046002, + "grad_norm": 0.8114275336265564, + "learning_rate": 1.0257121819724125e-05, + "loss": 1.3438, + "mean_token_accuracy": 0.6567393392324448, + "num_tokens": 1650206487.0, + "step": 9842 + }, + { + "entropy": 1.6965892314910889, + "epoch": 1.0812941144159731, + "grad_norm": 0.6386088728904724, + "learning_rate": 1.0255526166381326e-05, + "loss": 1.4076, + "mean_token_accuracy": 0.6541461398204168, + "num_tokens": 1650355875.0, + "step": 9843 + }, + { + "entropy": 1.6367349326610565, + "epoch": 1.0814039713273462, + "grad_norm": 0.6096007823944092, + "learning_rate": 1.0253930536600785e-05, + "loss": 1.3704, + "mean_token_accuracy": 0.6580093254645666, + "num_tokens": 1650560937.0, + "step": 9844 + }, + { + "entropy": 1.6642896234989166, + "epoch": 1.081513828238719, + "grad_norm": 0.7645293474197388, + "learning_rate": 1.0252334930433005e-05, + "loss": 1.294, + "mean_token_accuracy": 0.6714354753494263, + "num_tokens": 1650749246.0, + "step": 9845 + }, + { + "entropy": 1.7149154146512349, + "epoch": 1.081623685150092, + "grad_norm": 0.6272317171096802, + "learning_rate": 1.0250739347928492e-05, + "loss": 1.4153, + "mean_token_accuracy": 0.6595138013362885, + "num_tokens": 1650926377.0, + "step": 9846 + }, + { + "entropy": 1.6645729045073192, + "epoch": 1.0817335420614649, + "grad_norm": 0.7638152241706848, + "learning_rate": 1.0249143789137736e-05, + "loss": 1.3517, + "mean_token_accuracy": 0.6574498365322748, + "num_tokens": 1651131120.0, + "step": 9847 + }, + { + "entropy": 1.7273275057474773, + "epoch": 1.081843398972838, + "grad_norm": 0.8124344944953918, + "learning_rate": 1.0247548254111242e-05, + "loss": 1.3278, + "mean_token_accuracy": 0.6669291456540426, + "num_tokens": 1651296563.0, + "step": 9848 + }, + { + "entropy": 1.7501426339149475, + "epoch": 1.0819532558842109, + "grad_norm": 0.8257563710212708, + "learning_rate": 1.0245952742899508e-05, + "loss": 1.4294, + "mean_token_accuracy": 0.6583471794923147, + "num_tokens": 1651419353.0, + "step": 9849 + }, + { + "entropy": 1.7237468461195629, + "epoch": 1.0820631127955838, + "grad_norm": 0.6573739051818848, + "learning_rate": 1.024435725555303e-05, + "loss": 1.4584, + "mean_token_accuracy": 0.6424074321985245, + "num_tokens": 1651615401.0, + "step": 9850 + }, + { + "entropy": 1.738576332728068, + "epoch": 1.0821729697069566, + "grad_norm": 0.7192042469978333, + "learning_rate": 1.0242761792122303e-05, + "loss": 1.4456, + "mean_token_accuracy": 0.6536912967761358, + "num_tokens": 1651759046.0, + "step": 9851 + }, + { + "entropy": 1.6815000077088673, + "epoch": 1.0822828266183295, + "grad_norm": 0.6549572944641113, + "learning_rate": 1.0241166352657825e-05, + "loss": 1.4403, + "mean_token_accuracy": 0.6523531973361969, + "num_tokens": 1651935854.0, + "step": 9852 + }, + { + "entropy": 1.7222835222880046, + "epoch": 1.0823926835297026, + "grad_norm": 0.5829499363899231, + "learning_rate": 1.023957093721009e-05, + "loss": 1.3928, + "mean_token_accuracy": 0.6541026532649994, + "num_tokens": 1652165819.0, + "step": 9853 + }, + { + "entropy": 1.7308641870816548, + "epoch": 1.0825025404410755, + "grad_norm": 0.5812973380088806, + "learning_rate": 1.023797554582959e-05, + "loss": 1.3832, + "mean_token_accuracy": 0.6509318649768829, + "num_tokens": 1652345735.0, + "step": 9854 + }, + { + "entropy": 1.696856160958608, + "epoch": 1.0826123973524484, + "grad_norm": 0.6950253844261169, + "learning_rate": 1.0236380178566825e-05, + "loss": 1.3401, + "mean_token_accuracy": 0.6734130581219991, + "num_tokens": 1652491455.0, + "step": 9855 + }, + { + "entropy": 1.7205109894275665, + "epoch": 1.0827222542638213, + "grad_norm": 0.7813112139701843, + "learning_rate": 1.023478483547228e-05, + "loss": 1.2933, + "mean_token_accuracy": 0.6657413095235825, + "num_tokens": 1652678524.0, + "step": 9856 + }, + { + "entropy": 1.7302239338556926, + "epoch": 1.0828321111751944, + "grad_norm": 0.6658751368522644, + "learning_rate": 1.0233189516596452e-05, + "loss": 1.3485, + "mean_token_accuracy": 0.6597934563954672, + "num_tokens": 1652856846.0, + "step": 9857 + }, + { + "entropy": 1.7021392385164897, + "epoch": 1.0829419680865673, + "grad_norm": 0.6967145800590515, + "learning_rate": 1.023159422198983e-05, + "loss": 1.4445, + "mean_token_accuracy": 0.6574411243200302, + "num_tokens": 1653065215.0, + "step": 9858 + }, + { + "entropy": 1.7235744297504425, + "epoch": 1.0830518249979402, + "grad_norm": 0.6974611282348633, + "learning_rate": 1.0229998951702902e-05, + "loss": 1.3955, + "mean_token_accuracy": 0.6510575066010157, + "num_tokens": 1653238703.0, + "step": 9859 + }, + { + "entropy": 1.7360788782437642, + "epoch": 1.083161681909313, + "grad_norm": 0.6182504892349243, + "learning_rate": 1.0228403705786165e-05, + "loss": 1.3991, + "mean_token_accuracy": 0.656227042277654, + "num_tokens": 1653453495.0, + "step": 9860 + }, + { + "entropy": 1.7091187338034313, + "epoch": 1.0832715388206862, + "grad_norm": 0.6004095077514648, + "learning_rate": 1.0226808484290097e-05, + "loss": 1.5411, + "mean_token_accuracy": 0.6317160924275717, + "num_tokens": 1653719905.0, + "step": 9861 + }, + { + "entropy": 1.6987085143725078, + "epoch": 1.083381395732059, + "grad_norm": 0.6815454959869385, + "learning_rate": 1.0225213287265194e-05, + "loss": 1.4007, + "mean_token_accuracy": 0.646802599231402, + "num_tokens": 1653908379.0, + "step": 9862 + }, + { + "entropy": 1.755209634701411, + "epoch": 1.083491252643432, + "grad_norm": 0.7040994167327881, + "learning_rate": 1.0223618114761947e-05, + "loss": 1.4096, + "mean_token_accuracy": 0.6451922804117203, + "num_tokens": 1654083065.0, + "step": 9863 + }, + { + "entropy": 1.680431107680003, + "epoch": 1.0836011095548048, + "grad_norm": 0.7204059362411499, + "learning_rate": 1.022202296683083e-05, + "loss": 1.4142, + "mean_token_accuracy": 0.6432947367429733, + "num_tokens": 1654261728.0, + "step": 9864 + }, + { + "entropy": 1.7355269491672516, + "epoch": 1.0837109664661777, + "grad_norm": 0.8181194067001343, + "learning_rate": 1.0220427843522338e-05, + "loss": 1.3979, + "mean_token_accuracy": 0.6544067362944285, + "num_tokens": 1654389163.0, + "step": 9865 + }, + { + "entropy": 1.6419854164123535, + "epoch": 1.0838208233775508, + "grad_norm": 0.6498574018478394, + "learning_rate": 1.0218832744886956e-05, + "loss": 1.2833, + "mean_token_accuracy": 0.6642761528491974, + "num_tokens": 1654574592.0, + "step": 9866 + }, + { + "entropy": 1.6292428175608318, + "epoch": 1.0839306802889237, + "grad_norm": 0.6409704685211182, + "learning_rate": 1.0217237670975158e-05, + "loss": 1.3204, + "mean_token_accuracy": 0.6685640662908554, + "num_tokens": 1654727006.0, + "step": 9867 + }, + { + "entropy": 1.704335480928421, + "epoch": 1.0840405372002966, + "grad_norm": 0.9147383570671082, + "learning_rate": 1.021564262183744e-05, + "loss": 1.5267, + "mean_token_accuracy": 0.6415324260791143, + "num_tokens": 1654887057.0, + "step": 9868 + }, + { + "entropy": 1.737158477306366, + "epoch": 1.0841503941116695, + "grad_norm": 0.6425780057907104, + "learning_rate": 1.0214047597524281e-05, + "loss": 1.3627, + "mean_token_accuracy": 0.6585712929566702, + "num_tokens": 1655026529.0, + "step": 9869 + }, + { + "entropy": 1.6733458836873372, + "epoch": 1.0842602510230426, + "grad_norm": 0.65185546875, + "learning_rate": 1.021245259808616e-05, + "loss": 1.5595, + "mean_token_accuracy": 0.6183687796195348, + "num_tokens": 1655310252.0, + "step": 9870 + }, + { + "entropy": 1.7314467032750447, + "epoch": 1.0843701079344155, + "grad_norm": 0.6148692965507507, + "learning_rate": 1.0210857623573558e-05, + "loss": 1.5472, + "mean_token_accuracy": 0.623336007197698, + "num_tokens": 1655485693.0, + "step": 9871 + }, + { + "entropy": 1.7008031606674194, + "epoch": 1.0844799648457883, + "grad_norm": 0.6279149651527405, + "learning_rate": 1.0209262674036961e-05, + "loss": 1.3351, + "mean_token_accuracy": 0.6545123358567556, + "num_tokens": 1655652876.0, + "step": 9872 + }, + { + "entropy": 1.7076662480831146, + "epoch": 1.0845898217571612, + "grad_norm": 0.7002870440483093, + "learning_rate": 1.0207667749526838e-05, + "loss": 1.5737, + "mean_token_accuracy": 0.6307255576054255, + "num_tokens": 1655813676.0, + "step": 9873 + }, + { + "entropy": 1.6783838669459026, + "epoch": 1.0846996786685343, + "grad_norm": 0.6915937662124634, + "learning_rate": 1.0206072850093676e-05, + "loss": 1.3763, + "mean_token_accuracy": 0.6629201124111811, + "num_tokens": 1655992944.0, + "step": 9874 + }, + { + "entropy": 1.6777593195438385, + "epoch": 1.0848095355799072, + "grad_norm": 0.8328781723976135, + "learning_rate": 1.0204477975787955e-05, + "loss": 1.5274, + "mean_token_accuracy": 0.6462227056423823, + "num_tokens": 1656134359.0, + "step": 9875 + }, + { + "entropy": 1.6952139933904011, + "epoch": 1.08491939249128, + "grad_norm": 0.6985744833946228, + "learning_rate": 1.0202883126660142e-05, + "loss": 1.4285, + "mean_token_accuracy": 0.641810322801272, + "num_tokens": 1656339761.0, + "step": 9876 + }, + { + "entropy": 1.6837367415428162, + "epoch": 1.085029249402653, + "grad_norm": 0.6384702324867249, + "learning_rate": 1.020128830276072e-05, + "loss": 1.4694, + "mean_token_accuracy": 0.645171602567037, + "num_tokens": 1656548333.0, + "step": 9877 + }, + { + "entropy": 1.650167852640152, + "epoch": 1.0851391063140259, + "grad_norm": 0.6957221627235413, + "learning_rate": 1.0199693504140165e-05, + "loss": 1.3905, + "mean_token_accuracy": 0.6591930339733759, + "num_tokens": 1656743652.0, + "step": 9878 + }, + { + "entropy": 1.6500220100084941, + "epoch": 1.085248963225399, + "grad_norm": 0.6716198325157166, + "learning_rate": 1.0198098730848947e-05, + "loss": 1.4271, + "mean_token_accuracy": 0.6517676363388697, + "num_tokens": 1656927088.0, + "step": 9879 + }, + { + "entropy": 1.7115015387535095, + "epoch": 1.0853588201367719, + "grad_norm": 0.6852779388427734, + "learning_rate": 1.0196503982937545e-05, + "loss": 1.3624, + "mean_token_accuracy": 0.6609803885221481, + "num_tokens": 1657077951.0, + "step": 9880 + }, + { + "entropy": 1.6878548562526703, + "epoch": 1.0854686770481448, + "grad_norm": 0.6171632409095764, + "learning_rate": 1.0194909260456428e-05, + "loss": 1.2938, + "mean_token_accuracy": 0.6820201476414999, + "num_tokens": 1657237315.0, + "step": 9881 + }, + { + "entropy": 1.6748429437478383, + "epoch": 1.0855785339595176, + "grad_norm": 0.6681820154190063, + "learning_rate": 1.0193314563456074e-05, + "loss": 1.3424, + "mean_token_accuracy": 0.6564933856328329, + "num_tokens": 1657386937.0, + "step": 9882 + }, + { + "entropy": 1.7561264435450237, + "epoch": 1.0856883908708908, + "grad_norm": 0.8080701231956482, + "learning_rate": 1.0191719891986947e-05, + "loss": 1.452, + "mean_token_accuracy": 0.6485906491676966, + "num_tokens": 1657566975.0, + "step": 9883 + }, + { + "entropy": 1.7150229513645172, + "epoch": 1.0857982477822636, + "grad_norm": 0.6068223714828491, + "learning_rate": 1.0190125246099525e-05, + "loss": 1.1525, + "mean_token_accuracy": 0.6776071439186732, + "num_tokens": 1657768813.0, + "step": 9884 + }, + { + "entropy": 1.7351139684518178, + "epoch": 1.0859081046936365, + "grad_norm": 0.706877589225769, + "learning_rate": 1.0188530625844269e-05, + "loss": 1.372, + "mean_token_accuracy": 0.6516173481941223, + "num_tokens": 1657909155.0, + "step": 9885 + }, + { + "entropy": 1.7090483804543812, + "epoch": 1.0860179616050094, + "grad_norm": 0.7106319665908813, + "learning_rate": 1.0186936031271654e-05, + "loss": 1.3158, + "mean_token_accuracy": 0.6649338553349177, + "num_tokens": 1658033454.0, + "step": 9886 + }, + { + "entropy": 1.7353846828142803, + "epoch": 1.0861278185163825, + "grad_norm": 0.7911872267723083, + "learning_rate": 1.0185341462432152e-05, + "loss": 1.3776, + "mean_token_accuracy": 0.6581158141295115, + "num_tokens": 1658185568.0, + "step": 9887 + }, + { + "entropy": 1.693009227514267, + "epoch": 1.0862376754277554, + "grad_norm": 0.7232357859611511, + "learning_rate": 1.018374691937622e-05, + "loss": 1.3956, + "mean_token_accuracy": 0.6537212679783503, + "num_tokens": 1658345153.0, + "step": 9888 + }, + { + "entropy": 1.6881878475348155, + "epoch": 1.0863475323391283, + "grad_norm": 0.6509016156196594, + "learning_rate": 1.0182152402154332e-05, + "loss": 1.4972, + "mean_token_accuracy": 0.6434793770313263, + "num_tokens": 1658541107.0, + "step": 9889 + }, + { + "entropy": 1.7099956174691517, + "epoch": 1.0864573892505012, + "grad_norm": 0.7336589694023132, + "learning_rate": 1.0180557910816955e-05, + "loss": 1.4108, + "mean_token_accuracy": 0.6635782122612, + "num_tokens": 1658686363.0, + "step": 9890 + }, + { + "entropy": 1.6696670254071553, + "epoch": 1.086567246161874, + "grad_norm": 0.7499677538871765, + "learning_rate": 1.0178963445414546e-05, + "loss": 1.297, + "mean_token_accuracy": 0.6625167379776636, + "num_tokens": 1658800203.0, + "step": 9891 + }, + { + "entropy": 1.7122901181379955, + "epoch": 1.0866771030732472, + "grad_norm": 0.7370545864105225, + "learning_rate": 1.0177369005997576e-05, + "loss": 1.4498, + "mean_token_accuracy": 0.6478169759114584, + "num_tokens": 1658963986.0, + "step": 9892 + }, + { + "entropy": 1.7717590828736622, + "epoch": 1.08678695998462, + "grad_norm": 0.72324138879776, + "learning_rate": 1.0175774592616509e-05, + "loss": 1.487, + "mean_token_accuracy": 0.6404697100321451, + "num_tokens": 1659147012.0, + "step": 9893 + }, + { + "entropy": 1.731563498576482, + "epoch": 1.086896816895993, + "grad_norm": 0.7684550881385803, + "learning_rate": 1.0174180205321801e-05, + "loss": 1.3065, + "mean_token_accuracy": 0.675625761349996, + "num_tokens": 1659280859.0, + "step": 9894 + }, + { + "entropy": 1.7210414310296376, + "epoch": 1.0870066738073658, + "grad_norm": 0.7703231573104858, + "learning_rate": 1.017258584416392e-05, + "loss": 1.3729, + "mean_token_accuracy": 0.6620122243960699, + "num_tokens": 1659426959.0, + "step": 9895 + }, + { + "entropy": 1.713478038708369, + "epoch": 1.087116530718739, + "grad_norm": 0.9372931718826294, + "learning_rate": 1.0170991509193324e-05, + "loss": 1.3353, + "mean_token_accuracy": 0.6640975425640742, + "num_tokens": 1659591324.0, + "step": 9896 + }, + { + "entropy": 1.6873964667320251, + "epoch": 1.0872263876301118, + "grad_norm": 0.5912502408027649, + "learning_rate": 1.0169397200460469e-05, + "loss": 1.3739, + "mean_token_accuracy": 0.6547368913888931, + "num_tokens": 1659753168.0, + "step": 9897 + }, + { + "entropy": 1.67390971382459, + "epoch": 1.0873362445414847, + "grad_norm": 0.6598351001739502, + "learning_rate": 1.0167802918015821e-05, + "loss": 1.2891, + "mean_token_accuracy": 0.672528882821401, + "num_tokens": 1659897374.0, + "step": 9898 + }, + { + "entropy": 1.6686325172583263, + "epoch": 1.0874461014528576, + "grad_norm": 0.7137023210525513, + "learning_rate": 1.0166208661909837e-05, + "loss": 1.2901, + "mean_token_accuracy": 0.6805033435424169, + "num_tokens": 1660054636.0, + "step": 9899 + }, + { + "entropy": 1.7360956966876984, + "epoch": 1.0875559583642307, + "grad_norm": 0.7920895218849182, + "learning_rate": 1.0164614432192973e-05, + "loss": 1.5854, + "mean_token_accuracy": 0.6431082089742025, + "num_tokens": 1660269145.0, + "step": 9900 + }, + { + "entropy": 1.7179748117923737, + "epoch": 1.0876658152756036, + "grad_norm": 0.738042414188385, + "learning_rate": 1.0163020228915686e-05, + "loss": 1.4252, + "mean_token_accuracy": 0.6560932546854019, + "num_tokens": 1660431180.0, + "step": 9901 + }, + { + "entropy": 1.7385500172773998, + "epoch": 1.0877756721869765, + "grad_norm": 0.6019150018692017, + "learning_rate": 1.0161426052128432e-05, + "loss": 1.4104, + "mean_token_accuracy": 0.6502055029074351, + "num_tokens": 1660601241.0, + "step": 9902 + }, + { + "entropy": 1.6881616115570068, + "epoch": 1.0878855290983493, + "grad_norm": 0.7434528470039368, + "learning_rate": 1.0159831901881663e-05, + "loss": 1.2115, + "mean_token_accuracy": 0.6797519276539484, + "num_tokens": 1660764313.0, + "step": 9903 + }, + { + "entropy": 1.7372826635837555, + "epoch": 1.0879953860097222, + "grad_norm": 0.7365524172782898, + "learning_rate": 1.0158237778225835e-05, + "loss": 1.515, + "mean_token_accuracy": 0.6425711264212927, + "num_tokens": 1660964668.0, + "step": 9904 + }, + { + "entropy": 1.7024028201897938, + "epoch": 1.0881052429210953, + "grad_norm": 0.8199495077133179, + "learning_rate": 1.0156643681211404e-05, + "loss": 1.3366, + "mean_token_accuracy": 0.6539936810731888, + "num_tokens": 1661112643.0, + "step": 9905 + }, + { + "entropy": 1.6678318579991658, + "epoch": 1.0882150998324682, + "grad_norm": 0.816861629486084, + "learning_rate": 1.0155049610888823e-05, + "loss": 1.2508, + "mean_token_accuracy": 0.6761003037293752, + "num_tokens": 1661236856.0, + "step": 9906 + }, + { + "entropy": 1.6942639748255413, + "epoch": 1.088324956743841, + "grad_norm": 0.7153278589248657, + "learning_rate": 1.0153455567308537e-05, + "loss": 1.4531, + "mean_token_accuracy": 0.6470590929190317, + "num_tokens": 1661388508.0, + "step": 9907 + }, + { + "entropy": 1.7340314586957295, + "epoch": 1.088434813655214, + "grad_norm": 0.6582464575767517, + "learning_rate": 1.0151861550521006e-05, + "loss": 1.5217, + "mean_token_accuracy": 0.6363318214813868, + "num_tokens": 1661604834.0, + "step": 9908 + }, + { + "entropy": 1.7097918391227722, + "epoch": 1.088544670566587, + "grad_norm": 0.6414450407028198, + "learning_rate": 1.0150267560576667e-05, + "loss": 1.5432, + "mean_token_accuracy": 0.6336255719264349, + "num_tokens": 1661801434.0, + "step": 9909 + }, + { + "entropy": 1.7380212744077046, + "epoch": 1.08865452747796, + "grad_norm": 0.7711119055747986, + "learning_rate": 1.014867359752598e-05, + "loss": 1.35, + "mean_token_accuracy": 0.6672601054112116, + "num_tokens": 1661937596.0, + "step": 9910 + }, + { + "entropy": 1.678007831176122, + "epoch": 1.0887643843893329, + "grad_norm": 0.8995655179023743, + "learning_rate": 1.0147079661419393e-05, + "loss": 1.4534, + "mean_token_accuracy": 0.6521992137034734, + "num_tokens": 1662131802.0, + "step": 9911 + }, + { + "entropy": 1.7503215471903484, + "epoch": 1.0888742413007058, + "grad_norm": 0.7043768167495728, + "learning_rate": 1.0145485752307347e-05, + "loss": 1.3609, + "mean_token_accuracy": 0.6602404067913691, + "num_tokens": 1662303751.0, + "step": 9912 + }, + { + "entropy": 1.7225368320941925, + "epoch": 1.0889840982120789, + "grad_norm": 0.6886836290359497, + "learning_rate": 1.0143891870240293e-05, + "loss": 1.5237, + "mean_token_accuracy": 0.6407229552666346, + "num_tokens": 1662476819.0, + "step": 9913 + }, + { + "entropy": 1.7340431312719982, + "epoch": 1.0890939551234518, + "grad_norm": 0.7423052787780762, + "learning_rate": 1.0142298015268678e-05, + "loss": 1.3407, + "mean_token_accuracy": 0.6658698171377182, + "num_tokens": 1662668061.0, + "step": 9914 + }, + { + "entropy": 1.6915172338485718, + "epoch": 1.0892038120348246, + "grad_norm": 0.640897274017334, + "learning_rate": 1.0140704187442942e-05, + "loss": 1.3072, + "mean_token_accuracy": 0.6653468410174052, + "num_tokens": 1662812165.0, + "step": 9915 + }, + { + "entropy": 1.754497468471527, + "epoch": 1.0893136689461975, + "grad_norm": 0.7400673627853394, + "learning_rate": 1.0139110386813528e-05, + "loss": 1.4021, + "mean_token_accuracy": 0.6460580776135126, + "num_tokens": 1662991292.0, + "step": 9916 + }, + { + "entropy": 1.70304274559021, + "epoch": 1.0894235258575704, + "grad_norm": 0.7350078821182251, + "learning_rate": 1.0137516613430887e-05, + "loss": 1.3661, + "mean_token_accuracy": 0.6609525481859843, + "num_tokens": 1663118044.0, + "step": 9917 + }, + { + "entropy": 1.7494067947069805, + "epoch": 1.0895333827689435, + "grad_norm": 0.81744784116745, + "learning_rate": 1.0135922867345455e-05, + "loss": 1.5288, + "mean_token_accuracy": 0.6569081693887711, + "num_tokens": 1663262827.0, + "step": 9918 + }, + { + "entropy": 1.7277030646800995, + "epoch": 1.0896432396803164, + "grad_norm": 0.9427797794342041, + "learning_rate": 1.0134329148607675e-05, + "loss": 1.4552, + "mean_token_accuracy": 0.6570529192686081, + "num_tokens": 1663396238.0, + "step": 9919 + }, + { + "entropy": 1.661819577217102, + "epoch": 1.0897530965916893, + "grad_norm": 0.7879918217658997, + "learning_rate": 1.0132735457267988e-05, + "loss": 1.3381, + "mean_token_accuracy": 0.6635206490755081, + "num_tokens": 1663526020.0, + "step": 9920 + }, + { + "entropy": 1.6900553206602733, + "epoch": 1.0898629535030622, + "grad_norm": 0.6344413161277771, + "learning_rate": 1.0131141793376833e-05, + "loss": 1.3869, + "mean_token_accuracy": 0.6595876067876816, + "num_tokens": 1663719329.0, + "step": 9921 + }, + { + "entropy": 1.7128386199474335, + "epoch": 1.0899728104144353, + "grad_norm": 0.658137321472168, + "learning_rate": 1.012954815698465e-05, + "loss": 1.5023, + "mean_token_accuracy": 0.6380962332089742, + "num_tokens": 1663912510.0, + "step": 9922 + }, + { + "entropy": 1.664399077494939, + "epoch": 1.0900826673258082, + "grad_norm": 0.7193596363067627, + "learning_rate": 1.0127954548141872e-05, + "loss": 1.4288, + "mean_token_accuracy": 0.6621369272470474, + "num_tokens": 1664042226.0, + "step": 9923 + }, + { + "entropy": 1.6875610550244649, + "epoch": 1.090192524237181, + "grad_norm": 0.6304190158843994, + "learning_rate": 1.012636096689894e-05, + "loss": 1.3007, + "mean_token_accuracy": 0.658969427148501, + "num_tokens": 1664197536.0, + "step": 9924 + }, + { + "entropy": 1.6750660041968028, + "epoch": 1.090302381148554, + "grad_norm": 0.6103596091270447, + "learning_rate": 1.0124767413306294e-05, + "loss": 1.5455, + "mean_token_accuracy": 0.636797179778417, + "num_tokens": 1664372063.0, + "step": 9925 + }, + { + "entropy": 1.7024609645207722, + "epoch": 1.090412238059927, + "grad_norm": 0.7331560850143433, + "learning_rate": 1.0123173887414361e-05, + "loss": 1.2627, + "mean_token_accuracy": 0.6728994299968084, + "num_tokens": 1664500629.0, + "step": 9926 + }, + { + "entropy": 1.7319872776667278, + "epoch": 1.0905220949713, + "grad_norm": 0.6502282619476318, + "learning_rate": 1.012158038927358e-05, + "loss": 1.3303, + "mean_token_accuracy": 0.6671723872423172, + "num_tokens": 1664704233.0, + "step": 9927 + }, + { + "entropy": 1.7720895409584045, + "epoch": 1.0906319518826728, + "grad_norm": 0.8043599128723145, + "learning_rate": 1.0119986918934386e-05, + "loss": 1.446, + "mean_token_accuracy": 0.6564847181240717, + "num_tokens": 1664856266.0, + "step": 9928 + }, + { + "entropy": 1.7761450012524922, + "epoch": 1.0907418087940457, + "grad_norm": 0.6992666721343994, + "learning_rate": 1.0118393476447204e-05, + "loss": 1.3832, + "mean_token_accuracy": 0.6636711110671362, + "num_tokens": 1665006546.0, + "step": 9929 + }, + { + "entropy": 1.6626348197460175, + "epoch": 1.0908516657054186, + "grad_norm": 0.6073324680328369, + "learning_rate": 1.0116800061862475e-05, + "loss": 1.2507, + "mean_token_accuracy": 0.67312224706014, + "num_tokens": 1665181716.0, + "step": 9930 + }, + { + "entropy": 1.6563841303189595, + "epoch": 1.0909615226167917, + "grad_norm": 0.6241437196731567, + "learning_rate": 1.0115206675230626e-05, + "loss": 1.3984, + "mean_token_accuracy": 0.6542405039072037, + "num_tokens": 1665356676.0, + "step": 9931 + }, + { + "entropy": 1.689384828011195, + "epoch": 1.0910713795281646, + "grad_norm": 0.7169914245605469, + "learning_rate": 1.011361331660209e-05, + "loss": 1.3182, + "mean_token_accuracy": 0.6685070743163427, + "num_tokens": 1665542132.0, + "step": 9932 + }, + { + "entropy": 1.7403077880541484, + "epoch": 1.0911812364395375, + "grad_norm": 0.6693525910377502, + "learning_rate": 1.0112019986027289e-05, + "loss": 1.5033, + "mean_token_accuracy": 0.6436150471369425, + "num_tokens": 1665764372.0, + "step": 9933 + }, + { + "entropy": 1.7170814077059429, + "epoch": 1.0912910933509103, + "grad_norm": 0.6054666638374329, + "learning_rate": 1.0110426683556657e-05, + "loss": 1.3651, + "mean_token_accuracy": 0.6551655034224192, + "num_tokens": 1665966456.0, + "step": 9934 + }, + { + "entropy": 1.7085198163986206, + "epoch": 1.0914009502622835, + "grad_norm": 0.6800384521484375, + "learning_rate": 1.0108833409240617e-05, + "loss": 1.36, + "mean_token_accuracy": 0.6609861155351003, + "num_tokens": 1666121414.0, + "step": 9935 + }, + { + "entropy": 1.7367458045482635, + "epoch": 1.0915108071736563, + "grad_norm": 0.5863933563232422, + "learning_rate": 1.0107240163129599e-05, + "loss": 1.4401, + "mean_token_accuracy": 0.6531588186820348, + "num_tokens": 1666295279.0, + "step": 9936 + }, + { + "entropy": 1.7235424220561981, + "epoch": 1.0916206640850292, + "grad_norm": 0.7675713896751404, + "learning_rate": 1.010564694527403e-05, + "loss": 1.5024, + "mean_token_accuracy": 0.6675131072600683, + "num_tokens": 1666457168.0, + "step": 9937 + }, + { + "entropy": 1.682725340127945, + "epoch": 1.091730520996402, + "grad_norm": 0.7586541175842285, + "learning_rate": 1.0104053755724332e-05, + "loss": 1.3828, + "mean_token_accuracy": 0.6533033003409704, + "num_tokens": 1666667783.0, + "step": 9938 + }, + { + "entropy": 1.736223300298055, + "epoch": 1.0918403779077752, + "grad_norm": 0.7098135948181152, + "learning_rate": 1.0102460594530926e-05, + "loss": 1.3948, + "mean_token_accuracy": 0.6576603204011917, + "num_tokens": 1666801846.0, + "step": 9939 + }, + { + "entropy": 1.7084941665331523, + "epoch": 1.091950234819148, + "grad_norm": 0.724420964717865, + "learning_rate": 1.0100867461744241e-05, + "loss": 1.4758, + "mean_token_accuracy": 0.646695002913475, + "num_tokens": 1666982440.0, + "step": 9940 + }, + { + "entropy": 1.74443985025088, + "epoch": 1.092060091730521, + "grad_norm": 0.7071523666381836, + "learning_rate": 1.0099274357414692e-05, + "loss": 1.4043, + "mean_token_accuracy": 0.6590453336636225, + "num_tokens": 1667133865.0, + "step": 9941 + }, + { + "entropy": 1.6637418170770009, + "epoch": 1.0921699486418939, + "grad_norm": 0.591380774974823, + "learning_rate": 1.0097681281592706e-05, + "loss": 1.3282, + "mean_token_accuracy": 0.6629678010940552, + "num_tokens": 1667279421.0, + "step": 9942 + }, + { + "entropy": 1.6820252339045207, + "epoch": 1.0922798055532668, + "grad_norm": 0.6717654466629028, + "learning_rate": 1.0096088234328702e-05, + "loss": 1.4755, + "mean_token_accuracy": 0.6481594145298004, + "num_tokens": 1667473211.0, + "step": 9943 + }, + { + "entropy": 1.709025154511134, + "epoch": 1.0923896624646399, + "grad_norm": 0.6753855347633362, + "learning_rate": 1.0094495215673097e-05, + "loss": 1.2966, + "mean_token_accuracy": 0.667145162820816, + "num_tokens": 1667604956.0, + "step": 9944 + }, + { + "entropy": 1.625400871038437, + "epoch": 1.0924995193760128, + "grad_norm": 0.64048832654953, + "learning_rate": 1.009290222567631e-05, + "loss": 1.3972, + "mean_token_accuracy": 0.660579577088356, + "num_tokens": 1667823844.0, + "step": 9945 + }, + { + "entropy": 1.7338625093301137, + "epoch": 1.0926093762873856, + "grad_norm": 0.7985219359397888, + "learning_rate": 1.009130926438876e-05, + "loss": 1.6674, + "mean_token_accuracy": 0.6493135193983713, + "num_tokens": 1668007284.0, + "step": 9946 + }, + { + "entropy": 1.6951390206813812, + "epoch": 1.0927192331987585, + "grad_norm": 0.683193027973175, + "learning_rate": 1.008971633186086e-05, + "loss": 1.2785, + "mean_token_accuracy": 0.6708967983722687, + "num_tokens": 1668145759.0, + "step": 9947 + }, + { + "entropy": 1.6314593156178792, + "epoch": 1.0928290901101316, + "grad_norm": 0.7132555842399597, + "learning_rate": 1.0088123428143029e-05, + "loss": 1.3441, + "mean_token_accuracy": 0.681462566057841, + "num_tokens": 1668277008.0, + "step": 9948 + }, + { + "entropy": 1.7529467344284058, + "epoch": 1.0929389470215045, + "grad_norm": 0.670924186706543, + "learning_rate": 1.008653055328568e-05, + "loss": 1.429, + "mean_token_accuracy": 0.641497532526652, + "num_tokens": 1668483054.0, + "step": 9949 + }, + { + "entropy": 1.6520490248998005, + "epoch": 1.0930488039328774, + "grad_norm": 0.8519325256347656, + "learning_rate": 1.0084937707339229e-05, + "loss": 1.4219, + "mean_token_accuracy": 0.6672419607639313, + "num_tokens": 1668700174.0, + "step": 9950 + }, + { + "entropy": 1.736632893482844, + "epoch": 1.0931586608442503, + "grad_norm": 0.7080869674682617, + "learning_rate": 1.0083344890354086e-05, + "loss": 1.4226, + "mean_token_accuracy": 0.6710045486688614, + "num_tokens": 1668855553.0, + "step": 9951 + }, + { + "entropy": 1.742478887240092, + "epoch": 1.0932685177556234, + "grad_norm": 0.6985324025154114, + "learning_rate": 1.0081752102380667e-05, + "loss": 1.3687, + "mean_token_accuracy": 0.6526035120089849, + "num_tokens": 1669025165.0, + "step": 9952 + }, + { + "entropy": 1.7326435049374898, + "epoch": 1.0933783746669963, + "grad_norm": 0.6467759609222412, + "learning_rate": 1.0080159343469373e-05, + "loss": 1.3327, + "mean_token_accuracy": 0.6626055538654327, + "num_tokens": 1669163361.0, + "step": 9953 + }, + { + "entropy": 1.6840360065301259, + "epoch": 1.0934882315783692, + "grad_norm": 0.6494070291519165, + "learning_rate": 1.0078566613670626e-05, + "loss": 1.4666, + "mean_token_accuracy": 0.6533608982960383, + "num_tokens": 1669347018.0, + "step": 9954 + }, + { + "entropy": 1.717919021844864, + "epoch": 1.093598088489742, + "grad_norm": 0.6406670808792114, + "learning_rate": 1.0076973913034833e-05, + "loss": 1.3224, + "mean_token_accuracy": 0.6631946166356405, + "num_tokens": 1669490134.0, + "step": 9955 + }, + { + "entropy": 1.7073165476322174, + "epoch": 1.093707945401115, + "grad_norm": 0.7670049667358398, + "learning_rate": 1.0075381241612396e-05, + "loss": 1.3305, + "mean_token_accuracy": 0.6583481182654699, + "num_tokens": 1669620317.0, + "step": 9956 + }, + { + "entropy": 1.7269740998744965, + "epoch": 1.093817802312488, + "grad_norm": 0.8087154626846313, + "learning_rate": 1.0073788599453727e-05, + "loss": 1.3544, + "mean_token_accuracy": 0.6522675156593323, + "num_tokens": 1669805743.0, + "step": 9957 + }, + { + "entropy": 1.7226569155852, + "epoch": 1.093927659223861, + "grad_norm": 0.6575363874435425, + "learning_rate": 1.0072195986609235e-05, + "loss": 1.6043, + "mean_token_accuracy": 0.6334054693579674, + "num_tokens": 1670020161.0, + "step": 9958 + }, + { + "entropy": 1.7135057151317596, + "epoch": 1.0940375161352338, + "grad_norm": 0.6211276054382324, + "learning_rate": 1.0070603403129315e-05, + "loss": 1.4269, + "mean_token_accuracy": 0.6416071703036627, + "num_tokens": 1670240465.0, + "step": 9959 + }, + { + "entropy": 1.725637008746465, + "epoch": 1.0941473730466067, + "grad_norm": 0.7707021236419678, + "learning_rate": 1.0069010849064382e-05, + "loss": 1.3579, + "mean_token_accuracy": 0.6549892872571945, + "num_tokens": 1670366147.0, + "step": 9960 + }, + { + "entropy": 1.7043171326319377, + "epoch": 1.0942572299579798, + "grad_norm": 0.7570623755455017, + "learning_rate": 1.0067418324464838e-05, + "loss": 1.3259, + "mean_token_accuracy": 0.6776840935150782, + "num_tokens": 1670481710.0, + "step": 9961 + }, + { + "entropy": 1.7652468581994374, + "epoch": 1.0943670868693527, + "grad_norm": 0.6389201879501343, + "learning_rate": 1.0065825829381082e-05, + "loss": 1.5209, + "mean_token_accuracy": 0.6202053825060526, + "num_tokens": 1670709151.0, + "step": 9962 + }, + { + "entropy": 1.7265647252400715, + "epoch": 1.0944769437807256, + "grad_norm": 0.5859116911888123, + "learning_rate": 1.0064233363863519e-05, + "loss": 1.4948, + "mean_token_accuracy": 0.6309501181046168, + "num_tokens": 1670961566.0, + "step": 9963 + }, + { + "entropy": 1.7115404605865479, + "epoch": 1.0945868006920985, + "grad_norm": 0.6820839047431946, + "learning_rate": 1.0062640927962546e-05, + "loss": 1.3777, + "mean_token_accuracy": 0.6574893345435461, + "num_tokens": 1671153616.0, + "step": 9964 + }, + { + "entropy": 1.6882510085900624, + "epoch": 1.0946966576034716, + "grad_norm": 0.6248074769973755, + "learning_rate": 1.0061048521728565e-05, + "loss": 1.4502, + "mean_token_accuracy": 0.655212844411532, + "num_tokens": 1671336660.0, + "step": 9965 + }, + { + "entropy": 1.7038983503977458, + "epoch": 1.0948065145148445, + "grad_norm": 0.6728511452674866, + "learning_rate": 1.0059456145211976e-05, + "loss": 1.3841, + "mean_token_accuracy": 0.6533484607934952, + "num_tokens": 1671508735.0, + "step": 9966 + }, + { + "entropy": 1.6372570097446442, + "epoch": 1.0949163714262173, + "grad_norm": 0.7651037573814392, + "learning_rate": 1.0057863798463178e-05, + "loss": 1.436, + "mean_token_accuracy": 0.6575490534305573, + "num_tokens": 1671716110.0, + "step": 9967 + }, + { + "entropy": 1.7102164427439372, + "epoch": 1.0950262283375902, + "grad_norm": 1.415974736213684, + "learning_rate": 1.0056271481532565e-05, + "loss": 1.4185, + "mean_token_accuracy": 0.6585031648476919, + "num_tokens": 1671875078.0, + "step": 9968 + }, + { + "entropy": 1.7216593126455944, + "epoch": 1.095136085248963, + "grad_norm": 0.6866213083267212, + "learning_rate": 1.0054679194470533e-05, + "loss": 1.2383, + "mean_token_accuracy": 0.6811109681924185, + "num_tokens": 1672000646.0, + "step": 9969 + }, + { + "entropy": 1.666219154993693, + "epoch": 1.0952459421603362, + "grad_norm": 0.7179189324378967, + "learning_rate": 1.0053086937327481e-05, + "loss": 1.4343, + "mean_token_accuracy": 0.6527023464441299, + "num_tokens": 1672171592.0, + "step": 9970 + }, + { + "entropy": 1.707016150156657, + "epoch": 1.095355799071709, + "grad_norm": 0.6981037855148315, + "learning_rate": 1.0051494710153797e-05, + "loss": 1.5801, + "mean_token_accuracy": 0.6489623288313547, + "num_tokens": 1672358507.0, + "step": 9971 + }, + { + "entropy": 1.7377264102300007, + "epoch": 1.095465655983082, + "grad_norm": 0.7055451273918152, + "learning_rate": 1.004990251299988e-05, + "loss": 1.5114, + "mean_token_accuracy": 0.6383561591307322, + "num_tokens": 1672529837.0, + "step": 9972 + }, + { + "entropy": 1.708618571360906, + "epoch": 1.0955755128944549, + "grad_norm": 0.7005475163459778, + "learning_rate": 1.0048310345916123e-05, + "loss": 1.3085, + "mean_token_accuracy": 0.6701053728659948, + "num_tokens": 1672715868.0, + "step": 9973 + }, + { + "entropy": 1.7293658057848613, + "epoch": 1.095685369805828, + "grad_norm": 0.7964652180671692, + "learning_rate": 1.0046718208952912e-05, + "loss": 1.5353, + "mean_token_accuracy": 0.6405654648939768, + "num_tokens": 1672895461.0, + "step": 9974 + }, + { + "entropy": 1.6681481798489888, + "epoch": 1.0957952267172009, + "grad_norm": 0.6238622069358826, + "learning_rate": 1.0045126102160641e-05, + "loss": 1.4342, + "mean_token_accuracy": 0.6553277472654978, + "num_tokens": 1673112425.0, + "step": 9975 + }, + { + "entropy": 1.700081080198288, + "epoch": 1.0959050836285738, + "grad_norm": 0.6713470816612244, + "learning_rate": 1.0043534025589702e-05, + "loss": 1.3626, + "mean_token_accuracy": 0.6709864139556885, + "num_tokens": 1673262686.0, + "step": 9976 + }, + { + "entropy": 1.6879205107688904, + "epoch": 1.0960149405399466, + "grad_norm": 0.6403784155845642, + "learning_rate": 1.004194197929047e-05, + "loss": 1.4501, + "mean_token_accuracy": 0.637953132390976, + "num_tokens": 1673474045.0, + "step": 9977 + }, + { + "entropy": 1.658721258242925, + "epoch": 1.0961247974513197, + "grad_norm": 0.6213784217834473, + "learning_rate": 1.004034996331335e-05, + "loss": 1.3654, + "mean_token_accuracy": 0.6668369323015213, + "num_tokens": 1673639563.0, + "step": 9978 + }, + { + "entropy": 1.7311672468980153, + "epoch": 1.0962346543626926, + "grad_norm": 0.6326048374176025, + "learning_rate": 1.0038757977708722e-05, + "loss": 1.4684, + "mean_token_accuracy": 0.6399320314327875, + "num_tokens": 1673825028.0, + "step": 9979 + }, + { + "entropy": 1.6954893171787262, + "epoch": 1.0963445112740655, + "grad_norm": 0.9208673238754272, + "learning_rate": 1.003716602252697e-05, + "loss": 1.2998, + "mean_token_accuracy": 0.6691089371840159, + "num_tokens": 1673951048.0, + "step": 9980 + }, + { + "entropy": 1.7091851830482483, + "epoch": 1.0964543681854384, + "grad_norm": 63.81305694580078, + "learning_rate": 1.0035574097818478e-05, + "loss": 1.4792, + "mean_token_accuracy": 0.6564100285371145, + "num_tokens": 1674122349.0, + "step": 9981 + }, + { + "entropy": 1.686219314734141, + "epoch": 1.0965642250968115, + "grad_norm": 0.6443544030189514, + "learning_rate": 1.0033982203633632e-05, + "loss": 1.3514, + "mean_token_accuracy": 0.6670923282702764, + "num_tokens": 1674297304.0, + "step": 9982 + }, + { + "entropy": 1.7111739615599315, + "epoch": 1.0966740820081844, + "grad_norm": 0.7431286573410034, + "learning_rate": 1.0032390340022813e-05, + "loss": 1.4799, + "mean_token_accuracy": 0.6537399043639501, + "num_tokens": 1674461127.0, + "step": 9983 + }, + { + "entropy": 1.71789946158727, + "epoch": 1.0967839389195573, + "grad_norm": 0.683925449848175, + "learning_rate": 1.0030798507036408e-05, + "loss": 1.4389, + "mean_token_accuracy": 0.6607331385215124, + "num_tokens": 1674594551.0, + "step": 9984 + }, + { + "entropy": 1.6557001272837322, + "epoch": 1.0968937958309302, + "grad_norm": 0.6890281438827515, + "learning_rate": 1.0029206704724787e-05, + "loss": 1.3217, + "mean_token_accuracy": 0.6550944646199545, + "num_tokens": 1674795787.0, + "step": 9985 + }, + { + "entropy": 1.6847777664661407, + "epoch": 1.097003652742303, + "grad_norm": 0.6635385751724243, + "learning_rate": 1.002761493313834e-05, + "loss": 1.3433, + "mean_token_accuracy": 0.6774813532829285, + "num_tokens": 1674947703.0, + "step": 9986 + }, + { + "entropy": 1.7151075502236683, + "epoch": 1.0971135096536762, + "grad_norm": 0.7632783055305481, + "learning_rate": 1.0026023192327441e-05, + "loss": 1.4479, + "mean_token_accuracy": 0.6485897650321325, + "num_tokens": 1675092387.0, + "step": 9987 + }, + { + "entropy": 1.646423081556956, + "epoch": 1.097223366565049, + "grad_norm": 0.7513181567192078, + "learning_rate": 1.0024431482342471e-05, + "loss": 1.31, + "mean_token_accuracy": 0.6654202590386072, + "num_tokens": 1675232012.0, + "step": 9988 + }, + { + "entropy": 1.718563437461853, + "epoch": 1.097333223476422, + "grad_norm": 0.621102511882782, + "learning_rate": 1.0022839803233804e-05, + "loss": 1.3573, + "mean_token_accuracy": 0.6684616009394327, + "num_tokens": 1675402739.0, + "step": 9989 + }, + { + "entropy": 1.605027476946513, + "epoch": 1.0974430803877948, + "grad_norm": 0.7413462996482849, + "learning_rate": 1.0021248155051817e-05, + "loss": 1.1547, + "mean_token_accuracy": 0.6949248611927032, + "num_tokens": 1675515525.0, + "step": 9990 + }, + { + "entropy": 1.6858182946840923, + "epoch": 1.097552937299168, + "grad_norm": 0.6468738317489624, + "learning_rate": 1.0019656537846883e-05, + "loss": 1.2763, + "mean_token_accuracy": 0.6762718011935552, + "num_tokens": 1675645268.0, + "step": 9991 + }, + { + "entropy": 1.6803157031536102, + "epoch": 1.0976627942105408, + "grad_norm": 0.5877875685691833, + "learning_rate": 1.0018064951669377e-05, + "loss": 1.3821, + "mean_token_accuracy": 0.6426830291748047, + "num_tokens": 1675906428.0, + "step": 9992 + }, + { + "entropy": 1.7092790802319844, + "epoch": 1.0977726511219137, + "grad_norm": 0.7025560140609741, + "learning_rate": 1.0016473396569676e-05, + "loss": 1.2588, + "mean_token_accuracy": 0.6746116280555725, + "num_tokens": 1676046321.0, + "step": 9993 + }, + { + "entropy": 1.7590789496898651, + "epoch": 1.0978825080332866, + "grad_norm": 0.792087733745575, + "learning_rate": 1.0014881872598147e-05, + "loss": 1.2788, + "mean_token_accuracy": 0.6598645945390066, + "num_tokens": 1676194845.0, + "step": 9994 + }, + { + "entropy": 1.743706077337265, + "epoch": 1.0979923649446597, + "grad_norm": 0.7016844749450684, + "learning_rate": 1.0013290379805164e-05, + "loss": 1.4946, + "mean_token_accuracy": 0.6423351069291433, + "num_tokens": 1676362780.0, + "step": 9995 + }, + { + "entropy": 1.7073632975419362, + "epoch": 1.0981022218560326, + "grad_norm": 0.8244330286979675, + "learning_rate": 1.00116989182411e-05, + "loss": 1.4003, + "mean_token_accuracy": 0.6536079297463099, + "num_tokens": 1676508306.0, + "step": 9996 + }, + { + "entropy": 1.731406440337499, + "epoch": 1.0982120787674055, + "grad_norm": 0.7456120252609253, + "learning_rate": 1.0010107487956311e-05, + "loss": 1.3884, + "mean_token_accuracy": 0.6658417532841364, + "num_tokens": 1676649132.0, + "step": 9997 + }, + { + "entropy": 1.7547302941481273, + "epoch": 1.0983219356787783, + "grad_norm": 0.6900354623794556, + "learning_rate": 1.0008516089001178e-05, + "loss": 1.4422, + "mean_token_accuracy": 0.6387932747602463, + "num_tokens": 1676829373.0, + "step": 9998 + }, + { + "entropy": 1.7983198165893555, + "epoch": 1.0984317925901514, + "grad_norm": 0.6247063875198364, + "learning_rate": 1.0006924721426069e-05, + "loss": 1.5958, + "mean_token_accuracy": 0.6131992489099503, + "num_tokens": 1677076619.0, + "step": 9999 + }, + { + "entropy": 1.753205378850301, + "epoch": 1.0985416495015243, + "grad_norm": 0.6765521764755249, + "learning_rate": 1.0005333385281338e-05, + "loss": 1.5413, + "mean_token_accuracy": 0.6390999456246694, + "num_tokens": 1677305713.0, + "step": 10000 + }, + { + "entropy": 1.6796150207519531, + "epoch": 1.0986515064128972, + "grad_norm": 0.7337918281555176, + "learning_rate": 1.000374208061736e-05, + "loss": 1.2874, + "mean_token_accuracy": 0.6799869785706202, + "num_tokens": 1677445019.0, + "step": 10001 + }, + { + "entropy": 1.711920936902364, + "epoch": 1.09876136332427, + "grad_norm": 0.5600767731666565, + "learning_rate": 1.0002150807484497e-05, + "loss": 1.3903, + "mean_token_accuracy": 0.653274749716123, + "num_tokens": 1677642988.0, + "step": 10002 + }, + { + "entropy": 1.733855148156484, + "epoch": 1.098871220235643, + "grad_norm": 0.7659547328948975, + "learning_rate": 1.0000559565933109e-05, + "loss": 1.2707, + "mean_token_accuracy": 0.6706570088863373, + "num_tokens": 1677754531.0, + "step": 10003 + }, + { + "entropy": 1.710139532883962, + "epoch": 1.098981077147016, + "grad_norm": 0.75276118516922, + "learning_rate": 9.998968356013561e-06, + "loss": 1.3375, + "mean_token_accuracy": 0.6552453736464182, + "num_tokens": 1677905273.0, + "step": 10004 + }, + { + "entropy": 1.7893259624640148, + "epoch": 1.099090934058389, + "grad_norm": 0.6971526145935059, + "learning_rate": 9.997377177776212e-06, + "loss": 1.5402, + "mean_token_accuracy": 0.6377905060847601, + "num_tokens": 1678053177.0, + "step": 10005 + }, + { + "entropy": 1.6512891054153442, + "epoch": 1.0992007909697619, + "grad_norm": 0.6249794960021973, + "learning_rate": 9.995786031271428e-06, + "loss": 1.4462, + "mean_token_accuracy": 0.6493960867325465, + "num_tokens": 1678225106.0, + "step": 10006 + }, + { + "entropy": 1.6760885218779247, + "epoch": 1.0993106478811348, + "grad_norm": 0.772819995880127, + "learning_rate": 9.99419491654956e-06, + "loss": 1.4534, + "mean_token_accuracy": 0.659936378399531, + "num_tokens": 1678418651.0, + "step": 10007 + }, + { + "entropy": 1.7428101301193237, + "epoch": 1.0994205047925079, + "grad_norm": 0.6936253905296326, + "learning_rate": 9.992603833660972e-06, + "loss": 1.3324, + "mean_token_accuracy": 0.6534697463115057, + "num_tokens": 1678577045.0, + "step": 10008 + }, + { + "entropy": 1.7122528354326885, + "epoch": 1.0995303617038807, + "grad_norm": 0.6630085110664368, + "learning_rate": 9.991012782656015e-06, + "loss": 1.6186, + "mean_token_accuracy": 0.6261989126602808, + "num_tokens": 1678761263.0, + "step": 10009 + }, + { + "entropy": 1.708151896794637, + "epoch": 1.0996402186152536, + "grad_norm": 0.673546314239502, + "learning_rate": 9.989421763585052e-06, + "loss": 1.4439, + "mean_token_accuracy": 0.6554606457551321, + "num_tokens": 1678951453.0, + "step": 10010 + }, + { + "entropy": 1.6987049877643585, + "epoch": 1.0997500755266265, + "grad_norm": 0.7956987023353577, + "learning_rate": 9.987830776498435e-06, + "loss": 1.5238, + "mean_token_accuracy": 0.6471638679504395, + "num_tokens": 1679144538.0, + "step": 10011 + }, + { + "entropy": 1.668361673752467, + "epoch": 1.0998599324379996, + "grad_norm": 0.6938173174858093, + "learning_rate": 9.986239821446517e-06, + "loss": 1.3803, + "mean_token_accuracy": 0.6642382641633352, + "num_tokens": 1679298989.0, + "step": 10012 + }, + { + "entropy": 1.6760740081469219, + "epoch": 1.0999697893493725, + "grad_norm": 0.8040129542350769, + "learning_rate": 9.984648898479652e-06, + "loss": 1.5746, + "mean_token_accuracy": 0.6352566902836164, + "num_tokens": 1679527067.0, + "step": 10013 + }, + { + "entropy": 1.6475440760453541, + "epoch": 1.1000796462607454, + "grad_norm": 0.5550295114517212, + "learning_rate": 9.983058007648192e-06, + "loss": 1.4691, + "mean_token_accuracy": 0.6531208554903666, + "num_tokens": 1679766761.0, + "step": 10014 + }, + { + "entropy": 1.679537256558736, + "epoch": 1.1001895031721183, + "grad_norm": 0.7648224830627441, + "learning_rate": 9.981467149002486e-06, + "loss": 1.3501, + "mean_token_accuracy": 0.6587068686882654, + "num_tokens": 1679937622.0, + "step": 10015 + }, + { + "entropy": 1.7292282382647197, + "epoch": 1.1002993600834912, + "grad_norm": 0.6542650461196899, + "learning_rate": 9.979876322592886e-06, + "loss": 1.4841, + "mean_token_accuracy": 0.6392849683761597, + "num_tokens": 1680115451.0, + "step": 10016 + }, + { + "entropy": 1.6712367534637451, + "epoch": 1.1004092169948643, + "grad_norm": 0.6149039268493652, + "learning_rate": 9.978285528469744e-06, + "loss": 1.3236, + "mean_token_accuracy": 0.6738909035921097, + "num_tokens": 1680311480.0, + "step": 10017 + }, + { + "entropy": 1.7433668871720631, + "epoch": 1.1005190739062372, + "grad_norm": 0.7319428324699402, + "learning_rate": 9.976694766683401e-06, + "loss": 1.3905, + "mean_token_accuracy": 0.6612731317679087, + "num_tokens": 1680488251.0, + "step": 10018 + }, + { + "entropy": 1.7435412506262462, + "epoch": 1.10062893081761, + "grad_norm": 0.7963857650756836, + "learning_rate": 9.97510403728421e-06, + "loss": 1.5602, + "mean_token_accuracy": 0.6402197231849035, + "num_tokens": 1680631108.0, + "step": 10019 + }, + { + "entropy": 1.6780545115470886, + "epoch": 1.100738787728983, + "grad_norm": 0.685632050037384, + "learning_rate": 9.973513340322515e-06, + "loss": 1.4263, + "mean_token_accuracy": 0.6653469949960709, + "num_tokens": 1680763623.0, + "step": 10020 + }, + { + "entropy": 1.6828766167163849, + "epoch": 1.100848644640356, + "grad_norm": 0.6240759491920471, + "learning_rate": 9.971922675848655e-06, + "loss": 1.308, + "mean_token_accuracy": 0.6743075450261434, + "num_tokens": 1680908342.0, + "step": 10021 + }, + { + "entropy": 1.705411026875178, + "epoch": 1.100958501551729, + "grad_norm": 0.7477165460586548, + "learning_rate": 9.970332043912982e-06, + "loss": 1.3975, + "mean_token_accuracy": 0.6521026045084, + "num_tokens": 1681071612.0, + "step": 10022 + }, + { + "entropy": 1.7263496617476146, + "epoch": 1.1010683584631018, + "grad_norm": 0.7266680002212524, + "learning_rate": 9.968741444565839e-06, + "loss": 1.2693, + "mean_token_accuracy": 0.6663507620493571, + "num_tokens": 1681213720.0, + "step": 10023 + }, + { + "entropy": 1.699666867653529, + "epoch": 1.1011782153744747, + "grad_norm": 0.7280838489532471, + "learning_rate": 9.96715087785756e-06, + "loss": 1.3408, + "mean_token_accuracy": 0.6538164764642715, + "num_tokens": 1681381420.0, + "step": 10024 + }, + { + "entropy": 1.7014712989330292, + "epoch": 1.1012880722858478, + "grad_norm": 0.7642046809196472, + "learning_rate": 9.965560343838494e-06, + "loss": 1.3778, + "mean_token_accuracy": 0.65878793100516, + "num_tokens": 1681534063.0, + "step": 10025 + }, + { + "entropy": 1.7197924951712291, + "epoch": 1.1013979291972207, + "grad_norm": 0.8084545135498047, + "learning_rate": 9.963969842558979e-06, + "loss": 1.4746, + "mean_token_accuracy": 0.6574273506800333, + "num_tokens": 1681719997.0, + "step": 10026 + }, + { + "entropy": 1.6657897333304088, + "epoch": 1.1015077861085936, + "grad_norm": 0.6369723081588745, + "learning_rate": 9.962379374069344e-06, + "loss": 1.5711, + "mean_token_accuracy": 0.6259370992581049, + "num_tokens": 1681973686.0, + "step": 10027 + }, + { + "entropy": 1.6844724615414937, + "epoch": 1.1016176430199665, + "grad_norm": 0.6429160237312317, + "learning_rate": 9.960788938419938e-06, + "loss": 1.4057, + "mean_token_accuracy": 0.6732848683993021, + "num_tokens": 1682149098.0, + "step": 10028 + }, + { + "entropy": 1.723353534936905, + "epoch": 1.1017274999313393, + "grad_norm": 0.6367024779319763, + "learning_rate": 9.959198535661097e-06, + "loss": 1.5805, + "mean_token_accuracy": 0.6324650992949804, + "num_tokens": 1682363134.0, + "step": 10029 + }, + { + "entropy": 1.6801141500473022, + "epoch": 1.1018373568427124, + "grad_norm": 0.7213335633277893, + "learning_rate": 9.957608165843148e-06, + "loss": 1.3366, + "mean_token_accuracy": 0.6565315226713816, + "num_tokens": 1682553091.0, + "step": 10030 + }, + { + "entropy": 1.6784232159455617, + "epoch": 1.1019472137540853, + "grad_norm": 0.7102720737457275, + "learning_rate": 9.956017829016434e-06, + "loss": 1.3336, + "mean_token_accuracy": 0.6537942936023077, + "num_tokens": 1682722356.0, + "step": 10031 + }, + { + "entropy": 1.680375188589096, + "epoch": 1.1020570706654582, + "grad_norm": 0.6875813007354736, + "learning_rate": 9.954427525231285e-06, + "loss": 1.3502, + "mean_token_accuracy": 0.6625233242909113, + "num_tokens": 1682842970.0, + "step": 10032 + }, + { + "entropy": 1.6692634721597035, + "epoch": 1.102166927576831, + "grad_norm": 0.700435996055603, + "learning_rate": 9.952837254538032e-06, + "loss": 1.3591, + "mean_token_accuracy": 0.667348379890124, + "num_tokens": 1683041864.0, + "step": 10033 + }, + { + "entropy": 1.7761492331822712, + "epoch": 1.1022767844882042, + "grad_norm": 0.7642155885696411, + "learning_rate": 9.95124701698701e-06, + "loss": 1.462, + "mean_token_accuracy": 0.6416449447472891, + "num_tokens": 1683184924.0, + "step": 10034 + }, + { + "entropy": 1.725660542647044, + "epoch": 1.102386641399577, + "grad_norm": 0.6598086357116699, + "learning_rate": 9.949656812628548e-06, + "loss": 1.3182, + "mean_token_accuracy": 0.6549786279598871, + "num_tokens": 1683372899.0, + "step": 10035 + }, + { + "entropy": 1.6732657253742218, + "epoch": 1.10249649831095, + "grad_norm": 0.9407162070274353, + "learning_rate": 9.948066641512972e-06, + "loss": 1.3752, + "mean_token_accuracy": 0.6609608381986618, + "num_tokens": 1683562266.0, + "step": 10036 + }, + { + "entropy": 1.7500144441922505, + "epoch": 1.1026063552223229, + "grad_norm": 0.6773711442947388, + "learning_rate": 9.946476503690613e-06, + "loss": 1.4346, + "mean_token_accuracy": 0.6503184189399084, + "num_tokens": 1683732368.0, + "step": 10037 + }, + { + "entropy": 1.7161312401294708, + "epoch": 1.102716212133696, + "grad_norm": 0.6016209125518799, + "learning_rate": 9.944886399211802e-06, + "loss": 1.3286, + "mean_token_accuracy": 0.6561945378780365, + "num_tokens": 1683895566.0, + "step": 10038 + }, + { + "entropy": 1.7138621707757313, + "epoch": 1.1028260690450689, + "grad_norm": 0.7198561429977417, + "learning_rate": 9.943296328126855e-06, + "loss": 1.3559, + "mean_token_accuracy": 0.6556108146905899, + "num_tokens": 1684035609.0, + "step": 10039 + }, + { + "entropy": 1.7401485840479534, + "epoch": 1.1029359259564417, + "grad_norm": 0.7214450240135193, + "learning_rate": 9.941706290486107e-06, + "loss": 1.4328, + "mean_token_accuracy": 0.6506765186786652, + "num_tokens": 1684183063.0, + "step": 10040 + }, + { + "entropy": 1.713246077299118, + "epoch": 1.1030457828678146, + "grad_norm": 0.7159080505371094, + "learning_rate": 9.940116286339876e-06, + "loss": 1.2452, + "mean_token_accuracy": 0.67661052942276, + "num_tokens": 1684309228.0, + "step": 10041 + }, + { + "entropy": 1.7424029608567555, + "epoch": 1.1031556397791875, + "grad_norm": 0.598591148853302, + "learning_rate": 9.938526315738488e-06, + "loss": 1.3506, + "mean_token_accuracy": 0.6587058206399282, + "num_tokens": 1684467440.0, + "step": 10042 + }, + { + "entropy": 1.737044592698415, + "epoch": 1.1032654966905606, + "grad_norm": 0.7071549296379089, + "learning_rate": 9.936936378732264e-06, + "loss": 1.3585, + "mean_token_accuracy": 0.6568170885245005, + "num_tokens": 1684616936.0, + "step": 10043 + }, + { + "entropy": 1.6824077864487965, + "epoch": 1.1033753536019335, + "grad_norm": 0.7109892964363098, + "learning_rate": 9.935346475371526e-06, + "loss": 1.3406, + "mean_token_accuracy": 0.6530480086803436, + "num_tokens": 1684799276.0, + "step": 10044 + }, + { + "entropy": 1.658627490202586, + "epoch": 1.1034852105133064, + "grad_norm": 0.6493405699729919, + "learning_rate": 9.933756605706589e-06, + "loss": 1.4513, + "mean_token_accuracy": 0.6560260156790415, + "num_tokens": 1684941620.0, + "step": 10045 + }, + { + "entropy": 1.7143846948941548, + "epoch": 1.1035950674246793, + "grad_norm": 0.7175660133361816, + "learning_rate": 9.93216676978778e-06, + "loss": 1.3808, + "mean_token_accuracy": 0.6502297967672348, + "num_tokens": 1685141645.0, + "step": 10046 + }, + { + "entropy": 1.7108404437700908, + "epoch": 1.1037049243360524, + "grad_norm": 0.665665864944458, + "learning_rate": 9.930576967665405e-06, + "loss": 1.3979, + "mean_token_accuracy": 0.6681078970432281, + "num_tokens": 1685310487.0, + "step": 10047 + }, + { + "entropy": 1.6906990508238475, + "epoch": 1.1038147812474253, + "grad_norm": 0.685772716999054, + "learning_rate": 9.928987199389791e-06, + "loss": 1.2396, + "mean_token_accuracy": 0.6812936266263326, + "num_tokens": 1685454139.0, + "step": 10048 + }, + { + "entropy": 1.6968371470769246, + "epoch": 1.1039246381587982, + "grad_norm": 0.6249828934669495, + "learning_rate": 9.92739746501125e-06, + "loss": 1.4544, + "mean_token_accuracy": 0.6454744736353556, + "num_tokens": 1685605872.0, + "step": 10049 + }, + { + "entropy": 1.6949062744776409, + "epoch": 1.104034495070171, + "grad_norm": 0.656091034412384, + "learning_rate": 9.925807764580094e-06, + "loss": 1.3709, + "mean_token_accuracy": 0.6586327403783798, + "num_tokens": 1685794273.0, + "step": 10050 + }, + { + "entropy": 1.7288226087888081, + "epoch": 1.1041443519815441, + "grad_norm": 0.7473734617233276, + "learning_rate": 9.924218098146636e-06, + "loss": 1.2089, + "mean_token_accuracy": 0.6816525955994924, + "num_tokens": 1685887419.0, + "step": 10051 + }, + { + "entropy": 1.6433724264303844, + "epoch": 1.104254208892917, + "grad_norm": 0.6661230325698853, + "learning_rate": 9.922628465761197e-06, + "loss": 1.2899, + "mean_token_accuracy": 0.6614332050085068, + "num_tokens": 1686038032.0, + "step": 10052 + }, + { + "entropy": 1.72390815615654, + "epoch": 1.10436406580429, + "grad_norm": 0.750630259513855, + "learning_rate": 9.921038867474076e-06, + "loss": 1.4065, + "mean_token_accuracy": 0.6597993671894073, + "num_tokens": 1686238542.0, + "step": 10053 + }, + { + "entropy": 1.7463493744532268, + "epoch": 1.1044739227156628, + "grad_norm": 0.7480951547622681, + "learning_rate": 9.919449303335591e-06, + "loss": 1.3189, + "mean_token_accuracy": 0.664255807797114, + "num_tokens": 1686360222.0, + "step": 10054 + }, + { + "entropy": 1.7227412561575572, + "epoch": 1.1045837796270357, + "grad_norm": 0.7310038208961487, + "learning_rate": 9.917859773396048e-06, + "loss": 1.4055, + "mean_token_accuracy": 0.6522340675195059, + "num_tokens": 1686517954.0, + "step": 10055 + }, + { + "entropy": 1.727901021639506, + "epoch": 1.1046936365384088, + "grad_norm": 0.7204070687294006, + "learning_rate": 9.916270277705755e-06, + "loss": 1.3779, + "mean_token_accuracy": 0.6602382163206736, + "num_tokens": 1686720706.0, + "step": 10056 + }, + { + "entropy": 1.6805065770943959, + "epoch": 1.1048034934497817, + "grad_norm": 0.747397780418396, + "learning_rate": 9.914680816315018e-06, + "loss": 1.4117, + "mean_token_accuracy": 0.6577341059843699, + "num_tokens": 1686851842.0, + "step": 10057 + }, + { + "entropy": 1.683358073234558, + "epoch": 1.1049133503611546, + "grad_norm": 0.8031777739524841, + "learning_rate": 9.913091389274149e-06, + "loss": 1.398, + "mean_token_accuracy": 0.6626959492762884, + "num_tokens": 1686999977.0, + "step": 10058 + }, + { + "entropy": 1.7599800129731495, + "epoch": 1.1050232072725275, + "grad_norm": 0.6084749698638916, + "learning_rate": 9.911501996633446e-06, + "loss": 1.3626, + "mean_token_accuracy": 0.6569390346606573, + "num_tokens": 1687162443.0, + "step": 10059 + }, + { + "entropy": 1.666684329509735, + "epoch": 1.1051330641839006, + "grad_norm": 0.7338747978210449, + "learning_rate": 9.909912638443211e-06, + "loss": 1.3339, + "mean_token_accuracy": 0.6666023383537928, + "num_tokens": 1687326579.0, + "step": 10060 + }, + { + "entropy": 1.7396563490231831, + "epoch": 1.1052429210952734, + "grad_norm": 0.5960805416107178, + "learning_rate": 9.908323314753754e-06, + "loss": 1.4937, + "mean_token_accuracy": 0.6390989075104395, + "num_tokens": 1687537916.0, + "step": 10061 + }, + { + "entropy": 1.7278473377227783, + "epoch": 1.1053527780066463, + "grad_norm": 0.640707790851593, + "learning_rate": 9.90673402561537e-06, + "loss": 1.3766, + "mean_token_accuracy": 0.6638915787140528, + "num_tokens": 1687698161.0, + "step": 10062 + }, + { + "entropy": 1.723745624224345, + "epoch": 1.1054626349180192, + "grad_norm": 0.7135167717933655, + "learning_rate": 9.90514477107836e-06, + "loss": 1.4669, + "mean_token_accuracy": 0.6489771803220113, + "num_tokens": 1687859122.0, + "step": 10063 + }, + { + "entropy": 1.650217165549596, + "epoch": 1.1055724918293923, + "grad_norm": 0.6993624567985535, + "learning_rate": 9.90355555119303e-06, + "loss": 1.2399, + "mean_token_accuracy": 0.684073825677236, + "num_tokens": 1687974398.0, + "step": 10064 + }, + { + "entropy": 1.7590695818265278, + "epoch": 1.1056823487407652, + "grad_norm": 0.6988198757171631, + "learning_rate": 9.901966366009665e-06, + "loss": 1.5556, + "mean_token_accuracy": 0.6316021184126536, + "num_tokens": 1688190329.0, + "step": 10065 + }, + { + "entropy": 1.7328562041123707, + "epoch": 1.105792205652138, + "grad_norm": 0.8319157958030701, + "learning_rate": 9.900377215578575e-06, + "loss": 1.3012, + "mean_token_accuracy": 0.6607611576716105, + "num_tokens": 1688300281.0, + "step": 10066 + }, + { + "entropy": 1.6762152512868245, + "epoch": 1.105902062563511, + "grad_norm": 0.7458414435386658, + "learning_rate": 9.89878809995005e-06, + "loss": 1.2462, + "mean_token_accuracy": 0.6730685979127884, + "num_tokens": 1688449740.0, + "step": 10067 + }, + { + "entropy": 1.6644122898578644, + "epoch": 1.1060119194748839, + "grad_norm": 0.5826370716094971, + "learning_rate": 9.897199019174386e-06, + "loss": 1.427, + "mean_token_accuracy": 0.6464564104874929, + "num_tokens": 1688657245.0, + "step": 10068 + }, + { + "entropy": 1.7347849011421204, + "epoch": 1.106121776386257, + "grad_norm": 1.0048359632492065, + "learning_rate": 9.895609973301873e-06, + "loss": 1.4001, + "mean_token_accuracy": 0.6557883818944296, + "num_tokens": 1688785006.0, + "step": 10069 + }, + { + "entropy": 1.7074712614218395, + "epoch": 1.1062316332976299, + "grad_norm": 0.7061405777931213, + "learning_rate": 9.89402096238281e-06, + "loss": 1.2721, + "mean_token_accuracy": 0.6725035260121027, + "num_tokens": 1688900935.0, + "step": 10070 + }, + { + "entropy": 1.7450923323631287, + "epoch": 1.1063414902090027, + "grad_norm": 0.6660796403884888, + "learning_rate": 9.892431986467483e-06, + "loss": 1.3757, + "mean_token_accuracy": 0.6571687310934067, + "num_tokens": 1689061066.0, + "step": 10071 + }, + { + "entropy": 1.6484363277753193, + "epoch": 1.1064513471203756, + "grad_norm": 0.5785127282142639, + "learning_rate": 9.890843045606185e-06, + "loss": 1.3894, + "mean_token_accuracy": 0.6500343084335327, + "num_tokens": 1689306241.0, + "step": 10072 + }, + { + "entropy": 1.7250319123268127, + "epoch": 1.1065612040317487, + "grad_norm": 0.6741638779640198, + "learning_rate": 9.889254139849207e-06, + "loss": 1.4156, + "mean_token_accuracy": 0.6609347065289816, + "num_tokens": 1689490952.0, + "step": 10073 + }, + { + "entropy": 1.703676551580429, + "epoch": 1.1066710609431216, + "grad_norm": 0.6628722548484802, + "learning_rate": 9.887665269246833e-06, + "loss": 1.3862, + "mean_token_accuracy": 0.6559304048617681, + "num_tokens": 1689668182.0, + "step": 10074 + }, + { + "entropy": 1.7785408198833466, + "epoch": 1.1067809178544945, + "grad_norm": 0.6795310974121094, + "learning_rate": 9.886076433849352e-06, + "loss": 1.3857, + "mean_token_accuracy": 0.6603186577558517, + "num_tokens": 1689843179.0, + "step": 10075 + }, + { + "entropy": 1.7740286191304524, + "epoch": 1.1068907747658674, + "grad_norm": 0.6643602848052979, + "learning_rate": 9.884487633707052e-06, + "loss": 1.455, + "mean_token_accuracy": 0.6375104387601217, + "num_tokens": 1690010012.0, + "step": 10076 + }, + { + "entropy": 1.7466478248437245, + "epoch": 1.1070006316772405, + "grad_norm": 0.7298927903175354, + "learning_rate": 9.882898868870212e-06, + "loss": 1.4145, + "mean_token_accuracy": 0.6563018610080084, + "num_tokens": 1690159229.0, + "step": 10077 + }, + { + "entropy": 1.705216646194458, + "epoch": 1.1071104885886134, + "grad_norm": 0.793967068195343, + "learning_rate": 9.88131013938912e-06, + "loss": 1.3992, + "mean_token_accuracy": 0.6699225157499313, + "num_tokens": 1690302272.0, + "step": 10078 + }, + { + "entropy": 1.70258762439092, + "epoch": 1.1072203454999863, + "grad_norm": 0.7318503260612488, + "learning_rate": 9.87972144531406e-06, + "loss": 1.3234, + "mean_token_accuracy": 0.6641089816888174, + "num_tokens": 1690461700.0, + "step": 10079 + }, + { + "entropy": 1.6951699952284496, + "epoch": 1.1073302024113592, + "grad_norm": 0.8104332089424133, + "learning_rate": 9.87813278669531e-06, + "loss": 1.3786, + "mean_token_accuracy": 0.677444338798523, + "num_tokens": 1690598643.0, + "step": 10080 + }, + { + "entropy": 1.688471108675003, + "epoch": 1.107440059322732, + "grad_norm": 0.5393524169921875, + "learning_rate": 9.876544163583153e-06, + "loss": 1.4725, + "mean_token_accuracy": 0.6343822181224823, + "num_tokens": 1690799088.0, + "step": 10081 + }, + { + "entropy": 1.683466762304306, + "epoch": 1.1075499162341051, + "grad_norm": 0.6404281854629517, + "learning_rate": 9.87495557602787e-06, + "loss": 1.386, + "mean_token_accuracy": 0.6577825993299484, + "num_tokens": 1690951222.0, + "step": 10082 + }, + { + "entropy": 1.6536591549714406, + "epoch": 1.107659773145478, + "grad_norm": 0.6884099245071411, + "learning_rate": 9.873367024079728e-06, + "loss": 1.4333, + "mean_token_accuracy": 0.6448961248000463, + "num_tokens": 1691125533.0, + "step": 10083 + }, + { + "entropy": 1.6448639531930287, + "epoch": 1.107769630056851, + "grad_norm": 0.583249032497406, + "learning_rate": 9.871778507789016e-06, + "loss": 1.4251, + "mean_token_accuracy": 0.6626767565806707, + "num_tokens": 1691309375.0, + "step": 10084 + }, + { + "entropy": 1.7260893980662029, + "epoch": 1.1078794869682238, + "grad_norm": 0.8317601084709167, + "learning_rate": 9.870190027206009e-06, + "loss": 1.4023, + "mean_token_accuracy": 0.6650147537390391, + "num_tokens": 1691463284.0, + "step": 10085 + }, + { + "entropy": 1.7166006167729695, + "epoch": 1.107989343879597, + "grad_norm": 0.6556655168533325, + "learning_rate": 9.868601582380974e-06, + "loss": 1.3163, + "mean_token_accuracy": 0.6799081613620123, + "num_tokens": 1691575414.0, + "step": 10086 + }, + { + "entropy": 1.7317289213339488, + "epoch": 1.1080992007909698, + "grad_norm": 0.729681134223938, + "learning_rate": 9.867013173364191e-06, + "loss": 1.2541, + "mean_token_accuracy": 0.6766142894824346, + "num_tokens": 1691669164.0, + "step": 10087 + }, + { + "entropy": 1.6821042597293854, + "epoch": 1.1082090577023427, + "grad_norm": 0.6675170063972473, + "learning_rate": 9.865424800205931e-06, + "loss": 1.3605, + "mean_token_accuracy": 0.6503806213537852, + "num_tokens": 1691815021.0, + "step": 10088 + }, + { + "entropy": 1.751885672410329, + "epoch": 1.1083189146137156, + "grad_norm": 0.5960071086883545, + "learning_rate": 9.863836462956464e-06, + "loss": 1.3456, + "mean_token_accuracy": 0.6711924225091934, + "num_tokens": 1691976476.0, + "step": 10089 + }, + { + "entropy": 1.684192289908727, + "epoch": 1.1084287715250887, + "grad_norm": 0.7027954459190369, + "learning_rate": 9.862248161666062e-06, + "loss": 1.4732, + "mean_token_accuracy": 0.6473731994628906, + "num_tokens": 1692149190.0, + "step": 10090 + }, + { + "entropy": 1.648518443107605, + "epoch": 1.1085386284364616, + "grad_norm": 0.8666717410087585, + "learning_rate": 9.860659896384991e-06, + "loss": 1.4343, + "mean_token_accuracy": 0.661418413122495, + "num_tokens": 1692355012.0, + "step": 10091 + }, + { + "entropy": 1.6971223453680675, + "epoch": 1.1086484853478344, + "grad_norm": 0.7207356095314026, + "learning_rate": 9.859071667163523e-06, + "loss": 1.2235, + "mean_token_accuracy": 0.675402487317721, + "num_tokens": 1692474814.0, + "step": 10092 + }, + { + "entropy": 1.699401597181956, + "epoch": 1.1087583422592073, + "grad_norm": 0.747488260269165, + "learning_rate": 9.857483474051921e-06, + "loss": 1.5966, + "mean_token_accuracy": 0.6380815704663595, + "num_tokens": 1692666384.0, + "step": 10093 + }, + { + "entropy": 1.6512778798739116, + "epoch": 1.1088681991705802, + "grad_norm": 0.5988063812255859, + "learning_rate": 9.855895317100456e-06, + "loss": 1.5544, + "mean_token_accuracy": 0.6298639525969824, + "num_tokens": 1692884550.0, + "step": 10094 + }, + { + "entropy": 1.7332661549250286, + "epoch": 1.1089780560819533, + "grad_norm": 0.6148350238800049, + "learning_rate": 9.854307196359383e-06, + "loss": 1.4389, + "mean_token_accuracy": 0.6564859499533972, + "num_tokens": 1693097634.0, + "step": 10095 + }, + { + "entropy": 1.637395977973938, + "epoch": 1.1090879129933262, + "grad_norm": 0.6527997255325317, + "learning_rate": 9.852719111878973e-06, + "loss": 1.495, + "mean_token_accuracy": 0.6438925464948019, + "num_tokens": 1693345822.0, + "step": 10096 + }, + { + "entropy": 1.7149374882380168, + "epoch": 1.109197769904699, + "grad_norm": 0.7964677214622498, + "learning_rate": 9.851131063709488e-06, + "loss": 1.3364, + "mean_token_accuracy": 0.6784713963667551, + "num_tokens": 1693495662.0, + "step": 10097 + }, + { + "entropy": 1.681052456299464, + "epoch": 1.109307626816072, + "grad_norm": 0.6391304135322571, + "learning_rate": 9.849543051901187e-06, + "loss": 1.3896, + "mean_token_accuracy": 0.6593037992715836, + "num_tokens": 1693661136.0, + "step": 10098 + }, + { + "entropy": 1.70050710439682, + "epoch": 1.109417483727445, + "grad_norm": 0.5992792844772339, + "learning_rate": 9.847955076504327e-06, + "loss": 1.3061, + "mean_token_accuracy": 0.6639832506577173, + "num_tokens": 1693809033.0, + "step": 10099 + }, + { + "entropy": 1.6677427391211193, + "epoch": 1.109527340638818, + "grad_norm": 0.6866241693496704, + "learning_rate": 9.846367137569175e-06, + "loss": 1.2431, + "mean_token_accuracy": 0.6873250852028528, + "num_tokens": 1693937839.0, + "step": 10100 + }, + { + "entropy": 1.7384430766105652, + "epoch": 1.1096371975501909, + "grad_norm": 0.72084641456604, + "learning_rate": 9.844779235145975e-06, + "loss": 1.583, + "mean_token_accuracy": 0.6379265685876211, + "num_tokens": 1694143956.0, + "step": 10101 + }, + { + "entropy": 1.7250956892967224, + "epoch": 1.1097470544615637, + "grad_norm": 0.5843518376350403, + "learning_rate": 9.843191369285e-06, + "loss": 1.3725, + "mean_token_accuracy": 0.6559849927822748, + "num_tokens": 1694333230.0, + "step": 10102 + }, + { + "entropy": 1.6686339875062306, + "epoch": 1.1098569113729368, + "grad_norm": 0.6618912220001221, + "learning_rate": 9.841603540036493e-06, + "loss": 1.2857, + "mean_token_accuracy": 0.6752923329671224, + "num_tokens": 1694528211.0, + "step": 10103 + }, + { + "entropy": 1.7217063804467518, + "epoch": 1.1099667682843097, + "grad_norm": 0.7229596972465515, + "learning_rate": 9.84001574745071e-06, + "loss": 1.3442, + "mean_token_accuracy": 0.6586703856786092, + "num_tokens": 1694668137.0, + "step": 10104 + }, + { + "entropy": 1.730820248524348, + "epoch": 1.1100766251956826, + "grad_norm": 0.7770981788635254, + "learning_rate": 9.838427991577913e-06, + "loss": 1.3819, + "mean_token_accuracy": 0.6443741470575333, + "num_tokens": 1694834662.0, + "step": 10105 + }, + { + "entropy": 1.7381211817264557, + "epoch": 1.1101864821070555, + "grad_norm": 0.6995685696601868, + "learning_rate": 9.83684027246834e-06, + "loss": 1.4073, + "mean_token_accuracy": 0.6486761023600897, + "num_tokens": 1694996990.0, + "step": 10106 + }, + { + "entropy": 1.7615481615066528, + "epoch": 1.1102963390184284, + "grad_norm": 0.7999100089073181, + "learning_rate": 9.835252590172248e-06, + "loss": 1.418, + "mean_token_accuracy": 0.6523937930663427, + "num_tokens": 1695161564.0, + "step": 10107 + }, + { + "entropy": 1.698384553194046, + "epoch": 1.1104061959298015, + "grad_norm": 0.766127347946167, + "learning_rate": 9.833664944739894e-06, + "loss": 1.3847, + "mean_token_accuracy": 0.6614054441452026, + "num_tokens": 1695302392.0, + "step": 10108 + }, + { + "entropy": 1.7386977672576904, + "epoch": 1.1105160528411744, + "grad_norm": 0.7259992361068726, + "learning_rate": 9.832077336221511e-06, + "loss": 1.4899, + "mean_token_accuracy": 0.6413588871558508, + "num_tokens": 1695452887.0, + "step": 10109 + }, + { + "entropy": 1.721059521039327, + "epoch": 1.1106259097525473, + "grad_norm": 0.6974899172782898, + "learning_rate": 9.830489764667357e-06, + "loss": 1.4765, + "mean_token_accuracy": 0.650780513882637, + "num_tokens": 1695615069.0, + "step": 10110 + }, + { + "entropy": 1.7291888693968456, + "epoch": 1.1107357666639202, + "grad_norm": 0.7660084366798401, + "learning_rate": 9.828902230127675e-06, + "loss": 1.4252, + "mean_token_accuracy": 0.6462711741526922, + "num_tokens": 1695760221.0, + "step": 10111 + }, + { + "entropy": 1.7433270911375682, + "epoch": 1.1108456235752933, + "grad_norm": 0.7739673852920532, + "learning_rate": 9.827314732652708e-06, + "loss": 1.4445, + "mean_token_accuracy": 0.650582085053126, + "num_tokens": 1695917595.0, + "step": 10112 + }, + { + "entropy": 1.6808188458283742, + "epoch": 1.1109554804866661, + "grad_norm": 0.7548496723175049, + "learning_rate": 9.825727272292702e-06, + "loss": 1.3041, + "mean_token_accuracy": 0.6782331267992655, + "num_tokens": 1696031484.0, + "step": 10113 + }, + { + "entropy": 1.7230580151081085, + "epoch": 1.111065337398039, + "grad_norm": 0.7264763116836548, + "learning_rate": 9.824139849097901e-06, + "loss": 1.3437, + "mean_token_accuracy": 0.6618181715408961, + "num_tokens": 1696143962.0, + "step": 10114 + }, + { + "entropy": 1.7376162310441334, + "epoch": 1.111175194309412, + "grad_norm": 0.6344247460365295, + "learning_rate": 9.822552463118542e-06, + "loss": 1.4366, + "mean_token_accuracy": 0.6638104766607285, + "num_tokens": 1696315979.0, + "step": 10115 + }, + { + "entropy": 1.7340149482091267, + "epoch": 1.111285051220785, + "grad_norm": 1.0014694929122925, + "learning_rate": 9.820965114404866e-06, + "loss": 1.327, + "mean_token_accuracy": 0.6686884462833405, + "num_tokens": 1696452363.0, + "step": 10116 + }, + { + "entropy": 1.6129749516646068, + "epoch": 1.111394908132158, + "grad_norm": 0.6101342439651489, + "learning_rate": 9.819377803007117e-06, + "loss": 1.3782, + "mean_token_accuracy": 0.669797440369924, + "num_tokens": 1696599085.0, + "step": 10117 + }, + { + "entropy": 1.7004303236802418, + "epoch": 1.1115047650435308, + "grad_norm": 0.7235705852508545, + "learning_rate": 9.817790528975527e-06, + "loss": 1.4595, + "mean_token_accuracy": 0.6571828325589498, + "num_tokens": 1696746504.0, + "step": 10118 + }, + { + "entropy": 1.6813008785247803, + "epoch": 1.1116146219549037, + "grad_norm": 0.6818208694458008, + "learning_rate": 9.81620329236033e-06, + "loss": 1.3373, + "mean_token_accuracy": 0.6721784075101217, + "num_tokens": 1696892330.0, + "step": 10119 + }, + { + "entropy": 1.716547667980194, + "epoch": 1.1117244788662766, + "grad_norm": 0.684902548789978, + "learning_rate": 9.81461609321177e-06, + "loss": 1.3562, + "mean_token_accuracy": 0.6633862257003784, + "num_tokens": 1697027411.0, + "step": 10120 + }, + { + "entropy": 1.7614688177903493, + "epoch": 1.1118343357776497, + "grad_norm": 0.6902977824211121, + "learning_rate": 9.813028931580073e-06, + "loss": 1.4259, + "mean_token_accuracy": 0.6516207158565521, + "num_tokens": 1697221211.0, + "step": 10121 + }, + { + "entropy": 1.7099326650301616, + "epoch": 1.1119441926890226, + "grad_norm": 0.7409700751304626, + "learning_rate": 9.811441807515477e-06, + "loss": 1.4281, + "mean_token_accuracy": 0.6582437505324682, + "num_tokens": 1697396444.0, + "step": 10122 + }, + { + "entropy": 1.6779835720856984, + "epoch": 1.1120540496003954, + "grad_norm": 0.621612012386322, + "learning_rate": 9.809854721068213e-06, + "loss": 1.4229, + "mean_token_accuracy": 0.6485381374756495, + "num_tokens": 1697565929.0, + "step": 10123 + }, + { + "entropy": 1.6557986438274384, + "epoch": 1.1121639065117683, + "grad_norm": 0.6872241497039795, + "learning_rate": 9.808267672288509e-06, + "loss": 1.3494, + "mean_token_accuracy": 0.666273444890976, + "num_tokens": 1697738024.0, + "step": 10124 + }, + { + "entropy": 1.7175208032131195, + "epoch": 1.1122737634231414, + "grad_norm": 0.5981009602546692, + "learning_rate": 9.806680661226595e-06, + "loss": 1.459, + "mean_token_accuracy": 0.6348882069190344, + "num_tokens": 1697938594.0, + "step": 10125 + }, + { + "entropy": 1.720637023448944, + "epoch": 1.1123836203345143, + "grad_norm": 0.7893303632736206, + "learning_rate": 9.805093687932707e-06, + "loss": 1.3752, + "mean_token_accuracy": 0.6728624453147253, + "num_tokens": 1698075068.0, + "step": 10126 + }, + { + "entropy": 1.7189677953720093, + "epoch": 1.1124934772458872, + "grad_norm": 0.7821738123893738, + "learning_rate": 9.80350675245706e-06, + "loss": 1.4315, + "mean_token_accuracy": 0.653843825062116, + "num_tokens": 1698231670.0, + "step": 10127 + }, + { + "entropy": 1.756723403930664, + "epoch": 1.11260333415726, + "grad_norm": 0.7259140610694885, + "learning_rate": 9.801919854849884e-06, + "loss": 1.4513, + "mean_token_accuracy": 0.6415894875923792, + "num_tokens": 1698432467.0, + "step": 10128 + }, + { + "entropy": 1.751261701186498, + "epoch": 1.1127131910686332, + "grad_norm": 0.6469233632087708, + "learning_rate": 9.800332995161408e-06, + "loss": 1.4379, + "mean_token_accuracy": 0.6382074107726415, + "num_tokens": 1698590937.0, + "step": 10129 + }, + { + "entropy": 1.735701670249303, + "epoch": 1.112823047980006, + "grad_norm": 0.667665421962738, + "learning_rate": 9.798746173441852e-06, + "loss": 1.3649, + "mean_token_accuracy": 0.6617190291484197, + "num_tokens": 1698784904.0, + "step": 10130 + }, + { + "entropy": 1.7073118388652802, + "epoch": 1.112932904891379, + "grad_norm": 0.8839547634124756, + "learning_rate": 9.797159389741436e-06, + "loss": 1.3859, + "mean_token_accuracy": 0.6646972000598907, + "num_tokens": 1698955976.0, + "step": 10131 + }, + { + "entropy": 1.6580509543418884, + "epoch": 1.1130427618027519, + "grad_norm": 0.6089791655540466, + "learning_rate": 9.795572644110387e-06, + "loss": 1.4237, + "mean_token_accuracy": 0.6523320525884628, + "num_tokens": 1699205226.0, + "step": 10132 + }, + { + "entropy": 1.7225276331106822, + "epoch": 1.1131526187141247, + "grad_norm": 0.6253435611724854, + "learning_rate": 9.793985936598916e-06, + "loss": 1.34, + "mean_token_accuracy": 0.6591206341981888, + "num_tokens": 1699364197.0, + "step": 10133 + }, + { + "entropy": 1.7497197190920513, + "epoch": 1.1132624756254978, + "grad_norm": 0.7956197261810303, + "learning_rate": 9.792399267257249e-06, + "loss": 1.4329, + "mean_token_accuracy": 0.6548638641834259, + "num_tokens": 1699500152.0, + "step": 10134 + }, + { + "entropy": 1.7095544238885243, + "epoch": 1.1133723325368707, + "grad_norm": 0.6860001683235168, + "learning_rate": 9.790812636135603e-06, + "loss": 1.5067, + "mean_token_accuracy": 0.6451443135738373, + "num_tokens": 1699661286.0, + "step": 10135 + }, + { + "entropy": 1.6957333187262218, + "epoch": 1.1134821894482436, + "grad_norm": 0.7276600003242493, + "learning_rate": 9.78922604328419e-06, + "loss": 1.5212, + "mean_token_accuracy": 0.6416138807932535, + "num_tokens": 1699844103.0, + "step": 10136 + }, + { + "entropy": 1.7566113372643788, + "epoch": 1.1135920463596165, + "grad_norm": 0.7104360461235046, + "learning_rate": 9.787639488753224e-06, + "loss": 1.4888, + "mean_token_accuracy": 0.6481777926286062, + "num_tokens": 1699984609.0, + "step": 10137 + }, + { + "entropy": 1.6825304627418518, + "epoch": 1.1137019032709896, + "grad_norm": 0.8023889064788818, + "learning_rate": 9.78605297259293e-06, + "loss": 1.2334, + "mean_token_accuracy": 0.6820906003316244, + "num_tokens": 1700098327.0, + "step": 10138 + }, + { + "entropy": 1.673277239004771, + "epoch": 1.1138117601823625, + "grad_norm": 0.7964149117469788, + "learning_rate": 9.784466494853507e-06, + "loss": 1.3369, + "mean_token_accuracy": 0.6647952695687612, + "num_tokens": 1700260846.0, + "step": 10139 + }, + { + "entropy": 1.6351796289285023, + "epoch": 1.1139216170937354, + "grad_norm": 0.6151949763298035, + "learning_rate": 9.782880055585171e-06, + "loss": 1.5037, + "mean_token_accuracy": 0.640854095419248, + "num_tokens": 1700482399.0, + "step": 10140 + }, + { + "entropy": 1.7372311453024547, + "epoch": 1.1140314740051083, + "grad_norm": 0.8570227026939392, + "learning_rate": 9.781293654838137e-06, + "loss": 1.4559, + "mean_token_accuracy": 0.6682560443878174, + "num_tokens": 1700618804.0, + "step": 10141 + }, + { + "entropy": 1.6593137284119923, + "epoch": 1.1141413309164814, + "grad_norm": 0.5903623700141907, + "learning_rate": 9.779707292662605e-06, + "loss": 1.3958, + "mean_token_accuracy": 0.6583308031161627, + "num_tokens": 1700820971.0, + "step": 10142 + }, + { + "entropy": 1.6421323815981548, + "epoch": 1.1142511878278543, + "grad_norm": 0.6660314798355103, + "learning_rate": 9.778120969108791e-06, + "loss": 1.2946, + "mean_token_accuracy": 0.6711133569478989, + "num_tokens": 1700977095.0, + "step": 10143 + }, + { + "entropy": 1.629296710093816, + "epoch": 1.1143610447392271, + "grad_norm": 0.6648311018943787, + "learning_rate": 9.776534684226898e-06, + "loss": 1.4811, + "mean_token_accuracy": 0.6564251184463501, + "num_tokens": 1701153164.0, + "step": 10144 + }, + { + "entropy": 1.746991515159607, + "epoch": 1.1144709016506, + "grad_norm": 0.7114366888999939, + "learning_rate": 9.774948438067127e-06, + "loss": 1.347, + "mean_token_accuracy": 0.6591640909512838, + "num_tokens": 1701290473.0, + "step": 10145 + }, + { + "entropy": 1.7070113221804302, + "epoch": 1.114580758561973, + "grad_norm": 0.7355979681015015, + "learning_rate": 9.773362230679685e-06, + "loss": 1.5335, + "mean_token_accuracy": 0.6442695558071136, + "num_tokens": 1701450663.0, + "step": 10146 + }, + { + "entropy": 1.7952332894007366, + "epoch": 1.114690615473346, + "grad_norm": 0.7171587347984314, + "learning_rate": 9.771776062114782e-06, + "loss": 1.5082, + "mean_token_accuracy": 0.6332686841487885, + "num_tokens": 1701691000.0, + "step": 10147 + }, + { + "entropy": 1.7332034011681874, + "epoch": 1.114800472384719, + "grad_norm": 0.6650365591049194, + "learning_rate": 9.77018993242261e-06, + "loss": 1.3343, + "mean_token_accuracy": 0.6661591629187266, + "num_tokens": 1701854437.0, + "step": 10148 + }, + { + "entropy": 1.7049450874328613, + "epoch": 1.1149103292960918, + "grad_norm": 0.6518258452415466, + "learning_rate": 9.76860384165337e-06, + "loss": 1.2154, + "mean_token_accuracy": 0.6860535194476446, + "num_tokens": 1702016466.0, + "step": 10149 + }, + { + "entropy": 1.6941988567511241, + "epoch": 1.1150201862074647, + "grad_norm": 0.6656258702278137, + "learning_rate": 9.76701778985727e-06, + "loss": 1.4502, + "mean_token_accuracy": 0.6606006671984991, + "num_tokens": 1702179287.0, + "step": 10150 + }, + { + "entropy": 1.6935183207194011, + "epoch": 1.1151300431188378, + "grad_norm": 0.7154098749160767, + "learning_rate": 9.765431777084495e-06, + "loss": 1.2172, + "mean_token_accuracy": 0.6906551122665405, + "num_tokens": 1702313501.0, + "step": 10151 + }, + { + "entropy": 1.7146691580613453, + "epoch": 1.1152399000302107, + "grad_norm": 0.7676160335540771, + "learning_rate": 9.763845803385247e-06, + "loss": 1.5107, + "mean_token_accuracy": 0.664627286295096, + "num_tokens": 1702453001.0, + "step": 10152 + }, + { + "entropy": 1.6825979848702748, + "epoch": 1.1153497569415836, + "grad_norm": 0.5976483225822449, + "learning_rate": 9.76225986880973e-06, + "loss": 1.537, + "mean_token_accuracy": 0.636245513955752, + "num_tokens": 1702684575.0, + "step": 10153 + }, + { + "entropy": 1.6788121958573659, + "epoch": 1.1154596138529564, + "grad_norm": 0.6797428131103516, + "learning_rate": 9.760673973408124e-06, + "loss": 1.2018, + "mean_token_accuracy": 0.6839652607838312, + "num_tokens": 1702832136.0, + "step": 10154 + }, + { + "entropy": 1.6775270501772563, + "epoch": 1.1155694707643296, + "grad_norm": 0.7173194885253906, + "learning_rate": 9.75908811723063e-06, + "loss": 1.3752, + "mean_token_accuracy": 0.6599200914303461, + "num_tokens": 1702990937.0, + "step": 10155 + }, + { + "entropy": 1.6935907403628032, + "epoch": 1.1156793276757024, + "grad_norm": 0.7358232140541077, + "learning_rate": 9.757502300327439e-06, + "loss": 1.2976, + "mean_token_accuracy": 0.6701224446296692, + "num_tokens": 1703131461.0, + "step": 10156 + }, + { + "entropy": 1.6839358309904735, + "epoch": 1.1157891845870753, + "grad_norm": 0.7180026173591614, + "learning_rate": 9.755916522748738e-06, + "loss": 1.3543, + "mean_token_accuracy": 0.6787949800491333, + "num_tokens": 1703265354.0, + "step": 10157 + }, + { + "entropy": 1.7061149676640828, + "epoch": 1.1158990414984482, + "grad_norm": 0.7453353404998779, + "learning_rate": 9.754330784544719e-06, + "loss": 1.3589, + "mean_token_accuracy": 0.6774944067001343, + "num_tokens": 1703403406.0, + "step": 10158 + }, + { + "entropy": 1.6841832200686138, + "epoch": 1.116008898409821, + "grad_norm": 0.7039199471473694, + "learning_rate": 9.752745085765571e-06, + "loss": 1.4147, + "mean_token_accuracy": 0.6500913898150126, + "num_tokens": 1703568329.0, + "step": 10159 + }, + { + "entropy": 1.7383232315381367, + "epoch": 1.1161187553211942, + "grad_norm": 0.6420716047286987, + "learning_rate": 9.751159426461479e-06, + "loss": 1.5264, + "mean_token_accuracy": 0.6397146930297216, + "num_tokens": 1703758280.0, + "step": 10160 + }, + { + "entropy": 1.7103537619113922, + "epoch": 1.116228612232567, + "grad_norm": 0.7599209547042847, + "learning_rate": 9.749573806682629e-06, + "loss": 1.5267, + "mean_token_accuracy": 0.6386250903209051, + "num_tokens": 1703928952.0, + "step": 10161 + }, + { + "entropy": 1.6944385866324108, + "epoch": 1.11633846914394, + "grad_norm": 0.6873871684074402, + "learning_rate": 9.747988226479203e-06, + "loss": 1.3201, + "mean_token_accuracy": 0.6644426584243774, + "num_tokens": 1704065206.0, + "step": 10162 + }, + { + "entropy": 1.65885129570961, + "epoch": 1.1164483260553129, + "grad_norm": 0.62119460105896, + "learning_rate": 9.746402685901384e-06, + "loss": 1.409, + "mean_token_accuracy": 0.6450504660606384, + "num_tokens": 1704239787.0, + "step": 10163 + }, + { + "entropy": 1.6746040880680084, + "epoch": 1.116558182966686, + "grad_norm": 0.6383149027824402, + "learning_rate": 9.74481718499936e-06, + "loss": 1.401, + "mean_token_accuracy": 0.6668302963177363, + "num_tokens": 1704427935.0, + "step": 10164 + }, + { + "entropy": 1.7026494840780895, + "epoch": 1.1166680398780588, + "grad_norm": 0.6862279772758484, + "learning_rate": 9.743231723823301e-06, + "loss": 1.3652, + "mean_token_accuracy": 0.6631535540024439, + "num_tokens": 1704561300.0, + "step": 10165 + }, + { + "entropy": 1.6878803571065266, + "epoch": 1.1167778967894317, + "grad_norm": 1.0565212965011597, + "learning_rate": 9.741646302423392e-06, + "loss": 1.5243, + "mean_token_accuracy": 0.6648477713267008, + "num_tokens": 1704716778.0, + "step": 10166 + }, + { + "entropy": 1.774049351612727, + "epoch": 1.1168877537008046, + "grad_norm": 0.6734504103660583, + "learning_rate": 9.740060920849816e-06, + "loss": 1.4133, + "mean_token_accuracy": 0.6444449126720428, + "num_tokens": 1704906978.0, + "step": 10167 + }, + { + "entropy": 1.7018288373947144, + "epoch": 1.1169976106121777, + "grad_norm": 0.5699096322059631, + "learning_rate": 9.73847557915274e-06, + "loss": 1.3479, + "mean_token_accuracy": 0.6530384172995886, + "num_tokens": 1705118342.0, + "step": 10168 + }, + { + "entropy": 1.7284921209017436, + "epoch": 1.1171074675235506, + "grad_norm": 0.6984654068946838, + "learning_rate": 9.73689027738234e-06, + "loss": 1.5368, + "mean_token_accuracy": 0.6358341524998347, + "num_tokens": 1705278596.0, + "step": 10169 + }, + { + "entropy": 1.7392705778280895, + "epoch": 1.1172173244349235, + "grad_norm": 0.6023355722427368, + "learning_rate": 9.735305015588803e-06, + "loss": 1.4588, + "mean_token_accuracy": 0.6480032652616501, + "num_tokens": 1705479803.0, + "step": 10170 + }, + { + "entropy": 1.6727214256922405, + "epoch": 1.1173271813462964, + "grad_norm": 0.708677351474762, + "learning_rate": 9.733719793822285e-06, + "loss": 1.3237, + "mean_token_accuracy": 0.6617665340503057, + "num_tokens": 1705612132.0, + "step": 10171 + }, + { + "entropy": 1.7155958612759907, + "epoch": 1.1174370382576693, + "grad_norm": 0.6951910853385925, + "learning_rate": 9.732134612132967e-06, + "loss": 1.5166, + "mean_token_accuracy": 0.6467616135875384, + "num_tokens": 1705786778.0, + "step": 10172 + }, + { + "entropy": 1.7597291270891826, + "epoch": 1.1175468951690424, + "grad_norm": 0.8650582432746887, + "learning_rate": 9.730549470571017e-06, + "loss": 1.3505, + "mean_token_accuracy": 0.667744422952334, + "num_tokens": 1705918795.0, + "step": 10173 + }, + { + "entropy": 1.7015631298224132, + "epoch": 1.1176567520804153, + "grad_norm": 0.6916826367378235, + "learning_rate": 9.728964369186604e-06, + "loss": 1.4126, + "mean_token_accuracy": 0.6624699632326762, + "num_tokens": 1706077148.0, + "step": 10174 + }, + { + "entropy": 1.6976262032985687, + "epoch": 1.1177666089917881, + "grad_norm": 0.6640357375144958, + "learning_rate": 9.727379308029894e-06, + "loss": 1.2898, + "mean_token_accuracy": 0.6710817664861679, + "num_tokens": 1706214005.0, + "step": 10175 + }, + { + "entropy": 1.6960356036822002, + "epoch": 1.117876465903161, + "grad_norm": 0.6526516675949097, + "learning_rate": 9.72579428715106e-06, + "loss": 1.4422, + "mean_token_accuracy": 0.6493511895338694, + "num_tokens": 1706416877.0, + "step": 10176 + }, + { + "entropy": 1.7513096928596497, + "epoch": 1.1179863228145341, + "grad_norm": 0.6430375576019287, + "learning_rate": 9.724209306600259e-06, + "loss": 1.365, + "mean_token_accuracy": 0.668521781762441, + "num_tokens": 1706557592.0, + "step": 10177 + }, + { + "entropy": 1.6746535897254944, + "epoch": 1.118096179725907, + "grad_norm": 0.6057274341583252, + "learning_rate": 9.72262436642766e-06, + "loss": 1.3255, + "mean_token_accuracy": 0.6679912606875101, + "num_tokens": 1706756422.0, + "step": 10178 + }, + { + "entropy": 1.7983420590559642, + "epoch": 1.11820603663728, + "grad_norm": 0.7294691205024719, + "learning_rate": 9.721039466683425e-06, + "loss": 1.4899, + "mean_token_accuracy": 0.6450391262769699, + "num_tokens": 1706915673.0, + "step": 10179 + }, + { + "entropy": 1.6967030266920726, + "epoch": 1.1183158935486528, + "grad_norm": 2.2237155437469482, + "learning_rate": 9.719454607417713e-06, + "loss": 1.1928, + "mean_token_accuracy": 0.6813047230243683, + "num_tokens": 1707094039.0, + "step": 10180 + }, + { + "entropy": 1.684626470009486, + "epoch": 1.118425750460026, + "grad_norm": 0.637450635433197, + "learning_rate": 9.717869788680686e-06, + "loss": 1.4283, + "mean_token_accuracy": 0.6514883587757746, + "num_tokens": 1707258674.0, + "step": 10181 + }, + { + "entropy": 1.7461927036444347, + "epoch": 1.1185356073713988, + "grad_norm": 0.7752982378005981, + "learning_rate": 9.716285010522507e-06, + "loss": 1.29, + "mean_token_accuracy": 0.6688565959533056, + "num_tokens": 1707367308.0, + "step": 10182 + }, + { + "entropy": 1.6707812150319417, + "epoch": 1.1186454642827717, + "grad_norm": 0.6544961929321289, + "learning_rate": 9.71470027299332e-06, + "loss": 1.5234, + "mean_token_accuracy": 0.6454186936219534, + "num_tokens": 1707582766.0, + "step": 10183 + }, + { + "entropy": 1.6422998011112213, + "epoch": 1.1187553211941446, + "grad_norm": 0.6208282709121704, + "learning_rate": 9.713115576143294e-06, + "loss": 1.4995, + "mean_token_accuracy": 0.6404968003431956, + "num_tokens": 1707802739.0, + "step": 10184 + }, + { + "entropy": 1.7650231917699177, + "epoch": 1.1188651781055177, + "grad_norm": 0.7994515895843506, + "learning_rate": 9.711530920022583e-06, + "loss": 1.3585, + "mean_token_accuracy": 0.6573351373275121, + "num_tokens": 1707927893.0, + "step": 10185 + }, + { + "entropy": 1.650092860062917, + "epoch": 1.1189750350168906, + "grad_norm": 0.6611128449440002, + "learning_rate": 9.709946304681337e-06, + "loss": 1.3735, + "mean_token_accuracy": 0.654491126537323, + "num_tokens": 1708061832.0, + "step": 10186 + }, + { + "entropy": 1.6721069812774658, + "epoch": 1.1190848919282634, + "grad_norm": 0.608931303024292, + "learning_rate": 9.708361730169704e-06, + "loss": 1.3893, + "mean_token_accuracy": 0.6576156516869863, + "num_tokens": 1708203863.0, + "step": 10187 + }, + { + "entropy": 1.720770001411438, + "epoch": 1.1191947488396363, + "grad_norm": 0.6923795342445374, + "learning_rate": 9.706777196537848e-06, + "loss": 1.422, + "mean_token_accuracy": 0.6553168892860413, + "num_tokens": 1708401936.0, + "step": 10188 + }, + { + "entropy": 1.7493318518002827, + "epoch": 1.1193046057510092, + "grad_norm": 0.7753176093101501, + "learning_rate": 9.705192703835905e-06, + "loss": 1.336, + "mean_token_accuracy": 0.6591468950112661, + "num_tokens": 1708521454.0, + "step": 10189 + }, + { + "entropy": 1.7393775284290314, + "epoch": 1.1194144626623823, + "grad_norm": 0.7612557411193848, + "learning_rate": 9.703608252114032e-06, + "loss": 1.3429, + "mean_token_accuracy": 0.6587264835834503, + "num_tokens": 1708653355.0, + "step": 10190 + }, + { + "entropy": 1.6912192503611247, + "epoch": 1.1195243195737552, + "grad_norm": 0.6567522883415222, + "learning_rate": 9.702023841422375e-06, + "loss": 1.4079, + "mean_token_accuracy": 0.6546053836743037, + "num_tokens": 1708820340.0, + "step": 10191 + }, + { + "entropy": 1.651652197043101, + "epoch": 1.119634176485128, + "grad_norm": 0.5810186862945557, + "learning_rate": 9.700439471811076e-06, + "loss": 1.411, + "mean_token_accuracy": 0.642088994383812, + "num_tokens": 1709066402.0, + "step": 10192 + }, + { + "entropy": 1.6859426498413086, + "epoch": 1.119744033396501, + "grad_norm": 0.7347911596298218, + "learning_rate": 9.698855143330279e-06, + "loss": 1.3373, + "mean_token_accuracy": 0.6684578359127045, + "num_tokens": 1709226238.0, + "step": 10193 + }, + { + "entropy": 1.7300080160299938, + "epoch": 1.119853890307874, + "grad_norm": 0.5946059823036194, + "learning_rate": 9.697270856030139e-06, + "loss": 1.4268, + "mean_token_accuracy": 0.641315350929896, + "num_tokens": 1709429497.0, + "step": 10194 + }, + { + "entropy": 1.7237819532553356, + "epoch": 1.119963747219247, + "grad_norm": 0.6461696624755859, + "learning_rate": 9.695686609960781e-06, + "loss": 1.3313, + "mean_token_accuracy": 0.6617699911197027, + "num_tokens": 1709578553.0, + "step": 10195 + }, + { + "entropy": 1.7087758978207905, + "epoch": 1.1200736041306198, + "grad_norm": 0.6898693442344666, + "learning_rate": 9.694102405172359e-06, + "loss": 1.5797, + "mean_token_accuracy": 0.6312484840552012, + "num_tokens": 1709792879.0, + "step": 10196 + }, + { + "entropy": 1.7287095288435619, + "epoch": 1.1201834610419927, + "grad_norm": 0.6966397166252136, + "learning_rate": 9.692518241715007e-06, + "loss": 1.4683, + "mean_token_accuracy": 0.642145057519277, + "num_tokens": 1709980761.0, + "step": 10197 + }, + { + "entropy": 1.6889378329118092, + "epoch": 1.1202933179533658, + "grad_norm": 0.7332677245140076, + "learning_rate": 9.690934119638864e-06, + "loss": 1.3267, + "mean_token_accuracy": 0.670543372631073, + "num_tokens": 1710162953.0, + "step": 10198 + }, + { + "entropy": 1.7048714061578114, + "epoch": 1.1204031748647387, + "grad_norm": 0.643816351890564, + "learning_rate": 9.68935003899406e-06, + "loss": 1.483, + "mean_token_accuracy": 0.6404896924893061, + "num_tokens": 1710319457.0, + "step": 10199 + }, + { + "entropy": 1.758160392443339, + "epoch": 1.1205130317761116, + "grad_norm": 0.7269825339317322, + "learning_rate": 9.687765999830747e-06, + "loss": 1.4181, + "mean_token_accuracy": 0.6542981912692388, + "num_tokens": 1710487316.0, + "step": 10200 + }, + { + "entropy": 1.6718091368675232, + "epoch": 1.1206228886874845, + "grad_norm": 0.6098527908325195, + "learning_rate": 9.686182002199043e-06, + "loss": 1.3717, + "mean_token_accuracy": 0.6548734953006109, + "num_tokens": 1710721505.0, + "step": 10201 + }, + { + "entropy": 1.658282607793808, + "epoch": 1.1207327455988576, + "grad_norm": 0.7469896078109741, + "learning_rate": 9.684598046149086e-06, + "loss": 1.3578, + "mean_token_accuracy": 0.6641291330258051, + "num_tokens": 1710872364.0, + "step": 10202 + }, + { + "entropy": 1.6753500401973724, + "epoch": 1.1208426025102305, + "grad_norm": 0.6354155540466309, + "learning_rate": 9.68301413173101e-06, + "loss": 1.3854, + "mean_token_accuracy": 0.6567136198282242, + "num_tokens": 1711009667.0, + "step": 10203 + }, + { + "entropy": 1.7254823247591655, + "epoch": 1.1209524594216034, + "grad_norm": 0.8729540109634399, + "learning_rate": 9.681430258994942e-06, + "loss": 1.5197, + "mean_token_accuracy": 0.6521262973546982, + "num_tokens": 1711175837.0, + "step": 10204 + }, + { + "entropy": 1.7196756303310394, + "epoch": 1.1210623163329763, + "grad_norm": 0.9252813458442688, + "learning_rate": 9.67984642799101e-06, + "loss": 1.5275, + "mean_token_accuracy": 0.6681269109249115, + "num_tokens": 1711325831.0, + "step": 10205 + }, + { + "entropy": 1.6983699103196461, + "epoch": 1.1211721732443491, + "grad_norm": 1.3726911544799805, + "learning_rate": 9.67826263876935e-06, + "loss": 1.3746, + "mean_token_accuracy": 0.6633789986371994, + "num_tokens": 1711546558.0, + "step": 10206 + }, + { + "entropy": 1.7186132570107777, + "epoch": 1.1212820301557223, + "grad_norm": 0.6234872937202454, + "learning_rate": 9.676678891380075e-06, + "loss": 1.564, + "mean_token_accuracy": 0.643887793024381, + "num_tokens": 1711737822.0, + "step": 10207 + }, + { + "entropy": 1.7215098639329274, + "epoch": 1.1213918870670951, + "grad_norm": 0.6350100040435791, + "learning_rate": 9.67509518587332e-06, + "loss": 1.4629, + "mean_token_accuracy": 0.627614696820577, + "num_tokens": 1711928499.0, + "step": 10208 + }, + { + "entropy": 1.6704553961753845, + "epoch": 1.121501743978468, + "grad_norm": 0.8866394758224487, + "learning_rate": 9.673511522299206e-06, + "loss": 1.2698, + "mean_token_accuracy": 0.6800111383199692, + "num_tokens": 1712068332.0, + "step": 10209 + }, + { + "entropy": 1.6835826337337494, + "epoch": 1.121611600889841, + "grad_norm": 0.787550151348114, + "learning_rate": 9.671927900707853e-06, + "loss": 1.3623, + "mean_token_accuracy": 0.6513439963261286, + "num_tokens": 1712231679.0, + "step": 10210 + }, + { + "entropy": 1.658888618151347, + "epoch": 1.121721457801214, + "grad_norm": 0.6265230774879456, + "learning_rate": 9.670344321149382e-06, + "loss": 1.3855, + "mean_token_accuracy": 0.6652615120013555, + "num_tokens": 1712419094.0, + "step": 10211 + }, + { + "entropy": 1.701241821050644, + "epoch": 1.121831314712587, + "grad_norm": 0.6618353724479675, + "learning_rate": 9.66876078367392e-06, + "loss": 1.2214, + "mean_token_accuracy": 0.6815350999434789, + "num_tokens": 1712520934.0, + "step": 10212 + }, + { + "entropy": 1.674480825662613, + "epoch": 1.1219411716239598, + "grad_norm": 1.002471685409546, + "learning_rate": 9.667177288331575e-06, + "loss": 1.4104, + "mean_token_accuracy": 0.6764763842026392, + "num_tokens": 1712659842.0, + "step": 10213 + }, + { + "entropy": 1.7319279114405315, + "epoch": 1.1220510285353327, + "grad_norm": 0.7504306435585022, + "learning_rate": 9.665593835172469e-06, + "loss": 1.5625, + "mean_token_accuracy": 0.6419167965650558, + "num_tokens": 1712808419.0, + "step": 10214 + }, + { + "entropy": 1.7441862523555756, + "epoch": 1.1221608854467058, + "grad_norm": 0.771921694278717, + "learning_rate": 9.664010424246718e-06, + "loss": 1.4994, + "mean_token_accuracy": 0.6410591999689738, + "num_tokens": 1713000413.0, + "step": 10215 + }, + { + "entropy": 1.680644154548645, + "epoch": 1.1222707423580787, + "grad_norm": 0.6322289109230042, + "learning_rate": 9.662427055604433e-06, + "loss": 1.4454, + "mean_token_accuracy": 0.6500407656033834, + "num_tokens": 1713179456.0, + "step": 10216 + }, + { + "entropy": 1.72195503115654, + "epoch": 1.1223805992694516, + "grad_norm": 0.7085158824920654, + "learning_rate": 9.66084372929573e-06, + "loss": 1.2239, + "mean_token_accuracy": 0.6777483820915222, + "num_tokens": 1713317703.0, + "step": 10217 + }, + { + "entropy": 1.6902472376823425, + "epoch": 1.1224904561808244, + "grad_norm": 0.7547399997711182, + "learning_rate": 9.659260445370721e-06, + "loss": 1.2484, + "mean_token_accuracy": 0.6817424396673838, + "num_tokens": 1713448541.0, + "step": 10218 + }, + { + "entropy": 1.719992220401764, + "epoch": 1.1226003130921973, + "grad_norm": 0.7051261067390442, + "learning_rate": 9.65767720387951e-06, + "loss": 1.408, + "mean_token_accuracy": 0.6687562465667725, + "num_tokens": 1713608815.0, + "step": 10219 + }, + { + "entropy": 1.7180972397327423, + "epoch": 1.1227101700035704, + "grad_norm": 0.7280387878417969, + "learning_rate": 9.656094004872214e-06, + "loss": 1.4462, + "mean_token_accuracy": 0.6457540740569433, + "num_tokens": 1713764991.0, + "step": 10220 + }, + { + "entropy": 1.7479403515656788, + "epoch": 1.1228200269149433, + "grad_norm": 0.7459531426429749, + "learning_rate": 9.65451084839894e-06, + "loss": 1.4455, + "mean_token_accuracy": 0.6594701160987219, + "num_tokens": 1713947251.0, + "step": 10221 + }, + { + "entropy": 1.662820319334666, + "epoch": 1.1229298838263162, + "grad_norm": 0.7663895487785339, + "learning_rate": 9.652927734509785e-06, + "loss": 1.2602, + "mean_token_accuracy": 0.6720548172791799, + "num_tokens": 1714098388.0, + "step": 10222 + }, + { + "entropy": 1.6706760227680206, + "epoch": 1.123039740737689, + "grad_norm": 0.7139101624488831, + "learning_rate": 9.651344663254867e-06, + "loss": 1.2496, + "mean_token_accuracy": 0.6699910461902618, + "num_tokens": 1714223112.0, + "step": 10223 + }, + { + "entropy": 1.7492280701796215, + "epoch": 1.1231495976490622, + "grad_norm": 0.7086498141288757, + "learning_rate": 9.649761634684278e-06, + "loss": 1.3808, + "mean_token_accuracy": 0.6561706811189651, + "num_tokens": 1714384472.0, + "step": 10224 + }, + { + "entropy": 1.7483586271603901, + "epoch": 1.123259454560435, + "grad_norm": 0.7158522605895996, + "learning_rate": 9.648178648848124e-06, + "loss": 1.336, + "mean_token_accuracy": 0.6585302899281184, + "num_tokens": 1714505046.0, + "step": 10225 + }, + { + "entropy": 1.727395882209142, + "epoch": 1.123369311471808, + "grad_norm": 0.713912844657898, + "learning_rate": 9.646595705796512e-06, + "loss": 1.5567, + "mean_token_accuracy": 0.622910718123118, + "num_tokens": 1714731130.0, + "step": 10226 + }, + { + "entropy": 1.655521293481191, + "epoch": 1.1234791683831808, + "grad_norm": 0.6293887495994568, + "learning_rate": 9.64501280557953e-06, + "loss": 1.3659, + "mean_token_accuracy": 0.6622976015011469, + "num_tokens": 1714907443.0, + "step": 10227 + }, + { + "entropy": 1.7207797865072887, + "epoch": 1.123589025294554, + "grad_norm": 0.6388752460479736, + "learning_rate": 9.643429948247285e-06, + "loss": 1.4711, + "mean_token_accuracy": 0.6455638408660889, + "num_tokens": 1715072791.0, + "step": 10228 + }, + { + "entropy": 1.7234470546245575, + "epoch": 1.1236988822059268, + "grad_norm": 0.733909547328949, + "learning_rate": 9.641847133849871e-06, + "loss": 1.3224, + "mean_token_accuracy": 0.6681032180786133, + "num_tokens": 1715205714.0, + "step": 10229 + }, + { + "entropy": 1.6593499183654785, + "epoch": 1.1238087391172997, + "grad_norm": 0.7304568290710449, + "learning_rate": 9.640264362437383e-06, + "loss": 1.3619, + "mean_token_accuracy": 0.6740380873282751, + "num_tokens": 1715356891.0, + "step": 10230 + }, + { + "entropy": 1.6985628008842468, + "epoch": 1.1239185960286726, + "grad_norm": 0.7651453614234924, + "learning_rate": 9.638681634059912e-06, + "loss": 1.329, + "mean_token_accuracy": 0.6615240027507147, + "num_tokens": 1715495634.0, + "step": 10231 + }, + { + "entropy": 1.7773073414961498, + "epoch": 1.1240284529400455, + "grad_norm": 0.8025757074356079, + "learning_rate": 9.63709894876756e-06, + "loss": 1.4464, + "mean_token_accuracy": 0.6638560245434443, + "num_tokens": 1715615936.0, + "step": 10232 + }, + { + "entropy": 1.703110893567403, + "epoch": 1.1241383098514186, + "grad_norm": 0.7909550666809082, + "learning_rate": 9.63551630661041e-06, + "loss": 1.3458, + "mean_token_accuracy": 0.6680939247210821, + "num_tokens": 1715769008.0, + "step": 10233 + }, + { + "entropy": 1.720233827829361, + "epoch": 1.1242481667627915, + "grad_norm": 0.7866725921630859, + "learning_rate": 9.633933707638549e-06, + "loss": 1.3118, + "mean_token_accuracy": 0.6679093490044276, + "num_tokens": 1715904405.0, + "step": 10234 + }, + { + "entropy": 1.7014137109120686, + "epoch": 1.1243580236741644, + "grad_norm": 0.9128050804138184, + "learning_rate": 9.632351151902078e-06, + "loss": 1.4969, + "mean_token_accuracy": 0.6651032914717993, + "num_tokens": 1716056585.0, + "step": 10235 + }, + { + "entropy": 1.7811519304911296, + "epoch": 1.1244678805855373, + "grad_norm": 0.6634161472320557, + "learning_rate": 9.630768639451074e-06, + "loss": 1.5714, + "mean_token_accuracy": 0.6314926048119863, + "num_tokens": 1716297004.0, + "step": 10236 + }, + { + "entropy": 1.6649401287237804, + "epoch": 1.1245777374969104, + "grad_norm": 0.9859393239021301, + "learning_rate": 9.629186170335623e-06, + "loss": 1.4163, + "mean_token_accuracy": 0.6583981762329737, + "num_tokens": 1716441728.0, + "step": 10237 + }, + { + "entropy": 1.6984333594640095, + "epoch": 1.1246875944082833, + "grad_norm": 0.5795386433601379, + "learning_rate": 9.627603744605816e-06, + "loss": 1.3759, + "mean_token_accuracy": 0.6487035552660624, + "num_tokens": 1716646155.0, + "step": 10238 + }, + { + "entropy": 1.7283507784207661, + "epoch": 1.1247974513196561, + "grad_norm": 0.6568630337715149, + "learning_rate": 9.626021362311728e-06, + "loss": 1.4842, + "mean_token_accuracy": 0.6527669827143351, + "num_tokens": 1716845898.0, + "step": 10239 + }, + { + "entropy": 1.672084202369054, + "epoch": 1.124907308231029, + "grad_norm": 1.0509593486785889, + "learning_rate": 9.624439023503447e-06, + "loss": 1.4772, + "mean_token_accuracy": 0.6506913155317307, + "num_tokens": 1717018504.0, + "step": 10240 + }, + { + "entropy": 1.747216780980428, + "epoch": 1.1250171651424021, + "grad_norm": 0.6140730977058411, + "learning_rate": 9.62285672823105e-06, + "loss": 1.509, + "mean_token_accuracy": 0.6518428673346838, + "num_tokens": 1717198945.0, + "step": 10241 + }, + { + "entropy": 1.7177268067995708, + "epoch": 1.125127022053775, + "grad_norm": 0.6890478730201721, + "learning_rate": 9.62127447654462e-06, + "loss": 1.3421, + "mean_token_accuracy": 0.6564944684505463, + "num_tokens": 1717368940.0, + "step": 10242 + }, + { + "entropy": 1.7818762163321178, + "epoch": 1.125236878965148, + "grad_norm": 0.7375714778900146, + "learning_rate": 9.619692268494227e-06, + "loss": 1.4952, + "mean_token_accuracy": 0.653364305694898, + "num_tokens": 1717525260.0, + "step": 10243 + }, + { + "entropy": 1.7333811124165852, + "epoch": 1.1253467358765208, + "grad_norm": 0.8810262084007263, + "learning_rate": 9.618110104129959e-06, + "loss": 1.2485, + "mean_token_accuracy": 0.673724964261055, + "num_tokens": 1717622207.0, + "step": 10244 + }, + { + "entropy": 1.7475936810175579, + "epoch": 1.1254565927878937, + "grad_norm": 0.7619924545288086, + "learning_rate": 9.616527983501875e-06, + "loss": 1.3755, + "mean_token_accuracy": 0.6646767059961954, + "num_tokens": 1717792417.0, + "step": 10245 + }, + { + "entropy": 1.6934519012769063, + "epoch": 1.1255664496992668, + "grad_norm": 0.7149166464805603, + "learning_rate": 9.61494590666006e-06, + "loss": 1.5686, + "mean_token_accuracy": 0.6368276750048002, + "num_tokens": 1717962583.0, + "step": 10246 + }, + { + "entropy": 1.7014482418696086, + "epoch": 1.1256763066106397, + "grad_norm": 0.7237895131111145, + "learning_rate": 9.613363873654587e-06, + "loss": 1.455, + "mean_token_accuracy": 0.6583812286456426, + "num_tokens": 1718136025.0, + "step": 10247 + }, + { + "entropy": 1.6409784257411957, + "epoch": 1.1257861635220126, + "grad_norm": 0.8253493905067444, + "learning_rate": 9.611781884535515e-06, + "loss": 1.4142, + "mean_token_accuracy": 0.6619662940502167, + "num_tokens": 1718279754.0, + "step": 10248 + }, + { + "entropy": 1.713402251402537, + "epoch": 1.1258960204333854, + "grad_norm": 0.7551915049552917, + "learning_rate": 9.610199939352927e-06, + "loss": 1.4834, + "mean_token_accuracy": 0.6495722184578577, + "num_tokens": 1718426265.0, + "step": 10249 + }, + { + "entropy": 1.701746533314387, + "epoch": 1.1260058773447585, + "grad_norm": 0.6706552505493164, + "learning_rate": 9.608618038156885e-06, + "loss": 1.2812, + "mean_token_accuracy": 0.6634306162595749, + "num_tokens": 1718558139.0, + "step": 10250 + }, + { + "entropy": 1.7403425474961598, + "epoch": 1.1261157342561314, + "grad_norm": 0.7633783221244812, + "learning_rate": 9.60703618099745e-06, + "loss": 1.4703, + "mean_token_accuracy": 0.6518524537483851, + "num_tokens": 1718700729.0, + "step": 10251 + }, + { + "entropy": 1.726717124382655, + "epoch": 1.1262255911675043, + "grad_norm": 0.6118089556694031, + "learning_rate": 9.605454367924694e-06, + "loss": 1.5331, + "mean_token_accuracy": 0.6415149420499802, + "num_tokens": 1718933466.0, + "step": 10252 + }, + { + "entropy": 1.6945171753565471, + "epoch": 1.1263354480788772, + "grad_norm": 0.8243867754936218, + "learning_rate": 9.603872598988681e-06, + "loss": 1.5107, + "mean_token_accuracy": 0.6461230466763178, + "num_tokens": 1719113356.0, + "step": 10253 + }, + { + "entropy": 1.7175917228062947, + "epoch": 1.1264453049902503, + "grad_norm": 0.6808179020881653, + "learning_rate": 9.60229087423947e-06, + "loss": 1.3696, + "mean_token_accuracy": 0.6517705669005712, + "num_tokens": 1719278633.0, + "step": 10254 + }, + { + "entropy": 1.7561777532100677, + "epoch": 1.1265551619016232, + "grad_norm": 0.7055034041404724, + "learning_rate": 9.60070919372712e-06, + "loss": 1.3642, + "mean_token_accuracy": 0.6630295763413111, + "num_tokens": 1719416802.0, + "step": 10255 + }, + { + "entropy": 1.6309455533822377, + "epoch": 1.126665018812996, + "grad_norm": 0.7051904797554016, + "learning_rate": 9.599127557501702e-06, + "loss": 1.3411, + "mean_token_accuracy": 0.66075432797273, + "num_tokens": 1719589328.0, + "step": 10256 + }, + { + "entropy": 1.7621269524097443, + "epoch": 1.126774875724369, + "grad_norm": 0.6432750821113586, + "learning_rate": 9.597545965613256e-06, + "loss": 1.5804, + "mean_token_accuracy": 0.6246517151594162, + "num_tokens": 1719767545.0, + "step": 10257 + }, + { + "entropy": 1.7293944557507832, + "epoch": 1.1268847326357418, + "grad_norm": 0.7736344337463379, + "learning_rate": 9.595964418111852e-06, + "loss": 1.4588, + "mean_token_accuracy": 0.6443672925233841, + "num_tokens": 1719921859.0, + "step": 10258 + }, + { + "entropy": 1.6619472404321034, + "epoch": 1.126994589547115, + "grad_norm": 0.6600391268730164, + "learning_rate": 9.594382915047541e-06, + "loss": 1.4106, + "mean_token_accuracy": 0.6570078432559967, + "num_tokens": 1720055743.0, + "step": 10259 + }, + { + "entropy": 1.703255335489909, + "epoch": 1.1271044464584878, + "grad_norm": 0.6473222374916077, + "learning_rate": 9.59280145647038e-06, + "loss": 1.2278, + "mean_token_accuracy": 0.6826945741971334, + "num_tokens": 1720194410.0, + "step": 10260 + }, + { + "entropy": 1.7209681471188862, + "epoch": 1.1272143033698607, + "grad_norm": 0.6969286203384399, + "learning_rate": 9.591220042430413e-06, + "loss": 1.3104, + "mean_token_accuracy": 0.6570959637562434, + "num_tokens": 1720355987.0, + "step": 10261 + }, + { + "entropy": 1.6787172555923462, + "epoch": 1.1273241602812336, + "grad_norm": 0.6830558180809021, + "learning_rate": 9.589638672977707e-06, + "loss": 1.4813, + "mean_token_accuracy": 0.6633206804593405, + "num_tokens": 1720540451.0, + "step": 10262 + }, + { + "entropy": 1.7526487509409587, + "epoch": 1.1274340171926067, + "grad_norm": 0.681906521320343, + "learning_rate": 9.588057348162291e-06, + "loss": 1.4437, + "mean_token_accuracy": 0.6427949617306391, + "num_tokens": 1720713011.0, + "step": 10263 + }, + { + "entropy": 1.6857905586560566, + "epoch": 1.1275438741039796, + "grad_norm": 0.7251749634742737, + "learning_rate": 9.586476068034227e-06, + "loss": 1.3212, + "mean_token_accuracy": 0.6568873276313146, + "num_tokens": 1720891843.0, + "step": 10264 + }, + { + "entropy": 1.670927365620931, + "epoch": 1.1276537310153525, + "grad_norm": 0.6126587986946106, + "learning_rate": 9.58489483264356e-06, + "loss": 1.4053, + "mean_token_accuracy": 0.6479257047176361, + "num_tokens": 1721080212.0, + "step": 10265 + }, + { + "entropy": 1.7660788198312123, + "epoch": 1.1277635879267254, + "grad_norm": 0.7999944686889648, + "learning_rate": 9.583313642040334e-06, + "loss": 1.6588, + "mean_token_accuracy": 0.6271585474411646, + "num_tokens": 1721260940.0, + "step": 10266 + }, + { + "entropy": 1.6764280597368877, + "epoch": 1.1278734448380985, + "grad_norm": 0.7556807398796082, + "learning_rate": 9.581732496274589e-06, + "loss": 1.44, + "mean_token_accuracy": 0.6474957416454951, + "num_tokens": 1721416623.0, + "step": 10267 + }, + { + "entropy": 1.6823217471440632, + "epoch": 1.1279833017494714, + "grad_norm": 0.7498142719268799, + "learning_rate": 9.58015139539638e-06, + "loss": 1.3952, + "mean_token_accuracy": 0.6678342968225479, + "num_tokens": 1721589531.0, + "step": 10268 + }, + { + "entropy": 1.6793174644311268, + "epoch": 1.1280931586608443, + "grad_norm": 0.9057360887527466, + "learning_rate": 9.578570339455731e-06, + "loss": 1.3778, + "mean_token_accuracy": 0.6720141271750132, + "num_tokens": 1721737220.0, + "step": 10269 + }, + { + "entropy": 1.7173177699247997, + "epoch": 1.1282030155722171, + "grad_norm": 0.7818934917449951, + "learning_rate": 9.576989328502692e-06, + "loss": 1.3202, + "mean_token_accuracy": 0.675959994395574, + "num_tokens": 1721852887.0, + "step": 10270 + }, + { + "entropy": 1.7108666598796844, + "epoch": 1.12831287248359, + "grad_norm": 0.6389757394790649, + "learning_rate": 9.575408362587303e-06, + "loss": 1.418, + "mean_token_accuracy": 0.66609459122022, + "num_tokens": 1722068639.0, + "step": 10271 + }, + { + "entropy": 1.695683737595876, + "epoch": 1.1284227293949631, + "grad_norm": 0.607473611831665, + "learning_rate": 9.573827441759595e-06, + "loss": 1.5285, + "mean_token_accuracy": 0.6294114092985789, + "num_tokens": 1722254953.0, + "step": 10272 + }, + { + "entropy": 1.6890762945016224, + "epoch": 1.128532586306336, + "grad_norm": 0.8829686045646667, + "learning_rate": 9.572246566069605e-06, + "loss": 1.4423, + "mean_token_accuracy": 0.6620638519525528, + "num_tokens": 1722387360.0, + "step": 10273 + }, + { + "entropy": 1.689795712629954, + "epoch": 1.128642443217709, + "grad_norm": 0.727443516254425, + "learning_rate": 9.570665735567371e-06, + "loss": 1.4261, + "mean_token_accuracy": 0.6524536609649658, + "num_tokens": 1722559707.0, + "step": 10274 + }, + { + "entropy": 1.6225083768367767, + "epoch": 1.1287523001290818, + "grad_norm": 0.7552088499069214, + "learning_rate": 9.569084950302919e-06, + "loss": 1.3415, + "mean_token_accuracy": 0.6804608354965845, + "num_tokens": 1722701488.0, + "step": 10275 + }, + { + "entropy": 1.6765993038813274, + "epoch": 1.128862157040455, + "grad_norm": 0.7234074473381042, + "learning_rate": 9.567504210326282e-06, + "loss": 1.4674, + "mean_token_accuracy": 0.653687963883082, + "num_tokens": 1722919230.0, + "step": 10276 + }, + { + "entropy": 1.641531725724538, + "epoch": 1.1289720139518278, + "grad_norm": 0.5420734882354736, + "learning_rate": 9.565923515687496e-06, + "loss": 1.4152, + "mean_token_accuracy": 0.6417432824770609, + "num_tokens": 1723122116.0, + "step": 10277 + }, + { + "entropy": 1.7413820525010426, + "epoch": 1.1290818708632007, + "grad_norm": 0.7966932654380798, + "learning_rate": 9.564342866436582e-06, + "loss": 1.459, + "mean_token_accuracy": 0.6337345441182455, + "num_tokens": 1723305443.0, + "step": 10278 + }, + { + "entropy": 1.6700741648674011, + "epoch": 1.1291917277745736, + "grad_norm": 0.7032240033149719, + "learning_rate": 9.562762262623569e-06, + "loss": 1.2664, + "mean_token_accuracy": 0.6797453612089157, + "num_tokens": 1723427892.0, + "step": 10279 + }, + { + "entropy": 1.7009910543759663, + "epoch": 1.1293015846859467, + "grad_norm": 0.6182928085327148, + "learning_rate": 9.561181704298487e-06, + "loss": 1.3533, + "mean_token_accuracy": 0.6629110823074976, + "num_tokens": 1723591055.0, + "step": 10280 + }, + { + "entropy": 1.6887823740641277, + "epoch": 1.1294114415973195, + "grad_norm": 0.6807572245597839, + "learning_rate": 9.55960119151135e-06, + "loss": 1.5122, + "mean_token_accuracy": 0.6364335119724274, + "num_tokens": 1723783650.0, + "step": 10281 + }, + { + "entropy": 1.7669847408930461, + "epoch": 1.1295212985086924, + "grad_norm": 0.8603620529174805, + "learning_rate": 9.558020724312192e-06, + "loss": 1.3234, + "mean_token_accuracy": 0.6592119683821996, + "num_tokens": 1723913943.0, + "step": 10282 + }, + { + "entropy": 1.8033428092797597, + "epoch": 1.1296311554200653, + "grad_norm": 0.6308757066726685, + "learning_rate": 9.556440302751022e-06, + "loss": 1.4724, + "mean_token_accuracy": 0.6299006392558416, + "num_tokens": 1724146940.0, + "step": 10283 + }, + { + "entropy": 1.6769826412200928, + "epoch": 1.1297410123314382, + "grad_norm": 0.7638330459594727, + "learning_rate": 9.554859926877868e-06, + "loss": 1.4675, + "mean_token_accuracy": 0.6560545514027277, + "num_tokens": 1724296007.0, + "step": 10284 + }, + { + "entropy": 1.7535623212655385, + "epoch": 1.1298508692428113, + "grad_norm": 0.7146167755126953, + "learning_rate": 9.553279596742748e-06, + "loss": 1.5313, + "mean_token_accuracy": 0.6488161732753118, + "num_tokens": 1724415625.0, + "step": 10285 + }, + { + "entropy": 1.7697202265262604, + "epoch": 1.1299607261541842, + "grad_norm": 0.7635106444358826, + "learning_rate": 9.551699312395677e-06, + "loss": 1.2609, + "mean_token_accuracy": 0.6754236469666163, + "num_tokens": 1724518575.0, + "step": 10286 + }, + { + "entropy": 1.6063568989435832, + "epoch": 1.130070583065557, + "grad_norm": 0.8617009520530701, + "learning_rate": 9.550119073886666e-06, + "loss": 1.3429, + "mean_token_accuracy": 0.6730814675490061, + "num_tokens": 1724716862.0, + "step": 10287 + }, + { + "entropy": 1.7419047852357228, + "epoch": 1.13018043997693, + "grad_norm": 0.7265612483024597, + "learning_rate": 9.548538881265739e-06, + "loss": 1.5776, + "mean_token_accuracy": 0.6297195802132288, + "num_tokens": 1724911990.0, + "step": 10288 + }, + { + "entropy": 1.7049497961997986, + "epoch": 1.130290296888303, + "grad_norm": 1.0515176057815552, + "learning_rate": 9.546958734582897e-06, + "loss": 1.333, + "mean_token_accuracy": 0.6726740250984827, + "num_tokens": 1725084372.0, + "step": 10289 + }, + { + "entropy": 1.6889955898125966, + "epoch": 1.130400153799676, + "grad_norm": 0.6568050980567932, + "learning_rate": 9.545378633888158e-06, + "loss": 1.4114, + "mean_token_accuracy": 0.6661112556854883, + "num_tokens": 1725262103.0, + "step": 10290 + }, + { + "entropy": 1.7066051562627156, + "epoch": 1.1305100107110488, + "grad_norm": 0.6112655997276306, + "learning_rate": 9.543798579231534e-06, + "loss": 1.4617, + "mean_token_accuracy": 0.647816851735115, + "num_tokens": 1725470273.0, + "step": 10291 + }, + { + "entropy": 1.6733331779638927, + "epoch": 1.1306198676224217, + "grad_norm": 0.5629302263259888, + "learning_rate": 9.542218570663024e-06, + "loss": 1.543, + "mean_token_accuracy": 0.6284714738527933, + "num_tokens": 1725724252.0, + "step": 10292 + }, + { + "entropy": 1.7002252141634624, + "epoch": 1.1307297245337948, + "grad_norm": 0.6977674961090088, + "learning_rate": 9.540638608232637e-06, + "loss": 1.3992, + "mean_token_accuracy": 0.6548378119866053, + "num_tokens": 1725901325.0, + "step": 10293 + }, + { + "entropy": 1.7188651661078136, + "epoch": 1.1308395814451677, + "grad_norm": 0.707028329372406, + "learning_rate": 9.539058691990388e-06, + "loss": 1.3586, + "mean_token_accuracy": 0.6680986136198044, + "num_tokens": 1726022572.0, + "step": 10294 + }, + { + "entropy": 1.7041955292224884, + "epoch": 1.1309494383565406, + "grad_norm": 0.583182692527771, + "learning_rate": 9.537478821986266e-06, + "loss": 1.4493, + "mean_token_accuracy": 0.648201530178388, + "num_tokens": 1726212529.0, + "step": 10295 + }, + { + "entropy": 1.6689602136611938, + "epoch": 1.1310592952679135, + "grad_norm": 0.6370671391487122, + "learning_rate": 9.535898998270283e-06, + "loss": 1.403, + "mean_token_accuracy": 0.6426753501097361, + "num_tokens": 1726394511.0, + "step": 10296 + }, + { + "entropy": 1.6657602687676747, + "epoch": 1.1311691521792864, + "grad_norm": 0.7945041656494141, + "learning_rate": 9.534319220892438e-06, + "loss": 1.4308, + "mean_token_accuracy": 0.6577966163555781, + "num_tokens": 1726563985.0, + "step": 10297 + }, + { + "entropy": 1.776595026254654, + "epoch": 1.1312790090906595, + "grad_norm": 0.7368420958518982, + "learning_rate": 9.53273948990273e-06, + "loss": 1.3903, + "mean_token_accuracy": 0.6460030823945999, + "num_tokens": 1726750625.0, + "step": 10298 + }, + { + "entropy": 1.7575420339902241, + "epoch": 1.1313888660020324, + "grad_norm": 0.7476623058319092, + "learning_rate": 9.531159805351151e-06, + "loss": 1.5496, + "mean_token_accuracy": 0.6646288931369781, + "num_tokens": 1726878077.0, + "step": 10299 + }, + { + "entropy": 1.6953720152378082, + "epoch": 1.1314987229134053, + "grad_norm": 0.6216321587562561, + "learning_rate": 9.52958016728771e-06, + "loss": 1.3568, + "mean_token_accuracy": 0.6530263473590215, + "num_tokens": 1727019274.0, + "step": 10300 + }, + { + "entropy": 1.751002957423528, + "epoch": 1.1316085798247781, + "grad_norm": 0.6717413663864136, + "learning_rate": 9.528000575762387e-06, + "loss": 1.3837, + "mean_token_accuracy": 0.6547732700904211, + "num_tokens": 1727175939.0, + "step": 10301 + }, + { + "entropy": 1.7171097993850708, + "epoch": 1.1317184367361512, + "grad_norm": 0.6593231558799744, + "learning_rate": 9.526421030825186e-06, + "loss": 1.504, + "mean_token_accuracy": 0.6401314934094747, + "num_tokens": 1727319618.0, + "step": 10302 + }, + { + "entropy": 1.730803112188975, + "epoch": 1.1318282936475241, + "grad_norm": 0.6500627994537354, + "learning_rate": 9.524841532526095e-06, + "loss": 1.3106, + "mean_token_accuracy": 0.6745495200157166, + "num_tokens": 1727473341.0, + "step": 10303 + }, + { + "entropy": 1.7114306290944417, + "epoch": 1.131938150558897, + "grad_norm": 0.6906517148017883, + "learning_rate": 9.523262080915103e-06, + "loss": 1.3155, + "mean_token_accuracy": 0.672325387597084, + "num_tokens": 1727600003.0, + "step": 10304 + }, + { + "entropy": 1.6404125392436981, + "epoch": 1.13204800747027, + "grad_norm": 0.7134620547294617, + "learning_rate": 9.521682676042201e-06, + "loss": 1.2493, + "mean_token_accuracy": 0.6794395595788956, + "num_tokens": 1727738890.0, + "step": 10305 + }, + { + "entropy": 1.7687196135520935, + "epoch": 1.132157864381643, + "grad_norm": 0.7907180190086365, + "learning_rate": 9.520103317957382e-06, + "loss": 1.3394, + "mean_token_accuracy": 0.6640162070592245, + "num_tokens": 1727889905.0, + "step": 10306 + }, + { + "entropy": 1.7121462921301525, + "epoch": 1.132267721293016, + "grad_norm": 0.8624327182769775, + "learning_rate": 9.51852400671062e-06, + "loss": 1.2975, + "mean_token_accuracy": 0.6706186135609945, + "num_tokens": 1728016133.0, + "step": 10307 + }, + { + "entropy": 1.7604290346304576, + "epoch": 1.1323775782043888, + "grad_norm": 0.680280327796936, + "learning_rate": 9.516944742351905e-06, + "loss": 1.4087, + "mean_token_accuracy": 0.6513306001822153, + "num_tokens": 1728165640.0, + "step": 10308 + }, + { + "entropy": 1.7208806375662486, + "epoch": 1.1324874351157617, + "grad_norm": 0.7024528384208679, + "learning_rate": 9.515365524931223e-06, + "loss": 1.2785, + "mean_token_accuracy": 0.6805547028779984, + "num_tokens": 1728291634.0, + "step": 10309 + }, + { + "entropy": 1.6809004644552867, + "epoch": 1.1325972920271346, + "grad_norm": 0.640552818775177, + "learning_rate": 9.513786354498554e-06, + "loss": 1.4281, + "mean_token_accuracy": 0.6382468740145365, + "num_tokens": 1728470807.0, + "step": 10310 + }, + { + "entropy": 1.6941138605276744, + "epoch": 1.1327071489385077, + "grad_norm": 0.6164855360984802, + "learning_rate": 9.512207231103874e-06, + "loss": 1.4135, + "mean_token_accuracy": 0.6559168150027593, + "num_tokens": 1728644963.0, + "step": 10311 + }, + { + "entropy": 1.6928605437278748, + "epoch": 1.1328170058498805, + "grad_norm": 0.7542420625686646, + "learning_rate": 9.51062815479717e-06, + "loss": 1.5822, + "mean_token_accuracy": 0.6550649454196295, + "num_tokens": 1728820062.0, + "step": 10312 + }, + { + "entropy": 1.686714122692744, + "epoch": 1.1329268627612534, + "grad_norm": 0.6460275650024414, + "learning_rate": 9.509049125628407e-06, + "loss": 1.3163, + "mean_token_accuracy": 0.6617647508780161, + "num_tokens": 1728959484.0, + "step": 10313 + }, + { + "entropy": 1.7324243982632954, + "epoch": 1.1330367196726263, + "grad_norm": 0.718798816204071, + "learning_rate": 9.50747014364757e-06, + "loss": 1.4294, + "mean_token_accuracy": 0.6527051776647568, + "num_tokens": 1729080689.0, + "step": 10314 + }, + { + "entropy": 1.758222073316574, + "epoch": 1.1331465765839994, + "grad_norm": 0.6609033942222595, + "learning_rate": 9.505891208904634e-06, + "loss": 1.3854, + "mean_token_accuracy": 0.6570114940404892, + "num_tokens": 1729234542.0, + "step": 10315 + }, + { + "entropy": 1.7237963378429413, + "epoch": 1.1332564334953723, + "grad_norm": 0.7275382280349731, + "learning_rate": 9.504312321449565e-06, + "loss": 1.3887, + "mean_token_accuracy": 0.6633484015862147, + "num_tokens": 1729382944.0, + "step": 10316 + }, + { + "entropy": 1.7187721331914265, + "epoch": 1.1333662904067452, + "grad_norm": 0.738908052444458, + "learning_rate": 9.502733481332334e-06, + "loss": 1.4098, + "mean_token_accuracy": 0.6463323136170706, + "num_tokens": 1729543587.0, + "step": 10317 + }, + { + "entropy": 1.6943478484948475, + "epoch": 1.133476147318118, + "grad_norm": 0.7214493155479431, + "learning_rate": 9.501154688602921e-06, + "loss": 1.5356, + "mean_token_accuracy": 0.6358891526858012, + "num_tokens": 1729722777.0, + "step": 10318 + }, + { + "entropy": 1.658001681168874, + "epoch": 1.1335860042294912, + "grad_norm": 0.6282760500907898, + "learning_rate": 9.499575943311279e-06, + "loss": 1.3592, + "mean_token_accuracy": 0.6529184977213541, + "num_tokens": 1729894534.0, + "step": 10319 + }, + { + "entropy": 1.6976789931456249, + "epoch": 1.133695861140864, + "grad_norm": 0.6558998823165894, + "learning_rate": 9.497997245507387e-06, + "loss": 1.4736, + "mean_token_accuracy": 0.6610572139422098, + "num_tokens": 1730086808.0, + "step": 10320 + }, + { + "entropy": 1.7426810363928478, + "epoch": 1.133805718052237, + "grad_norm": 0.6194723844528198, + "learning_rate": 9.496418595241203e-06, + "loss": 1.4611, + "mean_token_accuracy": 0.6435061097145081, + "num_tokens": 1730288789.0, + "step": 10321 + }, + { + "entropy": 1.7035086651643117, + "epoch": 1.1339155749636098, + "grad_norm": 0.6074888706207275, + "learning_rate": 9.494839992562697e-06, + "loss": 1.4105, + "mean_token_accuracy": 0.6378484318653742, + "num_tokens": 1730467433.0, + "step": 10322 + }, + { + "entropy": 1.7067280213038127, + "epoch": 1.1340254318749827, + "grad_norm": 0.6873534917831421, + "learning_rate": 9.49326143752182e-06, + "loss": 1.3172, + "mean_token_accuracy": 0.664742906888326, + "num_tokens": 1730640614.0, + "step": 10323 + }, + { + "entropy": 1.6762806077798207, + "epoch": 1.1341352887863558, + "grad_norm": 0.6909382939338684, + "learning_rate": 9.491682930168548e-06, + "loss": 1.3528, + "mean_token_accuracy": 0.6638672153155009, + "num_tokens": 1730807800.0, + "step": 10324 + }, + { + "entropy": 1.6831237574418385, + "epoch": 1.1342451456977287, + "grad_norm": 0.7416048049926758, + "learning_rate": 9.490104470552823e-06, + "loss": 1.4919, + "mean_token_accuracy": 0.6549220134814581, + "num_tokens": 1731057934.0, + "step": 10325 + }, + { + "entropy": 1.724802275498708, + "epoch": 1.1343550026091016, + "grad_norm": 0.7422142624855042, + "learning_rate": 9.488526058724617e-06, + "loss": 1.4261, + "mean_token_accuracy": 0.6417362888654073, + "num_tokens": 1731249169.0, + "step": 10326 + }, + { + "entropy": 1.6890951693058014, + "epoch": 1.1344648595204745, + "grad_norm": 0.7085136771202087, + "learning_rate": 9.48694769473388e-06, + "loss": 1.3171, + "mean_token_accuracy": 0.6658613979816437, + "num_tokens": 1731364488.0, + "step": 10327 + }, + { + "entropy": 1.797446479399999, + "epoch": 1.1345747164318476, + "grad_norm": 0.7604406476020813, + "learning_rate": 9.485369378630564e-06, + "loss": 1.3586, + "mean_token_accuracy": 0.66508649289608, + "num_tokens": 1731510025.0, + "step": 10328 + }, + { + "entropy": 1.69396177927653, + "epoch": 1.1346845733432205, + "grad_norm": 0.6808403134346008, + "learning_rate": 9.483791110464624e-06, + "loss": 1.3246, + "mean_token_accuracy": 0.6774620612462362, + "num_tokens": 1731644186.0, + "step": 10329 + }, + { + "entropy": 1.7862180769443512, + "epoch": 1.1347944302545934, + "grad_norm": 0.7217937707901001, + "learning_rate": 9.482212890286017e-06, + "loss": 1.4788, + "mean_token_accuracy": 0.6508872807025909, + "num_tokens": 1731839500.0, + "step": 10330 + }, + { + "entropy": 1.706006020307541, + "epoch": 1.1349042871659663, + "grad_norm": 0.6489999890327454, + "learning_rate": 9.480634718144684e-06, + "loss": 1.3578, + "mean_token_accuracy": 0.6593449711799622, + "num_tokens": 1731976383.0, + "step": 10331 + }, + { + "entropy": 1.7095771531263988, + "epoch": 1.1350141440773394, + "grad_norm": 0.6405286192893982, + "learning_rate": 9.47905659409058e-06, + "loss": 1.412, + "mean_token_accuracy": 0.6624357551336288, + "num_tokens": 1732123313.0, + "step": 10332 + }, + { + "entropy": 1.715343068043391, + "epoch": 1.1351240009887122, + "grad_norm": 0.7203224301338196, + "learning_rate": 9.477478518173646e-06, + "loss": 1.5001, + "mean_token_accuracy": 0.6489410251379013, + "num_tokens": 1732294493.0, + "step": 10333 + }, + { + "entropy": 1.6658440132935841, + "epoch": 1.1352338579000851, + "grad_norm": 0.6969232559204102, + "learning_rate": 9.475900490443835e-06, + "loss": 1.2221, + "mean_token_accuracy": 0.6730835686127344, + "num_tokens": 1732415579.0, + "step": 10334 + }, + { + "entropy": 1.7030311127503712, + "epoch": 1.135343714811458, + "grad_norm": 0.6568850874900818, + "learning_rate": 9.474322510951082e-06, + "loss": 1.3944, + "mean_token_accuracy": 0.6627188473939896, + "num_tokens": 1732592393.0, + "step": 10335 + }, + { + "entropy": 1.7397877375284831, + "epoch": 1.135453571722831, + "grad_norm": 0.7089440822601318, + "learning_rate": 9.472744579745338e-06, + "loss": 1.3188, + "mean_token_accuracy": 0.6615829467773438, + "num_tokens": 1732742092.0, + "step": 10336 + }, + { + "entropy": 1.7489128410816193, + "epoch": 1.135563428634204, + "grad_norm": 0.6160045862197876, + "learning_rate": 9.471166696876539e-06, + "loss": 1.4619, + "mean_token_accuracy": 0.646346777677536, + "num_tokens": 1732946886.0, + "step": 10337 + }, + { + "entropy": 1.7360928257306416, + "epoch": 1.135673285545577, + "grad_norm": 1.8664852380752563, + "learning_rate": 9.469588862394624e-06, + "loss": 1.1419, + "mean_token_accuracy": 0.675204411149025, + "num_tokens": 1733119323.0, + "step": 10338 + }, + { + "entropy": 1.6926367580890656, + "epoch": 1.1357831424569498, + "grad_norm": 0.7028651237487793, + "learning_rate": 9.468011076349532e-06, + "loss": 1.4818, + "mean_token_accuracy": 0.6468727837006251, + "num_tokens": 1733299458.0, + "step": 10339 + }, + { + "entropy": 1.7644418974717457, + "epoch": 1.1358929993683227, + "grad_norm": 0.7585068941116333, + "learning_rate": 9.466433338791202e-06, + "loss": 1.3099, + "mean_token_accuracy": 0.6703908890485764, + "num_tokens": 1733419610.0, + "step": 10340 + }, + { + "entropy": 1.7097221116224925, + "epoch": 1.1360028562796958, + "grad_norm": 0.7578231692314148, + "learning_rate": 9.46485564976956e-06, + "loss": 1.3979, + "mean_token_accuracy": 0.6605977068344752, + "num_tokens": 1733588689.0, + "step": 10341 + }, + { + "entropy": 1.7715651094913483, + "epoch": 1.1361127131910687, + "grad_norm": 0.6864378452301025, + "learning_rate": 9.463278009334552e-06, + "loss": 1.4961, + "mean_token_accuracy": 0.6296594391266505, + "num_tokens": 1733758712.0, + "step": 10342 + }, + { + "entropy": 1.6944345732529957, + "epoch": 1.1362225701024415, + "grad_norm": 1.1330305337905884, + "learning_rate": 9.461700417536095e-06, + "loss": 1.3299, + "mean_token_accuracy": 0.6746721168359121, + "num_tokens": 1733939337.0, + "step": 10343 + }, + { + "entropy": 1.672745595375697, + "epoch": 1.1363324270138144, + "grad_norm": 0.6669848561286926, + "learning_rate": 9.460122874424136e-06, + "loss": 1.5127, + "mean_token_accuracy": 0.649186576406161, + "num_tokens": 1734107917.0, + "step": 10344 + }, + { + "entropy": 1.693463295698166, + "epoch": 1.1364422839251875, + "grad_norm": 0.6603065133094788, + "learning_rate": 9.458545380048585e-06, + "loss": 1.2553, + "mean_token_accuracy": 0.67331130305926, + "num_tokens": 1734223657.0, + "step": 10345 + }, + { + "entropy": 1.7278470595677693, + "epoch": 1.1365521408365604, + "grad_norm": 1.3493287563323975, + "learning_rate": 9.456967934459383e-06, + "loss": 1.3466, + "mean_token_accuracy": 0.660364697376887, + "num_tokens": 1734412197.0, + "step": 10346 + }, + { + "entropy": 1.6575153172016144, + "epoch": 1.1366619977479333, + "grad_norm": 0.6611323952674866, + "learning_rate": 9.455390537706451e-06, + "loss": 1.2062, + "mean_token_accuracy": 0.680364117026329, + "num_tokens": 1734545860.0, + "step": 10347 + }, + { + "entropy": 1.656428058942159, + "epoch": 1.1367718546593062, + "grad_norm": 0.6214374303817749, + "learning_rate": 9.453813189839709e-06, + "loss": 1.4702, + "mean_token_accuracy": 0.6467104901870092, + "num_tokens": 1734744456.0, + "step": 10348 + }, + { + "entropy": 1.7128709455331166, + "epoch": 1.136881711570679, + "grad_norm": 0.804625928401947, + "learning_rate": 9.452235890909083e-06, + "loss": 1.523, + "mean_token_accuracy": 0.650082861383756, + "num_tokens": 1734900916.0, + "step": 10349 + }, + { + "entropy": 1.700402319431305, + "epoch": 1.1369915684820522, + "grad_norm": 0.7087069749832153, + "learning_rate": 9.450658640964498e-06, + "loss": 1.234, + "mean_token_accuracy": 0.6779408504565557, + "num_tokens": 1735001282.0, + "step": 10350 + }, + { + "entropy": 1.745913565158844, + "epoch": 1.137101425393425, + "grad_norm": 0.7145112156867981, + "learning_rate": 9.449081440055865e-06, + "loss": 1.4814, + "mean_token_accuracy": 0.6550154387950897, + "num_tokens": 1735153742.0, + "step": 10351 + }, + { + "entropy": 1.7599404752254486, + "epoch": 1.137211282304798, + "grad_norm": 1.112202763557434, + "learning_rate": 9.447504288233104e-06, + "loss": 1.5053, + "mean_token_accuracy": 0.6597884198029836, + "num_tokens": 1735351148.0, + "step": 10352 + }, + { + "entropy": 1.6744179526964824, + "epoch": 1.137321139216171, + "grad_norm": 0.5947840213775635, + "learning_rate": 9.44592718554614e-06, + "loss": 1.294, + "mean_token_accuracy": 0.6707391689221064, + "num_tokens": 1735548975.0, + "step": 10353 + }, + { + "entropy": 1.7266633212566376, + "epoch": 1.137430996127544, + "grad_norm": 0.6907797455787659, + "learning_rate": 9.444350132044873e-06, + "loss": 1.4707, + "mean_token_accuracy": 0.6492672860622406, + "num_tokens": 1735725525.0, + "step": 10354 + }, + { + "entropy": 1.7150587638219197, + "epoch": 1.1375408530389168, + "grad_norm": 0.639342725276947, + "learning_rate": 9.442773127779226e-06, + "loss": 1.3635, + "mean_token_accuracy": 0.6725068837404251, + "num_tokens": 1735894005.0, + "step": 10355 + }, + { + "entropy": 1.6278500159581502, + "epoch": 1.1376507099502897, + "grad_norm": 0.719607412815094, + "learning_rate": 9.44119617279911e-06, + "loss": 1.2805, + "mean_token_accuracy": 0.67085100710392, + "num_tokens": 1736025755.0, + "step": 10356 + }, + { + "entropy": 1.6743311981360118, + "epoch": 1.1377605668616626, + "grad_norm": 0.6438021659851074, + "learning_rate": 9.439619267154428e-06, + "loss": 1.3495, + "mean_token_accuracy": 0.6617470035950342, + "num_tokens": 1736176599.0, + "step": 10357 + }, + { + "entropy": 1.6663430829842885, + "epoch": 1.1378704237730357, + "grad_norm": 0.6496438384056091, + "learning_rate": 9.438042410895097e-06, + "loss": 1.3258, + "mean_token_accuracy": 0.6678551882505417, + "num_tokens": 1736314214.0, + "step": 10358 + }, + { + "entropy": 1.6967433889706929, + "epoch": 1.1379802806844086, + "grad_norm": 0.6643569469451904, + "learning_rate": 9.436465604071019e-06, + "loss": 1.3103, + "mean_token_accuracy": 0.6723993321259817, + "num_tokens": 1736448597.0, + "step": 10359 + }, + { + "entropy": 1.6814461847146351, + "epoch": 1.1380901375957815, + "grad_norm": 0.6767435073852539, + "learning_rate": 9.434888846732097e-06, + "loss": 1.4335, + "mean_token_accuracy": 0.6468447397152582, + "num_tokens": 1736668920.0, + "step": 10360 + }, + { + "entropy": 1.7169758081436157, + "epoch": 1.1381999945071544, + "grad_norm": 0.8308820128440857, + "learning_rate": 9.43331213892824e-06, + "loss": 1.3841, + "mean_token_accuracy": 0.6509679108858109, + "num_tokens": 1736800070.0, + "step": 10361 + }, + { + "entropy": 1.6866010129451752, + "epoch": 1.1383098514185273, + "grad_norm": 0.8533633351325989, + "learning_rate": 9.431735480709352e-06, + "loss": 1.4519, + "mean_token_accuracy": 0.6826412826776505, + "num_tokens": 1736939842.0, + "step": 10362 + }, + { + "entropy": 1.7029000719388325, + "epoch": 1.1384197083299004, + "grad_norm": 0.797741711139679, + "learning_rate": 9.430158872125324e-06, + "loss": 1.4602, + "mean_token_accuracy": 0.6410268098115921, + "num_tokens": 1737129340.0, + "step": 10363 + }, + { + "entropy": 1.7252192397912343, + "epoch": 1.1385295652412732, + "grad_norm": 0.7115998864173889, + "learning_rate": 9.42858231322606e-06, + "loss": 1.3813, + "mean_token_accuracy": 0.656056766708692, + "num_tokens": 1737301283.0, + "step": 10364 + }, + { + "entropy": 1.7449837823708851, + "epoch": 1.1386394221526461, + "grad_norm": 0.7513498663902283, + "learning_rate": 9.427005804061462e-06, + "loss": 1.532, + "mean_token_accuracy": 0.6449931561946869, + "num_tokens": 1737480963.0, + "step": 10365 + }, + { + "entropy": 1.7203444143136342, + "epoch": 1.1387492790640192, + "grad_norm": 0.7123376131057739, + "learning_rate": 9.425429344681415e-06, + "loss": 1.6309, + "mean_token_accuracy": 0.6379441867272059, + "num_tokens": 1737657860.0, + "step": 10366 + }, + { + "entropy": 1.6939558287461598, + "epoch": 1.1388591359753921, + "grad_norm": 0.6163555979728699, + "learning_rate": 9.423852935135824e-06, + "loss": 1.2301, + "mean_token_accuracy": 0.6741809546947479, + "num_tokens": 1737806597.0, + "step": 10367 + }, + { + "entropy": 1.614011029402415, + "epoch": 1.138968992886765, + "grad_norm": 0.7325506210327148, + "learning_rate": 9.42227657547458e-06, + "loss": 1.2167, + "mean_token_accuracy": 0.6826542516549429, + "num_tokens": 1737916809.0, + "step": 10368 + }, + { + "entropy": 1.682138333717982, + "epoch": 1.139078849798138, + "grad_norm": 0.5863914489746094, + "learning_rate": 9.420700265747566e-06, + "loss": 1.3563, + "mean_token_accuracy": 0.6610041856765747, + "num_tokens": 1738100933.0, + "step": 10369 + }, + { + "entropy": 1.6821511387825012, + "epoch": 1.1391887067095108, + "grad_norm": 0.6776061654090881, + "learning_rate": 9.419124006004681e-06, + "loss": 1.4566, + "mean_token_accuracy": 0.6502898782491684, + "num_tokens": 1738277417.0, + "step": 10370 + }, + { + "entropy": 1.6609342396259308, + "epoch": 1.1392985636208839, + "grad_norm": 0.617734432220459, + "learning_rate": 9.417547796295807e-06, + "loss": 1.4432, + "mean_token_accuracy": 0.6545563538869222, + "num_tokens": 1738457334.0, + "step": 10371 + }, + { + "entropy": 1.7074103355407715, + "epoch": 1.1394084205322568, + "grad_norm": 0.6897749900817871, + "learning_rate": 9.415971636670832e-06, + "loss": 1.3867, + "mean_token_accuracy": 0.6595341066519419, + "num_tokens": 1738604909.0, + "step": 10372 + }, + { + "entropy": 1.6968832810719807, + "epoch": 1.1395182774436297, + "grad_norm": 0.6201480031013489, + "learning_rate": 9.41439552717964e-06, + "loss": 1.4053, + "mean_token_accuracy": 0.6468455741802851, + "num_tokens": 1738777998.0, + "step": 10373 + }, + { + "entropy": 1.635752648115158, + "epoch": 1.1396281343550025, + "grad_norm": 0.5653038620948792, + "learning_rate": 9.412819467872119e-06, + "loss": 1.2893, + "mean_token_accuracy": 0.6629117280244827, + "num_tokens": 1738979425.0, + "step": 10374 + }, + { + "entropy": 1.6903326710065205, + "epoch": 1.1397379912663754, + "grad_norm": 0.7527031898498535, + "learning_rate": 9.411243458798144e-06, + "loss": 1.4398, + "mean_token_accuracy": 0.6569599111874899, + "num_tokens": 1739148251.0, + "step": 10375 + }, + { + "entropy": 1.7592523097991943, + "epoch": 1.1398478481777485, + "grad_norm": 0.6619188785552979, + "learning_rate": 9.409667500007595e-06, + "loss": 1.4471, + "mean_token_accuracy": 0.664801706870397, + "num_tokens": 1739340860.0, + "step": 10376 + }, + { + "entropy": 1.6681431134541829, + "epoch": 1.1399577050891214, + "grad_norm": 0.6245502829551697, + "learning_rate": 9.408091591550359e-06, + "loss": 1.4495, + "mean_token_accuracy": 0.6568524142106374, + "num_tokens": 1739547206.0, + "step": 10377 + }, + { + "entropy": 1.6391673783461254, + "epoch": 1.1400675620004943, + "grad_norm": 0.6753904223442078, + "learning_rate": 9.406515733476302e-06, + "loss": 1.4371, + "mean_token_accuracy": 0.6508794724941254, + "num_tokens": 1739790829.0, + "step": 10378 + }, + { + "entropy": 1.6744989454746246, + "epoch": 1.1401774189118674, + "grad_norm": 0.6560745239257812, + "learning_rate": 9.404939925835304e-06, + "loss": 1.3288, + "mean_token_accuracy": 0.6673153092463812, + "num_tokens": 1739931875.0, + "step": 10379 + }, + { + "entropy": 1.7254528601964314, + "epoch": 1.1402872758232403, + "grad_norm": 0.6491901278495789, + "learning_rate": 9.403364168677242e-06, + "loss": 1.3784, + "mean_token_accuracy": 0.6630249718825022, + "num_tokens": 1740098056.0, + "step": 10380 + }, + { + "entropy": 1.7695842186609905, + "epoch": 1.1403971327346132, + "grad_norm": 0.739123523235321, + "learning_rate": 9.401788462051981e-06, + "loss": 1.4259, + "mean_token_accuracy": 0.6524376769860586, + "num_tokens": 1740252560.0, + "step": 10381 + }, + { + "entropy": 1.6941548983256023, + "epoch": 1.140506989645986, + "grad_norm": 0.7095143795013428, + "learning_rate": 9.400212806009396e-06, + "loss": 1.4692, + "mean_token_accuracy": 0.6424577981233597, + "num_tokens": 1740446012.0, + "step": 10382 + }, + { + "entropy": 1.7164734701315563, + "epoch": 1.140616846557359, + "grad_norm": 0.7843037843704224, + "learning_rate": 9.398637200599357e-06, + "loss": 1.5502, + "mean_token_accuracy": 0.6457971682151159, + "num_tokens": 1740601766.0, + "step": 10383 + }, + { + "entropy": 1.7246180772781372, + "epoch": 1.140726703468732, + "grad_norm": 0.6460191607475281, + "learning_rate": 9.397061645871728e-06, + "loss": 1.4905, + "mean_token_accuracy": 0.6386567503213882, + "num_tokens": 1740768615.0, + "step": 10384 + }, + { + "entropy": 1.6967721978823345, + "epoch": 1.140836560380105, + "grad_norm": 0.6672912240028381, + "learning_rate": 9.395486141876374e-06, + "loss": 1.4701, + "mean_token_accuracy": 0.6470180948575338, + "num_tokens": 1740969706.0, + "step": 10385 + }, + { + "entropy": 1.6823700368404388, + "epoch": 1.1409464172914778, + "grad_norm": 0.7741503119468689, + "learning_rate": 9.393910688663164e-06, + "loss": 1.2931, + "mean_token_accuracy": 0.6733145167430242, + "num_tokens": 1741106299.0, + "step": 10386 + }, + { + "entropy": 1.7498717904090881, + "epoch": 1.1410562742028507, + "grad_norm": 0.7507087588310242, + "learning_rate": 9.392335286281953e-06, + "loss": 1.5033, + "mean_token_accuracy": 0.6538749684890112, + "num_tokens": 1741254646.0, + "step": 10387 + }, + { + "entropy": 1.6514920592308044, + "epoch": 1.1411661311142236, + "grad_norm": 0.5859827399253845, + "learning_rate": 9.390759934782607e-06, + "loss": 1.3249, + "mean_token_accuracy": 0.6833820442358652, + "num_tokens": 1741432870.0, + "step": 10388 + }, + { + "entropy": 1.684400051832199, + "epoch": 1.1412759880255967, + "grad_norm": 0.6740873456001282, + "learning_rate": 9.389184634214985e-06, + "loss": 1.3331, + "mean_token_accuracy": 0.6611009438832601, + "num_tokens": 1741574186.0, + "step": 10389 + }, + { + "entropy": 1.742189993460973, + "epoch": 1.1413858449369696, + "grad_norm": 0.7069135308265686, + "learning_rate": 9.387609384628945e-06, + "loss": 1.2605, + "mean_token_accuracy": 0.6775266925493876, + "num_tokens": 1741691346.0, + "step": 10390 + }, + { + "entropy": 1.735118528207143, + "epoch": 1.1414957018483425, + "grad_norm": 0.8108046054840088, + "learning_rate": 9.386034186074335e-06, + "loss": 1.3915, + "mean_token_accuracy": 0.66219495733579, + "num_tokens": 1741833271.0, + "step": 10391 + }, + { + "entropy": 1.7121002276738484, + "epoch": 1.1416055587597156, + "grad_norm": 0.7746871113777161, + "learning_rate": 9.384459038601024e-06, + "loss": 1.3267, + "mean_token_accuracy": 0.6555629670619965, + "num_tokens": 1741960943.0, + "step": 10392 + }, + { + "entropy": 1.7076607942581177, + "epoch": 1.1417154156710885, + "grad_norm": 0.6252023577690125, + "learning_rate": 9.382883942258849e-06, + "loss": 1.3896, + "mean_token_accuracy": 0.653433953722318, + "num_tokens": 1742171872.0, + "step": 10393 + }, + { + "entropy": 1.6984122693538666, + "epoch": 1.1418252725824614, + "grad_norm": 0.7633947134017944, + "learning_rate": 9.381308897097671e-06, + "loss": 1.4597, + "mean_token_accuracy": 0.6523663302262624, + "num_tokens": 1742335760.0, + "step": 10394 + }, + { + "entropy": 1.682322899500529, + "epoch": 1.1419351294938342, + "grad_norm": 0.6641841530799866, + "learning_rate": 9.37973390316734e-06, + "loss": 1.4247, + "mean_token_accuracy": 0.6529939075311025, + "num_tokens": 1742492989.0, + "step": 10395 + }, + { + "entropy": 1.7189187506834667, + "epoch": 1.1420449864052071, + "grad_norm": 0.6823170185089111, + "learning_rate": 9.378158960517701e-06, + "loss": 1.345, + "mean_token_accuracy": 0.6566206763188044, + "num_tokens": 1742644794.0, + "step": 10396 + }, + { + "entropy": 1.629441926876704, + "epoch": 1.1421548433165802, + "grad_norm": 0.654874861240387, + "learning_rate": 9.376584069198593e-06, + "loss": 1.3227, + "mean_token_accuracy": 0.6684759259223938, + "num_tokens": 1742817557.0, + "step": 10397 + }, + { + "entropy": 1.6485731303691864, + "epoch": 1.1422647002279531, + "grad_norm": 0.6740143895149231, + "learning_rate": 9.375009229259878e-06, + "loss": 1.4679, + "mean_token_accuracy": 0.6339434087276459, + "num_tokens": 1743044080.0, + "step": 10398 + }, + { + "entropy": 1.6696706712245941, + "epoch": 1.142374557139326, + "grad_norm": 0.7383650541305542, + "learning_rate": 9.37343444075138e-06, + "loss": 1.3688, + "mean_token_accuracy": 0.6621562987565994, + "num_tokens": 1743274614.0, + "step": 10399 + }, + { + "entropy": 1.6913128296534221, + "epoch": 1.142484414050699, + "grad_norm": 0.5945432782173157, + "learning_rate": 9.371859703722952e-06, + "loss": 1.3708, + "mean_token_accuracy": 0.6589928964773814, + "num_tokens": 1743456863.0, + "step": 10400 + }, + { + "entropy": 1.7033980588118236, + "epoch": 1.1425942709620718, + "grad_norm": 0.6133428812026978, + "learning_rate": 9.370285018224432e-06, + "loss": 1.2997, + "mean_token_accuracy": 0.6748911092678705, + "num_tokens": 1743617303.0, + "step": 10401 + }, + { + "entropy": 1.6469106773535411, + "epoch": 1.1427041278734449, + "grad_norm": 0.7003388404846191, + "learning_rate": 9.368710384305656e-06, + "loss": 1.2912, + "mean_token_accuracy": 0.6673061301310858, + "num_tokens": 1743751670.0, + "step": 10402 + }, + { + "entropy": 1.6901063521703084, + "epoch": 1.1428139847848178, + "grad_norm": 0.663631796836853, + "learning_rate": 9.367135802016463e-06, + "loss": 1.4043, + "mean_token_accuracy": 0.6495856940746307, + "num_tokens": 1743894039.0, + "step": 10403 + }, + { + "entropy": 1.7655975222587585, + "epoch": 1.1429238416961907, + "grad_norm": 0.8853231072425842, + "learning_rate": 9.365561271406684e-06, + "loss": 1.526, + "mean_token_accuracy": 0.628182902932167, + "num_tokens": 1744066506.0, + "step": 10404 + }, + { + "entropy": 1.6902997593084972, + "epoch": 1.1430336986075638, + "grad_norm": 0.8013792634010315, + "learning_rate": 9.363986792526152e-06, + "loss": 1.4213, + "mean_token_accuracy": 0.6655001491308212, + "num_tokens": 1744211659.0, + "step": 10405 + }, + { + "entropy": 1.6334332625071208, + "epoch": 1.1431435555189366, + "grad_norm": 0.6941357254981995, + "learning_rate": 9.362412365424704e-06, + "loss": 1.2846, + "mean_token_accuracy": 0.6686849494775137, + "num_tokens": 1744364432.0, + "step": 10406 + }, + { + "entropy": 1.6745346983273823, + "epoch": 1.1432534124303095, + "grad_norm": 0.7382486462593079, + "learning_rate": 9.360837990152167e-06, + "loss": 1.3855, + "mean_token_accuracy": 0.6577971825997034, + "num_tokens": 1744536714.0, + "step": 10407 + }, + { + "entropy": 1.6726201673348744, + "epoch": 1.1433632693416824, + "grad_norm": 0.7212955951690674, + "learning_rate": 9.359263666758367e-06, + "loss": 1.3708, + "mean_token_accuracy": 0.6609119226535162, + "num_tokens": 1744731550.0, + "step": 10408 + }, + { + "entropy": 1.690768967072169, + "epoch": 1.1434731262530553, + "grad_norm": 0.7584574818611145, + "learning_rate": 9.357689395293134e-06, + "loss": 1.3912, + "mean_token_accuracy": 0.6541225661834081, + "num_tokens": 1744924719.0, + "step": 10409 + }, + { + "entropy": 1.6714553038279216, + "epoch": 1.1435829831644284, + "grad_norm": 0.7702672481536865, + "learning_rate": 9.356115175806292e-06, + "loss": 1.3676, + "mean_token_accuracy": 0.6619028945763906, + "num_tokens": 1745084791.0, + "step": 10410 + }, + { + "entropy": 1.7582578659057617, + "epoch": 1.1436928400758013, + "grad_norm": 0.6678996086120605, + "learning_rate": 9.354541008347661e-06, + "loss": 1.3649, + "mean_token_accuracy": 0.6453887671232224, + "num_tokens": 1745250672.0, + "step": 10411 + }, + { + "entropy": 1.6818044086297352, + "epoch": 1.1438026969871742, + "grad_norm": 0.7201360464096069, + "learning_rate": 9.352966892967072e-06, + "loss": 1.478, + "mean_token_accuracy": 0.6579025636116663, + "num_tokens": 1745445321.0, + "step": 10412 + }, + { + "entropy": 1.7616774141788483, + "epoch": 1.143912553898547, + "grad_norm": 0.6419490575790405, + "learning_rate": 9.351392829714332e-06, + "loss": 1.5105, + "mean_token_accuracy": 0.636634940902392, + "num_tokens": 1745691802.0, + "step": 10413 + }, + { + "entropy": 1.7232487003008525, + "epoch": 1.1440224108099202, + "grad_norm": 0.6756424903869629, + "learning_rate": 9.349818818639267e-06, + "loss": 1.5376, + "mean_token_accuracy": 0.661085287729899, + "num_tokens": 1745864777.0, + "step": 10414 + }, + { + "entropy": 1.7252596020698547, + "epoch": 1.144132267721293, + "grad_norm": 0.6202812790870667, + "learning_rate": 9.348244859791698e-06, + "loss": 1.3156, + "mean_token_accuracy": 0.664531409740448, + "num_tokens": 1745989005.0, + "step": 10415 + }, + { + "entropy": 1.6829163233439128, + "epoch": 1.144242124632666, + "grad_norm": 0.7266920208930969, + "learning_rate": 9.346670953221429e-06, + "loss": 1.4073, + "mean_token_accuracy": 0.6535108834505081, + "num_tokens": 1746165103.0, + "step": 10416 + }, + { + "entropy": 1.7211932837963104, + "epoch": 1.1443519815440388, + "grad_norm": 0.6822185516357422, + "learning_rate": 9.34509709897828e-06, + "loss": 1.5674, + "mean_token_accuracy": 0.6551593492428461, + "num_tokens": 1746403549.0, + "step": 10417 + }, + { + "entropy": 1.6584857602914174, + "epoch": 1.144461838455412, + "grad_norm": 0.6132012605667114, + "learning_rate": 9.343523297112066e-06, + "loss": 1.3687, + "mean_token_accuracy": 0.6635250995556513, + "num_tokens": 1746571701.0, + "step": 10418 + }, + { + "entropy": 1.6848807831605275, + "epoch": 1.1445716953667848, + "grad_norm": 0.6472894549369812, + "learning_rate": 9.341949547672588e-06, + "loss": 1.335, + "mean_token_accuracy": 0.6618951757748922, + "num_tokens": 1746734240.0, + "step": 10419 + }, + { + "entropy": 1.718974103530248, + "epoch": 1.1446815522781577, + "grad_norm": 0.736495852470398, + "learning_rate": 9.340375850709663e-06, + "loss": 1.4658, + "mean_token_accuracy": 0.6561418076356252, + "num_tokens": 1746909958.0, + "step": 10420 + }, + { + "entropy": 1.7155489722887676, + "epoch": 1.1447914091895306, + "grad_norm": 0.6326528191566467, + "learning_rate": 9.338802206273097e-06, + "loss": 1.4693, + "mean_token_accuracy": 0.6499947756528854, + "num_tokens": 1747163992.0, + "step": 10421 + }, + { + "entropy": 1.6991062760353088, + "epoch": 1.1449012661009035, + "grad_norm": 0.6131667494773865, + "learning_rate": 9.337228614412688e-06, + "loss": 1.351, + "mean_token_accuracy": 0.6598118593295416, + "num_tokens": 1747305956.0, + "step": 10422 + }, + { + "entropy": 1.7058619757493336, + "epoch": 1.1450111230122766, + "grad_norm": 0.7168628573417664, + "learning_rate": 9.335655075178243e-06, + "loss": 1.3225, + "mean_token_accuracy": 0.6609494437774023, + "num_tokens": 1747424468.0, + "step": 10423 + }, + { + "entropy": 1.6946961383024852, + "epoch": 1.1451209799236495, + "grad_norm": 0.6358611583709717, + "learning_rate": 9.33408158861957e-06, + "loss": 1.3029, + "mean_token_accuracy": 0.6581896990537643, + "num_tokens": 1747556121.0, + "step": 10424 + }, + { + "entropy": 1.782209446032842, + "epoch": 1.1452308368350224, + "grad_norm": 0.7476517558097839, + "learning_rate": 9.33250815478646e-06, + "loss": 1.3775, + "mean_token_accuracy": 0.6569018463293711, + "num_tokens": 1747682025.0, + "step": 10425 + }, + { + "entropy": 1.6908418933550518, + "epoch": 1.1453406937463952, + "grad_norm": 0.7563366293907166, + "learning_rate": 9.330934773728717e-06, + "loss": 1.3391, + "mean_token_accuracy": 0.6665392766396204, + "num_tokens": 1747809976.0, + "step": 10426 + }, + { + "entropy": 1.6805396974086761, + "epoch": 1.1454505506577684, + "grad_norm": 0.5949506163597107, + "learning_rate": 9.32936144549614e-06, + "loss": 1.4936, + "mean_token_accuracy": 0.6355966081221899, + "num_tokens": 1747996141.0, + "step": 10427 + }, + { + "entropy": 1.679563969373703, + "epoch": 1.1455604075691412, + "grad_norm": 0.6496044397354126, + "learning_rate": 9.327788170138514e-06, + "loss": 1.3572, + "mean_token_accuracy": 0.6697363605101904, + "num_tokens": 1748139185.0, + "step": 10428 + }, + { + "entropy": 1.6627104580402374, + "epoch": 1.1456702644805141, + "grad_norm": 0.623802661895752, + "learning_rate": 9.326214947705641e-06, + "loss": 1.3569, + "mean_token_accuracy": 0.6598296562830607, + "num_tokens": 1748306822.0, + "step": 10429 + }, + { + "entropy": 1.713492641846339, + "epoch": 1.145780121391887, + "grad_norm": 0.7390007972717285, + "learning_rate": 9.324641778247313e-06, + "loss": 1.4243, + "mean_token_accuracy": 0.6549296230077744, + "num_tokens": 1748528987.0, + "step": 10430 + }, + { + "entropy": 1.6868635416030884, + "epoch": 1.1458899783032601, + "grad_norm": 0.6720066666603088, + "learning_rate": 9.323068661813315e-06, + "loss": 1.3167, + "mean_token_accuracy": 0.6549607117970785, + "num_tokens": 1748712876.0, + "step": 10431 + }, + { + "entropy": 1.6507586737473805, + "epoch": 1.145999835214633, + "grad_norm": 0.6831554174423218, + "learning_rate": 9.321495598453438e-06, + "loss": 1.2928, + "mean_token_accuracy": 0.6722377041975657, + "num_tokens": 1748849425.0, + "step": 10432 + }, + { + "entropy": 1.6795489092667897, + "epoch": 1.1461096921260059, + "grad_norm": 0.6272848844528198, + "learning_rate": 9.319922588217472e-06, + "loss": 1.4739, + "mean_token_accuracy": 0.6550329575935999, + "num_tokens": 1749033503.0, + "step": 10433 + }, + { + "entropy": 1.6624679764111836, + "epoch": 1.1462195490373788, + "grad_norm": 0.7027580738067627, + "learning_rate": 9.318349631155197e-06, + "loss": 1.3611, + "mean_token_accuracy": 0.6647091160217921, + "num_tokens": 1749269891.0, + "step": 10434 + }, + { + "entropy": 1.6587688227494557, + "epoch": 1.1463294059487517, + "grad_norm": 0.695829451084137, + "learning_rate": 9.316776727316397e-06, + "loss": 1.5764, + "mean_token_accuracy": 0.6489768524964651, + "num_tokens": 1749462840.0, + "step": 10435 + }, + { + "entropy": 1.72525155544281, + "epoch": 1.1464392628601248, + "grad_norm": 0.6153085231781006, + "learning_rate": 9.31520387675086e-06, + "loss": 1.4573, + "mean_token_accuracy": 0.6477632522583008, + "num_tokens": 1749623332.0, + "step": 10436 + }, + { + "entropy": 1.6733653446038563, + "epoch": 1.1465491197714976, + "grad_norm": 0.6889209747314453, + "learning_rate": 9.313631079508357e-06, + "loss": 1.2139, + "mean_token_accuracy": 0.6837521890799204, + "num_tokens": 1749756254.0, + "step": 10437 + }, + { + "entropy": 1.6137581169605255, + "epoch": 1.1466589766828705, + "grad_norm": 0.6261329054832458, + "learning_rate": 9.312058335638669e-06, + "loss": 1.2555, + "mean_token_accuracy": 0.6895642032225927, + "num_tokens": 1749906250.0, + "step": 10438 + }, + { + "entropy": 1.6432836850484211, + "epoch": 1.1467688335942434, + "grad_norm": 0.6834116578102112, + "learning_rate": 9.31048564519158e-06, + "loss": 1.3124, + "mean_token_accuracy": 0.66798102358977, + "num_tokens": 1750054646.0, + "step": 10439 + }, + { + "entropy": 1.6783512830734253, + "epoch": 1.1468786905056165, + "grad_norm": 0.6411421895027161, + "learning_rate": 9.308913008216855e-06, + "loss": 1.3029, + "mean_token_accuracy": 0.6628169417381287, + "num_tokens": 1750191191.0, + "step": 10440 + }, + { + "entropy": 1.7019068499406178, + "epoch": 1.1469885474169894, + "grad_norm": 0.8326993584632874, + "learning_rate": 9.30734042476427e-06, + "loss": 1.4707, + "mean_token_accuracy": 0.6603338221708933, + "num_tokens": 1750340860.0, + "step": 10441 + }, + { + "entropy": 1.7151079376538594, + "epoch": 1.1470984043283623, + "grad_norm": 0.8003994822502136, + "learning_rate": 9.305767894883602e-06, + "loss": 1.4788, + "mean_token_accuracy": 0.6640694737434387, + "num_tokens": 1750456588.0, + "step": 10442 + }, + { + "entropy": 1.743663897116979, + "epoch": 1.1472082612397352, + "grad_norm": 0.6902558207511902, + "learning_rate": 9.304195418624614e-06, + "loss": 1.4213, + "mean_token_accuracy": 0.6569770723581314, + "num_tokens": 1750652408.0, + "step": 10443 + }, + { + "entropy": 1.6893725295861561, + "epoch": 1.1473181181511083, + "grad_norm": 0.650435745716095, + "learning_rate": 9.302622996037074e-06, + "loss": 1.3089, + "mean_token_accuracy": 0.6710364570220312, + "num_tokens": 1750803978.0, + "step": 10444 + }, + { + "entropy": 1.7381121218204498, + "epoch": 1.1474279750624812, + "grad_norm": 0.6412340402603149, + "learning_rate": 9.301050627170758e-06, + "loss": 1.4465, + "mean_token_accuracy": 0.653094212214152, + "num_tokens": 1750997989.0, + "step": 10445 + }, + { + "entropy": 1.7343399027983348, + "epoch": 1.147537831973854, + "grad_norm": 0.6787511706352234, + "learning_rate": 9.299478312075421e-06, + "loss": 1.4958, + "mean_token_accuracy": 0.6529847681522369, + "num_tokens": 1751158800.0, + "step": 10446 + }, + { + "entropy": 1.6631225248177846, + "epoch": 1.147647688885227, + "grad_norm": 0.6875215768814087, + "learning_rate": 9.297906050800824e-06, + "loss": 1.1925, + "mean_token_accuracy": 0.6849933316310247, + "num_tokens": 1751279105.0, + "step": 10447 + }, + { + "entropy": 1.7318195203940074, + "epoch": 1.1477575457965998, + "grad_norm": 0.8004332780838013, + "learning_rate": 9.296333843396743e-06, + "loss": 1.4193, + "mean_token_accuracy": 0.6542117198308309, + "num_tokens": 1751478243.0, + "step": 10448 + }, + { + "entropy": 1.7312237322330475, + "epoch": 1.147867402707973, + "grad_norm": 0.6288403868675232, + "learning_rate": 9.294761689912921e-06, + "loss": 1.3694, + "mean_token_accuracy": 0.6531921078761419, + "num_tokens": 1751639847.0, + "step": 10449 + }, + { + "entropy": 1.665016194184621, + "epoch": 1.1479772596193458, + "grad_norm": 0.7054689526557922, + "learning_rate": 9.293189590399126e-06, + "loss": 1.5536, + "mean_token_accuracy": 0.6347174296776453, + "num_tokens": 1751821535.0, + "step": 10450 + }, + { + "entropy": 1.6455471416314442, + "epoch": 1.1480871165307187, + "grad_norm": 0.6462990641593933, + "learning_rate": 9.291617544905112e-06, + "loss": 1.2751, + "mean_token_accuracy": 0.6752283871173859, + "num_tokens": 1751950364.0, + "step": 10451 + }, + { + "entropy": 1.6657202740510304, + "epoch": 1.1481969734420916, + "grad_norm": 0.6472091674804688, + "learning_rate": 9.29004555348063e-06, + "loss": 1.4169, + "mean_token_accuracy": 0.6555340985457102, + "num_tokens": 1752107814.0, + "step": 10452 + }, + { + "entropy": 1.7669294873873393, + "epoch": 1.1483068303534647, + "grad_norm": 0.7394276857376099, + "learning_rate": 9.288473616175438e-06, + "loss": 1.3615, + "mean_token_accuracy": 0.6475641032059988, + "num_tokens": 1752250559.0, + "step": 10453 + }, + { + "entropy": 1.6974846025307972, + "epoch": 1.1484166872648376, + "grad_norm": 0.7621778845787048, + "learning_rate": 9.286901733039286e-06, + "loss": 1.4146, + "mean_token_accuracy": 0.6701687673727671, + "num_tokens": 1752411368.0, + "step": 10454 + }, + { + "entropy": 1.7656433582305908, + "epoch": 1.1485265441762105, + "grad_norm": 0.7843154072761536, + "learning_rate": 9.285329904121918e-06, + "loss": 1.3917, + "mean_token_accuracy": 0.658236563205719, + "num_tokens": 1752544724.0, + "step": 10455 + }, + { + "entropy": 1.7151671946048737, + "epoch": 1.1486364010875834, + "grad_norm": 0.6612775325775146, + "learning_rate": 9.283758129473088e-06, + "loss": 1.3915, + "mean_token_accuracy": 0.6535660674174627, + "num_tokens": 1752679479.0, + "step": 10456 + }, + { + "entropy": 1.7036021947860718, + "epoch": 1.1487462579989565, + "grad_norm": 0.6847598552703857, + "learning_rate": 9.282186409142542e-06, + "loss": 1.3911, + "mean_token_accuracy": 0.6572980483373007, + "num_tokens": 1752868566.0, + "step": 10457 + }, + { + "entropy": 1.7341304918130238, + "epoch": 1.1488561149103294, + "grad_norm": 0.66192227602005, + "learning_rate": 9.280614743180019e-06, + "loss": 1.4441, + "mean_token_accuracy": 0.6463885257641474, + "num_tokens": 1753043553.0, + "step": 10458 + }, + { + "entropy": 1.7356309394041698, + "epoch": 1.1489659718217022, + "grad_norm": 0.7760790586471558, + "learning_rate": 9.279043131635266e-06, + "loss": 1.483, + "mean_token_accuracy": 0.6427653779586157, + "num_tokens": 1753185343.0, + "step": 10459 + }, + { + "entropy": 1.7046063840389252, + "epoch": 1.1490758287330751, + "grad_norm": 0.6848695874214172, + "learning_rate": 9.277471574558023e-06, + "loss": 1.4344, + "mean_token_accuracy": 0.6615995417038599, + "num_tokens": 1753355296.0, + "step": 10460 + }, + { + "entropy": 1.7546161313851674, + "epoch": 1.149185685644448, + "grad_norm": 0.6744615435600281, + "learning_rate": 9.275900071998028e-06, + "loss": 1.3556, + "mean_token_accuracy": 0.6578517059485117, + "num_tokens": 1753553622.0, + "step": 10461 + }, + { + "entropy": 1.7109164694945018, + "epoch": 1.1492955425558211, + "grad_norm": 0.6636914014816284, + "learning_rate": 9.274328624005019e-06, + "loss": 1.3815, + "mean_token_accuracy": 0.6661298722028732, + "num_tokens": 1753736512.0, + "step": 10462 + }, + { + "entropy": 1.6936496595541637, + "epoch": 1.149405399467194, + "grad_norm": 0.7392176389694214, + "learning_rate": 9.272757230628731e-06, + "loss": 1.5186, + "mean_token_accuracy": 0.6480444173018137, + "num_tokens": 1753937725.0, + "step": 10463 + }, + { + "entropy": 1.7084623177846272, + "epoch": 1.1495152563785669, + "grad_norm": 0.7401105165481567, + "learning_rate": 9.271185891918896e-06, + "loss": 1.3537, + "mean_token_accuracy": 0.6713838477929434, + "num_tokens": 1754103003.0, + "step": 10464 + }, + { + "entropy": 1.686184932788213, + "epoch": 1.1496251132899398, + "grad_norm": 0.6153541803359985, + "learning_rate": 9.269614607925255e-06, + "loss": 1.5945, + "mean_token_accuracy": 0.6229482889175415, + "num_tokens": 1754331188.0, + "step": 10465 + }, + { + "entropy": 1.7238997519016266, + "epoch": 1.1497349702013129, + "grad_norm": 0.7695441246032715, + "learning_rate": 9.268043378697527e-06, + "loss": 1.4725, + "mean_token_accuracy": 0.6536758492390314, + "num_tokens": 1754507673.0, + "step": 10466 + }, + { + "entropy": 1.710547149181366, + "epoch": 1.1498448271126858, + "grad_norm": 0.6528117060661316, + "learning_rate": 9.266472204285443e-06, + "loss": 1.3943, + "mean_token_accuracy": 0.6460304210583369, + "num_tokens": 1754693959.0, + "step": 10467 + }, + { + "entropy": 1.6860848863919575, + "epoch": 1.1499546840240586, + "grad_norm": 0.6279901266098022, + "learning_rate": 9.264901084738737e-06, + "loss": 1.3695, + "mean_token_accuracy": 0.6599749426047007, + "num_tokens": 1754862001.0, + "step": 10468 + }, + { + "entropy": 1.7022302746772766, + "epoch": 1.1500645409354315, + "grad_norm": 0.6494450569152832, + "learning_rate": 9.263330020107131e-06, + "loss": 1.3224, + "mean_token_accuracy": 0.6655841370423635, + "num_tokens": 1755021180.0, + "step": 10469 + }, + { + "entropy": 1.7287100454171498, + "epoch": 1.1501743978468046, + "grad_norm": 0.8179412484169006, + "learning_rate": 9.261759010440343e-06, + "loss": 1.4208, + "mean_token_accuracy": 0.6409466514984766, + "num_tokens": 1755199159.0, + "step": 10470 + }, + { + "entropy": 1.6620949506759644, + "epoch": 1.1502842547581775, + "grad_norm": 0.7252711057662964, + "learning_rate": 9.260188055788104e-06, + "loss": 1.3515, + "mean_token_accuracy": 0.6578169663747152, + "num_tokens": 1755368309.0, + "step": 10471 + }, + { + "entropy": 1.6833390891551971, + "epoch": 1.1503941116695504, + "grad_norm": 0.6849291324615479, + "learning_rate": 9.258617156200127e-06, + "loss": 1.3125, + "mean_token_accuracy": 0.6736855655908585, + "num_tokens": 1755533771.0, + "step": 10472 + }, + { + "entropy": 1.7997891108194988, + "epoch": 1.1505039685809233, + "grad_norm": 0.7142224907875061, + "learning_rate": 9.257046311726128e-06, + "loss": 1.4591, + "mean_token_accuracy": 0.6437089890241623, + "num_tokens": 1755708002.0, + "step": 10473 + }, + { + "entropy": 1.7047271529833476, + "epoch": 1.1506138254922962, + "grad_norm": 0.700008749961853, + "learning_rate": 9.255475522415834e-06, + "loss": 1.4242, + "mean_token_accuracy": 0.6596666872501373, + "num_tokens": 1755849614.0, + "step": 10474 + }, + { + "entropy": 1.7262776792049408, + "epoch": 1.1507236824036693, + "grad_norm": 0.6418355703353882, + "learning_rate": 9.25390478831895e-06, + "loss": 1.4052, + "mean_token_accuracy": 0.6510264078776041, + "num_tokens": 1756022118.0, + "step": 10475 + }, + { + "entropy": 1.7786755760510762, + "epoch": 1.1508335393150422, + "grad_norm": 0.9499866962432861, + "learning_rate": 9.252334109485193e-06, + "loss": 1.6351, + "mean_token_accuracy": 0.632032627860705, + "num_tokens": 1756176047.0, + "step": 10476 + }, + { + "entropy": 1.7582121590773265, + "epoch": 1.150943396226415, + "grad_norm": 2.6958770751953125, + "learning_rate": 9.250763485964276e-06, + "loss": 1.1807, + "mean_token_accuracy": 0.6710349669059118, + "num_tokens": 1756372478.0, + "step": 10477 + }, + { + "entropy": 1.7144115070501964, + "epoch": 1.151053253137788, + "grad_norm": 0.6320227384567261, + "learning_rate": 9.249192917805905e-06, + "loss": 1.3936, + "mean_token_accuracy": 0.6546385983626047, + "num_tokens": 1756525920.0, + "step": 10478 + }, + { + "entropy": 1.749243050813675, + "epoch": 1.151163110049161, + "grad_norm": 0.7566484212875366, + "learning_rate": 9.247622405059786e-06, + "loss": 1.414, + "mean_token_accuracy": 0.6681007444858551, + "num_tokens": 1756682079.0, + "step": 10479 + }, + { + "entropy": 1.7483246127764385, + "epoch": 1.151272966960534, + "grad_norm": 0.7398757934570312, + "learning_rate": 9.246051947775635e-06, + "loss": 1.4072, + "mean_token_accuracy": 0.6565983096758524, + "num_tokens": 1756881456.0, + "step": 10480 + }, + { + "entropy": 1.7031634449958801, + "epoch": 1.1513828238719068, + "grad_norm": 0.6389073133468628, + "learning_rate": 9.244481546003146e-06, + "loss": 1.3583, + "mean_token_accuracy": 0.660191277662913, + "num_tokens": 1757043583.0, + "step": 10481 + }, + { + "entropy": 1.6907376945018768, + "epoch": 1.1514926807832797, + "grad_norm": 0.6452929377555847, + "learning_rate": 9.242911199792024e-06, + "loss": 1.3741, + "mean_token_accuracy": 0.6537407586971918, + "num_tokens": 1757198541.0, + "step": 10482 + }, + { + "entropy": 1.7258997162183125, + "epoch": 1.1516025376946528, + "grad_norm": 0.6563553810119629, + "learning_rate": 9.24134090919197e-06, + "loss": 1.3968, + "mean_token_accuracy": 0.6483776172002157, + "num_tokens": 1757403114.0, + "step": 10483 + }, + { + "entropy": 1.5953759948412578, + "epoch": 1.1517123946060257, + "grad_norm": 1.8490371704101562, + "learning_rate": 9.239770674252689e-06, + "loss": 1.0496, + "mean_token_accuracy": 0.7037697086731592, + "num_tokens": 1757563792.0, + "step": 10484 + }, + { + "entropy": 1.6668463846047719, + "epoch": 1.1518222515173986, + "grad_norm": 0.696306049823761, + "learning_rate": 9.238200495023867e-06, + "loss": 1.4716, + "mean_token_accuracy": 0.6465398073196411, + "num_tokens": 1757728133.0, + "step": 10485 + }, + { + "entropy": 1.702815721432368, + "epoch": 1.1519321084287715, + "grad_norm": 0.7206531167030334, + "learning_rate": 9.236630371555208e-06, + "loss": 1.354, + "mean_token_accuracy": 0.669020434220632, + "num_tokens": 1757887891.0, + "step": 10486 + }, + { + "entropy": 1.6901472806930542, + "epoch": 1.1520419653401444, + "grad_norm": 0.6455077528953552, + "learning_rate": 9.235060303896404e-06, + "loss": 1.4898, + "mean_token_accuracy": 0.6457482799887657, + "num_tokens": 1758055648.0, + "step": 10487 + }, + { + "entropy": 1.6960370043913524, + "epoch": 1.1521518222515175, + "grad_norm": 0.7689752578735352, + "learning_rate": 9.233490292097143e-06, + "loss": 1.475, + "mean_token_accuracy": 0.6641001453002294, + "num_tokens": 1758224944.0, + "step": 10488 + }, + { + "entropy": 1.6482413212458293, + "epoch": 1.1522616791628904, + "grad_norm": 0.6643248796463013, + "learning_rate": 9.231920336207123e-06, + "loss": 1.3675, + "mean_token_accuracy": 0.6523696879545847, + "num_tokens": 1758393832.0, + "step": 10489 + }, + { + "entropy": 1.752677987019221, + "epoch": 1.1523715360742632, + "grad_norm": 0.684615969657898, + "learning_rate": 9.230350436276026e-06, + "loss": 1.3543, + "mean_token_accuracy": 0.663479283452034, + "num_tokens": 1758530887.0, + "step": 10490 + }, + { + "entropy": 1.6707546810309093, + "epoch": 1.1524813929856361, + "grad_norm": 0.5739973783493042, + "learning_rate": 9.228780592353538e-06, + "loss": 1.3706, + "mean_token_accuracy": 0.6499018023411433, + "num_tokens": 1758710774.0, + "step": 10491 + }, + { + "entropy": 1.7167290846506755, + "epoch": 1.1525912498970092, + "grad_norm": 0.6053609848022461, + "learning_rate": 9.227210804489348e-06, + "loss": 1.4804, + "mean_token_accuracy": 0.6375697354475657, + "num_tokens": 1758891290.0, + "step": 10492 + }, + { + "entropy": 1.7572990953922272, + "epoch": 1.1527011068083821, + "grad_norm": 0.8494213819503784, + "learning_rate": 9.225641072733136e-06, + "loss": 1.6531, + "mean_token_accuracy": 0.6410497824350992, + "num_tokens": 1759083232.0, + "step": 10493 + }, + { + "entropy": 1.7099438905715942, + "epoch": 1.152810963719755, + "grad_norm": 0.6985329389572144, + "learning_rate": 9.224071397134585e-06, + "loss": 1.548, + "mean_token_accuracy": 0.6346048961083094, + "num_tokens": 1759304006.0, + "step": 10494 + }, + { + "entropy": 1.7394586006800334, + "epoch": 1.1529208206311279, + "grad_norm": 0.5961000919342041, + "learning_rate": 9.222501777743375e-06, + "loss": 1.2539, + "mean_token_accuracy": 0.6759810944398245, + "num_tokens": 1759443975.0, + "step": 10495 + }, + { + "entropy": 1.6950910985469818, + "epoch": 1.153030677542501, + "grad_norm": 0.6235581636428833, + "learning_rate": 9.220932214609181e-06, + "loss": 1.2665, + "mean_token_accuracy": 0.6792470415433248, + "num_tokens": 1759580860.0, + "step": 10496 + }, + { + "entropy": 1.7201534907023113, + "epoch": 1.1531405344538739, + "grad_norm": 0.6948989629745483, + "learning_rate": 9.21936270778168e-06, + "loss": 1.3425, + "mean_token_accuracy": 0.6627224882443746, + "num_tokens": 1759735334.0, + "step": 10497 + }, + { + "entropy": 1.655206690231959, + "epoch": 1.1532503913652468, + "grad_norm": 0.5705309510231018, + "learning_rate": 9.217793257310552e-06, + "loss": 1.3007, + "mean_token_accuracy": 0.676471064488093, + "num_tokens": 1759958339.0, + "step": 10498 + }, + { + "entropy": 1.708439866701762, + "epoch": 1.1533602482766196, + "grad_norm": 0.7828124761581421, + "learning_rate": 9.216223863245459e-06, + "loss": 1.5709, + "mean_token_accuracy": 0.6466249401370684, + "num_tokens": 1760152326.0, + "step": 10499 + }, + { + "entropy": 1.7445420026779175, + "epoch": 1.1534701051879925, + "grad_norm": 0.7042776346206665, + "learning_rate": 9.214654525636078e-06, + "loss": 1.3096, + "mean_token_accuracy": 0.6746334433555603, + "num_tokens": 1760290174.0, + "step": 10500 + }, + { + "entropy": 1.681450366973877, + "epoch": 1.1535799620993656, + "grad_norm": 0.6809564828872681, + "learning_rate": 9.21308524453208e-06, + "loss": 1.2993, + "mean_token_accuracy": 0.6653634657462438, + "num_tokens": 1760441480.0, + "step": 10501 + }, + { + "entropy": 1.7187366684277852, + "epoch": 1.1536898190107385, + "grad_norm": 0.57969069480896, + "learning_rate": 9.211516019983127e-06, + "loss": 1.3651, + "mean_token_accuracy": 0.6566129624843597, + "num_tokens": 1760598812.0, + "step": 10502 + }, + { + "entropy": 1.7298386891682942, + "epoch": 1.1537996759221114, + "grad_norm": 0.7230368256568909, + "learning_rate": 9.209946852038882e-06, + "loss": 1.5234, + "mean_token_accuracy": 0.6430017203092575, + "num_tokens": 1760754936.0, + "step": 10503 + }, + { + "entropy": 1.7077820599079132, + "epoch": 1.1539095328334843, + "grad_norm": 0.7134849429130554, + "learning_rate": 9.20837774074902e-06, + "loss": 1.3521, + "mean_token_accuracy": 0.6613487799962362, + "num_tokens": 1760918109.0, + "step": 10504 + }, + { + "entropy": 1.6505942145983379, + "epoch": 1.1540193897448574, + "grad_norm": 0.649359405040741, + "learning_rate": 9.20680868616319e-06, + "loss": 1.5086, + "mean_token_accuracy": 0.6458548208077749, + "num_tokens": 1761083688.0, + "step": 10505 + }, + { + "entropy": 1.636115938425064, + "epoch": 1.1541292466562303, + "grad_norm": 0.7054543495178223, + "learning_rate": 9.205239688331056e-06, + "loss": 1.415, + "mean_token_accuracy": 0.6505183627208074, + "num_tokens": 1761280525.0, + "step": 10506 + }, + { + "entropy": 1.7434692879517872, + "epoch": 1.1542391035676032, + "grad_norm": 0.7469452619552612, + "learning_rate": 9.203670747302283e-06, + "loss": 1.3334, + "mean_token_accuracy": 0.6721020837624868, + "num_tokens": 1761449006.0, + "step": 10507 + }, + { + "entropy": 1.71237579981486, + "epoch": 1.154348960478976, + "grad_norm": 0.799959659576416, + "learning_rate": 9.202101863126516e-06, + "loss": 1.5013, + "mean_token_accuracy": 0.6525566975275675, + "num_tokens": 1761612171.0, + "step": 10508 + }, + { + "entropy": 1.7190298636754353, + "epoch": 1.1544588173903492, + "grad_norm": 0.7386515140533447, + "learning_rate": 9.200533035853414e-06, + "loss": 1.3877, + "mean_token_accuracy": 0.6623720477024714, + "num_tokens": 1761765552.0, + "step": 10509 + }, + { + "entropy": 1.708103507757187, + "epoch": 1.154568674301722, + "grad_norm": 0.7165181040763855, + "learning_rate": 9.198964265532638e-06, + "loss": 1.372, + "mean_token_accuracy": 0.6593515028556188, + "num_tokens": 1761922081.0, + "step": 10510 + }, + { + "entropy": 1.682081123193105, + "epoch": 1.154678531213095, + "grad_norm": 0.7013752460479736, + "learning_rate": 9.197395552213823e-06, + "loss": 1.384, + "mean_token_accuracy": 0.6498614301284155, + "num_tokens": 1762114261.0, + "step": 10511 + }, + { + "entropy": 1.7438469529151917, + "epoch": 1.1547883881244678, + "grad_norm": 0.8890546560287476, + "learning_rate": 9.195826895946629e-06, + "loss": 1.6896, + "mean_token_accuracy": 0.6446986744801203, + "num_tokens": 1762363635.0, + "step": 10512 + }, + { + "entropy": 1.7093018889427185, + "epoch": 1.1548982450358407, + "grad_norm": 0.7026628255844116, + "learning_rate": 9.194258296780705e-06, + "loss": 1.34, + "mean_token_accuracy": 0.6579982489347458, + "num_tokens": 1762525972.0, + "step": 10513 + }, + { + "entropy": 1.6976705988248189, + "epoch": 1.1550081019472138, + "grad_norm": 0.8118287324905396, + "learning_rate": 9.19268975476569e-06, + "loss": 1.225, + "mean_token_accuracy": 0.673242911696434, + "num_tokens": 1762630219.0, + "step": 10514 + }, + { + "entropy": 1.7051588793595631, + "epoch": 1.1551179588585867, + "grad_norm": 0.6836156249046326, + "learning_rate": 9.191121269951226e-06, + "loss": 1.3528, + "mean_token_accuracy": 0.6582736670970917, + "num_tokens": 1762794612.0, + "step": 10515 + }, + { + "entropy": 1.7200697461764018, + "epoch": 1.1552278157699596, + "grad_norm": 0.6674354076385498, + "learning_rate": 9.189552842386964e-06, + "loss": 1.2994, + "mean_token_accuracy": 0.6657363077004751, + "num_tokens": 1762965830.0, + "step": 10516 + }, + { + "entropy": 1.7609045306841533, + "epoch": 1.1553376726813325, + "grad_norm": 0.6270747780799866, + "learning_rate": 9.187984472122535e-06, + "loss": 1.4016, + "mean_token_accuracy": 0.6604256083567938, + "num_tokens": 1763128044.0, + "step": 10517 + }, + { + "entropy": 1.659463216861089, + "epoch": 1.1554475295927056, + "grad_norm": 0.7091048955917358, + "learning_rate": 9.186416159207582e-06, + "loss": 1.4481, + "mean_token_accuracy": 0.6355709036191305, + "num_tokens": 1763315602.0, + "step": 10518 + }, + { + "entropy": 1.7180581390857697, + "epoch": 1.1555573865040785, + "grad_norm": 0.7278168797492981, + "learning_rate": 9.184847903691743e-06, + "loss": 1.3962, + "mean_token_accuracy": 0.6445047954718272, + "num_tokens": 1763462039.0, + "step": 10519 + }, + { + "entropy": 1.7248013814290364, + "epoch": 1.1556672434154514, + "grad_norm": 0.98234623670578, + "learning_rate": 9.183279705624645e-06, + "loss": 1.3433, + "mean_token_accuracy": 0.6596641639868418, + "num_tokens": 1763596149.0, + "step": 10520 + }, + { + "entropy": 1.7311313549677532, + "epoch": 1.1557771003268242, + "grad_norm": 0.6669163107872009, + "learning_rate": 9.181711565055927e-06, + "loss": 1.4256, + "mean_token_accuracy": 0.6505987147490183, + "num_tokens": 1763791721.0, + "step": 10521 + }, + { + "entropy": 1.722319593032201, + "epoch": 1.1558869572381973, + "grad_norm": 0.6654046177864075, + "learning_rate": 9.180143482035223e-06, + "loss": 1.3032, + "mean_token_accuracy": 0.6619775195916494, + "num_tokens": 1763927643.0, + "step": 10522 + }, + { + "entropy": 1.7048958043257396, + "epoch": 1.1559968141495702, + "grad_norm": 0.6961905360221863, + "learning_rate": 9.178575456612154e-06, + "loss": 1.2973, + "mean_token_accuracy": 0.6727237900098165, + "num_tokens": 1764080454.0, + "step": 10523 + }, + { + "entropy": 1.7435453335444133, + "epoch": 1.1561066710609431, + "grad_norm": 0.7452827095985413, + "learning_rate": 9.177007488836354e-06, + "loss": 1.3283, + "mean_token_accuracy": 0.670777623852094, + "num_tokens": 1764211025.0, + "step": 10524 + }, + { + "entropy": 1.7101693550745647, + "epoch": 1.156216527972316, + "grad_norm": 1.0145291090011597, + "learning_rate": 9.175439578757442e-06, + "loss": 1.698, + "mean_token_accuracy": 0.6353831539551417, + "num_tokens": 1764418930.0, + "step": 10525 + }, + { + "entropy": 1.766825556755066, + "epoch": 1.1563263848836889, + "grad_norm": 0.7082020044326782, + "learning_rate": 9.173871726425045e-06, + "loss": 1.4726, + "mean_token_accuracy": 0.6487270891666412, + "num_tokens": 1764572213.0, + "step": 10526 + }, + { + "entropy": 1.7133028507232666, + "epoch": 1.156436241795062, + "grad_norm": 0.7147353887557983, + "learning_rate": 9.17230393188879e-06, + "loss": 1.2518, + "mean_token_accuracy": 0.6745504637559255, + "num_tokens": 1764723324.0, + "step": 10527 + }, + { + "entropy": 1.6591166456540425, + "epoch": 1.1565460987064349, + "grad_norm": 0.7346095442771912, + "learning_rate": 9.170736195198287e-06, + "loss": 1.4666, + "mean_token_accuracy": 0.6519047121206919, + "num_tokens": 1764898490.0, + "step": 10528 + }, + { + "entropy": 1.6714920202891033, + "epoch": 1.1566559556178078, + "grad_norm": 0.6665278673171997, + "learning_rate": 9.169168516403158e-06, + "loss": 1.3278, + "mean_token_accuracy": 0.6733681559562683, + "num_tokens": 1765035645.0, + "step": 10529 + }, + { + "entropy": 1.7320491870244343, + "epoch": 1.1567658125291806, + "grad_norm": 0.726340651512146, + "learning_rate": 9.167600895553024e-06, + "loss": 1.3063, + "mean_token_accuracy": 0.6701697111129761, + "num_tokens": 1765181838.0, + "step": 10530 + }, + { + "entropy": 1.6742952664693196, + "epoch": 1.1568756694405538, + "grad_norm": 0.9233806729316711, + "learning_rate": 9.166033332697495e-06, + "loss": 1.3761, + "mean_token_accuracy": 0.66909788052241, + "num_tokens": 1765313129.0, + "step": 10531 + }, + { + "entropy": 1.680614411830902, + "epoch": 1.1569855263519266, + "grad_norm": 0.6643198132514954, + "learning_rate": 9.164465827886184e-06, + "loss": 1.3118, + "mean_token_accuracy": 0.6700858275095621, + "num_tokens": 1765454598.0, + "step": 10532 + }, + { + "entropy": 1.7060332397619884, + "epoch": 1.1570953832632995, + "grad_norm": 0.6335230469703674, + "learning_rate": 9.162898381168705e-06, + "loss": 1.3871, + "mean_token_accuracy": 0.6586262285709381, + "num_tokens": 1765649572.0, + "step": 10533 + }, + { + "entropy": 1.71233864625295, + "epoch": 1.1572052401746724, + "grad_norm": 0.6753906011581421, + "learning_rate": 9.161330992594662e-06, + "loss": 1.4858, + "mean_token_accuracy": 0.6469430774450302, + "num_tokens": 1765840368.0, + "step": 10534 + }, + { + "entropy": 1.7542717456817627, + "epoch": 1.1573150970860455, + "grad_norm": 0.6666431427001953, + "learning_rate": 9.159763662213664e-06, + "loss": 1.4361, + "mean_token_accuracy": 0.648838589588801, + "num_tokens": 1765994363.0, + "step": 10535 + }, + { + "entropy": 1.6961637834707897, + "epoch": 1.1574249539974184, + "grad_norm": 0.7073807120323181, + "learning_rate": 9.158196390075319e-06, + "loss": 1.3327, + "mean_token_accuracy": 0.6611540814240774, + "num_tokens": 1766109247.0, + "step": 10536 + }, + { + "entropy": 1.680997868378957, + "epoch": 1.1575348109087913, + "grad_norm": 0.7748100757598877, + "learning_rate": 9.156629176229225e-06, + "loss": 1.3899, + "mean_token_accuracy": 0.6582418978214264, + "num_tokens": 1766252569.0, + "step": 10537 + }, + { + "entropy": 1.7486574749151866, + "epoch": 1.1576446678201642, + "grad_norm": 0.7355571389198303, + "learning_rate": 9.15506202072499e-06, + "loss": 1.4158, + "mean_token_accuracy": 0.6534449557463328, + "num_tokens": 1766416656.0, + "step": 10538 + }, + { + "entropy": 1.7009160220623016, + "epoch": 1.157754524731537, + "grad_norm": 0.6310091614723206, + "learning_rate": 9.153494923612212e-06, + "loss": 1.4498, + "mean_token_accuracy": 0.6585008750359217, + "num_tokens": 1766557961.0, + "step": 10539 + }, + { + "entropy": 1.7415493031342824, + "epoch": 1.1578643816429102, + "grad_norm": 0.6227964758872986, + "learning_rate": 9.151927884940486e-06, + "loss": 1.4646, + "mean_token_accuracy": 0.6527653783559799, + "num_tokens": 1766779349.0, + "step": 10540 + }, + { + "entropy": 1.7468430002530415, + "epoch": 1.157974238554283, + "grad_norm": 0.7541377544403076, + "learning_rate": 9.150360904759405e-06, + "loss": 1.3462, + "mean_token_accuracy": 0.668052484591802, + "num_tokens": 1766924931.0, + "step": 10541 + }, + { + "entropy": 1.7139411966005962, + "epoch": 1.158084095465656, + "grad_norm": 0.8286843299865723, + "learning_rate": 9.148793983118574e-06, + "loss": 1.4628, + "mean_token_accuracy": 0.6435067802667618, + "num_tokens": 1767096773.0, + "step": 10542 + }, + { + "entropy": 1.732242186864217, + "epoch": 1.1581939523770288, + "grad_norm": 0.7068530917167664, + "learning_rate": 9.147227120067576e-06, + "loss": 1.3388, + "mean_token_accuracy": 0.6692612071832021, + "num_tokens": 1767257451.0, + "step": 10543 + }, + { + "entropy": 1.7326354285081227, + "epoch": 1.158303809288402, + "grad_norm": 0.8493311405181885, + "learning_rate": 9.145660315656006e-06, + "loss": 1.4362, + "mean_token_accuracy": 0.6661918113629023, + "num_tokens": 1767412405.0, + "step": 10544 + }, + { + "entropy": 1.797954519589742, + "epoch": 1.1584136661997748, + "grad_norm": 0.8520449995994568, + "learning_rate": 9.144093569933454e-06, + "loss": 1.5181, + "mean_token_accuracy": 0.6419855256875356, + "num_tokens": 1767572561.0, + "step": 10545 + }, + { + "entropy": 1.704055945078532, + "epoch": 1.1585235231111477, + "grad_norm": 0.6789255738258362, + "learning_rate": 9.142526882949501e-06, + "loss": 1.4423, + "mean_token_accuracy": 0.6528183867534002, + "num_tokens": 1767792584.0, + "step": 10546 + }, + { + "entropy": 1.7003831168015797, + "epoch": 1.1586333800225206, + "grad_norm": 0.6809309720993042, + "learning_rate": 9.140960254753733e-06, + "loss": 1.3725, + "mean_token_accuracy": 0.6628607759873072, + "num_tokens": 1767932953.0, + "step": 10547 + }, + { + "entropy": 1.7150506675243378, + "epoch": 1.1587432369338937, + "grad_norm": 0.9768050312995911, + "learning_rate": 9.13939368539574e-06, + "loss": 1.3804, + "mean_token_accuracy": 0.6604229360818863, + "num_tokens": 1768075316.0, + "step": 10548 + }, + { + "entropy": 1.6081635057926178, + "epoch": 1.1588530938452666, + "grad_norm": 0.6204017400741577, + "learning_rate": 9.137827174925095e-06, + "loss": 1.4556, + "mean_token_accuracy": 0.6706610669692358, + "num_tokens": 1768245765.0, + "step": 10549 + }, + { + "entropy": 1.6946631868680317, + "epoch": 1.1589629507566395, + "grad_norm": 0.6550582647323608, + "learning_rate": 9.136260723391383e-06, + "loss": 1.3747, + "mean_token_accuracy": 0.6594513903061548, + "num_tokens": 1768426866.0, + "step": 10550 + }, + { + "entropy": 1.6690677801767986, + "epoch": 1.1590728076680124, + "grad_norm": 0.7413309216499329, + "learning_rate": 9.13469433084418e-06, + "loss": 1.3875, + "mean_token_accuracy": 0.653538167476654, + "num_tokens": 1768621316.0, + "step": 10551 + }, + { + "entropy": 1.675765037536621, + "epoch": 1.1591826645793852, + "grad_norm": 0.7278109192848206, + "learning_rate": 9.13312799733306e-06, + "loss": 1.2493, + "mean_token_accuracy": 0.6701826651891073, + "num_tokens": 1768773290.0, + "step": 10552 + }, + { + "entropy": 1.6604806085427601, + "epoch": 1.1592925214907583, + "grad_norm": 0.7603628635406494, + "learning_rate": 9.131561722907593e-06, + "loss": 1.2599, + "mean_token_accuracy": 0.6719126949707667, + "num_tokens": 1768919033.0, + "step": 10553 + }, + { + "entropy": 1.7303833464781444, + "epoch": 1.1594023784021312, + "grad_norm": 0.71886146068573, + "learning_rate": 9.129995507617362e-06, + "loss": 1.6364, + "mean_token_accuracy": 0.6442072639862696, + "num_tokens": 1769065412.0, + "step": 10554 + }, + { + "entropy": 1.7273939549922943, + "epoch": 1.1595122353135041, + "grad_norm": 0.7979735732078552, + "learning_rate": 9.128429351511929e-06, + "loss": 1.4078, + "mean_token_accuracy": 0.6563597470521927, + "num_tokens": 1769204785.0, + "step": 10555 + }, + { + "entropy": 1.7173643112182617, + "epoch": 1.1596220922248772, + "grad_norm": 0.5847103595733643, + "learning_rate": 9.126863254640863e-06, + "loss": 1.4891, + "mean_token_accuracy": 0.6459435870250066, + "num_tokens": 1769424428.0, + "step": 10556 + }, + { + "entropy": 1.7192479570706685, + "epoch": 1.15973194913625, + "grad_norm": 0.7932802438735962, + "learning_rate": 9.12529721705373e-06, + "loss": 1.538, + "mean_token_accuracy": 0.6482977941632271, + "num_tokens": 1769575003.0, + "step": 10557 + }, + { + "entropy": 1.662235786517461, + "epoch": 1.159841806047623, + "grad_norm": 0.7710309028625488, + "learning_rate": 9.123731238800098e-06, + "loss": 1.4101, + "mean_token_accuracy": 0.6681878517071406, + "num_tokens": 1769738956.0, + "step": 10558 + }, + { + "entropy": 1.753949224948883, + "epoch": 1.1599516629589959, + "grad_norm": 0.7551962733268738, + "learning_rate": 9.122165319929521e-06, + "loss": 1.504, + "mean_token_accuracy": 0.6444283723831177, + "num_tokens": 1769908470.0, + "step": 10559 + }, + { + "entropy": 1.7544064223766327, + "epoch": 1.1600615198703688, + "grad_norm": 0.7502493262290955, + "learning_rate": 9.120599460491572e-06, + "loss": 1.3027, + "mean_token_accuracy": 0.679939478635788, + "num_tokens": 1770051861.0, + "step": 10560 + }, + { + "entropy": 1.728769302368164, + "epoch": 1.1601713767817419, + "grad_norm": 0.6876187920570374, + "learning_rate": 9.119033660535802e-06, + "loss": 1.3227, + "mean_token_accuracy": 0.6703376968701681, + "num_tokens": 1770199848.0, + "step": 10561 + }, + { + "entropy": 1.7336504260698955, + "epoch": 1.1602812336931148, + "grad_norm": 0.8140459060668945, + "learning_rate": 9.117467920111767e-06, + "loss": 1.2416, + "mean_token_accuracy": 0.6855403482913971, + "num_tokens": 1770304890.0, + "step": 10562 + }, + { + "entropy": 1.723543256521225, + "epoch": 1.1603910906044876, + "grad_norm": 0.8142033219337463, + "learning_rate": 9.115902239269026e-06, + "loss": 1.4238, + "mean_token_accuracy": 0.6583587676286697, + "num_tokens": 1770475091.0, + "step": 10563 + }, + { + "entropy": 1.7559408446153004, + "epoch": 1.1605009475158605, + "grad_norm": 0.708025336265564, + "learning_rate": 9.114336618057126e-06, + "loss": 1.3814, + "mean_token_accuracy": 0.6672280778487524, + "num_tokens": 1770627827.0, + "step": 10564 + }, + { + "entropy": 1.7441905339558919, + "epoch": 1.1606108044272334, + "grad_norm": 0.6231316328048706, + "learning_rate": 9.112771056525625e-06, + "loss": 1.3323, + "mean_token_accuracy": 0.6605872611204783, + "num_tokens": 1770819185.0, + "step": 10565 + }, + { + "entropy": 1.7347522576649983, + "epoch": 1.1607206613386065, + "grad_norm": 0.6096704602241516, + "learning_rate": 9.111205554724071e-06, + "loss": 1.3962, + "mean_token_accuracy": 0.6540986547867457, + "num_tokens": 1770986955.0, + "step": 10566 + }, + { + "entropy": 1.698427716890971, + "epoch": 1.1608305182499794, + "grad_norm": 0.6909480690956116, + "learning_rate": 9.109640112702009e-06, + "loss": 1.3849, + "mean_token_accuracy": 0.6506121506293615, + "num_tokens": 1771145897.0, + "step": 10567 + }, + { + "entropy": 1.6268266638120015, + "epoch": 1.1609403751613523, + "grad_norm": 0.5729960203170776, + "learning_rate": 9.108074730508985e-06, + "loss": 1.324, + "mean_token_accuracy": 0.6655399600664774, + "num_tokens": 1771300536.0, + "step": 10568 + }, + { + "entropy": 1.7214660545190175, + "epoch": 1.1610502320727254, + "grad_norm": 0.6441773772239685, + "learning_rate": 9.106509408194543e-06, + "loss": 1.2798, + "mean_token_accuracy": 0.665215253829956, + "num_tokens": 1771434573.0, + "step": 10569 + }, + { + "entropy": 1.7466843525568645, + "epoch": 1.1611600889840983, + "grad_norm": 0.6851255297660828, + "learning_rate": 9.104944145808228e-06, + "loss": 1.3357, + "mean_token_accuracy": 0.6567875295877457, + "num_tokens": 1771601038.0, + "step": 10570 + }, + { + "entropy": 1.7371169924736023, + "epoch": 1.1612699458954712, + "grad_norm": 0.8068298697471619, + "learning_rate": 9.103378943399572e-06, + "loss": 1.4549, + "mean_token_accuracy": 0.646860788265864, + "num_tokens": 1771772926.0, + "step": 10571 + }, + { + "entropy": 1.7305392722288768, + "epoch": 1.161379802806844, + "grad_norm": 0.7280715703964233, + "learning_rate": 9.101813801018125e-06, + "loss": 1.3148, + "mean_token_accuracy": 0.6666086862484614, + "num_tokens": 1771920615.0, + "step": 10572 + }, + { + "entropy": 1.7404690285523732, + "epoch": 1.161489659718217, + "grad_norm": 0.9056682586669922, + "learning_rate": 9.100248718713406e-06, + "loss": 1.4988, + "mean_token_accuracy": 0.6431169708569845, + "num_tokens": 1772118214.0, + "step": 10573 + }, + { + "entropy": 1.7107574343681335, + "epoch": 1.16159951662959, + "grad_norm": 0.663151204586029, + "learning_rate": 9.098683696534964e-06, + "loss": 1.4899, + "mean_token_accuracy": 0.6537577112515768, + "num_tokens": 1772305129.0, + "step": 10574 + }, + { + "entropy": 1.7315536936124165, + "epoch": 1.161709373540963, + "grad_norm": 0.6387749910354614, + "learning_rate": 9.09711873453233e-06, + "loss": 1.3131, + "mean_token_accuracy": 0.659678096572558, + "num_tokens": 1772445507.0, + "step": 10575 + }, + { + "entropy": 1.7294196883837383, + "epoch": 1.1618192304523358, + "grad_norm": 0.6368371844291687, + "learning_rate": 9.095553832755026e-06, + "loss": 1.4576, + "mean_token_accuracy": 0.6409991731246313, + "num_tokens": 1772685191.0, + "step": 10576 + }, + { + "entropy": 1.738920897245407, + "epoch": 1.1619290873637087, + "grad_norm": 0.5927242636680603, + "learning_rate": 9.093988991252585e-06, + "loss": 1.3886, + "mean_token_accuracy": 0.6494887272516886, + "num_tokens": 1772865464.0, + "step": 10577 + }, + { + "entropy": 1.6951357523600261, + "epoch": 1.1620389442750816, + "grad_norm": 0.7614024877548218, + "learning_rate": 9.092424210074537e-06, + "loss": 1.4212, + "mean_token_accuracy": 0.6512744178374609, + "num_tokens": 1773014220.0, + "step": 10578 + }, + { + "entropy": 1.6392890711625416, + "epoch": 1.1621488011864547, + "grad_norm": 0.6055826544761658, + "learning_rate": 9.090859489270399e-06, + "loss": 1.3891, + "mean_token_accuracy": 0.655634676416715, + "num_tokens": 1773199400.0, + "step": 10579 + }, + { + "entropy": 1.6917970776557922, + "epoch": 1.1622586580978276, + "grad_norm": 0.6927284002304077, + "learning_rate": 9.0892948288897e-06, + "loss": 1.3529, + "mean_token_accuracy": 0.6633793711662292, + "num_tokens": 1773382971.0, + "step": 10580 + }, + { + "entropy": 1.691469391187032, + "epoch": 1.1623685150092005, + "grad_norm": 0.7069520950317383, + "learning_rate": 9.087730228981959e-06, + "loss": 1.3401, + "mean_token_accuracy": 0.6657597869634628, + "num_tokens": 1773539956.0, + "step": 10581 + }, + { + "entropy": 1.6694080928961437, + "epoch": 1.1624783719205736, + "grad_norm": 0.6818525791168213, + "learning_rate": 9.086165689596696e-06, + "loss": 1.2939, + "mean_token_accuracy": 0.6682254274686178, + "num_tokens": 1773664225.0, + "step": 10582 + }, + { + "entropy": 1.6743863622347515, + "epoch": 1.1625882288319465, + "grad_norm": 0.8346628546714783, + "learning_rate": 9.084601210783424e-06, + "loss": 1.4255, + "mean_token_accuracy": 0.6574391573667526, + "num_tokens": 1773827963.0, + "step": 10583 + }, + { + "entropy": 1.6976170639197032, + "epoch": 1.1626980857433193, + "grad_norm": 0.6019466519355774, + "learning_rate": 9.083036792591662e-06, + "loss": 1.4136, + "mean_token_accuracy": 0.6440401424964269, + "num_tokens": 1774002026.0, + "step": 10584 + }, + { + "entropy": 1.7483911216259003, + "epoch": 1.1628079426546922, + "grad_norm": 0.7636407613754272, + "learning_rate": 9.081472435070917e-06, + "loss": 1.4012, + "mean_token_accuracy": 0.6655046790838242, + "num_tokens": 1774159224.0, + "step": 10585 + }, + { + "entropy": 1.6063493490219116, + "epoch": 1.1629177995660651, + "grad_norm": 0.6063027381896973, + "learning_rate": 9.079908138270711e-06, + "loss": 1.3721, + "mean_token_accuracy": 0.6649445941050848, + "num_tokens": 1774345751.0, + "step": 10586 + }, + { + "entropy": 1.7763068775335948, + "epoch": 1.1630276564774382, + "grad_norm": 0.8216478228569031, + "learning_rate": 9.078343902240546e-06, + "loss": 1.4016, + "mean_token_accuracy": 0.6733109205961227, + "num_tokens": 1774478712.0, + "step": 10587 + }, + { + "entropy": 1.6169381241003673, + "epoch": 1.163137513388811, + "grad_norm": 0.6574183106422424, + "learning_rate": 9.076779727029929e-06, + "loss": 1.1698, + "mean_token_accuracy": 0.6915866086880366, + "num_tokens": 1774608731.0, + "step": 10588 + }, + { + "entropy": 1.756181428829829, + "epoch": 1.163247370300184, + "grad_norm": 0.7906789183616638, + "learning_rate": 9.075215612688369e-06, + "loss": 1.3784, + "mean_token_accuracy": 0.6525656481583914, + "num_tokens": 1774744469.0, + "step": 10589 + }, + { + "entropy": 1.7244456708431244, + "epoch": 1.1633572272115569, + "grad_norm": 0.7453427910804749, + "learning_rate": 9.073651559265365e-06, + "loss": 1.5139, + "mean_token_accuracy": 0.6516165683666865, + "num_tokens": 1774919441.0, + "step": 10590 + }, + { + "entropy": 1.6759937008221943, + "epoch": 1.1634670841229298, + "grad_norm": 0.741671085357666, + "learning_rate": 9.072087566810422e-06, + "loss": 1.385, + "mean_token_accuracy": 0.6523188451925913, + "num_tokens": 1775078071.0, + "step": 10591 + }, + { + "entropy": 1.72449991106987, + "epoch": 1.1635769410343029, + "grad_norm": 0.7586898803710938, + "learning_rate": 9.07052363537304e-06, + "loss": 1.2781, + "mean_token_accuracy": 0.684979259967804, + "num_tokens": 1775227702.0, + "step": 10592 + }, + { + "entropy": 1.6677986184755962, + "epoch": 1.1636867979456758, + "grad_norm": 0.731613278388977, + "learning_rate": 9.068959765002714e-06, + "loss": 1.5139, + "mean_token_accuracy": 0.6585745165745417, + "num_tokens": 1775376632.0, + "step": 10593 + }, + { + "entropy": 1.7536945442358653, + "epoch": 1.1637966548570486, + "grad_norm": 0.9093847274780273, + "learning_rate": 9.06739595574894e-06, + "loss": 1.3927, + "mean_token_accuracy": 0.6645366350809733, + "num_tokens": 1775517987.0, + "step": 10594 + }, + { + "entropy": 1.7309378584225972, + "epoch": 1.1639065117684217, + "grad_norm": 0.7094044089317322, + "learning_rate": 9.065832207661218e-06, + "loss": 1.3985, + "mean_token_accuracy": 0.6560509552558264, + "num_tokens": 1775747878.0, + "step": 10595 + }, + { + "entropy": 1.7115402321020763, + "epoch": 1.1640163686797946, + "grad_norm": 0.655071496963501, + "learning_rate": 9.06426852078903e-06, + "loss": 1.2848, + "mean_token_accuracy": 0.6730567514896393, + "num_tokens": 1775897435.0, + "step": 10596 + }, + { + "entropy": 1.7259460389614105, + "epoch": 1.1641262255911675, + "grad_norm": 0.6214396357536316, + "learning_rate": 9.062704895181873e-06, + "loss": 1.4306, + "mean_token_accuracy": 0.6395227412382761, + "num_tokens": 1776133092.0, + "step": 10597 + }, + { + "entropy": 1.6866820653279622, + "epoch": 1.1642360825025404, + "grad_norm": 0.9714513421058655, + "learning_rate": 9.061141330889234e-06, + "loss": 1.3075, + "mean_token_accuracy": 0.6768196622530619, + "num_tokens": 1776291999.0, + "step": 10598 + }, + { + "entropy": 1.6818428039550781, + "epoch": 1.1643459394139133, + "grad_norm": 0.6200037002563477, + "learning_rate": 9.059577827960597e-06, + "loss": 1.4208, + "mean_token_accuracy": 0.6529973646004995, + "num_tokens": 1776497288.0, + "step": 10599 + }, + { + "entropy": 1.686517169078191, + "epoch": 1.1644557963252864, + "grad_norm": 0.5967657566070557, + "learning_rate": 9.058014386445449e-06, + "loss": 1.3172, + "mean_token_accuracy": 0.6720605492591858, + "num_tokens": 1776678153.0, + "step": 10600 + }, + { + "entropy": 1.7095843454202015, + "epoch": 1.1645656532366593, + "grad_norm": 0.6960015892982483, + "learning_rate": 9.05645100639327e-06, + "loss": 1.5391, + "mean_token_accuracy": 0.6424980262915293, + "num_tokens": 1776864339.0, + "step": 10601 + }, + { + "entropy": 1.6995967328548431, + "epoch": 1.1646755101480322, + "grad_norm": 0.6774857044219971, + "learning_rate": 9.05488768785354e-06, + "loss": 1.4723, + "mean_token_accuracy": 0.6477284729480743, + "num_tokens": 1777018164.0, + "step": 10602 + }, + { + "entropy": 1.6932755609353383, + "epoch": 1.164785367059405, + "grad_norm": 0.6286726593971252, + "learning_rate": 9.053324430875734e-06, + "loss": 1.3633, + "mean_token_accuracy": 0.6565052568912506, + "num_tokens": 1777149134.0, + "step": 10603 + }, + { + "entropy": 1.6836401224136353, + "epoch": 1.164895223970778, + "grad_norm": 0.8434138894081116, + "learning_rate": 9.051761235509339e-06, + "loss": 1.3775, + "mean_token_accuracy": 0.6582860300938288, + "num_tokens": 1777336064.0, + "step": 10604 + }, + { + "entropy": 1.6700752675533295, + "epoch": 1.165005080882151, + "grad_norm": 0.6058101058006287, + "learning_rate": 9.050198101803822e-06, + "loss": 1.3903, + "mean_token_accuracy": 0.654156357049942, + "num_tokens": 1777523437.0, + "step": 10605 + }, + { + "entropy": 1.754345069328944, + "epoch": 1.165114937793524, + "grad_norm": 0.7618310451507568, + "learning_rate": 9.048635029808654e-06, + "loss": 1.3307, + "mean_token_accuracy": 0.6736029783884684, + "num_tokens": 1777664563.0, + "step": 10606 + }, + { + "entropy": 1.722790112098058, + "epoch": 1.1652247947048968, + "grad_norm": 0.5938490033149719, + "learning_rate": 9.04707201957331e-06, + "loss": 1.3779, + "mean_token_accuracy": 0.6630453765392303, + "num_tokens": 1777833158.0, + "step": 10607 + }, + { + "entropy": 1.7870861391226451, + "epoch": 1.16533465161627, + "grad_norm": 0.7509839534759521, + "learning_rate": 9.045509071147255e-06, + "loss": 1.3353, + "mean_token_accuracy": 0.665691594282786, + "num_tokens": 1777949791.0, + "step": 10608 + }, + { + "entropy": 1.6829339563846588, + "epoch": 1.1654445085276428, + "grad_norm": 0.6705135703086853, + "learning_rate": 9.043946184579957e-06, + "loss": 1.2543, + "mean_token_accuracy": 0.6714605540037155, + "num_tokens": 1778052170.0, + "step": 10609 + }, + { + "entropy": 1.7535866002241771, + "epoch": 1.1655543654390157, + "grad_norm": 0.8111270666122437, + "learning_rate": 9.042383359920886e-06, + "loss": 1.3505, + "mean_token_accuracy": 0.6616497834523519, + "num_tokens": 1778182029.0, + "step": 10610 + }, + { + "entropy": 1.6741001804669697, + "epoch": 1.1656642223503886, + "grad_norm": 0.5639720559120178, + "learning_rate": 9.040820597219493e-06, + "loss": 1.4687, + "mean_token_accuracy": 0.6465960890054703, + "num_tokens": 1778397245.0, + "step": 10611 + }, + { + "entropy": 1.7868964572747548, + "epoch": 1.1657740792617615, + "grad_norm": 0.7194597125053406, + "learning_rate": 9.039257896525249e-06, + "loss": 1.5443, + "mean_token_accuracy": 0.6437151481707891, + "num_tokens": 1778582965.0, + "step": 10612 + }, + { + "entropy": 1.7630162437756856, + "epoch": 1.1658839361731346, + "grad_norm": 0.7208252549171448, + "learning_rate": 9.037695257887608e-06, + "loss": 1.4503, + "mean_token_accuracy": 0.6444578021764755, + "num_tokens": 1778811502.0, + "step": 10613 + }, + { + "entropy": 1.628514697154363, + "epoch": 1.1659937930845075, + "grad_norm": 0.6529536843299866, + "learning_rate": 9.03613268135603e-06, + "loss": 1.3149, + "mean_token_accuracy": 0.6686781197786331, + "num_tokens": 1778965602.0, + "step": 10614 + }, + { + "entropy": 1.6970041394233704, + "epoch": 1.1661036499958803, + "grad_norm": 0.7727194428443909, + "learning_rate": 9.034570166979961e-06, + "loss": 1.4644, + "mean_token_accuracy": 0.6570507635672888, + "num_tokens": 1779171505.0, + "step": 10615 + }, + { + "entropy": 1.7038895587126415, + "epoch": 1.1662135069072532, + "grad_norm": 0.6358299255371094, + "learning_rate": 9.033007714808865e-06, + "loss": 1.3361, + "mean_token_accuracy": 0.6585201720396677, + "num_tokens": 1779300269.0, + "step": 10616 + }, + { + "entropy": 1.6607412695884705, + "epoch": 1.1663233638186263, + "grad_norm": 0.641280472278595, + "learning_rate": 9.03144532489219e-06, + "loss": 1.2479, + "mean_token_accuracy": 0.6724933038155237, + "num_tokens": 1779431418.0, + "step": 10617 + }, + { + "entropy": 1.7358726660410564, + "epoch": 1.1664332207299992, + "grad_norm": 0.6991965770721436, + "learning_rate": 9.029882997279383e-06, + "loss": 1.3864, + "mean_token_accuracy": 0.6562596013148626, + "num_tokens": 1779579018.0, + "step": 10618 + }, + { + "entropy": 1.6888511975606282, + "epoch": 1.166543077641372, + "grad_norm": 0.6948026418685913, + "learning_rate": 9.02832073201989e-06, + "loss": 1.2935, + "mean_token_accuracy": 0.6627761671940485, + "num_tokens": 1779728921.0, + "step": 10619 + }, + { + "entropy": 1.6387386123339336, + "epoch": 1.166652934552745, + "grad_norm": 0.543950080871582, + "learning_rate": 9.026758529163158e-06, + "loss": 1.5132, + "mean_token_accuracy": 0.6369695862134298, + "num_tokens": 1779938084.0, + "step": 10620 + }, + { + "entropy": 1.696452538172404, + "epoch": 1.166762791464118, + "grad_norm": 0.8097180724143982, + "learning_rate": 9.025196388758626e-06, + "loss": 1.3434, + "mean_token_accuracy": 0.6701660056908926, + "num_tokens": 1780093963.0, + "step": 10621 + }, + { + "entropy": 1.6749595602353413, + "epoch": 1.166872648375491, + "grad_norm": 0.7634894251823425, + "learning_rate": 9.023634310855744e-06, + "loss": 1.3388, + "mean_token_accuracy": 0.670145645737648, + "num_tokens": 1780267420.0, + "step": 10622 + }, + { + "entropy": 1.6456829011440277, + "epoch": 1.1669825052868639, + "grad_norm": 0.7166178822517395, + "learning_rate": 9.02207229550394e-06, + "loss": 1.4062, + "mean_token_accuracy": 0.6510950972636541, + "num_tokens": 1780461631.0, + "step": 10623 + }, + { + "entropy": 1.731689711411794, + "epoch": 1.1670923621982368, + "grad_norm": 0.8911299109458923, + "learning_rate": 9.020510342752662e-06, + "loss": 1.3846, + "mean_token_accuracy": 0.652527650197347, + "num_tokens": 1780628175.0, + "step": 10624 + }, + { + "entropy": 1.7384942670663197, + "epoch": 1.1672022191096096, + "grad_norm": 0.7990009188652039, + "learning_rate": 9.018948452651336e-06, + "loss": 1.5015, + "mean_token_accuracy": 0.6490287284056345, + "num_tokens": 1780796578.0, + "step": 10625 + }, + { + "entropy": 1.7568972607453663, + "epoch": 1.1673120760209827, + "grad_norm": 0.6731627583503723, + "learning_rate": 9.0173866252494e-06, + "loss": 1.4538, + "mean_token_accuracy": 0.6447356839974722, + "num_tokens": 1780980358.0, + "step": 10626 + }, + { + "entropy": 1.7077033917109172, + "epoch": 1.1674219329323556, + "grad_norm": 1.3267916440963745, + "learning_rate": 9.015824860596283e-06, + "loss": 1.4833, + "mean_token_accuracy": 0.6471607486406962, + "num_tokens": 1781190806.0, + "step": 10627 + }, + { + "entropy": 1.7151707013448079, + "epoch": 1.1675317898437285, + "grad_norm": 0.6183844208717346, + "learning_rate": 9.014263158741418e-06, + "loss": 1.4084, + "mean_token_accuracy": 0.6550938785076141, + "num_tokens": 1781328149.0, + "step": 10628 + }, + { + "entropy": 1.7326987187067668, + "epoch": 1.1676416467551014, + "grad_norm": 0.7378236055374146, + "learning_rate": 9.012701519734226e-06, + "loss": 1.4359, + "mean_token_accuracy": 0.6496442258358002, + "num_tokens": 1781522900.0, + "step": 10629 + }, + { + "entropy": 1.6831368406613667, + "epoch": 1.1677515036664745, + "grad_norm": 0.6311535239219666, + "learning_rate": 9.011139943624137e-06, + "loss": 1.3995, + "mean_token_accuracy": 0.6628275960683823, + "num_tokens": 1781696717.0, + "step": 10630 + }, + { + "entropy": 1.754588007926941, + "epoch": 1.1678613605778474, + "grad_norm": 0.6265390515327454, + "learning_rate": 9.009578430460572e-06, + "loss": 1.5914, + "mean_token_accuracy": 0.6241682320833206, + "num_tokens": 1781891433.0, + "step": 10631 + }, + { + "entropy": 1.692752718925476, + "epoch": 1.1679712174892203, + "grad_norm": 0.6025134921073914, + "learning_rate": 9.008016980292956e-06, + "loss": 1.4682, + "mean_token_accuracy": 0.6510246594746908, + "num_tokens": 1782053228.0, + "step": 10632 + }, + { + "entropy": 1.7124259273211162, + "epoch": 1.1680810744005932, + "grad_norm": 0.7642148733139038, + "learning_rate": 9.006455593170698e-06, + "loss": 1.1932, + "mean_token_accuracy": 0.6781323105096817, + "num_tokens": 1782152083.0, + "step": 10633 + }, + { + "entropy": 1.6471915046374004, + "epoch": 1.1681909313119663, + "grad_norm": 0.6959193348884583, + "learning_rate": 9.004894269143228e-06, + "loss": 1.3874, + "mean_token_accuracy": 0.6629950155814489, + "num_tokens": 1782300299.0, + "step": 10634 + }, + { + "entropy": 1.6989375551541646, + "epoch": 1.1683007882233392, + "grad_norm": 0.6835771799087524, + "learning_rate": 9.003333008259953e-06, + "loss": 1.3153, + "mean_token_accuracy": 0.6625100125869116, + "num_tokens": 1782431834.0, + "step": 10635 + }, + { + "entropy": 1.7035725514094036, + "epoch": 1.168410645134712, + "grad_norm": 0.678627610206604, + "learning_rate": 9.001771810570288e-06, + "loss": 1.4835, + "mean_token_accuracy": 0.6454518338044485, + "num_tokens": 1782610660.0, + "step": 10636 + }, + { + "entropy": 1.7128291328748066, + "epoch": 1.168520502046085, + "grad_norm": 0.6007285118103027, + "learning_rate": 9.000210676123648e-06, + "loss": 1.3285, + "mean_token_accuracy": 0.6597598244746526, + "num_tokens": 1782755084.0, + "step": 10637 + }, + { + "entropy": 1.7302992641925812, + "epoch": 1.1686303589574578, + "grad_norm": 0.5934082865715027, + "learning_rate": 8.998649604969436e-06, + "loss": 1.3524, + "mean_token_accuracy": 0.6561450411876043, + "num_tokens": 1782932096.0, + "step": 10638 + }, + { + "entropy": 1.6810977458953857, + "epoch": 1.168740215868831, + "grad_norm": 0.616631269454956, + "learning_rate": 8.997088597157062e-06, + "loss": 1.4466, + "mean_token_accuracy": 0.652444009979566, + "num_tokens": 1783155895.0, + "step": 10639 + }, + { + "entropy": 1.7775660753250122, + "epoch": 1.1688500727802038, + "grad_norm": 0.859024703502655, + "learning_rate": 8.995527652735933e-06, + "loss": 1.3859, + "mean_token_accuracy": 0.6765128125747045, + "num_tokens": 1783280758.0, + "step": 10640 + }, + { + "entropy": 1.6725508570671082, + "epoch": 1.1689599296915767, + "grad_norm": 0.7639785408973694, + "learning_rate": 8.99396677175545e-06, + "loss": 1.409, + "mean_token_accuracy": 0.6709824502468109, + "num_tokens": 1783440353.0, + "step": 10641 + }, + { + "entropy": 1.7496330042680104, + "epoch": 1.1690697866029496, + "grad_norm": 0.6356641054153442, + "learning_rate": 8.992405954265014e-06, + "loss": 1.3488, + "mean_token_accuracy": 0.665415291984876, + "num_tokens": 1783593270.0, + "step": 10642 + }, + { + "entropy": 1.6593229870001476, + "epoch": 1.1691796435143227, + "grad_norm": 0.7678477168083191, + "learning_rate": 8.990845200314027e-06, + "loss": 1.3097, + "mean_token_accuracy": 0.6617792199055353, + "num_tokens": 1783719742.0, + "step": 10643 + }, + { + "entropy": 1.702727844317754, + "epoch": 1.1692895004256956, + "grad_norm": 0.682217538356781, + "learning_rate": 8.989284509951881e-06, + "loss": 1.3527, + "mean_token_accuracy": 0.6565568794806799, + "num_tokens": 1783873275.0, + "step": 10644 + }, + { + "entropy": 1.7255015075206757, + "epoch": 1.1693993573370685, + "grad_norm": 0.7633523344993591, + "learning_rate": 8.98772388322798e-06, + "loss": 1.3573, + "mean_token_accuracy": 0.6607132703065872, + "num_tokens": 1783994243.0, + "step": 10645 + }, + { + "entropy": 1.672673612833023, + "epoch": 1.1695092142484413, + "grad_norm": 0.7300711274147034, + "learning_rate": 8.986163320191706e-06, + "loss": 1.4695, + "mean_token_accuracy": 0.6509375472863516, + "num_tokens": 1784144916.0, + "step": 10646 + }, + { + "entropy": 1.7136195699373882, + "epoch": 1.1696190711598144, + "grad_norm": 0.5912143588066101, + "learning_rate": 8.984602820892454e-06, + "loss": 1.4903, + "mean_token_accuracy": 0.6366753627856573, + "num_tokens": 1784386249.0, + "step": 10647 + }, + { + "entropy": 1.6643874446551006, + "epoch": 1.1697289280711873, + "grad_norm": 0.8985964059829712, + "learning_rate": 8.983042385379618e-06, + "loss": 1.4004, + "mean_token_accuracy": 0.6662670622269312, + "num_tokens": 1784544876.0, + "step": 10648 + }, + { + "entropy": 1.6806229849656422, + "epoch": 1.1698387849825602, + "grad_norm": 0.6527777314186096, + "learning_rate": 8.98148201370258e-06, + "loss": 1.3905, + "mean_token_accuracy": 0.6572650174299876, + "num_tokens": 1784747792.0, + "step": 10649 + }, + { + "entropy": 1.7049620548884075, + "epoch": 1.169948641893933, + "grad_norm": 0.6680081486701965, + "learning_rate": 8.979921705910729e-06, + "loss": 1.4625, + "mean_token_accuracy": 0.6404824604590734, + "num_tokens": 1784944930.0, + "step": 10650 + }, + { + "entropy": 1.6514282921950023, + "epoch": 1.170058498805306, + "grad_norm": 0.6086418032646179, + "learning_rate": 8.978361462053444e-06, + "loss": 1.4598, + "mean_token_accuracy": 0.6551013191541036, + "num_tokens": 1785164307.0, + "step": 10651 + }, + { + "entropy": 1.6851195593674977, + "epoch": 1.170168355716679, + "grad_norm": 0.6363802552223206, + "learning_rate": 8.976801282180108e-06, + "loss": 1.4424, + "mean_token_accuracy": 0.638342077533404, + "num_tokens": 1785428083.0, + "step": 10652 + }, + { + "entropy": 1.7672754526138306, + "epoch": 1.170278212628052, + "grad_norm": 0.6229258179664612, + "learning_rate": 8.975241166340097e-06, + "loss": 1.435, + "mean_token_accuracy": 0.6456956764062246, + "num_tokens": 1785603506.0, + "step": 10653 + }, + { + "entropy": 1.672398070494334, + "epoch": 1.1703880695394249, + "grad_norm": 0.5889727473258972, + "learning_rate": 8.973681114582795e-06, + "loss": 1.3637, + "mean_token_accuracy": 0.6640516370534897, + "num_tokens": 1785801159.0, + "step": 10654 + }, + { + "entropy": 1.783601274092992, + "epoch": 1.1704979264507978, + "grad_norm": 0.7808632254600525, + "learning_rate": 8.972121126957571e-06, + "loss": 1.2955, + "mean_token_accuracy": 0.6714789718389511, + "num_tokens": 1785921513.0, + "step": 10655 + }, + { + "entropy": 1.7051396469275157, + "epoch": 1.1706077833621709, + "grad_norm": 0.7725319862365723, + "learning_rate": 8.9705612035138e-06, + "loss": 1.5335, + "mean_token_accuracy": 0.6427340308825175, + "num_tokens": 1786062181.0, + "step": 10656 + }, + { + "entropy": 1.7101481556892395, + "epoch": 1.1707176402735437, + "grad_norm": 0.7353886961936951, + "learning_rate": 8.969001344300854e-06, + "loss": 1.2933, + "mean_token_accuracy": 0.6786648482084274, + "num_tokens": 1786199326.0, + "step": 10657 + }, + { + "entropy": 1.7510856886704762, + "epoch": 1.1708274971849166, + "grad_norm": 0.7046499848365784, + "learning_rate": 8.967441549368097e-06, + "loss": 1.4565, + "mean_token_accuracy": 0.6432525664567947, + "num_tokens": 1786349644.0, + "step": 10658 + }, + { + "entropy": 1.6620566546916962, + "epoch": 1.1709373540962895, + "grad_norm": 0.5971559882164001, + "learning_rate": 8.9658818187649e-06, + "loss": 1.3713, + "mean_token_accuracy": 0.6542538553476334, + "num_tokens": 1786576097.0, + "step": 10659 + }, + { + "entropy": 1.7027663091818492, + "epoch": 1.1710472110076626, + "grad_norm": 1.0174574851989746, + "learning_rate": 8.964322152540627e-06, + "loss": 1.3846, + "mean_token_accuracy": 0.6615005234877268, + "num_tokens": 1786752269.0, + "step": 10660 + }, + { + "entropy": 1.734482745329539, + "epoch": 1.1711570679190355, + "grad_norm": 0.6983833909034729, + "learning_rate": 8.962762550744642e-06, + "loss": 1.341, + "mean_token_accuracy": 0.6733351896206538, + "num_tokens": 1786886030.0, + "step": 10661 + }, + { + "entropy": 1.7492407063643138, + "epoch": 1.1712669248304084, + "grad_norm": 0.7310764789581299, + "learning_rate": 8.9612030134263e-06, + "loss": 1.4714, + "mean_token_accuracy": 0.6528683652480444, + "num_tokens": 1787072947.0, + "step": 10662 + }, + { + "entropy": 1.5964481433232625, + "epoch": 1.1713767817417813, + "grad_norm": 0.6488633751869202, + "learning_rate": 8.95964354063497e-06, + "loss": 1.2987, + "mean_token_accuracy": 0.6710949192444483, + "num_tokens": 1787201005.0, + "step": 10663 + }, + { + "entropy": 1.7364132006963093, + "epoch": 1.1714866386531542, + "grad_norm": 0.7075624465942383, + "learning_rate": 8.958084132419999e-06, + "loss": 1.4657, + "mean_token_accuracy": 0.6590905785560608, + "num_tokens": 1787332503.0, + "step": 10664 + }, + { + "entropy": 1.7492092450459797, + "epoch": 1.1715964955645273, + "grad_norm": 0.6287668943405151, + "learning_rate": 8.956524788830742e-06, + "loss": 1.4, + "mean_token_accuracy": 0.6587251722812653, + "num_tokens": 1787490700.0, + "step": 10665 + }, + { + "entropy": 1.6878098646799724, + "epoch": 1.1717063524759002, + "grad_norm": 0.6984691023826599, + "learning_rate": 8.95496550991656e-06, + "loss": 1.4229, + "mean_token_accuracy": 0.6428747077782949, + "num_tokens": 1787656504.0, + "step": 10666 + }, + { + "entropy": 1.7307902872562408, + "epoch": 1.171816209387273, + "grad_norm": 0.636269748210907, + "learning_rate": 8.953406295726796e-06, + "loss": 1.5128, + "mean_token_accuracy": 0.6517880360285441, + "num_tokens": 1787851407.0, + "step": 10667 + }, + { + "entropy": 1.7220154702663422, + "epoch": 1.171926066298646, + "grad_norm": 0.6204155087471008, + "learning_rate": 8.951847146310801e-06, + "loss": 1.4483, + "mean_token_accuracy": 0.6446654995282491, + "num_tokens": 1788023011.0, + "step": 10668 + }, + { + "entropy": 1.7286285956700642, + "epoch": 1.172035923210019, + "grad_norm": 0.7028345465660095, + "learning_rate": 8.950288061717924e-06, + "loss": 1.5394, + "mean_token_accuracy": 0.6381612122058868, + "num_tokens": 1788248357.0, + "step": 10669 + }, + { + "entropy": 1.6964424749215443, + "epoch": 1.172145780121392, + "grad_norm": 0.780795693397522, + "learning_rate": 8.948729041997502e-06, + "loss": 1.4833, + "mean_token_accuracy": 0.6638787587483724, + "num_tokens": 1788381104.0, + "step": 10670 + }, + { + "entropy": 1.7409202357133229, + "epoch": 1.1722556370327648, + "grad_norm": 0.700515627861023, + "learning_rate": 8.94717008719888e-06, + "loss": 1.3702, + "mean_token_accuracy": 0.6486127773920695, + "num_tokens": 1788513068.0, + "step": 10671 + }, + { + "entropy": 1.6607150733470917, + "epoch": 1.1723654939441377, + "grad_norm": 0.6845481991767883, + "learning_rate": 8.945611197371404e-06, + "loss": 1.3526, + "mean_token_accuracy": 0.6619810660680135, + "num_tokens": 1788675191.0, + "step": 10672 + }, + { + "entropy": 1.6844545602798462, + "epoch": 1.1724753508555108, + "grad_norm": 0.5828627943992615, + "learning_rate": 8.944052372564404e-06, + "loss": 1.333, + "mean_token_accuracy": 0.6642112135887146, + "num_tokens": 1788823629.0, + "step": 10673 + }, + { + "entropy": 1.7418619493643444, + "epoch": 1.1725852077668837, + "grad_norm": 0.7149393558502197, + "learning_rate": 8.942493612827223e-06, + "loss": 1.4441, + "mean_token_accuracy": 0.6542845120032629, + "num_tokens": 1788993849.0, + "step": 10674 + }, + { + "entropy": 1.7660021980603535, + "epoch": 1.1726950646782566, + "grad_norm": 0.6694035530090332, + "learning_rate": 8.940934918209193e-06, + "loss": 1.4416, + "mean_token_accuracy": 0.6477504769961039, + "num_tokens": 1789149567.0, + "step": 10675 + }, + { + "entropy": 1.7088763415813446, + "epoch": 1.1728049215896295, + "grad_norm": 0.7105270028114319, + "learning_rate": 8.939376288759643e-06, + "loss": 1.4383, + "mean_token_accuracy": 0.6585270663102468, + "num_tokens": 1789297771.0, + "step": 10676 + }, + { + "entropy": 1.7306538224220276, + "epoch": 1.1729147785010023, + "grad_norm": 0.6987410187721252, + "learning_rate": 8.937817724527901e-06, + "loss": 1.4355, + "mean_token_accuracy": 0.6463360438744227, + "num_tokens": 1789463092.0, + "step": 10677 + }, + { + "entropy": 1.6846852699915569, + "epoch": 1.1730246354123754, + "grad_norm": 0.7629522085189819, + "learning_rate": 8.936259225563306e-06, + "loss": 1.2583, + "mean_token_accuracy": 0.6747141232093176, + "num_tokens": 1789581228.0, + "step": 10678 + }, + { + "entropy": 1.7274243632952373, + "epoch": 1.1731344923237483, + "grad_norm": 0.6886153221130371, + "learning_rate": 8.934700791915171e-06, + "loss": 1.3786, + "mean_token_accuracy": 0.6586506168047587, + "num_tokens": 1789767348.0, + "step": 10679 + }, + { + "entropy": 1.7268758118152618, + "epoch": 1.1732443492351212, + "grad_norm": 0.7361603379249573, + "learning_rate": 8.933142423632828e-06, + "loss": 1.3816, + "mean_token_accuracy": 0.666677271326383, + "num_tokens": 1789899146.0, + "step": 10680 + }, + { + "entropy": 1.6961700121561687, + "epoch": 1.173354206146494, + "grad_norm": 0.5983572006225586, + "learning_rate": 8.931584120765598e-06, + "loss": 1.4038, + "mean_token_accuracy": 0.66066013276577, + "num_tokens": 1790122194.0, + "step": 10681 + }, + { + "entropy": 1.7338594396909077, + "epoch": 1.1734640630578672, + "grad_norm": 0.6352316737174988, + "learning_rate": 8.930025883362796e-06, + "loss": 1.344, + "mean_token_accuracy": 0.6731750816106796, + "num_tokens": 1790289767.0, + "step": 10682 + }, + { + "entropy": 1.692486047744751, + "epoch": 1.17357391996924, + "grad_norm": 0.7538011074066162, + "learning_rate": 8.928467711473741e-06, + "loss": 1.4321, + "mean_token_accuracy": 0.6721002409855524, + "num_tokens": 1790441707.0, + "step": 10683 + }, + { + "entropy": 1.73250612616539, + "epoch": 1.173683776880613, + "grad_norm": 0.6547481417655945, + "learning_rate": 8.926909605147751e-06, + "loss": 1.345, + "mean_token_accuracy": 0.6578451991081238, + "num_tokens": 1790604605.0, + "step": 10684 + }, + { + "entropy": 1.6946922838687897, + "epoch": 1.1737936337919859, + "grad_norm": 0.6833810210227966, + "learning_rate": 8.925351564434137e-06, + "loss": 1.364, + "mean_token_accuracy": 0.6670823097229004, + "num_tokens": 1790749799.0, + "step": 10685 + }, + { + "entropy": 1.7278658747673035, + "epoch": 1.173903490703359, + "grad_norm": 0.6715664267539978, + "learning_rate": 8.92379358938221e-06, + "loss": 1.4199, + "mean_token_accuracy": 0.6526618450880051, + "num_tokens": 1790927175.0, + "step": 10686 + }, + { + "entropy": 1.7412831882635753, + "epoch": 1.1740133476147319, + "grad_norm": 0.8361808061599731, + "learning_rate": 8.922235680041284e-06, + "loss": 1.5667, + "mean_token_accuracy": 0.661175494392713, + "num_tokens": 1791071508.0, + "step": 10687 + }, + { + "entropy": 1.7346645096937816, + "epoch": 1.1741232045261047, + "grad_norm": 0.697149395942688, + "learning_rate": 8.920677836460661e-06, + "loss": 1.2796, + "mean_token_accuracy": 0.6704900513092676, + "num_tokens": 1791186740.0, + "step": 10688 + }, + { + "entropy": 1.742050697406133, + "epoch": 1.1742330614374776, + "grad_norm": 0.6318445801734924, + "learning_rate": 8.919120058689643e-06, + "loss": 1.5916, + "mean_token_accuracy": 0.6259209712346395, + "num_tokens": 1791436608.0, + "step": 10689 + }, + { + "entropy": 1.6182755033175151, + "epoch": 1.1743429183488505, + "grad_norm": 0.6061080098152161, + "learning_rate": 8.917562346777544e-06, + "loss": 1.4126, + "mean_token_accuracy": 0.6556143959363302, + "num_tokens": 1791640128.0, + "step": 10690 + }, + { + "entropy": 1.6585151453812916, + "epoch": 1.1744527752602236, + "grad_norm": 0.6770459413528442, + "learning_rate": 8.916004700773656e-06, + "loss": 1.2284, + "mean_token_accuracy": 0.6863802125056585, + "num_tokens": 1791778020.0, + "step": 10691 + }, + { + "entropy": 1.69165035088857, + "epoch": 1.1745626321715965, + "grad_norm": 0.5943127274513245, + "learning_rate": 8.914447120727278e-06, + "loss": 1.4704, + "mean_token_accuracy": 0.6474858671426773, + "num_tokens": 1792003193.0, + "step": 10692 + }, + { + "entropy": 1.6488823493321736, + "epoch": 1.1746724890829694, + "grad_norm": 0.6268026828765869, + "learning_rate": 8.912889606687713e-06, + "loss": 1.1967, + "mean_token_accuracy": 0.687493771314621, + "num_tokens": 1792114106.0, + "step": 10693 + }, + { + "entropy": 1.6368895769119263, + "epoch": 1.1747823459943423, + "grad_norm": 0.6344706416130066, + "learning_rate": 8.911332158704248e-06, + "loss": 1.4322, + "mean_token_accuracy": 0.6635664999485016, + "num_tokens": 1792283248.0, + "step": 10694 + }, + { + "entropy": 1.6913351913293202, + "epoch": 1.1748922029057154, + "grad_norm": 0.6644214987754822, + "learning_rate": 8.909774776826179e-06, + "loss": 1.4297, + "mean_token_accuracy": 0.647185837229093, + "num_tokens": 1792453230.0, + "step": 10695 + }, + { + "entropy": 1.7606900731722515, + "epoch": 1.1750020598170883, + "grad_norm": 0.6442691683769226, + "learning_rate": 8.908217461102799e-06, + "loss": 1.3776, + "mean_token_accuracy": 0.6546642581621805, + "num_tokens": 1792613034.0, + "step": 10696 + }, + { + "entropy": 1.7363630533218384, + "epoch": 1.1751119167284612, + "grad_norm": 0.8015692830085754, + "learning_rate": 8.906660211583392e-06, + "loss": 1.2139, + "mean_token_accuracy": 0.6720298528671265, + "num_tokens": 1792743828.0, + "step": 10697 + }, + { + "entropy": 1.7080905040105183, + "epoch": 1.175221773639834, + "grad_norm": 0.6090119481086731, + "learning_rate": 8.905103028317245e-06, + "loss": 1.4163, + "mean_token_accuracy": 0.6519571195046107, + "num_tokens": 1792938546.0, + "step": 10698 + }, + { + "entropy": 1.7202429076035817, + "epoch": 1.1753316305512072, + "grad_norm": 0.980828583240509, + "learning_rate": 8.903545911353648e-06, + "loss": 1.1951, + "mean_token_accuracy": 0.6983717431624731, + "num_tokens": 1793050711.0, + "step": 10699 + }, + { + "entropy": 1.6536558071772258, + "epoch": 1.17544148746258, + "grad_norm": 0.8090218305587769, + "learning_rate": 8.901988860741875e-06, + "loss": 1.4403, + "mean_token_accuracy": 0.6533640176057816, + "num_tokens": 1793218498.0, + "step": 10700 + }, + { + "entropy": 1.701753854751587, + "epoch": 1.175551344373953, + "grad_norm": 0.622660219669342, + "learning_rate": 8.900431876531205e-06, + "loss": 1.4098, + "mean_token_accuracy": 0.6528751403093338, + "num_tokens": 1793407396.0, + "step": 10701 + }, + { + "entropy": 1.743919461965561, + "epoch": 1.1756612012853258, + "grad_norm": 0.6665404438972473, + "learning_rate": 8.898874958770928e-06, + "loss": 1.3872, + "mean_token_accuracy": 0.6619693537553152, + "num_tokens": 1793570421.0, + "step": 10702 + }, + { + "entropy": 1.7298048436641693, + "epoch": 1.1757710581966987, + "grad_norm": 0.8393076658248901, + "learning_rate": 8.897318107510307e-06, + "loss": 1.5443, + "mean_token_accuracy": 0.6459775815407435, + "num_tokens": 1793738540.0, + "step": 10703 + }, + { + "entropy": 1.7660633722941081, + "epoch": 1.1758809151080718, + "grad_norm": 0.7265772223472595, + "learning_rate": 8.895761322798622e-06, + "loss": 1.471, + "mean_token_accuracy": 0.6382889002561569, + "num_tokens": 1793889951.0, + "step": 10704 + }, + { + "entropy": 1.7584838569164276, + "epoch": 1.1759907720194447, + "grad_norm": 0.6831865906715393, + "learning_rate": 8.894204604685142e-06, + "loss": 1.4035, + "mean_token_accuracy": 0.6512691229581833, + "num_tokens": 1794068010.0, + "step": 10705 + }, + { + "entropy": 1.7787149449189503, + "epoch": 1.1761006289308176, + "grad_norm": 0.8004885911941528, + "learning_rate": 8.892647953219136e-06, + "loss": 1.3326, + "mean_token_accuracy": 0.6619590371847153, + "num_tokens": 1794185919.0, + "step": 10706 + }, + { + "entropy": 1.6391962865988414, + "epoch": 1.1762104858421905, + "grad_norm": 0.6488991975784302, + "learning_rate": 8.891091368449876e-06, + "loss": 1.3778, + "mean_token_accuracy": 0.6589486648639044, + "num_tokens": 1794377843.0, + "step": 10707 + }, + { + "entropy": 1.73589222629865, + "epoch": 1.1763203427535636, + "grad_norm": 0.7107129693031311, + "learning_rate": 8.88953485042662e-06, + "loss": 1.3963, + "mean_token_accuracy": 0.661611388127009, + "num_tokens": 1794530068.0, + "step": 10708 + }, + { + "entropy": 1.691332995891571, + "epoch": 1.1764301996649364, + "grad_norm": 0.7042721509933472, + "learning_rate": 8.887978399198636e-06, + "loss": 1.2553, + "mean_token_accuracy": 0.6832821269830068, + "num_tokens": 1794666670.0, + "step": 10709 + }, + { + "entropy": 1.7346055905024211, + "epoch": 1.1765400565763093, + "grad_norm": 0.6356518268585205, + "learning_rate": 8.886422014815188e-06, + "loss": 1.3837, + "mean_token_accuracy": 0.6597516189018885, + "num_tokens": 1794846449.0, + "step": 10710 + }, + { + "entropy": 1.7051764130592346, + "epoch": 1.1766499134876822, + "grad_norm": 0.657356321811676, + "learning_rate": 8.884865697325526e-06, + "loss": 1.47, + "mean_token_accuracy": 0.6433271119991938, + "num_tokens": 1795086141.0, + "step": 10711 + }, + { + "entropy": 1.7161107162634532, + "epoch": 1.1767597703990553, + "grad_norm": 0.6534709930419922, + "learning_rate": 8.883309446778914e-06, + "loss": 1.4793, + "mean_token_accuracy": 0.6407648821671804, + "num_tokens": 1795262028.0, + "step": 10712 + }, + { + "entropy": 1.6933226088682811, + "epoch": 1.1768696273104282, + "grad_norm": 0.8174028396606445, + "learning_rate": 8.881753263224604e-06, + "loss": 1.5551, + "mean_token_accuracy": 0.6449368943770727, + "num_tokens": 1795415394.0, + "step": 10713 + }, + { + "entropy": 1.6775444547335308, + "epoch": 1.176979484221801, + "grad_norm": 0.6747733950614929, + "learning_rate": 8.880197146711846e-06, + "loss": 1.3102, + "mean_token_accuracy": 0.6573974937200546, + "num_tokens": 1795587872.0, + "step": 10714 + }, + { + "entropy": 1.641041358311971, + "epoch": 1.177089341133174, + "grad_norm": 0.8012470006942749, + "learning_rate": 8.878641097289895e-06, + "loss": 1.4218, + "mean_token_accuracy": 0.6504307389259338, + "num_tokens": 1795768734.0, + "step": 10715 + }, + { + "entropy": 1.740955690542857, + "epoch": 1.1771991980445469, + "grad_norm": 0.6754148006439209, + "learning_rate": 8.877085115008e-06, + "loss": 1.4466, + "mean_token_accuracy": 0.6471899896860123, + "num_tokens": 1795996496.0, + "step": 10716 + }, + { + "entropy": 1.6761998136838276, + "epoch": 1.17730905495592, + "grad_norm": 0.6564405560493469, + "learning_rate": 8.875529199915403e-06, + "loss": 1.4154, + "mean_token_accuracy": 0.6573975533246994, + "num_tokens": 1796179868.0, + "step": 10717 + }, + { + "entropy": 1.6646329561869304, + "epoch": 1.1774189118672929, + "grad_norm": 0.6660974025726318, + "learning_rate": 8.873973352061346e-06, + "loss": 1.3027, + "mean_token_accuracy": 0.6660636613766352, + "num_tokens": 1796375561.0, + "step": 10718 + }, + { + "entropy": 1.6921138167381287, + "epoch": 1.1775287687786657, + "grad_norm": 0.6286952495574951, + "learning_rate": 8.87241757149508e-06, + "loss": 1.3583, + "mean_token_accuracy": 0.6696690519650778, + "num_tokens": 1796529528.0, + "step": 10719 + }, + { + "entropy": 1.627532919247945, + "epoch": 1.1776386256900386, + "grad_norm": 2.4623351097106934, + "learning_rate": 8.870861858265836e-06, + "loss": 1.0272, + "mean_token_accuracy": 0.6974131315946579, + "num_tokens": 1796667856.0, + "step": 10720 + }, + { + "entropy": 1.7433740397294362, + "epoch": 1.1777484826014117, + "grad_norm": 0.7270897626876831, + "learning_rate": 8.869306212422852e-06, + "loss": 1.3554, + "mean_token_accuracy": 0.6652982632319132, + "num_tokens": 1796814869.0, + "step": 10721 + }, + { + "entropy": 1.6887870232264202, + "epoch": 1.1778583395127846, + "grad_norm": 0.6307252049446106, + "learning_rate": 8.867750634015372e-06, + "loss": 1.3855, + "mean_token_accuracy": 0.6616760591665903, + "num_tokens": 1797013937.0, + "step": 10722 + }, + { + "entropy": 1.7013648450374603, + "epoch": 1.1779681964241575, + "grad_norm": 0.664087176322937, + "learning_rate": 8.86619512309262e-06, + "loss": 1.3015, + "mean_token_accuracy": 0.6622590919335684, + "num_tokens": 1797176520.0, + "step": 10723 + }, + { + "entropy": 1.758970280488332, + "epoch": 1.1780780533355304, + "grad_norm": 0.709904670715332, + "learning_rate": 8.864639679703833e-06, + "loss": 1.4653, + "mean_token_accuracy": 0.6567131032546362, + "num_tokens": 1797330029.0, + "step": 10724 + }, + { + "entropy": 1.7270347674687703, + "epoch": 1.1781879102469035, + "grad_norm": 0.6952686905860901, + "learning_rate": 8.863084303898238e-06, + "loss": 1.4274, + "mean_token_accuracy": 0.6522839615742365, + "num_tokens": 1797485981.0, + "step": 10725 + }, + { + "entropy": 1.6494085093339284, + "epoch": 1.1782977671582764, + "grad_norm": 0.8430054783821106, + "learning_rate": 8.86152899572506e-06, + "loss": 1.3444, + "mean_token_accuracy": 0.6608125517765681, + "num_tokens": 1797666072.0, + "step": 10726 + }, + { + "entropy": 1.6977934141953785, + "epoch": 1.1784076240696493, + "grad_norm": 0.7214722633361816, + "learning_rate": 8.859973755233525e-06, + "loss": 1.4748, + "mean_token_accuracy": 0.648172547419866, + "num_tokens": 1797856352.0, + "step": 10727 + }, + { + "entropy": 1.6916989386081696, + "epoch": 1.1785174809810222, + "grad_norm": 0.7626371383666992, + "learning_rate": 8.858418582472859e-06, + "loss": 1.3687, + "mean_token_accuracy": 0.651703084508578, + "num_tokens": 1797979260.0, + "step": 10728 + }, + { + "entropy": 1.7365763584772747, + "epoch": 1.178627337892395, + "grad_norm": 0.7373912334442139, + "learning_rate": 8.856863477492276e-06, + "loss": 1.3676, + "mean_token_accuracy": 0.6566950579484304, + "num_tokens": 1798131140.0, + "step": 10729 + }, + { + "entropy": 1.6413574417432149, + "epoch": 1.1787371948037682, + "grad_norm": 0.7868739366531372, + "learning_rate": 8.855308440341001e-06, + "loss": 1.213, + "mean_token_accuracy": 0.6854518900314966, + "num_tokens": 1798266166.0, + "step": 10730 + }, + { + "entropy": 1.6832148929437, + "epoch": 1.178847051715141, + "grad_norm": 0.6691809892654419, + "learning_rate": 8.853753471068249e-06, + "loss": 1.2974, + "mean_token_accuracy": 0.6681927392880121, + "num_tokens": 1798393542.0, + "step": 10731 + }, + { + "entropy": 1.6947985390822093, + "epoch": 1.178956908626514, + "grad_norm": 0.6206928491592407, + "learning_rate": 8.852198569723231e-06, + "loss": 1.3536, + "mean_token_accuracy": 0.6597307672103246, + "num_tokens": 1798562655.0, + "step": 10732 + }, + { + "entropy": 1.6786811153093975, + "epoch": 1.1790667655378868, + "grad_norm": 0.6354871988296509, + "learning_rate": 8.850643736355157e-06, + "loss": 1.3047, + "mean_token_accuracy": 0.6605040381352106, + "num_tokens": 1798727276.0, + "step": 10733 + }, + { + "entropy": 1.6901488800843556, + "epoch": 1.17917662244926, + "grad_norm": 0.6511650681495667, + "learning_rate": 8.849088971013246e-06, + "loss": 1.3516, + "mean_token_accuracy": 0.6556326846281687, + "num_tokens": 1798878099.0, + "step": 10734 + }, + { + "entropy": 1.7377649943033855, + "epoch": 1.1792864793606328, + "grad_norm": 0.7267980575561523, + "learning_rate": 8.847534273746696e-06, + "loss": 1.515, + "mean_token_accuracy": 0.6588891347249349, + "num_tokens": 1799033920.0, + "step": 10735 + }, + { + "entropy": 1.6604502499103546, + "epoch": 1.1793963362720057, + "grad_norm": 0.6588174700737, + "learning_rate": 8.845979644604716e-06, + "loss": 1.4775, + "mean_token_accuracy": 0.6402100125948588, + "num_tokens": 1799277414.0, + "step": 10736 + }, + { + "entropy": 1.7129139800866444, + "epoch": 1.1795061931833786, + "grad_norm": 0.7839108109474182, + "learning_rate": 8.844425083636514e-06, + "loss": 1.3845, + "mean_token_accuracy": 0.6433479189872742, + "num_tokens": 1799462174.0, + "step": 10737 + }, + { + "entropy": 1.6579439043998718, + "epoch": 1.1796160500947517, + "grad_norm": 0.7659602761268616, + "learning_rate": 8.842870590891284e-06, + "loss": 1.3638, + "mean_token_accuracy": 0.6679676622152328, + "num_tokens": 1799659246.0, + "step": 10738 + }, + { + "entropy": 1.7055364549160004, + "epoch": 1.1797259070061246, + "grad_norm": 0.7582058906555176, + "learning_rate": 8.841316166418225e-06, + "loss": 1.4981, + "mean_token_accuracy": 0.6421041041612625, + "num_tokens": 1799856918.0, + "step": 10739 + }, + { + "entropy": 1.6829807460308075, + "epoch": 1.1798357639174974, + "grad_norm": 0.6783363223075867, + "learning_rate": 8.83976181026654e-06, + "loss": 1.3274, + "mean_token_accuracy": 0.6636816610892614, + "num_tokens": 1800016390.0, + "step": 10740 + }, + { + "entropy": 1.684312105178833, + "epoch": 1.1799456208288703, + "grad_norm": 0.7491908669471741, + "learning_rate": 8.83820752248542e-06, + "loss": 1.3397, + "mean_token_accuracy": 0.6631839076677958, + "num_tokens": 1800167294.0, + "step": 10741 + }, + { + "entropy": 1.739410251379013, + "epoch": 1.1800554777402432, + "grad_norm": 0.7408508062362671, + "learning_rate": 8.836653303124057e-06, + "loss": 1.2769, + "mean_token_accuracy": 0.6693633794784546, + "num_tokens": 1800278529.0, + "step": 10742 + }, + { + "entropy": 1.6683301428953807, + "epoch": 1.1801653346516163, + "grad_norm": 0.7159736752510071, + "learning_rate": 8.835099152231645e-06, + "loss": 1.5041, + "mean_token_accuracy": 0.6455606669187546, + "num_tokens": 1800503623.0, + "step": 10743 + }, + { + "entropy": 1.6624947686990101, + "epoch": 1.1802751915629892, + "grad_norm": 0.6846541166305542, + "learning_rate": 8.833545069857366e-06, + "loss": 1.3794, + "mean_token_accuracy": 0.6705669413010279, + "num_tokens": 1800684862.0, + "step": 10744 + }, + { + "entropy": 1.702225963274638, + "epoch": 1.180385048474362, + "grad_norm": 0.699865460395813, + "learning_rate": 8.831991056050408e-06, + "loss": 1.5913, + "mean_token_accuracy": 0.6365682830413183, + "num_tokens": 1800903631.0, + "step": 10745 + }, + { + "entropy": 1.6989426116148632, + "epoch": 1.1804949053857352, + "grad_norm": 0.6623237729072571, + "learning_rate": 8.830437110859959e-06, + "loss": 1.5188, + "mean_token_accuracy": 0.6346626182397207, + "num_tokens": 1801123618.0, + "step": 10746 + }, + { + "entropy": 1.7860455016295116, + "epoch": 1.180604762297108, + "grad_norm": 0.6535719633102417, + "learning_rate": 8.828883234335197e-06, + "loss": 1.3937, + "mean_token_accuracy": 0.6469403405984243, + "num_tokens": 1801317371.0, + "step": 10747 + }, + { + "entropy": 1.6487139264742534, + "epoch": 1.180714619208481, + "grad_norm": 0.6619005799293518, + "learning_rate": 8.827329426525301e-06, + "loss": 1.3683, + "mean_token_accuracy": 0.6584922273953756, + "num_tokens": 1801487160.0, + "step": 10748 + }, + { + "entropy": 1.7167851825555165, + "epoch": 1.1808244761198539, + "grad_norm": 0.9309948086738586, + "learning_rate": 8.825775687479454e-06, + "loss": 1.3891, + "mean_token_accuracy": 0.6533536563316981, + "num_tokens": 1801615083.0, + "step": 10749 + }, + { + "entropy": 1.7493426501750946, + "epoch": 1.1809343330312267, + "grad_norm": 0.8402960300445557, + "learning_rate": 8.824222017246824e-06, + "loss": 1.3704, + "mean_token_accuracy": 0.6674526085456213, + "num_tokens": 1801759170.0, + "step": 10750 + }, + { + "entropy": 1.6924077570438385, + "epoch": 1.1810441899425999, + "grad_norm": 0.6291844248771667, + "learning_rate": 8.822668415876582e-06, + "loss": 1.5256, + "mean_token_accuracy": 0.6429929981629053, + "num_tokens": 1801944217.0, + "step": 10751 + }, + { + "entropy": 1.7071336607138317, + "epoch": 1.1811540468539727, + "grad_norm": 0.7020394802093506, + "learning_rate": 8.821114883417909e-06, + "loss": 1.3756, + "mean_token_accuracy": 0.6600728432337443, + "num_tokens": 1802074925.0, + "step": 10752 + }, + { + "entropy": 1.664092222849528, + "epoch": 1.1812639037653456, + "grad_norm": 0.7180442810058594, + "learning_rate": 8.81956141991997e-06, + "loss": 1.2898, + "mean_token_accuracy": 0.6710790693759918, + "num_tokens": 1802243459.0, + "step": 10753 + }, + { + "entropy": 1.6833447615305583, + "epoch": 1.1813737606767185, + "grad_norm": 0.6144715547561646, + "learning_rate": 8.818008025431925e-06, + "loss": 1.434, + "mean_token_accuracy": 0.6551510939995447, + "num_tokens": 1802442490.0, + "step": 10754 + }, + { + "entropy": 1.7438991864522297, + "epoch": 1.1814836175880914, + "grad_norm": 0.755179226398468, + "learning_rate": 8.816454700002946e-06, + "loss": 1.343, + "mean_token_accuracy": 0.6712101946274439, + "num_tokens": 1802583232.0, + "step": 10755 + }, + { + "entropy": 1.6965516308943431, + "epoch": 1.1815934744994645, + "grad_norm": 0.7071336507797241, + "learning_rate": 8.814901443682189e-06, + "loss": 1.4545, + "mean_token_accuracy": 0.6644222984711329, + "num_tokens": 1802775291.0, + "step": 10756 + }, + { + "entropy": 1.8494134942690532, + "epoch": 1.1817033314108374, + "grad_norm": 0.7613623142242432, + "learning_rate": 8.813348256518816e-06, + "loss": 1.489, + "mean_token_accuracy": 0.6528632789850235, + "num_tokens": 1802943235.0, + "step": 10757 + }, + { + "entropy": 1.7241133948167164, + "epoch": 1.1818131883222103, + "grad_norm": 0.6806331276893616, + "learning_rate": 8.811795138561989e-06, + "loss": 1.3547, + "mean_token_accuracy": 0.653764029343923, + "num_tokens": 1803086552.0, + "step": 10758 + }, + { + "entropy": 1.6509768664836884, + "epoch": 1.1819230452335834, + "grad_norm": 0.7953295111656189, + "learning_rate": 8.810242089860857e-06, + "loss": 1.4753, + "mean_token_accuracy": 0.6489654282728831, + "num_tokens": 1803273180.0, + "step": 10759 + }, + { + "entropy": 1.701873242855072, + "epoch": 1.1820329021449563, + "grad_norm": 0.701553463935852, + "learning_rate": 8.808689110464576e-06, + "loss": 1.3899, + "mean_token_accuracy": 0.6529113153616587, + "num_tokens": 1803435603.0, + "step": 10760 + }, + { + "entropy": 1.7141969501972198, + "epoch": 1.1821427590563292, + "grad_norm": 0.7244220972061157, + "learning_rate": 8.807136200422301e-06, + "loss": 1.5109, + "mean_token_accuracy": 0.6525500317414602, + "num_tokens": 1803580415.0, + "step": 10761 + }, + { + "entropy": 1.6993577778339386, + "epoch": 1.182252615967702, + "grad_norm": 0.6596866250038147, + "learning_rate": 8.805583359783175e-06, + "loss": 1.4599, + "mean_token_accuracy": 0.6497125774621964, + "num_tokens": 1803719774.0, + "step": 10762 + }, + { + "entropy": 1.7196275393168132, + "epoch": 1.182362472879075, + "grad_norm": 0.5810356736183167, + "learning_rate": 8.804030588596344e-06, + "loss": 1.5008, + "mean_token_accuracy": 0.6446505437294642, + "num_tokens": 1803972288.0, + "step": 10763 + }, + { + "entropy": 1.695581078529358, + "epoch": 1.182472329790448, + "grad_norm": 0.6525010466575623, + "learning_rate": 8.802477886910958e-06, + "loss": 1.3401, + "mean_token_accuracy": 0.6595296412706375, + "num_tokens": 1804124653.0, + "step": 10764 + }, + { + "entropy": 1.7033733328183491, + "epoch": 1.182582186701821, + "grad_norm": 0.7598459124565125, + "learning_rate": 8.800925254776158e-06, + "loss": 1.3434, + "mean_token_accuracy": 0.6662160108486811, + "num_tokens": 1804257013.0, + "step": 10765 + }, + { + "entropy": 1.6759057243665059, + "epoch": 1.1826920436131938, + "grad_norm": 0.6761953234672546, + "learning_rate": 8.799372692241082e-06, + "loss": 1.428, + "mean_token_accuracy": 0.669055625796318, + "num_tokens": 1804452630.0, + "step": 10766 + }, + { + "entropy": 1.7233761151631672, + "epoch": 1.1828019005245667, + "grad_norm": 0.6960268616676331, + "learning_rate": 8.797820199354868e-06, + "loss": 1.5065, + "mean_token_accuracy": 0.6348318805297216, + "num_tokens": 1804614525.0, + "step": 10767 + }, + { + "entropy": 1.685812105735143, + "epoch": 1.1829117574359396, + "grad_norm": 0.7641476988792419, + "learning_rate": 8.796267776166651e-06, + "loss": 1.5683, + "mean_token_accuracy": 0.6436462799708048, + "num_tokens": 1804786476.0, + "step": 10768 + }, + { + "entropy": 1.6836872696876526, + "epoch": 1.1830216143473127, + "grad_norm": 0.5971675515174866, + "learning_rate": 8.794715422725569e-06, + "loss": 1.52, + "mean_token_accuracy": 0.6463221857945124, + "num_tokens": 1805022456.0, + "step": 10769 + }, + { + "entropy": 1.7146795690059662, + "epoch": 1.1831314712586856, + "grad_norm": 0.781304657459259, + "learning_rate": 8.793163139080744e-06, + "loss": 1.5337, + "mean_token_accuracy": 0.6382714013258616, + "num_tokens": 1805203477.0, + "step": 10770 + }, + { + "entropy": 1.641987790664037, + "epoch": 1.1832413281700584, + "grad_norm": 0.7032956480979919, + "learning_rate": 8.791610925281315e-06, + "loss": 1.3017, + "mean_token_accuracy": 0.6680291642745336, + "num_tokens": 1805370746.0, + "step": 10771 + }, + { + "entropy": 1.765711506207784, + "epoch": 1.1833511850814316, + "grad_norm": 0.9066851735115051, + "learning_rate": 8.790058781376409e-06, + "loss": 1.4206, + "mean_token_accuracy": 0.6616054326295853, + "num_tokens": 1805496309.0, + "step": 10772 + }, + { + "entropy": 1.7519434293111165, + "epoch": 1.1834610419928044, + "grad_norm": 0.8009188175201416, + "learning_rate": 8.788506707415143e-06, + "loss": 1.4096, + "mean_token_accuracy": 0.6672770380973816, + "num_tokens": 1805641288.0, + "step": 10773 + }, + { + "entropy": 1.768102914094925, + "epoch": 1.1835708989041773, + "grad_norm": 0.6128711700439453, + "learning_rate": 8.786954703446643e-06, + "loss": 1.5932, + "mean_token_accuracy": 0.6311574280261993, + "num_tokens": 1805848452.0, + "step": 10774 + }, + { + "entropy": 1.7200669348239899, + "epoch": 1.1836807558155502, + "grad_norm": 0.674370527267456, + "learning_rate": 8.78540276952003e-06, + "loss": 1.3235, + "mean_token_accuracy": 0.6750156929095587, + "num_tokens": 1805980538.0, + "step": 10775 + }, + { + "entropy": 1.6917518973350525, + "epoch": 1.183790612726923, + "grad_norm": 0.6382037997245789, + "learning_rate": 8.78385090568442e-06, + "loss": 1.4685, + "mean_token_accuracy": 0.6526532918214798, + "num_tokens": 1806141214.0, + "step": 10776 + }, + { + "entropy": 1.755267471075058, + "epoch": 1.1839004696382962, + "grad_norm": 0.7073934078216553, + "learning_rate": 8.78229911198893e-06, + "loss": 1.2182, + "mean_token_accuracy": 0.6788963029781977, + "num_tokens": 1806280417.0, + "step": 10777 + }, + { + "entropy": 1.6847423215707142, + "epoch": 1.184010326549669, + "grad_norm": 0.7584076523780823, + "learning_rate": 8.780747388482678e-06, + "loss": 1.2184, + "mean_token_accuracy": 0.6779392212629318, + "num_tokens": 1806421411.0, + "step": 10778 + }, + { + "entropy": 1.6994816462198894, + "epoch": 1.184120183461042, + "grad_norm": 0.6640441417694092, + "learning_rate": 8.779195735214768e-06, + "loss": 1.3675, + "mean_token_accuracy": 0.6564560582240423, + "num_tokens": 1806579038.0, + "step": 10779 + }, + { + "entropy": 1.7090040544668834, + "epoch": 1.1842300403724149, + "grad_norm": 0.7332303524017334, + "learning_rate": 8.777644152234312e-06, + "loss": 1.2549, + "mean_token_accuracy": 0.6839319815238317, + "num_tokens": 1806722045.0, + "step": 10780 + }, + { + "entropy": 1.7189118365446727, + "epoch": 1.1843398972837877, + "grad_norm": 0.6345376372337341, + "learning_rate": 8.776092639590418e-06, + "loss": 1.4359, + "mean_token_accuracy": 0.659914493560791, + "num_tokens": 1806887963.0, + "step": 10781 + }, + { + "entropy": 1.7617081105709076, + "epoch": 1.1844497541951609, + "grad_norm": 0.8099861741065979, + "learning_rate": 8.77454119733219e-06, + "loss": 1.4132, + "mean_token_accuracy": 0.6468125134706497, + "num_tokens": 1807042559.0, + "step": 10782 + }, + { + "entropy": 1.7302567660808563, + "epoch": 1.1845596111065337, + "grad_norm": 0.8026572465896606, + "learning_rate": 8.77298982550873e-06, + "loss": 1.5754, + "mean_token_accuracy": 0.6293011705080668, + "num_tokens": 1807278669.0, + "step": 10783 + }, + { + "entropy": 1.7094827393690746, + "epoch": 1.1846694680179066, + "grad_norm": 0.6681255102157593, + "learning_rate": 8.771438524169137e-06, + "loss": 1.3552, + "mean_token_accuracy": 0.6640477081139883, + "num_tokens": 1807442397.0, + "step": 10784 + }, + { + "entropy": 1.7145535846551259, + "epoch": 1.1847793249292797, + "grad_norm": 21.605440139770508, + "learning_rate": 8.769887293362514e-06, + "loss": 1.4412, + "mean_token_accuracy": 0.6432745158672333, + "num_tokens": 1807645099.0, + "step": 10785 + }, + { + "entropy": 1.7830155591169994, + "epoch": 1.1848891818406526, + "grad_norm": 0.6564657092094421, + "learning_rate": 8.768336133137949e-06, + "loss": 1.4377, + "mean_token_accuracy": 0.6420264492432276, + "num_tokens": 1807780953.0, + "step": 10786 + }, + { + "entropy": 1.6700923939545949, + "epoch": 1.1849990387520255, + "grad_norm": 0.700512707233429, + "learning_rate": 8.766785043544544e-06, + "loss": 1.4123, + "mean_token_accuracy": 0.6515941818555196, + "num_tokens": 1807943402.0, + "step": 10787 + }, + { + "entropy": 1.671025017897288, + "epoch": 1.1851088956633984, + "grad_norm": 0.6476449370384216, + "learning_rate": 8.765234024631381e-06, + "loss": 1.3315, + "mean_token_accuracy": 0.6557556490103403, + "num_tokens": 1808108811.0, + "step": 10788 + }, + { + "entropy": 1.7127976814905803, + "epoch": 1.1852187525747713, + "grad_norm": 0.542065441608429, + "learning_rate": 8.763683076447558e-06, + "loss": 1.5378, + "mean_token_accuracy": 0.6265371342500051, + "num_tokens": 1808343132.0, + "step": 10789 + }, + { + "entropy": 1.7127373119195302, + "epoch": 1.1853286094861444, + "grad_norm": 0.7368000745773315, + "learning_rate": 8.762132199042158e-06, + "loss": 1.3349, + "mean_token_accuracy": 0.6565342048803965, + "num_tokens": 1808552364.0, + "step": 10790 + }, + { + "entropy": 1.7234003643194835, + "epoch": 1.1854384663975173, + "grad_norm": 0.733325719833374, + "learning_rate": 8.760581392464265e-06, + "loss": 1.479, + "mean_token_accuracy": 0.6574785908063253, + "num_tokens": 1808736493.0, + "step": 10791 + }, + { + "entropy": 1.7298449873924255, + "epoch": 1.1855483233088902, + "grad_norm": 0.6609643697738647, + "learning_rate": 8.759030656762961e-06, + "loss": 1.5159, + "mean_token_accuracy": 0.6325143476327261, + "num_tokens": 1808941867.0, + "step": 10792 + }, + { + "entropy": 1.7493693828582764, + "epoch": 1.185658180220263, + "grad_norm": 0.7295409440994263, + "learning_rate": 8.757479991987328e-06, + "loss": 1.542, + "mean_token_accuracy": 0.6383609374364217, + "num_tokens": 1809188308.0, + "step": 10793 + }, + { + "entropy": 1.7492066224416096, + "epoch": 1.185768037131636, + "grad_norm": 0.7301694750785828, + "learning_rate": 8.755929398186441e-06, + "loss": 1.5574, + "mean_token_accuracy": 0.6535097360610962, + "num_tokens": 1809380493.0, + "step": 10794 + }, + { + "entropy": 1.6621710260709126, + "epoch": 1.185877894043009, + "grad_norm": 0.7311023473739624, + "learning_rate": 8.754378875409378e-06, + "loss": 1.4412, + "mean_token_accuracy": 0.6498973866303762, + "num_tokens": 1809561584.0, + "step": 10795 + }, + { + "entropy": 1.6603956421216328, + "epoch": 1.185987750954382, + "grad_norm": 0.6298139691352844, + "learning_rate": 8.752828423705213e-06, + "loss": 1.3381, + "mean_token_accuracy": 0.6642551869153976, + "num_tokens": 1809753841.0, + "step": 10796 + }, + { + "entropy": 1.7171143392721813, + "epoch": 1.1860976078657548, + "grad_norm": 0.635201096534729, + "learning_rate": 8.751278043123015e-06, + "loss": 1.4912, + "mean_token_accuracy": 0.6408715645472208, + "num_tokens": 1810007570.0, + "step": 10797 + }, + { + "entropy": 1.6833914419015248, + "epoch": 1.186207464777128, + "grad_norm": 0.6429863572120667, + "learning_rate": 8.749727733711852e-06, + "loss": 1.4519, + "mean_token_accuracy": 0.6536713739236196, + "num_tokens": 1810172296.0, + "step": 10798 + }, + { + "entropy": 1.6702220439910889, + "epoch": 1.1863173216885008, + "grad_norm": 0.5836479663848877, + "learning_rate": 8.748177495520795e-06, + "loss": 1.3426, + "mean_token_accuracy": 0.6664466510216395, + "num_tokens": 1810364719.0, + "step": 10799 + }, + { + "entropy": 1.6623725195725758, + "epoch": 1.1864271785998737, + "grad_norm": 0.7200176119804382, + "learning_rate": 8.746627328598903e-06, + "loss": 1.3376, + "mean_token_accuracy": 0.6745273669560751, + "num_tokens": 1810517478.0, + "step": 10800 + }, + { + "entropy": 1.7162803411483765, + "epoch": 1.1865370355112466, + "grad_norm": 0.7739757895469666, + "learning_rate": 8.74507723299524e-06, + "loss": 1.4352, + "mean_token_accuracy": 0.636848971247673, + "num_tokens": 1810687094.0, + "step": 10801 + }, + { + "entropy": 1.7077897389729817, + "epoch": 1.1866468924226194, + "grad_norm": 0.9581501483917236, + "learning_rate": 8.74352720875887e-06, + "loss": 1.3625, + "mean_token_accuracy": 0.6712329884370168, + "num_tokens": 1810838305.0, + "step": 10802 + }, + { + "entropy": 1.7443882822990417, + "epoch": 1.1867567493339926, + "grad_norm": 0.5825392007827759, + "learning_rate": 8.741977255938848e-06, + "loss": 1.4246, + "mean_token_accuracy": 0.6382510860761007, + "num_tokens": 1811036111.0, + "step": 10803 + }, + { + "entropy": 1.7159675359725952, + "epoch": 1.1868666062453654, + "grad_norm": 0.6169284582138062, + "learning_rate": 8.740427374584225e-06, + "loss": 1.353, + "mean_token_accuracy": 0.650434414545695, + "num_tokens": 1811216805.0, + "step": 10804 + }, + { + "entropy": 1.7108531892299652, + "epoch": 1.1869764631567383, + "grad_norm": 0.7137644290924072, + "learning_rate": 8.73887756474406e-06, + "loss": 1.3657, + "mean_token_accuracy": 0.6566335658232371, + "num_tokens": 1811341956.0, + "step": 10805 + }, + { + "entropy": 1.735455960035324, + "epoch": 1.1870863200681112, + "grad_norm": 0.5706676840782166, + "learning_rate": 8.7373278264674e-06, + "loss": 1.4739, + "mean_token_accuracy": 0.6481334368387858, + "num_tokens": 1811539451.0, + "step": 10806 + }, + { + "entropy": 1.7320310175418854, + "epoch": 1.1871961769794843, + "grad_norm": 0.6939385533332825, + "learning_rate": 8.735778159803289e-06, + "loss": 1.3383, + "mean_token_accuracy": 0.6659232576688131, + "num_tokens": 1811690465.0, + "step": 10807 + }, + { + "entropy": 1.6345330973466237, + "epoch": 1.1873060338908572, + "grad_norm": 0.6689730286598206, + "learning_rate": 8.734228564800787e-06, + "loss": 1.2998, + "mean_token_accuracy": 0.6712810496489207, + "num_tokens": 1811851641.0, + "step": 10808 + }, + { + "entropy": 1.7186749478181202, + "epoch": 1.18741589080223, + "grad_norm": 0.6938754916191101, + "learning_rate": 8.732679041508927e-06, + "loss": 1.3595, + "mean_token_accuracy": 0.6612470696369807, + "num_tokens": 1812004102.0, + "step": 10809 + }, + { + "entropy": 1.738366852204005, + "epoch": 1.187525747713603, + "grad_norm": 0.6082279682159424, + "learning_rate": 8.731129589976752e-06, + "loss": 1.3528, + "mean_token_accuracy": 0.661902000506719, + "num_tokens": 1812140283.0, + "step": 10810 + }, + { + "entropy": 1.73800332347552, + "epoch": 1.187635604624976, + "grad_norm": 0.7404204607009888, + "learning_rate": 8.729580210253307e-06, + "loss": 1.474, + "mean_token_accuracy": 0.6457099169492722, + "num_tokens": 1812288672.0, + "step": 10811 + }, + { + "entropy": 1.7052789727846782, + "epoch": 1.187745461536349, + "grad_norm": 0.6835205554962158, + "learning_rate": 8.728030902387623e-06, + "loss": 1.4069, + "mean_token_accuracy": 0.661319280664126, + "num_tokens": 1812489937.0, + "step": 10812 + }, + { + "entropy": 1.6847817699114482, + "epoch": 1.1878553184477219, + "grad_norm": 0.6975307464599609, + "learning_rate": 8.726481666428735e-06, + "loss": 1.5141, + "mean_token_accuracy": 0.6451181322336197, + "num_tokens": 1812717330.0, + "step": 10813 + }, + { + "entropy": 1.7484122415383656, + "epoch": 1.1879651753590947, + "grad_norm": 0.8225982189178467, + "learning_rate": 8.724932502425681e-06, + "loss": 1.3702, + "mean_token_accuracy": 0.6497304985920588, + "num_tokens": 1812917091.0, + "step": 10814 + }, + { + "entropy": 1.6711904605229695, + "epoch": 1.1880750322704676, + "grad_norm": 0.5482514500617981, + "learning_rate": 8.723383410427486e-06, + "loss": 1.4879, + "mean_token_accuracy": 0.6315444807211558, + "num_tokens": 1813125811.0, + "step": 10815 + }, + { + "entropy": 1.7230869730313618, + "epoch": 1.1881848891818407, + "grad_norm": 0.818645179271698, + "learning_rate": 8.721834390483181e-06, + "loss": 1.4077, + "mean_token_accuracy": 0.6621546596288681, + "num_tokens": 1813307367.0, + "step": 10816 + }, + { + "entropy": 1.682017187277476, + "epoch": 1.1882947460932136, + "grad_norm": 0.6782887578010559, + "learning_rate": 8.720285442641794e-06, + "loss": 1.5252, + "mean_token_accuracy": 0.6403040736913681, + "num_tokens": 1813517516.0, + "step": 10817 + }, + { + "entropy": 1.6661075949668884, + "epoch": 1.1884046030045865, + "grad_norm": 0.6994887590408325, + "learning_rate": 8.718736566952342e-06, + "loss": 1.3352, + "mean_token_accuracy": 0.6600988954305649, + "num_tokens": 1813674638.0, + "step": 10818 + }, + { + "entropy": 1.7056255837281544, + "epoch": 1.1885144599159594, + "grad_norm": 0.6169335246086121, + "learning_rate": 8.717187763463848e-06, + "loss": 1.4025, + "mean_token_accuracy": 0.6571420232454935, + "num_tokens": 1813822167.0, + "step": 10819 + }, + { + "entropy": 1.709171086549759, + "epoch": 1.1886243168273325, + "grad_norm": 0.6775344610214233, + "learning_rate": 8.715639032225338e-06, + "loss": 1.4238, + "mean_token_accuracy": 0.6446866790453593, + "num_tokens": 1813991064.0, + "step": 10820 + }, + { + "entropy": 1.6793767909208934, + "epoch": 1.1887341737387054, + "grad_norm": 0.791778564453125, + "learning_rate": 8.71409037328582e-06, + "loss": 1.489, + "mean_token_accuracy": 0.6477701465288798, + "num_tokens": 1814176897.0, + "step": 10821 + }, + { + "entropy": 1.7169764240582783, + "epoch": 1.1888440306500783, + "grad_norm": 0.6778224110603333, + "learning_rate": 8.71254178669431e-06, + "loss": 1.5503, + "mean_token_accuracy": 0.6353256702423096, + "num_tokens": 1814358467.0, + "step": 10822 + }, + { + "entropy": 1.6720323065916698, + "epoch": 1.1889538875614512, + "grad_norm": 0.6832537055015564, + "learning_rate": 8.710993272499826e-06, + "loss": 1.2303, + "mean_token_accuracy": 0.677433043718338, + "num_tokens": 1814480540.0, + "step": 10823 + }, + { + "entropy": 1.7010047535101573, + "epoch": 1.1890637444728243, + "grad_norm": 0.8217154145240784, + "learning_rate": 8.70944483075137e-06, + "loss": 1.338, + "mean_token_accuracy": 0.6667589843273163, + "num_tokens": 1814617055.0, + "step": 10824 + }, + { + "entropy": 1.7079274654388428, + "epoch": 1.1891736013841971, + "grad_norm": 0.8178585767745972, + "learning_rate": 8.707896461497957e-06, + "loss": 1.3209, + "mean_token_accuracy": 0.6651990612347921, + "num_tokens": 1814759656.0, + "step": 10825 + }, + { + "entropy": 1.690351406733195, + "epoch": 1.18928345829557, + "grad_norm": 0.6807016134262085, + "learning_rate": 8.706348164788582e-06, + "loss": 1.4074, + "mean_token_accuracy": 0.6610402117172877, + "num_tokens": 1814904145.0, + "step": 10826 + }, + { + "entropy": 1.7773006558418274, + "epoch": 1.189393315206943, + "grad_norm": 0.8337060213088989, + "learning_rate": 8.704799940672257e-06, + "loss": 1.3194, + "mean_token_accuracy": 0.668373758594195, + "num_tokens": 1815086239.0, + "step": 10827 + }, + { + "entropy": 1.6671875913937886, + "epoch": 1.1895031721183158, + "grad_norm": 0.7558709383010864, + "learning_rate": 8.703251789197981e-06, + "loss": 1.4228, + "mean_token_accuracy": 0.6599519302447637, + "num_tokens": 1815233304.0, + "step": 10828 + }, + { + "entropy": 1.6764814754327138, + "epoch": 1.189613029029689, + "grad_norm": 0.648366391658783, + "learning_rate": 8.701703710414752e-06, + "loss": 1.2463, + "mean_token_accuracy": 0.6833883871634802, + "num_tokens": 1815365343.0, + "step": 10829 + }, + { + "entropy": 1.6759169201056163, + "epoch": 1.1897228859410618, + "grad_norm": 0.7814769744873047, + "learning_rate": 8.700155704371562e-06, + "loss": 1.4332, + "mean_token_accuracy": 0.6664823815226555, + "num_tokens": 1815511637.0, + "step": 10830 + }, + { + "entropy": 1.7373320559660594, + "epoch": 1.1898327428524347, + "grad_norm": 0.8521638512611389, + "learning_rate": 8.698607771117408e-06, + "loss": 1.4448, + "mean_token_accuracy": 0.652740036447843, + "num_tokens": 1815650747.0, + "step": 10831 + }, + { + "entropy": 1.686434547106425, + "epoch": 1.1899425997638076, + "grad_norm": 0.707066535949707, + "learning_rate": 8.697059910701283e-06, + "loss": 1.1549, + "mean_token_accuracy": 0.6953272720177969, + "num_tokens": 1815758439.0, + "step": 10832 + }, + { + "entropy": 1.7592855592568715, + "epoch": 1.1900524566751807, + "grad_norm": 0.6283326745033264, + "learning_rate": 8.69551212317217e-06, + "loss": 1.3828, + "mean_token_accuracy": 0.6532629181941351, + "num_tokens": 1815916712.0, + "step": 10833 + }, + { + "entropy": 1.6812595228354137, + "epoch": 1.1901623135865536, + "grad_norm": 0.887874960899353, + "learning_rate": 8.693964408579063e-06, + "loss": 1.3895, + "mean_token_accuracy": 0.6594204902648926, + "num_tokens": 1816075205.0, + "step": 10834 + }, + { + "entropy": 1.6642636756102245, + "epoch": 1.1902721704979264, + "grad_norm": 0.6853379011154175, + "learning_rate": 8.692416766970943e-06, + "loss": 1.3377, + "mean_token_accuracy": 0.6647604952255884, + "num_tokens": 1816224025.0, + "step": 10835 + }, + { + "entropy": 1.7936367491881053, + "epoch": 1.1903820274092993, + "grad_norm": 0.7250938415527344, + "learning_rate": 8.690869198396792e-06, + "loss": 1.4598, + "mean_token_accuracy": 0.6410937756299973, + "num_tokens": 1816370800.0, + "step": 10836 + }, + { + "entropy": 1.7596316039562225, + "epoch": 1.1904918843206724, + "grad_norm": 0.7456021308898926, + "learning_rate": 8.689321702905593e-06, + "loss": 1.4467, + "mean_token_accuracy": 0.6487318376700083, + "num_tokens": 1816518599.0, + "step": 10837 + }, + { + "entropy": 1.7355043093363445, + "epoch": 1.1906017412320453, + "grad_norm": 0.5878375768661499, + "learning_rate": 8.687774280546317e-06, + "loss": 1.5659, + "mean_token_accuracy": 0.6355293492476145, + "num_tokens": 1816824813.0, + "step": 10838 + }, + { + "entropy": 1.6184170246124268, + "epoch": 1.1907115981434182, + "grad_norm": 0.7229267954826355, + "learning_rate": 8.686226931367943e-06, + "loss": 1.355, + "mean_token_accuracy": 0.6660072356462479, + "num_tokens": 1816987791.0, + "step": 10839 + }, + { + "entropy": 1.731922020514806, + "epoch": 1.190821455054791, + "grad_norm": 0.6348045468330383, + "learning_rate": 8.684679655419445e-06, + "loss": 1.4086, + "mean_token_accuracy": 0.6459181507428488, + "num_tokens": 1817155835.0, + "step": 10840 + }, + { + "entropy": 1.7074114779631298, + "epoch": 1.190931311966164, + "grad_norm": 10.530064582824707, + "learning_rate": 8.683132452749796e-06, + "loss": 1.5041, + "mean_token_accuracy": 0.6442484011252722, + "num_tokens": 1817336230.0, + "step": 10841 + }, + { + "entropy": 1.6497264802455902, + "epoch": 1.191041168877537, + "grad_norm": 0.6446982622146606, + "learning_rate": 8.681585323407958e-06, + "loss": 1.5598, + "mean_token_accuracy": 0.6426790108283361, + "num_tokens": 1817608365.0, + "step": 10842 + }, + { + "entropy": 1.7166444063186646, + "epoch": 1.19115102578891, + "grad_norm": 0.6891461610794067, + "learning_rate": 8.6800382674429e-06, + "loss": 1.5312, + "mean_token_accuracy": 0.6481931606928507, + "num_tokens": 1817825491.0, + "step": 10843 + }, + { + "entropy": 1.7337975700696309, + "epoch": 1.1912608827002829, + "grad_norm": 0.6657007932662964, + "learning_rate": 8.678491284903583e-06, + "loss": 1.4386, + "mean_token_accuracy": 0.6465141177177429, + "num_tokens": 1817977223.0, + "step": 10844 + }, + { + "entropy": 1.6892333626747131, + "epoch": 1.1913707396116557, + "grad_norm": 0.6128289103507996, + "learning_rate": 8.676944375838973e-06, + "loss": 1.2792, + "mean_token_accuracy": 0.6714215278625488, + "num_tokens": 1818149277.0, + "step": 10845 + }, + { + "entropy": 1.597786416610082, + "epoch": 1.1914805965230288, + "grad_norm": 0.6063182950019836, + "learning_rate": 8.67539754029803e-06, + "loss": 1.4619, + "mean_token_accuracy": 0.6459900289773941, + "num_tokens": 1818347912.0, + "step": 10846 + }, + { + "entropy": 1.6953892509142559, + "epoch": 1.1915904534344017, + "grad_norm": 0.8109437823295593, + "learning_rate": 8.673850778329702e-06, + "loss": 1.4544, + "mean_token_accuracy": 0.6425779660542806, + "num_tokens": 1818571841.0, + "step": 10847 + }, + { + "entropy": 1.655045618613561, + "epoch": 1.1917003103457746, + "grad_norm": 0.6422619819641113, + "learning_rate": 8.67230408998295e-06, + "loss": 1.3572, + "mean_token_accuracy": 0.6548277189334234, + "num_tokens": 1818751000.0, + "step": 10848 + }, + { + "entropy": 1.6748607456684113, + "epoch": 1.1918101672571475, + "grad_norm": 0.753288984298706, + "learning_rate": 8.670757475306728e-06, + "loss": 1.3551, + "mean_token_accuracy": 0.6647098064422607, + "num_tokens": 1818937047.0, + "step": 10849 + }, + { + "entropy": 1.70504829287529, + "epoch": 1.1919200241685206, + "grad_norm": 0.5776710510253906, + "learning_rate": 8.669210934349978e-06, + "loss": 1.4304, + "mean_token_accuracy": 0.6487905929485956, + "num_tokens": 1819120691.0, + "step": 10850 + }, + { + "entropy": 1.6992063224315643, + "epoch": 1.1920298810798935, + "grad_norm": 0.9151628017425537, + "learning_rate": 8.667664467161652e-06, + "loss": 1.4308, + "mean_token_accuracy": 0.6610411157210668, + "num_tokens": 1819289227.0, + "step": 10851 + }, + { + "entropy": 1.6430395245552063, + "epoch": 1.1921397379912664, + "grad_norm": 0.7337287068367004, + "learning_rate": 8.666118073790699e-06, + "loss": 1.4605, + "mean_token_accuracy": 0.652332549293836, + "num_tokens": 1819495147.0, + "step": 10852 + }, + { + "entropy": 1.698825587828954, + "epoch": 1.1922495949026393, + "grad_norm": 0.75420743227005, + "learning_rate": 8.664571754286052e-06, + "loss": 1.4167, + "mean_token_accuracy": 0.6623470187187195, + "num_tokens": 1819635916.0, + "step": 10853 + }, + { + "entropy": 1.6793596645196278, + "epoch": 1.1923594518140122, + "grad_norm": 0.618486225605011, + "learning_rate": 8.663025508696658e-06, + "loss": 1.3148, + "mean_token_accuracy": 0.6688097268342972, + "num_tokens": 1819786330.0, + "step": 10854 + }, + { + "entropy": 1.6689561307430267, + "epoch": 1.1924693087253853, + "grad_norm": 0.7865815758705139, + "learning_rate": 8.661479337071458e-06, + "loss": 1.3624, + "mean_token_accuracy": 0.6614319185415903, + "num_tokens": 1819922056.0, + "step": 10855 + }, + { + "entropy": 1.691734939813614, + "epoch": 1.1925791656367581, + "grad_norm": 0.7773484587669373, + "learning_rate": 8.659933239459377e-06, + "loss": 1.428, + "mean_token_accuracy": 0.6591572364171346, + "num_tokens": 1820169282.0, + "step": 10856 + }, + { + "entropy": 1.7865646183490753, + "epoch": 1.192689022548131, + "grad_norm": 0.7435487508773804, + "learning_rate": 8.658387215909358e-06, + "loss": 1.3392, + "mean_token_accuracy": 0.6749976028998693, + "num_tokens": 1820290334.0, + "step": 10857 + }, + { + "entropy": 1.6983545819918315, + "epoch": 1.192798879459504, + "grad_norm": 0.6907163262367249, + "learning_rate": 8.656841266470328e-06, + "loss": 1.2468, + "mean_token_accuracy": 0.6775921235481898, + "num_tokens": 1820415779.0, + "step": 10858 + }, + { + "entropy": 1.6848465104897816, + "epoch": 1.192908736370877, + "grad_norm": 0.6214163303375244, + "learning_rate": 8.65529539119122e-06, + "loss": 1.345, + "mean_token_accuracy": 0.6662961939970652, + "num_tokens": 1820575417.0, + "step": 10859 + }, + { + "entropy": 1.7312945226828258, + "epoch": 1.19301859328225, + "grad_norm": 0.5840948224067688, + "learning_rate": 8.65374959012095e-06, + "loss": 1.541, + "mean_token_accuracy": 0.6397745758295059, + "num_tokens": 1820797132.0, + "step": 10860 + }, + { + "entropy": 1.7128262619177501, + "epoch": 1.1931284501936228, + "grad_norm": 0.6750525832176208, + "learning_rate": 8.65220386330845e-06, + "loss": 1.3281, + "mean_token_accuracy": 0.6626399159431458, + "num_tokens": 1820916118.0, + "step": 10861 + }, + { + "entropy": 1.7520559827486675, + "epoch": 1.1932383071049957, + "grad_norm": 0.5905542969703674, + "learning_rate": 8.650658210802638e-06, + "loss": 1.4636, + "mean_token_accuracy": 0.6265908926725388, + "num_tokens": 1821134408.0, + "step": 10862 + }, + { + "entropy": 1.6462377607822418, + "epoch": 1.1933481640163688, + "grad_norm": 0.7132760286331177, + "learning_rate": 8.649112632652436e-06, + "loss": 1.3858, + "mean_token_accuracy": 0.6677844027678171, + "num_tokens": 1821314158.0, + "step": 10863 + }, + { + "entropy": 1.6968292494614918, + "epoch": 1.1934580209277417, + "grad_norm": 0.6396412253379822, + "learning_rate": 8.647567128906764e-06, + "loss": 1.3326, + "mean_token_accuracy": 0.6573519359032313, + "num_tokens": 1821452147.0, + "step": 10864 + }, + { + "entropy": 1.7002749343713124, + "epoch": 1.1935678778391146, + "grad_norm": 0.5961291790008545, + "learning_rate": 8.646021699614529e-06, + "loss": 1.4085, + "mean_token_accuracy": 0.6624472538630167, + "num_tokens": 1821639995.0, + "step": 10865 + }, + { + "entropy": 1.7443738182385762, + "epoch": 1.1936777347504874, + "grad_norm": 0.6922990679740906, + "learning_rate": 8.644476344824646e-06, + "loss": 1.3906, + "mean_token_accuracy": 0.6587434560060501, + "num_tokens": 1821779295.0, + "step": 10866 + }, + { + "entropy": 1.6607798635959625, + "epoch": 1.1937875916618603, + "grad_norm": 0.5818439722061157, + "learning_rate": 8.642931064586028e-06, + "loss": 1.308, + "mean_token_accuracy": 0.6683350056409836, + "num_tokens": 1821913214.0, + "step": 10867 + }, + { + "entropy": 1.7286332647005718, + "epoch": 1.1938974485732334, + "grad_norm": 0.7446157336235046, + "learning_rate": 8.641385858947576e-06, + "loss": 1.4779, + "mean_token_accuracy": 0.6418144504229227, + "num_tokens": 1822091301.0, + "step": 10868 + }, + { + "entropy": 1.748667687177658, + "epoch": 1.1940073054846063, + "grad_norm": 0.7008844017982483, + "learning_rate": 8.6398407279582e-06, + "loss": 1.5479, + "mean_token_accuracy": 0.6476845939954122, + "num_tokens": 1822281478.0, + "step": 10869 + }, + { + "entropy": 1.7366498708724976, + "epoch": 1.1941171623959792, + "grad_norm": 0.7748090028762817, + "learning_rate": 8.638295671666803e-06, + "loss": 1.471, + "mean_token_accuracy": 0.6507512678702673, + "num_tokens": 1822443339.0, + "step": 10870 + }, + { + "entropy": 1.730059305826823, + "epoch": 1.194227019307352, + "grad_norm": 0.675847053527832, + "learning_rate": 8.636750690122282e-06, + "loss": 1.4335, + "mean_token_accuracy": 0.6394089609384537, + "num_tokens": 1822638722.0, + "step": 10871 + }, + { + "entropy": 1.6546126703421276, + "epoch": 1.1943368762187252, + "grad_norm": 0.5662322640419006, + "learning_rate": 8.63520578337354e-06, + "loss": 1.4372, + "mean_token_accuracy": 0.6361754983663559, + "num_tokens": 1822865064.0, + "step": 10872 + }, + { + "entropy": 1.701521893342336, + "epoch": 1.194446733130098, + "grad_norm": 0.6960839033126831, + "learning_rate": 8.633660951469468e-06, + "loss": 1.4204, + "mean_token_accuracy": 0.6519134740034739, + "num_tokens": 1823055053.0, + "step": 10873 + }, + { + "entropy": 1.6866462131341298, + "epoch": 1.194556590041471, + "grad_norm": 0.6791787147521973, + "learning_rate": 8.632116194458955e-06, + "loss": 1.3331, + "mean_token_accuracy": 0.6622498879830042, + "num_tokens": 1823224555.0, + "step": 10874 + }, + { + "entropy": 1.7383518815040588, + "epoch": 1.1946664469528439, + "grad_norm": 0.7168798446655273, + "learning_rate": 8.630571512390901e-06, + "loss": 1.4732, + "mean_token_accuracy": 0.6602436949809393, + "num_tokens": 1823381692.0, + "step": 10875 + }, + { + "entropy": 1.7333435515562694, + "epoch": 1.194776303864217, + "grad_norm": 0.6332979798316956, + "learning_rate": 8.629026905314195e-06, + "loss": 1.4628, + "mean_token_accuracy": 0.6372295717398325, + "num_tokens": 1823557246.0, + "step": 10876 + }, + { + "entropy": 1.7318945527076721, + "epoch": 1.1948861607755898, + "grad_norm": 0.7273834943771362, + "learning_rate": 8.627482373277715e-06, + "loss": 1.5831, + "mean_token_accuracy": 0.6298131893078486, + "num_tokens": 1823721277.0, + "step": 10877 + }, + { + "entropy": 1.689674695332845, + "epoch": 1.1949960176869627, + "grad_norm": 0.6805070042610168, + "learning_rate": 8.625937916330349e-06, + "loss": 1.2654, + "mean_token_accuracy": 0.6795346190532049, + "num_tokens": 1823846743.0, + "step": 10878 + }, + { + "entropy": 1.6561415096124013, + "epoch": 1.1951058745983356, + "grad_norm": 0.6901777386665344, + "learning_rate": 8.62439353452098e-06, + "loss": 1.3924, + "mean_token_accuracy": 0.6656059821446737, + "num_tokens": 1824066791.0, + "step": 10879 + }, + { + "entropy": 1.6609856188297272, + "epoch": 1.1952157315097085, + "grad_norm": 0.6951460242271423, + "learning_rate": 8.622849227898484e-06, + "loss": 1.202, + "mean_token_accuracy": 0.6859797437985738, + "num_tokens": 1824221799.0, + "step": 10880 + }, + { + "entropy": 1.6828400393327076, + "epoch": 1.1953255884210816, + "grad_norm": 0.8013219237327576, + "learning_rate": 8.621304996511737e-06, + "loss": 1.5402, + "mean_token_accuracy": 0.6594565212726593, + "num_tokens": 1824404281.0, + "step": 10881 + }, + { + "entropy": 1.758839060862859, + "epoch": 1.1954354453324545, + "grad_norm": 0.8230046629905701, + "learning_rate": 8.61976084040962e-06, + "loss": 1.3892, + "mean_token_accuracy": 0.6508858899275461, + "num_tokens": 1824542346.0, + "step": 10882 + }, + { + "entropy": 1.7173049648602803, + "epoch": 1.1955453022438274, + "grad_norm": 0.6363534331321716, + "learning_rate": 8.618216759640994e-06, + "loss": 1.5549, + "mean_token_accuracy": 0.628744641939799, + "num_tokens": 1824762577.0, + "step": 10883 + }, + { + "entropy": 1.678319166103999, + "epoch": 1.1956551591552003, + "grad_norm": 0.7358280420303345, + "learning_rate": 8.616672754254738e-06, + "loss": 1.428, + "mean_token_accuracy": 0.6569743702809016, + "num_tokens": 1824896107.0, + "step": 10884 + }, + { + "entropy": 1.6763150095939636, + "epoch": 1.1957650160665734, + "grad_norm": 0.8936296701431274, + "learning_rate": 8.615128824299716e-06, + "loss": 1.4788, + "mean_token_accuracy": 0.6430085202058157, + "num_tokens": 1825169621.0, + "step": 10885 + }, + { + "entropy": 1.7438491185506184, + "epoch": 1.1958748729779463, + "grad_norm": 0.6828886866569519, + "learning_rate": 8.613584969824789e-06, + "loss": 1.5277, + "mean_token_accuracy": 0.6488937735557556, + "num_tokens": 1825324021.0, + "step": 10886 + }, + { + "entropy": 1.7129732171694438, + "epoch": 1.1959847298893191, + "grad_norm": 0.8133248090744019, + "learning_rate": 8.612041190878826e-06, + "loss": 1.3015, + "mean_token_accuracy": 0.672540470957756, + "num_tokens": 1825460259.0, + "step": 10887 + }, + { + "entropy": 1.6727862358093262, + "epoch": 1.196094586800692, + "grad_norm": 0.6648197174072266, + "learning_rate": 8.610497487510679e-06, + "loss": 1.414, + "mean_token_accuracy": 0.6671945502360662, + "num_tokens": 1825626619.0, + "step": 10888 + }, + { + "entropy": 1.7124856114387512, + "epoch": 1.1962044437120651, + "grad_norm": 0.8533644080162048, + "learning_rate": 8.60895385976921e-06, + "loss": 1.4938, + "mean_token_accuracy": 0.652462845047315, + "num_tokens": 1825775596.0, + "step": 10889 + }, + { + "entropy": 1.6300967534383137, + "epoch": 1.196314300623438, + "grad_norm": 0.697281002998352, + "learning_rate": 8.607410307703279e-06, + "loss": 1.3898, + "mean_token_accuracy": 0.6632688790559769, + "num_tokens": 1825950828.0, + "step": 10890 + }, + { + "entropy": 1.6891121864318848, + "epoch": 1.196424157534811, + "grad_norm": 0.7355936169624329, + "learning_rate": 8.605866831361729e-06, + "loss": 1.572, + "mean_token_accuracy": 0.6447887768348058, + "num_tokens": 1826134511.0, + "step": 10891 + }, + { + "entropy": 1.7174355785051982, + "epoch": 1.1965340144461838, + "grad_norm": 0.6898308992385864, + "learning_rate": 8.604323430793416e-06, + "loss": 1.4689, + "mean_token_accuracy": 0.6526208321253458, + "num_tokens": 1826298089.0, + "step": 10892 + }, + { + "entropy": 1.7147394319375355, + "epoch": 1.1966438713575567, + "grad_norm": 0.8916130661964417, + "learning_rate": 8.602780106047189e-06, + "loss": 1.364, + "mean_token_accuracy": 0.6715402801831564, + "num_tokens": 1826432931.0, + "step": 10893 + }, + { + "entropy": 1.6138789653778076, + "epoch": 1.1967537282689298, + "grad_norm": 0.7221713662147522, + "learning_rate": 8.60123685717189e-06, + "loss": 1.4328, + "mean_token_accuracy": 0.6576006362835566, + "num_tokens": 1826627859.0, + "step": 10894 + }, + { + "entropy": 1.7023044029871623, + "epoch": 1.1968635851803027, + "grad_norm": 0.571751594543457, + "learning_rate": 8.59969368421636e-06, + "loss": 1.3235, + "mean_token_accuracy": 0.6696832726399103, + "num_tokens": 1826792129.0, + "step": 10895 + }, + { + "entropy": 1.7759801348050435, + "epoch": 1.1969734420916756, + "grad_norm": 0.738571286201477, + "learning_rate": 8.598150587229448e-06, + "loss": 1.4592, + "mean_token_accuracy": 0.6435786783695221, + "num_tokens": 1826939218.0, + "step": 10896 + }, + { + "entropy": 1.6735620200634003, + "epoch": 1.1970832990030484, + "grad_norm": 0.6554346680641174, + "learning_rate": 8.596607566259986e-06, + "loss": 1.4253, + "mean_token_accuracy": 0.6584400484959284, + "num_tokens": 1827121356.0, + "step": 10897 + }, + { + "entropy": 1.6918166776498158, + "epoch": 1.1971931559144215, + "grad_norm": 0.7005612254142761, + "learning_rate": 8.595064621356812e-06, + "loss": 1.3349, + "mean_token_accuracy": 0.6674779852231344, + "num_tokens": 1827261219.0, + "step": 10898 + }, + { + "entropy": 1.6832281549771626, + "epoch": 1.1973030128257944, + "grad_norm": 0.732524573802948, + "learning_rate": 8.593521752568759e-06, + "loss": 1.3192, + "mean_token_accuracy": 0.6616079111893972, + "num_tokens": 1827424352.0, + "step": 10899 + }, + { + "entropy": 1.7052031954129536, + "epoch": 1.1974128697371673, + "grad_norm": 0.7440763115882874, + "learning_rate": 8.591978959944657e-06, + "loss": 1.2866, + "mean_token_accuracy": 0.6621012737353643, + "num_tokens": 1827566352.0, + "step": 10900 + }, + { + "entropy": 1.748506526152293, + "epoch": 1.1975227266485402, + "grad_norm": 0.6760443449020386, + "learning_rate": 8.590436243533336e-06, + "loss": 1.3757, + "mean_token_accuracy": 0.6591590344905853, + "num_tokens": 1827705988.0, + "step": 10901 + }, + { + "entropy": 1.7245979209740956, + "epoch": 1.1976325835599133, + "grad_norm": 0.6143633127212524, + "learning_rate": 8.588893603383623e-06, + "loss": 1.5103, + "mean_token_accuracy": 0.6388898193836212, + "num_tokens": 1827921089.0, + "step": 10902 + }, + { + "entropy": 1.6794546246528625, + "epoch": 1.1977424404712862, + "grad_norm": 0.6420578956604004, + "learning_rate": 8.58735103954434e-06, + "loss": 1.5082, + "mean_token_accuracy": 0.641454761226972, + "num_tokens": 1828112111.0, + "step": 10903 + }, + { + "entropy": 1.70048584540685, + "epoch": 1.197852297382659, + "grad_norm": 0.6062077879905701, + "learning_rate": 8.585808552064312e-06, + "loss": 1.3617, + "mean_token_accuracy": 0.6477002501487732, + "num_tokens": 1828253988.0, + "step": 10904 + }, + { + "entropy": 1.6546235779921215, + "epoch": 1.197962154294032, + "grad_norm": 0.6344867944717407, + "learning_rate": 8.584266140992355e-06, + "loss": 1.4448, + "mean_token_accuracy": 0.6534637212753296, + "num_tokens": 1828444002.0, + "step": 10905 + }, + { + "entropy": 1.6595034301280975, + "epoch": 1.1980720112054049, + "grad_norm": 0.7396848797798157, + "learning_rate": 8.582723806377281e-06, + "loss": 1.1545, + "mean_token_accuracy": 0.6930899421374003, + "num_tokens": 1828558474.0, + "step": 10906 + }, + { + "entropy": 1.7764694193998973, + "epoch": 1.198181868116778, + "grad_norm": 0.7311699390411377, + "learning_rate": 8.581181548267914e-06, + "loss": 1.4681, + "mean_token_accuracy": 0.647409662604332, + "num_tokens": 1828672601.0, + "step": 10907 + }, + { + "entropy": 1.778613011042277, + "epoch": 1.1982917250281508, + "grad_norm": 0.8004505634307861, + "learning_rate": 8.579639366713062e-06, + "loss": 1.4917, + "mean_token_accuracy": 0.6318371693293253, + "num_tokens": 1828867425.0, + "step": 10908 + }, + { + "entropy": 1.7501141329606373, + "epoch": 1.1984015819395237, + "grad_norm": 0.8574265241622925, + "learning_rate": 8.578097261761531e-06, + "loss": 1.3178, + "mean_token_accuracy": 0.6586999098459879, + "num_tokens": 1829025448.0, + "step": 10909 + }, + { + "entropy": 1.7619508107503254, + "epoch": 1.1985114388508966, + "grad_norm": 0.7897709608078003, + "learning_rate": 8.57655523346213e-06, + "loss": 1.5334, + "mean_token_accuracy": 0.6314461479584376, + "num_tokens": 1829224005.0, + "step": 10910 + }, + { + "entropy": 1.717555691798528, + "epoch": 1.1986212957622697, + "grad_norm": 0.6715591549873352, + "learning_rate": 8.575013281863666e-06, + "loss": 1.3903, + "mean_token_accuracy": 0.6613827695449194, + "num_tokens": 1829394320.0, + "step": 10911 + }, + { + "entropy": 1.7155894537766774, + "epoch": 1.1987311526736426, + "grad_norm": 0.8104733228683472, + "learning_rate": 8.573471407014934e-06, + "loss": 1.3106, + "mean_token_accuracy": 0.663354347149531, + "num_tokens": 1829539454.0, + "step": 10912 + }, + { + "entropy": 1.6335892776648204, + "epoch": 1.1988410095850155, + "grad_norm": 0.6717244386672974, + "learning_rate": 8.571929608964743e-06, + "loss": 1.1869, + "mean_token_accuracy": 0.6866246312856674, + "num_tokens": 1829642311.0, + "step": 10913 + }, + { + "entropy": 1.7017800013224285, + "epoch": 1.1989508664963884, + "grad_norm": 0.5946372151374817, + "learning_rate": 8.570387887761886e-06, + "loss": 1.4284, + "mean_token_accuracy": 0.6471086144447327, + "num_tokens": 1829869402.0, + "step": 10914 + }, + { + "entropy": 1.6842081248760223, + "epoch": 1.1990607234077615, + "grad_norm": 0.6334558725357056, + "learning_rate": 8.568846243455156e-06, + "loss": 1.3793, + "mean_token_accuracy": 0.6581207563479742, + "num_tokens": 1830025157.0, + "step": 10915 + }, + { + "entropy": 1.7087959746519725, + "epoch": 1.1991705803191344, + "grad_norm": 0.6897690296173096, + "learning_rate": 8.56730467609335e-06, + "loss": 1.499, + "mean_token_accuracy": 0.6366796096165975, + "num_tokens": 1830195791.0, + "step": 10916 + }, + { + "entropy": 1.6709490915139515, + "epoch": 1.1992804372305073, + "grad_norm": 0.633358895778656, + "learning_rate": 8.56576318572525e-06, + "loss": 1.2466, + "mean_token_accuracy": 0.6887932568788528, + "num_tokens": 1830355273.0, + "step": 10917 + }, + { + "entropy": 1.696038504441579, + "epoch": 1.1993902941418801, + "grad_norm": 0.8158591985702515, + "learning_rate": 8.564221772399649e-06, + "loss": 1.5133, + "mean_token_accuracy": 0.6450046946605047, + "num_tokens": 1830511215.0, + "step": 10918 + }, + { + "entropy": 1.7114079197247822, + "epoch": 1.199500151053253, + "grad_norm": 0.6601821780204773, + "learning_rate": 8.562680436165334e-06, + "loss": 1.4599, + "mean_token_accuracy": 0.6562004834413528, + "num_tokens": 1830664540.0, + "step": 10919 + }, + { + "entropy": 1.636279861132304, + "epoch": 1.1996100079646261, + "grad_norm": 0.6178733110427856, + "learning_rate": 8.561139177071082e-06, + "loss": 1.3889, + "mean_token_accuracy": 0.6608523726463318, + "num_tokens": 1830824816.0, + "step": 10920 + }, + { + "entropy": 1.695862223704656, + "epoch": 1.199719864875999, + "grad_norm": 0.7451301217079163, + "learning_rate": 8.559597995165678e-06, + "loss": 1.3195, + "mean_token_accuracy": 0.6666155556837717, + "num_tokens": 1830947228.0, + "step": 10921 + }, + { + "entropy": 1.7582411766052246, + "epoch": 1.199829721787372, + "grad_norm": 0.864019513130188, + "learning_rate": 8.558056890497897e-06, + "loss": 1.3974, + "mean_token_accuracy": 0.6508052796125412, + "num_tokens": 1831092466.0, + "step": 10922 + }, + { + "entropy": 1.7041733066240947, + "epoch": 1.1999395786987448, + "grad_norm": 0.6823435425758362, + "learning_rate": 8.556515863116518e-06, + "loss": 1.2998, + "mean_token_accuracy": 0.6597320288419724, + "num_tokens": 1831280239.0, + "step": 10923 + }, + { + "entropy": 1.7025948067506154, + "epoch": 1.200049435610118, + "grad_norm": 0.6404684782028198, + "learning_rate": 8.554974913070306e-06, + "loss": 1.4125, + "mean_token_accuracy": 0.6431457748015722, + "num_tokens": 1831481003.0, + "step": 10924 + }, + { + "entropy": 1.7434356113274891, + "epoch": 1.2001592925214908, + "grad_norm": 0.769716203212738, + "learning_rate": 8.553434040408037e-06, + "loss": 1.4012, + "mean_token_accuracy": 0.6592916697263718, + "num_tokens": 1831679505.0, + "step": 10925 + }, + { + "entropy": 1.7073861261208851, + "epoch": 1.2002691494328637, + "grad_norm": 0.6649128198623657, + "learning_rate": 8.551893245178482e-06, + "loss": 1.359, + "mean_token_accuracy": 0.6671257416407267, + "num_tokens": 1831833327.0, + "step": 10926 + }, + { + "entropy": 1.693003276983897, + "epoch": 1.2003790063442366, + "grad_norm": 0.6499382257461548, + "learning_rate": 8.550352527430402e-06, + "loss": 1.4374, + "mean_token_accuracy": 0.6594889660676321, + "num_tokens": 1832003734.0, + "step": 10927 + }, + { + "entropy": 1.644775668780009, + "epoch": 1.2004888632556097, + "grad_norm": 0.618766725063324, + "learning_rate": 8.548811887212558e-06, + "loss": 1.495, + "mean_token_accuracy": 0.655649391313394, + "num_tokens": 1832145698.0, + "step": 10928 + }, + { + "entropy": 1.7419310013453166, + "epoch": 1.2005987201669825, + "grad_norm": 0.7380454540252686, + "learning_rate": 8.547271324573716e-06, + "loss": 1.4547, + "mean_token_accuracy": 0.6507051835457484, + "num_tokens": 1832300473.0, + "step": 10929 + }, + { + "entropy": 1.7053045133749645, + "epoch": 1.2007085770783554, + "grad_norm": 0.6194471716880798, + "learning_rate": 8.545730839562627e-06, + "loss": 1.4298, + "mean_token_accuracy": 0.648463194568952, + "num_tokens": 1832468480.0, + "step": 10930 + }, + { + "entropy": 1.7070962289969127, + "epoch": 1.2008184339897283, + "grad_norm": 0.7254568934440613, + "learning_rate": 8.544190432228053e-06, + "loss": 1.3819, + "mean_token_accuracy": 0.6639789591232935, + "num_tokens": 1832639575.0, + "step": 10931 + }, + { + "entropy": 1.7318992813428242, + "epoch": 1.2009282909011012, + "grad_norm": 0.7872775197029114, + "learning_rate": 8.542650102618748e-06, + "loss": 1.3596, + "mean_token_accuracy": 0.6584235628445944, + "num_tokens": 1832820036.0, + "step": 10932 + }, + { + "entropy": 1.6651087601979573, + "epoch": 1.2010381478124743, + "grad_norm": 0.6679090857505798, + "learning_rate": 8.541109850783458e-06, + "loss": 1.3423, + "mean_token_accuracy": 0.6579601069291433, + "num_tokens": 1833005066.0, + "step": 10933 + }, + { + "entropy": 1.691778947909673, + "epoch": 1.2011480047238472, + "grad_norm": 0.6940400004386902, + "learning_rate": 8.539569676770931e-06, + "loss": 1.2484, + "mean_token_accuracy": 0.6750961343447367, + "num_tokens": 1833137014.0, + "step": 10934 + }, + { + "entropy": 1.754847486813863, + "epoch": 1.20125786163522, + "grad_norm": 0.788187563419342, + "learning_rate": 8.53802958062992e-06, + "loss": 1.3827, + "mean_token_accuracy": 0.6513949334621429, + "num_tokens": 1833284638.0, + "step": 10935 + }, + { + "entropy": 1.7428893844286601, + "epoch": 1.201367718546593, + "grad_norm": 0.6965903043746948, + "learning_rate": 8.536489562409159e-06, + "loss": 1.6019, + "mean_token_accuracy": 0.627113069097201, + "num_tokens": 1833452637.0, + "step": 10936 + }, + { + "entropy": 1.7226960361003876, + "epoch": 1.201477575457966, + "grad_norm": 0.7512861490249634, + "learning_rate": 8.534949622157393e-06, + "loss": 1.5185, + "mean_token_accuracy": 0.6288545529047648, + "num_tokens": 1833649388.0, + "step": 10937 + }, + { + "entropy": 1.711164077123006, + "epoch": 1.201587432369339, + "grad_norm": 0.7107270359992981, + "learning_rate": 8.533409759923364e-06, + "loss": 1.3231, + "mean_token_accuracy": 0.6556845357020696, + "num_tokens": 1833816986.0, + "step": 10938 + }, + { + "entropy": 1.7271955609321594, + "epoch": 1.2016972892807118, + "grad_norm": 0.6369715929031372, + "learning_rate": 8.531869975755803e-06, + "loss": 1.398, + "mean_token_accuracy": 0.6558120846748352, + "num_tokens": 1833950907.0, + "step": 10939 + }, + { + "entropy": 1.6842861076196034, + "epoch": 1.2018071461920847, + "grad_norm": 0.6507421135902405, + "learning_rate": 8.530330269703445e-06, + "loss": 1.2904, + "mean_token_accuracy": 0.6633835931619009, + "num_tokens": 1834127190.0, + "step": 10940 + }, + { + "entropy": 1.6939348876476288, + "epoch": 1.2019170031034578, + "grad_norm": 0.6615996360778809, + "learning_rate": 8.52879064181502e-06, + "loss": 1.433, + "mean_token_accuracy": 0.661163717508316, + "num_tokens": 1834286480.0, + "step": 10941 + }, + { + "entropy": 1.7181770503520966, + "epoch": 1.2020268600148307, + "grad_norm": 0.6543670892715454, + "learning_rate": 8.52725109213926e-06, + "loss": 1.2943, + "mean_token_accuracy": 0.6620519210894903, + "num_tokens": 1834394332.0, + "step": 10942 + }, + { + "entropy": 1.6917479634284973, + "epoch": 1.2021367169262036, + "grad_norm": 0.8514935374259949, + "learning_rate": 8.525711620724885e-06, + "loss": 1.6089, + "mean_token_accuracy": 0.6384094009796778, + "num_tokens": 1834567370.0, + "step": 10943 + }, + { + "entropy": 1.7038015524546306, + "epoch": 1.2022465738375765, + "grad_norm": 0.7576673626899719, + "learning_rate": 8.524172227620628e-06, + "loss": 1.4333, + "mean_token_accuracy": 0.6669259319702784, + "num_tokens": 1834731150.0, + "step": 10944 + }, + { + "entropy": 1.698100248972575, + "epoch": 1.2023564307489494, + "grad_norm": 0.7677764892578125, + "learning_rate": 8.522632912875201e-06, + "loss": 1.2893, + "mean_token_accuracy": 0.6776777257521948, + "num_tokens": 1834881903.0, + "step": 10945 + }, + { + "entropy": 1.7503166198730469, + "epoch": 1.2024662876603225, + "grad_norm": 0.8348533511161804, + "learning_rate": 8.521093676537327e-06, + "loss": 1.5078, + "mean_token_accuracy": 0.64637457827727, + "num_tokens": 1835009118.0, + "step": 10946 + }, + { + "entropy": 1.7325368821620941, + "epoch": 1.2025761445716954, + "grad_norm": 0.7055541276931763, + "learning_rate": 8.519554518655719e-06, + "loss": 1.3927, + "mean_token_accuracy": 0.6497747053702673, + "num_tokens": 1835147384.0, + "step": 10947 + }, + { + "entropy": 1.7662848830223083, + "epoch": 1.2026860014830683, + "grad_norm": 0.6840864419937134, + "learning_rate": 8.518015439279092e-06, + "loss": 1.3965, + "mean_token_accuracy": 0.6465002000331879, + "num_tokens": 1835316504.0, + "step": 10948 + }, + { + "entropy": 1.6147024432818096, + "epoch": 1.2027958583944414, + "grad_norm": 0.6623427867889404, + "learning_rate": 8.516476438456164e-06, + "loss": 1.3179, + "mean_token_accuracy": 0.6583419640858968, + "num_tokens": 1835510113.0, + "step": 10949 + }, + { + "entropy": 1.69747061530749, + "epoch": 1.2029057153058142, + "grad_norm": 0.8042090535163879, + "learning_rate": 8.51493751623563e-06, + "loss": 1.4357, + "mean_token_accuracy": 0.6584860185782114, + "num_tokens": 1835682732.0, + "step": 10950 + }, + { + "entropy": 1.7218117117881775, + "epoch": 1.2030155722171871, + "grad_norm": 0.613860547542572, + "learning_rate": 8.513398672666209e-06, + "loss": 1.3976, + "mean_token_accuracy": 0.6480874568223953, + "num_tokens": 1835857692.0, + "step": 10951 + }, + { + "entropy": 1.7380519111951191, + "epoch": 1.20312542912856, + "grad_norm": 0.7758024334907532, + "learning_rate": 8.5118599077966e-06, + "loss": 1.4748, + "mean_token_accuracy": 0.6393528680006663, + "num_tokens": 1836015807.0, + "step": 10952 + }, + { + "entropy": 1.6796276768048604, + "epoch": 1.203235286039933, + "grad_norm": 0.5999566912651062, + "learning_rate": 8.5103212216755e-06, + "loss": 1.3092, + "mean_token_accuracy": 0.6672457307577133, + "num_tokens": 1836153324.0, + "step": 10953 + }, + { + "entropy": 1.7185988624890645, + "epoch": 1.203345142951306, + "grad_norm": 0.8109869360923767, + "learning_rate": 8.508782614351612e-06, + "loss": 1.4122, + "mean_token_accuracy": 0.6546374360720316, + "num_tokens": 1836311706.0, + "step": 10954 + }, + { + "entropy": 1.6729619602362316, + "epoch": 1.203454999862679, + "grad_norm": 0.6391358971595764, + "learning_rate": 8.507244085873636e-06, + "loss": 1.4177, + "mean_token_accuracy": 0.6587773958841959, + "num_tokens": 1836484187.0, + "step": 10955 + }, + { + "entropy": 1.7137231330076854, + "epoch": 1.2035648567740518, + "grad_norm": 0.6148737072944641, + "learning_rate": 8.505705636290256e-06, + "loss": 1.4516, + "mean_token_accuracy": 0.6422171841065089, + "num_tokens": 1836722681.0, + "step": 10956 + }, + { + "entropy": 1.7291185359160106, + "epoch": 1.2036747136854247, + "grad_norm": 0.7713000178337097, + "learning_rate": 8.504167265650171e-06, + "loss": 1.523, + "mean_token_accuracy": 0.629439448316892, + "num_tokens": 1836930155.0, + "step": 10957 + }, + { + "entropy": 1.6895070970058441, + "epoch": 1.2037845705967976, + "grad_norm": 0.627571702003479, + "learning_rate": 8.50262897400207e-06, + "loss": 1.3553, + "mean_token_accuracy": 0.6645805637041727, + "num_tokens": 1837103755.0, + "step": 10958 + }, + { + "entropy": 1.6793027222156525, + "epoch": 1.2038944275081707, + "grad_norm": 0.6628625392913818, + "learning_rate": 8.501090761394633e-06, + "loss": 1.3049, + "mean_token_accuracy": 0.6793260723352432, + "num_tokens": 1837258622.0, + "step": 10959 + }, + { + "entropy": 1.7522801260153453, + "epoch": 1.2040042844195435, + "grad_norm": 0.7251481413841248, + "learning_rate": 8.499552627876548e-06, + "loss": 1.2863, + "mean_token_accuracy": 0.6737864712874094, + "num_tokens": 1837364398.0, + "step": 10960 + }, + { + "entropy": 1.6999558309714, + "epoch": 1.2041141413309164, + "grad_norm": 0.6430142521858215, + "learning_rate": 8.498014573496495e-06, + "loss": 1.368, + "mean_token_accuracy": 0.6580288509527842, + "num_tokens": 1837556811.0, + "step": 10961 + }, + { + "entropy": 1.7127414047718048, + "epoch": 1.2042239982422895, + "grad_norm": 0.7526107430458069, + "learning_rate": 8.496476598303154e-06, + "loss": 1.3032, + "mean_token_accuracy": 0.6637988835573196, + "num_tokens": 1837690847.0, + "step": 10962 + }, + { + "entropy": 1.7249715427557628, + "epoch": 1.2043338551536624, + "grad_norm": 0.7242283225059509, + "learning_rate": 8.4949387023452e-06, + "loss": 1.4256, + "mean_token_accuracy": 0.6612179130315781, + "num_tokens": 1837830879.0, + "step": 10963 + }, + { + "entropy": 1.6988115906715393, + "epoch": 1.2044437120650353, + "grad_norm": 0.6785094141960144, + "learning_rate": 8.493400885671308e-06, + "loss": 1.3657, + "mean_token_accuracy": 0.6599143246809641, + "num_tokens": 1837977639.0, + "step": 10964 + }, + { + "entropy": 1.6827894548575084, + "epoch": 1.2045535689764082, + "grad_norm": 0.6206066012382507, + "learning_rate": 8.491863148330148e-06, + "loss": 1.3765, + "mean_token_accuracy": 0.6473558694124222, + "num_tokens": 1838195501.0, + "step": 10965 + }, + { + "entropy": 1.7016201118628185, + "epoch": 1.204663425887781, + "grad_norm": 0.7276713252067566, + "learning_rate": 8.49032549037039e-06, + "loss": 1.4146, + "mean_token_accuracy": 0.6548609832922617, + "num_tokens": 1838330606.0, + "step": 10966 + }, + { + "entropy": 1.678989330927531, + "epoch": 1.2047732827991542, + "grad_norm": 0.8118691444396973, + "learning_rate": 8.488787911840702e-06, + "loss": 1.4573, + "mean_token_accuracy": 0.642837405204773, + "num_tokens": 1838496302.0, + "step": 10967 + }, + { + "entropy": 1.6498075425624847, + "epoch": 1.204883139710527, + "grad_norm": 0.7537748217582703, + "learning_rate": 8.48725041278974e-06, + "loss": 1.2895, + "mean_token_accuracy": 0.6750341604153315, + "num_tokens": 1838650065.0, + "step": 10968 + }, + { + "entropy": 1.6907791793346405, + "epoch": 1.2049929966219, + "grad_norm": 0.9500882029533386, + "learning_rate": 8.48571299326617e-06, + "loss": 1.4808, + "mean_token_accuracy": 0.6541274686654409, + "num_tokens": 1838802465.0, + "step": 10969 + }, + { + "entropy": 1.6472548147042592, + "epoch": 1.2051028535332728, + "grad_norm": 0.6497575044631958, + "learning_rate": 8.484175653318656e-06, + "loss": 1.2956, + "mean_token_accuracy": 0.6712877601385117, + "num_tokens": 1838940896.0, + "step": 10970 + }, + { + "entropy": 1.7175530691941578, + "epoch": 1.2052127104446457, + "grad_norm": 0.7882832288742065, + "learning_rate": 8.482638392995845e-06, + "loss": 1.3548, + "mean_token_accuracy": 0.6520499388376871, + "num_tokens": 1839095122.0, + "step": 10971 + }, + { + "entropy": 1.6997943917910259, + "epoch": 1.2053225673560188, + "grad_norm": 0.65944504737854, + "learning_rate": 8.481101212346395e-06, + "loss": 1.3365, + "mean_token_accuracy": 0.6599059452613195, + "num_tokens": 1839282288.0, + "step": 10972 + }, + { + "entropy": 1.6870457927385967, + "epoch": 1.2054324242673917, + "grad_norm": 0.6719939708709717, + "learning_rate": 8.479564111418959e-06, + "loss": 1.426, + "mean_token_accuracy": 0.6513770818710327, + "num_tokens": 1839448385.0, + "step": 10973 + }, + { + "entropy": 1.7068449358145397, + "epoch": 1.2055422811787646, + "grad_norm": 0.6699382066726685, + "learning_rate": 8.47802709026218e-06, + "loss": 1.4808, + "mean_token_accuracy": 0.6471947580575943, + "num_tokens": 1839597423.0, + "step": 10974 + }, + { + "entropy": 1.712907761335373, + "epoch": 1.2056521380901377, + "grad_norm": 0.6794223189353943, + "learning_rate": 8.476490148924705e-06, + "loss": 1.3044, + "mean_token_accuracy": 0.6599731842676798, + "num_tokens": 1839736794.0, + "step": 10975 + }, + { + "entropy": 1.74131045738856, + "epoch": 1.2057619950015106, + "grad_norm": 0.8693950772285461, + "learning_rate": 8.474953287455185e-06, + "loss": 1.407, + "mean_token_accuracy": 0.6535183389981588, + "num_tokens": 1839864797.0, + "step": 10976 + }, + { + "entropy": 1.738874187072118, + "epoch": 1.2058718519128835, + "grad_norm": 0.8112277984619141, + "learning_rate": 8.473416505902254e-06, + "loss": 1.5832, + "mean_token_accuracy": 0.6483089849352837, + "num_tokens": 1840003600.0, + "step": 10977 + }, + { + "entropy": 1.7444894413153331, + "epoch": 1.2059817088242564, + "grad_norm": 0.6465990543365479, + "learning_rate": 8.471879804314552e-06, + "loss": 1.5445, + "mean_token_accuracy": 0.6178958763678869, + "num_tokens": 1840308433.0, + "step": 10978 + }, + { + "entropy": 1.728948066631953, + "epoch": 1.2060915657356293, + "grad_norm": 0.6564865112304688, + "learning_rate": 8.470343182740716e-06, + "loss": 1.4047, + "mean_token_accuracy": 0.6490548650423685, + "num_tokens": 1840478644.0, + "step": 10979 + }, + { + "entropy": 1.7226575712362926, + "epoch": 1.2062014226470024, + "grad_norm": 0.7290470600128174, + "learning_rate": 8.468806641229376e-06, + "loss": 1.2962, + "mean_token_accuracy": 0.6657624244689941, + "num_tokens": 1840621628.0, + "step": 10980 + }, + { + "entropy": 1.7463841636975606, + "epoch": 1.2063112795583752, + "grad_norm": 0.6011817455291748, + "learning_rate": 8.467270179829166e-06, + "loss": 1.4322, + "mean_token_accuracy": 0.6443581183751425, + "num_tokens": 1840798045.0, + "step": 10981 + }, + { + "entropy": 1.773529440164566, + "epoch": 1.2064211364697481, + "grad_norm": 0.6532623767852783, + "learning_rate": 8.465733798588715e-06, + "loss": 1.4649, + "mean_token_accuracy": 0.631449893116951, + "num_tokens": 1840950614.0, + "step": 10982 + }, + { + "entropy": 1.6585228244463603, + "epoch": 1.206530993381121, + "grad_norm": 0.6496007442474365, + "learning_rate": 8.464197497556646e-06, + "loss": 1.4284, + "mean_token_accuracy": 0.6490184764067332, + "num_tokens": 1841136948.0, + "step": 10983 + }, + { + "entropy": 1.7097909947236378, + "epoch": 1.206640850292494, + "grad_norm": 0.6547970175743103, + "learning_rate": 8.462661276781583e-06, + "loss": 1.4973, + "mean_token_accuracy": 0.6476222276687622, + "num_tokens": 1841300598.0, + "step": 10984 + }, + { + "entropy": 1.7845760981241863, + "epoch": 1.206750707203867, + "grad_norm": 0.8349284529685974, + "learning_rate": 8.46112513631215e-06, + "loss": 1.5017, + "mean_token_accuracy": 0.6529583881298701, + "num_tokens": 1841457868.0, + "step": 10985 + }, + { + "entropy": 1.7160189151763916, + "epoch": 1.20686056411524, + "grad_norm": 0.746083676815033, + "learning_rate": 8.459589076196957e-06, + "loss": 1.2057, + "mean_token_accuracy": 0.6863613526026408, + "num_tokens": 1841583833.0, + "step": 10986 + }, + { + "entropy": 1.6878787875175476, + "epoch": 1.2069704210266128, + "grad_norm": 0.7189993858337402, + "learning_rate": 8.458053096484628e-06, + "loss": 1.3567, + "mean_token_accuracy": 0.6640171358982722, + "num_tokens": 1841780454.0, + "step": 10987 + }, + { + "entropy": 1.7334311107794445, + "epoch": 1.207080277937986, + "grad_norm": 0.652119517326355, + "learning_rate": 8.456517197223774e-06, + "loss": 1.5206, + "mean_token_accuracy": 0.6425420294205347, + "num_tokens": 1841985689.0, + "step": 10988 + }, + { + "entropy": 1.6652919054031372, + "epoch": 1.2071901348493588, + "grad_norm": 0.6829299330711365, + "learning_rate": 8.454981378463006e-06, + "loss": 1.4756, + "mean_token_accuracy": 0.6531298210223516, + "num_tokens": 1842141908.0, + "step": 10989 + }, + { + "entropy": 1.7330115834871929, + "epoch": 1.2072999917607317, + "grad_norm": 0.7318177819252014, + "learning_rate": 8.453445640250928e-06, + "loss": 1.2572, + "mean_token_accuracy": 0.6711813112099966, + "num_tokens": 1842253992.0, + "step": 10990 + }, + { + "entropy": 1.7475427587827046, + "epoch": 1.2074098486721045, + "grad_norm": 0.6801440119743347, + "learning_rate": 8.451909982636148e-06, + "loss": 1.4171, + "mean_token_accuracy": 0.6432561924060186, + "num_tokens": 1842417979.0, + "step": 10991 + }, + { + "entropy": 1.7295256853103638, + "epoch": 1.2075197055834774, + "grad_norm": 0.6919019222259521, + "learning_rate": 8.450374405667267e-06, + "loss": 1.4833, + "mean_token_accuracy": 0.643854022026062, + "num_tokens": 1842583946.0, + "step": 10992 + }, + { + "entropy": 1.735103686650594, + "epoch": 1.2076295624948505, + "grad_norm": 0.7653998136520386, + "learning_rate": 8.448838909392889e-06, + "loss": 1.3802, + "mean_token_accuracy": 0.655050535996755, + "num_tokens": 1842739381.0, + "step": 10993 + }, + { + "entropy": 1.7508669197559357, + "epoch": 1.2077394194062234, + "grad_norm": 0.8188372254371643, + "learning_rate": 8.447303493861612e-06, + "loss": 1.5111, + "mean_token_accuracy": 0.6337236364682516, + "num_tokens": 1842933440.0, + "step": 10994 + }, + { + "entropy": 1.6848741968472798, + "epoch": 1.2078492763175963, + "grad_norm": 0.5986067056655884, + "learning_rate": 8.445768159122028e-06, + "loss": 1.3676, + "mean_token_accuracy": 0.6652501175800959, + "num_tokens": 1843132075.0, + "step": 10995 + }, + { + "entropy": 1.7106738686561584, + "epoch": 1.2079591332289692, + "grad_norm": 0.6997463703155518, + "learning_rate": 8.44423290522273e-06, + "loss": 1.4677, + "mean_token_accuracy": 0.6429871618747711, + "num_tokens": 1843296882.0, + "step": 10996 + }, + { + "entropy": 1.7586935957272847, + "epoch": 1.208068990140342, + "grad_norm": 0.7393748164176941, + "learning_rate": 8.44269773221231e-06, + "loss": 1.447, + "mean_token_accuracy": 0.6556659440199534, + "num_tokens": 1843432063.0, + "step": 10997 + }, + { + "entropy": 1.685716986656189, + "epoch": 1.2081788470517152, + "grad_norm": 0.6346744894981384, + "learning_rate": 8.441162640139354e-06, + "loss": 1.4373, + "mean_token_accuracy": 0.6571964671214422, + "num_tokens": 1843608301.0, + "step": 10998 + }, + { + "entropy": 1.7133116920789082, + "epoch": 1.208288703963088, + "grad_norm": 0.7435621023178101, + "learning_rate": 8.439627629052446e-06, + "loss": 1.4443, + "mean_token_accuracy": 0.6661744117736816, + "num_tokens": 1843798714.0, + "step": 10999 + }, + { + "entropy": 1.619499186674754, + "epoch": 1.208398560874461, + "grad_norm": 0.6935999989509583, + "learning_rate": 8.438092699000172e-06, + "loss": 1.2591, + "mean_token_accuracy": 0.675690621137619, + "num_tokens": 1843939118.0, + "step": 11000 + }, + { + "entropy": 1.757647732893626, + "epoch": 1.208508417785834, + "grad_norm": 0.6426697373390198, + "learning_rate": 8.436557850031109e-06, + "loss": 1.4623, + "mean_token_accuracy": 0.6505621820688248, + "num_tokens": 1844111185.0, + "step": 11001 + }, + { + "entropy": 1.7143224676450093, + "epoch": 1.208618274697207, + "grad_norm": 0.7159050703048706, + "learning_rate": 8.435023082193834e-06, + "loss": 1.4011, + "mean_token_accuracy": 0.669948066274325, + "num_tokens": 1844278495.0, + "step": 11002 + }, + { + "entropy": 1.6677929162979126, + "epoch": 1.2087281316085798, + "grad_norm": 0.8504517674446106, + "learning_rate": 8.433488395536924e-06, + "loss": 1.4652, + "mean_token_accuracy": 0.6526702543099722, + "num_tokens": 1844426506.0, + "step": 11003 + }, + { + "entropy": 1.6635715464750926, + "epoch": 1.2088379885199527, + "grad_norm": 0.6439080834388733, + "learning_rate": 8.431953790108946e-06, + "loss": 1.3158, + "mean_token_accuracy": 0.6697449535131454, + "num_tokens": 1844581041.0, + "step": 11004 + }, + { + "entropy": 1.717222531636556, + "epoch": 1.2089478454313256, + "grad_norm": 0.5454255938529968, + "learning_rate": 8.430419265958481e-06, + "loss": 1.4645, + "mean_token_accuracy": 0.6585534413655599, + "num_tokens": 1844751255.0, + "step": 11005 + }, + { + "entropy": 1.7076788544654846, + "epoch": 1.2090577023426987, + "grad_norm": 0.7401055097579956, + "learning_rate": 8.42888482313408e-06, + "loss": 1.3668, + "mean_token_accuracy": 0.6625747780005137, + "num_tokens": 1844902026.0, + "step": 11006 + }, + { + "entropy": 1.707229753335317, + "epoch": 1.2091675592540716, + "grad_norm": 0.7380411624908447, + "learning_rate": 8.42735046168432e-06, + "loss": 1.2464, + "mean_token_accuracy": 0.6756529162327448, + "num_tokens": 1845022200.0, + "step": 11007 + }, + { + "entropy": 1.7667948305606842, + "epoch": 1.2092774161654445, + "grad_norm": 0.652038037776947, + "learning_rate": 8.42581618165776e-06, + "loss": 1.3872, + "mean_token_accuracy": 0.6549219787120819, + "num_tokens": 1845206105.0, + "step": 11008 + }, + { + "entropy": 1.6876067121823628, + "epoch": 1.2093872730768174, + "grad_norm": 0.7061187028884888, + "learning_rate": 8.424281983102956e-06, + "loss": 1.2664, + "mean_token_accuracy": 0.6709717114766439, + "num_tokens": 1845316113.0, + "step": 11009 + }, + { + "entropy": 1.7546610136826832, + "epoch": 1.2094971299881905, + "grad_norm": 0.7472836971282959, + "learning_rate": 8.422747866068464e-06, + "loss": 1.4804, + "mean_token_accuracy": 0.6357733458280563, + "num_tokens": 1845544449.0, + "step": 11010 + }, + { + "entropy": 1.7322336435317993, + "epoch": 1.2096069868995634, + "grad_norm": 0.6114717125892639, + "learning_rate": 8.421213830602846e-06, + "loss": 1.463, + "mean_token_accuracy": 0.6512684375047684, + "num_tokens": 1845756608.0, + "step": 11011 + }, + { + "entropy": 1.7614449659983318, + "epoch": 1.2097168438109362, + "grad_norm": 0.6005818843841553, + "learning_rate": 8.419679876754643e-06, + "loss": 1.5256, + "mean_token_accuracy": 0.6367160379886627, + "num_tokens": 1845970779.0, + "step": 11012 + }, + { + "entropy": 1.72640464703242, + "epoch": 1.2098267007223091, + "grad_norm": 0.7229748964309692, + "learning_rate": 8.418146004572412e-06, + "loss": 1.3674, + "mean_token_accuracy": 0.6622246205806732, + "num_tokens": 1846110227.0, + "step": 11013 + }, + { + "entropy": 1.6716107626756032, + "epoch": 1.2099365576336822, + "grad_norm": 0.6086723804473877, + "learning_rate": 8.416612214104695e-06, + "loss": 1.3978, + "mean_token_accuracy": 0.6556883007287979, + "num_tokens": 1846294061.0, + "step": 11014 + }, + { + "entropy": 1.6809994280338287, + "epoch": 1.2100464145450551, + "grad_norm": 0.7691161632537842, + "learning_rate": 8.415078505400041e-06, + "loss": 1.4725, + "mean_token_accuracy": 0.6410997360944748, + "num_tokens": 1846452635.0, + "step": 11015 + }, + { + "entropy": 1.6480069359143574, + "epoch": 1.210156271456428, + "grad_norm": 0.6271137595176697, + "learning_rate": 8.413544878506983e-06, + "loss": 1.5155, + "mean_token_accuracy": 0.64403468867143, + "num_tokens": 1846614016.0, + "step": 11016 + }, + { + "entropy": 1.6463837722937267, + "epoch": 1.210266128367801, + "grad_norm": 0.9350231885910034, + "learning_rate": 8.412011333474068e-06, + "loss": 1.4401, + "mean_token_accuracy": 0.6404287169377009, + "num_tokens": 1846826350.0, + "step": 11017 + }, + { + "entropy": 1.732763757308324, + "epoch": 1.2103759852791738, + "grad_norm": 0.7772718071937561, + "learning_rate": 8.410477870349825e-06, + "loss": 1.4685, + "mean_token_accuracy": 0.6515309810638428, + "num_tokens": 1846958581.0, + "step": 11018 + }, + { + "entropy": 1.721984734137853, + "epoch": 1.210485842190547, + "grad_norm": 0.6624974012374878, + "learning_rate": 8.408944489182791e-06, + "loss": 1.3843, + "mean_token_accuracy": 0.6445004592339197, + "num_tokens": 1847163105.0, + "step": 11019 + }, + { + "entropy": 1.690103272596995, + "epoch": 1.2105956991019198, + "grad_norm": 0.7604218125343323, + "learning_rate": 8.4074111900215e-06, + "loss": 1.448, + "mean_token_accuracy": 0.6505727221568426, + "num_tokens": 1847318720.0, + "step": 11020 + }, + { + "entropy": 1.673486590385437, + "epoch": 1.2107055560132927, + "grad_norm": 0.7544184923171997, + "learning_rate": 8.405877972914472e-06, + "loss": 1.5931, + "mean_token_accuracy": 0.6497178276379904, + "num_tokens": 1847464207.0, + "step": 11021 + }, + { + "entropy": 1.739910493294398, + "epoch": 1.2108154129246655, + "grad_norm": 0.82258141040802, + "learning_rate": 8.404344837910237e-06, + "loss": 1.5897, + "mean_token_accuracy": 0.6385711828867594, + "num_tokens": 1847632525.0, + "step": 11022 + }, + { + "entropy": 1.7209635078907013, + "epoch": 1.2109252698360387, + "grad_norm": 0.6614201068878174, + "learning_rate": 8.402811785057326e-06, + "loss": 1.399, + "mean_token_accuracy": 0.6512503723303477, + "num_tokens": 1847774532.0, + "step": 11023 + }, + { + "entropy": 1.7443317274252574, + "epoch": 1.2110351267474115, + "grad_norm": 0.6455691456794739, + "learning_rate": 8.40127881440424e-06, + "loss": 1.3867, + "mean_token_accuracy": 0.6450707316398621, + "num_tokens": 1847906774.0, + "step": 11024 + }, + { + "entropy": 1.6787743270397186, + "epoch": 1.2111449836587844, + "grad_norm": 0.7437204122543335, + "learning_rate": 8.399745925999517e-06, + "loss": 1.3348, + "mean_token_accuracy": 0.6639335205157598, + "num_tokens": 1848048858.0, + "step": 11025 + }, + { + "entropy": 1.7229706346988678, + "epoch": 1.2112548405701573, + "grad_norm": 0.6177759170532227, + "learning_rate": 8.39821311989166e-06, + "loss": 1.4536, + "mean_token_accuracy": 0.6526401787996292, + "num_tokens": 1848219021.0, + "step": 11026 + }, + { + "entropy": 1.7722203433513641, + "epoch": 1.2113646974815304, + "grad_norm": 0.6613593697547913, + "learning_rate": 8.396680396129189e-06, + "loss": 1.5069, + "mean_token_accuracy": 0.6412953784068426, + "num_tokens": 1848416777.0, + "step": 11027 + }, + { + "entropy": 1.7179987331231434, + "epoch": 1.2114745543929033, + "grad_norm": 0.7174702882766724, + "learning_rate": 8.395147754760604e-06, + "loss": 1.417, + "mean_token_accuracy": 0.6555820604165395, + "num_tokens": 1848589063.0, + "step": 11028 + }, + { + "entropy": 1.7602262993653615, + "epoch": 1.2115844113042762, + "grad_norm": 0.7947672009468079, + "learning_rate": 8.393615195834425e-06, + "loss": 1.4152, + "mean_token_accuracy": 0.6568863987922668, + "num_tokens": 1848739349.0, + "step": 11029 + }, + { + "entropy": 1.7349158922831218, + "epoch": 1.211694268215649, + "grad_norm": 0.6535570621490479, + "learning_rate": 8.392082719399146e-06, + "loss": 1.411, + "mean_token_accuracy": 0.6554250419139862, + "num_tokens": 1848886234.0, + "step": 11030 + }, + { + "entropy": 1.7216412425041199, + "epoch": 1.211804125127022, + "grad_norm": 0.6314913034439087, + "learning_rate": 8.390550325503276e-06, + "loss": 1.3573, + "mean_token_accuracy": 0.644097218910853, + "num_tokens": 1849037728.0, + "step": 11031 + }, + { + "entropy": 1.6654905676841736, + "epoch": 1.211913982038395, + "grad_norm": 0.5710697770118713, + "learning_rate": 8.389018014195316e-06, + "loss": 1.3456, + "mean_token_accuracy": 0.6632640808820724, + "num_tokens": 1849207967.0, + "step": 11032 + }, + { + "entropy": 1.7055143018563588, + "epoch": 1.212023838949768, + "grad_norm": 0.631976842880249, + "learning_rate": 8.387485785523755e-06, + "loss": 1.4246, + "mean_token_accuracy": 0.6355055769284567, + "num_tokens": 1849411860.0, + "step": 11033 + }, + { + "entropy": 1.719924658536911, + "epoch": 1.2121336958611408, + "grad_norm": 0.6693115830421448, + "learning_rate": 8.38595363953709e-06, + "loss": 1.4092, + "mean_token_accuracy": 0.6476466059684753, + "num_tokens": 1849617211.0, + "step": 11034 + }, + { + "entropy": 1.7284215490023296, + "epoch": 1.2122435527725137, + "grad_norm": 0.7452521324157715, + "learning_rate": 8.384421576283819e-06, + "loss": 1.4597, + "mean_token_accuracy": 0.6476357032855352, + "num_tokens": 1849792205.0, + "step": 11035 + }, + { + "entropy": 1.6705568730831146, + "epoch": 1.2123534096838868, + "grad_norm": 0.5901700854301453, + "learning_rate": 8.382889595812422e-06, + "loss": 1.3592, + "mean_token_accuracy": 0.6637533108393351, + "num_tokens": 1849966783.0, + "step": 11036 + }, + { + "entropy": 1.679235577583313, + "epoch": 1.2124632665952597, + "grad_norm": 0.7355685830116272, + "learning_rate": 8.381357698171392e-06, + "loss": 1.4727, + "mean_token_accuracy": 0.6543498982985815, + "num_tokens": 1850129797.0, + "step": 11037 + }, + { + "entropy": 1.6582284073034923, + "epoch": 1.2125731235066326, + "grad_norm": 0.7128838300704956, + "learning_rate": 8.379825883409213e-06, + "loss": 1.3672, + "mean_token_accuracy": 0.6587399691343307, + "num_tokens": 1850314612.0, + "step": 11038 + }, + { + "entropy": 1.7182820936044056, + "epoch": 1.2126829804180055, + "grad_norm": 0.9296267032623291, + "learning_rate": 8.378294151574362e-06, + "loss": 1.4561, + "mean_token_accuracy": 0.6555204093456268, + "num_tokens": 1850481038.0, + "step": 11039 + }, + { + "entropy": 1.6882583896319072, + "epoch": 1.2127928373293786, + "grad_norm": 0.7641075253486633, + "learning_rate": 8.376762502715318e-06, + "loss": 1.4607, + "mean_token_accuracy": 0.6370103309551874, + "num_tokens": 1850666045.0, + "step": 11040 + }, + { + "entropy": 1.7282833755016327, + "epoch": 1.2129026942407515, + "grad_norm": 0.6613611578941345, + "learning_rate": 8.375230936880562e-06, + "loss": 1.3988, + "mean_token_accuracy": 0.6569731831550598, + "num_tokens": 1850870551.0, + "step": 11041 + }, + { + "entropy": 1.7475760380427043, + "epoch": 1.2130125511521244, + "grad_norm": 0.7780677080154419, + "learning_rate": 8.373699454118562e-06, + "loss": 1.2312, + "mean_token_accuracy": 0.6765096088250478, + "num_tokens": 1850988764.0, + "step": 11042 + }, + { + "entropy": 1.7152721087137859, + "epoch": 1.2131224080634972, + "grad_norm": 0.6480224132537842, + "learning_rate": 8.372168054477791e-06, + "loss": 1.4921, + "mean_token_accuracy": 0.6434395660956701, + "num_tokens": 1851178116.0, + "step": 11043 + }, + { + "entropy": 1.6879489123821259, + "epoch": 1.2132322649748701, + "grad_norm": 0.6920694708824158, + "learning_rate": 8.370636738006721e-06, + "loss": 1.4977, + "mean_token_accuracy": 0.6338366170724233, + "num_tokens": 1851382859.0, + "step": 11044 + }, + { + "entropy": 1.6802996695041656, + "epoch": 1.2133421218862432, + "grad_norm": 0.6248618364334106, + "learning_rate": 8.369105504753809e-06, + "loss": 1.3379, + "mean_token_accuracy": 0.6674815913041433, + "num_tokens": 1851589206.0, + "step": 11045 + }, + { + "entropy": 1.7379266719023387, + "epoch": 1.2134519787976161, + "grad_norm": 0.7720683813095093, + "learning_rate": 8.367574354767522e-06, + "loss": 1.3548, + "mean_token_accuracy": 0.6515401800473531, + "num_tokens": 1851737929.0, + "step": 11046 + }, + { + "entropy": 1.7030988434950511, + "epoch": 1.213561835708989, + "grad_norm": 0.7740477919578552, + "learning_rate": 8.366043288096324e-06, + "loss": 1.4222, + "mean_token_accuracy": 0.6508887757857641, + "num_tokens": 1851939859.0, + "step": 11047 + }, + { + "entropy": 1.6766453782717388, + "epoch": 1.213671692620362, + "grad_norm": 0.6776142120361328, + "learning_rate": 8.364512304788664e-06, + "loss": 1.4908, + "mean_token_accuracy": 0.6671174516280493, + "num_tokens": 1852092528.0, + "step": 11048 + }, + { + "entropy": 1.6063755849997203, + "epoch": 1.213781549531735, + "grad_norm": 0.5421578884124756, + "learning_rate": 8.362981404893005e-06, + "loss": 1.5063, + "mean_token_accuracy": 0.6495156238476435, + "num_tokens": 1852332284.0, + "step": 11049 + }, + { + "entropy": 1.7231159309546153, + "epoch": 1.213891406443108, + "grad_norm": 0.8034752011299133, + "learning_rate": 8.361450588457798e-06, + "loss": 1.4637, + "mean_token_accuracy": 0.6396220078070959, + "num_tokens": 1852504941.0, + "step": 11050 + }, + { + "entropy": 1.759638677040736, + "epoch": 1.2140012633544808, + "grad_norm": 0.6964645981788635, + "learning_rate": 8.35991985553149e-06, + "loss": 1.3326, + "mean_token_accuracy": 0.6753019044796625, + "num_tokens": 1852671091.0, + "step": 11051 + }, + { + "entropy": 1.769709587097168, + "epoch": 1.2141111202658537, + "grad_norm": 0.6132997870445251, + "learning_rate": 8.358389206162525e-06, + "loss": 1.5146, + "mean_token_accuracy": 0.6466521521409353, + "num_tokens": 1852894196.0, + "step": 11052 + }, + { + "entropy": 1.7432759602864583, + "epoch": 1.2142209771772268, + "grad_norm": 0.7836261987686157, + "learning_rate": 8.356858640399354e-06, + "loss": 1.2905, + "mean_token_accuracy": 0.6691566308339437, + "num_tokens": 1853022071.0, + "step": 11053 + }, + { + "entropy": 1.6997772653897603, + "epoch": 1.2143308340885997, + "grad_norm": 0.6840148568153381, + "learning_rate": 8.355328158290415e-06, + "loss": 1.3656, + "mean_token_accuracy": 0.6704634875059128, + "num_tokens": 1853143220.0, + "step": 11054 + }, + { + "entropy": 1.725580135981242, + "epoch": 1.2144406909999725, + "grad_norm": 0.7722833752632141, + "learning_rate": 8.35379775988415e-06, + "loss": 1.571, + "mean_token_accuracy": 0.6398867269357046, + "num_tokens": 1853345019.0, + "step": 11055 + }, + { + "entropy": 1.6314020156860352, + "epoch": 1.2145505479113454, + "grad_norm": 0.892280101776123, + "learning_rate": 8.352267445228994e-06, + "loss": 1.4697, + "mean_token_accuracy": 0.6704972585042318, + "num_tokens": 1853539231.0, + "step": 11056 + }, + { + "entropy": 1.6972508529822032, + "epoch": 1.2146604048227183, + "grad_norm": 0.615267276763916, + "learning_rate": 8.350737214373379e-06, + "loss": 1.4657, + "mean_token_accuracy": 0.6365112711985906, + "num_tokens": 1853703622.0, + "step": 11057 + }, + { + "entropy": 1.6537209053834279, + "epoch": 1.2147702617340914, + "grad_norm": 0.6618078947067261, + "learning_rate": 8.349207067365737e-06, + "loss": 1.271, + "mean_token_accuracy": 0.6827361087004343, + "num_tokens": 1853882138.0, + "step": 11058 + }, + { + "entropy": 1.6730584800243378, + "epoch": 1.2148801186454643, + "grad_norm": 0.6180942058563232, + "learning_rate": 8.347677004254498e-06, + "loss": 1.332, + "mean_token_accuracy": 0.6662278970082601, + "num_tokens": 1854028245.0, + "step": 11059 + }, + { + "entropy": 1.6949761112531025, + "epoch": 1.2149899755568372, + "grad_norm": 0.6167245507240295, + "learning_rate": 8.346147025088086e-06, + "loss": 1.3307, + "mean_token_accuracy": 0.6576797862847646, + "num_tokens": 1854185316.0, + "step": 11060 + }, + { + "entropy": 1.7193231880664825, + "epoch": 1.21509983246821, + "grad_norm": 0.6622843146324158, + "learning_rate": 8.344617129914923e-06, + "loss": 1.528, + "mean_token_accuracy": 0.63414998849233, + "num_tokens": 1854426177.0, + "step": 11061 + }, + { + "entropy": 1.6867660681406658, + "epoch": 1.2152096893795832, + "grad_norm": 0.6055188775062561, + "learning_rate": 8.343087318783434e-06, + "loss": 1.4881, + "mean_token_accuracy": 0.6526160339514414, + "num_tokens": 1854610116.0, + "step": 11062 + }, + { + "entropy": 1.7061599691708882, + "epoch": 1.215319546290956, + "grad_norm": 0.680685818195343, + "learning_rate": 8.34155759174203e-06, + "loss": 1.4192, + "mean_token_accuracy": 0.6544150362412134, + "num_tokens": 1854755139.0, + "step": 11063 + }, + { + "entropy": 1.6967225869496663, + "epoch": 1.215429403202329, + "grad_norm": 0.6835984587669373, + "learning_rate": 8.340027948839135e-06, + "loss": 1.3267, + "mean_token_accuracy": 0.6695795605580012, + "num_tokens": 1854901981.0, + "step": 11064 + }, + { + "entropy": 1.7517095704873402, + "epoch": 1.2155392601137018, + "grad_norm": 0.8013256192207336, + "learning_rate": 8.338498390123158e-06, + "loss": 1.4562, + "mean_token_accuracy": 0.6485675225655237, + "num_tokens": 1855044987.0, + "step": 11065 + }, + { + "entropy": 1.7563750843207042, + "epoch": 1.215649117025075, + "grad_norm": 0.7665896415710449, + "learning_rate": 8.3369689156425e-06, + "loss": 1.3885, + "mean_token_accuracy": 0.6616611480712891, + "num_tokens": 1855188518.0, + "step": 11066 + }, + { + "entropy": 1.7391284902890523, + "epoch": 1.2157589739364478, + "grad_norm": 0.8238292336463928, + "learning_rate": 8.335439525445586e-06, + "loss": 1.3099, + "mean_token_accuracy": 0.6730262041091919, + "num_tokens": 1855310644.0, + "step": 11067 + }, + { + "entropy": 1.6750684281190236, + "epoch": 1.2158688308478207, + "grad_norm": 0.5978335738182068, + "learning_rate": 8.333910219580804e-06, + "loss": 1.4055, + "mean_token_accuracy": 0.6556108246246973, + "num_tokens": 1855483085.0, + "step": 11068 + }, + { + "entropy": 1.6777072350184123, + "epoch": 1.2159786877591936, + "grad_norm": 0.7173717617988586, + "learning_rate": 8.332380998096561e-06, + "loss": 1.4097, + "mean_token_accuracy": 0.6578392386436462, + "num_tokens": 1855663183.0, + "step": 11069 + }, + { + "entropy": 1.7260019779205322, + "epoch": 1.2160885446705665, + "grad_norm": 0.7578794956207275, + "learning_rate": 8.330851861041262e-06, + "loss": 1.339, + "mean_token_accuracy": 0.6605818818012873, + "num_tokens": 1855792392.0, + "step": 11070 + }, + { + "entropy": 1.6965550482273102, + "epoch": 1.2161984015819396, + "grad_norm": 0.6967483162879944, + "learning_rate": 8.329322808463294e-06, + "loss": 1.4111, + "mean_token_accuracy": 0.660852442185084, + "num_tokens": 1855970158.0, + "step": 11071 + }, + { + "entropy": 1.7621172269185383, + "epoch": 1.2163082584933125, + "grad_norm": 0.6754755973815918, + "learning_rate": 8.327793840411056e-06, + "loss": 1.3595, + "mean_token_accuracy": 0.6517157753308614, + "num_tokens": 1856107207.0, + "step": 11072 + }, + { + "entropy": 1.727922797203064, + "epoch": 1.2164181154046854, + "grad_norm": 0.7006334662437439, + "learning_rate": 8.326264956932946e-06, + "loss": 1.4191, + "mean_token_accuracy": 0.6418725997209549, + "num_tokens": 1856296668.0, + "step": 11073 + }, + { + "entropy": 1.6740870575110118, + "epoch": 1.2165279723160582, + "grad_norm": 0.5527358055114746, + "learning_rate": 8.324736158077338e-06, + "loss": 1.1741, + "mean_token_accuracy": 0.6742709130048752, + "num_tokens": 1856484822.0, + "step": 11074 + }, + { + "entropy": 1.6873212854067485, + "epoch": 1.2166378292274314, + "grad_norm": 0.6192285418510437, + "learning_rate": 8.323207443892626e-06, + "loss": 1.3717, + "mean_token_accuracy": 0.6412427127361298, + "num_tokens": 1856734436.0, + "step": 11075 + }, + { + "entropy": 1.7283975680669148, + "epoch": 1.2167476861388042, + "grad_norm": 0.6958233714103699, + "learning_rate": 8.321678814427195e-06, + "loss": 1.5502, + "mean_token_accuracy": 0.6426873902479807, + "num_tokens": 1856955797.0, + "step": 11076 + }, + { + "entropy": 1.7120100259780884, + "epoch": 1.2168575430501771, + "grad_norm": 0.7286651134490967, + "learning_rate": 8.320150269729421e-06, + "loss": 1.436, + "mean_token_accuracy": 0.6634295533100764, + "num_tokens": 1857107820.0, + "step": 11077 + }, + { + "entropy": 1.6884620587031047, + "epoch": 1.21696739996155, + "grad_norm": 0.8027754426002502, + "learning_rate": 8.318621809847682e-06, + "loss": 1.4571, + "mean_token_accuracy": 0.6525517205397288, + "num_tokens": 1857282314.0, + "step": 11078 + }, + { + "entropy": 1.7306037942568462, + "epoch": 1.2170772568729231, + "grad_norm": 0.755138635635376, + "learning_rate": 8.317093434830358e-06, + "loss": 1.1819, + "mean_token_accuracy": 0.6871931801239649, + "num_tokens": 1857387879.0, + "step": 11079 + }, + { + "entropy": 1.693780501683553, + "epoch": 1.217187113784296, + "grad_norm": 0.7477782368659973, + "learning_rate": 8.315565144725814e-06, + "loss": 1.4305, + "mean_token_accuracy": 0.667605901757876, + "num_tokens": 1857536614.0, + "step": 11080 + }, + { + "entropy": 1.7177151342233021, + "epoch": 1.217296970695669, + "grad_norm": 0.6408316493034363, + "learning_rate": 8.314036939582426e-06, + "loss": 1.4695, + "mean_token_accuracy": 0.6495102594296137, + "num_tokens": 1857700887.0, + "step": 11081 + }, + { + "entropy": 1.6869684358437855, + "epoch": 1.2174068276070418, + "grad_norm": 0.745124340057373, + "learning_rate": 8.31250881944856e-06, + "loss": 1.3735, + "mean_token_accuracy": 0.6654743601878484, + "num_tokens": 1857869919.0, + "step": 11082 + }, + { + "entropy": 1.690779209136963, + "epoch": 1.2175166845184147, + "grad_norm": 0.6019642949104309, + "learning_rate": 8.310980784372576e-06, + "loss": 1.3871, + "mean_token_accuracy": 0.6539642065763474, + "num_tokens": 1858056352.0, + "step": 11083 + }, + { + "entropy": 1.6384514768918355, + "epoch": 1.2176265414297878, + "grad_norm": 0.66316157579422, + "learning_rate": 8.309452834402837e-06, + "loss": 1.34, + "mean_token_accuracy": 0.6631773859262466, + "num_tokens": 1858232023.0, + "step": 11084 + }, + { + "entropy": 1.7274446388085682, + "epoch": 1.2177363983411607, + "grad_norm": 0.7292064428329468, + "learning_rate": 8.307924969587708e-06, + "loss": 1.5255, + "mean_token_accuracy": 0.6418920457363129, + "num_tokens": 1858412774.0, + "step": 11085 + }, + { + "entropy": 1.6863858600457509, + "epoch": 1.2178462552525335, + "grad_norm": 0.7607459425926208, + "learning_rate": 8.306397189975537e-06, + "loss": 1.3769, + "mean_token_accuracy": 0.6505500276883444, + "num_tokens": 1858530907.0, + "step": 11086 + }, + { + "entropy": 1.713003009557724, + "epoch": 1.2179561121639064, + "grad_norm": 0.6406744718551636, + "learning_rate": 8.30486949561468e-06, + "loss": 1.4627, + "mean_token_accuracy": 0.6432670553525289, + "num_tokens": 1858701857.0, + "step": 11087 + }, + { + "entropy": 1.7706784307956696, + "epoch": 1.2180659690752795, + "grad_norm": 0.6789109706878662, + "learning_rate": 8.303341886553493e-06, + "loss": 1.3834, + "mean_token_accuracy": 0.6596761445204417, + "num_tokens": 1858891212.0, + "step": 11088 + }, + { + "entropy": 1.6831317842006683, + "epoch": 1.2181758259866524, + "grad_norm": 0.6298303604125977, + "learning_rate": 8.30181436284032e-06, + "loss": 1.312, + "mean_token_accuracy": 0.6730460574229559, + "num_tokens": 1859017132.0, + "step": 11089 + }, + { + "entropy": 1.6755876143773396, + "epoch": 1.2182856828980253, + "grad_norm": 1.0117133855819702, + "learning_rate": 8.300286924523505e-06, + "loss": 1.4366, + "mean_token_accuracy": 0.6528118550777435, + "num_tokens": 1859168864.0, + "step": 11090 + }, + { + "entropy": 1.632968008518219, + "epoch": 1.2183955398093982, + "grad_norm": 0.6477782130241394, + "learning_rate": 8.298759571651393e-06, + "loss": 1.4088, + "mean_token_accuracy": 0.6646546920140585, + "num_tokens": 1859317350.0, + "step": 11091 + }, + { + "entropy": 1.697382648785909, + "epoch": 1.2185053967207713, + "grad_norm": 0.7350544333457947, + "learning_rate": 8.297232304272322e-06, + "loss": 1.4274, + "mean_token_accuracy": 0.6546217650175095, + "num_tokens": 1859504084.0, + "step": 11092 + }, + { + "entropy": 1.6902291178703308, + "epoch": 1.2186152536321442, + "grad_norm": 0.7430658340454102, + "learning_rate": 8.295705122434633e-06, + "loss": 1.3245, + "mean_token_accuracy": 0.6620542804400126, + "num_tokens": 1859649852.0, + "step": 11093 + }, + { + "entropy": 1.7104488511880238, + "epoch": 1.218725110543517, + "grad_norm": 0.7023297548294067, + "learning_rate": 8.294178026186656e-06, + "loss": 1.3796, + "mean_token_accuracy": 0.6658419122298559, + "num_tokens": 1859776385.0, + "step": 11094 + }, + { + "entropy": 1.6531602640946705, + "epoch": 1.21883496745489, + "grad_norm": 0.6301870346069336, + "learning_rate": 8.292651015576725e-06, + "loss": 1.3238, + "mean_token_accuracy": 0.6639458288749059, + "num_tokens": 1859912333.0, + "step": 11095 + }, + { + "entropy": 1.7121768792470295, + "epoch": 1.2189448243662628, + "grad_norm": 0.7700769901275635, + "learning_rate": 8.29112409065317e-06, + "loss": 1.435, + "mean_token_accuracy": 0.6486354172229767, + "num_tokens": 1860057917.0, + "step": 11096 + }, + { + "entropy": 1.6702754994233449, + "epoch": 1.219054681277636, + "grad_norm": 0.834185004234314, + "learning_rate": 8.289597251464319e-06, + "loss": 1.4033, + "mean_token_accuracy": 0.6532324800888697, + "num_tokens": 1860209091.0, + "step": 11097 + }, + { + "entropy": 1.6551378965377808, + "epoch": 1.2191645381890088, + "grad_norm": 0.7919728755950928, + "learning_rate": 8.288070498058489e-06, + "loss": 1.5648, + "mean_token_accuracy": 0.6503797471523285, + "num_tokens": 1860389834.0, + "step": 11098 + }, + { + "entropy": 1.687700519959132, + "epoch": 1.2192743951003817, + "grad_norm": 0.699600100517273, + "learning_rate": 8.28654383048401e-06, + "loss": 1.2284, + "mean_token_accuracy": 0.6715284287929535, + "num_tokens": 1860507119.0, + "step": 11099 + }, + { + "entropy": 1.6870764593283336, + "epoch": 1.2193842520117546, + "grad_norm": 0.6864370107650757, + "learning_rate": 8.285017248789195e-06, + "loss": 1.3806, + "mean_token_accuracy": 0.6525691151618958, + "num_tokens": 1860656756.0, + "step": 11100 + }, + { + "entropy": 1.795731355746587, + "epoch": 1.2194941089231277, + "grad_norm": 0.8872252106666565, + "learning_rate": 8.28349075302236e-06, + "loss": 1.5156, + "mean_token_accuracy": 0.6348374287287394, + "num_tokens": 1860800299.0, + "step": 11101 + }, + { + "entropy": 1.6798087656497955, + "epoch": 1.2196039658345006, + "grad_norm": 0.6114014983177185, + "learning_rate": 8.281964343231817e-06, + "loss": 1.3712, + "mean_token_accuracy": 0.6489862948656082, + "num_tokens": 1860986779.0, + "step": 11102 + }, + { + "entropy": 1.7513733704884846, + "epoch": 1.2197138227458735, + "grad_norm": 0.6035370826721191, + "learning_rate": 8.280438019465885e-06, + "loss": 1.4784, + "mean_token_accuracy": 0.6494489560524622, + "num_tokens": 1861163872.0, + "step": 11103 + }, + { + "entropy": 1.7338022689024608, + "epoch": 1.2198236796572464, + "grad_norm": 0.7351298928260803, + "learning_rate": 8.278911781772853e-06, + "loss": 1.3004, + "mean_token_accuracy": 0.6633716921011606, + "num_tokens": 1861310542.0, + "step": 11104 + }, + { + "entropy": 1.7447912494341533, + "epoch": 1.2199335365686195, + "grad_norm": 0.5645570755004883, + "learning_rate": 8.277385630201044e-06, + "loss": 1.4266, + "mean_token_accuracy": 0.6452751606702805, + "num_tokens": 1861542153.0, + "step": 11105 + }, + { + "entropy": 1.6788997650146484, + "epoch": 1.2200433934799924, + "grad_norm": 0.7873282432556152, + "learning_rate": 8.275859564798753e-06, + "loss": 1.5012, + "mean_token_accuracy": 0.6227647066116333, + "num_tokens": 1861776129.0, + "step": 11106 + }, + { + "entropy": 1.7607338031133015, + "epoch": 1.2201532503913652, + "grad_norm": 0.6690042614936829, + "learning_rate": 8.274333585614278e-06, + "loss": 1.4915, + "mean_token_accuracy": 0.6434455215930939, + "num_tokens": 1861967058.0, + "step": 11107 + }, + { + "entropy": 1.675374945004781, + "epoch": 1.2202631073027381, + "grad_norm": 0.6949226260185242, + "learning_rate": 8.272807692695915e-06, + "loss": 1.3347, + "mean_token_accuracy": 0.6678502013285955, + "num_tokens": 1862107356.0, + "step": 11108 + }, + { + "entropy": 1.6946699917316437, + "epoch": 1.220372964214111, + "grad_norm": 0.7122815251350403, + "learning_rate": 8.271281886091964e-06, + "loss": 1.349, + "mean_token_accuracy": 0.6702584276596705, + "num_tokens": 1862267288.0, + "step": 11109 + }, + { + "entropy": 1.6867165565490723, + "epoch": 1.2204828211254841, + "grad_norm": 0.7338141202926636, + "learning_rate": 8.26975616585071e-06, + "loss": 1.3542, + "mean_token_accuracy": 0.6660454173882803, + "num_tokens": 1862425941.0, + "step": 11110 + }, + { + "entropy": 1.7361581027507782, + "epoch": 1.220592678036857, + "grad_norm": 0.7320640087127686, + "learning_rate": 8.26823053202044e-06, + "loss": 1.412, + "mean_token_accuracy": 0.6694223483403524, + "num_tokens": 1862582085.0, + "step": 11111 + }, + { + "entropy": 1.5928312540054321, + "epoch": 1.22070253494823, + "grad_norm": 0.5795355439186096, + "learning_rate": 8.266704984649448e-06, + "loss": 1.2941, + "mean_token_accuracy": 0.6710518797238668, + "num_tokens": 1862754268.0, + "step": 11112 + }, + { + "entropy": 1.790160854657491, + "epoch": 1.2208123918596028, + "grad_norm": 0.7169445753097534, + "learning_rate": 8.265179523786007e-06, + "loss": 1.3867, + "mean_token_accuracy": 0.655024250348409, + "num_tokens": 1862879401.0, + "step": 11113 + }, + { + "entropy": 1.7564424475034077, + "epoch": 1.2209222487709759, + "grad_norm": 0.7270147204399109, + "learning_rate": 8.263654149478404e-06, + "loss": 1.4396, + "mean_token_accuracy": 0.6577896674474081, + "num_tokens": 1863038450.0, + "step": 11114 + }, + { + "entropy": 1.7379739979902904, + "epoch": 1.2210321056823488, + "grad_norm": 0.6270740032196045, + "learning_rate": 8.262128861774914e-06, + "loss": 1.4605, + "mean_token_accuracy": 0.6530610223611196, + "num_tokens": 1863206326.0, + "step": 11115 + }, + { + "entropy": 1.7211223940054576, + "epoch": 1.2211419625937217, + "grad_norm": 0.7160316109657288, + "learning_rate": 8.260603660723809e-06, + "loss": 1.3408, + "mean_token_accuracy": 0.6656116793553034, + "num_tokens": 1863350263.0, + "step": 11116 + }, + { + "entropy": 1.7302058239777882, + "epoch": 1.2212518195050945, + "grad_norm": 0.6913062334060669, + "learning_rate": 8.259078546373365e-06, + "loss": 1.3812, + "mean_token_accuracy": 0.6563667754332224, + "num_tokens": 1863492291.0, + "step": 11117 + }, + { + "entropy": 1.6666455070177715, + "epoch": 1.2213616764164676, + "grad_norm": 0.5929701328277588, + "learning_rate": 8.257553518771853e-06, + "loss": 1.3796, + "mean_token_accuracy": 0.6561521291732788, + "num_tokens": 1863680842.0, + "step": 11118 + }, + { + "entropy": 1.691912164290746, + "epoch": 1.2214715333278405, + "grad_norm": 0.6996101140975952, + "learning_rate": 8.256028577967534e-06, + "loss": 1.4237, + "mean_token_accuracy": 0.652123952905337, + "num_tokens": 1863838255.0, + "step": 11119 + }, + { + "entropy": 1.67883962392807, + "epoch": 1.2215813902392134, + "grad_norm": 0.6681597828865051, + "learning_rate": 8.254503724008673e-06, + "loss": 1.4035, + "mean_token_accuracy": 0.6579047491153082, + "num_tokens": 1863976314.0, + "step": 11120 + }, + { + "entropy": 1.761623462041219, + "epoch": 1.2216912471505863, + "grad_norm": 0.6309159398078918, + "learning_rate": 8.252978956943536e-06, + "loss": 1.5039, + "mean_token_accuracy": 0.636713887254397, + "num_tokens": 1864175243.0, + "step": 11121 + }, + { + "entropy": 1.722734143336614, + "epoch": 1.2218011040619592, + "grad_norm": 0.5722051858901978, + "learning_rate": 8.251454276820372e-06, + "loss": 1.259, + "mean_token_accuracy": 0.6658644527196884, + "num_tokens": 1864306155.0, + "step": 11122 + }, + { + "entropy": 1.6461964547634125, + "epoch": 1.2219109609733323, + "grad_norm": 0.6660195589065552, + "learning_rate": 8.249929683687442e-06, + "loss": 1.3394, + "mean_token_accuracy": 0.669757604598999, + "num_tokens": 1864445396.0, + "step": 11123 + }, + { + "entropy": 1.677109609047572, + "epoch": 1.2220208178847052, + "grad_norm": 0.7361236810684204, + "learning_rate": 8.248405177593005e-06, + "loss": 1.5038, + "mean_token_accuracy": 0.6409556319316229, + "num_tokens": 1864633037.0, + "step": 11124 + }, + { + "entropy": 1.7299526433149974, + "epoch": 1.222130674796078, + "grad_norm": 0.8260616064071655, + "learning_rate": 8.246880758585299e-06, + "loss": 1.3426, + "mean_token_accuracy": 0.6657137821118037, + "num_tokens": 1864758296.0, + "step": 11125 + }, + { + "entropy": 1.7556905547777812, + "epoch": 1.222240531707451, + "grad_norm": 0.9507250785827637, + "learning_rate": 8.245356426712577e-06, + "loss": 1.5266, + "mean_token_accuracy": 0.6378592848777771, + "num_tokens": 1864935775.0, + "step": 11126 + }, + { + "entropy": 1.6686455806096394, + "epoch": 1.222350388618824, + "grad_norm": 0.6535077691078186, + "learning_rate": 8.243832182023082e-06, + "loss": 1.4378, + "mean_token_accuracy": 0.6565053512652715, + "num_tokens": 1865122257.0, + "step": 11127 + }, + { + "entropy": 1.6927911341190338, + "epoch": 1.222460245530197, + "grad_norm": 0.579563319683075, + "learning_rate": 8.242308024565058e-06, + "loss": 1.4441, + "mean_token_accuracy": 0.6382209062576294, + "num_tokens": 1865351730.0, + "step": 11128 + }, + { + "entropy": 1.695607751607895, + "epoch": 1.2225701024415698, + "grad_norm": 0.7512062191963196, + "learning_rate": 8.240783954386744e-06, + "loss": 1.4194, + "mean_token_accuracy": 0.6544724305470785, + "num_tokens": 1865476451.0, + "step": 11129 + }, + { + "entropy": 1.7077071964740753, + "epoch": 1.2226799593529427, + "grad_norm": 0.6465796828269958, + "learning_rate": 8.239259971536369e-06, + "loss": 1.3455, + "mean_token_accuracy": 0.6696978360414505, + "num_tokens": 1865637091.0, + "step": 11130 + }, + { + "entropy": 1.7845915853977203, + "epoch": 1.2227898162643158, + "grad_norm": 0.7069242000579834, + "learning_rate": 8.237736076062176e-06, + "loss": 1.4593, + "mean_token_accuracy": 0.6528904487689337, + "num_tokens": 1865780627.0, + "step": 11131 + }, + { + "entropy": 1.643855979045232, + "epoch": 1.2228996731756887, + "grad_norm": 0.6371172070503235, + "learning_rate": 8.23621226801239e-06, + "loss": 1.3646, + "mean_token_accuracy": 0.6664480765660604, + "num_tokens": 1865934135.0, + "step": 11132 + }, + { + "entropy": 1.7372296055157979, + "epoch": 1.2230095300870616, + "grad_norm": 0.8932238817214966, + "learning_rate": 8.23468854743524e-06, + "loss": 1.533, + "mean_token_accuracy": 0.6475943475961685, + "num_tokens": 1866099365.0, + "step": 11133 + }, + { + "entropy": 1.6458615064620972, + "epoch": 1.2231193869984345, + "grad_norm": 0.6451045870780945, + "learning_rate": 8.233164914378952e-06, + "loss": 1.3967, + "mean_token_accuracy": 0.661471222837766, + "num_tokens": 1866274426.0, + "step": 11134 + }, + { + "entropy": 1.7578480541706085, + "epoch": 1.2232292439098074, + "grad_norm": 0.770330548286438, + "learning_rate": 8.231641368891752e-06, + "loss": 1.5281, + "mean_token_accuracy": 0.6454629898071289, + "num_tokens": 1866492310.0, + "step": 11135 + }, + { + "entropy": 1.7039073308308919, + "epoch": 1.2233391008211805, + "grad_norm": 0.7558161020278931, + "learning_rate": 8.230117911021849e-06, + "loss": 1.2976, + "mean_token_accuracy": 0.6695977548758189, + "num_tokens": 1866623727.0, + "step": 11136 + }, + { + "entropy": 1.674913187821706, + "epoch": 1.2234489577325534, + "grad_norm": 0.6112053394317627, + "learning_rate": 8.228594540817467e-06, + "loss": 1.3014, + "mean_token_accuracy": 0.6781783352295557, + "num_tokens": 1866761880.0, + "step": 11137 + }, + { + "entropy": 1.6501458883285522, + "epoch": 1.2235588146439262, + "grad_norm": 0.6094418168067932, + "learning_rate": 8.227071258326823e-06, + "loss": 1.4306, + "mean_token_accuracy": 0.6528994739055634, + "num_tokens": 1867008257.0, + "step": 11138 + }, + { + "entropy": 1.7520569463570912, + "epoch": 1.2236686715552993, + "grad_norm": 0.7769097089767456, + "learning_rate": 8.22554806359812e-06, + "loss": 1.2928, + "mean_token_accuracy": 0.6684681624174118, + "num_tokens": 1867113624.0, + "step": 11139 + }, + { + "entropy": 1.6467416286468506, + "epoch": 1.2237785284666722, + "grad_norm": 0.6554421782493591, + "learning_rate": 8.224024956679568e-06, + "loss": 1.2857, + "mean_token_accuracy": 0.66878113647302, + "num_tokens": 1867252361.0, + "step": 11140 + }, + { + "entropy": 1.75765860080719, + "epoch": 1.2238883853780451, + "grad_norm": 0.8180747628211975, + "learning_rate": 8.222501937619385e-06, + "loss": 1.4596, + "mean_token_accuracy": 0.6483653237422308, + "num_tokens": 1867380976.0, + "step": 11141 + }, + { + "entropy": 1.746919463078181, + "epoch": 1.223998242289418, + "grad_norm": 0.63518887758255, + "learning_rate": 8.220979006465755e-06, + "loss": 1.4453, + "mean_token_accuracy": 0.6484291801850001, + "num_tokens": 1867523470.0, + "step": 11142 + }, + { + "entropy": 1.7108286619186401, + "epoch": 1.2241080992007909, + "grad_norm": 0.7541074156761169, + "learning_rate": 8.219456163266891e-06, + "loss": 1.2723, + "mean_token_accuracy": 0.6758786340554556, + "num_tokens": 1867627150.0, + "step": 11143 + }, + { + "entropy": 1.703292191028595, + "epoch": 1.224217956112164, + "grad_norm": 0.6642011404037476, + "learning_rate": 8.217933408070985e-06, + "loss": 1.3838, + "mean_token_accuracy": 0.6647111773490906, + "num_tokens": 1867821226.0, + "step": 11144 + }, + { + "entropy": 1.722439835468928, + "epoch": 1.2243278130235369, + "grad_norm": 0.7255253791809082, + "learning_rate": 8.216410740926235e-06, + "loss": 1.5162, + "mean_token_accuracy": 0.6462632616360983, + "num_tokens": 1868008940.0, + "step": 11145 + }, + { + "entropy": 1.7716669142246246, + "epoch": 1.2244376699349098, + "grad_norm": 0.6904542446136475, + "learning_rate": 8.214888161880827e-06, + "loss": 1.3692, + "mean_token_accuracy": 0.659872904419899, + "num_tokens": 1868166158.0, + "step": 11146 + }, + { + "entropy": 1.7202934126059215, + "epoch": 1.2245475268462827, + "grad_norm": 0.6671558022499084, + "learning_rate": 8.21336567098296e-06, + "loss": 1.2963, + "mean_token_accuracy": 0.6699869285027186, + "num_tokens": 1868311826.0, + "step": 11147 + }, + { + "entropy": 1.684409538904826, + "epoch": 1.2246573837576555, + "grad_norm": 0.6063627004623413, + "learning_rate": 8.211843268280807e-06, + "loss": 1.38, + "mean_token_accuracy": 0.6571643104155859, + "num_tokens": 1868495561.0, + "step": 11148 + }, + { + "entropy": 1.6580710808436077, + "epoch": 1.2247672406690286, + "grad_norm": 0.6459930539131165, + "learning_rate": 8.210320953822561e-06, + "loss": 1.377, + "mean_token_accuracy": 0.6583688110113144, + "num_tokens": 1868664866.0, + "step": 11149 + }, + { + "entropy": 1.6917679210503895, + "epoch": 1.2248770975804015, + "grad_norm": 0.8134970664978027, + "learning_rate": 8.208798727656404e-06, + "loss": 1.3967, + "mean_token_accuracy": 0.6652498145898184, + "num_tokens": 1868894590.0, + "step": 11150 + }, + { + "entropy": 1.6833031276861827, + "epoch": 1.2249869544917744, + "grad_norm": 0.6595972180366516, + "learning_rate": 8.207276589830505e-06, + "loss": 1.4866, + "mean_token_accuracy": 0.6401710361242294, + "num_tokens": 1869068031.0, + "step": 11151 + }, + { + "entropy": 1.701577494541804, + "epoch": 1.2250968114031475, + "grad_norm": 0.6729449033737183, + "learning_rate": 8.20575454039304e-06, + "loss": 1.4412, + "mean_token_accuracy": 0.6557190865278244, + "num_tokens": 1869229821.0, + "step": 11152 + }, + { + "entropy": 1.7543394267559052, + "epoch": 1.2252066683145204, + "grad_norm": 0.7445177435874939, + "learning_rate": 8.204232579392192e-06, + "loss": 1.2951, + "mean_token_accuracy": 0.669405405720075, + "num_tokens": 1869350127.0, + "step": 11153 + }, + { + "entropy": 1.7157810529073079, + "epoch": 1.2253165252258933, + "grad_norm": 0.712943971157074, + "learning_rate": 8.20271070687612e-06, + "loss": 1.4664, + "mean_token_accuracy": 0.64886274933815, + "num_tokens": 1869538791.0, + "step": 11154 + }, + { + "entropy": 1.6712701618671417, + "epoch": 1.2254263821372662, + "grad_norm": 0.6949601769447327, + "learning_rate": 8.201188922892994e-06, + "loss": 1.3177, + "mean_token_accuracy": 0.6590605328480402, + "num_tokens": 1869677641.0, + "step": 11155 + }, + { + "entropy": 1.6515525877475739, + "epoch": 1.225536239048639, + "grad_norm": 0.6297810673713684, + "learning_rate": 8.199667227490978e-06, + "loss": 1.2314, + "mean_token_accuracy": 0.6822561621665955, + "num_tokens": 1869820121.0, + "step": 11156 + }, + { + "entropy": 1.72104745109876, + "epoch": 1.2256460959600122, + "grad_norm": 0.582068681716919, + "learning_rate": 8.198145620718229e-06, + "loss": 1.4739, + "mean_token_accuracy": 0.6453680694103241, + "num_tokens": 1870094410.0, + "step": 11157 + }, + { + "entropy": 1.7204219698905945, + "epoch": 1.225755952871385, + "grad_norm": 0.9038074612617493, + "learning_rate": 8.19662410262291e-06, + "loss": 1.4717, + "mean_token_accuracy": 0.6598606556653976, + "num_tokens": 1870217099.0, + "step": 11158 + }, + { + "entropy": 1.705298662185669, + "epoch": 1.225865809782758, + "grad_norm": 0.956987738609314, + "learning_rate": 8.195102673253179e-06, + "loss": 1.3267, + "mean_token_accuracy": 0.674546500047048, + "num_tokens": 1870363296.0, + "step": 11159 + }, + { + "entropy": 1.7120076020558674, + "epoch": 1.2259756666941308, + "grad_norm": 0.6030857563018799, + "learning_rate": 8.19358133265718e-06, + "loss": 1.4096, + "mean_token_accuracy": 0.6453298330307007, + "num_tokens": 1870546904.0, + "step": 11160 + }, + { + "entropy": 1.6742028892040253, + "epoch": 1.2260855236055037, + "grad_norm": 0.7126300930976868, + "learning_rate": 8.192060080883066e-06, + "loss": 1.4757, + "mean_token_accuracy": 0.6451009213924408, + "num_tokens": 1870746388.0, + "step": 11161 + }, + { + "entropy": 1.6565779447555542, + "epoch": 1.2261953805168768, + "grad_norm": 0.6220477223396301, + "learning_rate": 8.19053891797899e-06, + "loss": 1.5842, + "mean_token_accuracy": 0.6242297689119974, + "num_tokens": 1870984793.0, + "step": 11162 + }, + { + "entropy": 1.7263220647970836, + "epoch": 1.2263052374282497, + "grad_norm": 0.7451938390731812, + "learning_rate": 8.189017843993087e-06, + "loss": 1.3359, + "mean_token_accuracy": 0.6607558329900106, + "num_tokens": 1871172441.0, + "step": 11163 + }, + { + "entropy": 1.7222507695357006, + "epoch": 1.2264150943396226, + "grad_norm": 0.6210897564888, + "learning_rate": 8.187496858973504e-06, + "loss": 1.4375, + "mean_token_accuracy": 0.6482439885536829, + "num_tokens": 1871366509.0, + "step": 11164 + }, + { + "entropy": 1.6978266040484111, + "epoch": 1.2265249512509957, + "grad_norm": 0.6470620036125183, + "learning_rate": 8.185975962968382e-06, + "loss": 1.329, + "mean_token_accuracy": 0.6652160336573919, + "num_tokens": 1871583409.0, + "step": 11165 + }, + { + "entropy": 1.7219670116901398, + "epoch": 1.2266348081623686, + "grad_norm": 0.6918816566467285, + "learning_rate": 8.184455156025849e-06, + "loss": 1.5423, + "mean_token_accuracy": 0.6453223278125128, + "num_tokens": 1871795247.0, + "step": 11166 + }, + { + "entropy": 1.655881514151891, + "epoch": 1.2267446650737415, + "grad_norm": 0.5830437541007996, + "learning_rate": 8.182934438194039e-06, + "loss": 1.3384, + "mean_token_accuracy": 0.6642249425252279, + "num_tokens": 1872026942.0, + "step": 11167 + }, + { + "entropy": 1.6868476569652557, + "epoch": 1.2268545219851144, + "grad_norm": 0.7208216190338135, + "learning_rate": 8.18141380952109e-06, + "loss": 1.4512, + "mean_token_accuracy": 0.6498266657193502, + "num_tokens": 1872200498.0, + "step": 11168 + }, + { + "entropy": 1.7578080296516418, + "epoch": 1.2269643788964872, + "grad_norm": 0.6857250332832336, + "learning_rate": 8.179893270055122e-06, + "loss": 1.3811, + "mean_token_accuracy": 0.6548460274934769, + "num_tokens": 1872368081.0, + "step": 11169 + }, + { + "entropy": 1.6785810391108196, + "epoch": 1.2270742358078603, + "grad_norm": 0.6952616572380066, + "learning_rate": 8.178372819844258e-06, + "loss": 1.2608, + "mean_token_accuracy": 0.6776244093974432, + "num_tokens": 1872510704.0, + "step": 11170 + }, + { + "entropy": 1.7037639617919922, + "epoch": 1.2271840927192332, + "grad_norm": 0.7573713064193726, + "learning_rate": 8.176852458936628e-06, + "loss": 1.4666, + "mean_token_accuracy": 0.6548537611961365, + "num_tokens": 1872669770.0, + "step": 11171 + }, + { + "entropy": 1.6827135582764943, + "epoch": 1.2272939496306061, + "grad_norm": 0.6930450201034546, + "learning_rate": 8.175332187380341e-06, + "loss": 1.3069, + "mean_token_accuracy": 0.66745425760746, + "num_tokens": 1872875315.0, + "step": 11172 + }, + { + "entropy": 1.7051290174325306, + "epoch": 1.227403806541979, + "grad_norm": 0.6210904121398926, + "learning_rate": 8.173812005223517e-06, + "loss": 1.2625, + "mean_token_accuracy": 0.6816918949286143, + "num_tokens": 1873033536.0, + "step": 11173 + }, + { + "entropy": 1.7081526120503743, + "epoch": 1.2275136634533519, + "grad_norm": 0.6397086977958679, + "learning_rate": 8.172291912514274e-06, + "loss": 1.473, + "mean_token_accuracy": 0.663579652706782, + "num_tokens": 1873186734.0, + "step": 11174 + }, + { + "entropy": 1.7037063737710316, + "epoch": 1.227623520364725, + "grad_norm": 0.6888397336006165, + "learning_rate": 8.170771909300716e-06, + "loss": 1.5498, + "mean_token_accuracy": 0.6241617798805237, + "num_tokens": 1873388660.0, + "step": 11175 + }, + { + "entropy": 1.6468111673990886, + "epoch": 1.2277333772760979, + "grad_norm": 0.6850365996360779, + "learning_rate": 8.169251995630948e-06, + "loss": 1.3269, + "mean_token_accuracy": 0.6640166540940603, + "num_tokens": 1873634673.0, + "step": 11176 + }, + { + "entropy": 1.7099710702896118, + "epoch": 1.2278432341874708, + "grad_norm": 0.6371767520904541, + "learning_rate": 8.167732171553088e-06, + "loss": 1.341, + "mean_token_accuracy": 0.6740860641002655, + "num_tokens": 1873780882.0, + "step": 11177 + }, + { + "entropy": 1.7181882460912068, + "epoch": 1.2279530910988439, + "grad_norm": 0.7756669521331787, + "learning_rate": 8.166212437115221e-06, + "loss": 1.403, + "mean_token_accuracy": 0.6458031634489695, + "num_tokens": 1873933442.0, + "step": 11178 + }, + { + "entropy": 1.6694627106189728, + "epoch": 1.2280629480102168, + "grad_norm": 0.8991198539733887, + "learning_rate": 8.164692792365456e-06, + "loss": 1.3021, + "mean_token_accuracy": 0.6779783020416895, + "num_tokens": 1874056969.0, + "step": 11179 + }, + { + "entropy": 1.7401012182235718, + "epoch": 1.2281728049215896, + "grad_norm": 0.7417164444923401, + "learning_rate": 8.163173237351887e-06, + "loss": 1.434, + "mean_token_accuracy": 0.6551551967859268, + "num_tokens": 1874216033.0, + "step": 11180 + }, + { + "entropy": 1.7963026364644368, + "epoch": 1.2282826618329625, + "grad_norm": 0.7083638310432434, + "learning_rate": 8.161653772122607e-06, + "loss": 1.4688, + "mean_token_accuracy": 0.6413846760988235, + "num_tokens": 1874412535.0, + "step": 11181 + }, + { + "entropy": 1.6756743987401326, + "epoch": 1.2283925187443354, + "grad_norm": 0.6325013637542725, + "learning_rate": 8.1601343967257e-06, + "loss": 1.3387, + "mean_token_accuracy": 0.6656516889731089, + "num_tokens": 1874560478.0, + "step": 11182 + }, + { + "entropy": 1.6859131356080372, + "epoch": 1.2285023756557085, + "grad_norm": 0.6860812306404114, + "learning_rate": 8.15861511120927e-06, + "loss": 1.3272, + "mean_token_accuracy": 0.6739522715409597, + "num_tokens": 1874709470.0, + "step": 11183 + }, + { + "entropy": 1.7304639220237732, + "epoch": 1.2286122325670814, + "grad_norm": 1.0082952976226807, + "learning_rate": 8.157095915621382e-06, + "loss": 1.5461, + "mean_token_accuracy": 0.6445205509662628, + "num_tokens": 1874878019.0, + "step": 11184 + }, + { + "entropy": 1.7110270063082378, + "epoch": 1.2287220894784543, + "grad_norm": 0.6818872690200806, + "learning_rate": 8.155576810010131e-06, + "loss": 1.6461, + "mean_token_accuracy": 0.6105376332998276, + "num_tokens": 1875092932.0, + "step": 11185 + }, + { + "entropy": 1.6794796387354534, + "epoch": 1.2288319463898272, + "grad_norm": 0.5344785451889038, + "learning_rate": 8.154057794423595e-06, + "loss": 1.3906, + "mean_token_accuracy": 0.6523840377728144, + "num_tokens": 1875304235.0, + "step": 11186 + }, + { + "entropy": 1.635409543911616, + "epoch": 1.2289418033012, + "grad_norm": 0.5482689738273621, + "learning_rate": 8.152538868909846e-06, + "loss": 1.4172, + "mean_token_accuracy": 0.6563707540432612, + "num_tokens": 1875523446.0, + "step": 11187 + }, + { + "entropy": 1.7209465603033702, + "epoch": 1.2290516602125732, + "grad_norm": 0.8221262693405151, + "learning_rate": 8.151020033516957e-06, + "loss": 1.4545, + "mean_token_accuracy": 0.6479563862085342, + "num_tokens": 1875685022.0, + "step": 11188 + }, + { + "entropy": 1.7244952420393627, + "epoch": 1.229161517123946, + "grad_norm": 0.7386845350265503, + "learning_rate": 8.149501288293e-06, + "loss": 1.3956, + "mean_token_accuracy": 0.6560295174519221, + "num_tokens": 1875850083.0, + "step": 11189 + }, + { + "entropy": 1.6882909337679546, + "epoch": 1.229271374035319, + "grad_norm": 0.5964418649673462, + "learning_rate": 8.147982633286043e-06, + "loss": 1.4545, + "mean_token_accuracy": 0.6443512588739395, + "num_tokens": 1876051887.0, + "step": 11190 + }, + { + "entropy": 1.7120668391386669, + "epoch": 1.229381230946692, + "grad_norm": 0.7184486389160156, + "learning_rate": 8.146464068544153e-06, + "loss": 1.4313, + "mean_token_accuracy": 0.6619121432304382, + "num_tokens": 1876193353.0, + "step": 11191 + }, + { + "entropy": 1.715154270331065, + "epoch": 1.229491087858065, + "grad_norm": 0.7384195923805237, + "learning_rate": 8.144945594115386e-06, + "loss": 1.4344, + "mean_token_accuracy": 0.6491910715897878, + "num_tokens": 1876348250.0, + "step": 11192 + }, + { + "entropy": 1.7401968638102214, + "epoch": 1.2296009447694378, + "grad_norm": 0.7795338034629822, + "learning_rate": 8.143427210047806e-06, + "loss": 1.3542, + "mean_token_accuracy": 0.6642808963855108, + "num_tokens": 1876487873.0, + "step": 11193 + }, + { + "entropy": 1.6769267618656158, + "epoch": 1.2297108016808107, + "grad_norm": 0.7037333846092224, + "learning_rate": 8.14190891638947e-06, + "loss": 1.2072, + "mean_token_accuracy": 0.681295191248258, + "num_tokens": 1876634540.0, + "step": 11194 + }, + { + "entropy": 1.7220933934052784, + "epoch": 1.2298206585921836, + "grad_norm": 0.7242723107337952, + "learning_rate": 8.140390713188425e-06, + "loss": 1.3561, + "mean_token_accuracy": 0.6651198863983154, + "num_tokens": 1876792228.0, + "step": 11195 + }, + { + "entropy": 1.6575371026992798, + "epoch": 1.2299305155035567, + "grad_norm": 0.5735102295875549, + "learning_rate": 8.138872600492725e-06, + "loss": 1.4878, + "mean_token_accuracy": 0.644407923022906, + "num_tokens": 1877007455.0, + "step": 11196 + }, + { + "entropy": 1.711806943019231, + "epoch": 1.2300403724149296, + "grad_norm": 0.5998605489730835, + "learning_rate": 8.137354578350422e-06, + "loss": 1.5256, + "mean_token_accuracy": 0.6423748483260473, + "num_tokens": 1877208212.0, + "step": 11197 + }, + { + "entropy": 1.6850987871487935, + "epoch": 1.2301502293263025, + "grad_norm": 0.5823908448219299, + "learning_rate": 8.135836646809552e-06, + "loss": 1.3704, + "mean_token_accuracy": 0.658658762772878, + "num_tokens": 1877387686.0, + "step": 11198 + }, + { + "entropy": 1.7524065176645915, + "epoch": 1.2302600862376754, + "grad_norm": 0.6456050872802734, + "learning_rate": 8.134318805918161e-06, + "loss": 1.4089, + "mean_token_accuracy": 0.6498519033193588, + "num_tokens": 1877539996.0, + "step": 11199 + }, + { + "entropy": 1.786929150422414, + "epoch": 1.2303699431490482, + "grad_norm": 0.7778921723365784, + "learning_rate": 8.132801055724296e-06, + "loss": 1.6354, + "mean_token_accuracy": 0.6156754593054453, + "num_tokens": 1877784918.0, + "step": 11200 + }, + { + "entropy": 1.7587116559346516, + "epoch": 1.2304798000604213, + "grad_norm": 0.7484762668609619, + "learning_rate": 8.13128339627598e-06, + "loss": 1.3407, + "mean_token_accuracy": 0.6659899353981018, + "num_tokens": 1877989211.0, + "step": 11201 + }, + { + "entropy": 1.6641751329104106, + "epoch": 1.2305896569717942, + "grad_norm": 0.8541742563247681, + "learning_rate": 8.12976582762125e-06, + "loss": 1.4287, + "mean_token_accuracy": 0.6577843030293783, + "num_tokens": 1878183278.0, + "step": 11202 + }, + { + "entropy": 1.6898448566595714, + "epoch": 1.2306995138831671, + "grad_norm": 0.7239437103271484, + "learning_rate": 8.128248349808143e-06, + "loss": 1.2639, + "mean_token_accuracy": 0.6711952984333038, + "num_tokens": 1878308690.0, + "step": 11203 + }, + { + "entropy": 1.6802580654621124, + "epoch": 1.2308093707945402, + "grad_norm": 0.6512843370437622, + "learning_rate": 8.12673096288468e-06, + "loss": 1.5386, + "mean_token_accuracy": 0.6389039307832718, + "num_tokens": 1878465206.0, + "step": 11204 + }, + { + "entropy": 1.724786251783371, + "epoch": 1.230919227705913, + "grad_norm": 0.7077043652534485, + "learning_rate": 8.125213666898886e-06, + "loss": 1.3819, + "mean_token_accuracy": 0.6506198197603226, + "num_tokens": 1878611726.0, + "step": 11205 + }, + { + "entropy": 1.7447443306446075, + "epoch": 1.231029084617286, + "grad_norm": 0.6581472754478455, + "learning_rate": 8.123696461898785e-06, + "loss": 1.318, + "mean_token_accuracy": 0.6526310493548712, + "num_tokens": 1878761057.0, + "step": 11206 + }, + { + "entropy": 1.741285651922226, + "epoch": 1.2311389415286589, + "grad_norm": 0.7155635356903076, + "learning_rate": 8.122179347932396e-06, + "loss": 1.5159, + "mean_token_accuracy": 0.6463326240579287, + "num_tokens": 1878956011.0, + "step": 11207 + }, + { + "entropy": 1.7428237795829773, + "epoch": 1.2312487984400318, + "grad_norm": 0.7881234288215637, + "learning_rate": 8.12066232504773e-06, + "loss": 1.5306, + "mean_token_accuracy": 0.6414182931184769, + "num_tokens": 1879152697.0, + "step": 11208 + }, + { + "entropy": 1.6375042895476024, + "epoch": 1.2313586553514049, + "grad_norm": 0.622815728187561, + "learning_rate": 8.119145393292808e-06, + "loss": 1.3191, + "mean_token_accuracy": 0.6815162648757299, + "num_tokens": 1879303949.0, + "step": 11209 + }, + { + "entropy": 1.6750175754229228, + "epoch": 1.2314685122627778, + "grad_norm": 0.6067901253700256, + "learning_rate": 8.117628552715636e-06, + "loss": 1.4323, + "mean_token_accuracy": 0.6625420202811559, + "num_tokens": 1879486772.0, + "step": 11210 + }, + { + "entropy": 1.7117481927076976, + "epoch": 1.2315783691741506, + "grad_norm": 0.6861073970794678, + "learning_rate": 8.116111803364218e-06, + "loss": 1.325, + "mean_token_accuracy": 0.6644338915745417, + "num_tokens": 1879628385.0, + "step": 11211 + }, + { + "entropy": 1.7356761197249095, + "epoch": 1.2316882260855235, + "grad_norm": 0.7062935829162598, + "learning_rate": 8.114595145286565e-06, + "loss": 1.3774, + "mean_token_accuracy": 0.6549742966890335, + "num_tokens": 1879808474.0, + "step": 11212 + }, + { + "entropy": 1.6221475005149841, + "epoch": 1.2317980829968966, + "grad_norm": 0.7179040908813477, + "learning_rate": 8.113078578530676e-06, + "loss": 1.4577, + "mean_token_accuracy": 0.6649397065242132, + "num_tokens": 1879965479.0, + "step": 11213 + }, + { + "entropy": 1.6686547497908275, + "epoch": 1.2319079399082695, + "grad_norm": 0.6317336559295654, + "learning_rate": 8.111562103144543e-06, + "loss": 1.3409, + "mean_token_accuracy": 0.6715045968691508, + "num_tokens": 1880102350.0, + "step": 11214 + }, + { + "entropy": 1.6721225877602894, + "epoch": 1.2320177968196424, + "grad_norm": 0.6598741412162781, + "learning_rate": 8.110045719176178e-06, + "loss": 1.4653, + "mean_token_accuracy": 0.650575632850329, + "num_tokens": 1880285478.0, + "step": 11215 + }, + { + "entropy": 1.7288126051425934, + "epoch": 1.2321276537310153, + "grad_norm": 0.7576711773872375, + "learning_rate": 8.108529426673555e-06, + "loss": 1.4545, + "mean_token_accuracy": 0.6471219807863235, + "num_tokens": 1880471549.0, + "step": 11216 + }, + { + "entropy": 1.749824732542038, + "epoch": 1.2322375106423884, + "grad_norm": 0.7423568367958069, + "learning_rate": 8.107013225684678e-06, + "loss": 1.3571, + "mean_token_accuracy": 0.6579922884702682, + "num_tokens": 1880652106.0, + "step": 11217 + }, + { + "entropy": 1.6819796562194824, + "epoch": 1.2323473675537613, + "grad_norm": 0.7285211682319641, + "learning_rate": 8.105497116257526e-06, + "loss": 1.3604, + "mean_token_accuracy": 0.6721114267905554, + "num_tokens": 1880810988.0, + "step": 11218 + }, + { + "entropy": 1.7526112000147502, + "epoch": 1.2324572244651342, + "grad_norm": 0.7563691139221191, + "learning_rate": 8.103981098440087e-06, + "loss": 1.405, + "mean_token_accuracy": 0.668835868438085, + "num_tokens": 1880927999.0, + "step": 11219 + }, + { + "entropy": 1.678017516930898, + "epoch": 1.232567081376507, + "grad_norm": 0.7735137939453125, + "learning_rate": 8.10246517228034e-06, + "loss": 1.3019, + "mean_token_accuracy": 0.675841843088468, + "num_tokens": 1881086954.0, + "step": 11220 + }, + { + "entropy": 1.6915496389071147, + "epoch": 1.23267693828788, + "grad_norm": 0.6371824145317078, + "learning_rate": 8.100949337826267e-06, + "loss": 1.3869, + "mean_token_accuracy": 0.6606535166501999, + "num_tokens": 1881251528.0, + "step": 11221 + }, + { + "entropy": 1.6786122421423595, + "epoch": 1.232786795199253, + "grad_norm": 0.7070814967155457, + "learning_rate": 8.099433595125838e-06, + "loss": 1.3486, + "mean_token_accuracy": 0.6716959228118261, + "num_tokens": 1881373248.0, + "step": 11222 + }, + { + "entropy": 1.7694110969702403, + "epoch": 1.232896652110626, + "grad_norm": 0.6588417887687683, + "learning_rate": 8.097917944227031e-06, + "loss": 1.4997, + "mean_token_accuracy": 0.6302092870076498, + "num_tokens": 1881541933.0, + "step": 11223 + }, + { + "entropy": 1.689767171939214, + "epoch": 1.2330065090219988, + "grad_norm": 0.8221830129623413, + "learning_rate": 8.096402385177816e-06, + "loss": 1.4524, + "mean_token_accuracy": 0.6705189446608225, + "num_tokens": 1881695653.0, + "step": 11224 + }, + { + "entropy": 1.7141635119915009, + "epoch": 1.2331163659333717, + "grad_norm": 0.6804819107055664, + "learning_rate": 8.094886918026153e-06, + "loss": 1.305, + "mean_token_accuracy": 0.6599769194920858, + "num_tokens": 1881827348.0, + "step": 11225 + }, + { + "entropy": 1.6513873438040416, + "epoch": 1.2332262228447448, + "grad_norm": 0.6210925579071045, + "learning_rate": 8.093371542820007e-06, + "loss": 1.3637, + "mean_token_accuracy": 0.6525876174370447, + "num_tokens": 1881985777.0, + "step": 11226 + }, + { + "entropy": 1.683081477880478, + "epoch": 1.2333360797561177, + "grad_norm": 0.7101804614067078, + "learning_rate": 8.09185625960735e-06, + "loss": 1.2141, + "mean_token_accuracy": 0.6824707140525182, + "num_tokens": 1882099438.0, + "step": 11227 + }, + { + "entropy": 1.7311066389083862, + "epoch": 1.2334459366674906, + "grad_norm": 0.7459114789962769, + "learning_rate": 8.090341068436125e-06, + "loss": 1.3144, + "mean_token_accuracy": 0.6670710841814677, + "num_tokens": 1882267699.0, + "step": 11228 + }, + { + "entropy": 1.6882832149664562, + "epoch": 1.2335557935788635, + "grad_norm": 0.714763879776001, + "learning_rate": 8.088825969354298e-06, + "loss": 1.2732, + "mean_token_accuracy": 0.6782094736893972, + "num_tokens": 1882453057.0, + "step": 11229 + }, + { + "entropy": 1.684934099515279, + "epoch": 1.2336656504902366, + "grad_norm": 0.638083279132843, + "learning_rate": 8.087310962409818e-06, + "loss": 1.341, + "mean_token_accuracy": 0.661807561914126, + "num_tokens": 1882603582.0, + "step": 11230 + }, + { + "entropy": 1.7063041031360626, + "epoch": 1.2337755074016095, + "grad_norm": 0.6284477710723877, + "learning_rate": 8.085796047650632e-06, + "loss": 1.5387, + "mean_token_accuracy": 0.6299227277437845, + "num_tokens": 1882851492.0, + "step": 11231 + }, + { + "entropy": 1.7208701372146606, + "epoch": 1.2338853643129823, + "grad_norm": 0.7093353867530823, + "learning_rate": 8.084281225124684e-06, + "loss": 1.4154, + "mean_token_accuracy": 0.6541569431622823, + "num_tokens": 1882989394.0, + "step": 11232 + }, + { + "entropy": 1.798000564177831, + "epoch": 1.2339952212243552, + "grad_norm": 0.834276556968689, + "learning_rate": 8.082766494879928e-06, + "loss": 1.5977, + "mean_token_accuracy": 0.6478389153877894, + "num_tokens": 1883119346.0, + "step": 11233 + }, + { + "entropy": 1.7449569801489513, + "epoch": 1.2341050781357281, + "grad_norm": 0.6619470715522766, + "learning_rate": 8.081251856964291e-06, + "loss": 1.3306, + "mean_token_accuracy": 0.6545126388470331, + "num_tokens": 1883247560.0, + "step": 11234 + }, + { + "entropy": 1.7160409688949585, + "epoch": 1.2342149350471012, + "grad_norm": 0.736487865447998, + "learning_rate": 8.079737311425723e-06, + "loss": 1.4371, + "mean_token_accuracy": 0.6482950200637182, + "num_tokens": 1883402069.0, + "step": 11235 + }, + { + "entropy": 1.7530849079291027, + "epoch": 1.234324791958474, + "grad_norm": 0.8390946984291077, + "learning_rate": 8.078222858312152e-06, + "loss": 1.504, + "mean_token_accuracy": 0.6466073642174403, + "num_tokens": 1883577023.0, + "step": 11236 + }, + { + "entropy": 1.7000373403231304, + "epoch": 1.234434648869847, + "grad_norm": 0.6646814942359924, + "learning_rate": 8.07670849767151e-06, + "loss": 1.2625, + "mean_token_accuracy": 0.6718258758385977, + "num_tokens": 1883700619.0, + "step": 11237 + }, + { + "entropy": 1.745482623577118, + "epoch": 1.2345445057812199, + "grad_norm": 0.6695995330810547, + "learning_rate": 8.075194229551726e-06, + "loss": 1.3949, + "mean_token_accuracy": 0.6459067513545355, + "num_tokens": 1883864316.0, + "step": 11238 + }, + { + "entropy": 1.6707193851470947, + "epoch": 1.234654362692593, + "grad_norm": 0.6652836799621582, + "learning_rate": 8.073680054000733e-06, + "loss": 1.497, + "mean_token_accuracy": 0.6416679819424947, + "num_tokens": 1884073039.0, + "step": 11239 + }, + { + "entropy": 1.73182346423467, + "epoch": 1.2347642196039659, + "grad_norm": 0.7183116674423218, + "learning_rate": 8.07216597106644e-06, + "loss": 1.3477, + "mean_token_accuracy": 0.6592358897129694, + "num_tokens": 1884211467.0, + "step": 11240 + }, + { + "entropy": 1.6830492317676544, + "epoch": 1.2348740765153388, + "grad_norm": 0.6599522233009338, + "learning_rate": 8.070651980796775e-06, + "loss": 1.4568, + "mean_token_accuracy": 0.6476858655611674, + "num_tokens": 1884378829.0, + "step": 11241 + }, + { + "entropy": 1.686943491299947, + "epoch": 1.2349839334267116, + "grad_norm": 0.607049286365509, + "learning_rate": 8.06913808323966e-06, + "loss": 1.5163, + "mean_token_accuracy": 0.6283295204242071, + "num_tokens": 1884603951.0, + "step": 11242 + }, + { + "entropy": 1.705921232700348, + "epoch": 1.2350937903380848, + "grad_norm": 0.7713742852210999, + "learning_rate": 8.067624278443e-06, + "loss": 1.4968, + "mean_token_accuracy": 0.6453157613674799, + "num_tokens": 1884801772.0, + "step": 11243 + }, + { + "entropy": 1.751990258693695, + "epoch": 1.2352036472494576, + "grad_norm": 0.8362163305282593, + "learning_rate": 8.06611056645471e-06, + "loss": 1.446, + "mean_token_accuracy": 0.653899297118187, + "num_tokens": 1884960483.0, + "step": 11244 + }, + { + "entropy": 1.643377035856247, + "epoch": 1.2353135041608305, + "grad_norm": 0.5995488166809082, + "learning_rate": 8.064596947322703e-06, + "loss": 1.429, + "mean_token_accuracy": 0.6459860801696777, + "num_tokens": 1885182089.0, + "step": 11245 + }, + { + "entropy": 1.6707975268363953, + "epoch": 1.2354233610722034, + "grad_norm": 0.6349611282348633, + "learning_rate": 8.063083421094875e-06, + "loss": 1.3166, + "mean_token_accuracy": 0.6706758588552475, + "num_tokens": 1885352722.0, + "step": 11246 + }, + { + "entropy": 1.73516180117925, + "epoch": 1.2355332179835763, + "grad_norm": 0.6147273182868958, + "learning_rate": 8.061569987819138e-06, + "loss": 1.4639, + "mean_token_accuracy": 0.6467922131220499, + "num_tokens": 1885566345.0, + "step": 11247 + }, + { + "entropy": 1.716229885816574, + "epoch": 1.2356430748949494, + "grad_norm": 0.7673629522323608, + "learning_rate": 8.060056647543382e-06, + "loss": 1.4798, + "mean_token_accuracy": 0.6467408984899521, + "num_tokens": 1885762755.0, + "step": 11248 + }, + { + "entropy": 1.7514924108982086, + "epoch": 1.2357529318063223, + "grad_norm": 0.7376429438591003, + "learning_rate": 8.058543400315511e-06, + "loss": 1.336, + "mean_token_accuracy": 0.6571770658095678, + "num_tokens": 1885914416.0, + "step": 11249 + }, + { + "entropy": 1.7116054991881053, + "epoch": 1.2358627887176952, + "grad_norm": 1.5102351903915405, + "learning_rate": 8.057030246183416e-06, + "loss": 1.6694, + "mean_token_accuracy": 0.6353745808204015, + "num_tokens": 1886111948.0, + "step": 11250 + }, + { + "entropy": 1.6769183973471324, + "epoch": 1.235972645629068, + "grad_norm": 0.9520527720451355, + "learning_rate": 8.055517185194988e-06, + "loss": 1.3784, + "mean_token_accuracy": 0.6644560744365057, + "num_tokens": 1886245502.0, + "step": 11251 + }, + { + "entropy": 1.7583904763062794, + "epoch": 1.2360825025404412, + "grad_norm": 0.810713529586792, + "learning_rate": 8.054004217398108e-06, + "loss": 1.4918, + "mean_token_accuracy": 0.6517674972613653, + "num_tokens": 1886405065.0, + "step": 11252 + }, + { + "entropy": 1.6664839486281078, + "epoch": 1.236192359451814, + "grad_norm": 0.6370511651039124, + "learning_rate": 8.052491342840677e-06, + "loss": 1.3647, + "mean_token_accuracy": 0.6623385399580002, + "num_tokens": 1886553072.0, + "step": 11253 + }, + { + "entropy": 1.737219403187434, + "epoch": 1.236302216363187, + "grad_norm": 0.7130185961723328, + "learning_rate": 8.05097856157056e-06, + "loss": 1.3521, + "mean_token_accuracy": 0.6642791330814362, + "num_tokens": 1886689924.0, + "step": 11254 + }, + { + "entropy": 1.7893791596094768, + "epoch": 1.2364120732745598, + "grad_norm": 0.6922145485877991, + "learning_rate": 8.049465873635644e-06, + "loss": 1.4279, + "mean_token_accuracy": 0.6482027868429819, + "num_tokens": 1886819576.0, + "step": 11255 + }, + { + "entropy": 1.7212933500607808, + "epoch": 1.236521930185933, + "grad_norm": 0.6548290848731995, + "learning_rate": 8.047953279083805e-06, + "loss": 1.4333, + "mean_token_accuracy": 0.650567352771759, + "num_tokens": 1887004217.0, + "step": 11256 + }, + { + "entropy": 1.7187994420528412, + "epoch": 1.2366317870973058, + "grad_norm": 0.6271427273750305, + "learning_rate": 8.046440777962914e-06, + "loss": 1.4241, + "mean_token_accuracy": 0.6418495823939642, + "num_tokens": 1887209350.0, + "step": 11257 + }, + { + "entropy": 1.7576852043469746, + "epoch": 1.2367416440086787, + "grad_norm": 0.7661997675895691, + "learning_rate": 8.044928370320837e-06, + "loss": 1.5276, + "mean_token_accuracy": 0.6394319285949072, + "num_tokens": 1887413418.0, + "step": 11258 + }, + { + "entropy": 1.7255164881547291, + "epoch": 1.2368515009200516, + "grad_norm": 0.7709239721298218, + "learning_rate": 8.043416056205453e-06, + "loss": 1.4078, + "mean_token_accuracy": 0.6649090001980463, + "num_tokens": 1887560944.0, + "step": 11259 + }, + { + "entropy": 1.692698359489441, + "epoch": 1.2369613578314245, + "grad_norm": 0.8086570501327515, + "learning_rate": 8.041903835664615e-06, + "loss": 1.4357, + "mean_token_accuracy": 0.6539787004391352, + "num_tokens": 1887718710.0, + "step": 11260 + }, + { + "entropy": 1.7619259258111317, + "epoch": 1.2370712147427976, + "grad_norm": 0.7824429869651794, + "learning_rate": 8.040391708746186e-06, + "loss": 1.6336, + "mean_token_accuracy": 0.6352614412705103, + "num_tokens": 1887896912.0, + "step": 11261 + }, + { + "entropy": 1.6718494693438213, + "epoch": 1.2371810716541705, + "grad_norm": 0.7123764157295227, + "learning_rate": 8.038879675498031e-06, + "loss": 1.3994, + "mean_token_accuracy": 0.6606058677037557, + "num_tokens": 1888048701.0, + "step": 11262 + }, + { + "entropy": 1.6848260561625164, + "epoch": 1.2372909285655433, + "grad_norm": 0.7577449679374695, + "learning_rate": 8.037367735967995e-06, + "loss": 1.522, + "mean_token_accuracy": 0.6498318860928217, + "num_tokens": 1888223514.0, + "step": 11263 + }, + { + "entropy": 1.6410026550292969, + "epoch": 1.2374007854769162, + "grad_norm": 0.6510109901428223, + "learning_rate": 8.035855890203934e-06, + "loss": 1.4112, + "mean_token_accuracy": 0.6637563705444336, + "num_tokens": 1888407644.0, + "step": 11264 + }, + { + "entropy": 1.7332176466782887, + "epoch": 1.2375106423882893, + "grad_norm": 0.791204571723938, + "learning_rate": 8.034344138253704e-06, + "loss": 1.2795, + "mean_token_accuracy": 0.6705978065729141, + "num_tokens": 1888559368.0, + "step": 11265 + }, + { + "entropy": 1.7388821343580882, + "epoch": 1.2376204992996622, + "grad_norm": 0.7957805395126343, + "learning_rate": 8.03283248016514e-06, + "loss": 1.4571, + "mean_token_accuracy": 0.6462946683168411, + "num_tokens": 1888761523.0, + "step": 11266 + }, + { + "entropy": 1.749239871899287, + "epoch": 1.237730356211035, + "grad_norm": 0.7862349152565002, + "learning_rate": 8.031320915986093e-06, + "loss": 1.2856, + "mean_token_accuracy": 0.6639771660168966, + "num_tokens": 1888914049.0, + "step": 11267 + }, + { + "entropy": 1.637138585249583, + "epoch": 1.237840213122408, + "grad_norm": 0.6658058762550354, + "learning_rate": 8.029809445764404e-06, + "loss": 1.3354, + "mean_token_accuracy": 0.661085252960523, + "num_tokens": 1889102287.0, + "step": 11268 + }, + { + "entropy": 1.769773135582606, + "epoch": 1.237950070033781, + "grad_norm": 0.8036999702453613, + "learning_rate": 8.028298069547907e-06, + "loss": 1.5082, + "mean_token_accuracy": 0.6557409813006719, + "num_tokens": 1889224418.0, + "step": 11269 + }, + { + "entropy": 1.7286994357903798, + "epoch": 1.238059926945154, + "grad_norm": 0.7675381898880005, + "learning_rate": 8.02678678738443e-06, + "loss": 1.5319, + "mean_token_accuracy": 0.6377677967151006, + "num_tokens": 1889439985.0, + "step": 11270 + }, + { + "entropy": 1.681524654229482, + "epoch": 1.2381697838565269, + "grad_norm": 0.702340304851532, + "learning_rate": 8.025275599321825e-06, + "loss": 1.3766, + "mean_token_accuracy": 0.6608841866254807, + "num_tokens": 1889599434.0, + "step": 11271 + }, + { + "entropy": 1.6762547592322032, + "epoch": 1.2382796407678998, + "grad_norm": 0.6304272413253784, + "learning_rate": 8.023764505407894e-06, + "loss": 1.4182, + "mean_token_accuracy": 0.6476298222939173, + "num_tokens": 1889807142.0, + "step": 11272 + }, + { + "entropy": 1.7101092040538788, + "epoch": 1.2383894976792726, + "grad_norm": 0.675635814666748, + "learning_rate": 8.02225350569048e-06, + "loss": 1.3755, + "mean_token_accuracy": 0.6752298523982366, + "num_tokens": 1889984200.0, + "step": 11273 + }, + { + "entropy": 1.7226931552092235, + "epoch": 1.2384993545906458, + "grad_norm": 0.7080081701278687, + "learning_rate": 8.020742600217403e-06, + "loss": 1.4787, + "mean_token_accuracy": 0.6550223429997762, + "num_tokens": 1890184448.0, + "step": 11274 + }, + { + "entropy": 1.723098615805308, + "epoch": 1.2386092115020186, + "grad_norm": 0.7677369713783264, + "learning_rate": 8.019231789036477e-06, + "loss": 1.4064, + "mean_token_accuracy": 0.6541879673798879, + "num_tokens": 1890351047.0, + "step": 11275 + }, + { + "entropy": 1.6581893960634868, + "epoch": 1.2387190684133915, + "grad_norm": 0.6451082229614258, + "learning_rate": 8.017721072195522e-06, + "loss": 1.5279, + "mean_token_accuracy": 0.6365671356519064, + "num_tokens": 1890543514.0, + "step": 11276 + }, + { + "entropy": 1.7340434888998668, + "epoch": 1.2388289253247644, + "grad_norm": 0.7805740237236023, + "learning_rate": 8.016210449742354e-06, + "loss": 1.3005, + "mean_token_accuracy": 0.6726719886064529, + "num_tokens": 1890661486.0, + "step": 11277 + }, + { + "entropy": 1.6500552793343861, + "epoch": 1.2389387822361375, + "grad_norm": 0.658091127872467, + "learning_rate": 8.014699921724777e-06, + "loss": 1.424, + "mean_token_accuracy": 0.6632284422715505, + "num_tokens": 1890861914.0, + "step": 11278 + }, + { + "entropy": 1.768718143304189, + "epoch": 1.2390486391475104, + "grad_norm": 0.6783964037895203, + "learning_rate": 8.013189488190605e-06, + "loss": 1.4826, + "mean_token_accuracy": 0.6467755486567816, + "num_tokens": 1891080586.0, + "step": 11279 + }, + { + "entropy": 1.7763389150301616, + "epoch": 1.2391584960588833, + "grad_norm": 0.688116729259491, + "learning_rate": 8.01167914918764e-06, + "loss": 1.5137, + "mean_token_accuracy": 0.6396682957808176, + "num_tokens": 1891230652.0, + "step": 11280 + }, + { + "entropy": 1.7367208699385326, + "epoch": 1.2392683529702562, + "grad_norm": 0.688133180141449, + "learning_rate": 8.010168904763681e-06, + "loss": 1.5204, + "mean_token_accuracy": 0.6416043788194656, + "num_tokens": 1891411181.0, + "step": 11281 + }, + { + "entropy": 1.717280815045039, + "epoch": 1.2393782098816293, + "grad_norm": 0.5666574239730835, + "learning_rate": 8.008658754966527e-06, + "loss": 1.3223, + "mean_token_accuracy": 0.6569070219993591, + "num_tokens": 1891627961.0, + "step": 11282 + }, + { + "entropy": 1.6722883383433025, + "epoch": 1.2394880667930022, + "grad_norm": 0.5551705360412598, + "learning_rate": 8.007148699843982e-06, + "loss": 1.4714, + "mean_token_accuracy": 0.6321922043959299, + "num_tokens": 1891858469.0, + "step": 11283 + }, + { + "entropy": 1.6376002232233684, + "epoch": 1.239597923704375, + "grad_norm": 0.8324296474456787, + "learning_rate": 8.00563873944383e-06, + "loss": 1.3256, + "mean_token_accuracy": 0.6628724733988444, + "num_tokens": 1892054586.0, + "step": 11284 + }, + { + "entropy": 1.7216839094956715, + "epoch": 1.239707780615748, + "grad_norm": 0.6770010590553284, + "learning_rate": 8.004128873813859e-06, + "loss": 1.3593, + "mean_token_accuracy": 0.655941034356753, + "num_tokens": 1892219818.0, + "step": 11285 + }, + { + "entropy": 1.6371920903523762, + "epoch": 1.2398176375271208, + "grad_norm": 0.6165127158164978, + "learning_rate": 8.002619103001863e-06, + "loss": 1.4678, + "mean_token_accuracy": 0.6445280561844507, + "num_tokens": 1892423750.0, + "step": 11286 + }, + { + "entropy": 1.716288646062215, + "epoch": 1.239927494438494, + "grad_norm": 0.6745500564575195, + "learning_rate": 8.00110942705562e-06, + "loss": 1.3397, + "mean_token_accuracy": 0.675658643245697, + "num_tokens": 1892583215.0, + "step": 11287 + }, + { + "entropy": 1.7311961750189464, + "epoch": 1.2400373513498668, + "grad_norm": 0.681602954864502, + "learning_rate": 7.999599846022909e-06, + "loss": 1.3579, + "mean_token_accuracy": 0.665436198314031, + "num_tokens": 1892769578.0, + "step": 11288 + }, + { + "entropy": 1.6320242981115978, + "epoch": 1.2401472082612397, + "grad_norm": 0.6943032741546631, + "learning_rate": 7.998090359951518e-06, + "loss": 1.3245, + "mean_token_accuracy": 0.6648548195759455, + "num_tokens": 1892920874.0, + "step": 11289 + }, + { + "entropy": 1.7329721252123516, + "epoch": 1.2402570651726126, + "grad_norm": 0.6341266632080078, + "learning_rate": 7.996580968889209e-06, + "loss": 1.4349, + "mean_token_accuracy": 0.6397968182961146, + "num_tokens": 1893096196.0, + "step": 11290 + }, + { + "entropy": 1.715551386276881, + "epoch": 1.2403669220839857, + "grad_norm": 0.6576389670372009, + "learning_rate": 7.99507167288376e-06, + "loss": 1.3832, + "mean_token_accuracy": 0.6513058344523112, + "num_tokens": 1893260174.0, + "step": 11291 + }, + { + "entropy": 1.7113690475622814, + "epoch": 1.2404767789953586, + "grad_norm": 0.653464138507843, + "learning_rate": 7.99356247198294e-06, + "loss": 1.5356, + "mean_token_accuracy": 0.6355665028095245, + "num_tokens": 1893463782.0, + "step": 11292 + }, + { + "entropy": 1.7006126741568248, + "epoch": 1.2405866359067315, + "grad_norm": 3.044800043106079, + "learning_rate": 7.992053366234513e-06, + "loss": 1.2922, + "mean_token_accuracy": 0.660791665315628, + "num_tokens": 1893671388.0, + "step": 11293 + }, + { + "entropy": 1.662010023991267, + "epoch": 1.2406964928181043, + "grad_norm": 0.680200457572937, + "learning_rate": 7.990544355686239e-06, + "loss": 1.4566, + "mean_token_accuracy": 0.6599440028270086, + "num_tokens": 1893824239.0, + "step": 11294 + }, + { + "entropy": 1.6833816369374592, + "epoch": 1.2408063497294775, + "grad_norm": 0.6722885966300964, + "learning_rate": 7.989035440385885e-06, + "loss": 1.5087, + "mean_token_accuracy": 0.6377679258584976, + "num_tokens": 1894057681.0, + "step": 11295 + }, + { + "entropy": 1.7269285221894581, + "epoch": 1.2409162066408503, + "grad_norm": 0.8462622165679932, + "learning_rate": 7.987526620381197e-06, + "loss": 1.5224, + "mean_token_accuracy": 0.6455184866984686, + "num_tokens": 1894213333.0, + "step": 11296 + }, + { + "entropy": 1.720696081717809, + "epoch": 1.2410260635522232, + "grad_norm": 0.7249704599380493, + "learning_rate": 7.986017895719934e-06, + "loss": 1.3996, + "mean_token_accuracy": 0.6568809896707535, + "num_tokens": 1894371395.0, + "step": 11297 + }, + { + "entropy": 1.6835100750128429, + "epoch": 1.241135920463596, + "grad_norm": 0.6641572713851929, + "learning_rate": 7.984509266449854e-06, + "loss": 1.3834, + "mean_token_accuracy": 0.6554353535175323, + "num_tokens": 1894511956.0, + "step": 11298 + }, + { + "entropy": 1.7112232049306233, + "epoch": 1.241245777374969, + "grad_norm": 0.7815585732460022, + "learning_rate": 7.98300073261869e-06, + "loss": 1.2925, + "mean_token_accuracy": 0.6678037742773691, + "num_tokens": 1894618068.0, + "step": 11299 + }, + { + "entropy": 1.755595584710439, + "epoch": 1.241355634286342, + "grad_norm": 0.6250059604644775, + "learning_rate": 7.981492294274194e-06, + "loss": 1.3003, + "mean_token_accuracy": 0.6646648645401001, + "num_tokens": 1894758381.0, + "step": 11300 + }, + { + "entropy": 1.75324742992719, + "epoch": 1.241465491197715, + "grad_norm": 0.7397940754890442, + "learning_rate": 7.97998395146411e-06, + "loss": 1.4168, + "mean_token_accuracy": 0.6665991842746735, + "num_tokens": 1894892394.0, + "step": 11301 + }, + { + "entropy": 1.7075209816296895, + "epoch": 1.2415753481090879, + "grad_norm": 0.7101148366928101, + "learning_rate": 7.978475704236169e-06, + "loss": 1.3675, + "mean_token_accuracy": 0.6513032168149948, + "num_tokens": 1895077445.0, + "step": 11302 + }, + { + "entropy": 1.7482849955558777, + "epoch": 1.2416852050204608, + "grad_norm": 0.72342449426651, + "learning_rate": 7.976967552638111e-06, + "loss": 1.3761, + "mean_token_accuracy": 0.6645904332399368, + "num_tokens": 1895252858.0, + "step": 11303 + }, + { + "entropy": 1.6877289811770122, + "epoch": 1.2417950619318339, + "grad_norm": 0.6635198593139648, + "learning_rate": 7.975459496717672e-06, + "loss": 1.2438, + "mean_token_accuracy": 0.6745279332002004, + "num_tokens": 1895422983.0, + "step": 11304 + }, + { + "entropy": 1.686889111995697, + "epoch": 1.2419049188432068, + "grad_norm": 0.6177757978439331, + "learning_rate": 7.973951536522574e-06, + "loss": 1.4462, + "mean_token_accuracy": 0.6353614429632822, + "num_tokens": 1895599337.0, + "step": 11305 + }, + { + "entropy": 1.6756052076816559, + "epoch": 1.2420147757545796, + "grad_norm": 0.6626149415969849, + "learning_rate": 7.972443672100543e-06, + "loss": 1.2887, + "mean_token_accuracy": 0.6680636157592138, + "num_tokens": 1895731538.0, + "step": 11306 + }, + { + "entropy": 1.685206929842631, + "epoch": 1.2421246326659525, + "grad_norm": 0.7096571326255798, + "learning_rate": 7.970935903499312e-06, + "loss": 1.2293, + "mean_token_accuracy": 0.679922545949618, + "num_tokens": 1895859633.0, + "step": 11307 + }, + { + "entropy": 1.6315878629684448, + "epoch": 1.2422344895773256, + "grad_norm": 0.6607580780982971, + "learning_rate": 7.96942823076659e-06, + "loss": 1.2639, + "mean_token_accuracy": 0.6793940017620722, + "num_tokens": 1896020071.0, + "step": 11308 + }, + { + "entropy": 1.7243158320585887, + "epoch": 1.2423443464886985, + "grad_norm": 0.6347528696060181, + "learning_rate": 7.967920653950105e-06, + "loss": 1.3314, + "mean_token_accuracy": 0.6642320106426874, + "num_tokens": 1896228945.0, + "step": 11309 + }, + { + "entropy": 1.74300483862559, + "epoch": 1.2424542034000714, + "grad_norm": 0.7812113165855408, + "learning_rate": 7.966413173097559e-06, + "loss": 1.4193, + "mean_token_accuracy": 0.6341168930133184, + "num_tokens": 1896375603.0, + "step": 11310 + }, + { + "entropy": 1.741927295923233, + "epoch": 1.2425640603114443, + "grad_norm": 0.9015730619430542, + "learning_rate": 7.96490578825667e-06, + "loss": 1.3465, + "mean_token_accuracy": 0.6564824233452479, + "num_tokens": 1896517786.0, + "step": 11311 + }, + { + "entropy": 1.7557086944580078, + "epoch": 1.2426739172228172, + "grad_norm": 0.6636369824409485, + "learning_rate": 7.963398499475146e-06, + "loss": 1.5039, + "mean_token_accuracy": 0.6422920376062393, + "num_tokens": 1896712152.0, + "step": 11312 + }, + { + "entropy": 1.7586438258488972, + "epoch": 1.2427837741341903, + "grad_norm": 0.7631456255912781, + "learning_rate": 7.961891306800691e-06, + "loss": 1.4998, + "mean_token_accuracy": 0.6448372304439545, + "num_tokens": 1896863330.0, + "step": 11313 + }, + { + "entropy": 1.7146364947160084, + "epoch": 1.2428936310455632, + "grad_norm": 0.7649849653244019, + "learning_rate": 7.960384210281005e-06, + "loss": 1.4133, + "mean_token_accuracy": 0.6560538013776144, + "num_tokens": 1897024053.0, + "step": 11314 + }, + { + "entropy": 1.6867588957150776, + "epoch": 1.243003487956936, + "grad_norm": 0.7380170226097107, + "learning_rate": 7.958877209963794e-06, + "loss": 1.3173, + "mean_token_accuracy": 0.6729622135559717, + "num_tokens": 1897153363.0, + "step": 11315 + }, + { + "entropy": 1.7193871140480042, + "epoch": 1.243113344868309, + "grad_norm": 0.8186469674110413, + "learning_rate": 7.957370305896744e-06, + "loss": 1.3701, + "mean_token_accuracy": 0.6554479797681173, + "num_tokens": 1897330815.0, + "step": 11316 + }, + { + "entropy": 1.7145767311255138, + "epoch": 1.243223201779682, + "grad_norm": 0.625273585319519, + "learning_rate": 7.955863498127555e-06, + "loss": 1.4123, + "mean_token_accuracy": 0.6473723153273264, + "num_tokens": 1897486340.0, + "step": 11317 + }, + { + "entropy": 1.6745652059714, + "epoch": 1.243333058691055, + "grad_norm": 0.6851847171783447, + "learning_rate": 7.954356786703916e-06, + "loss": 1.3004, + "mean_token_accuracy": 0.6670237829287847, + "num_tokens": 1897651114.0, + "step": 11318 + }, + { + "entropy": 1.72303906083107, + "epoch": 1.2434429156024278, + "grad_norm": 0.6741484999656677, + "learning_rate": 7.95285017167351e-06, + "loss": 1.4053, + "mean_token_accuracy": 0.659760649005572, + "num_tokens": 1897842734.0, + "step": 11319 + }, + { + "entropy": 1.708362211783727, + "epoch": 1.2435527725138007, + "grad_norm": 0.5985382199287415, + "learning_rate": 7.951343653084023e-06, + "loss": 1.4592, + "mean_token_accuracy": 0.6402342220147451, + "num_tokens": 1898046316.0, + "step": 11320 + }, + { + "entropy": 1.6680605312188466, + "epoch": 1.2436626294251738, + "grad_norm": 0.642793595790863, + "learning_rate": 7.94983723098314e-06, + "loss": 1.4187, + "mean_token_accuracy": 0.6495463897784551, + "num_tokens": 1898222912.0, + "step": 11321 + }, + { + "entropy": 1.686878780523936, + "epoch": 1.2437724863365467, + "grad_norm": 0.6895222067832947, + "learning_rate": 7.948330905418527e-06, + "loss": 1.4837, + "mean_token_accuracy": 0.6618087788422903, + "num_tokens": 1898404255.0, + "step": 11322 + }, + { + "entropy": 1.7111988961696625, + "epoch": 1.2438823432479196, + "grad_norm": 0.7032332420349121, + "learning_rate": 7.94682467643787e-06, + "loss": 1.2716, + "mean_token_accuracy": 0.6768457492192587, + "num_tokens": 1898544016.0, + "step": 11323 + }, + { + "entropy": 1.7379381159941356, + "epoch": 1.2439922001592925, + "grad_norm": 0.664441704750061, + "learning_rate": 7.945318544088836e-06, + "loss": 1.2956, + "mean_token_accuracy": 0.6739976902802786, + "num_tokens": 1898669680.0, + "step": 11324 + }, + { + "entropy": 1.7190495828787486, + "epoch": 1.2441020570706653, + "grad_norm": 0.636641800403595, + "learning_rate": 7.943812508419093e-06, + "loss": 1.4763, + "mean_token_accuracy": 0.6462114254633585, + "num_tokens": 1898880825.0, + "step": 11325 + }, + { + "entropy": 1.6952326397101085, + "epoch": 1.2442119139820385, + "grad_norm": 0.7681459188461304, + "learning_rate": 7.942306569476303e-06, + "loss": 1.198, + "mean_token_accuracy": 0.6756665309270223, + "num_tokens": 1899015166.0, + "step": 11326 + }, + { + "entropy": 1.7597149014472961, + "epoch": 1.2443217708934113, + "grad_norm": 0.7061123251914978, + "learning_rate": 7.940800727308142e-06, + "loss": 1.4911, + "mean_token_accuracy": 0.6365721672773361, + "num_tokens": 1899174895.0, + "step": 11327 + }, + { + "entropy": 1.6776218215624492, + "epoch": 1.2444316278047842, + "grad_norm": 0.8109696507453918, + "learning_rate": 7.93929498196225e-06, + "loss": 1.279, + "mean_token_accuracy": 0.6621010253826777, + "num_tokens": 1899330055.0, + "step": 11328 + }, + { + "entropy": 1.685188114643097, + "epoch": 1.244541484716157, + "grad_norm": 0.7661949396133423, + "learning_rate": 7.937789333486296e-06, + "loss": 1.2638, + "mean_token_accuracy": 0.684383233388265, + "num_tokens": 1899458252.0, + "step": 11329 + }, + { + "entropy": 1.7264382243156433, + "epoch": 1.2446513416275302, + "grad_norm": 0.6619189977645874, + "learning_rate": 7.936283781927934e-06, + "loss": 1.4722, + "mean_token_accuracy": 0.6431934088468552, + "num_tokens": 1899621097.0, + "step": 11330 + }, + { + "entropy": 1.7132483919461567, + "epoch": 1.244761198538903, + "grad_norm": 0.6909228563308716, + "learning_rate": 7.934778327334804e-06, + "loss": 1.4797, + "mean_token_accuracy": 0.6424340556065241, + "num_tokens": 1899790828.0, + "step": 11331 + }, + { + "entropy": 1.6885711252689362, + "epoch": 1.244871055450276, + "grad_norm": 0.7082422375679016, + "learning_rate": 7.933272969754558e-06, + "loss": 1.4103, + "mean_token_accuracy": 0.6677902390559515, + "num_tokens": 1899958848.0, + "step": 11332 + }, + { + "entropy": 1.7759423851966858, + "epoch": 1.2449809123616489, + "grad_norm": 0.7639626860618591, + "learning_rate": 7.931767709234848e-06, + "loss": 1.4212, + "mean_token_accuracy": 0.662767251332601, + "num_tokens": 1900092051.0, + "step": 11333 + }, + { + "entropy": 1.7315894961357117, + "epoch": 1.245090769273022, + "grad_norm": 0.6064445972442627, + "learning_rate": 7.9302625458233e-06, + "loss": 1.4768, + "mean_token_accuracy": 0.6455651024977366, + "num_tokens": 1900320309.0, + "step": 11334 + }, + { + "entropy": 1.6946211953957875, + "epoch": 1.2452006261843949, + "grad_norm": 0.7126203179359436, + "learning_rate": 7.928757479567561e-06, + "loss": 1.3931, + "mean_token_accuracy": 0.6527270923058192, + "num_tokens": 1900534165.0, + "step": 11335 + }, + { + "entropy": 1.6968140602111816, + "epoch": 1.2453104830957678, + "grad_norm": 0.8074250817298889, + "learning_rate": 7.927252510515266e-06, + "loss": 1.5176, + "mean_token_accuracy": 0.6537874937057495, + "num_tokens": 1900743441.0, + "step": 11336 + }, + { + "entropy": 1.7140926122665405, + "epoch": 1.2454203400071406, + "grad_norm": 0.7751270532608032, + "learning_rate": 7.925747638714043e-06, + "loss": 1.4288, + "mean_token_accuracy": 0.6500294556220373, + "num_tokens": 1900930640.0, + "step": 11337 + }, + { + "entropy": 1.6379591524600983, + "epoch": 1.2455301969185135, + "grad_norm": 0.7415010929107666, + "learning_rate": 7.92424286421152e-06, + "loss": 1.2861, + "mean_token_accuracy": 0.6670693109432856, + "num_tokens": 1901069110.0, + "step": 11338 + }, + { + "entropy": 1.6497638821601868, + "epoch": 1.2456400538298866, + "grad_norm": 0.7474594116210938, + "learning_rate": 7.922738187055329e-06, + "loss": 1.3534, + "mean_token_accuracy": 0.6680422226587931, + "num_tokens": 1901229274.0, + "step": 11339 + }, + { + "entropy": 1.6657158931096394, + "epoch": 1.2457499107412595, + "grad_norm": 0.6230567097663879, + "learning_rate": 7.921233607293084e-06, + "loss": 1.3925, + "mean_token_accuracy": 0.6576641102631887, + "num_tokens": 1901396643.0, + "step": 11340 + }, + { + "entropy": 1.6605386932690938, + "epoch": 1.2458597676526324, + "grad_norm": 0.711249828338623, + "learning_rate": 7.919729124972409e-06, + "loss": 1.3159, + "mean_token_accuracy": 0.6733442395925522, + "num_tokens": 1901560464.0, + "step": 11341 + }, + { + "entropy": 1.7819677889347076, + "epoch": 1.2459696245640055, + "grad_norm": 0.6379202008247375, + "learning_rate": 7.91822474014092e-06, + "loss": 1.4941, + "mean_token_accuracy": 0.6413015226523081, + "num_tokens": 1901759395.0, + "step": 11342 + }, + { + "entropy": 1.7399208843708038, + "epoch": 1.2460794814753784, + "grad_norm": 0.811903715133667, + "learning_rate": 7.916720452846229e-06, + "loss": 1.538, + "mean_token_accuracy": 0.6447446842988332, + "num_tokens": 1901901676.0, + "step": 11343 + }, + { + "entropy": 1.7778501212596893, + "epoch": 1.2461893383867513, + "grad_norm": 0.7180720567703247, + "learning_rate": 7.915216263135942e-06, + "loss": 1.449, + "mean_token_accuracy": 0.6565718402465185, + "num_tokens": 1902050193.0, + "step": 11344 + }, + { + "entropy": 1.765024612347285, + "epoch": 1.2462991952981242, + "grad_norm": 0.6027868390083313, + "learning_rate": 7.91371217105768e-06, + "loss": 1.4265, + "mean_token_accuracy": 0.6494091699520746, + "num_tokens": 1902235835.0, + "step": 11345 + }, + { + "entropy": 1.6627511084079742, + "epoch": 1.246409052209497, + "grad_norm": 0.9971237182617188, + "learning_rate": 7.912208176659028e-06, + "loss": 1.4272, + "mean_token_accuracy": 0.6701801866292953, + "num_tokens": 1902389123.0, + "step": 11346 + }, + { + "entropy": 1.6802993714809418, + "epoch": 1.2465189091208702, + "grad_norm": 0.6501787304878235, + "learning_rate": 7.9107042799876e-06, + "loss": 1.4603, + "mean_token_accuracy": 0.6509832988182703, + "num_tokens": 1902539417.0, + "step": 11347 + }, + { + "entropy": 1.7607911229133606, + "epoch": 1.246628766032243, + "grad_norm": 0.7493710517883301, + "learning_rate": 7.909200481090989e-06, + "loss": 1.4329, + "mean_token_accuracy": 0.6626504063606262, + "num_tokens": 1902707782.0, + "step": 11348 + }, + { + "entropy": 1.6591468056042988, + "epoch": 1.246738622943616, + "grad_norm": 0.7652831673622131, + "learning_rate": 7.90769678001679e-06, + "loss": 1.2996, + "mean_token_accuracy": 0.675150990486145, + "num_tokens": 1902829430.0, + "step": 11349 + }, + { + "entropy": 1.6999635299046834, + "epoch": 1.2468484798549888, + "grad_norm": 0.7982178330421448, + "learning_rate": 7.906193176812591e-06, + "loss": 1.1053, + "mean_token_accuracy": 0.7014695952335993, + "num_tokens": 1902925845.0, + "step": 11350 + }, + { + "entropy": 1.739738126595815, + "epoch": 1.2469583367663617, + "grad_norm": 0.7161890268325806, + "learning_rate": 7.904689671525992e-06, + "loss": 1.3593, + "mean_token_accuracy": 0.6658004621664683, + "num_tokens": 1903064238.0, + "step": 11351 + }, + { + "entropy": 1.7278717656930287, + "epoch": 1.2470681936777348, + "grad_norm": 0.6915018558502197, + "learning_rate": 7.903186264204561e-06, + "loss": 1.3432, + "mean_token_accuracy": 0.6522951871156693, + "num_tokens": 1903220088.0, + "step": 11352 + }, + { + "entropy": 1.7226141194502513, + "epoch": 1.2471780505891077, + "grad_norm": 0.6510446667671204, + "learning_rate": 7.901682954895893e-06, + "loss": 1.5513, + "mean_token_accuracy": 0.6328976154327393, + "num_tokens": 1903418123.0, + "step": 11353 + }, + { + "entropy": 1.6626664996147156, + "epoch": 1.2472879075004806, + "grad_norm": 0.5966384410858154, + "learning_rate": 7.900179743647567e-06, + "loss": 1.4024, + "mean_token_accuracy": 0.6576230376958847, + "num_tokens": 1903595039.0, + "step": 11354 + }, + { + "entropy": 1.7284984985987346, + "epoch": 1.2473977644118537, + "grad_norm": 0.6498193740844727, + "learning_rate": 7.898676630507152e-06, + "loss": 1.3478, + "mean_token_accuracy": 0.6563937862714132, + "num_tokens": 1903733716.0, + "step": 11355 + }, + { + "entropy": 1.673979103565216, + "epoch": 1.2475076213232266, + "grad_norm": 0.776412308216095, + "learning_rate": 7.89717361552222e-06, + "loss": 1.2461, + "mean_token_accuracy": 0.6760004907846451, + "num_tokens": 1903851827.0, + "step": 11356 + }, + { + "entropy": 1.7170047760009766, + "epoch": 1.2476174782345995, + "grad_norm": 0.6476826667785645, + "learning_rate": 7.895670698740354e-06, + "loss": 1.4353, + "mean_token_accuracy": 0.6535161038239797, + "num_tokens": 1903994160.0, + "step": 11357 + }, + { + "entropy": 1.7057405809561412, + "epoch": 1.2477273351459723, + "grad_norm": 3.21343994140625, + "learning_rate": 7.894167880209103e-06, + "loss": 1.3062, + "mean_token_accuracy": 0.6527599294980367, + "num_tokens": 1904204321.0, + "step": 11358 + }, + { + "entropy": 1.7505437235037486, + "epoch": 1.2478371920573452, + "grad_norm": 0.704789936542511, + "learning_rate": 7.892665159976042e-06, + "loss": 1.4142, + "mean_token_accuracy": 0.6686330437660217, + "num_tokens": 1904352497.0, + "step": 11359 + }, + { + "entropy": 1.6835230986277263, + "epoch": 1.2479470489687183, + "grad_norm": 0.7824683785438538, + "learning_rate": 7.89116253808873e-06, + "loss": 1.2709, + "mean_token_accuracy": 0.6724584052960078, + "num_tokens": 1904462352.0, + "step": 11360 + }, + { + "entropy": 1.771289696296056, + "epoch": 1.2480569058800912, + "grad_norm": 0.6507266163825989, + "learning_rate": 7.889660014594722e-06, + "loss": 1.3916, + "mean_token_accuracy": 0.6458721508582433, + "num_tokens": 1904644989.0, + "step": 11361 + }, + { + "entropy": 1.6943688193957012, + "epoch": 1.248166762791464, + "grad_norm": 0.6878480315208435, + "learning_rate": 7.888157589541571e-06, + "loss": 1.3827, + "mean_token_accuracy": 0.6601410458485285, + "num_tokens": 1904811730.0, + "step": 11362 + }, + { + "entropy": 1.7335290908813477, + "epoch": 1.248276619702837, + "grad_norm": 0.6336010098457336, + "learning_rate": 7.886655262976834e-06, + "loss": 1.51, + "mean_token_accuracy": 0.6341728915770849, + "num_tokens": 1905005726.0, + "step": 11363 + }, + { + "entropy": 1.7121953169504802, + "epoch": 1.2483864766142099, + "grad_norm": 0.610726535320282, + "learning_rate": 7.885153034948053e-06, + "loss": 1.3719, + "mean_token_accuracy": 0.655587320526441, + "num_tokens": 1905219181.0, + "step": 11364 + }, + { + "entropy": 1.7200307448705037, + "epoch": 1.248496333525583, + "grad_norm": 0.6448392868041992, + "learning_rate": 7.883650905502773e-06, + "loss": 1.452, + "mean_token_accuracy": 0.6528996278842291, + "num_tokens": 1905429324.0, + "step": 11365 + }, + { + "entropy": 1.768402338027954, + "epoch": 1.2486061904369559, + "grad_norm": 0.6446058750152588, + "learning_rate": 7.88214887468854e-06, + "loss": 1.3712, + "mean_token_accuracy": 0.6579476048549017, + "num_tokens": 1905566991.0, + "step": 11366 + }, + { + "entropy": 1.7406864861647289, + "epoch": 1.2487160473483288, + "grad_norm": 0.7488144040107727, + "learning_rate": 7.880646942552891e-06, + "loss": 1.4148, + "mean_token_accuracy": 0.6457947393258413, + "num_tokens": 1905756851.0, + "step": 11367 + }, + { + "entropy": 1.7286293804645538, + "epoch": 1.2488259042597019, + "grad_norm": 0.7700992822647095, + "learning_rate": 7.87914510914336e-06, + "loss": 1.4692, + "mean_token_accuracy": 0.6668734302123388, + "num_tokens": 1905897609.0, + "step": 11368 + }, + { + "entropy": 1.6787353257338207, + "epoch": 1.2489357611710747, + "grad_norm": 0.6573531627655029, + "learning_rate": 7.87764337450748e-06, + "loss": 1.3161, + "mean_token_accuracy": 0.6725091288487116, + "num_tokens": 1906051094.0, + "step": 11369 + }, + { + "entropy": 1.7037298083305359, + "epoch": 1.2490456180824476, + "grad_norm": 0.7217747569084167, + "learning_rate": 7.876141738692778e-06, + "loss": 1.5271, + "mean_token_accuracy": 0.6617752313613892, + "num_tokens": 1906225865.0, + "step": 11370 + }, + { + "entropy": 1.7089182237784069, + "epoch": 1.2491554749938205, + "grad_norm": 0.7379319667816162, + "learning_rate": 7.874640201746784e-06, + "loss": 1.2766, + "mean_token_accuracy": 0.6676273395617803, + "num_tokens": 1906346116.0, + "step": 11371 + }, + { + "entropy": 1.6987358729044597, + "epoch": 1.2492653319051934, + "grad_norm": 0.756645679473877, + "learning_rate": 7.87313876371702e-06, + "loss": 1.3747, + "mean_token_accuracy": 0.6652411719163259, + "num_tokens": 1906558656.0, + "step": 11372 + }, + { + "entropy": 1.6623602509498596, + "epoch": 1.2493751888165665, + "grad_norm": 0.7025351524353027, + "learning_rate": 7.871637424651002e-06, + "loss": 1.3465, + "mean_token_accuracy": 0.6570960233608881, + "num_tokens": 1906710447.0, + "step": 11373 + }, + { + "entropy": 1.7358343799908955, + "epoch": 1.2494850457279394, + "grad_norm": 0.6764085292816162, + "learning_rate": 7.870136184596253e-06, + "loss": 1.41, + "mean_token_accuracy": 0.6695780654748281, + "num_tokens": 1906839460.0, + "step": 11374 + }, + { + "entropy": 1.7395348747571309, + "epoch": 1.2495949026393123, + "grad_norm": 0.6143444776535034, + "learning_rate": 7.868635043600283e-06, + "loss": 1.3852, + "mean_token_accuracy": 0.6516972482204437, + "num_tokens": 1906996950.0, + "step": 11375 + }, + { + "entropy": 1.7187215089797974, + "epoch": 1.2497047595506852, + "grad_norm": 0.6263564229011536, + "learning_rate": 7.867134001710601e-06, + "loss": 1.4024, + "mean_token_accuracy": 0.6484654247760773, + "num_tokens": 1907218349.0, + "step": 11376 + }, + { + "entropy": 1.7315069735050201, + "epoch": 1.249814616462058, + "grad_norm": 0.5838350653648376, + "learning_rate": 7.865633058974718e-06, + "loss": 1.4567, + "mean_token_accuracy": 0.6523949603239695, + "num_tokens": 1907459088.0, + "step": 11377 + }, + { + "entropy": 1.7416872481505077, + "epoch": 1.2499244733734312, + "grad_norm": 0.751085102558136, + "learning_rate": 7.864132215440137e-06, + "loss": 1.234, + "mean_token_accuracy": 0.6816779424746832, + "num_tokens": 1907560765.0, + "step": 11378 + }, + { + "entropy": 1.687682181596756, + "epoch": 1.250034330284804, + "grad_norm": 0.6116113662719727, + "learning_rate": 7.862631471154357e-06, + "loss": 1.2595, + "mean_token_accuracy": 0.6819984763860703, + "num_tokens": 1907720034.0, + "step": 11379 + }, + { + "entropy": 1.677247832218806, + "epoch": 1.250144187196177, + "grad_norm": 0.6379266977310181, + "learning_rate": 7.861130826164878e-06, + "loss": 1.4734, + "mean_token_accuracy": 0.6537698358297348, + "num_tokens": 1907896716.0, + "step": 11380 + }, + { + "entropy": 1.7170507113138835, + "epoch": 1.25025404410755, + "grad_norm": 0.6168753504753113, + "learning_rate": 7.859630280519193e-06, + "loss": 1.5527, + "mean_token_accuracy": 0.642242968082428, + "num_tokens": 1908096706.0, + "step": 11381 + }, + { + "entropy": 1.8267957270145416, + "epoch": 1.250363901018923, + "grad_norm": 0.7190276980400085, + "learning_rate": 7.85812983426479e-06, + "loss": 1.4579, + "mean_token_accuracy": 0.6479291965564092, + "num_tokens": 1908212863.0, + "step": 11382 + }, + { + "entropy": 1.7421042323112488, + "epoch": 1.2504737579302958, + "grad_norm": 0.6885977983474731, + "learning_rate": 7.85662948744917e-06, + "loss": 1.351, + "mean_token_accuracy": 0.6630544364452362, + "num_tokens": 1908387457.0, + "step": 11383 + }, + { + "entropy": 1.7527458270390828, + "epoch": 1.2505836148416687, + "grad_norm": 1.0296976566314697, + "learning_rate": 7.855129240119808e-06, + "loss": 1.4872, + "mean_token_accuracy": 0.6270147214333216, + "num_tokens": 1908595445.0, + "step": 11384 + }, + { + "entropy": 1.6783875326315563, + "epoch": 1.2506934717530416, + "grad_norm": 0.7553209066390991, + "learning_rate": 7.853629092324187e-06, + "loss": 1.5384, + "mean_token_accuracy": 0.6514027168353399, + "num_tokens": 1908751432.0, + "step": 11385 + }, + { + "entropy": 1.6871531903743744, + "epoch": 1.2508033286644147, + "grad_norm": 0.7314842343330383, + "learning_rate": 7.852129044109788e-06, + "loss": 1.2429, + "mean_token_accuracy": 0.6730901698271433, + "num_tokens": 1908862066.0, + "step": 11386 + }, + { + "entropy": 1.7323083678881328, + "epoch": 1.2509131855757876, + "grad_norm": 0.6713790893554688, + "learning_rate": 7.850629095524086e-06, + "loss": 1.4655, + "mean_token_accuracy": 0.6579304486513138, + "num_tokens": 1909003499.0, + "step": 11387 + }, + { + "entropy": 1.7376106083393097, + "epoch": 1.2510230424871605, + "grad_norm": 0.7411003708839417, + "learning_rate": 7.849129246614552e-06, + "loss": 1.3845, + "mean_token_accuracy": 0.6707366009553274, + "num_tokens": 1909184557.0, + "step": 11388 + }, + { + "entropy": 1.660805990298589, + "epoch": 1.2511328993985333, + "grad_norm": 0.6880229115486145, + "learning_rate": 7.847629497428664e-06, + "loss": 1.394, + "mean_token_accuracy": 0.6483379105726877, + "num_tokens": 1909382189.0, + "step": 11389 + }, + { + "entropy": 1.7267470955848694, + "epoch": 1.2512427563099062, + "grad_norm": 0.7655637860298157, + "learning_rate": 7.846129848013874e-06, + "loss": 1.3935, + "mean_token_accuracy": 0.6489508698383967, + "num_tokens": 1909567336.0, + "step": 11390 + }, + { + "entropy": 1.765625, + "epoch": 1.2513526132212793, + "grad_norm": 0.7343372702598572, + "learning_rate": 7.844630298417657e-06, + "loss": 1.2667, + "mean_token_accuracy": 0.6655522038539251, + "num_tokens": 1909736898.0, + "step": 11391 + }, + { + "entropy": 1.699706216653188, + "epoch": 1.2514624701326522, + "grad_norm": 0.6885928511619568, + "learning_rate": 7.843130848687472e-06, + "loss": 1.3203, + "mean_token_accuracy": 0.6711514194806417, + "num_tokens": 1909883541.0, + "step": 11392 + }, + { + "entropy": 1.7037067711353302, + "epoch": 1.251572327044025, + "grad_norm": 0.7248368263244629, + "learning_rate": 7.84163149887077e-06, + "loss": 1.3969, + "mean_token_accuracy": 0.6453188508749008, + "num_tokens": 1910013519.0, + "step": 11393 + }, + { + "entropy": 1.6497456729412079, + "epoch": 1.2516821839553982, + "grad_norm": 0.6989073753356934, + "learning_rate": 7.840132249015005e-06, + "loss": 1.2801, + "mean_token_accuracy": 0.6834556013345718, + "num_tokens": 1910183319.0, + "step": 11394 + }, + { + "entropy": 1.733015646537145, + "epoch": 1.251792040866771, + "grad_norm": 0.7218592166900635, + "learning_rate": 7.838633099167636e-06, + "loss": 1.2952, + "mean_token_accuracy": 0.668768381079038, + "num_tokens": 1910309032.0, + "step": 11395 + }, + { + "entropy": 1.6412979066371918, + "epoch": 1.251901897778144, + "grad_norm": 0.6749725341796875, + "learning_rate": 7.837134049376101e-06, + "loss": 1.4272, + "mean_token_accuracy": 0.676101932922999, + "num_tokens": 1910504707.0, + "step": 11396 + }, + { + "entropy": 1.6720272302627563, + "epoch": 1.2520117546895169, + "grad_norm": 0.6150344610214233, + "learning_rate": 7.835635099687849e-06, + "loss": 1.3575, + "mean_token_accuracy": 0.667877584695816, + "num_tokens": 1910686467.0, + "step": 11397 + }, + { + "entropy": 1.6736577153205872, + "epoch": 1.2521216116008898, + "grad_norm": 0.6787571907043457, + "learning_rate": 7.834136250150322e-06, + "loss": 1.3508, + "mean_token_accuracy": 0.6710595637559891, + "num_tokens": 1910821814.0, + "step": 11398 + }, + { + "entropy": 1.674732546011607, + "epoch": 1.2522314685122629, + "grad_norm": 0.7294467687606812, + "learning_rate": 7.832637500810956e-06, + "loss": 1.3117, + "mean_token_accuracy": 0.6755828162034353, + "num_tokens": 1910986283.0, + "step": 11399 + }, + { + "entropy": 1.7269720037778218, + "epoch": 1.2523413254236357, + "grad_norm": 0.6978003978729248, + "learning_rate": 7.83113885171718e-06, + "loss": 1.4251, + "mean_token_accuracy": 0.6505262355009714, + "num_tokens": 1911148949.0, + "step": 11400 + }, + { + "entropy": 1.684136559565862, + "epoch": 1.2524511823350086, + "grad_norm": 0.591343879699707, + "learning_rate": 7.829640302916439e-06, + "loss": 1.3168, + "mean_token_accuracy": 0.6647894382476807, + "num_tokens": 1911289617.0, + "step": 11401 + }, + { + "entropy": 1.7063394288221996, + "epoch": 1.2525610392463815, + "grad_norm": 0.8325570225715637, + "learning_rate": 7.82814185445615e-06, + "loss": 1.3085, + "mean_token_accuracy": 0.6733732322851816, + "num_tokens": 1911410460.0, + "step": 11402 + }, + { + "entropy": 1.7209921578566234, + "epoch": 1.2526708961577544, + "grad_norm": 0.6524738669395447, + "learning_rate": 7.826643506383741e-06, + "loss": 1.3605, + "mean_token_accuracy": 0.6583642363548279, + "num_tokens": 1911582978.0, + "step": 11403 + }, + { + "entropy": 1.7001774509747822, + "epoch": 1.2527807530691275, + "grad_norm": 0.581378698348999, + "learning_rate": 7.82514525874664e-06, + "loss": 1.5381, + "mean_token_accuracy": 0.6198792159557343, + "num_tokens": 1911779836.0, + "step": 11404 + }, + { + "entropy": 1.7446727454662323, + "epoch": 1.2528906099805004, + "grad_norm": 0.8353737592697144, + "learning_rate": 7.823647111592257e-06, + "loss": 1.505, + "mean_token_accuracy": 0.6476826096574465, + "num_tokens": 1911926444.0, + "step": 11405 + }, + { + "entropy": 1.6707488397757213, + "epoch": 1.2530004668918733, + "grad_norm": 0.7989435195922852, + "learning_rate": 7.82214906496801e-06, + "loss": 1.4969, + "mean_token_accuracy": 0.6394098401069641, + "num_tokens": 1912111911.0, + "step": 11406 + }, + { + "entropy": 1.7673610746860504, + "epoch": 1.2531103238032464, + "grad_norm": 0.8053948879241943, + "learning_rate": 7.820651118921319e-06, + "loss": 1.3536, + "mean_token_accuracy": 0.6432522932688395, + "num_tokens": 1912232792.0, + "step": 11407 + }, + { + "entropy": 1.6859275102615356, + "epoch": 1.2532201807146193, + "grad_norm": 0.7226851582527161, + "learning_rate": 7.819153273499582e-06, + "loss": 1.3106, + "mean_token_accuracy": 0.6881605138381323, + "num_tokens": 1912365835.0, + "step": 11408 + }, + { + "entropy": 1.7172163128852844, + "epoch": 1.2533300376259922, + "grad_norm": 0.7478646039962769, + "learning_rate": 7.817655528750212e-06, + "loss": 1.3927, + "mean_token_accuracy": 0.6513003359238306, + "num_tokens": 1912538400.0, + "step": 11409 + }, + { + "entropy": 1.7053345441818237, + "epoch": 1.253439894537365, + "grad_norm": 0.6886608600616455, + "learning_rate": 7.816157884720612e-06, + "loss": 1.4765, + "mean_token_accuracy": 0.6423324594895045, + "num_tokens": 1912724255.0, + "step": 11410 + }, + { + "entropy": 1.727815439303716, + "epoch": 1.253549751448738, + "grad_norm": 0.7457959055900574, + "learning_rate": 7.81466034145818e-06, + "loss": 1.4172, + "mean_token_accuracy": 0.6606029123067856, + "num_tokens": 1912962558.0, + "step": 11411 + }, + { + "entropy": 1.6915427148342133, + "epoch": 1.253659608360111, + "grad_norm": 0.6017783880233765, + "learning_rate": 7.813162899010309e-06, + "loss": 1.512, + "mean_token_accuracy": 0.6343448410431544, + "num_tokens": 1913116255.0, + "step": 11412 + }, + { + "entropy": 1.7317763566970825, + "epoch": 1.253769465271484, + "grad_norm": 0.6574037075042725, + "learning_rate": 7.811665557424405e-06, + "loss": 1.3666, + "mean_token_accuracy": 0.6605449169874191, + "num_tokens": 1913270950.0, + "step": 11413 + }, + { + "entropy": 1.7199652592341106, + "epoch": 1.2538793221828568, + "grad_norm": 0.796875, + "learning_rate": 7.81016831674784e-06, + "loss": 1.3238, + "mean_token_accuracy": 0.6675882587830225, + "num_tokens": 1913411579.0, + "step": 11414 + }, + { + "entropy": 1.6290069818496704, + "epoch": 1.2539891790942297, + "grad_norm": 0.680347204208374, + "learning_rate": 7.808671177028013e-06, + "loss": 1.4744, + "mean_token_accuracy": 0.6564949949582418, + "num_tokens": 1913645682.0, + "step": 11415 + }, + { + "entropy": 1.7070193191369374, + "epoch": 1.2540990360056026, + "grad_norm": 0.6644991636276245, + "learning_rate": 7.80717413831231e-06, + "loss": 1.5298, + "mean_token_accuracy": 0.6586725761493047, + "num_tokens": 1913872167.0, + "step": 11416 + }, + { + "entropy": 1.708907941977183, + "epoch": 1.2542088929169757, + "grad_norm": 0.6234670877456665, + "learning_rate": 7.805677200648101e-06, + "loss": 1.3705, + "mean_token_accuracy": 0.6489444921414057, + "num_tokens": 1914049662.0, + "step": 11417 + }, + { + "entropy": 1.6813920140266418, + "epoch": 1.2543187498283486, + "grad_norm": 0.6542984843254089, + "learning_rate": 7.80418036408277e-06, + "loss": 1.3409, + "mean_token_accuracy": 0.6567636926968893, + "num_tokens": 1914249075.0, + "step": 11418 + }, + { + "entropy": 1.7307861546675365, + "epoch": 1.2544286067397215, + "grad_norm": 0.723311185836792, + "learning_rate": 7.802683628663697e-06, + "loss": 1.4246, + "mean_token_accuracy": 0.6489053318897883, + "num_tokens": 1914392536.0, + "step": 11419 + }, + { + "entropy": 1.7004589041074116, + "epoch": 1.2545384636510946, + "grad_norm": 0.5945419669151306, + "learning_rate": 7.801186994438236e-06, + "loss": 1.4268, + "mean_token_accuracy": 0.6500704089800516, + "num_tokens": 1914564395.0, + "step": 11420 + }, + { + "entropy": 1.6588083505630493, + "epoch": 1.2546483205624674, + "grad_norm": 0.7622363567352295, + "learning_rate": 7.79969046145377e-06, + "loss": 1.4433, + "mean_token_accuracy": 0.6651933292547861, + "num_tokens": 1914736861.0, + "step": 11421 + }, + { + "entropy": 1.7121857802073162, + "epoch": 1.2547581774738403, + "grad_norm": 0.7922995090484619, + "learning_rate": 7.798194029757661e-06, + "loss": 1.3512, + "mean_token_accuracy": 0.6623698522647222, + "num_tokens": 1914899502.0, + "step": 11422 + }, + { + "entropy": 1.6708403130372365, + "epoch": 1.2548680343852132, + "grad_norm": 0.8336834907531738, + "learning_rate": 7.796697699397266e-06, + "loss": 1.5238, + "mean_token_accuracy": 0.6434931059678396, + "num_tokens": 1915090759.0, + "step": 11423 + }, + { + "entropy": 1.7130014995733898, + "epoch": 1.254977891296586, + "grad_norm": 0.7885116338729858, + "learning_rate": 7.795201470419944e-06, + "loss": 1.4998, + "mean_token_accuracy": 0.6617122739553452, + "num_tokens": 1915294853.0, + "step": 11424 + }, + { + "entropy": 1.695469965537389, + "epoch": 1.2550877482079592, + "grad_norm": 0.7806084156036377, + "learning_rate": 7.793705342873057e-06, + "loss": 1.5192, + "mean_token_accuracy": 0.6436646829048792, + "num_tokens": 1915519404.0, + "step": 11425 + }, + { + "entropy": 1.7029032309850056, + "epoch": 1.255197605119332, + "grad_norm": 0.6547440886497498, + "learning_rate": 7.792209316803945e-06, + "loss": 1.4503, + "mean_token_accuracy": 0.6494365930557251, + "num_tokens": 1915706424.0, + "step": 11426 + }, + { + "entropy": 1.6860091984272003, + "epoch": 1.255307462030705, + "grad_norm": 0.7135421633720398, + "learning_rate": 7.790713392259967e-06, + "loss": 1.6007, + "mean_token_accuracy": 0.6431414932012558, + "num_tokens": 1915915669.0, + "step": 11427 + }, + { + "entropy": 1.6765115559101105, + "epoch": 1.2554173189420779, + "grad_norm": 0.6755972504615784, + "learning_rate": 7.78921756928846e-06, + "loss": 1.3682, + "mean_token_accuracy": 0.653800884882609, + "num_tokens": 1916074318.0, + "step": 11428 + }, + { + "entropy": 1.6629555523395538, + "epoch": 1.2555271758534507, + "grad_norm": 0.6233551502227783, + "learning_rate": 7.787721847936773e-06, + "loss": 1.5946, + "mean_token_accuracy": 0.6113560448090235, + "num_tokens": 1916321807.0, + "step": 11429 + }, + { + "entropy": 1.7326057354609172, + "epoch": 1.2556370327648239, + "grad_norm": 0.7778398990631104, + "learning_rate": 7.786226228252245e-06, + "loss": 1.2951, + "mean_token_accuracy": 0.6696663945913315, + "num_tokens": 1916452669.0, + "step": 11430 + }, + { + "entropy": 1.6951783398787181, + "epoch": 1.2557468896761967, + "grad_norm": 0.7745827436447144, + "learning_rate": 7.784730710282203e-06, + "loss": 1.3895, + "mean_token_accuracy": 0.6611627688010534, + "num_tokens": 1916616875.0, + "step": 11431 + }, + { + "entropy": 1.6729782323042552, + "epoch": 1.2558567465875696, + "grad_norm": 0.6417363286018372, + "learning_rate": 7.783235294073986e-06, + "loss": 1.3102, + "mean_token_accuracy": 0.6600176095962524, + "num_tokens": 1916753385.0, + "step": 11432 + }, + { + "entropy": 1.72296741604805, + "epoch": 1.2559666034989427, + "grad_norm": 0.7033810019493103, + "learning_rate": 7.781739979674922e-06, + "loss": 1.3348, + "mean_token_accuracy": 0.6627410103877386, + "num_tokens": 1916948753.0, + "step": 11433 + }, + { + "entropy": 1.795237421989441, + "epoch": 1.2560764604103156, + "grad_norm": 0.8221445679664612, + "learning_rate": 7.780244767132339e-06, + "loss": 1.4476, + "mean_token_accuracy": 0.6527186830838522, + "num_tokens": 1917103771.0, + "step": 11434 + }, + { + "entropy": 1.6837720175584157, + "epoch": 1.2561863173216885, + "grad_norm": 0.7041736245155334, + "learning_rate": 7.778749656493558e-06, + "loss": 1.3005, + "mean_token_accuracy": 0.6646720518668493, + "num_tokens": 1917251916.0, + "step": 11435 + }, + { + "entropy": 1.7194795906543732, + "epoch": 1.2562961742330614, + "grad_norm": 0.7449667453765869, + "learning_rate": 7.7772546478059e-06, + "loss": 1.4377, + "mean_token_accuracy": 0.6714818626642227, + "num_tokens": 1917395245.0, + "step": 11436 + }, + { + "entropy": 1.6425227721532185, + "epoch": 1.2564060311444343, + "grad_norm": 0.6943098902702332, + "learning_rate": 7.77575974111668e-06, + "loss": 1.4591, + "mean_token_accuracy": 0.6591807802518209, + "num_tokens": 1917627333.0, + "step": 11437 + }, + { + "entropy": 1.6901133060455322, + "epoch": 1.2565158880558074, + "grad_norm": 0.6888213753700256, + "learning_rate": 7.774264936473209e-06, + "loss": 1.4082, + "mean_token_accuracy": 0.6591382523377737, + "num_tokens": 1917797362.0, + "step": 11438 + }, + { + "entropy": 1.6233469347159069, + "epoch": 1.2566257449671803, + "grad_norm": 0.6449564695358276, + "learning_rate": 7.772770233922801e-06, + "loss": 1.3172, + "mean_token_accuracy": 0.6671230693658193, + "num_tokens": 1917967575.0, + "step": 11439 + }, + { + "entropy": 1.7254037757714589, + "epoch": 1.2567356018785532, + "grad_norm": 0.7280165553092957, + "learning_rate": 7.771275633512761e-06, + "loss": 1.4346, + "mean_token_accuracy": 0.6726182848215103, + "num_tokens": 1918147690.0, + "step": 11440 + }, + { + "entropy": 1.671642541885376, + "epoch": 1.256845458789926, + "grad_norm": 0.7780535221099854, + "learning_rate": 7.769781135290392e-06, + "loss": 1.2984, + "mean_token_accuracy": 0.6698754082123438, + "num_tokens": 1918277327.0, + "step": 11441 + }, + { + "entropy": 1.7655751307805378, + "epoch": 1.256955315701299, + "grad_norm": 0.7205750346183777, + "learning_rate": 7.768286739302997e-06, + "loss": 1.4342, + "mean_token_accuracy": 0.6538830598195394, + "num_tokens": 1918451845.0, + "step": 11442 + }, + { + "entropy": 1.6561244527498882, + "epoch": 1.257065172612672, + "grad_norm": 0.5723996162414551, + "learning_rate": 7.766792445597867e-06, + "loss": 1.2961, + "mean_token_accuracy": 0.6749825278917948, + "num_tokens": 1918591768.0, + "step": 11443 + }, + { + "entropy": 1.7065655092398326, + "epoch": 1.257175029524045, + "grad_norm": 0.6684293150901794, + "learning_rate": 7.765298254222295e-06, + "loss": 1.3863, + "mean_token_accuracy": 0.6667204201221466, + "num_tokens": 1918751296.0, + "step": 11444 + }, + { + "entropy": 1.7191319068272908, + "epoch": 1.2572848864354178, + "grad_norm": 0.6368053555488586, + "learning_rate": 7.763804165223583e-06, + "loss": 1.3593, + "mean_token_accuracy": 0.6531160324811935, + "num_tokens": 1918911483.0, + "step": 11445 + }, + { + "entropy": 1.7349167664845784, + "epoch": 1.257394743346791, + "grad_norm": 0.6222125291824341, + "learning_rate": 7.762310178649009e-06, + "loss": 1.4368, + "mean_token_accuracy": 0.6423256794611613, + "num_tokens": 1919049670.0, + "step": 11446 + }, + { + "entropy": 1.6775768597920735, + "epoch": 1.2575046002581638, + "grad_norm": 0.682058572769165, + "learning_rate": 7.760816294545859e-06, + "loss": 1.5105, + "mean_token_accuracy": 0.6353928248087565, + "num_tokens": 1919331340.0, + "step": 11447 + }, + { + "entropy": 1.6649962762991588, + "epoch": 1.2576144571695367, + "grad_norm": 0.6357629299163818, + "learning_rate": 7.759322512961414e-06, + "loss": 1.4366, + "mean_token_accuracy": 0.6457183212041855, + "num_tokens": 1919563182.0, + "step": 11448 + }, + { + "entropy": 1.675305445988973, + "epoch": 1.2577243140809096, + "grad_norm": 0.6749522089958191, + "learning_rate": 7.757828833942951e-06, + "loss": 1.3358, + "mean_token_accuracy": 0.6629159996906916, + "num_tokens": 1919785893.0, + "step": 11449 + }, + { + "entropy": 1.7580168048540752, + "epoch": 1.2578341709922825, + "grad_norm": 0.6780009269714355, + "learning_rate": 7.756335257537741e-06, + "loss": 1.4463, + "mean_token_accuracy": 0.6388568629821142, + "num_tokens": 1919960622.0, + "step": 11450 + }, + { + "entropy": 1.7142368654410045, + "epoch": 1.2579440279036556, + "grad_norm": 0.7723596692085266, + "learning_rate": 7.754841783793064e-06, + "loss": 1.3538, + "mean_token_accuracy": 0.6707404851913452, + "num_tokens": 1920101550.0, + "step": 11451 + }, + { + "entropy": 1.741561730702718, + "epoch": 1.2580538848150284, + "grad_norm": 0.7125125527381897, + "learning_rate": 7.753348412756179e-06, + "loss": 1.466, + "mean_token_accuracy": 0.6531734565893809, + "num_tokens": 1920294592.0, + "step": 11452 + }, + { + "entropy": 1.6811016102631886, + "epoch": 1.2581637417264013, + "grad_norm": 0.6982854604721069, + "learning_rate": 7.751855144474354e-06, + "loss": 1.4956, + "mean_token_accuracy": 0.642639954884847, + "num_tokens": 1920433718.0, + "step": 11453 + }, + { + "entropy": 1.7243138253688812, + "epoch": 1.2582735986377744, + "grad_norm": 0.7025527954101562, + "learning_rate": 7.75036197899485e-06, + "loss": 1.2947, + "mean_token_accuracy": 0.6688729325930277, + "num_tokens": 1920552445.0, + "step": 11454 + }, + { + "entropy": 1.7009615500768025, + "epoch": 1.258383455549147, + "grad_norm": 0.7054716944694519, + "learning_rate": 7.748868916364924e-06, + "loss": 1.5373, + "mean_token_accuracy": 0.6239589502414068, + "num_tokens": 1920759876.0, + "step": 11455 + }, + { + "entropy": 1.7350860337416332, + "epoch": 1.2584933124605202, + "grad_norm": 0.6489484906196594, + "learning_rate": 7.747375956631833e-06, + "loss": 1.4164, + "mean_token_accuracy": 0.6492180824279785, + "num_tokens": 1920930265.0, + "step": 11456 + }, + { + "entropy": 1.7335429390271504, + "epoch": 1.258603169371893, + "grad_norm": 2.155296564102173, + "learning_rate": 7.745883099842828e-06, + "loss": 1.2186, + "mean_token_accuracy": 0.6745936175187429, + "num_tokens": 1921145852.0, + "step": 11457 + }, + { + "entropy": 1.6847576002279918, + "epoch": 1.258713026283266, + "grad_norm": 0.7475409507751465, + "learning_rate": 7.744390346045156e-06, + "loss": 1.3127, + "mean_token_accuracy": 0.6665635804335276, + "num_tokens": 1921288993.0, + "step": 11458 + }, + { + "entropy": 1.6852596898873646, + "epoch": 1.258822883194639, + "grad_norm": 0.6202402114868164, + "learning_rate": 7.742897695286063e-06, + "loss": 1.2607, + "mean_token_accuracy": 0.6772429198026657, + "num_tokens": 1921467875.0, + "step": 11459 + }, + { + "entropy": 1.7395563423633575, + "epoch": 1.258932740106012, + "grad_norm": 0.6076090335845947, + "learning_rate": 7.741405147612791e-06, + "loss": 1.3799, + "mean_token_accuracy": 0.6517507483561834, + "num_tokens": 1921609703.0, + "step": 11460 + }, + { + "entropy": 1.6285878519217174, + "epoch": 1.2590425970173849, + "grad_norm": 0.6175395846366882, + "learning_rate": 7.739912703072576e-06, + "loss": 1.3612, + "mean_token_accuracy": 0.6656326601902643, + "num_tokens": 1921784864.0, + "step": 11461 + }, + { + "entropy": 1.7589463591575623, + "epoch": 1.2591524539287577, + "grad_norm": 0.7153595685958862, + "learning_rate": 7.738420361712654e-06, + "loss": 1.4444, + "mean_token_accuracy": 0.6449099431435267, + "num_tokens": 1921959314.0, + "step": 11462 + }, + { + "entropy": 1.7452492415904999, + "epoch": 1.2592623108401306, + "grad_norm": 0.7535262107849121, + "learning_rate": 7.736928123580259e-06, + "loss": 1.6277, + "mean_token_accuracy": 0.6322442690531412, + "num_tokens": 1922134627.0, + "step": 11463 + }, + { + "entropy": 1.7327638566493988, + "epoch": 1.2593721677515037, + "grad_norm": 0.6867733001708984, + "learning_rate": 7.73543598872262e-06, + "loss": 1.2915, + "mean_token_accuracy": 0.6723167101542155, + "num_tokens": 1922251085.0, + "step": 11464 + }, + { + "entropy": 1.6638148029645283, + "epoch": 1.2594820246628766, + "grad_norm": 0.7520348429679871, + "learning_rate": 7.733943957186958e-06, + "loss": 1.3079, + "mean_token_accuracy": 0.667260949810346, + "num_tokens": 1922418678.0, + "step": 11465 + }, + { + "entropy": 1.65032497048378, + "epoch": 1.2595918815742495, + "grad_norm": 0.6573466062545776, + "learning_rate": 7.7324520290205e-06, + "loss": 1.3575, + "mean_token_accuracy": 0.6559811184803644, + "num_tokens": 1922574854.0, + "step": 11466 + }, + { + "entropy": 1.676581472158432, + "epoch": 1.2597017384856226, + "grad_norm": 0.7159135341644287, + "learning_rate": 7.730960204270464e-06, + "loss": 1.4979, + "mean_token_accuracy": 0.6280734737714132, + "num_tokens": 1922802094.0, + "step": 11467 + }, + { + "entropy": 1.643155614535014, + "epoch": 1.2598115953969953, + "grad_norm": 0.6512061953544617, + "learning_rate": 7.729468482984062e-06, + "loss": 1.5478, + "mean_token_accuracy": 0.6497600624958674, + "num_tokens": 1923003559.0, + "step": 11468 + }, + { + "entropy": 1.7391295929749806, + "epoch": 1.2599214523083684, + "grad_norm": 0.7512926459312439, + "learning_rate": 7.727976865208511e-06, + "loss": 1.3226, + "mean_token_accuracy": 0.6600721975167593, + "num_tokens": 1923121923.0, + "step": 11469 + }, + { + "entropy": 1.7174865404764812, + "epoch": 1.2600313092197413, + "grad_norm": 0.6891658306121826, + "learning_rate": 7.726485350991016e-06, + "loss": 1.2844, + "mean_token_accuracy": 0.6672490239143372, + "num_tokens": 1923266801.0, + "step": 11470 + }, + { + "entropy": 1.6965750257174175, + "epoch": 1.2601411661311142, + "grad_norm": 3.922152042388916, + "learning_rate": 7.724993940378784e-06, + "loss": 1.6618, + "mean_token_accuracy": 0.6115802451968193, + "num_tokens": 1923523178.0, + "step": 11471 + }, + { + "entropy": 1.72745943069458, + "epoch": 1.2602510230424873, + "grad_norm": 0.6440877318382263, + "learning_rate": 7.723502633419022e-06, + "loss": 1.4972, + "mean_token_accuracy": 0.6545726358890533, + "num_tokens": 1923703015.0, + "step": 11472 + }, + { + "entropy": 1.693722536166509, + "epoch": 1.2603608799538601, + "grad_norm": 0.7532528638839722, + "learning_rate": 7.722011430158923e-06, + "loss": 1.5349, + "mean_token_accuracy": 0.6389701962471008, + "num_tokens": 1923865936.0, + "step": 11473 + }, + { + "entropy": 1.72782959540685, + "epoch": 1.260470736865233, + "grad_norm": 0.7088679671287537, + "learning_rate": 7.72052033064568e-06, + "loss": 1.2202, + "mean_token_accuracy": 0.6756645192702612, + "num_tokens": 1923968318.0, + "step": 11474 + }, + { + "entropy": 1.6902720232804616, + "epoch": 1.260580593776606, + "grad_norm": 0.612113356590271, + "learning_rate": 7.7190293349265e-06, + "loss": 1.38, + "mean_token_accuracy": 0.6533292979001999, + "num_tokens": 1924177834.0, + "step": 11475 + }, + { + "entropy": 1.6937540173530579, + "epoch": 1.2606904506879788, + "grad_norm": 0.7892910242080688, + "learning_rate": 7.717538443048556e-06, + "loss": 1.4682, + "mean_token_accuracy": 0.6503288199504217, + "num_tokens": 1924315589.0, + "step": 11476 + }, + { + "entropy": 1.7266385753949482, + "epoch": 1.260800307599352, + "grad_norm": 0.7862039804458618, + "learning_rate": 7.716047655059043e-06, + "loss": 1.266, + "mean_token_accuracy": 0.6670999377965927, + "num_tokens": 1924421277.0, + "step": 11477 + }, + { + "entropy": 1.6943805813789368, + "epoch": 1.2609101645107248, + "grad_norm": 0.6646753549575806, + "learning_rate": 7.714556971005145e-06, + "loss": 1.4912, + "mean_token_accuracy": 0.6547419528166453, + "num_tokens": 1924585290.0, + "step": 11478 + }, + { + "entropy": 1.7027036249637604, + "epoch": 1.2610200214220977, + "grad_norm": 0.609962522983551, + "learning_rate": 7.713066390934034e-06, + "loss": 1.4166, + "mean_token_accuracy": 0.6544028073549271, + "num_tokens": 1924754377.0, + "step": 11479 + }, + { + "entropy": 1.7627936601638794, + "epoch": 1.2611298783334708, + "grad_norm": 0.6366637349128723, + "learning_rate": 7.711575914892893e-06, + "loss": 1.4668, + "mean_token_accuracy": 0.6458509564399719, + "num_tokens": 1924953930.0, + "step": 11480 + }, + { + "entropy": 1.720168004433314, + "epoch": 1.2612397352448437, + "grad_norm": 0.6627671122550964, + "learning_rate": 7.710085542928893e-06, + "loss": 1.4203, + "mean_token_accuracy": 0.6433569043874741, + "num_tokens": 1925151322.0, + "step": 11481 + }, + { + "entropy": 1.7099298934141796, + "epoch": 1.2613495921562166, + "grad_norm": 0.7178329825401306, + "learning_rate": 7.708595275089202e-06, + "loss": 1.5419, + "mean_token_accuracy": 0.6385843257109324, + "num_tokens": 1925338468.0, + "step": 11482 + }, + { + "entropy": 1.6707678933938344, + "epoch": 1.2614594490675894, + "grad_norm": 0.6434171795845032, + "learning_rate": 7.707105111420985e-06, + "loss": 1.4085, + "mean_token_accuracy": 0.6535218954086304, + "num_tokens": 1925559149.0, + "step": 11483 + }, + { + "entropy": 1.6994469662507374, + "epoch": 1.2615693059789623, + "grad_norm": 0.6782479882240295, + "learning_rate": 7.705615051971413e-06, + "loss": 1.3478, + "mean_token_accuracy": 0.6566101660331091, + "num_tokens": 1925685270.0, + "step": 11484 + }, + { + "entropy": 1.7290050586064656, + "epoch": 1.2616791628903354, + "grad_norm": 0.649772047996521, + "learning_rate": 7.704125096787636e-06, + "loss": 1.4744, + "mean_token_accuracy": 0.6495188226302465, + "num_tokens": 1925843285.0, + "step": 11485 + }, + { + "entropy": 1.6614007751146953, + "epoch": 1.2617890198017083, + "grad_norm": 0.7200374603271484, + "learning_rate": 7.702635245916814e-06, + "loss": 1.2959, + "mean_token_accuracy": 0.6671645094950994, + "num_tokens": 1925992589.0, + "step": 11486 + }, + { + "entropy": 1.7314343353112538, + "epoch": 1.2618988767130812, + "grad_norm": 0.7426960468292236, + "learning_rate": 7.701145499406106e-06, + "loss": 1.4451, + "mean_token_accuracy": 0.650872215628624, + "num_tokens": 1926166622.0, + "step": 11487 + }, + { + "entropy": 1.7814875145753224, + "epoch": 1.262008733624454, + "grad_norm": 0.7111337184906006, + "learning_rate": 7.69965585730265e-06, + "loss": 1.311, + "mean_token_accuracy": 0.6611191133658091, + "num_tokens": 1926299574.0, + "step": 11488 + }, + { + "entropy": 1.773252805074056, + "epoch": 1.262118590535827, + "grad_norm": 0.6960779428482056, + "learning_rate": 7.698166319653604e-06, + "loss": 1.3395, + "mean_token_accuracy": 0.6631787866353989, + "num_tokens": 1926441744.0, + "step": 11489 + }, + { + "entropy": 1.721550424893697, + "epoch": 1.2622284474472, + "grad_norm": 0.6645501255989075, + "learning_rate": 7.696676886506102e-06, + "loss": 1.4345, + "mean_token_accuracy": 0.652705987294515, + "num_tokens": 1926620603.0, + "step": 11490 + }, + { + "entropy": 1.624302864074707, + "epoch": 1.262338304358573, + "grad_norm": 0.8502170443534851, + "learning_rate": 7.695187557907292e-06, + "loss": 1.1604, + "mean_token_accuracy": 0.6946276426315308, + "num_tokens": 1926753986.0, + "step": 11491 + }, + { + "entropy": 1.7301335036754608, + "epoch": 1.2624481612699459, + "grad_norm": 0.9755292534828186, + "learning_rate": 7.693698333904305e-06, + "loss": 1.4621, + "mean_token_accuracy": 0.6535843859116236, + "num_tokens": 1926923605.0, + "step": 11492 + }, + { + "entropy": 1.691352754831314, + "epoch": 1.262558018181319, + "grad_norm": 0.6778144836425781, + "learning_rate": 7.692209214544276e-06, + "loss": 1.4903, + "mean_token_accuracy": 0.6365737020969391, + "num_tokens": 1927182509.0, + "step": 11493 + }, + { + "entropy": 1.6417991022268932, + "epoch": 1.2626678750926918, + "grad_norm": 0.629219651222229, + "learning_rate": 7.690720199874331e-06, + "loss": 1.355, + "mean_token_accuracy": 0.6662428428729376, + "num_tokens": 1927352842.0, + "step": 11494 + }, + { + "entropy": 1.6753190557161968, + "epoch": 1.2627777320040647, + "grad_norm": 0.6111719608306885, + "learning_rate": 7.689231289941606e-06, + "loss": 1.4178, + "mean_token_accuracy": 0.6571441541115443, + "num_tokens": 1927616855.0, + "step": 11495 + }, + { + "entropy": 1.6729080478350322, + "epoch": 1.2628875889154376, + "grad_norm": 0.6489112973213196, + "learning_rate": 7.687742484793215e-06, + "loss": 1.3208, + "mean_token_accuracy": 0.6776315818230311, + "num_tokens": 1927765143.0, + "step": 11496 + }, + { + "entropy": 1.7384801010290782, + "epoch": 1.2629974458268105, + "grad_norm": 0.7252342104911804, + "learning_rate": 7.686253784476284e-06, + "loss": 1.3797, + "mean_token_accuracy": 0.6533515950043997, + "num_tokens": 1927917822.0, + "step": 11497 + }, + { + "entropy": 1.6913585464159648, + "epoch": 1.2631073027381836, + "grad_norm": 0.8369355797767639, + "learning_rate": 7.684765189037925e-06, + "loss": 1.3558, + "mean_token_accuracy": 0.6676561236381531, + "num_tokens": 1928111621.0, + "step": 11498 + }, + { + "entropy": 1.7651902238527934, + "epoch": 1.2632171596495565, + "grad_norm": 0.7449557781219482, + "learning_rate": 7.683276698525257e-06, + "loss": 1.4675, + "mean_token_accuracy": 0.6476789265871048, + "num_tokens": 1928263994.0, + "step": 11499 + }, + { + "entropy": 1.715785026550293, + "epoch": 1.2633270165609294, + "grad_norm": 0.7928597927093506, + "learning_rate": 7.681788312985383e-06, + "loss": 1.3824, + "mean_token_accuracy": 0.6676217714945475, + "num_tokens": 1928407724.0, + "step": 11500 + }, + { + "entropy": 1.6963496307531993, + "epoch": 1.2634368734723023, + "grad_norm": 0.7719584703445435, + "learning_rate": 7.680300032465418e-06, + "loss": 1.3438, + "mean_token_accuracy": 0.6627303858598074, + "num_tokens": 1928529780.0, + "step": 11501 + }, + { + "entropy": 1.7370944917201996, + "epoch": 1.2635467303836752, + "grad_norm": 0.6030802726745605, + "learning_rate": 7.678811857012461e-06, + "loss": 1.5213, + "mean_token_accuracy": 0.6461159139871597, + "num_tokens": 1928707122.0, + "step": 11502 + }, + { + "entropy": 1.7230145931243896, + "epoch": 1.2636565872950483, + "grad_norm": 0.7120351791381836, + "learning_rate": 7.67732378667361e-06, + "loss": 1.5991, + "mean_token_accuracy": 0.6233674536148707, + "num_tokens": 1928938624.0, + "step": 11503 + }, + { + "entropy": 1.7156452139218648, + "epoch": 1.2637664442064211, + "grad_norm": 0.7055292129516602, + "learning_rate": 7.675835821495965e-06, + "loss": 1.3116, + "mean_token_accuracy": 0.6527894685665766, + "num_tokens": 1929072927.0, + "step": 11504 + }, + { + "entropy": 1.7379835744698842, + "epoch": 1.263876301117794, + "grad_norm": 0.6333845853805542, + "learning_rate": 7.674347961526617e-06, + "loss": 1.4977, + "mean_token_accuracy": 0.6325055857499441, + "num_tokens": 1929263786.0, + "step": 11505 + }, + { + "entropy": 1.676991045475006, + "epoch": 1.2639861580291671, + "grad_norm": 0.6579580903053284, + "learning_rate": 7.672860206812655e-06, + "loss": 1.3853, + "mean_token_accuracy": 0.6635429114103317, + "num_tokens": 1929431076.0, + "step": 11506 + }, + { + "entropy": 1.757521351178487, + "epoch": 1.26409601494054, + "grad_norm": 0.7360755205154419, + "learning_rate": 7.671372557401174e-06, + "loss": 1.5488, + "mean_token_accuracy": 0.623514766494433, + "num_tokens": 1929649046.0, + "step": 11507 + }, + { + "entropy": 1.620929052432378, + "epoch": 1.264205871851913, + "grad_norm": 0.7235838174819946, + "learning_rate": 7.66988501333925e-06, + "loss": 1.3106, + "mean_token_accuracy": 0.6729168196519216, + "num_tokens": 1929847207.0, + "step": 11508 + }, + { + "entropy": 1.6391695042451222, + "epoch": 1.2643157287632858, + "grad_norm": 0.5893858671188354, + "learning_rate": 7.668397574673963e-06, + "loss": 1.4936, + "mean_token_accuracy": 0.6433763305346171, + "num_tokens": 1930052630.0, + "step": 11509 + }, + { + "entropy": 1.675877183675766, + "epoch": 1.2644255856746587, + "grad_norm": 0.6175031661987305, + "learning_rate": 7.666910241452395e-06, + "loss": 1.3547, + "mean_token_accuracy": 0.665746475259463, + "num_tokens": 1930224630.0, + "step": 11510 + }, + { + "entropy": 1.7053968608379364, + "epoch": 1.2645354425860318, + "grad_norm": 0.7241131663322449, + "learning_rate": 7.665423013721611e-06, + "loss": 1.3036, + "mean_token_accuracy": 0.6749317497014999, + "num_tokens": 1930350724.0, + "step": 11511 + }, + { + "entropy": 1.6937896013259888, + "epoch": 1.2646452994974047, + "grad_norm": 0.6429654359817505, + "learning_rate": 7.663935891528686e-06, + "loss": 1.3792, + "mean_token_accuracy": 0.6615054110685984, + "num_tokens": 1930486501.0, + "step": 11512 + }, + { + "entropy": 1.655285765727361, + "epoch": 1.2647551564087776, + "grad_norm": 0.6734585762023926, + "learning_rate": 7.662448874920692e-06, + "loss": 1.3187, + "mean_token_accuracy": 0.6705884784460068, + "num_tokens": 1930635527.0, + "step": 11513 + }, + { + "entropy": 1.6695108612378438, + "epoch": 1.2648650133201504, + "grad_norm": 0.7096666693687439, + "learning_rate": 7.660961963944682e-06, + "loss": 1.3868, + "mean_token_accuracy": 0.6595136175553004, + "num_tokens": 1930815397.0, + "step": 11514 + }, + { + "entropy": 1.6726165413856506, + "epoch": 1.2649748702315233, + "grad_norm": 0.6741091012954712, + "learning_rate": 7.659475158647724e-06, + "loss": 1.4177, + "mean_token_accuracy": 0.6559295405944189, + "num_tokens": 1930982798.0, + "step": 11515 + }, + { + "entropy": 1.7462388277053833, + "epoch": 1.2650847271428964, + "grad_norm": 0.6222598552703857, + "learning_rate": 7.657988459076872e-06, + "loss": 1.538, + "mean_token_accuracy": 0.6377201875050863, + "num_tokens": 1931198546.0, + "step": 11516 + }, + { + "entropy": 1.647881656885147, + "epoch": 1.2651945840542693, + "grad_norm": 0.7117844223976135, + "learning_rate": 7.656501865279178e-06, + "loss": 1.4001, + "mean_token_accuracy": 0.6614241848389307, + "num_tokens": 1931360393.0, + "step": 11517 + }, + { + "entropy": 1.7693034013112385, + "epoch": 1.2653044409656422, + "grad_norm": 0.7837836742401123, + "learning_rate": 7.655015377301693e-06, + "loss": 1.3253, + "mean_token_accuracy": 0.664752279718717, + "num_tokens": 1931480695.0, + "step": 11518 + }, + { + "entropy": 1.6859951515992482, + "epoch": 1.2654142978770153, + "grad_norm": 0.8946231603622437, + "learning_rate": 7.653528995191467e-06, + "loss": 1.3278, + "mean_token_accuracy": 0.6740283519029617, + "num_tokens": 1931619467.0, + "step": 11519 + }, + { + "entropy": 1.6824671526749928, + "epoch": 1.2655241547883882, + "grad_norm": 0.705892026424408, + "learning_rate": 7.652042718995539e-06, + "loss": 1.2626, + "mean_token_accuracy": 0.6893934309482574, + "num_tokens": 1931782165.0, + "step": 11520 + }, + { + "entropy": 1.680429647366206, + "epoch": 1.265634011699761, + "grad_norm": 0.7540983557701111, + "learning_rate": 7.650556548760948e-06, + "loss": 1.3173, + "mean_token_accuracy": 0.6678841362396876, + "num_tokens": 1931937633.0, + "step": 11521 + }, + { + "entropy": 1.648360123236974, + "epoch": 1.265743868611134, + "grad_norm": 0.7288416624069214, + "learning_rate": 7.649070484534737e-06, + "loss": 1.342, + "mean_token_accuracy": 0.6797003994385401, + "num_tokens": 1932146802.0, + "step": 11522 + }, + { + "entropy": 1.7429245710372925, + "epoch": 1.2658537255225069, + "grad_norm": 0.724900484085083, + "learning_rate": 7.647584526363933e-06, + "loss": 1.4145, + "mean_token_accuracy": 0.6517399648825327, + "num_tokens": 1932307870.0, + "step": 11523 + }, + { + "entropy": 1.6724059581756592, + "epoch": 1.26596358243388, + "grad_norm": 0.612019419670105, + "learning_rate": 7.646098674295566e-06, + "loss": 1.4009, + "mean_token_accuracy": 0.6560710817575455, + "num_tokens": 1932481868.0, + "step": 11524 + }, + { + "entropy": 1.6238444844881694, + "epoch": 1.2660734393452528, + "grad_norm": 0.6354291439056396, + "learning_rate": 7.644612928376666e-06, + "loss": 1.5184, + "mean_token_accuracy": 0.653970350821813, + "num_tokens": 1932664212.0, + "step": 11525 + }, + { + "entropy": 1.7032426098982494, + "epoch": 1.2661832962566257, + "grad_norm": 0.6199919581413269, + "learning_rate": 7.643127288654255e-06, + "loss": 1.4766, + "mean_token_accuracy": 0.6471427232027054, + "num_tokens": 1932844482.0, + "step": 11526 + }, + { + "entropy": 1.677983929713567, + "epoch": 1.2662931531679986, + "grad_norm": 0.698670506477356, + "learning_rate": 7.641641755175353e-06, + "loss": 1.3514, + "mean_token_accuracy": 0.6649612784385681, + "num_tokens": 1932999488.0, + "step": 11527 + }, + { + "entropy": 1.7283147772153218, + "epoch": 1.2664030100793715, + "grad_norm": 0.6362758278846741, + "learning_rate": 7.640156327986978e-06, + "loss": 1.5035, + "mean_token_accuracy": 0.6524873872598013, + "num_tokens": 1933210321.0, + "step": 11528 + }, + { + "entropy": 1.6926906903584797, + "epoch": 1.2665128669907446, + "grad_norm": 0.6771997809410095, + "learning_rate": 7.63867100713614e-06, + "loss": 1.4221, + "mean_token_accuracy": 0.6506749987602234, + "num_tokens": 1933393700.0, + "step": 11529 + }, + { + "entropy": 1.7136725882689159, + "epoch": 1.2666227239021175, + "grad_norm": 0.6458131074905396, + "learning_rate": 7.637185792669849e-06, + "loss": 1.3923, + "mean_token_accuracy": 0.6545117845137914, + "num_tokens": 1933537916.0, + "step": 11530 + }, + { + "entropy": 1.678369532028834, + "epoch": 1.2667325808134904, + "grad_norm": 0.739032506942749, + "learning_rate": 7.635700684635112e-06, + "loss": 1.3112, + "mean_token_accuracy": 0.6715343842903773, + "num_tokens": 1933714762.0, + "step": 11531 + }, + { + "entropy": 1.65420796473821, + "epoch": 1.2668424377248635, + "grad_norm": 0.7117313742637634, + "learning_rate": 7.634215683078934e-06, + "loss": 1.446, + "mean_token_accuracy": 0.6615893120567004, + "num_tokens": 1933884409.0, + "step": 11532 + }, + { + "entropy": 1.7248832484086354, + "epoch": 1.2669522946362364, + "grad_norm": 0.6786313056945801, + "learning_rate": 7.632730788048313e-06, + "loss": 1.4713, + "mean_token_accuracy": 0.6573885877927145, + "num_tokens": 1934081855.0, + "step": 11533 + }, + { + "entropy": 1.7391786475976307, + "epoch": 1.2670621515476093, + "grad_norm": 0.6770562529563904, + "learning_rate": 7.631245999590244e-06, + "loss": 1.388, + "mean_token_accuracy": 0.6576652526855469, + "num_tokens": 1934265897.0, + "step": 11534 + }, + { + "entropy": 1.755203555027644, + "epoch": 1.2671720084589821, + "grad_norm": 0.5667737126350403, + "learning_rate": 7.629761317751723e-06, + "loss": 1.4356, + "mean_token_accuracy": 0.6415314426024755, + "num_tokens": 1934470125.0, + "step": 11535 + }, + { + "entropy": 1.7336049179236095, + "epoch": 1.267281865370355, + "grad_norm": 0.6754252910614014, + "learning_rate": 7.628276742579732e-06, + "loss": 1.3752, + "mean_token_accuracy": 0.6565804481506348, + "num_tokens": 1934638119.0, + "step": 11536 + }, + { + "entropy": 1.7392099499702454, + "epoch": 1.2673917222817281, + "grad_norm": 0.6438708305358887, + "learning_rate": 7.626792274121268e-06, + "loss": 1.5711, + "mean_token_accuracy": 0.6428494701782862, + "num_tokens": 1934809654.0, + "step": 11537 + }, + { + "entropy": 1.7270666062831879, + "epoch": 1.267501579193101, + "grad_norm": 0.6939952373504639, + "learning_rate": 7.625307912423308e-06, + "loss": 1.4309, + "mean_token_accuracy": 0.6438876688480377, + "num_tokens": 1934985020.0, + "step": 11538 + }, + { + "entropy": 1.6828594009081523, + "epoch": 1.267611436104474, + "grad_norm": 0.8025250434875488, + "learning_rate": 7.6238236575328315e-06, + "loss": 1.2169, + "mean_token_accuracy": 0.6810509413480759, + "num_tokens": 1935128210.0, + "step": 11539 + }, + { + "entropy": 1.7216303646564484, + "epoch": 1.2677212930158468, + "grad_norm": 1.0964614152908325, + "learning_rate": 7.622339509496814e-06, + "loss": 1.4948, + "mean_token_accuracy": 0.6510275801022848, + "num_tokens": 1935295219.0, + "step": 11540 + }, + { + "entropy": 1.7091183761755626, + "epoch": 1.2678311499272197, + "grad_norm": 0.7597293257713318, + "learning_rate": 7.620855468362232e-06, + "loss": 1.3388, + "mean_token_accuracy": 0.6646958986918131, + "num_tokens": 1935444915.0, + "step": 11541 + }, + { + "entropy": 1.7101080814997356, + "epoch": 1.2679410068385928, + "grad_norm": 0.6305139064788818, + "learning_rate": 7.619371534176045e-06, + "loss": 1.3943, + "mean_token_accuracy": 0.6649887412786484, + "num_tokens": 1935608450.0, + "step": 11542 + }, + { + "entropy": 1.776473770538966, + "epoch": 1.2680508637499657, + "grad_norm": 0.674400269985199, + "learning_rate": 7.6178877069852344e-06, + "loss": 1.4229, + "mean_token_accuracy": 0.658959781130155, + "num_tokens": 1935755983.0, + "step": 11543 + }, + { + "entropy": 1.6880793074766796, + "epoch": 1.2681607206613386, + "grad_norm": 0.8230254054069519, + "learning_rate": 7.616403986836749e-06, + "loss": 1.2906, + "mean_token_accuracy": 0.6654263834158579, + "num_tokens": 1935880326.0, + "step": 11544 + }, + { + "entropy": 1.6400333046913147, + "epoch": 1.2682705775727117, + "grad_norm": 0.6082746386528015, + "learning_rate": 7.614920373777552e-06, + "loss": 1.2296, + "mean_token_accuracy": 0.6776652832825979, + "num_tokens": 1936009216.0, + "step": 11545 + }, + { + "entropy": 1.7601770758628845, + "epoch": 1.2683804344840846, + "grad_norm": 0.8611322045326233, + "learning_rate": 7.613436867854602e-06, + "loss": 1.4269, + "mean_token_accuracy": 0.6573955913384756, + "num_tokens": 1936200547.0, + "step": 11546 + }, + { + "entropy": 1.627968817949295, + "epoch": 1.2684902913954574, + "grad_norm": 0.6086071729660034, + "learning_rate": 7.611953469114848e-06, + "loss": 1.3741, + "mean_token_accuracy": 0.6606210221846899, + "num_tokens": 1936426414.0, + "step": 11547 + }, + { + "entropy": 1.715288132429123, + "epoch": 1.2686001483068303, + "grad_norm": 0.6595588326454163, + "learning_rate": 7.610470177605242e-06, + "loss": 1.3391, + "mean_token_accuracy": 0.6657747477293015, + "num_tokens": 1936579983.0, + "step": 11548 + }, + { + "entropy": 1.7211828331152599, + "epoch": 1.2687100052182032, + "grad_norm": 0.7300513386726379, + "learning_rate": 7.608986993372727e-06, + "loss": 1.4806, + "mean_token_accuracy": 0.6543687780698141, + "num_tokens": 1936713738.0, + "step": 11549 + }, + { + "entropy": 1.6829596360524495, + "epoch": 1.2688198621295763, + "grad_norm": 0.6392272710800171, + "learning_rate": 7.607503916464241e-06, + "loss": 1.3133, + "mean_token_accuracy": 0.6747443874677023, + "num_tokens": 1936857583.0, + "step": 11550 + }, + { + "entropy": 1.682002027829488, + "epoch": 1.2689297190409492, + "grad_norm": 0.6379438638687134, + "learning_rate": 7.606020946926731e-06, + "loss": 1.4315, + "mean_token_accuracy": 0.6531884868939718, + "num_tokens": 1937072118.0, + "step": 11551 + }, + { + "entropy": 1.6802029808362324, + "epoch": 1.269039575952322, + "grad_norm": 0.6880862712860107, + "learning_rate": 7.6045380848071295e-06, + "loss": 1.3077, + "mean_token_accuracy": 0.66480353474617, + "num_tokens": 1937192796.0, + "step": 11552 + }, + { + "entropy": 1.7240610718727112, + "epoch": 1.269149432863695, + "grad_norm": 0.8166324496269226, + "learning_rate": 7.6030553301523665e-06, + "loss": 1.4164, + "mean_token_accuracy": 0.653435026605924, + "num_tokens": 1937370359.0, + "step": 11553 + }, + { + "entropy": 1.6760085920492809, + "epoch": 1.2692592897750679, + "grad_norm": 0.7545213103294373, + "learning_rate": 7.601572683009373e-06, + "loss": 1.2791, + "mean_token_accuracy": 0.6738790373007456, + "num_tokens": 1937515434.0, + "step": 11554 + }, + { + "entropy": 1.7001279195149739, + "epoch": 1.269369146686441, + "grad_norm": 0.6577803492546082, + "learning_rate": 7.60009014342507e-06, + "loss": 1.5541, + "mean_token_accuracy": 0.6371362606684366, + "num_tokens": 1937763386.0, + "step": 11555 + }, + { + "entropy": 1.6969635585943859, + "epoch": 1.2694790035978138, + "grad_norm": 0.6654831171035767, + "learning_rate": 7.598607711446382e-06, + "loss": 1.3728, + "mean_token_accuracy": 0.6597702354192734, + "num_tokens": 1937915864.0, + "step": 11556 + }, + { + "entropy": 1.7329435348510742, + "epoch": 1.2695888605091867, + "grad_norm": 0.7754169702529907, + "learning_rate": 7.59712538712023e-06, + "loss": 1.5217, + "mean_token_accuracy": 0.6538184309999148, + "num_tokens": 1938088629.0, + "step": 11557 + }, + { + "entropy": 1.7482871214548747, + "epoch": 1.2696987174205598, + "grad_norm": 0.6027451157569885, + "learning_rate": 7.595643170493525e-06, + "loss": 1.2809, + "mean_token_accuracy": 0.6688571075598398, + "num_tokens": 1938242998.0, + "step": 11558 + }, + { + "entropy": 1.6371783415476482, + "epoch": 1.2698085743319327, + "grad_norm": 0.7046194672584534, + "learning_rate": 7.594161061613179e-06, + "loss": 1.4464, + "mean_token_accuracy": 0.6568540185689926, + "num_tokens": 1938473136.0, + "step": 11559 + }, + { + "entropy": 1.71583757797877, + "epoch": 1.2699184312433056, + "grad_norm": 0.753637969493866, + "learning_rate": 7.592679060526101e-06, + "loss": 1.5237, + "mean_token_accuracy": 0.6531053235133489, + "num_tokens": 1938604756.0, + "step": 11560 + }, + { + "entropy": 1.7930570244789124, + "epoch": 1.2700282881546785, + "grad_norm": 0.7080893516540527, + "learning_rate": 7.591197167279196e-06, + "loss": 1.5274, + "mean_token_accuracy": 0.6388321270545324, + "num_tokens": 1938794403.0, + "step": 11561 + }, + { + "entropy": 1.6845806340376537, + "epoch": 1.2701381450660514, + "grad_norm": 0.6687464714050293, + "learning_rate": 7.58971538191936e-06, + "loss": 1.2573, + "mean_token_accuracy": 0.6719970951477686, + "num_tokens": 1938929740.0, + "step": 11562 + }, + { + "entropy": 1.6890028317769368, + "epoch": 1.2702480019774245, + "grad_norm": 0.7339609265327454, + "learning_rate": 7.588233704493502e-06, + "loss": 1.3484, + "mean_token_accuracy": 0.6660866936047872, + "num_tokens": 1939075182.0, + "step": 11563 + }, + { + "entropy": 1.7128116687138875, + "epoch": 1.2703578588887974, + "grad_norm": 0.8218494057655334, + "learning_rate": 7.586752135048505e-06, + "loss": 1.3804, + "mean_token_accuracy": 0.6709433694680532, + "num_tokens": 1939238859.0, + "step": 11564 + }, + { + "entropy": 1.6646581888198853, + "epoch": 1.2704677158001703, + "grad_norm": 0.7205196022987366, + "learning_rate": 7.585270673631266e-06, + "loss": 1.29, + "mean_token_accuracy": 0.681825632850329, + "num_tokens": 1939391199.0, + "step": 11565 + }, + { + "entropy": 1.7184071640173595, + "epoch": 1.2705775727115431, + "grad_norm": 0.6637095808982849, + "learning_rate": 7.583789320288675e-06, + "loss": 1.2919, + "mean_token_accuracy": 0.6663111497958502, + "num_tokens": 1939503801.0, + "step": 11566 + }, + { + "entropy": 1.684226264556249, + "epoch": 1.270687429622916, + "grad_norm": 0.6571996212005615, + "learning_rate": 7.58230807506761e-06, + "loss": 1.426, + "mean_token_accuracy": 0.6591875404119492, + "num_tokens": 1939662664.0, + "step": 11567 + }, + { + "entropy": 1.6475600401560466, + "epoch": 1.2707972865342891, + "grad_norm": 0.6276744604110718, + "learning_rate": 7.580826938014953e-06, + "loss": 1.3372, + "mean_token_accuracy": 0.6658165256182352, + "num_tokens": 1939828551.0, + "step": 11568 + }, + { + "entropy": 1.685198297103246, + "epoch": 1.270907143445662, + "grad_norm": 0.6816840171813965, + "learning_rate": 7.579345909177586e-06, + "loss": 1.4007, + "mean_token_accuracy": 0.657182534535726, + "num_tokens": 1939959149.0, + "step": 11569 + }, + { + "entropy": 1.7015057305494945, + "epoch": 1.271017000357035, + "grad_norm": 0.6778846979141235, + "learning_rate": 7.577864988602377e-06, + "loss": 1.4315, + "mean_token_accuracy": 0.6453954130411148, + "num_tokens": 1940132843.0, + "step": 11570 + }, + { + "entropy": 1.7004386285940807, + "epoch": 1.271126857268408, + "grad_norm": 0.7151092886924744, + "learning_rate": 7.5763841763362e-06, + "loss": 1.4591, + "mean_token_accuracy": 0.6519459386666616, + "num_tokens": 1940315082.0, + "step": 11571 + }, + { + "entropy": 1.6989735166231792, + "epoch": 1.271236714179781, + "grad_norm": 0.6294535994529724, + "learning_rate": 7.574903472425923e-06, + "loss": 1.217, + "mean_token_accuracy": 0.6772444297870001, + "num_tokens": 1940439043.0, + "step": 11572 + }, + { + "entropy": 1.7112750212351482, + "epoch": 1.2713465710911538, + "grad_norm": 0.7392633557319641, + "learning_rate": 7.573422876918404e-06, + "loss": 1.4047, + "mean_token_accuracy": 0.6572145769993464, + "num_tokens": 1940580555.0, + "step": 11573 + }, + { + "entropy": 1.783184975385666, + "epoch": 1.2714564280025267, + "grad_norm": 0.6250627040863037, + "learning_rate": 7.571942389860507e-06, + "loss": 1.4025, + "mean_token_accuracy": 0.6510418156782786, + "num_tokens": 1940756497.0, + "step": 11574 + }, + { + "entropy": 1.6986599067846935, + "epoch": 1.2715662849138996, + "grad_norm": 0.6578481197357178, + "learning_rate": 7.570462011299091e-06, + "loss": 1.2965, + "mean_token_accuracy": 0.6748481144507726, + "num_tokens": 1940890219.0, + "step": 11575 + }, + { + "entropy": 1.7359587053457897, + "epoch": 1.2716761418252727, + "grad_norm": 0.6191852688789368, + "learning_rate": 7.568981741281007e-06, + "loss": 1.468, + "mean_token_accuracy": 0.6537004808584849, + "num_tokens": 1941083366.0, + "step": 11576 + }, + { + "entropy": 1.6769965887069702, + "epoch": 1.2717859987366456, + "grad_norm": 0.639702320098877, + "learning_rate": 7.567501579853103e-06, + "loss": 1.5604, + "mean_token_accuracy": 0.6393257280190786, + "num_tokens": 1941330691.0, + "step": 11577 + }, + { + "entropy": 1.695969820022583, + "epoch": 1.2718958556480184, + "grad_norm": 0.6549391746520996, + "learning_rate": 7.5660215270622306e-06, + "loss": 1.393, + "mean_token_accuracy": 0.6529108683268229, + "num_tokens": 1941483759.0, + "step": 11578 + }, + { + "entropy": 1.6532461146513622, + "epoch": 1.2720057125593913, + "grad_norm": 0.5573631525039673, + "learning_rate": 7.5645415829552275e-06, + "loss": 1.424, + "mean_token_accuracy": 0.6503102580706278, + "num_tokens": 1941662294.0, + "step": 11579 + }, + { + "entropy": 1.720036009947459, + "epoch": 1.2721155694707642, + "grad_norm": 0.6423214673995972, + "learning_rate": 7.56306174757893e-06, + "loss": 1.3438, + "mean_token_accuracy": 0.6584438482920328, + "num_tokens": 1941797711.0, + "step": 11580 + }, + { + "entropy": 1.6889376938343048, + "epoch": 1.2722254263821373, + "grad_norm": 0.6410171389579773, + "learning_rate": 7.5615820209801875e-06, + "loss": 1.4084, + "mean_token_accuracy": 0.6631045937538147, + "num_tokens": 1941925972.0, + "step": 11581 + }, + { + "entropy": 1.7637586692969005, + "epoch": 1.2723352832935102, + "grad_norm": 1.9716415405273438, + "learning_rate": 7.560102403205822e-06, + "loss": 1.1051, + "mean_token_accuracy": 0.676330178976059, + "num_tokens": 1942076811.0, + "step": 11582 + }, + { + "entropy": 1.6584815084934235, + "epoch": 1.272445140204883, + "grad_norm": 0.6023903489112854, + "learning_rate": 7.558622894302663e-06, + "loss": 1.4478, + "mean_token_accuracy": 0.660874476035436, + "num_tokens": 1942249799.0, + "step": 11583 + }, + { + "entropy": 1.7897962033748627, + "epoch": 1.2725549971162562, + "grad_norm": 0.7759119868278503, + "learning_rate": 7.557143494317543e-06, + "loss": 1.2283, + "mean_token_accuracy": 0.6802993218104044, + "num_tokens": 1942363001.0, + "step": 11584 + }, + { + "entropy": 1.7277598679065704, + "epoch": 1.272664854027629, + "grad_norm": 0.5648651719093323, + "learning_rate": 7.5556642032972774e-06, + "loss": 1.3947, + "mean_token_accuracy": 0.6426876882712046, + "num_tokens": 1942584633.0, + "step": 11585 + }, + { + "entropy": 1.7164186437924702, + "epoch": 1.272774710939002, + "grad_norm": 0.7039127349853516, + "learning_rate": 7.554185021288684e-06, + "loss": 1.5314, + "mean_token_accuracy": 0.6496036102374395, + "num_tokens": 1942729133.0, + "step": 11586 + }, + { + "entropy": 1.6905947029590607, + "epoch": 1.2728845678503748, + "grad_norm": 0.6478644609451294, + "learning_rate": 7.5527059483385875e-06, + "loss": 1.3381, + "mean_token_accuracy": 0.659003218015035, + "num_tokens": 1942862637.0, + "step": 11587 + }, + { + "entropy": 1.6825307210286458, + "epoch": 1.2729944247617477, + "grad_norm": 0.6765702962875366, + "learning_rate": 7.551226984493793e-06, + "loss": 1.3988, + "mean_token_accuracy": 0.6641071836153666, + "num_tokens": 1943082999.0, + "step": 11588 + }, + { + "entropy": 1.7173330585161846, + "epoch": 1.2731042816731208, + "grad_norm": 0.8550540804862976, + "learning_rate": 7.549748129801109e-06, + "loss": 1.5485, + "mean_token_accuracy": 0.6394771635532379, + "num_tokens": 1943238205.0, + "step": 11589 + }, + { + "entropy": 1.6354697545369465, + "epoch": 1.2732141385844937, + "grad_norm": 0.688818633556366, + "learning_rate": 7.548269384307345e-06, + "loss": 1.2072, + "mean_token_accuracy": 0.6835995813210806, + "num_tokens": 1943351183.0, + "step": 11590 + }, + { + "entropy": 1.6105882823467255, + "epoch": 1.2733239954958666, + "grad_norm": 0.6564744710922241, + "learning_rate": 7.5467907480592984e-06, + "loss": 1.3681, + "mean_token_accuracy": 0.6682546585798264, + "num_tokens": 1943561853.0, + "step": 11591 + }, + { + "entropy": 1.6714021066824596, + "epoch": 1.2734338524072395, + "grad_norm": 0.6318192481994629, + "learning_rate": 7.545312221103765e-06, + "loss": 1.3323, + "mean_token_accuracy": 0.669236014286677, + "num_tokens": 1943736887.0, + "step": 11592 + }, + { + "entropy": 1.7196357150872548, + "epoch": 1.2735437093186124, + "grad_norm": 0.7883795499801636, + "learning_rate": 7.543833803487548e-06, + "loss": 1.4954, + "mean_token_accuracy": 0.6526716152826945, + "num_tokens": 1943908941.0, + "step": 11593 + }, + { + "entropy": 1.6909798383712769, + "epoch": 1.2736535662299855, + "grad_norm": 0.7004644870758057, + "learning_rate": 7.542355495257432e-06, + "loss": 1.4842, + "mean_token_accuracy": 0.6391565153996149, + "num_tokens": 1944106941.0, + "step": 11594 + }, + { + "entropy": 1.7800021568934123, + "epoch": 1.2737634231413584, + "grad_norm": 0.8370211124420166, + "learning_rate": 7.540877296460205e-06, + "loss": 1.2816, + "mean_token_accuracy": 0.6755285759766897, + "num_tokens": 1944294593.0, + "step": 11595 + }, + { + "entropy": 1.7417665024598439, + "epoch": 1.2738732800527313, + "grad_norm": 0.663817822933197, + "learning_rate": 7.539399207142657e-06, + "loss": 1.4015, + "mean_token_accuracy": 0.6470625003178915, + "num_tokens": 1944448204.0, + "step": 11596 + }, + { + "entropy": 1.717939426501592, + "epoch": 1.2739831369641044, + "grad_norm": 0.6097803115844727, + "learning_rate": 7.537921227351561e-06, + "loss": 1.3607, + "mean_token_accuracy": 0.6571676184733709, + "num_tokens": 1944609689.0, + "step": 11597 + }, + { + "entropy": 1.709089497725169, + "epoch": 1.2740929938754773, + "grad_norm": 0.7132073044776917, + "learning_rate": 7.536443357133696e-06, + "loss": 1.4271, + "mean_token_accuracy": 0.6546765118837357, + "num_tokens": 1944787935.0, + "step": 11598 + }, + { + "entropy": 1.6773851712544758, + "epoch": 1.2742028507868501, + "grad_norm": 0.6749030947685242, + "learning_rate": 7.5349655965358415e-06, + "loss": 1.4296, + "mean_token_accuracy": 0.6590938319762548, + "num_tokens": 1944997360.0, + "step": 11599 + }, + { + "entropy": 1.69098565975825, + "epoch": 1.274312707698223, + "grad_norm": 0.6707255244255066, + "learning_rate": 7.533487945604765e-06, + "loss": 1.322, + "mean_token_accuracy": 0.6724912573893865, + "num_tokens": 1945134346.0, + "step": 11600 + }, + { + "entropy": 1.7805437743663788, + "epoch": 1.274422564609596, + "grad_norm": 0.6537451148033142, + "learning_rate": 7.532010404387231e-06, + "loss": 1.4231, + "mean_token_accuracy": 0.646497001250585, + "num_tokens": 1945327281.0, + "step": 11601 + }, + { + "entropy": 1.666386862595876, + "epoch": 1.274532421520969, + "grad_norm": 0.6942588686943054, + "learning_rate": 7.530532972930007e-06, + "loss": 1.2446, + "mean_token_accuracy": 0.6778380324443182, + "num_tokens": 1945439224.0, + "step": 11602 + }, + { + "entropy": 1.7151568233966827, + "epoch": 1.274642278432342, + "grad_norm": 0.7166895866394043, + "learning_rate": 7.529055651279851e-06, + "loss": 1.3638, + "mean_token_accuracy": 0.6571328192949295, + "num_tokens": 1945660352.0, + "step": 11603 + }, + { + "entropy": 1.7030317882696788, + "epoch": 1.2747521353437148, + "grad_norm": 0.6731720566749573, + "learning_rate": 7.5275784394835135e-06, + "loss": 1.3623, + "mean_token_accuracy": 0.6527252991994222, + "num_tokens": 1945809702.0, + "step": 11604 + }, + { + "entropy": 1.7581920226414998, + "epoch": 1.2748619922550877, + "grad_norm": 0.6831167936325073, + "learning_rate": 7.526101337587761e-06, + "loss": 1.3561, + "mean_token_accuracy": 0.6571609377861023, + "num_tokens": 1945959339.0, + "step": 11605 + }, + { + "entropy": 1.6485190987586975, + "epoch": 1.2749718491664606, + "grad_norm": 0.6158422827720642, + "learning_rate": 7.524624345639333e-06, + "loss": 1.3174, + "mean_token_accuracy": 0.6634372224410375, + "num_tokens": 1946151020.0, + "step": 11606 + }, + { + "entropy": 1.6657854715983074, + "epoch": 1.2750817060778337, + "grad_norm": 0.622463583946228, + "learning_rate": 7.5231474636849785e-06, + "loss": 1.3579, + "mean_token_accuracy": 0.645659883817037, + "num_tokens": 1946309288.0, + "step": 11607 + }, + { + "entropy": 1.7252983450889587, + "epoch": 1.2751915629892066, + "grad_norm": 0.7152490615844727, + "learning_rate": 7.521670691771443e-06, + "loss": 1.4609, + "mean_token_accuracy": 0.6484145522117615, + "num_tokens": 1946497861.0, + "step": 11608 + }, + { + "entropy": 1.6993794043858845, + "epoch": 1.2753014199005794, + "grad_norm": 0.6890069842338562, + "learning_rate": 7.52019402994546e-06, + "loss": 1.4335, + "mean_token_accuracy": 0.6603013724088669, + "num_tokens": 1946661551.0, + "step": 11609 + }, + { + "entropy": 1.7552596231301625, + "epoch": 1.2754112768119525, + "grad_norm": 0.6681763529777527, + "learning_rate": 7.5187174782537675e-06, + "loss": 1.4473, + "mean_token_accuracy": 0.6456053505341212, + "num_tokens": 1946840538.0, + "step": 11610 + }, + { + "entropy": 1.7156967719395955, + "epoch": 1.2755211337233254, + "grad_norm": 0.629675567150116, + "learning_rate": 7.517241036743097e-06, + "loss": 1.5218, + "mean_token_accuracy": 0.6306114296118418, + "num_tokens": 1947058986.0, + "step": 11611 + }, + { + "entropy": 1.7247178852558136, + "epoch": 1.2756309906346983, + "grad_norm": 0.6354183554649353, + "learning_rate": 7.51576470546018e-06, + "loss": 1.4083, + "mean_token_accuracy": 0.6475364615519842, + "num_tokens": 1947200502.0, + "step": 11612 + }, + { + "entropy": 1.7191159228483837, + "epoch": 1.2757408475460712, + "grad_norm": 0.5886407494544983, + "learning_rate": 7.514288484451742e-06, + "loss": 1.4599, + "mean_token_accuracy": 0.6498788446187973, + "num_tokens": 1947384612.0, + "step": 11613 + }, + { + "entropy": 1.7265147765477498, + "epoch": 1.275850704457444, + "grad_norm": 0.6430819630622864, + "learning_rate": 7.5128123737645e-06, + "loss": 1.4648, + "mean_token_accuracy": 0.6587880849838257, + "num_tokens": 1947533842.0, + "step": 11614 + }, + { + "entropy": 1.6808937191963196, + "epoch": 1.2759605613688172, + "grad_norm": 0.6885290741920471, + "learning_rate": 7.511336373445175e-06, + "loss": 1.4076, + "mean_token_accuracy": 0.6503855834404627, + "num_tokens": 1947734266.0, + "step": 11615 + }, + { + "entropy": 1.7028583685557048, + "epoch": 1.27607041828019, + "grad_norm": 0.7654819488525391, + "learning_rate": 7.5098604835404856e-06, + "loss": 1.3317, + "mean_token_accuracy": 0.6681941697994868, + "num_tokens": 1947850050.0, + "step": 11616 + }, + { + "entropy": 1.7267645796140034, + "epoch": 1.276180275191563, + "grad_norm": 0.6581327319145203, + "learning_rate": 7.508384704097134e-06, + "loss": 1.4452, + "mean_token_accuracy": 0.6505034416913986, + "num_tokens": 1948033475.0, + "step": 11617 + }, + { + "entropy": 1.6980148752530415, + "epoch": 1.2762901321029358, + "grad_norm": 0.7092710137367249, + "learning_rate": 7.506909035161833e-06, + "loss": 1.3132, + "mean_token_accuracy": 0.6706616580486298, + "num_tokens": 1948154888.0, + "step": 11618 + }, + { + "entropy": 1.6872047583262126, + "epoch": 1.2763999890143087, + "grad_norm": 0.7112807035446167, + "learning_rate": 7.505433476781292e-06, + "loss": 1.2504, + "mean_token_accuracy": 0.6689596921205521, + "num_tokens": 1948286056.0, + "step": 11619 + }, + { + "entropy": 1.7642404039700825, + "epoch": 1.2765098459256818, + "grad_norm": 0.7342185974121094, + "learning_rate": 7.5039580290022054e-06, + "loss": 1.495, + "mean_token_accuracy": 0.6511110663414001, + "num_tokens": 1948447031.0, + "step": 11620 + }, + { + "entropy": 1.714083880186081, + "epoch": 1.2766197028370547, + "grad_norm": 0.8669022917747498, + "learning_rate": 7.502482691871269e-06, + "loss": 1.1501, + "mean_token_accuracy": 0.6964519172906876, + "num_tokens": 1948544481.0, + "step": 11621 + }, + { + "entropy": 1.6839018563429515, + "epoch": 1.2767295597484276, + "grad_norm": 0.6407862901687622, + "learning_rate": 7.501007465435182e-06, + "loss": 1.568, + "mean_token_accuracy": 0.6368062049150467, + "num_tokens": 1948753108.0, + "step": 11622 + }, + { + "entropy": 1.763452668984731, + "epoch": 1.2768394166598007, + "grad_norm": 0.7078571319580078, + "learning_rate": 7.499532349740631e-06, + "loss": 1.5233, + "mean_token_accuracy": 0.6335721065600713, + "num_tokens": 1948921783.0, + "step": 11623 + }, + { + "entropy": 1.6938395102818806, + "epoch": 1.2769492735711736, + "grad_norm": 0.6525269150733948, + "learning_rate": 7.498057344834302e-06, + "loss": 1.5406, + "mean_token_accuracy": 0.6494365582863489, + "num_tokens": 1949114943.0, + "step": 11624 + }, + { + "entropy": 1.6960388819376628, + "epoch": 1.2770591304825465, + "grad_norm": 0.6488698124885559, + "learning_rate": 7.496582450762881e-06, + "loss": 1.3803, + "mean_token_accuracy": 0.6615366737047831, + "num_tokens": 1949274514.0, + "step": 11625 + }, + { + "entropy": 1.6426782707373302, + "epoch": 1.2771689873939194, + "grad_norm": 0.6749052405357361, + "learning_rate": 7.495107667573047e-06, + "loss": 1.3651, + "mean_token_accuracy": 0.6629842420419058, + "num_tokens": 1949433286.0, + "step": 11626 + }, + { + "entropy": 1.7444765071074169, + "epoch": 1.2772788443052923, + "grad_norm": 0.5791497230529785, + "learning_rate": 7.493632995311477e-06, + "loss": 1.373, + "mean_token_accuracy": 0.6536852220694224, + "num_tokens": 1949597131.0, + "step": 11627 + }, + { + "entropy": 1.6915338238080342, + "epoch": 1.2773887012166654, + "grad_norm": 0.8342865109443665, + "learning_rate": 7.492158434024846e-06, + "loss": 1.6073, + "mean_token_accuracy": 0.6346415231625239, + "num_tokens": 1949781046.0, + "step": 11628 + }, + { + "entropy": 1.7401387890179951, + "epoch": 1.2774985581280383, + "grad_norm": 0.62762051820755, + "learning_rate": 7.490683983759814e-06, + "loss": 1.435, + "mean_token_accuracy": 0.6580404887596766, + "num_tokens": 1949912389.0, + "step": 11629 + }, + { + "entropy": 1.6930598020553589, + "epoch": 1.2776084150394111, + "grad_norm": 0.6954199075698853, + "learning_rate": 7.489209644563053e-06, + "loss": 1.416, + "mean_token_accuracy": 0.659172311425209, + "num_tokens": 1950093173.0, + "step": 11630 + }, + { + "entropy": 1.6991178691387177, + "epoch": 1.277718271950784, + "grad_norm": 0.712602972984314, + "learning_rate": 7.487735416481227e-06, + "loss": 1.306, + "mean_token_accuracy": 0.6658920894066492, + "num_tokens": 1950240503.0, + "step": 11631 + }, + { + "entropy": 1.7666970590750377, + "epoch": 1.277828128862157, + "grad_norm": 0.6580962538719177, + "learning_rate": 7.486261299560993e-06, + "loss": 1.4578, + "mean_token_accuracy": 0.660940021276474, + "num_tokens": 1950405403.0, + "step": 11632 + }, + { + "entropy": 1.684452474117279, + "epoch": 1.27793798577353, + "grad_norm": 0.8572995662689209, + "learning_rate": 7.484787293849003e-06, + "loss": 1.2695, + "mean_token_accuracy": 0.6728391995032629, + "num_tokens": 1950539727.0, + "step": 11633 + }, + { + "entropy": 1.717938760916392, + "epoch": 1.278047842684903, + "grad_norm": 0.7117380499839783, + "learning_rate": 7.483313399391914e-06, + "loss": 1.3573, + "mean_token_accuracy": 0.6588635991017023, + "num_tokens": 1950689158.0, + "step": 11634 + }, + { + "entropy": 1.703975349664688, + "epoch": 1.2781576995962758, + "grad_norm": 3.185786724090576, + "learning_rate": 7.48183961623637e-06, + "loss": 1.5718, + "mean_token_accuracy": 0.6170200606187185, + "num_tokens": 1950984775.0, + "step": 11635 + }, + { + "entropy": 1.7145447830359142, + "epoch": 1.278267556507649, + "grad_norm": 0.689428985118866, + "learning_rate": 7.480365944429013e-06, + "loss": 1.5036, + "mean_token_accuracy": 0.6465061157941818, + "num_tokens": 1951196598.0, + "step": 11636 + }, + { + "entropy": 1.7026291191577911, + "epoch": 1.2783774134190218, + "grad_norm": 0.6487104296684265, + "learning_rate": 7.478892384016494e-06, + "loss": 1.5404, + "mean_token_accuracy": 0.6514692256848017, + "num_tokens": 1951402964.0, + "step": 11637 + }, + { + "entropy": 1.6885337332884471, + "epoch": 1.2784872703303947, + "grad_norm": 0.7306270599365234, + "learning_rate": 7.477418935045442e-06, + "loss": 1.4906, + "mean_token_accuracy": 0.6599554171164831, + "num_tokens": 1951561045.0, + "step": 11638 + }, + { + "entropy": 1.6858366429805756, + "epoch": 1.2785971272417676, + "grad_norm": 0.6280055046081543, + "learning_rate": 7.475945597562491e-06, + "loss": 1.4303, + "mean_token_accuracy": 0.6426028609275818, + "num_tokens": 1951751395.0, + "step": 11639 + }, + { + "entropy": 1.7249255081017811, + "epoch": 1.2787069841531404, + "grad_norm": 0.6180586218833923, + "learning_rate": 7.4744723716142785e-06, + "loss": 1.5163, + "mean_token_accuracy": 0.6588364889224371, + "num_tokens": 1951920690.0, + "step": 11640 + }, + { + "entropy": 1.6651106576124828, + "epoch": 1.2788168410645135, + "grad_norm": 0.629157304763794, + "learning_rate": 7.472999257247424e-06, + "loss": 1.3646, + "mean_token_accuracy": 0.6712455501159033, + "num_tokens": 1952071357.0, + "step": 11641 + }, + { + "entropy": 1.7360005180040996, + "epoch": 1.2789266979758864, + "grad_norm": 0.6886469125747681, + "learning_rate": 7.471526254508552e-06, + "loss": 1.2119, + "mean_token_accuracy": 0.6848239749670029, + "num_tokens": 1952185288.0, + "step": 11642 + }, + { + "entropy": 1.7535901367664337, + "epoch": 1.2790365548872593, + "grad_norm": 0.8039774298667908, + "learning_rate": 7.470053363444288e-06, + "loss": 1.3669, + "mean_token_accuracy": 0.6577004939317703, + "num_tokens": 1952336655.0, + "step": 11643 + }, + { + "entropy": 1.7281550963719685, + "epoch": 1.2791464117986322, + "grad_norm": 0.6642824411392212, + "learning_rate": 7.4685805841012414e-06, + "loss": 1.3183, + "mean_token_accuracy": 0.6607625285784403, + "num_tokens": 1952503015.0, + "step": 11644 + }, + { + "entropy": 1.6712844371795654, + "epoch": 1.279256268710005, + "grad_norm": 0.6621568202972412, + "learning_rate": 7.467107916526028e-06, + "loss": 1.3695, + "mean_token_accuracy": 0.6621130158503851, + "num_tokens": 1952675470.0, + "step": 11645 + }, + { + "entropy": 1.6059234241644542, + "epoch": 1.2793661256213782, + "grad_norm": 0.6320291757583618, + "learning_rate": 7.46563536076526e-06, + "loss": 1.283, + "mean_token_accuracy": 0.6788782924413681, + "num_tokens": 1952802099.0, + "step": 11646 + }, + { + "entropy": 1.7441943685213726, + "epoch": 1.279475982532751, + "grad_norm": 0.6545817255973816, + "learning_rate": 7.464162916865541e-06, + "loss": 1.3842, + "mean_token_accuracy": 0.6757529973983765, + "num_tokens": 1952974376.0, + "step": 11647 + }, + { + "entropy": 1.6520853539307911, + "epoch": 1.279585839444124, + "grad_norm": 0.6548392176628113, + "learning_rate": 7.462690584873467e-06, + "loss": 1.355, + "mean_token_accuracy": 0.6545472939809164, + "num_tokens": 1953140371.0, + "step": 11648 + }, + { + "entropy": 1.7266732851664226, + "epoch": 1.279695696355497, + "grad_norm": 0.8059017062187195, + "learning_rate": 7.461218364835645e-06, + "loss": 1.2907, + "mean_token_accuracy": 0.6783981472253799, + "num_tokens": 1953295038.0, + "step": 11649 + }, + { + "entropy": 1.6765375832716625, + "epoch": 1.27980555326687, + "grad_norm": 0.6597868204116821, + "learning_rate": 7.459746256798666e-06, + "loss": 1.3894, + "mean_token_accuracy": 0.6578025966882706, + "num_tokens": 1953454565.0, + "step": 11650 + }, + { + "entropy": 1.666476051012675, + "epoch": 1.2799154101782428, + "grad_norm": 0.6877656579017639, + "learning_rate": 7.4582742608091244e-06, + "loss": 1.4281, + "mean_token_accuracy": 0.6640834957361221, + "num_tokens": 1953657839.0, + "step": 11651 + }, + { + "entropy": 1.6953352391719818, + "epoch": 1.2800252670896157, + "grad_norm": 0.6984429359436035, + "learning_rate": 7.456802376913608e-06, + "loss": 1.3965, + "mean_token_accuracy": 0.670659194389979, + "num_tokens": 1953769451.0, + "step": 11652 + }, + { + "entropy": 1.66109103957812, + "epoch": 1.2801351240009886, + "grad_norm": 0.6225873827934265, + "learning_rate": 7.455330605158697e-06, + "loss": 1.4248, + "mean_token_accuracy": 0.6516261696815491, + "num_tokens": 1953987279.0, + "step": 11653 + }, + { + "entropy": 1.7144280870755513, + "epoch": 1.2802449809123617, + "grad_norm": 0.6939162015914917, + "learning_rate": 7.453858945590973e-06, + "loss": 1.4259, + "mean_token_accuracy": 0.6643421500921249, + "num_tokens": 1954139885.0, + "step": 11654 + }, + { + "entropy": 1.6657731036345165, + "epoch": 1.2803548378237346, + "grad_norm": 0.7827641367912292, + "learning_rate": 7.45238739825702e-06, + "loss": 1.3829, + "mean_token_accuracy": 0.6760758807261785, + "num_tokens": 1954304470.0, + "step": 11655 + }, + { + "entropy": 1.6782557964324951, + "epoch": 1.2804646947351075, + "grad_norm": 0.7069709897041321, + "learning_rate": 7.4509159632034045e-06, + "loss": 1.3769, + "mean_token_accuracy": 0.6612060517072678, + "num_tokens": 1954436039.0, + "step": 11656 + }, + { + "entropy": 1.6865948935349782, + "epoch": 1.2805745516464806, + "grad_norm": 0.5696167945861816, + "learning_rate": 7.449444640476702e-06, + "loss": 1.4007, + "mean_token_accuracy": 0.6477059076229731, + "num_tokens": 1954623170.0, + "step": 11657 + }, + { + "entropy": 1.7937167088190715, + "epoch": 1.2806844085578533, + "grad_norm": 0.8088985085487366, + "learning_rate": 7.447973430123476e-06, + "loss": 1.5221, + "mean_token_accuracy": 0.6332688679297765, + "num_tokens": 1954786672.0, + "step": 11658 + }, + { + "entropy": 1.6993589500586193, + "epoch": 1.2807942654692264, + "grad_norm": 0.6645467877388, + "learning_rate": 7.446502332190289e-06, + "loss": 1.3088, + "mean_token_accuracy": 0.6647295008103052, + "num_tokens": 1954897607.0, + "step": 11659 + }, + { + "entropy": 1.6773101290067036, + "epoch": 1.2809041223805993, + "grad_norm": 0.8003481030464172, + "learning_rate": 7.445031346723699e-06, + "loss": 1.3166, + "mean_token_accuracy": 0.6816578855117162, + "num_tokens": 1955037208.0, + "step": 11660 + }, + { + "entropy": 1.6241084535916646, + "epoch": 1.2810139792919721, + "grad_norm": 0.8966746926307678, + "learning_rate": 7.443560473770271e-06, + "loss": 1.395, + "mean_token_accuracy": 0.6726480275392532, + "num_tokens": 1955209074.0, + "step": 11661 + }, + { + "entropy": 1.7033714254697163, + "epoch": 1.2811238362033452, + "grad_norm": 0.7265210151672363, + "learning_rate": 7.442089713376548e-06, + "loss": 1.2868, + "mean_token_accuracy": 0.6727895885705948, + "num_tokens": 1955344048.0, + "step": 11662 + }, + { + "entropy": 1.6858701407909393, + "epoch": 1.2812336931147181, + "grad_norm": 0.7443154454231262, + "learning_rate": 7.440619065589083e-06, + "loss": 1.4205, + "mean_token_accuracy": 0.6643148511648178, + "num_tokens": 1955487623.0, + "step": 11663 + }, + { + "entropy": 1.6719338993231456, + "epoch": 1.281343550026091, + "grad_norm": 0.7148160934448242, + "learning_rate": 7.439148530454423e-06, + "loss": 1.5308, + "mean_token_accuracy": 0.6345583150784174, + "num_tokens": 1955669240.0, + "step": 11664 + }, + { + "entropy": 1.6509188016255696, + "epoch": 1.281453406937464, + "grad_norm": 0.80116868019104, + "learning_rate": 7.437678108019104e-06, + "loss": 1.4478, + "mean_token_accuracy": 0.6688994914293289, + "num_tokens": 1955847696.0, + "step": 11665 + }, + { + "entropy": 1.7179848750432332, + "epoch": 1.2815632638488368, + "grad_norm": 0.6163962483406067, + "learning_rate": 7.436207798329667e-06, + "loss": 1.4411, + "mean_token_accuracy": 0.6562605003515879, + "num_tokens": 1956010904.0, + "step": 11666 + }, + { + "entropy": 1.695182869831721, + "epoch": 1.28167312076021, + "grad_norm": 0.6897042989730835, + "learning_rate": 7.434737601432651e-06, + "loss": 1.3903, + "mean_token_accuracy": 0.6611831237872442, + "num_tokens": 1956172860.0, + "step": 11667 + }, + { + "entropy": 1.6385972301165264, + "epoch": 1.2817829776715828, + "grad_norm": 0.5938105583190918, + "learning_rate": 7.43326751737458e-06, + "loss": 1.4496, + "mean_token_accuracy": 0.6599717885255814, + "num_tokens": 1956402001.0, + "step": 11668 + }, + { + "entropy": 1.6853100558121998, + "epoch": 1.2818928345829557, + "grad_norm": 0.7520754337310791, + "learning_rate": 7.4317975462019885e-06, + "loss": 1.4595, + "mean_token_accuracy": 0.6442477852106094, + "num_tokens": 1956542664.0, + "step": 11669 + }, + { + "entropy": 1.695339282353719, + "epoch": 1.2820026914943288, + "grad_norm": 0.6549242734909058, + "learning_rate": 7.430327687961394e-06, + "loss": 1.3846, + "mean_token_accuracy": 0.6543597926696142, + "num_tokens": 1956719717.0, + "step": 11670 + }, + { + "entropy": 1.678192138671875, + "epoch": 1.2821125484057014, + "grad_norm": 0.7235942482948303, + "learning_rate": 7.428857942699322e-06, + "loss": 1.396, + "mean_token_accuracy": 0.6574389437834421, + "num_tokens": 1956849343.0, + "step": 11671 + }, + { + "entropy": 1.6636808514595032, + "epoch": 1.2822224053170745, + "grad_norm": 0.664930522441864, + "learning_rate": 7.427388310462285e-06, + "loss": 1.3348, + "mean_token_accuracy": 0.6659966111183167, + "num_tokens": 1957000013.0, + "step": 11672 + }, + { + "entropy": 1.6923915545145671, + "epoch": 1.2823322622284474, + "grad_norm": 0.7998056411743164, + "learning_rate": 7.425918791296798e-06, + "loss": 1.4864, + "mean_token_accuracy": 0.6688689639170965, + "num_tokens": 1957204036.0, + "step": 11673 + }, + { + "entropy": 1.7271687885125477, + "epoch": 1.2824421191398203, + "grad_norm": 0.7436834573745728, + "learning_rate": 7.42444938524937e-06, + "loss": 1.312, + "mean_token_accuracy": 0.6680330435434977, + "num_tokens": 1957337123.0, + "step": 11674 + }, + { + "entropy": 1.715428461631139, + "epoch": 1.2825519760511934, + "grad_norm": 0.7918713688850403, + "learning_rate": 7.422980092366512e-06, + "loss": 1.3576, + "mean_token_accuracy": 0.6627502292394638, + "num_tokens": 1957532881.0, + "step": 11675 + }, + { + "entropy": 1.7627593576908112, + "epoch": 1.2826618329625663, + "grad_norm": 0.6629673838615417, + "learning_rate": 7.421510912694716e-06, + "loss": 1.3912, + "mean_token_accuracy": 0.6503987908363342, + "num_tokens": 1957681518.0, + "step": 11676 + }, + { + "entropy": 1.6355752150217693, + "epoch": 1.2827716898739392, + "grad_norm": 0.6012086868286133, + "learning_rate": 7.420041846280492e-06, + "loss": 1.4797, + "mean_token_accuracy": 0.6419784228006998, + "num_tokens": 1957868543.0, + "step": 11677 + }, + { + "entropy": 1.7351706624031067, + "epoch": 1.282881546785312, + "grad_norm": 0.6617944240570068, + "learning_rate": 7.418572893170328e-06, + "loss": 1.4835, + "mean_token_accuracy": 0.646317924062411, + "num_tokens": 1958073332.0, + "step": 11678 + }, + { + "entropy": 1.7856847544511159, + "epoch": 1.282991403696685, + "grad_norm": 0.7386542558670044, + "learning_rate": 7.417104053410718e-06, + "loss": 1.373, + "mean_token_accuracy": 0.6571315675973892, + "num_tokens": 1958228225.0, + "step": 11679 + }, + { + "entropy": 1.6699702441692352, + "epoch": 1.283101260608058, + "grad_norm": 0.6713958382606506, + "learning_rate": 7.415635327048152e-06, + "loss": 1.3583, + "mean_token_accuracy": 0.6684353550275167, + "num_tokens": 1958397897.0, + "step": 11680 + }, + { + "entropy": 1.7619405488173168, + "epoch": 1.283211117519431, + "grad_norm": 0.9027857780456543, + "learning_rate": 7.414166714129112e-06, + "loss": 1.3348, + "mean_token_accuracy": 0.6649856468041738, + "num_tokens": 1958569608.0, + "step": 11681 + }, + { + "entropy": 1.7285025020440419, + "epoch": 1.2833209744308038, + "grad_norm": 0.6914839744567871, + "learning_rate": 7.4126982147000785e-06, + "loss": 1.4919, + "mean_token_accuracy": 0.6378475278615952, + "num_tokens": 1958774892.0, + "step": 11682 + }, + { + "entropy": 1.7116582890351613, + "epoch": 1.283430831342177, + "grad_norm": 0.7318129539489746, + "learning_rate": 7.411229828807531e-06, + "loss": 1.3924, + "mean_token_accuracy": 0.6698858588933945, + "num_tokens": 1958936632.0, + "step": 11683 + }, + { + "entropy": 1.6879831353823345, + "epoch": 1.2835406882535498, + "grad_norm": 0.6720309257507324, + "learning_rate": 7.409761556497945e-06, + "loss": 1.3349, + "mean_token_accuracy": 0.658750464518865, + "num_tokens": 1959147194.0, + "step": 11684 + }, + { + "entropy": 1.7317336002985637, + "epoch": 1.2836505451649227, + "grad_norm": 0.7208735346794128, + "learning_rate": 7.408293397817783e-06, + "loss": 1.46, + "mean_token_accuracy": 0.6475148300329844, + "num_tokens": 1959348096.0, + "step": 11685 + }, + { + "entropy": 1.6827017863591511, + "epoch": 1.2837604020762956, + "grad_norm": 0.6627811789512634, + "learning_rate": 7.406825352813516e-06, + "loss": 1.3233, + "mean_token_accuracy": 0.6603292127450308, + "num_tokens": 1959490153.0, + "step": 11686 + }, + { + "entropy": 1.7177750865618389, + "epoch": 1.2838702589876685, + "grad_norm": 0.6499682664871216, + "learning_rate": 7.405357421531614e-06, + "loss": 1.3783, + "mean_token_accuracy": 0.6555012961228689, + "num_tokens": 1959626237.0, + "step": 11687 + }, + { + "entropy": 1.7713517745335896, + "epoch": 1.2839801158990416, + "grad_norm": 0.6956122517585754, + "learning_rate": 7.403889604018524e-06, + "loss": 1.496, + "mean_token_accuracy": 0.6454281061887741, + "num_tokens": 1959841349.0, + "step": 11688 + }, + { + "entropy": 1.7518351475397747, + "epoch": 1.2840899728104145, + "grad_norm": 0.6666655540466309, + "learning_rate": 7.402421900320711e-06, + "loss": 1.3928, + "mean_token_accuracy": 0.641091987490654, + "num_tokens": 1960040280.0, + "step": 11689 + }, + { + "entropy": 1.7136612335840862, + "epoch": 1.2841998297217874, + "grad_norm": 0.8001027703285217, + "learning_rate": 7.400954310484623e-06, + "loss": 1.5009, + "mean_token_accuracy": 0.6399757514397303, + "num_tokens": 1960222001.0, + "step": 11690 + }, + { + "entropy": 1.6805146038532257, + "epoch": 1.2843096866331603, + "grad_norm": 0.760985255241394, + "learning_rate": 7.399486834556706e-06, + "loss": 1.2747, + "mean_token_accuracy": 0.6765825847784678, + "num_tokens": 1960361451.0, + "step": 11691 + }, + { + "entropy": 1.6519253353277843, + "epoch": 1.2844195435445331, + "grad_norm": 0.6442874670028687, + "learning_rate": 7.3980194725834105e-06, + "loss": 1.4785, + "mean_token_accuracy": 0.6515339364608129, + "num_tokens": 1960537160.0, + "step": 11692 + }, + { + "entropy": 1.7215826908747356, + "epoch": 1.2845294004559062, + "grad_norm": 0.8578620553016663, + "learning_rate": 7.3965522246111774e-06, + "loss": 1.268, + "mean_token_accuracy": 0.6820251246293386, + "num_tokens": 1960638199.0, + "step": 11693 + }, + { + "entropy": 1.6882357994715373, + "epoch": 1.2846392573672791, + "grad_norm": 0.6779175996780396, + "learning_rate": 7.395085090686443e-06, + "loss": 1.2499, + "mean_token_accuracy": 0.6728865206241608, + "num_tokens": 1960770852.0, + "step": 11694 + }, + { + "entropy": 1.6696954766909282, + "epoch": 1.284749114278652, + "grad_norm": 0.6538259387016296, + "learning_rate": 7.3936180708556375e-06, + "loss": 1.2807, + "mean_token_accuracy": 0.6751231253147125, + "num_tokens": 1960900887.0, + "step": 11695 + }, + { + "entropy": 1.6774700582027435, + "epoch": 1.2848589711900251, + "grad_norm": 0.7781380414962769, + "learning_rate": 7.392151165165198e-06, + "loss": 1.3951, + "mean_token_accuracy": 0.6492411891619364, + "num_tokens": 1961094569.0, + "step": 11696 + }, + { + "entropy": 1.6870755751927693, + "epoch": 1.284968828101398, + "grad_norm": 0.629217803478241, + "learning_rate": 7.390684373661547e-06, + "loss": 1.432, + "mean_token_accuracy": 0.6549626439809799, + "num_tokens": 1961376840.0, + "step": 11697 + }, + { + "entropy": 1.6291709244251251, + "epoch": 1.285078685012771, + "grad_norm": 0.5751771926879883, + "learning_rate": 7.389217696391107e-06, + "loss": 1.3228, + "mean_token_accuracy": 0.6684358169635137, + "num_tokens": 1961548354.0, + "step": 11698 + }, + { + "entropy": 1.727874477704366, + "epoch": 1.2851885419241438, + "grad_norm": 0.7103152871131897, + "learning_rate": 7.387751133400303e-06, + "loss": 1.5097, + "mean_token_accuracy": 0.6475981076558431, + "num_tokens": 1961734956.0, + "step": 11699 + }, + { + "entropy": 1.68792125582695, + "epoch": 1.2852983988355167, + "grad_norm": 0.6444193124771118, + "learning_rate": 7.386284684735547e-06, + "loss": 1.3165, + "mean_token_accuracy": 0.6684761742750803, + "num_tokens": 1961923277.0, + "step": 11700 + }, + { + "entropy": 1.723108321428299, + "epoch": 1.2854082557468898, + "grad_norm": 0.7117464542388916, + "learning_rate": 7.384818350443252e-06, + "loss": 1.3607, + "mean_token_accuracy": 0.6554534633954366, + "num_tokens": 1962108819.0, + "step": 11701 + }, + { + "entropy": 1.6910544236501057, + "epoch": 1.2855181126582627, + "grad_norm": 0.6509753465652466, + "learning_rate": 7.38335213056983e-06, + "loss": 1.2883, + "mean_token_accuracy": 0.6676128606001536, + "num_tokens": 1962275707.0, + "step": 11702 + }, + { + "entropy": 1.6606975098450978, + "epoch": 1.2856279695696355, + "grad_norm": 0.7419883608818054, + "learning_rate": 7.38188602516168e-06, + "loss": 1.3423, + "mean_token_accuracy": 0.6589969595273336, + "num_tokens": 1962462866.0, + "step": 11703 + }, + { + "entropy": 1.7654975454012554, + "epoch": 1.2857378264810084, + "grad_norm": 0.7047287225723267, + "learning_rate": 7.380420034265205e-06, + "loss": 1.4466, + "mean_token_accuracy": 0.6558421750863394, + "num_tokens": 1962610696.0, + "step": 11704 + }, + { + "entropy": 1.6822943886121113, + "epoch": 1.2858476833923813, + "grad_norm": 0.6518201231956482, + "learning_rate": 7.3789541579268095e-06, + "loss": 1.2855, + "mean_token_accuracy": 0.6662348906199137, + "num_tokens": 1962770700.0, + "step": 11705 + }, + { + "entropy": 1.758181909720103, + "epoch": 1.2859575403037544, + "grad_norm": 0.6527379155158997, + "learning_rate": 7.377488396192882e-06, + "loss": 1.4032, + "mean_token_accuracy": 0.6612779349088669, + "num_tokens": 1962946514.0, + "step": 11706 + }, + { + "entropy": 1.764567494392395, + "epoch": 1.2860673972151273, + "grad_norm": 0.7921777367591858, + "learning_rate": 7.376022749109812e-06, + "loss": 1.3384, + "mean_token_accuracy": 0.6526055236657461, + "num_tokens": 1963086122.0, + "step": 11707 + }, + { + "entropy": 1.7310861845811207, + "epoch": 1.2861772541265002, + "grad_norm": 0.737005889415741, + "learning_rate": 7.374557216723994e-06, + "loss": 1.372, + "mean_token_accuracy": 0.6504655679066976, + "num_tokens": 1963209728.0, + "step": 11708 + }, + { + "entropy": 1.6854293247063954, + "epoch": 1.2862871110378733, + "grad_norm": 0.6641037464141846, + "learning_rate": 7.3730917990818015e-06, + "loss": 1.4029, + "mean_token_accuracy": 0.6705863028764725, + "num_tokens": 1963357538.0, + "step": 11709 + }, + { + "entropy": 1.6769119401772816, + "epoch": 1.2863969679492462, + "grad_norm": 0.6438668966293335, + "learning_rate": 7.37162649622962e-06, + "loss": 1.4861, + "mean_token_accuracy": 0.6491279552380244, + "num_tokens": 1963582021.0, + "step": 11710 + }, + { + "entropy": 1.641738514105479, + "epoch": 1.286506824860619, + "grad_norm": 0.6233183145523071, + "learning_rate": 7.3701613082138275e-06, + "loss": 1.3751, + "mean_token_accuracy": 0.6620054890712103, + "num_tokens": 1963776433.0, + "step": 11711 + }, + { + "entropy": 1.6426913837591808, + "epoch": 1.286616681771992, + "grad_norm": 0.5799766182899475, + "learning_rate": 7.368696235080792e-06, + "loss": 1.372, + "mean_token_accuracy": 0.6623861541350683, + "num_tokens": 1963956213.0, + "step": 11712 + }, + { + "entropy": 1.7183026572068532, + "epoch": 1.2867265386833648, + "grad_norm": 0.681439995765686, + "learning_rate": 7.367231276876885e-06, + "loss": 1.455, + "mean_token_accuracy": 0.655020589629809, + "num_tokens": 1964135822.0, + "step": 11713 + }, + { + "entropy": 1.7211709121863048, + "epoch": 1.286836395594738, + "grad_norm": 0.6723782420158386, + "learning_rate": 7.365766433648471e-06, + "loss": 1.4048, + "mean_token_accuracy": 0.6689492960770925, + "num_tokens": 1964301098.0, + "step": 11714 + }, + { + "entropy": 1.7698278029759724, + "epoch": 1.2869462525061108, + "grad_norm": 0.77666175365448, + "learning_rate": 7.3643017054419146e-06, + "loss": 1.5987, + "mean_token_accuracy": 0.6401151369015375, + "num_tokens": 1964462937.0, + "step": 11715 + }, + { + "entropy": 1.6744122505187988, + "epoch": 1.2870561094174837, + "grad_norm": 0.8135389685630798, + "learning_rate": 7.362837092303565e-06, + "loss": 1.205, + "mean_token_accuracy": 0.6760056912899017, + "num_tokens": 1964577164.0, + "step": 11716 + }, + { + "entropy": 1.7008947432041168, + "epoch": 1.2871659663288566, + "grad_norm": 0.6217523813247681, + "learning_rate": 7.361372594279785e-06, + "loss": 1.3901, + "mean_token_accuracy": 0.6549960921208063, + "num_tokens": 1964734519.0, + "step": 11717 + }, + { + "entropy": 1.7229733566443126, + "epoch": 1.2872758232402295, + "grad_norm": 0.6909745335578918, + "learning_rate": 7.359908211416924e-06, + "loss": 1.3528, + "mean_token_accuracy": 0.6632678508758545, + "num_tokens": 1964878772.0, + "step": 11718 + }, + { + "entropy": 1.7443588475386302, + "epoch": 1.2873856801516026, + "grad_norm": 0.6048434376716614, + "learning_rate": 7.358443943761326e-06, + "loss": 1.461, + "mean_token_accuracy": 0.6371092249949774, + "num_tokens": 1965083666.0, + "step": 11719 + }, + { + "entropy": 1.7583917180697124, + "epoch": 1.2874955370629755, + "grad_norm": 0.6534097790718079, + "learning_rate": 7.35697979135934e-06, + "loss": 1.3643, + "mean_token_accuracy": 0.6648856898148855, + "num_tokens": 1965213053.0, + "step": 11720 + }, + { + "entropy": 1.7194798986117046, + "epoch": 1.2876053939743484, + "grad_norm": 0.646305501461029, + "learning_rate": 7.3555157542572984e-06, + "loss": 1.3896, + "mean_token_accuracy": 0.6472862859567007, + "num_tokens": 1965354020.0, + "step": 11721 + }, + { + "entropy": 1.675745298465093, + "epoch": 1.2877152508857215, + "grad_norm": 0.5743793845176697, + "learning_rate": 7.354051832501541e-06, + "loss": 1.3386, + "mean_token_accuracy": 0.6610815872748693, + "num_tokens": 1965529564.0, + "step": 11722 + }, + { + "entropy": 1.6679284969965618, + "epoch": 1.2878251077970944, + "grad_norm": 0.553485631942749, + "learning_rate": 7.352588026138401e-06, + "loss": 1.464, + "mean_token_accuracy": 0.6533455202976862, + "num_tokens": 1965718786.0, + "step": 11723 + }, + { + "entropy": 1.6976435681184132, + "epoch": 1.2879349647084672, + "grad_norm": 0.6102703809738159, + "learning_rate": 7.351124335214206e-06, + "loss": 1.3461, + "mean_token_accuracy": 0.6602204740047455, + "num_tokens": 1965910642.0, + "step": 11724 + }, + { + "entropy": 1.7120192646980286, + "epoch": 1.2880448216198401, + "grad_norm": 0.7902237772941589, + "learning_rate": 7.349660759775283e-06, + "loss": 1.1653, + "mean_token_accuracy": 0.6933720608552297, + "num_tokens": 1966041926.0, + "step": 11725 + }, + { + "entropy": 1.6760241091251373, + "epoch": 1.288154678531213, + "grad_norm": 0.6319912672042847, + "learning_rate": 7.348197299867952e-06, + "loss": 1.3921, + "mean_token_accuracy": 0.678732305765152, + "num_tokens": 1966200003.0, + "step": 11726 + }, + { + "entropy": 1.7074303428332012, + "epoch": 1.2882645354425861, + "grad_norm": 0.7581055164337158, + "learning_rate": 7.34673395553853e-06, + "loss": 1.3912, + "mean_token_accuracy": 0.6616330395142237, + "num_tokens": 1966353610.0, + "step": 11727 + }, + { + "entropy": 1.6789619823296864, + "epoch": 1.288374392353959, + "grad_norm": 0.774255096912384, + "learning_rate": 7.345270726833331e-06, + "loss": 1.3375, + "mean_token_accuracy": 0.6750166416168213, + "num_tokens": 1966490169.0, + "step": 11728 + }, + { + "entropy": 1.7094947596391041, + "epoch": 1.288484249265332, + "grad_norm": 0.6712803840637207, + "learning_rate": 7.343807613798668e-06, + "loss": 1.2826, + "mean_token_accuracy": 0.6736528823773066, + "num_tokens": 1966605134.0, + "step": 11729 + }, + { + "entropy": 1.6739847759405773, + "epoch": 1.2885941061767048, + "grad_norm": 0.5984385013580322, + "learning_rate": 7.342344616480848e-06, + "loss": 1.3459, + "mean_token_accuracy": 0.6620316952466965, + "num_tokens": 1966787815.0, + "step": 11730 + }, + { + "entropy": 1.6550920108954112, + "epoch": 1.2887039630880777, + "grad_norm": 0.6178155541419983, + "learning_rate": 7.340881734926171e-06, + "loss": 1.3412, + "mean_token_accuracy": 0.6700419485569, + "num_tokens": 1966997480.0, + "step": 11731 + }, + { + "entropy": 1.7782117525736492, + "epoch": 1.2888138199994508, + "grad_norm": 0.6943917274475098, + "learning_rate": 7.339418969180938e-06, + "loss": 1.5025, + "mean_token_accuracy": 0.6368412226438522, + "num_tokens": 1967178649.0, + "step": 11732 + }, + { + "entropy": 1.7510855495929718, + "epoch": 1.2889236769108237, + "grad_norm": 0.6902898550033569, + "learning_rate": 7.337956319291446e-06, + "loss": 1.2981, + "mean_token_accuracy": 0.6604495048522949, + "num_tokens": 1967292655.0, + "step": 11733 + }, + { + "entropy": 1.7116970022519429, + "epoch": 1.2890335338221965, + "grad_norm": 0.7756577134132385, + "learning_rate": 7.336493785303986e-06, + "loss": 1.3434, + "mean_token_accuracy": 0.6637662698825201, + "num_tokens": 1967438644.0, + "step": 11734 + }, + { + "entropy": 1.7591506739457448, + "epoch": 1.2891433907335696, + "grad_norm": 0.5982057452201843, + "learning_rate": 7.335031367264844e-06, + "loss": 1.379, + "mean_token_accuracy": 0.6485430747270584, + "num_tokens": 1967622935.0, + "step": 11735 + }, + { + "entropy": 1.6700083116690319, + "epoch": 1.2892532476449425, + "grad_norm": 0.6805751919746399, + "learning_rate": 7.333569065220309e-06, + "loss": 1.318, + "mean_token_accuracy": 0.6725479116042455, + "num_tokens": 1967792881.0, + "step": 11736 + }, + { + "entropy": 1.7045224507649739, + "epoch": 1.2893631045563154, + "grad_norm": 0.7660247087478638, + "learning_rate": 7.332106879216667e-06, + "loss": 1.4916, + "mean_token_accuracy": 0.652613898118337, + "num_tokens": 1967956377.0, + "step": 11737 + }, + { + "entropy": 1.6230275332927704, + "epoch": 1.2894729614676883, + "grad_norm": 0.6637044548988342, + "learning_rate": 7.3306448093001825e-06, + "loss": 1.3228, + "mean_token_accuracy": 0.6681044300397238, + "num_tokens": 1968087519.0, + "step": 11738 + }, + { + "entropy": 1.6863668859004974, + "epoch": 1.2895828183790612, + "grad_norm": 0.594308614730835, + "learning_rate": 7.329182855517141e-06, + "loss": 1.4027, + "mean_token_accuracy": 0.6514003972212473, + "num_tokens": 1968251423.0, + "step": 11739 + }, + { + "entropy": 1.6702902913093567, + "epoch": 1.2896926752904343, + "grad_norm": 0.693254828453064, + "learning_rate": 7.327721017913805e-06, + "loss": 1.2496, + "mean_token_accuracy": 0.6850862701733907, + "num_tokens": 1968412040.0, + "step": 11740 + }, + { + "entropy": 1.662459562222163, + "epoch": 1.2898025322018072, + "grad_norm": 0.7011620402336121, + "learning_rate": 7.326259296536442e-06, + "loss": 1.5088, + "mean_token_accuracy": 0.6396182477474213, + "num_tokens": 1968616860.0, + "step": 11741 + }, + { + "entropy": 1.737975647052129, + "epoch": 1.28991238911318, + "grad_norm": 0.8035622239112854, + "learning_rate": 7.32479769143132e-06, + "loss": 1.2819, + "mean_token_accuracy": 0.6790835956732432, + "num_tokens": 1968740375.0, + "step": 11742 + }, + { + "entropy": 1.730758676926295, + "epoch": 1.290022246024553, + "grad_norm": 0.6591079831123352, + "learning_rate": 7.323336202644698e-06, + "loss": 1.3442, + "mean_token_accuracy": 0.6591987013816833, + "num_tokens": 1968923933.0, + "step": 11743 + }, + { + "entropy": 1.756201942761739, + "epoch": 1.2901321029359258, + "grad_norm": 0.6940193176269531, + "learning_rate": 7.3218748302228236e-06, + "loss": 1.5993, + "mean_token_accuracy": 0.6265199581782023, + "num_tokens": 1969109185.0, + "step": 11744 + }, + { + "entropy": 1.6942794720331829, + "epoch": 1.290241959847299, + "grad_norm": 0.7570939660072327, + "learning_rate": 7.320413574211955e-06, + "loss": 1.3312, + "mean_token_accuracy": 0.6817633907000223, + "num_tokens": 1969221453.0, + "step": 11745 + }, + { + "entropy": 1.6586816012859344, + "epoch": 1.2903518167586718, + "grad_norm": 0.6691310405731201, + "learning_rate": 7.31895243465834e-06, + "loss": 1.3373, + "mean_token_accuracy": 0.6555156062046686, + "num_tokens": 1969369427.0, + "step": 11746 + }, + { + "entropy": 1.7016185621420543, + "epoch": 1.2904616736700447, + "grad_norm": 0.6833151578903198, + "learning_rate": 7.317491411608217e-06, + "loss": 1.4382, + "mean_token_accuracy": 0.6552936285734177, + "num_tokens": 1969538724.0, + "step": 11747 + }, + { + "entropy": 1.6746133367220561, + "epoch": 1.2905715305814178, + "grad_norm": 0.5930050611495972, + "learning_rate": 7.316030505107834e-06, + "loss": 1.4735, + "mean_token_accuracy": 0.6479932516813278, + "num_tokens": 1969761041.0, + "step": 11748 + }, + { + "entropy": 1.6806008915106456, + "epoch": 1.2906813874927907, + "grad_norm": 0.7082892060279846, + "learning_rate": 7.314569715203428e-06, + "loss": 1.3426, + "mean_token_accuracy": 0.6537379374106725, + "num_tokens": 1969892525.0, + "step": 11749 + }, + { + "entropy": 1.784896006186803, + "epoch": 1.2907912444041636, + "grad_norm": 0.7892354726791382, + "learning_rate": 7.3131090419412285e-06, + "loss": 1.4915, + "mean_token_accuracy": 0.6374368518590927, + "num_tokens": 1970080563.0, + "step": 11750 + }, + { + "entropy": 1.6796510914961498, + "epoch": 1.2909011013155365, + "grad_norm": 0.6004863381385803, + "learning_rate": 7.311648485367464e-06, + "loss": 1.412, + "mean_token_accuracy": 0.6581088254849116, + "num_tokens": 1970248705.0, + "step": 11751 + }, + { + "entropy": 1.7254607180754344, + "epoch": 1.2910109582269094, + "grad_norm": 0.6974371075630188, + "learning_rate": 7.310188045528368e-06, + "loss": 1.4514, + "mean_token_accuracy": 0.6568728238344193, + "num_tokens": 1970419431.0, + "step": 11752 + }, + { + "entropy": 1.7045761744181316, + "epoch": 1.2911208151382825, + "grad_norm": 2.843557119369507, + "learning_rate": 7.308727722470153e-06, + "loss": 1.4688, + "mean_token_accuracy": 0.6505365371704102, + "num_tokens": 1970581754.0, + "step": 11753 + }, + { + "entropy": 1.7577880720297496, + "epoch": 1.2912306720496554, + "grad_norm": 0.9117422699928284, + "learning_rate": 7.307267516239043e-06, + "loss": 1.4026, + "mean_token_accuracy": 0.6464939614137014, + "num_tokens": 1970743224.0, + "step": 11754 + }, + { + "entropy": 1.7608660360177357, + "epoch": 1.2913405289610282, + "grad_norm": 0.7385045289993286, + "learning_rate": 7.305807426881255e-06, + "loss": 1.324, + "mean_token_accuracy": 0.6659322182337443, + "num_tokens": 1970867328.0, + "step": 11755 + }, + { + "entropy": 1.7018007536729176, + "epoch": 1.2914503858724011, + "grad_norm": 0.5788907408714294, + "learning_rate": 7.304347454442992e-06, + "loss": 1.2803, + "mean_token_accuracy": 0.6538551598787308, + "num_tokens": 1971134236.0, + "step": 11756 + }, + { + "entropy": 1.756419579188029, + "epoch": 1.291560242783774, + "grad_norm": 0.619351327419281, + "learning_rate": 7.302887598970472e-06, + "loss": 1.3685, + "mean_token_accuracy": 0.6473907629648844, + "num_tokens": 1971335443.0, + "step": 11757 + }, + { + "entropy": 1.7158561150232952, + "epoch": 1.2916700996951471, + "grad_norm": 1.6634712219238281, + "learning_rate": 7.3014278605098934e-06, + "loss": 1.2694, + "mean_token_accuracy": 0.6615792512893677, + "num_tokens": 1971562350.0, + "step": 11758 + }, + { + "entropy": 1.6589511632919312, + "epoch": 1.29177995660652, + "grad_norm": 0.6881945133209229, + "learning_rate": 7.299968239107451e-06, + "loss": 1.3316, + "mean_token_accuracy": 0.6625782549381256, + "num_tokens": 1971746704.0, + "step": 11759 + }, + { + "entropy": 1.7037721276283264, + "epoch": 1.291889813517893, + "grad_norm": 0.6696583032608032, + "learning_rate": 7.298508734809351e-06, + "loss": 1.4161, + "mean_token_accuracy": 0.6640026867389679, + "num_tokens": 1971956211.0, + "step": 11760 + }, + { + "entropy": 1.7227367758750916, + "epoch": 1.291999670429266, + "grad_norm": 0.6352359652519226, + "learning_rate": 7.297049347661782e-06, + "loss": 1.5067, + "mean_token_accuracy": 0.6483261436223984, + "num_tokens": 1972162108.0, + "step": 11761 + }, + { + "entropy": 1.7867354949315388, + "epoch": 1.2921095273406389, + "grad_norm": 0.6077547073364258, + "learning_rate": 7.29559007771093e-06, + "loss": 1.4752, + "mean_token_accuracy": 0.6493107676506042, + "num_tokens": 1972309946.0, + "step": 11762 + }, + { + "entropy": 1.7437163889408112, + "epoch": 1.2922193842520118, + "grad_norm": 0.7125455141067505, + "learning_rate": 7.2941309250029845e-06, + "loss": 1.4619, + "mean_token_accuracy": 0.6622959723075231, + "num_tokens": 1972500940.0, + "step": 11763 + }, + { + "entropy": 1.7352955440680187, + "epoch": 1.2923292411633847, + "grad_norm": 0.5929360389709473, + "learning_rate": 7.2926718895841246e-06, + "loss": 1.4278, + "mean_token_accuracy": 0.646904394030571, + "num_tokens": 1972691170.0, + "step": 11764 + }, + { + "entropy": 1.7026380797227223, + "epoch": 1.2924390980747575, + "grad_norm": 0.6624311804771423, + "learning_rate": 7.291212971500527e-06, + "loss": 1.3807, + "mean_token_accuracy": 0.6627266258001328, + "num_tokens": 1972854592.0, + "step": 11765 + }, + { + "entropy": 1.6698378721872966, + "epoch": 1.2925489549861306, + "grad_norm": 0.7162081599235535, + "learning_rate": 7.289754170798369e-06, + "loss": 1.3603, + "mean_token_accuracy": 0.6661340196927389, + "num_tokens": 1973042979.0, + "step": 11766 + }, + { + "entropy": 1.6535949905713399, + "epoch": 1.2926588118975035, + "grad_norm": 0.6275128722190857, + "learning_rate": 7.288295487523822e-06, + "loss": 1.2839, + "mean_token_accuracy": 0.6786264330148697, + "num_tokens": 1973170895.0, + "step": 11767 + }, + { + "entropy": 1.732323278983434, + "epoch": 1.2927686688088764, + "grad_norm": 0.8781585097312927, + "learning_rate": 7.286836921723048e-06, + "loss": 1.36, + "mean_token_accuracy": 0.6611862430969874, + "num_tokens": 1973331867.0, + "step": 11768 + }, + { + "entropy": 1.7570100327332814, + "epoch": 1.2928785257202493, + "grad_norm": 0.7007432579994202, + "learning_rate": 7.2853784734422155e-06, + "loss": 1.4099, + "mean_token_accuracy": 0.6472747921943665, + "num_tokens": 1973490229.0, + "step": 11769 + }, + { + "entropy": 1.711938053369522, + "epoch": 1.2929883826316222, + "grad_norm": 0.6697954535484314, + "learning_rate": 7.283920142727479e-06, + "loss": 1.5415, + "mean_token_accuracy": 0.634870320558548, + "num_tokens": 1973666134.0, + "step": 11770 + }, + { + "entropy": 1.6825863222281139, + "epoch": 1.2930982395429953, + "grad_norm": 0.7022350430488586, + "learning_rate": 7.282461929624991e-06, + "loss": 1.2739, + "mean_token_accuracy": 0.664794052640597, + "num_tokens": 1973774886.0, + "step": 11771 + }, + { + "entropy": 1.7592849830786388, + "epoch": 1.2932080964543682, + "grad_norm": 0.7243680953979492, + "learning_rate": 7.2810038341809105e-06, + "loss": 1.4997, + "mean_token_accuracy": 0.6491112063328425, + "num_tokens": 1973920195.0, + "step": 11772 + }, + { + "entropy": 1.6567996442317963, + "epoch": 1.293317953365741, + "grad_norm": 0.7638998031616211, + "learning_rate": 7.279545856441385e-06, + "loss": 1.2132, + "mean_token_accuracy": 0.6840778191884359, + "num_tokens": 1974044070.0, + "step": 11773 + }, + { + "entropy": 1.7306942145029705, + "epoch": 1.2934278102771142, + "grad_norm": 0.7598798274993896, + "learning_rate": 7.278087996452554e-06, + "loss": 1.4113, + "mean_token_accuracy": 0.6556040098269781, + "num_tokens": 1974195242.0, + "step": 11774 + }, + { + "entropy": 1.7199226518472035, + "epoch": 1.293537667188487, + "grad_norm": 0.7280505299568176, + "learning_rate": 7.2766302542605615e-06, + "loss": 1.4192, + "mean_token_accuracy": 0.6484815229972204, + "num_tokens": 1974355884.0, + "step": 11775 + }, + { + "entropy": 1.7219727238019307, + "epoch": 1.29364752409986, + "grad_norm": 0.8648471236228943, + "learning_rate": 7.275172629911546e-06, + "loss": 1.4423, + "mean_token_accuracy": 0.6569175471862158, + "num_tokens": 1974466512.0, + "step": 11776 + }, + { + "entropy": 1.757084995508194, + "epoch": 1.2937573810112328, + "grad_norm": 0.7437247037887573, + "learning_rate": 7.2737151234516365e-06, + "loss": 1.3887, + "mean_token_accuracy": 0.6547428021828333, + "num_tokens": 1974621651.0, + "step": 11777 + }, + { + "entropy": 1.7222739160060883, + "epoch": 1.2938672379226057, + "grad_norm": 0.682804524898529, + "learning_rate": 7.2722577349269615e-06, + "loss": 1.4023, + "mean_token_accuracy": 0.6509612699349722, + "num_tokens": 1974790569.0, + "step": 11778 + }, + { + "entropy": 1.6923163831233978, + "epoch": 1.2939770948339788, + "grad_norm": 0.7132525444030762, + "learning_rate": 7.270800464383654e-06, + "loss": 1.3487, + "mean_token_accuracy": 0.6630134681860606, + "num_tokens": 1974908362.0, + "step": 11779 + }, + { + "entropy": 1.7271219789981842, + "epoch": 1.2940869517453517, + "grad_norm": 0.6374887824058533, + "learning_rate": 7.269343311867829e-06, + "loss": 1.4675, + "mean_token_accuracy": 0.6458124866088232, + "num_tokens": 1975049557.0, + "step": 11780 + }, + { + "entropy": 1.7077939212322235, + "epoch": 1.2941968086567246, + "grad_norm": 0.6350587010383606, + "learning_rate": 7.2678862774256065e-06, + "loss": 1.337, + "mean_token_accuracy": 0.6723198741674423, + "num_tokens": 1975191967.0, + "step": 11781 + }, + { + "entropy": 1.7040321032206218, + "epoch": 1.2943066655680975, + "grad_norm": 0.6252678036689758, + "learning_rate": 7.266429361103105e-06, + "loss": 1.3758, + "mean_token_accuracy": 0.6479005714257559, + "num_tokens": 1975354337.0, + "step": 11782 + }, + { + "entropy": 1.7597824732462566, + "epoch": 1.2944165224794704, + "grad_norm": 0.8166074752807617, + "learning_rate": 7.264972562946428e-06, + "loss": 1.481, + "mean_token_accuracy": 0.6362377305825552, + "num_tokens": 1975526565.0, + "step": 11783 + }, + { + "entropy": 1.6754455467065175, + "epoch": 1.2945263793908435, + "grad_norm": 0.736605703830719, + "learning_rate": 7.263515883001686e-06, + "loss": 1.2794, + "mean_token_accuracy": 0.671693374713262, + "num_tokens": 1975678286.0, + "step": 11784 + }, + { + "entropy": 1.6512251496315002, + "epoch": 1.2946362363022164, + "grad_norm": 9.648917198181152, + "learning_rate": 7.2620593213149874e-06, + "loss": 1.3817, + "mean_token_accuracy": 0.6627425750096639, + "num_tokens": 1975887204.0, + "step": 11785 + }, + { + "entropy": 1.7003162701924641, + "epoch": 1.2947460932135892, + "grad_norm": 0.7399555444717407, + "learning_rate": 7.260602877932421e-06, + "loss": 1.3761, + "mean_token_accuracy": 0.6608262062072754, + "num_tokens": 1976048216.0, + "step": 11786 + }, + { + "entropy": 1.7178180714448292, + "epoch": 1.2948559501249624, + "grad_norm": 0.7446051239967346, + "learning_rate": 7.259146552900094e-06, + "loss": 1.4646, + "mean_token_accuracy": 0.6506668627262115, + "num_tokens": 1976241273.0, + "step": 11787 + }, + { + "entropy": 1.7054597040017445, + "epoch": 1.2949658070363352, + "grad_norm": 0.6272408366203308, + "learning_rate": 7.25769034626409e-06, + "loss": 1.2619, + "mean_token_accuracy": 0.6702099094788233, + "num_tokens": 1976386664.0, + "step": 11788 + }, + { + "entropy": 1.7981916566689808, + "epoch": 1.2950756639477081, + "grad_norm": 0.5923606157302856, + "learning_rate": 7.256234258070501e-06, + "loss": 1.5048, + "mean_token_accuracy": 0.6292905509471893, + "num_tokens": 1976617356.0, + "step": 11789 + }, + { + "entropy": 1.7312207321325939, + "epoch": 1.295185520859081, + "grad_norm": 0.6586351990699768, + "learning_rate": 7.254778288365411e-06, + "loss": 1.4734, + "mean_token_accuracy": 0.6350182294845581, + "num_tokens": 1976838422.0, + "step": 11790 + }, + { + "entropy": 1.760515828927358, + "epoch": 1.295295377770454, + "grad_norm": 0.7017537951469421, + "learning_rate": 7.253322437194901e-06, + "loss": 1.5208, + "mean_token_accuracy": 0.6316369622945786, + "num_tokens": 1977010019.0, + "step": 11791 + }, + { + "entropy": 1.666582852602005, + "epoch": 1.295405234681827, + "grad_norm": 0.5920802354812622, + "learning_rate": 7.251866704605042e-06, + "loss": 1.2817, + "mean_token_accuracy": 0.6713870366414388, + "num_tokens": 1977141644.0, + "step": 11792 + }, + { + "entropy": 1.7240260044733684, + "epoch": 1.2955150915931999, + "grad_norm": 0.6175614595413208, + "learning_rate": 7.25041109064192e-06, + "loss": 1.3674, + "mean_token_accuracy": 0.6563980529705683, + "num_tokens": 1977305533.0, + "step": 11793 + }, + { + "entropy": 1.7184670567512512, + "epoch": 1.2956249485045728, + "grad_norm": 0.8212663531303406, + "learning_rate": 7.248955595351592e-06, + "loss": 1.3066, + "mean_token_accuracy": 0.6645220468441645, + "num_tokens": 1977456661.0, + "step": 11794 + }, + { + "entropy": 1.6388721764087677, + "epoch": 1.2957348054159457, + "grad_norm": 0.743337869644165, + "learning_rate": 7.2475002187801345e-06, + "loss": 1.3813, + "mean_token_accuracy": 0.6679724355538686, + "num_tokens": 1977638102.0, + "step": 11795 + }, + { + "entropy": 1.7348832388718922, + "epoch": 1.2958446623273185, + "grad_norm": 0.7091452479362488, + "learning_rate": 7.246044960973602e-06, + "loss": 1.3269, + "mean_token_accuracy": 0.6580028831958771, + "num_tokens": 1977794927.0, + "step": 11796 + }, + { + "entropy": 1.7408220171928406, + "epoch": 1.2959545192386916, + "grad_norm": 0.5987675189971924, + "learning_rate": 7.244589821978052e-06, + "loss": 1.1792, + "mean_token_accuracy": 0.6719856162865957, + "num_tokens": 1977968569.0, + "step": 11797 + }, + { + "entropy": 1.7049931287765503, + "epoch": 1.2960643761500645, + "grad_norm": 0.8301398158073425, + "learning_rate": 7.243134801839544e-06, + "loss": 1.3704, + "mean_token_accuracy": 0.6715128173430761, + "num_tokens": 1978133116.0, + "step": 11798 + }, + { + "entropy": 1.7279678384462993, + "epoch": 1.2961742330614374, + "grad_norm": 0.7465829849243164, + "learning_rate": 7.24167990060413e-06, + "loss": 1.3674, + "mean_token_accuracy": 0.6562019089857737, + "num_tokens": 1978285874.0, + "step": 11799 + }, + { + "entropy": 1.662907858689626, + "epoch": 1.2962840899728105, + "grad_norm": 0.744123101234436, + "learning_rate": 7.240225118317847e-06, + "loss": 1.5223, + "mean_token_accuracy": 0.6506157964468002, + "num_tokens": 1978461876.0, + "step": 11800 + }, + { + "entropy": 1.7489655017852783, + "epoch": 1.2963939468841834, + "grad_norm": 0.7728520631790161, + "learning_rate": 7.238770455026747e-06, + "loss": 1.3505, + "mean_token_accuracy": 0.6687692006429037, + "num_tokens": 1978591043.0, + "step": 11801 + }, + { + "entropy": 1.7328561941782634, + "epoch": 1.2965038037955563, + "grad_norm": 0.707901120185852, + "learning_rate": 7.237315910776872e-06, + "loss": 1.4924, + "mean_token_accuracy": 0.6454095045725504, + "num_tokens": 1978750688.0, + "step": 11802 + }, + { + "entropy": 1.6640961865584056, + "epoch": 1.2966136607069292, + "grad_norm": 0.9605063796043396, + "learning_rate": 7.235861485614248e-06, + "loss": 1.2226, + "mean_token_accuracy": 0.6842072506745657, + "num_tokens": 1978910145.0, + "step": 11803 + }, + { + "entropy": 1.703210969765981, + "epoch": 1.296723517618302, + "grad_norm": 0.6342226266860962, + "learning_rate": 7.234407179584912e-06, + "loss": 1.3948, + "mean_token_accuracy": 0.6613224347432455, + "num_tokens": 1979095563.0, + "step": 11804 + }, + { + "entropy": 1.6607285638650258, + "epoch": 1.2968333745296752, + "grad_norm": 0.8452777862548828, + "learning_rate": 7.2329529927348966e-06, + "loss": 1.211, + "mean_token_accuracy": 0.6795276602109274, + "num_tokens": 1979225266.0, + "step": 11805 + }, + { + "entropy": 1.7239519755045574, + "epoch": 1.296943231441048, + "grad_norm": 0.6554011106491089, + "learning_rate": 7.231498925110214e-06, + "loss": 1.3701, + "mean_token_accuracy": 0.6555942744016647, + "num_tokens": 1979419579.0, + "step": 11806 + }, + { + "entropy": 1.7264830768108368, + "epoch": 1.297053088352421, + "grad_norm": 0.7340265512466431, + "learning_rate": 7.230044976756898e-06, + "loss": 1.6073, + "mean_token_accuracy": 0.6350849618514379, + "num_tokens": 1979619520.0, + "step": 11807 + }, + { + "entropy": 1.6583397487799327, + "epoch": 1.2971629452637938, + "grad_norm": 0.6471593379974365, + "learning_rate": 7.2285911477209604e-06, + "loss": 1.3271, + "mean_token_accuracy": 0.6611123780409495, + "num_tokens": 1979757550.0, + "step": 11808 + }, + { + "entropy": 1.7062188585599263, + "epoch": 1.2972728021751667, + "grad_norm": 0.7513505220413208, + "learning_rate": 7.227137438048411e-06, + "loss": 1.4257, + "mean_token_accuracy": 0.6616204331318537, + "num_tokens": 1979909535.0, + "step": 11809 + }, + { + "entropy": 1.7708937724431355, + "epoch": 1.2973826590865398, + "grad_norm": 1.0161291360855103, + "learning_rate": 7.225683847785261e-06, + "loss": 1.4273, + "mean_token_accuracy": 0.6494510521491369, + "num_tokens": 1980093310.0, + "step": 11810 + }, + { + "entropy": 1.7270852228005726, + "epoch": 1.2974925159979127, + "grad_norm": 1.1134231090545654, + "learning_rate": 7.224230376977519e-06, + "loss": 1.5241, + "mean_token_accuracy": 0.655661274989446, + "num_tokens": 1980280004.0, + "step": 11811 + }, + { + "entropy": 1.6724676191806793, + "epoch": 1.2976023729092856, + "grad_norm": 0.6491298079490662, + "learning_rate": 7.222777025671182e-06, + "loss": 1.3062, + "mean_token_accuracy": 0.6757878363132477, + "num_tokens": 1980485888.0, + "step": 11812 + }, + { + "entropy": 1.7291560967763264, + "epoch": 1.2977122298206587, + "grad_norm": 0.6538869738578796, + "learning_rate": 7.221323793912247e-06, + "loss": 1.4369, + "mean_token_accuracy": 0.6460357258717219, + "num_tokens": 1980665077.0, + "step": 11813 + }, + { + "entropy": 1.6848385234673817, + "epoch": 1.2978220867320316, + "grad_norm": 0.704898476600647, + "learning_rate": 7.219870681746717e-06, + "loss": 1.4264, + "mean_token_accuracy": 0.6565316567818323, + "num_tokens": 1980810801.0, + "step": 11814 + }, + { + "entropy": 1.6930330594380696, + "epoch": 1.2979319436434045, + "grad_norm": 0.7854775786399841, + "learning_rate": 7.218417689220576e-06, + "loss": 1.4475, + "mean_token_accuracy": 0.6585629632075628, + "num_tokens": 1980944906.0, + "step": 11815 + }, + { + "entropy": 1.6579373677571614, + "epoch": 1.2980418005547774, + "grad_norm": 0.697228729724884, + "learning_rate": 7.216964816379805e-06, + "loss": 1.3587, + "mean_token_accuracy": 0.6618794798851013, + "num_tokens": 1981144140.0, + "step": 11816 + }, + { + "entropy": 1.634168028831482, + "epoch": 1.2981516574661502, + "grad_norm": 0.6177424788475037, + "learning_rate": 7.2155120632704e-06, + "loss": 1.5157, + "mean_token_accuracy": 0.6338949004809061, + "num_tokens": 1981412853.0, + "step": 11817 + }, + { + "entropy": 1.7243984242280324, + "epoch": 1.2982615143775234, + "grad_norm": 0.6800485849380493, + "learning_rate": 7.214059429938329e-06, + "loss": 1.5578, + "mean_token_accuracy": 0.64493028819561, + "num_tokens": 1981563675.0, + "step": 11818 + }, + { + "entropy": 1.728604664405187, + "epoch": 1.2983713712888962, + "grad_norm": 0.7974774837493896, + "learning_rate": 7.212606916429572e-06, + "loss": 1.2656, + "mean_token_accuracy": 0.6728243281443914, + "num_tokens": 1981676374.0, + "step": 11819 + }, + { + "entropy": 1.7148986756801605, + "epoch": 1.2984812282002691, + "grad_norm": 0.5727463364601135, + "learning_rate": 7.211154522790103e-06, + "loss": 1.4279, + "mean_token_accuracy": 0.638765682776769, + "num_tokens": 1981861170.0, + "step": 11820 + }, + { + "entropy": 1.6881239612897236, + "epoch": 1.298591085111642, + "grad_norm": 0.7453581094741821, + "learning_rate": 7.2097022490658795e-06, + "loss": 1.3699, + "mean_token_accuracy": 0.6761754850546519, + "num_tokens": 1982007898.0, + "step": 11821 + }, + { + "entropy": 1.6406256258487701, + "epoch": 1.298700942023015, + "grad_norm": 0.5940481424331665, + "learning_rate": 7.208250095302878e-06, + "loss": 1.3569, + "mean_token_accuracy": 0.6683052430550257, + "num_tokens": 1982173648.0, + "step": 11822 + }, + { + "entropy": 1.6838708420594533, + "epoch": 1.298810798934388, + "grad_norm": 0.651563286781311, + "learning_rate": 7.206798061547049e-06, + "loss": 1.5306, + "mean_token_accuracy": 0.6416983604431152, + "num_tokens": 1982331870.0, + "step": 11823 + }, + { + "entropy": 1.7058892448743184, + "epoch": 1.2989206558457609, + "grad_norm": 0.6198409795761108, + "learning_rate": 7.205346147844352e-06, + "loss": 1.3637, + "mean_token_accuracy": 0.6558132419983546, + "num_tokens": 1982532371.0, + "step": 11824 + }, + { + "entropy": 1.7239612738291423, + "epoch": 1.2990305127571338, + "grad_norm": 0.7450637221336365, + "learning_rate": 7.203894354240737e-06, + "loss": 1.43, + "mean_token_accuracy": 0.6622414539257685, + "num_tokens": 1982696214.0, + "step": 11825 + }, + { + "entropy": 1.7213209768136342, + "epoch": 1.2991403696685069, + "grad_norm": 0.6577510237693787, + "learning_rate": 7.20244268078216e-06, + "loss": 1.3634, + "mean_token_accuracy": 0.6626957158247629, + "num_tokens": 1982828315.0, + "step": 11826 + }, + { + "entropy": 1.7126743793487549, + "epoch": 1.2992502265798798, + "grad_norm": 0.6911088228225708, + "learning_rate": 7.2009911275145605e-06, + "loss": 1.3489, + "mean_token_accuracy": 0.6616120487451553, + "num_tokens": 1982974989.0, + "step": 11827 + }, + { + "entropy": 1.742800772190094, + "epoch": 1.2993600834912526, + "grad_norm": 0.7396600246429443, + "learning_rate": 7.1995396944838765e-06, + "loss": 1.4829, + "mean_token_accuracy": 0.638206327954928, + "num_tokens": 1983151283.0, + "step": 11828 + }, + { + "entropy": 1.6596211989720662, + "epoch": 1.2994699404026255, + "grad_norm": 2.785015821456909, + "learning_rate": 7.198088381736053e-06, + "loss": 1.3438, + "mean_token_accuracy": 0.6643916070461273, + "num_tokens": 1983335044.0, + "step": 11829 + }, + { + "entropy": 1.6466420888900757, + "epoch": 1.2995797973139984, + "grad_norm": 0.5830391049385071, + "learning_rate": 7.196637189317015e-06, + "loss": 1.4721, + "mean_token_accuracy": 0.6433140188455582, + "num_tokens": 1983568953.0, + "step": 11830 + }, + { + "entropy": 1.7181233763694763, + "epoch": 1.2996896542253715, + "grad_norm": 0.8077186942100525, + "learning_rate": 7.1951861172726985e-06, + "loss": 1.2321, + "mean_token_accuracy": 0.6756879289944967, + "num_tokens": 1983674728.0, + "step": 11831 + }, + { + "entropy": 1.6590841114521027, + "epoch": 1.2997995111367444, + "grad_norm": 0.5984413027763367, + "learning_rate": 7.193735165649027e-06, + "loss": 1.5037, + "mean_token_accuracy": 0.6605504155158997, + "num_tokens": 1983866230.0, + "step": 11832 + }, + { + "entropy": 1.7355755269527435, + "epoch": 1.2999093680481173, + "grad_norm": 0.7825373411178589, + "learning_rate": 7.192284334491919e-06, + "loss": 1.3951, + "mean_token_accuracy": 0.6593762536843618, + "num_tokens": 1984017499.0, + "step": 11833 + }, + { + "entropy": 1.6846800744533539, + "epoch": 1.3000192249594902, + "grad_norm": 0.6214932799339294, + "learning_rate": 7.190833623847302e-06, + "loss": 1.2622, + "mean_token_accuracy": 0.6819400539000829, + "num_tokens": 1984194774.0, + "step": 11834 + }, + { + "entropy": 1.6585146188735962, + "epoch": 1.300129081870863, + "grad_norm": 0.5928328037261963, + "learning_rate": 7.189383033761082e-06, + "loss": 1.4513, + "mean_token_accuracy": 0.6382601261138916, + "num_tokens": 1984450421.0, + "step": 11835 + }, + { + "entropy": 1.6726765831311543, + "epoch": 1.3002389387822362, + "grad_norm": 0.6932438015937805, + "learning_rate": 7.187932564279168e-06, + "loss": 1.4706, + "mean_token_accuracy": 0.6601354628801346, + "num_tokens": 1984578455.0, + "step": 11836 + }, + { + "entropy": 1.7142049372196198, + "epoch": 1.300348795693609, + "grad_norm": 0.7112865447998047, + "learning_rate": 7.186482215447472e-06, + "loss": 1.5127, + "mean_token_accuracy": 0.6401646981636683, + "num_tokens": 1984793396.0, + "step": 11837 + }, + { + "entropy": 1.7584912180900574, + "epoch": 1.300458652604982, + "grad_norm": 0.7246370315551758, + "learning_rate": 7.185031987311899e-06, + "loss": 1.562, + "mean_token_accuracy": 0.625005453824997, + "num_tokens": 1984962832.0, + "step": 11838 + }, + { + "entropy": 1.6865639090538025, + "epoch": 1.300568509516355, + "grad_norm": 0.6186059713363647, + "learning_rate": 7.183581879918344e-06, + "loss": 1.5017, + "mean_token_accuracy": 0.6512916932503382, + "num_tokens": 1985164889.0, + "step": 11839 + }, + { + "entropy": 1.7075908879439037, + "epoch": 1.300678366427728, + "grad_norm": 0.6377032399177551, + "learning_rate": 7.182131893312698e-06, + "loss": 1.5451, + "mean_token_accuracy": 0.6384754379590353, + "num_tokens": 1985344290.0, + "step": 11840 + }, + { + "entropy": 1.6983333627382915, + "epoch": 1.3007882233391008, + "grad_norm": 0.7583761811256409, + "learning_rate": 7.180682027540864e-06, + "loss": 1.4311, + "mean_token_accuracy": 0.6552286992470423, + "num_tokens": 1985503791.0, + "step": 11841 + }, + { + "entropy": 1.6771070162455242, + "epoch": 1.3008980802504737, + "grad_norm": 0.7743870615959167, + "learning_rate": 7.179232282648716e-06, + "loss": 1.3962, + "mean_token_accuracy": 0.6450282633304596, + "num_tokens": 1985677205.0, + "step": 11842 + }, + { + "entropy": 1.6850054661432903, + "epoch": 1.3010079371618466, + "grad_norm": 0.6330224871635437, + "learning_rate": 7.177782658682148e-06, + "loss": 1.3411, + "mean_token_accuracy": 0.6549698412418365, + "num_tokens": 1985854174.0, + "step": 11843 + }, + { + "entropy": 1.7192271451155345, + "epoch": 1.3011177940732197, + "grad_norm": 0.7451735734939575, + "learning_rate": 7.176333155687039e-06, + "loss": 1.4255, + "mean_token_accuracy": 0.6647992481788, + "num_tokens": 1986045909.0, + "step": 11844 + }, + { + "entropy": 1.705276260773341, + "epoch": 1.3012276509845926, + "grad_norm": 0.7818323373794556, + "learning_rate": 7.174883773709258e-06, + "loss": 1.412, + "mean_token_accuracy": 0.66334301729997, + "num_tokens": 1986222420.0, + "step": 11845 + }, + { + "entropy": 1.7389629483222961, + "epoch": 1.3013375078959655, + "grad_norm": 0.7498189210891724, + "learning_rate": 7.173434512794686e-06, + "loss": 1.3499, + "mean_token_accuracy": 0.656540701786677, + "num_tokens": 1986418388.0, + "step": 11846 + }, + { + "entropy": 1.6752577722072601, + "epoch": 1.3014473648073386, + "grad_norm": 0.680406391620636, + "learning_rate": 7.171985372989185e-06, + "loss": 1.4032, + "mean_token_accuracy": 0.6697799315055212, + "num_tokens": 1986620372.0, + "step": 11847 + }, + { + "entropy": 1.7210516333580017, + "epoch": 1.3015572217187112, + "grad_norm": 0.776543378829956, + "learning_rate": 7.170536354338622e-06, + "loss": 1.3586, + "mean_token_accuracy": 0.6546121736367544, + "num_tokens": 1986743025.0, + "step": 11848 + }, + { + "entropy": 1.7157737612724304, + "epoch": 1.3016670786300844, + "grad_norm": 0.681416928768158, + "learning_rate": 7.169087456888859e-06, + "loss": 1.2704, + "mean_token_accuracy": 0.6695546756188074, + "num_tokens": 1986896242.0, + "step": 11849 + }, + { + "entropy": 1.7309301495552063, + "epoch": 1.3017769355414572, + "grad_norm": 0.6461417078971863, + "learning_rate": 7.167638680685749e-06, + "loss": 1.2813, + "mean_token_accuracy": 0.6683636407057444, + "num_tokens": 1987045305.0, + "step": 11850 + }, + { + "entropy": 1.697464495897293, + "epoch": 1.3018867924528301, + "grad_norm": 0.6775514483451843, + "learning_rate": 7.16619002577515e-06, + "loss": 1.4658, + "mean_token_accuracy": 0.6477470993995667, + "num_tokens": 1987223981.0, + "step": 11851 + }, + { + "entropy": 1.7292206982771556, + "epoch": 1.3019966493642032, + "grad_norm": 0.7780271172523499, + "learning_rate": 7.164741492202911e-06, + "loss": 1.4561, + "mean_token_accuracy": 0.6541826476653417, + "num_tokens": 1987371081.0, + "step": 11852 + }, + { + "entropy": 1.6961702009042103, + "epoch": 1.3021065062755761, + "grad_norm": 0.7289796471595764, + "learning_rate": 7.163293080014872e-06, + "loss": 1.2561, + "mean_token_accuracy": 0.6818042149146398, + "num_tokens": 1987473964.0, + "step": 11853 + }, + { + "entropy": 1.6992531319459279, + "epoch": 1.302216363186949, + "grad_norm": 0.85200035572052, + "learning_rate": 7.161844789256882e-06, + "loss": 1.2384, + "mean_token_accuracy": 0.6741587022940317, + "num_tokens": 1987601010.0, + "step": 11854 + }, + { + "entropy": 1.7139351069927216, + "epoch": 1.3023262200983219, + "grad_norm": 1.0030229091644287, + "learning_rate": 7.160396619974772e-06, + "loss": 1.3984, + "mean_token_accuracy": 0.6595780551433563, + "num_tokens": 1987758086.0, + "step": 11855 + }, + { + "entropy": 1.7854057649771373, + "epoch": 1.3024360770096948, + "grad_norm": 0.6239326000213623, + "learning_rate": 7.158948572214377e-06, + "loss": 1.4072, + "mean_token_accuracy": 0.6497927755117416, + "num_tokens": 1987956125.0, + "step": 11856 + }, + { + "entropy": 1.6467955509821575, + "epoch": 1.3025459339210679, + "grad_norm": 0.621613085269928, + "learning_rate": 7.157500646021529e-06, + "loss": 1.3393, + "mean_token_accuracy": 0.6661281585693359, + "num_tokens": 1988127619.0, + "step": 11857 + }, + { + "entropy": 1.6565412779649098, + "epoch": 1.3026557908324408, + "grad_norm": 0.7688978314399719, + "learning_rate": 7.156052841442058e-06, + "loss": 1.3215, + "mean_token_accuracy": 0.6778760701417923, + "num_tokens": 1988347207.0, + "step": 11858 + }, + { + "entropy": 1.756882220506668, + "epoch": 1.3027656477438136, + "grad_norm": 0.7031749486923218, + "learning_rate": 7.154605158521784e-06, + "loss": 1.473, + "mean_token_accuracy": 0.6567084838946661, + "num_tokens": 1988503513.0, + "step": 11859 + }, + { + "entropy": 1.647767146428426, + "epoch": 1.3028755046551868, + "grad_norm": 0.7281495928764343, + "learning_rate": 7.153157597306517e-06, + "loss": 1.3981, + "mean_token_accuracy": 0.6550866365432739, + "num_tokens": 1988676182.0, + "step": 11860 + }, + { + "entropy": 1.7764273285865784, + "epoch": 1.3029853615665594, + "grad_norm": 0.7380655407905579, + "learning_rate": 7.1517101578420845e-06, + "loss": 1.3915, + "mean_token_accuracy": 0.6554965376853943, + "num_tokens": 1988784119.0, + "step": 11861 + }, + { + "entropy": 1.710543821255366, + "epoch": 1.3030952184779325, + "grad_norm": 0.6946497559547424, + "learning_rate": 7.150262840174287e-06, + "loss": 1.4343, + "mean_token_accuracy": 0.6613740076621374, + "num_tokens": 1988976260.0, + "step": 11862 + }, + { + "entropy": 1.6800654629866283, + "epoch": 1.3032050753893054, + "grad_norm": 0.7925371527671814, + "learning_rate": 7.148815644348939e-06, + "loss": 1.4201, + "mean_token_accuracy": 0.6524600485960642, + "num_tokens": 1989116149.0, + "step": 11863 + }, + { + "entropy": 1.6714057624340057, + "epoch": 1.3033149323006783, + "grad_norm": 0.7147430777549744, + "learning_rate": 7.1473685704118415e-06, + "loss": 1.4243, + "mean_token_accuracy": 0.6460753281911215, + "num_tokens": 1989292509.0, + "step": 11864 + }, + { + "entropy": 1.6726448833942413, + "epoch": 1.3034247892120514, + "grad_norm": 0.8853915929794312, + "learning_rate": 7.145921618408789e-06, + "loss": 1.4295, + "mean_token_accuracy": 0.6527641713619232, + "num_tokens": 1989445522.0, + "step": 11865 + }, + { + "entropy": 1.6634302536646526, + "epoch": 1.3035346461234243, + "grad_norm": 0.8797194361686707, + "learning_rate": 7.1444747883855825e-06, + "loss": 1.441, + "mean_token_accuracy": 0.6599002232154211, + "num_tokens": 1989603818.0, + "step": 11866 + }, + { + "entropy": 1.733254959185918, + "epoch": 1.3036445030347972, + "grad_norm": 0.8028691411018372, + "learning_rate": 7.1430280803880125e-06, + "loss": 1.2522, + "mean_token_accuracy": 0.6718244006236395, + "num_tokens": 1989737439.0, + "step": 11867 + }, + { + "entropy": 1.691257268190384, + "epoch": 1.30375435994617, + "grad_norm": 0.6741119027137756, + "learning_rate": 7.1415814944618646e-06, + "loss": 1.4412, + "mean_token_accuracy": 0.6452458004156748, + "num_tokens": 1989925558.0, + "step": 11868 + }, + { + "entropy": 1.702443500359853, + "epoch": 1.303864216857543, + "grad_norm": 0.8861745595932007, + "learning_rate": 7.140135030652919e-06, + "loss": 1.4018, + "mean_token_accuracy": 0.6592222899198532, + "num_tokens": 1990085254.0, + "step": 11869 + }, + { + "entropy": 1.6728020509084065, + "epoch": 1.303974073768916, + "grad_norm": 0.6596800684928894, + "learning_rate": 7.138688689006968e-06, + "loss": 1.3176, + "mean_token_accuracy": 0.6630978385607401, + "num_tokens": 1990248507.0, + "step": 11870 + }, + { + "entropy": 1.7162721355756123, + "epoch": 1.304083930680289, + "grad_norm": 0.799435019493103, + "learning_rate": 7.13724246956978e-06, + "loss": 1.4751, + "mean_token_accuracy": 0.6495125244061152, + "num_tokens": 1990412879.0, + "step": 11871 + }, + { + "entropy": 1.6690248648325603, + "epoch": 1.3041937875916618, + "grad_norm": 0.7061107754707336, + "learning_rate": 7.135796372387121e-06, + "loss": 1.4601, + "mean_token_accuracy": 0.6446433266003927, + "num_tokens": 1990609780.0, + "step": 11872 + }, + { + "entropy": 1.661782403786977, + "epoch": 1.304303644503035, + "grad_norm": 0.620296835899353, + "learning_rate": 7.13435039750477e-06, + "loss": 1.4143, + "mean_token_accuracy": 0.6596356878678004, + "num_tokens": 1990813000.0, + "step": 11873 + }, + { + "entropy": 1.737670491139094, + "epoch": 1.3044135014144078, + "grad_norm": 0.6647923588752747, + "learning_rate": 7.132904544968484e-06, + "loss": 1.4695, + "mean_token_accuracy": 0.628335619966189, + "num_tokens": 1991014720.0, + "step": 11874 + }, + { + "entropy": 1.771154135465622, + "epoch": 1.3045233583257807, + "grad_norm": 0.7745919823646545, + "learning_rate": 7.131458814824033e-06, + "loss": 1.3392, + "mean_token_accuracy": 0.6562488625446955, + "num_tokens": 1991159959.0, + "step": 11875 + }, + { + "entropy": 1.720243752002716, + "epoch": 1.3046332152371536, + "grad_norm": 0.6591370105743408, + "learning_rate": 7.130013207117164e-06, + "loss": 1.3527, + "mean_token_accuracy": 0.6506476004918417, + "num_tokens": 1991284571.0, + "step": 11876 + }, + { + "entropy": 1.7519052525361378, + "epoch": 1.3047430721485265, + "grad_norm": 0.8525600433349609, + "learning_rate": 7.128567721893629e-06, + "loss": 1.3246, + "mean_token_accuracy": 0.6619629363218943, + "num_tokens": 1991433891.0, + "step": 11877 + }, + { + "entropy": 1.7587460080782573, + "epoch": 1.3048529290598996, + "grad_norm": 0.7629795074462891, + "learning_rate": 7.127122359199186e-06, + "loss": 1.5044, + "mean_token_accuracy": 0.633780856927236, + "num_tokens": 1991631013.0, + "step": 11878 + }, + { + "entropy": 1.7409328023592632, + "epoch": 1.3049627859712725, + "grad_norm": 1.040186882019043, + "learning_rate": 7.1256771190795744e-06, + "loss": 1.4168, + "mean_token_accuracy": 0.6474807063738505, + "num_tokens": 1991741027.0, + "step": 11879 + }, + { + "entropy": 1.6785088181495667, + "epoch": 1.3050726428826454, + "grad_norm": 0.6517196893692017, + "learning_rate": 7.124232001580533e-06, + "loss": 1.4536, + "mean_token_accuracy": 0.6468540678421656, + "num_tokens": 1991930281.0, + "step": 11880 + }, + { + "entropy": 1.7357937196890514, + "epoch": 1.3051824997940182, + "grad_norm": 0.6505614519119263, + "learning_rate": 7.1227870067478025e-06, + "loss": 1.5418, + "mean_token_accuracy": 0.6523448824882507, + "num_tokens": 1992141041.0, + "step": 11881 + }, + { + "entropy": 1.6435925761858623, + "epoch": 1.3052923567053911, + "grad_norm": 0.73778235912323, + "learning_rate": 7.121342134627121e-06, + "loss": 1.3333, + "mean_token_accuracy": 0.6684698065121969, + "num_tokens": 1992299737.0, + "step": 11882 + }, + { + "entropy": 1.6869693100452423, + "epoch": 1.3054022136167642, + "grad_norm": 0.6052371859550476, + "learning_rate": 7.1198973852642094e-06, + "loss": 1.4825, + "mean_token_accuracy": 0.6540184319019318, + "num_tokens": 1992527255.0, + "step": 11883 + }, + { + "entropy": 1.6479829649130504, + "epoch": 1.3055120705281371, + "grad_norm": 0.6196063756942749, + "learning_rate": 7.118452758704797e-06, + "loss": 1.4023, + "mean_token_accuracy": 0.6664341787497202, + "num_tokens": 1992722009.0, + "step": 11884 + }, + { + "entropy": 1.751798113187154, + "epoch": 1.30562192743951, + "grad_norm": 0.7154742479324341, + "learning_rate": 7.117008254994608e-06, + "loss": 1.4442, + "mean_token_accuracy": 0.6428210635979971, + "num_tokens": 1992926541.0, + "step": 11885 + }, + { + "entropy": 1.7372412979602814, + "epoch": 1.305731784350883, + "grad_norm": 0.6471896171569824, + "learning_rate": 7.115563874179354e-06, + "loss": 1.3508, + "mean_token_accuracy": 0.6744556576013565, + "num_tokens": 1993067139.0, + "step": 11886 + }, + { + "entropy": 1.6598396003246307, + "epoch": 1.305841641262256, + "grad_norm": 0.6070998311042786, + "learning_rate": 7.114119616304758e-06, + "loss": 1.4995, + "mean_token_accuracy": 0.6376579652229944, + "num_tokens": 1993332234.0, + "step": 11887 + }, + { + "entropy": 1.6974481840928395, + "epoch": 1.3059514981736289, + "grad_norm": 0.689513623714447, + "learning_rate": 7.112675481416524e-06, + "loss": 1.4293, + "mean_token_accuracy": 0.6504635115464529, + "num_tokens": 1993523750.0, + "step": 11888 + }, + { + "entropy": 1.7427086234092712, + "epoch": 1.3060613550850018, + "grad_norm": 0.6812959313392639, + "learning_rate": 7.111231469560356e-06, + "loss": 1.369, + "mean_token_accuracy": 0.6677893449862798, + "num_tokens": 1993677201.0, + "step": 11889 + }, + { + "entropy": 1.7141542931397755, + "epoch": 1.3061712119963746, + "grad_norm": 0.6561225652694702, + "learning_rate": 7.109787580781964e-06, + "loss": 1.4565, + "mean_token_accuracy": 0.649625892440478, + "num_tokens": 1993841174.0, + "step": 11890 + }, + { + "entropy": 1.7247630953788757, + "epoch": 1.3062810689077478, + "grad_norm": 0.7922856211662292, + "learning_rate": 7.108343815127041e-06, + "loss": 1.1284, + "mean_token_accuracy": 0.682140568892161, + "num_tokens": 1994019614.0, + "step": 11891 + }, + { + "entropy": 1.7190321187178295, + "epoch": 1.3063909258191206, + "grad_norm": 0.7327906489372253, + "learning_rate": 7.10690017264128e-06, + "loss": 1.3485, + "mean_token_accuracy": 0.6538528551657995, + "num_tokens": 1994170865.0, + "step": 11892 + }, + { + "entropy": 1.7219412624835968, + "epoch": 1.3065007827304935, + "grad_norm": 0.6950879096984863, + "learning_rate": 7.105456653370373e-06, + "loss": 1.6429, + "mean_token_accuracy": 0.6246584728360176, + "num_tokens": 1994357037.0, + "step": 11893 + }, + { + "entropy": 1.6911317110061646, + "epoch": 1.3066106396418664, + "grad_norm": 0.6314573884010315, + "learning_rate": 7.104013257360012e-06, + "loss": 1.4832, + "mean_token_accuracy": 0.641195093592008, + "num_tokens": 1994567440.0, + "step": 11894 + }, + { + "entropy": 1.7743210991223652, + "epoch": 1.3067204965532393, + "grad_norm": 0.6991893649101257, + "learning_rate": 7.102569984655876e-06, + "loss": 1.4349, + "mean_token_accuracy": 0.6492632130781809, + "num_tokens": 1994730948.0, + "step": 11895 + }, + { + "entropy": 1.6938276489575703, + "epoch": 1.3068303534646124, + "grad_norm": 0.6895888447761536, + "learning_rate": 7.101126835303642e-06, + "loss": 1.2818, + "mean_token_accuracy": 0.676262636979421, + "num_tokens": 1994872136.0, + "step": 11896 + }, + { + "entropy": 1.7031813363234203, + "epoch": 1.3069402103759853, + "grad_norm": 0.6379356980323792, + "learning_rate": 7.099683809348987e-06, + "loss": 1.5104, + "mean_token_accuracy": 0.6480912466843923, + "num_tokens": 1995100831.0, + "step": 11897 + }, + { + "entropy": 1.697776734828949, + "epoch": 1.3070500672873582, + "grad_norm": 0.782518208026886, + "learning_rate": 7.098240906837581e-06, + "loss": 1.4419, + "mean_token_accuracy": 0.6553240418434143, + "num_tokens": 1995309589.0, + "step": 11898 + }, + { + "entropy": 1.733527531226476, + "epoch": 1.3071599241987313, + "grad_norm": 0.7691713571548462, + "learning_rate": 7.096798127815095e-06, + "loss": 1.544, + "mean_token_accuracy": 0.6410651057958603, + "num_tokens": 1995483068.0, + "step": 11899 + }, + { + "entropy": 1.7282393078009288, + "epoch": 1.3072697811101042, + "grad_norm": 0.7292653322219849, + "learning_rate": 7.095355472327188e-06, + "loss": 1.5436, + "mean_token_accuracy": 0.6305726369222006, + "num_tokens": 1995685871.0, + "step": 11900 + }, + { + "entropy": 1.7617888549963634, + "epoch": 1.307379638021477, + "grad_norm": 0.607972264289856, + "learning_rate": 7.093912940419518e-06, + "loss": 1.4118, + "mean_token_accuracy": 0.6436517437299093, + "num_tokens": 1995881773.0, + "step": 11901 + }, + { + "entropy": 1.6877683500448863, + "epoch": 1.30748949493285, + "grad_norm": 0.8216177225112915, + "learning_rate": 7.0924705321377476e-06, + "loss": 1.4615, + "mean_token_accuracy": 0.6518243153889974, + "num_tokens": 1996016525.0, + "step": 11902 + }, + { + "entropy": 1.6974779566129048, + "epoch": 1.3075993518442228, + "grad_norm": 0.701296865940094, + "learning_rate": 7.091028247527523e-06, + "loss": 1.4144, + "mean_token_accuracy": 0.6555771032969157, + "num_tokens": 1996158592.0, + "step": 11903 + }, + { + "entropy": 1.681029220422109, + "epoch": 1.307709208755596, + "grad_norm": 0.6263594627380371, + "learning_rate": 7.08958608663449e-06, + "loss": 1.4482, + "mean_token_accuracy": 0.6391114493211111, + "num_tokens": 1996383683.0, + "step": 11904 + }, + { + "entropy": 1.6188758412996929, + "epoch": 1.3078190656669688, + "grad_norm": 0.5903595685958862, + "learning_rate": 7.088144049504297e-06, + "loss": 1.2563, + "mean_token_accuracy": 0.6739430278539658, + "num_tokens": 1996537882.0, + "step": 11905 + }, + { + "entropy": 1.7247794965902965, + "epoch": 1.3079289225783417, + "grad_norm": 0.645677924156189, + "learning_rate": 7.0867021361825834e-06, + "loss": 1.392, + "mean_token_accuracy": 0.6563497483730316, + "num_tokens": 1996661633.0, + "step": 11906 + }, + { + "entropy": 1.6579484939575195, + "epoch": 1.3080387794897146, + "grad_norm": 0.606181263923645, + "learning_rate": 7.085260346714984e-06, + "loss": 1.5362, + "mean_token_accuracy": 0.6405880848566691, + "num_tokens": 1996846001.0, + "step": 11907 + }, + { + "entropy": 1.699442724386851, + "epoch": 1.3081486364010875, + "grad_norm": 0.6131225228309631, + "learning_rate": 7.083818681147128e-06, + "loss": 1.3592, + "mean_token_accuracy": 0.658347432812055, + "num_tokens": 1997010019.0, + "step": 11908 + }, + { + "entropy": 1.688926676909129, + "epoch": 1.3082584933124606, + "grad_norm": 0.7150919437408447, + "learning_rate": 7.08237713952465e-06, + "loss": 1.1604, + "mean_token_accuracy": 0.6859798580408096, + "num_tokens": 1997120284.0, + "step": 11909 + }, + { + "entropy": 1.703328440586726, + "epoch": 1.3083683502238335, + "grad_norm": 0.6847726702690125, + "learning_rate": 7.0809357218931655e-06, + "loss": 1.5503, + "mean_token_accuracy": 0.637129470705986, + "num_tokens": 1997338833.0, + "step": 11910 + }, + { + "entropy": 1.7029491166273754, + "epoch": 1.3084782071352064, + "grad_norm": 0.6607728004455566, + "learning_rate": 7.079494428298306e-06, + "loss": 1.3826, + "mean_token_accuracy": 0.6601289560397466, + "num_tokens": 1997488890.0, + "step": 11911 + }, + { + "entropy": 1.779124120871226, + "epoch": 1.3085880640465795, + "grad_norm": 0.6851378083229065, + "learning_rate": 7.078053258785675e-06, + "loss": 1.5597, + "mean_token_accuracy": 0.6372034152348837, + "num_tokens": 1997717867.0, + "step": 11912 + }, + { + "entropy": 1.703583796819051, + "epoch": 1.3086979209579523, + "grad_norm": 0.7330154776573181, + "learning_rate": 7.076612213400893e-06, + "loss": 1.5164, + "mean_token_accuracy": 0.6479217112064362, + "num_tokens": 1997907725.0, + "step": 11913 + }, + { + "entropy": 1.752791404724121, + "epoch": 1.3088077778693252, + "grad_norm": 0.6505106687545776, + "learning_rate": 7.075171292189567e-06, + "loss": 1.2992, + "mean_token_accuracy": 0.6648927380641302, + "num_tokens": 1998051327.0, + "step": 11914 + }, + { + "entropy": 1.7556921243667603, + "epoch": 1.3089176347806981, + "grad_norm": 0.9044151902198792, + "learning_rate": 7.073730495197302e-06, + "loss": 1.3221, + "mean_token_accuracy": 0.6583772599697113, + "num_tokens": 1998151465.0, + "step": 11915 + }, + { + "entropy": 1.6777693728605907, + "epoch": 1.309027491692071, + "grad_norm": 0.5966777205467224, + "learning_rate": 7.072289822469696e-06, + "loss": 1.4588, + "mean_token_accuracy": 0.6472314149141312, + "num_tokens": 1998389856.0, + "step": 11916 + }, + { + "entropy": 1.7139520446459453, + "epoch": 1.309137348603444, + "grad_norm": 0.8145208358764648, + "learning_rate": 7.070849274052347e-06, + "loss": 1.4261, + "mean_token_accuracy": 0.6561163713534673, + "num_tokens": 1998569531.0, + "step": 11917 + }, + { + "entropy": 1.7513943115870159, + "epoch": 1.309247205514817, + "grad_norm": 0.7245900630950928, + "learning_rate": 7.069408849990846e-06, + "loss": 1.4398, + "mean_token_accuracy": 0.6619679679473242, + "num_tokens": 1998718517.0, + "step": 11918 + }, + { + "entropy": 1.7176838616530101, + "epoch": 1.3093570624261899, + "grad_norm": 0.6961973309516907, + "learning_rate": 7.067968550330788e-06, + "loss": 1.3737, + "mean_token_accuracy": 0.665938675403595, + "num_tokens": 1998860116.0, + "step": 11919 + }, + { + "entropy": 1.7159354587395985, + "epoch": 1.3094669193375628, + "grad_norm": 0.7147000432014465, + "learning_rate": 7.066528375117754e-06, + "loss": 1.2228, + "mean_token_accuracy": 0.6914103428522745, + "num_tokens": 1998983159.0, + "step": 11920 + }, + { + "entropy": 1.6717688739299774, + "epoch": 1.3095767762489356, + "grad_norm": 0.6504638195037842, + "learning_rate": 7.06508832439732e-06, + "loss": 1.3763, + "mean_token_accuracy": 0.657271221280098, + "num_tokens": 1999134388.0, + "step": 11921 + }, + { + "entropy": 1.7023660739262898, + "epoch": 1.3096866331603088, + "grad_norm": 0.7245521545410156, + "learning_rate": 7.0636483982150685e-06, + "loss": 1.4674, + "mean_token_accuracy": 0.6482570519049963, + "num_tokens": 1999333736.0, + "step": 11922 + }, + { + "entropy": 1.7503623863061268, + "epoch": 1.3097964900716816, + "grad_norm": 0.7295483350753784, + "learning_rate": 7.0622085966165775e-06, + "loss": 1.2565, + "mean_token_accuracy": 0.6758219550053278, + "num_tokens": 1999443030.0, + "step": 11923 + }, + { + "entropy": 1.6741431951522827, + "epoch": 1.3099063469830545, + "grad_norm": 0.8775436282157898, + "learning_rate": 7.060768919647402e-06, + "loss": 1.3264, + "mean_token_accuracy": 0.6649001787106196, + "num_tokens": 1999620601.0, + "step": 11924 + }, + { + "entropy": 1.6897100607554119, + "epoch": 1.3100162038944276, + "grad_norm": 0.5896010994911194, + "learning_rate": 7.0593293673531185e-06, + "loss": 1.4494, + "mean_token_accuracy": 0.6483661234378815, + "num_tokens": 1999852207.0, + "step": 11925 + }, + { + "entropy": 1.7433798710505168, + "epoch": 1.3101260608058005, + "grad_norm": 0.6538956761360168, + "learning_rate": 7.057889939779284e-06, + "loss": 1.3828, + "mean_token_accuracy": 0.6552889595429102, + "num_tokens": 2000021319.0, + "step": 11926 + }, + { + "entropy": 1.6846702595551808, + "epoch": 1.3102359177171734, + "grad_norm": 0.8677592873573303, + "learning_rate": 7.056450636971459e-06, + "loss": 1.2809, + "mean_token_accuracy": 0.6779492398103079, + "num_tokens": 2000178801.0, + "step": 11927 + }, + { + "entropy": 1.7277966737747192, + "epoch": 1.3103457746285463, + "grad_norm": 0.7624452114105225, + "learning_rate": 7.055011458975189e-06, + "loss": 1.4015, + "mean_token_accuracy": 0.6448380748430887, + "num_tokens": 2000366731.0, + "step": 11928 + }, + { + "entropy": 1.6685428619384766, + "epoch": 1.3104556315399192, + "grad_norm": 0.6821413636207581, + "learning_rate": 7.053572405836035e-06, + "loss": 1.3076, + "mean_token_accuracy": 0.6730211079120636, + "num_tokens": 2000503771.0, + "step": 11929 + }, + { + "entropy": 1.7650530834992726, + "epoch": 1.3105654884512923, + "grad_norm": 0.7161713242530823, + "learning_rate": 7.0521334775995325e-06, + "loss": 1.2504, + "mean_token_accuracy": 0.674405058224996, + "num_tokens": 2000625020.0, + "step": 11930 + }, + { + "entropy": 1.7475067675113678, + "epoch": 1.3106753453626652, + "grad_norm": 0.7672361135482788, + "learning_rate": 7.050694674311227e-06, + "loss": 1.5302, + "mean_token_accuracy": 0.6399498085180918, + "num_tokens": 2000778280.0, + "step": 11931 + }, + { + "entropy": 1.7035863002141316, + "epoch": 1.310785202274038, + "grad_norm": 0.6786298751831055, + "learning_rate": 7.049255996016657e-06, + "loss": 1.4849, + "mean_token_accuracy": 0.6429836452007294, + "num_tokens": 2000942476.0, + "step": 11932 + }, + { + "entropy": 1.6817757089932759, + "epoch": 1.310895059185411, + "grad_norm": 0.6696064472198486, + "learning_rate": 7.047817442761351e-06, + "loss": 1.32, + "mean_token_accuracy": 0.668683315316836, + "num_tokens": 2001089622.0, + "step": 11933 + }, + { + "entropy": 1.6976955632368724, + "epoch": 1.3110049160967838, + "grad_norm": 0.7553936839103699, + "learning_rate": 7.046379014590847e-06, + "loss": 1.2293, + "mean_token_accuracy": 0.6788338373104731, + "num_tokens": 2001197344.0, + "step": 11934 + }, + { + "entropy": 1.6999300718307495, + "epoch": 1.311114773008157, + "grad_norm": 0.6611595153808594, + "learning_rate": 7.0449407115506655e-06, + "loss": 1.3581, + "mean_token_accuracy": 0.6565580070018768, + "num_tokens": 2001349759.0, + "step": 11935 + }, + { + "entropy": 1.6997497379779816, + "epoch": 1.3112246299195298, + "grad_norm": 1.5599058866500854, + "learning_rate": 7.043502533686321e-06, + "loss": 1.3612, + "mean_token_accuracy": 0.664860337972641, + "num_tokens": 2001540113.0, + "step": 11936 + }, + { + "entropy": 1.6655697226524353, + "epoch": 1.3113344868309027, + "grad_norm": 0.7153550982475281, + "learning_rate": 7.04206448104334e-06, + "loss": 1.4734, + "mean_token_accuracy": 0.6454818745454153, + "num_tokens": 2001759838.0, + "step": 11937 + }, + { + "entropy": 1.6881613234678905, + "epoch": 1.3114443437422758, + "grad_norm": 0.6710202693939209, + "learning_rate": 7.04062655366724e-06, + "loss": 1.3767, + "mean_token_accuracy": 0.6576898495356241, + "num_tokens": 2001927019.0, + "step": 11938 + }, + { + "entropy": 1.6877728005250294, + "epoch": 1.3115542006536487, + "grad_norm": 0.7258594036102295, + "learning_rate": 7.039188751603525e-06, + "loss": 1.2771, + "mean_token_accuracy": 0.6696644773085912, + "num_tokens": 2002100598.0, + "step": 11939 + }, + { + "entropy": 1.6909307440121968, + "epoch": 1.3116640575650216, + "grad_norm": 0.6466124057769775, + "learning_rate": 7.037751074897698e-06, + "loss": 1.5839, + "mean_token_accuracy": 0.6439488381147385, + "num_tokens": 2002336522.0, + "step": 11940 + }, + { + "entropy": 1.7181779742240906, + "epoch": 1.3117739144763945, + "grad_norm": 0.6814691424369812, + "learning_rate": 7.036313523595266e-06, + "loss": 1.4541, + "mean_token_accuracy": 0.6534381111462911, + "num_tokens": 2002516915.0, + "step": 11941 + }, + { + "entropy": 1.6743408739566803, + "epoch": 1.3118837713877674, + "grad_norm": 0.6737117171287537, + "learning_rate": 7.034876097741723e-06, + "loss": 1.3741, + "mean_token_accuracy": 0.6632463186979294, + "num_tokens": 2002703215.0, + "step": 11942 + }, + { + "entropy": 1.7161073585351307, + "epoch": 1.3119936282991405, + "grad_norm": 0.6047975420951843, + "learning_rate": 7.033438797382568e-06, + "loss": 1.3728, + "mean_token_accuracy": 0.6500318894783655, + "num_tokens": 2002862792.0, + "step": 11943 + }, + { + "entropy": 1.6879894336064656, + "epoch": 1.3121034852105133, + "grad_norm": 0.660843551158905, + "learning_rate": 7.032001622563287e-06, + "loss": 1.435, + "mean_token_accuracy": 0.6527181764443716, + "num_tokens": 2003060623.0, + "step": 11944 + }, + { + "entropy": 1.7308319707711537, + "epoch": 1.3122133421218862, + "grad_norm": 0.6762115359306335, + "learning_rate": 7.030564573329364e-06, + "loss": 1.3298, + "mean_token_accuracy": 0.6627347220977148, + "num_tokens": 2003226847.0, + "step": 11945 + }, + { + "entropy": 1.6313576400279999, + "epoch": 1.3123231990332591, + "grad_norm": 0.647402822971344, + "learning_rate": 7.029127649726286e-06, + "loss": 1.46, + "mean_token_accuracy": 0.6542697101831436, + "num_tokens": 2003413157.0, + "step": 11946 + }, + { + "entropy": 1.6898943781852722, + "epoch": 1.312433055944632, + "grad_norm": 0.6319282650947571, + "learning_rate": 7.027690851799529e-06, + "loss": 1.4451, + "mean_token_accuracy": 0.6684151142835617, + "num_tokens": 2003564249.0, + "step": 11947 + }, + { + "entropy": 1.6767113904158275, + "epoch": 1.312542912856005, + "grad_norm": 0.794843316078186, + "learning_rate": 7.026254179594563e-06, + "loss": 1.3385, + "mean_token_accuracy": 0.6593736608823141, + "num_tokens": 2003753111.0, + "step": 11948 + }, + { + "entropy": 1.7902327179908752, + "epoch": 1.312652769767378, + "grad_norm": 0.7606083154678345, + "learning_rate": 7.024817633156862e-06, + "loss": 1.5597, + "mean_token_accuracy": 0.6477732261021932, + "num_tokens": 2003905456.0, + "step": 11949 + }, + { + "entropy": 1.755109578371048, + "epoch": 1.3127626266787509, + "grad_norm": 0.7885520458221436, + "learning_rate": 7.023381212531895e-06, + "loss": 1.3244, + "mean_token_accuracy": 0.6673354307810465, + "num_tokens": 2004044077.0, + "step": 11950 + }, + { + "entropy": 1.7376815676689148, + "epoch": 1.312872483590124, + "grad_norm": 0.6872356534004211, + "learning_rate": 7.02194491776512e-06, + "loss": 1.4115, + "mean_token_accuracy": 0.6557418157656988, + "num_tokens": 2004214096.0, + "step": 11951 + }, + { + "entropy": 1.7242650091648102, + "epoch": 1.3129823405014969, + "grad_norm": 1.8172736167907715, + "learning_rate": 7.020508748901993e-06, + "loss": 1.181, + "mean_token_accuracy": 0.6751060833533605, + "num_tokens": 2004384350.0, + "step": 11952 + }, + { + "entropy": 1.7193986773490906, + "epoch": 1.3130921974128698, + "grad_norm": 0.7913485169410706, + "learning_rate": 7.019072705987975e-06, + "loss": 1.3648, + "mean_token_accuracy": 0.6586080143849055, + "num_tokens": 2004533962.0, + "step": 11953 + }, + { + "entropy": 1.7227738400300343, + "epoch": 1.3132020543242426, + "grad_norm": 0.6674314141273499, + "learning_rate": 7.017636789068507e-06, + "loss": 1.439, + "mean_token_accuracy": 0.6552018125851949, + "num_tokens": 2004682824.0, + "step": 11954 + }, + { + "entropy": 1.696447104215622, + "epoch": 1.3133119112356155, + "grad_norm": 0.7668749094009399, + "learning_rate": 7.0162009981890445e-06, + "loss": 1.2401, + "mean_token_accuracy": 0.6790489206711451, + "num_tokens": 2004823503.0, + "step": 11955 + }, + { + "entropy": 1.7591257691383362, + "epoch": 1.3134217681469886, + "grad_norm": 0.6392584443092346, + "learning_rate": 7.014765333395026e-06, + "loss": 1.4618, + "mean_token_accuracy": 0.640847826997439, + "num_tokens": 2005055675.0, + "step": 11956 + }, + { + "entropy": 1.7323053081830342, + "epoch": 1.3135316250583615, + "grad_norm": 0.7856550216674805, + "learning_rate": 7.0133297947318845e-06, + "loss": 1.2616, + "mean_token_accuracy": 0.6696621626615524, + "num_tokens": 2005187034.0, + "step": 11957 + }, + { + "entropy": 1.7238508264223735, + "epoch": 1.3136414819697344, + "grad_norm": 0.845143735408783, + "learning_rate": 7.011894382245062e-06, + "loss": 1.4599, + "mean_token_accuracy": 0.6531222860018412, + "num_tokens": 2005330183.0, + "step": 11958 + }, + { + "entropy": 1.7025631666183472, + "epoch": 1.3137513388811073, + "grad_norm": 0.7193249464035034, + "learning_rate": 7.0104590959799845e-06, + "loss": 1.322, + "mean_token_accuracy": 0.6668793509403864, + "num_tokens": 2005483599.0, + "step": 11959 + }, + { + "entropy": 1.721425364414851, + "epoch": 1.3138611957924802, + "grad_norm": 0.8288023471832275, + "learning_rate": 7.009023935982076e-06, + "loss": 1.3867, + "mean_token_accuracy": 0.6620455334583918, + "num_tokens": 2005630137.0, + "step": 11960 + }, + { + "entropy": 1.6998872856299083, + "epoch": 1.3139710527038533, + "grad_norm": 0.673412024974823, + "learning_rate": 7.0075889022967625e-06, + "loss": 1.5038, + "mean_token_accuracy": 0.6355178554852804, + "num_tokens": 2005829806.0, + "step": 11961 + }, + { + "entropy": 1.792010138432185, + "epoch": 1.3140809096152262, + "grad_norm": 0.805509090423584, + "learning_rate": 7.0061539949694645e-06, + "loss": 1.4613, + "mean_token_accuracy": 0.6476135204235712, + "num_tokens": 2005964524.0, + "step": 11962 + }, + { + "entropy": 1.7988332509994507, + "epoch": 1.314190766526599, + "grad_norm": 0.7284339666366577, + "learning_rate": 7.004719214045592e-06, + "loss": 1.4274, + "mean_token_accuracy": 0.6368361463149389, + "num_tokens": 2006085615.0, + "step": 11963 + }, + { + "entropy": 1.7714728315671284, + "epoch": 1.3143006234379722, + "grad_norm": 0.674480676651001, + "learning_rate": 7.003284559570554e-06, + "loss": 1.4091, + "mean_token_accuracy": 0.6386928856372833, + "num_tokens": 2006276341.0, + "step": 11964 + }, + { + "entropy": 1.7038045426209767, + "epoch": 1.314410480349345, + "grad_norm": 0.7045182585716248, + "learning_rate": 7.001850031589761e-06, + "loss": 1.4416, + "mean_token_accuracy": 0.6599552830060323, + "num_tokens": 2006435304.0, + "step": 11965 + }, + { + "entropy": 1.7020907998085022, + "epoch": 1.314520337260718, + "grad_norm": 0.7166406512260437, + "learning_rate": 7.0004156301486095e-06, + "loss": 1.3538, + "mean_token_accuracy": 0.6714149415493011, + "num_tokens": 2006545361.0, + "step": 11966 + }, + { + "entropy": 1.6336172918478649, + "epoch": 1.3146301941720908, + "grad_norm": 0.5933431386947632, + "learning_rate": 6.998981355292505e-06, + "loss": 1.4167, + "mean_token_accuracy": 0.6494115591049194, + "num_tokens": 2006711734.0, + "step": 11967 + }, + { + "entropy": 1.704167326291402, + "epoch": 1.3147400510834637, + "grad_norm": 0.6579341292381287, + "learning_rate": 6.997547207066836e-06, + "loss": 1.2635, + "mean_token_accuracy": 0.6757092028856277, + "num_tokens": 2006860103.0, + "step": 11968 + }, + { + "entropy": 1.725958655277888, + "epoch": 1.3148499079948368, + "grad_norm": 0.7082239389419556, + "learning_rate": 6.996113185516993e-06, + "loss": 1.3941, + "mean_token_accuracy": 0.6522109111150106, + "num_tokens": 2007005193.0, + "step": 11969 + }, + { + "entropy": 1.6946588456630707, + "epoch": 1.3149597649062097, + "grad_norm": 0.7615059614181519, + "learning_rate": 6.994679290688366e-06, + "loss": 1.3615, + "mean_token_accuracy": 0.6599778831005096, + "num_tokens": 2007176565.0, + "step": 11970 + }, + { + "entropy": 1.64273335536321, + "epoch": 1.3150696218175826, + "grad_norm": 0.6892207860946655, + "learning_rate": 6.993245522626335e-06, + "loss": 1.1621, + "mean_token_accuracy": 0.6934431493282318, + "num_tokens": 2007289708.0, + "step": 11971 + }, + { + "entropy": 1.7297363777955372, + "epoch": 1.3151794787289555, + "grad_norm": 0.7124273180961609, + "learning_rate": 6.991811881376274e-06, + "loss": 1.4418, + "mean_token_accuracy": 0.641096313794454, + "num_tokens": 2007489730.0, + "step": 11972 + }, + { + "entropy": 1.7163341144720714, + "epoch": 1.3152893356403283, + "grad_norm": 0.6671984195709229, + "learning_rate": 6.990378366983563e-06, + "loss": 1.4064, + "mean_token_accuracy": 0.6469017068545023, + "num_tokens": 2007658295.0, + "step": 11973 + }, + { + "entropy": 1.7121194104353588, + "epoch": 1.3153991925517015, + "grad_norm": 0.7968659400939941, + "learning_rate": 6.9889449794935685e-06, + "loss": 1.3539, + "mean_token_accuracy": 0.6690041224161783, + "num_tokens": 2007827861.0, + "step": 11974 + }, + { + "entropy": 1.7029815713564556, + "epoch": 1.3155090494630743, + "grad_norm": 0.6681612133979797, + "learning_rate": 6.987511718951661e-06, + "loss": 1.4541, + "mean_token_accuracy": 0.6633522013823191, + "num_tokens": 2008021545.0, + "step": 11975 + }, + { + "entropy": 1.727482130130132, + "epoch": 1.3156189063744472, + "grad_norm": 0.6855336427688599, + "learning_rate": 6.9860785854032e-06, + "loss": 1.5242, + "mean_token_accuracy": 0.6396484598517418, + "num_tokens": 2008216504.0, + "step": 11976 + }, + { + "entropy": 1.7152353723843892, + "epoch": 1.3157287632858203, + "grad_norm": 0.6966313123703003, + "learning_rate": 6.9846455788935376e-06, + "loss": 1.3325, + "mean_token_accuracy": 0.6538062343994776, + "num_tokens": 2008410271.0, + "step": 11977 + }, + { + "entropy": 1.7069965600967407, + "epoch": 1.3158386201971932, + "grad_norm": 0.6595732569694519, + "learning_rate": 6.983212699468035e-06, + "loss": 1.5271, + "mean_token_accuracy": 0.6532554477453232, + "num_tokens": 2008581622.0, + "step": 11978 + }, + { + "entropy": 1.6654066642125447, + "epoch": 1.315948477108566, + "grad_norm": 0.6108769774436951, + "learning_rate": 6.981779947172047e-06, + "loss": 1.4571, + "mean_token_accuracy": 0.6504184703032175, + "num_tokens": 2008773849.0, + "step": 11979 + }, + { + "entropy": 1.65288241704305, + "epoch": 1.316058334019939, + "grad_norm": 0.7052204608917236, + "learning_rate": 6.980347322050905e-06, + "loss": 1.2769, + "mean_token_accuracy": 0.6784281581640244, + "num_tokens": 2008965203.0, + "step": 11980 + }, + { + "entropy": 1.6817485094070435, + "epoch": 1.3161681909313119, + "grad_norm": 0.5673655867576599, + "learning_rate": 6.97891482414996e-06, + "loss": 1.3401, + "mean_token_accuracy": 0.6604795058568319, + "num_tokens": 2009142441.0, + "step": 11981 + }, + { + "entropy": 1.7240809003512065, + "epoch": 1.316278047842685, + "grad_norm": 0.6513432860374451, + "learning_rate": 6.9774824535145525e-06, + "loss": 1.4674, + "mean_token_accuracy": 0.6503648559252421, + "num_tokens": 2009350171.0, + "step": 11982 + }, + { + "entropy": 1.687408596277237, + "epoch": 1.3163879047540579, + "grad_norm": 1.6337801218032837, + "learning_rate": 6.976050210190013e-06, + "loss": 1.3665, + "mean_token_accuracy": 0.6606544703245163, + "num_tokens": 2009555178.0, + "step": 11983 + }, + { + "entropy": 1.7758075793584187, + "epoch": 1.3164977616654308, + "grad_norm": 0.7786139249801636, + "learning_rate": 6.9746180942216676e-06, + "loss": 1.3912, + "mean_token_accuracy": 0.657907764116923, + "num_tokens": 2009757056.0, + "step": 11984 + }, + { + "entropy": 1.7262560923894246, + "epoch": 1.3166076185768036, + "grad_norm": 0.5811822414398193, + "learning_rate": 6.973186105654849e-06, + "loss": 1.4709, + "mean_token_accuracy": 0.6334926833709081, + "num_tokens": 2009978729.0, + "step": 11985 + }, + { + "entropy": 1.7114702463150024, + "epoch": 1.3167174754881765, + "grad_norm": 0.6832294464111328, + "learning_rate": 6.971754244534872e-06, + "loss": 1.3515, + "mean_token_accuracy": 0.6603354662656784, + "num_tokens": 2010165090.0, + "step": 11986 + }, + { + "entropy": 1.7641779085000355, + "epoch": 1.3168273323995496, + "grad_norm": 0.694060742855072, + "learning_rate": 6.97032251090706e-06, + "loss": 1.3409, + "mean_token_accuracy": 0.6643087863922119, + "num_tokens": 2010274509.0, + "step": 11987 + }, + { + "entropy": 1.6871871054172516, + "epoch": 1.3169371893109225, + "grad_norm": 0.7732803821563721, + "learning_rate": 6.9688909048167265e-06, + "loss": 1.2772, + "mean_token_accuracy": 0.6672064363956451, + "num_tokens": 2010382893.0, + "step": 11988 + }, + { + "entropy": 1.679459939400355, + "epoch": 1.3170470462222954, + "grad_norm": 0.6008415818214417, + "learning_rate": 6.967459426309175e-06, + "loss": 1.3141, + "mean_token_accuracy": 0.6672980437676111, + "num_tokens": 2010528829.0, + "step": 11989 + }, + { + "entropy": 1.6381245056788127, + "epoch": 1.3171569031336685, + "grad_norm": 1.6934006214141846, + "learning_rate": 6.966028075429716e-06, + "loss": 1.0885, + "mean_token_accuracy": 0.6874684443076452, + "num_tokens": 2010687460.0, + "step": 11990 + }, + { + "entropy": 1.7382714649041493, + "epoch": 1.3172667600450414, + "grad_norm": 0.7133601307868958, + "learning_rate": 6.9645968522236576e-06, + "loss": 1.4665, + "mean_token_accuracy": 0.6435732394456863, + "num_tokens": 2010850045.0, + "step": 11991 + }, + { + "entropy": 1.736552745103836, + "epoch": 1.3173766169564143, + "grad_norm": 0.8156201243400574, + "learning_rate": 6.963165756736283e-06, + "loss": 1.3862, + "mean_token_accuracy": 0.6678305069605509, + "num_tokens": 2010981868.0, + "step": 11992 + }, + { + "entropy": 1.670212835073471, + "epoch": 1.3174864738677872, + "grad_norm": 0.6825160384178162, + "learning_rate": 6.961734789012895e-06, + "loss": 1.3759, + "mean_token_accuracy": 0.6739961455265681, + "num_tokens": 2011137705.0, + "step": 11993 + }, + { + "entropy": 1.7107898096243541, + "epoch": 1.31759633077916, + "grad_norm": 0.6937064528465271, + "learning_rate": 6.9603039490987834e-06, + "loss": 1.3964, + "mean_token_accuracy": 0.6487229913473129, + "num_tokens": 2011327696.0, + "step": 11994 + }, + { + "entropy": 1.7345775763193767, + "epoch": 1.3177061876905332, + "grad_norm": 0.7579020857810974, + "learning_rate": 6.958873237039231e-06, + "loss": 1.3378, + "mean_token_accuracy": 0.6619026213884354, + "num_tokens": 2011466276.0, + "step": 11995 + }, + { + "entropy": 1.6762347221374512, + "epoch": 1.317816044601906, + "grad_norm": 0.7532185912132263, + "learning_rate": 6.957442652879516e-06, + "loss": 1.3609, + "mean_token_accuracy": 0.6672007888555527, + "num_tokens": 2011583262.0, + "step": 11996 + }, + { + "entropy": 1.6885204215844472, + "epoch": 1.317925901513279, + "grad_norm": 0.8010686635971069, + "learning_rate": 6.956012196664925e-06, + "loss": 1.4232, + "mean_token_accuracy": 0.6506709555784861, + "num_tokens": 2011723412.0, + "step": 11997 + }, + { + "entropy": 1.687297483285268, + "epoch": 1.3180357584246518, + "grad_norm": 0.6130065321922302, + "learning_rate": 6.95458186844072e-06, + "loss": 1.4189, + "mean_token_accuracy": 0.6468682587146759, + "num_tokens": 2011927534.0, + "step": 11998 + }, + { + "entropy": 1.6592112084229786, + "epoch": 1.3181456153360247, + "grad_norm": 0.6361923217773438, + "learning_rate": 6.9531516682521805e-06, + "loss": 1.3959, + "mean_token_accuracy": 0.6538289586702982, + "num_tokens": 2012095512.0, + "step": 11999 + }, + { + "entropy": 1.682155708471934, + "epoch": 1.3182554722473978, + "grad_norm": 0.6887022852897644, + "learning_rate": 6.951721596144566e-06, + "loss": 1.4071, + "mean_token_accuracy": 0.6655903309583664, + "num_tokens": 2012261571.0, + "step": 12000 + }, + { + "entropy": 1.636295755704244, + "epoch": 1.3183653291587707, + "grad_norm": 0.8937088847160339, + "learning_rate": 6.950291652163137e-06, + "loss": 1.4039, + "mean_token_accuracy": 0.6635189006725947, + "num_tokens": 2012434032.0, + "step": 12001 + }, + { + "entropy": 1.7253755927085876, + "epoch": 1.3184751860701436, + "grad_norm": 0.6961193680763245, + "learning_rate": 6.9488618363531515e-06, + "loss": 1.4444, + "mean_token_accuracy": 0.6477092305819193, + "num_tokens": 2012624662.0, + "step": 12002 + }, + { + "entropy": 1.6379418571790059, + "epoch": 1.3185850429815167, + "grad_norm": 0.7339573502540588, + "learning_rate": 6.947432148759871e-06, + "loss": 1.2261, + "mean_token_accuracy": 0.6745143185059229, + "num_tokens": 2012745932.0, + "step": 12003 + }, + { + "entropy": 1.6386353770891826, + "epoch": 1.3186948998928896, + "grad_norm": 0.6927421689033508, + "learning_rate": 6.946002589428528e-06, + "loss": 1.3068, + "mean_token_accuracy": 0.6671117693185806, + "num_tokens": 2012919590.0, + "step": 12004 + }, + { + "entropy": 1.7081229587395985, + "epoch": 1.3188047568042625, + "grad_norm": 0.7358942627906799, + "learning_rate": 6.9445731584043776e-06, + "loss": 1.4894, + "mean_token_accuracy": 0.6323947161436081, + "num_tokens": 2013142770.0, + "step": 12005 + }, + { + "entropy": 1.6147757669289906, + "epoch": 1.3189146137156353, + "grad_norm": 0.6353159546852112, + "learning_rate": 6.943143855732662e-06, + "loss": 1.2623, + "mean_token_accuracy": 0.6711504012346268, + "num_tokens": 2013315444.0, + "step": 12006 + }, + { + "entropy": 1.7047906319300334, + "epoch": 1.3190244706270082, + "grad_norm": 0.8190060257911682, + "learning_rate": 6.941714681458617e-06, + "loss": 1.3866, + "mean_token_accuracy": 0.6565055847167969, + "num_tokens": 2013477090.0, + "step": 12007 + }, + { + "entropy": 1.734445333480835, + "epoch": 1.3191343275383813, + "grad_norm": 0.5961945056915283, + "learning_rate": 6.940285635627468e-06, + "loss": 1.4759, + "mean_token_accuracy": 0.6589976797501246, + "num_tokens": 2013679313.0, + "step": 12008 + }, + { + "entropy": 1.7090435028076172, + "epoch": 1.3192441844497542, + "grad_norm": 0.7113987803459167, + "learning_rate": 6.9388567182844545e-06, + "loss": 1.5274, + "mean_token_accuracy": 0.6512242555618286, + "num_tokens": 2013869597.0, + "step": 12009 + }, + { + "entropy": 1.708377093076706, + "epoch": 1.319354041361127, + "grad_norm": 0.6466943025588989, + "learning_rate": 6.9374279294747914e-06, + "loss": 1.3696, + "mean_token_accuracy": 0.657651330033938, + "num_tokens": 2014060463.0, + "step": 12010 + }, + { + "entropy": 1.7558682362238567, + "epoch": 1.3194638982725, + "grad_norm": 0.6701050400733948, + "learning_rate": 6.9359992692437074e-06, + "loss": 1.5358, + "mean_token_accuracy": 0.6455720663070679, + "num_tokens": 2014230820.0, + "step": 12011 + }, + { + "entropy": 1.716192901134491, + "epoch": 1.3195737551838729, + "grad_norm": 0.6614550948143005, + "learning_rate": 6.934570737636415e-06, + "loss": 1.3733, + "mean_token_accuracy": 0.6561314910650253, + "num_tokens": 2014358023.0, + "step": 12012 + }, + { + "entropy": 1.7146589954694111, + "epoch": 1.319683612095246, + "grad_norm": 0.8411096930503845, + "learning_rate": 6.933142334698126e-06, + "loss": 1.504, + "mean_token_accuracy": 0.6503568341334661, + "num_tokens": 2014493658.0, + "step": 12013 + }, + { + "entropy": 1.656291385491689, + "epoch": 1.3197934690066189, + "grad_norm": 0.6897427439689636, + "learning_rate": 6.931714060474051e-06, + "loss": 1.3497, + "mean_token_accuracy": 0.6640297720829645, + "num_tokens": 2014639448.0, + "step": 12014 + }, + { + "entropy": 1.7179415325323741, + "epoch": 1.3199033259179918, + "grad_norm": 0.8011785745620728, + "learning_rate": 6.930285915009391e-06, + "loss": 1.4384, + "mean_token_accuracy": 0.6595332821210226, + "num_tokens": 2014781669.0, + "step": 12015 + }, + { + "entropy": 1.6988608439763386, + "epoch": 1.3200131828293649, + "grad_norm": 0.7006280422210693, + "learning_rate": 6.928857898349347e-06, + "loss": 1.4754, + "mean_token_accuracy": 0.6605327129364014, + "num_tokens": 2014956142.0, + "step": 12016 + }, + { + "entropy": 1.7000405689080555, + "epoch": 1.3201230397407377, + "grad_norm": 0.8182290196418762, + "learning_rate": 6.927430010539115e-06, + "loss": 1.5508, + "mean_token_accuracy": 0.6524588018655777, + "num_tokens": 2015126652.0, + "step": 12017 + }, + { + "entropy": 1.6618265112241108, + "epoch": 1.3202328966521106, + "grad_norm": 0.7501579523086548, + "learning_rate": 6.9260022516238915e-06, + "loss": 1.402, + "mean_token_accuracy": 0.6613495101531347, + "num_tokens": 2015257773.0, + "step": 12018 + }, + { + "entropy": 1.7788889408111572, + "epoch": 1.3203427535634835, + "grad_norm": 0.7341859936714172, + "learning_rate": 6.924574621648861e-06, + "loss": 1.3527, + "mean_token_accuracy": 0.6608054389556249, + "num_tokens": 2015397071.0, + "step": 12019 + }, + { + "entropy": 1.7242496609687805, + "epoch": 1.3204526104748564, + "grad_norm": 0.6629573702812195, + "learning_rate": 6.923147120659204e-06, + "loss": 1.4938, + "mean_token_accuracy": 0.6375502049922943, + "num_tokens": 2015580976.0, + "step": 12020 + }, + { + "entropy": 1.6560555597146351, + "epoch": 1.3205624673862295, + "grad_norm": 0.6642670035362244, + "learning_rate": 6.921719748700107e-06, + "loss": 1.3582, + "mean_token_accuracy": 0.6535757084687551, + "num_tokens": 2015763946.0, + "step": 12021 + }, + { + "entropy": 1.768319457769394, + "epoch": 1.3206723242976024, + "grad_norm": 0.7571823596954346, + "learning_rate": 6.9202925058167395e-06, + "loss": 1.41, + "mean_token_accuracy": 0.64376833041509, + "num_tokens": 2015909480.0, + "step": 12022 + }, + { + "entropy": 1.6888968745867412, + "epoch": 1.3207821812089753, + "grad_norm": 0.6024224758148193, + "learning_rate": 6.918865392054276e-06, + "loss": 1.4121, + "mean_token_accuracy": 0.6446995933850607, + "num_tokens": 2016132439.0, + "step": 12023 + }, + { + "entropy": 1.6744337181250255, + "epoch": 1.3208920381203482, + "grad_norm": 0.6757270097732544, + "learning_rate": 6.917438407457888e-06, + "loss": 1.4779, + "mean_token_accuracy": 0.6477493494749069, + "num_tokens": 2016327301.0, + "step": 12024 + }, + { + "entropy": 1.6705586810906727, + "epoch": 1.321001895031721, + "grad_norm": 0.6879779696464539, + "learning_rate": 6.916011552072729e-06, + "loss": 1.3824, + "mean_token_accuracy": 0.6512851764758428, + "num_tokens": 2016514623.0, + "step": 12025 + }, + { + "entropy": 1.6962314943472545, + "epoch": 1.3211117519430942, + "grad_norm": 0.7470236420631409, + "learning_rate": 6.9145848259439676e-06, + "loss": 1.3411, + "mean_token_accuracy": 0.6559085547924042, + "num_tokens": 2016664612.0, + "step": 12026 + }, + { + "entropy": 1.6964036126931508, + "epoch": 1.321221608854467, + "grad_norm": 0.6507591009140015, + "learning_rate": 6.913158229116755e-06, + "loss": 1.3099, + "mean_token_accuracy": 0.6590965191523234, + "num_tokens": 2016862328.0, + "step": 12027 + }, + { + "entropy": 1.6942812999089558, + "epoch": 1.32133146576584, + "grad_norm": 0.8158959150314331, + "learning_rate": 6.911731761636241e-06, + "loss": 1.2446, + "mean_token_accuracy": 0.6787735968828201, + "num_tokens": 2016988531.0, + "step": 12028 + }, + { + "entropy": 1.6633458336194356, + "epoch": 1.321441322677213, + "grad_norm": 0.7270153760910034, + "learning_rate": 6.910305423547574e-06, + "loss": 1.4116, + "mean_token_accuracy": 0.6561925808588663, + "num_tokens": 2017194398.0, + "step": 12029 + }, + { + "entropy": 1.7125836710135143, + "epoch": 1.321551179588586, + "grad_norm": 0.7208383083343506, + "learning_rate": 6.908879214895902e-06, + "loss": 1.5425, + "mean_token_accuracy": 0.629949559768041, + "num_tokens": 2017344053.0, + "step": 12030 + }, + { + "entropy": 1.6911011735598247, + "epoch": 1.3216610364999588, + "grad_norm": 0.6760180592536926, + "learning_rate": 6.907453135726358e-06, + "loss": 1.2465, + "mean_token_accuracy": 0.6811218510071436, + "num_tokens": 2017484897.0, + "step": 12031 + }, + { + "entropy": 1.7392044166723888, + "epoch": 1.3217708934113317, + "grad_norm": 0.6357057094573975, + "learning_rate": 6.906027186084079e-06, + "loss": 1.4049, + "mean_token_accuracy": 0.6535494228204092, + "num_tokens": 2017680899.0, + "step": 12032 + }, + { + "entropy": 1.7378877997398376, + "epoch": 1.3218807503227046, + "grad_norm": 0.7144643664360046, + "learning_rate": 6.9046013660141895e-06, + "loss": 1.4085, + "mean_token_accuracy": 0.6645797441403071, + "num_tokens": 2017822147.0, + "step": 12033 + }, + { + "entropy": 1.67686927318573, + "epoch": 1.3219906072340777, + "grad_norm": 0.6405379772186279, + "learning_rate": 6.903175675561823e-06, + "loss": 1.5225, + "mean_token_accuracy": 0.6583962291479111, + "num_tokens": 2018060459.0, + "step": 12034 + }, + { + "entropy": 1.7921959658463795, + "epoch": 1.3221004641454506, + "grad_norm": 0.6897278428077698, + "learning_rate": 6.901750114772107e-06, + "loss": 1.5251, + "mean_token_accuracy": 0.6415904760360718, + "num_tokens": 2018209159.0, + "step": 12035 + }, + { + "entropy": 1.748464286327362, + "epoch": 1.3222103210568235, + "grad_norm": 0.7679427862167358, + "learning_rate": 6.900324683690145e-06, + "loss": 1.2433, + "mean_token_accuracy": 0.6749467998743057, + "num_tokens": 2018314401.0, + "step": 12036 + }, + { + "entropy": 1.808843304713567, + "epoch": 1.3223201779681963, + "grad_norm": 0.6774364709854126, + "learning_rate": 6.8988993823610595e-06, + "loss": 1.5223, + "mean_token_accuracy": 0.646757240096728, + "num_tokens": 2018473445.0, + "step": 12037 + }, + { + "entropy": 1.7024830679098766, + "epoch": 1.3224300348795692, + "grad_norm": 0.6565459966659546, + "learning_rate": 6.897474210829965e-06, + "loss": 1.4297, + "mean_token_accuracy": 0.6503916382789612, + "num_tokens": 2018649350.0, + "step": 12038 + }, + { + "entropy": 1.6916101773579915, + "epoch": 1.3225398917909423, + "grad_norm": 0.7631819248199463, + "learning_rate": 6.896049169141964e-06, + "loss": 1.4192, + "mean_token_accuracy": 0.6639653344949087, + "num_tokens": 2018818121.0, + "step": 12039 + }, + { + "entropy": 1.6281659305095673, + "epoch": 1.3226497487023152, + "grad_norm": 0.6123877167701721, + "learning_rate": 6.894624257342153e-06, + "loss": 1.3443, + "mean_token_accuracy": 0.6637972791989645, + "num_tokens": 2018982855.0, + "step": 12040 + }, + { + "entropy": 1.6332585612932842, + "epoch": 1.322759605613688, + "grad_norm": 0.6782002449035645, + "learning_rate": 6.893199475475638e-06, + "loss": 1.4276, + "mean_token_accuracy": 0.6608372827370962, + "num_tokens": 2019181386.0, + "step": 12041 + }, + { + "entropy": 1.6969127257664998, + "epoch": 1.3228694625250612, + "grad_norm": 0.751428484916687, + "learning_rate": 6.891774823587505e-06, + "loss": 1.4005, + "mean_token_accuracy": 0.6656929155190786, + "num_tokens": 2019326811.0, + "step": 12042 + }, + { + "entropy": 1.6707496643066406, + "epoch": 1.322979319436434, + "grad_norm": 0.6801748871803284, + "learning_rate": 6.890350301722852e-06, + "loss": 1.405, + "mean_token_accuracy": 0.6508347243070602, + "num_tokens": 2019497430.0, + "step": 12043 + }, + { + "entropy": 1.770410180091858, + "epoch": 1.323089176347807, + "grad_norm": 0.6442971229553223, + "learning_rate": 6.888925909926758e-06, + "loss": 1.4553, + "mean_token_accuracy": 0.6529978712399801, + "num_tokens": 2019663455.0, + "step": 12044 + }, + { + "entropy": 1.7813760836919148, + "epoch": 1.3231990332591799, + "grad_norm": 0.7301626801490784, + "learning_rate": 6.887501648244306e-06, + "loss": 1.3938, + "mean_token_accuracy": 0.6543681472539902, + "num_tokens": 2019814550.0, + "step": 12045 + }, + { + "entropy": 1.7918006579081218, + "epoch": 1.3233088901705528, + "grad_norm": 0.7231292724609375, + "learning_rate": 6.886077516720572e-06, + "loss": 1.614, + "mean_token_accuracy": 0.6222240428129832, + "num_tokens": 2020042991.0, + "step": 12046 + }, + { + "entropy": 1.6463509897391002, + "epoch": 1.3234187470819259, + "grad_norm": 0.6251701712608337, + "learning_rate": 6.8846535154006385e-06, + "loss": 1.3859, + "mean_token_accuracy": 0.667206252614657, + "num_tokens": 2020213466.0, + "step": 12047 + }, + { + "entropy": 1.739404598871867, + "epoch": 1.3235286039932987, + "grad_norm": 0.7523561716079712, + "learning_rate": 6.8832296443295585e-06, + "loss": 1.4522, + "mean_token_accuracy": 0.648137629032135, + "num_tokens": 2020366099.0, + "step": 12048 + }, + { + "entropy": 1.6742408871650696, + "epoch": 1.3236384609046716, + "grad_norm": 0.7165248394012451, + "learning_rate": 6.881805903552408e-06, + "loss": 1.4481, + "mean_token_accuracy": 0.6673527806997299, + "num_tokens": 2020537306.0, + "step": 12049 + }, + { + "entropy": 1.7045758267243702, + "epoch": 1.3237483178160447, + "grad_norm": 0.7423866987228394, + "learning_rate": 6.880382293114245e-06, + "loss": 1.4574, + "mean_token_accuracy": 0.6491605639457703, + "num_tokens": 2020660666.0, + "step": 12050 + }, + { + "entropy": 1.7853013277053833, + "epoch": 1.3238581747274174, + "grad_norm": 0.6766754984855652, + "learning_rate": 6.878958813060127e-06, + "loss": 1.3687, + "mean_token_accuracy": 0.6513369977474213, + "num_tokens": 2020832651.0, + "step": 12051 + }, + { + "entropy": 1.657790740331014, + "epoch": 1.3239680316387905, + "grad_norm": 0.6425931453704834, + "learning_rate": 6.877535463435103e-06, + "loss": 1.3053, + "mean_token_accuracy": 0.6669684946537018, + "num_tokens": 2020962525.0, + "step": 12052 + }, + { + "entropy": 1.6860653658707936, + "epoch": 1.3240778885501634, + "grad_norm": 0.6832910180091858, + "learning_rate": 6.876112244284228e-06, + "loss": 1.2645, + "mean_token_accuracy": 0.6669280380010605, + "num_tokens": 2021106646.0, + "step": 12053 + }, + { + "entropy": 1.693892925977707, + "epoch": 1.3241877454615363, + "grad_norm": 0.5836326479911804, + "learning_rate": 6.874689155652537e-06, + "loss": 1.4917, + "mean_token_accuracy": 0.6535281638304392, + "num_tokens": 2021316957.0, + "step": 12054 + }, + { + "entropy": 1.7312416632970173, + "epoch": 1.3242976023729094, + "grad_norm": 0.9197054505348206, + "learning_rate": 6.873266197585079e-06, + "loss": 1.4773, + "mean_token_accuracy": 0.6464556207259496, + "num_tokens": 2021458891.0, + "step": 12055 + }, + { + "entropy": 1.6423344214757283, + "epoch": 1.3244074592842823, + "grad_norm": 0.5606783628463745, + "learning_rate": 6.8718433701268885e-06, + "loss": 1.2475, + "mean_token_accuracy": 0.6767958799997965, + "num_tokens": 2021632253.0, + "step": 12056 + }, + { + "entropy": 1.7067996362845104, + "epoch": 1.3245173161956552, + "grad_norm": 0.5830101370811462, + "learning_rate": 6.870420673322988e-06, + "loss": 1.393, + "mean_token_accuracy": 0.6444303045670191, + "num_tokens": 2021840691.0, + "step": 12057 + }, + { + "entropy": 1.745294988155365, + "epoch": 1.324627173107028, + "grad_norm": 0.6736421585083008, + "learning_rate": 6.8689981072184166e-06, + "loss": 1.402, + "mean_token_accuracy": 0.6488266239563624, + "num_tokens": 2021994717.0, + "step": 12058 + }, + { + "entropy": 1.7859689891338348, + "epoch": 1.324737030018401, + "grad_norm": 0.7754059433937073, + "learning_rate": 6.867575671858197e-06, + "loss": 1.4224, + "mean_token_accuracy": 0.641977791984876, + "num_tokens": 2022184817.0, + "step": 12059 + }, + { + "entropy": 1.6727923055489857, + "epoch": 1.324846886929774, + "grad_norm": 0.6476246118545532, + "learning_rate": 6.86615336728734e-06, + "loss": 1.3758, + "mean_token_accuracy": 0.6541973451773325, + "num_tokens": 2022383266.0, + "step": 12060 + }, + { + "entropy": 1.7898275355497997, + "epoch": 1.324956743841147, + "grad_norm": 0.8410981297492981, + "learning_rate": 6.864731193550867e-06, + "loss": 1.3628, + "mean_token_accuracy": 0.6513276447852453, + "num_tokens": 2022514087.0, + "step": 12061 + }, + { + "entropy": 1.658981204032898, + "epoch": 1.3250666007525198, + "grad_norm": 0.7151813507080078, + "learning_rate": 6.863309150693789e-06, + "loss": 1.3358, + "mean_token_accuracy": 0.6637451301018397, + "num_tokens": 2022697233.0, + "step": 12062 + }, + { + "entropy": 1.723794678846995, + "epoch": 1.325176457663893, + "grad_norm": 0.706913411617279, + "learning_rate": 6.861887238761116e-06, + "loss": 1.4122, + "mean_token_accuracy": 0.6545346329609553, + "num_tokens": 2022854156.0, + "step": 12063 + }, + { + "entropy": 1.7677522897720337, + "epoch": 1.3252863145752656, + "grad_norm": 0.6113898158073425, + "learning_rate": 6.86046545779784e-06, + "loss": 1.5288, + "mean_token_accuracy": 0.6368463883797327, + "num_tokens": 2023116448.0, + "step": 12064 + }, + { + "entropy": 1.7364682853221893, + "epoch": 1.3253961714866387, + "grad_norm": 0.6671069860458374, + "learning_rate": 6.859043807848973e-06, + "loss": 1.4571, + "mean_token_accuracy": 0.6500351677338282, + "num_tokens": 2023293074.0, + "step": 12065 + }, + { + "entropy": 1.726439744234085, + "epoch": 1.3255060283980116, + "grad_norm": 0.6300005912780762, + "learning_rate": 6.8576222889595e-06, + "loss": 1.3818, + "mean_token_accuracy": 0.6553014020125071, + "num_tokens": 2023473442.0, + "step": 12066 + }, + { + "entropy": 1.7023302714029949, + "epoch": 1.3256158853093845, + "grad_norm": 0.7470946311950684, + "learning_rate": 6.856200901174417e-06, + "loss": 1.5458, + "mean_token_accuracy": 0.6382872660954794, + "num_tokens": 2023630081.0, + "step": 12067 + }, + { + "entropy": 1.7234592040379841, + "epoch": 1.3257257422207576, + "grad_norm": 0.7340896129608154, + "learning_rate": 6.854779644538708e-06, + "loss": 1.4096, + "mean_token_accuracy": 0.6467949201663336, + "num_tokens": 2023806667.0, + "step": 12068 + }, + { + "entropy": 1.735681543747584, + "epoch": 1.3258355991321304, + "grad_norm": 0.7612413763999939, + "learning_rate": 6.853358519097353e-06, + "loss": 1.5394, + "mean_token_accuracy": 0.6362536748250326, + "num_tokens": 2023996526.0, + "step": 12069 + }, + { + "entropy": 1.6876520315806072, + "epoch": 1.3259454560435033, + "grad_norm": 0.7015395164489746, + "learning_rate": 6.851937524895334e-06, + "loss": 1.3935, + "mean_token_accuracy": 0.6509424696365992, + "num_tokens": 2024153887.0, + "step": 12070 + }, + { + "entropy": 1.7338077227274578, + "epoch": 1.3260553129548762, + "grad_norm": 0.7074576020240784, + "learning_rate": 6.850516661977626e-06, + "loss": 1.4231, + "mean_token_accuracy": 0.6674382239580154, + "num_tokens": 2024314815.0, + "step": 12071 + }, + { + "entropy": 1.7138215899467468, + "epoch": 1.326165169866249, + "grad_norm": 0.7921140789985657, + "learning_rate": 6.849095930389193e-06, + "loss": 1.3343, + "mean_token_accuracy": 0.6653185288111368, + "num_tokens": 2024439564.0, + "step": 12072 + }, + { + "entropy": 1.7277231812477112, + "epoch": 1.3262750267776222, + "grad_norm": 0.6742545366287231, + "learning_rate": 6.847675330175001e-06, + "loss": 1.5814, + "mean_token_accuracy": 0.6412968585888544, + "num_tokens": 2024634239.0, + "step": 12073 + }, + { + "entropy": 1.6866126358509064, + "epoch": 1.326384883688995, + "grad_norm": 0.6755991578102112, + "learning_rate": 6.8462548613800176e-06, + "loss": 1.3887, + "mean_token_accuracy": 0.6467768748601278, + "num_tokens": 2024837409.0, + "step": 12074 + }, + { + "entropy": 1.763855755329132, + "epoch": 1.326494740600368, + "grad_norm": 0.8013515472412109, + "learning_rate": 6.844834524049198e-06, + "loss": 1.4646, + "mean_token_accuracy": 0.6409885436296463, + "num_tokens": 2025036012.0, + "step": 12075 + }, + { + "entropy": 1.765538622935613, + "epoch": 1.326604597511741, + "grad_norm": 0.6882338523864746, + "learning_rate": 6.843414318227486e-06, + "loss": 1.3705, + "mean_token_accuracy": 0.6453090657790502, + "num_tokens": 2025164469.0, + "step": 12076 + }, + { + "entropy": 1.6984353959560394, + "epoch": 1.326714454423114, + "grad_norm": 0.7873267531394958, + "learning_rate": 6.8419942439598445e-06, + "loss": 1.5014, + "mean_token_accuracy": 0.6472826500733694, + "num_tokens": 2025344537.0, + "step": 12077 + }, + { + "entropy": 1.7276420692602794, + "epoch": 1.3268243113344869, + "grad_norm": 0.6308796405792236, + "learning_rate": 6.8405743012912074e-06, + "loss": 1.3326, + "mean_token_accuracy": 0.6528024027744929, + "num_tokens": 2025488514.0, + "step": 12078 + }, + { + "entropy": 1.7100607454776764, + "epoch": 1.3269341682458597, + "grad_norm": 0.6320695877075195, + "learning_rate": 6.839154490266521e-06, + "loss": 1.377, + "mean_token_accuracy": 0.6606988708178202, + "num_tokens": 2025632637.0, + "step": 12079 + }, + { + "entropy": 1.730517605940501, + "epoch": 1.3270440251572326, + "grad_norm": 0.9139355421066284, + "learning_rate": 6.837734810930722e-06, + "loss": 1.242, + "mean_token_accuracy": 0.68810007472833, + "num_tokens": 2025747502.0, + "step": 12080 + }, + { + "entropy": 1.695272147655487, + "epoch": 1.3271538820686057, + "grad_norm": 0.6435163021087646, + "learning_rate": 6.836315263328737e-06, + "loss": 1.4986, + "mean_token_accuracy": 0.6545588473478953, + "num_tokens": 2025962120.0, + "step": 12081 + }, + { + "entropy": 1.6857863465944927, + "epoch": 1.3272637389799786, + "grad_norm": 0.7062543630599976, + "learning_rate": 6.834895847505496e-06, + "loss": 1.3823, + "mean_token_accuracy": 0.6642873833576838, + "num_tokens": 2026100917.0, + "step": 12082 + }, + { + "entropy": 1.6795487602551777, + "epoch": 1.3273735958913515, + "grad_norm": 0.6519520282745361, + "learning_rate": 6.833476563505934e-06, + "loss": 1.454, + "mean_token_accuracy": 0.6510899215936661, + "num_tokens": 2026281990.0, + "step": 12083 + }, + { + "entropy": 1.7207885682582855, + "epoch": 1.3274834528027244, + "grad_norm": 0.5761967301368713, + "learning_rate": 6.8320574113749535e-06, + "loss": 1.4114, + "mean_token_accuracy": 0.6519269794225693, + "num_tokens": 2026492957.0, + "step": 12084 + }, + { + "entropy": 1.7370118896166484, + "epoch": 1.3275933097140973, + "grad_norm": 1.0227420330047607, + "learning_rate": 6.830638391157478e-06, + "loss": 1.3871, + "mean_token_accuracy": 0.6556207984685898, + "num_tokens": 2026647880.0, + "step": 12085 + }, + { + "entropy": 1.7096319099267323, + "epoch": 1.3277031666254704, + "grad_norm": 0.6812415719032288, + "learning_rate": 6.829219502898421e-06, + "loss": 1.3995, + "mean_token_accuracy": 0.6833713253339132, + "num_tokens": 2026809514.0, + "step": 12086 + }, + { + "entropy": 1.704438676436742, + "epoch": 1.3278130235368433, + "grad_norm": 0.7178927063941956, + "learning_rate": 6.827800746642688e-06, + "loss": 1.5393, + "mean_token_accuracy": 0.6350291073322296, + "num_tokens": 2027020403.0, + "step": 12087 + }, + { + "entropy": 1.6621620655059814, + "epoch": 1.3279228804482162, + "grad_norm": 0.7681390643119812, + "learning_rate": 6.826382122435178e-06, + "loss": 1.3886, + "mean_token_accuracy": 0.6718294769525528, + "num_tokens": 2027223142.0, + "step": 12088 + }, + { + "entropy": 1.6573795974254608, + "epoch": 1.3280327373595893, + "grad_norm": 0.6329140067100525, + "learning_rate": 6.824963630320798e-06, + "loss": 1.3725, + "mean_token_accuracy": 0.6754241387049357, + "num_tokens": 2027379460.0, + "step": 12089 + }, + { + "entropy": 1.6735499898592632, + "epoch": 1.3281425942709622, + "grad_norm": 0.780312180519104, + "learning_rate": 6.823545270344432e-06, + "loss": 1.3158, + "mean_token_accuracy": 0.6761320730050405, + "num_tokens": 2027511602.0, + "step": 12090 + }, + { + "entropy": 1.7244892517725627, + "epoch": 1.328252451182335, + "grad_norm": 0.796455442905426, + "learning_rate": 6.822127042550983e-06, + "loss": 1.2815, + "mean_token_accuracy": 0.6747524440288544, + "num_tokens": 2027691642.0, + "step": 12091 + }, + { + "entropy": 1.6727336744467418, + "epoch": 1.328362308093708, + "grad_norm": 0.5619511604309082, + "learning_rate": 6.820708946985325e-06, + "loss": 1.3715, + "mean_token_accuracy": 0.6650692274173101, + "num_tokens": 2027869176.0, + "step": 12092 + }, + { + "entropy": 1.6756927768389385, + "epoch": 1.3284721650050808, + "grad_norm": 0.6625829935073853, + "learning_rate": 6.819290983692346e-06, + "loss": 1.2637, + "mean_token_accuracy": 0.6874003559350967, + "num_tokens": 2028019901.0, + "step": 12093 + }, + { + "entropy": 1.6760556896527607, + "epoch": 1.328582021916454, + "grad_norm": 0.6881618499755859, + "learning_rate": 6.817873152716925e-06, + "loss": 1.41, + "mean_token_accuracy": 0.6657233734925588, + "num_tokens": 2028184898.0, + "step": 12094 + }, + { + "entropy": 1.7184965113798778, + "epoch": 1.3286918788278268, + "grad_norm": 0.6784566640853882, + "learning_rate": 6.816455454103936e-06, + "loss": 1.4383, + "mean_token_accuracy": 0.6545319110155106, + "num_tokens": 2028337020.0, + "step": 12095 + }, + { + "entropy": 1.6974100073178608, + "epoch": 1.3288017357391997, + "grad_norm": 0.623323380947113, + "learning_rate": 6.815037887898243e-06, + "loss": 1.5349, + "mean_token_accuracy": 0.6560999403397242, + "num_tokens": 2028506971.0, + "step": 12096 + }, + { + "entropy": 1.6983208358287811, + "epoch": 1.3289115926505726, + "grad_norm": 0.7683018445968628, + "learning_rate": 6.813620454144718e-06, + "loss": 1.3477, + "mean_token_accuracy": 0.663595899939537, + "num_tokens": 2028724476.0, + "step": 12097 + }, + { + "entropy": 1.701758086681366, + "epoch": 1.3290214495619455, + "grad_norm": 0.7559242248535156, + "learning_rate": 6.812203152888216e-06, + "loss": 1.3109, + "mean_token_accuracy": 0.6690565794706345, + "num_tokens": 2028875421.0, + "step": 12098 + }, + { + "entropy": 1.722754289706548, + "epoch": 1.3291313064733186, + "grad_norm": 0.6389537453651428, + "learning_rate": 6.8107859841736e-06, + "loss": 1.3986, + "mean_token_accuracy": 0.6538368314504623, + "num_tokens": 2029057890.0, + "step": 12099 + }, + { + "entropy": 1.7785523335138957, + "epoch": 1.3292411633846914, + "grad_norm": 0.7880353927612305, + "learning_rate": 6.80936894804572e-06, + "loss": 1.3391, + "mean_token_accuracy": 0.6652881453434626, + "num_tokens": 2029173437.0, + "step": 12100 + }, + { + "entropy": 1.6625278691450756, + "epoch": 1.3293510202960643, + "grad_norm": 0.715702474117279, + "learning_rate": 6.807952044549422e-06, + "loss": 1.2885, + "mean_token_accuracy": 0.6675082196791967, + "num_tokens": 2029315644.0, + "step": 12101 + }, + { + "entropy": 1.7816561063130696, + "epoch": 1.3294608772074374, + "grad_norm": 0.6899002194404602, + "learning_rate": 6.806535273729551e-06, + "loss": 1.5393, + "mean_token_accuracy": 0.6315892537434896, + "num_tokens": 2029493296.0, + "step": 12102 + }, + { + "entropy": 1.6816334029038746, + "epoch": 1.3295707341188103, + "grad_norm": 0.6617991328239441, + "learning_rate": 6.8051186356309585e-06, + "loss": 1.2528, + "mean_token_accuracy": 0.6830165733893713, + "num_tokens": 2029629625.0, + "step": 12103 + }, + { + "entropy": 1.750120351711909, + "epoch": 1.3296805910301832, + "grad_norm": 0.7644038796424866, + "learning_rate": 6.803702130298462e-06, + "loss": 1.3048, + "mean_token_accuracy": 0.6720990637938181, + "num_tokens": 2029736612.0, + "step": 12104 + }, + { + "entropy": 1.7711990475654602, + "epoch": 1.329790447941556, + "grad_norm": 0.712631106376648, + "learning_rate": 6.802285757776903e-06, + "loss": 1.4558, + "mean_token_accuracy": 0.6478696018457413, + "num_tokens": 2029871767.0, + "step": 12105 + }, + { + "entropy": 1.7491689622402191, + "epoch": 1.329900304852929, + "grad_norm": 0.6656632423400879, + "learning_rate": 6.800869518111111e-06, + "loss": 1.3455, + "mean_token_accuracy": 0.6632706572612127, + "num_tokens": 2030058925.0, + "step": 12106 + }, + { + "entropy": 1.7960249086221058, + "epoch": 1.330010161764302, + "grad_norm": 0.6363185048103333, + "learning_rate": 6.7994534113459075e-06, + "loss": 1.5365, + "mean_token_accuracy": 0.61135300497214, + "num_tokens": 2030274330.0, + "step": 12107 + }, + { + "entropy": 1.631099820137024, + "epoch": 1.330120018675675, + "grad_norm": 0.7717340588569641, + "learning_rate": 6.798037437526106e-06, + "loss": 1.2993, + "mean_token_accuracy": 0.6846882502237955, + "num_tokens": 2030417966.0, + "step": 12108 + }, + { + "entropy": 1.6861089169979095, + "epoch": 1.3302298755870479, + "grad_norm": 0.8972033262252808, + "learning_rate": 6.796621596696531e-06, + "loss": 1.7368, + "mean_token_accuracy": 0.6355453704794248, + "num_tokens": 2030588343.0, + "step": 12109 + }, + { + "entropy": 1.7588840822378795, + "epoch": 1.3303397324984207, + "grad_norm": 0.6780613660812378, + "learning_rate": 6.795205888901984e-06, + "loss": 1.4806, + "mean_token_accuracy": 0.6419356018304825, + "num_tokens": 2030766964.0, + "step": 12110 + }, + { + "entropy": 1.7469553053379059, + "epoch": 1.3304495894097936, + "grad_norm": 0.6792296767234802, + "learning_rate": 6.793790314187281e-06, + "loss": 1.3987, + "mean_token_accuracy": 0.6516177902619044, + "num_tokens": 2030931496.0, + "step": 12111 + }, + { + "entropy": 1.6458273430665333, + "epoch": 1.3305594463211667, + "grad_norm": 0.8105985522270203, + "learning_rate": 6.792374872597217e-06, + "loss": 1.2897, + "mean_token_accuracy": 0.6808892091115316, + "num_tokens": 2031079780.0, + "step": 12112 + }, + { + "entropy": 1.6807252566019695, + "epoch": 1.3306693032325396, + "grad_norm": 0.7525298595428467, + "learning_rate": 6.79095956417659e-06, + "loss": 1.3477, + "mean_token_accuracy": 0.6601622154315313, + "num_tokens": 2031305389.0, + "step": 12113 + }, + { + "entropy": 1.751152257124583, + "epoch": 1.3307791601439125, + "grad_norm": 1.0267671346664429, + "learning_rate": 6.789544388970196e-06, + "loss": 1.4263, + "mean_token_accuracy": 0.6512612501780192, + "num_tokens": 2031449086.0, + "step": 12114 + }, + { + "entropy": 1.6886617640654247, + "epoch": 1.3308890170552856, + "grad_norm": 0.766525149345398, + "learning_rate": 6.788129347022832e-06, + "loss": 1.2504, + "mean_token_accuracy": 0.6706090221802393, + "num_tokens": 2031571554.0, + "step": 12115 + }, + { + "entropy": 1.7444157203038533, + "epoch": 1.3309988739666585, + "grad_norm": 0.7540661692619324, + "learning_rate": 6.786714438379269e-06, + "loss": 1.3464, + "mean_token_accuracy": 0.6591867109139761, + "num_tokens": 2031697222.0, + "step": 12116 + }, + { + "entropy": 1.717256059249242, + "epoch": 1.3311087308780314, + "grad_norm": 0.7085712552070618, + "learning_rate": 6.7852996630842936e-06, + "loss": 1.4093, + "mean_token_accuracy": 0.6617002884546915, + "num_tokens": 2031842284.0, + "step": 12117 + }, + { + "entropy": 1.6935730675856273, + "epoch": 1.3312185877894043, + "grad_norm": 0.7506963014602661, + "learning_rate": 6.7838850211826925e-06, + "loss": 1.5524, + "mean_token_accuracy": 0.6330506006876627, + "num_tokens": 2032029021.0, + "step": 12118 + }, + { + "entropy": 1.6292231281598408, + "epoch": 1.3313284447007772, + "grad_norm": 0.6500567197799683, + "learning_rate": 6.782470512719227e-06, + "loss": 1.4116, + "mean_token_accuracy": 0.6551267306009928, + "num_tokens": 2032225231.0, + "step": 12119 + }, + { + "entropy": 1.7364420294761658, + "epoch": 1.3314383016121503, + "grad_norm": 0.6082978248596191, + "learning_rate": 6.781056137738667e-06, + "loss": 1.3851, + "mean_token_accuracy": 0.6635348598162333, + "num_tokens": 2032400791.0, + "step": 12120 + }, + { + "entropy": 1.6161598761876423, + "epoch": 1.3315481585235232, + "grad_norm": 0.612918496131897, + "learning_rate": 6.779641896285783e-06, + "loss": 1.3149, + "mean_token_accuracy": 0.6781648695468903, + "num_tokens": 2032538191.0, + "step": 12121 + }, + { + "entropy": 1.726882795492808, + "epoch": 1.331658015434896, + "grad_norm": 0.7016002535820007, + "learning_rate": 6.778227788405325e-06, + "loss": 1.6569, + "mean_token_accuracy": 0.6355594048897425, + "num_tokens": 2032731160.0, + "step": 12122 + }, + { + "entropy": 1.6399606863657634, + "epoch": 1.331767872346269, + "grad_norm": 0.8239472508430481, + "learning_rate": 6.776813814142062e-06, + "loss": 1.2364, + "mean_token_accuracy": 0.6827895094950994, + "num_tokens": 2032887536.0, + "step": 12123 + }, + { + "entropy": 1.7254582345485687, + "epoch": 1.3318777292576418, + "grad_norm": 0.6165110468864441, + "learning_rate": 6.7753999735407375e-06, + "loss": 1.4148, + "mean_token_accuracy": 0.6540853381156921, + "num_tokens": 2033034011.0, + "step": 12124 + }, + { + "entropy": 1.6723759472370148, + "epoch": 1.331987586169015, + "grad_norm": 0.7529140710830688, + "learning_rate": 6.773986266646098e-06, + "loss": 1.4075, + "mean_token_accuracy": 0.66306305428346, + "num_tokens": 2033173974.0, + "step": 12125 + }, + { + "entropy": 1.6746163368225098, + "epoch": 1.3320974430803878, + "grad_norm": 0.6699461936950684, + "learning_rate": 6.772572693502887e-06, + "loss": 1.4146, + "mean_token_accuracy": 0.6633772750695547, + "num_tokens": 2033318698.0, + "step": 12126 + }, + { + "entropy": 1.6714819769064586, + "epoch": 1.3322072999917607, + "grad_norm": 0.8034442663192749, + "learning_rate": 6.771159254155853e-06, + "loss": 1.3074, + "mean_token_accuracy": 0.6753875811894735, + "num_tokens": 2033446101.0, + "step": 12127 + }, + { + "entropy": 1.644019901752472, + "epoch": 1.3323171569031338, + "grad_norm": 0.5550585985183716, + "learning_rate": 6.769745948649717e-06, + "loss": 1.3786, + "mean_token_accuracy": 0.6502327471971512, + "num_tokens": 2033648528.0, + "step": 12128 + }, + { + "entropy": 1.7570864657560985, + "epoch": 1.3324270138145067, + "grad_norm": 0.7444114685058594, + "learning_rate": 6.768332777029214e-06, + "loss": 1.5661, + "mean_token_accuracy": 0.6311159779628118, + "num_tokens": 2033809795.0, + "step": 12129 + }, + { + "entropy": 1.702331284681956, + "epoch": 1.3325368707258796, + "grad_norm": 0.6505382061004639, + "learning_rate": 6.766919739339076e-06, + "loss": 1.4109, + "mean_token_accuracy": 0.6532570620377859, + "num_tokens": 2033966956.0, + "step": 12130 + }, + { + "entropy": 1.6757369736830394, + "epoch": 1.3326467276372524, + "grad_norm": 0.6705249547958374, + "learning_rate": 6.76550683562402e-06, + "loss": 1.3289, + "mean_token_accuracy": 0.6670956462621689, + "num_tokens": 2034141052.0, + "step": 12131 + }, + { + "entropy": 1.7126195033391316, + "epoch": 1.3327565845486253, + "grad_norm": 0.7784457802772522, + "learning_rate": 6.764094065928762e-06, + "loss": 1.3657, + "mean_token_accuracy": 0.6621719797452291, + "num_tokens": 2034413616.0, + "step": 12132 + }, + { + "entropy": 1.6480099658171337, + "epoch": 1.3328664414599984, + "grad_norm": 0.634157657623291, + "learning_rate": 6.762681430298021e-06, + "loss": 1.5479, + "mean_token_accuracy": 0.6490356723467509, + "num_tokens": 2034615609.0, + "step": 12133 + }, + { + "entropy": 1.7340960800647736, + "epoch": 1.3329762983713713, + "grad_norm": 0.7252094149589539, + "learning_rate": 6.7612689287764996e-06, + "loss": 1.3478, + "mean_token_accuracy": 0.6707275907198588, + "num_tokens": 2034723645.0, + "step": 12134 + }, + { + "entropy": 1.6428366204102833, + "epoch": 1.3330861552827442, + "grad_norm": 0.7468758225440979, + "learning_rate": 6.759856561408912e-06, + "loss": 1.3562, + "mean_token_accuracy": 0.659113829334577, + "num_tokens": 2034890797.0, + "step": 12135 + }, + { + "entropy": 1.743541826804479, + "epoch": 1.333196012194117, + "grad_norm": 0.7921319603919983, + "learning_rate": 6.758444328239951e-06, + "loss": 1.2967, + "mean_token_accuracy": 0.6673512558142344, + "num_tokens": 2035037394.0, + "step": 12136 + }, + { + "entropy": 1.7060744762420654, + "epoch": 1.33330586910549, + "grad_norm": 0.8504371047019958, + "learning_rate": 6.757032229314314e-06, + "loss": 1.2957, + "mean_token_accuracy": 0.6720158954461416, + "num_tokens": 2035148382.0, + "step": 12137 + }, + { + "entropy": 1.660773108402888, + "epoch": 1.333415726016863, + "grad_norm": 0.6613491773605347, + "learning_rate": 6.7556202646766955e-06, + "loss": 1.2584, + "mean_token_accuracy": 0.6726939876874288, + "num_tokens": 2035265813.0, + "step": 12138 + }, + { + "entropy": 1.7064545849959056, + "epoch": 1.333525582928236, + "grad_norm": 0.7123542428016663, + "learning_rate": 6.7542084343717885e-06, + "loss": 1.407, + "mean_token_accuracy": 0.6450283875068029, + "num_tokens": 2035442166.0, + "step": 12139 + }, + { + "entropy": 1.759802907705307, + "epoch": 1.3336354398396089, + "grad_norm": 0.6230959296226501, + "learning_rate": 6.752796738444265e-06, + "loss": 1.4219, + "mean_token_accuracy": 0.640018438299497, + "num_tokens": 2035628368.0, + "step": 12140 + }, + { + "entropy": 1.6767445107301076, + "epoch": 1.333745296750982, + "grad_norm": 0.7107008695602417, + "learning_rate": 6.7513851769388105e-06, + "loss": 1.5074, + "mean_token_accuracy": 0.6424607733885447, + "num_tokens": 2035852613.0, + "step": 12141 + }, + { + "entropy": 1.6929062108198802, + "epoch": 1.3338551536623549, + "grad_norm": 0.8039124011993408, + "learning_rate": 6.749973749900104e-06, + "loss": 1.4325, + "mean_token_accuracy": 0.6694301267464956, + "num_tokens": 2036027820.0, + "step": 12142 + }, + { + "entropy": 1.695325791835785, + "epoch": 1.3339650105737277, + "grad_norm": 0.7455415725708008, + "learning_rate": 6.748562457372814e-06, + "loss": 1.599, + "mean_token_accuracy": 0.6340650320053101, + "num_tokens": 2036191571.0, + "step": 12143 + }, + { + "entropy": 1.6283740599950154, + "epoch": 1.3340748674851006, + "grad_norm": 0.597327709197998, + "learning_rate": 6.747151299401602e-06, + "loss": 1.4393, + "mean_token_accuracy": 0.6598650167385737, + "num_tokens": 2036409052.0, + "step": 12144 + }, + { + "entropy": 1.70907461643219, + "epoch": 1.3341847243964735, + "grad_norm": 0.7348134517669678, + "learning_rate": 6.74574027603114e-06, + "loss": 1.3319, + "mean_token_accuracy": 0.6699612587690353, + "num_tokens": 2036548176.0, + "step": 12145 + }, + { + "entropy": 1.750954935948054, + "epoch": 1.3342945813078466, + "grad_norm": 0.6614691019058228, + "learning_rate": 6.744329387306077e-06, + "loss": 1.5194, + "mean_token_accuracy": 0.6467989534139633, + "num_tokens": 2036787561.0, + "step": 12146 + }, + { + "entropy": 1.7040914098421733, + "epoch": 1.3344044382192195, + "grad_norm": 0.8070111274719238, + "learning_rate": 6.742918633271074e-06, + "loss": 1.5117, + "mean_token_accuracy": 0.6465377608935038, + "num_tokens": 2036938048.0, + "step": 12147 + }, + { + "entropy": 1.744322548309962, + "epoch": 1.3345142951305924, + "grad_norm": 0.7317628264427185, + "learning_rate": 6.741508013970779e-06, + "loss": 1.3279, + "mean_token_accuracy": 0.6692080895105997, + "num_tokens": 2037071722.0, + "step": 12148 + }, + { + "entropy": 1.7309604982535045, + "epoch": 1.3346241520419653, + "grad_norm": 0.6659313440322876, + "learning_rate": 6.740097529449833e-06, + "loss": 1.4134, + "mean_token_accuracy": 0.6467402577400208, + "num_tokens": 2037281053.0, + "step": 12149 + }, + { + "entropy": 1.7567891379197438, + "epoch": 1.3347340089533382, + "grad_norm": 0.6446396112442017, + "learning_rate": 6.7386871797528816e-06, + "loss": 1.38, + "mean_token_accuracy": 0.6591022710005442, + "num_tokens": 2037409299.0, + "step": 12150 + }, + { + "entropy": 1.7695753872394562, + "epoch": 1.3348438658647113, + "grad_norm": 0.8622597455978394, + "learning_rate": 6.737276964924564e-06, + "loss": 1.4295, + "mean_token_accuracy": 0.665856863061587, + "num_tokens": 2037543858.0, + "step": 12151 + }, + { + "entropy": 1.6813296675682068, + "epoch": 1.3349537227760842, + "grad_norm": 0.711685299873352, + "learning_rate": 6.735866885009506e-06, + "loss": 1.2672, + "mean_token_accuracy": 0.6845654745896658, + "num_tokens": 2037666476.0, + "step": 12152 + }, + { + "entropy": 1.6936272382736206, + "epoch": 1.335063579687457, + "grad_norm": 0.6655703783035278, + "learning_rate": 6.7344569400523404e-06, + "loss": 1.348, + "mean_token_accuracy": 0.670586441953977, + "num_tokens": 2037839539.0, + "step": 12153 + }, + { + "entropy": 1.659144659837087, + "epoch": 1.3351734365988301, + "grad_norm": 0.6707544922828674, + "learning_rate": 6.733047130097689e-06, + "loss": 1.3097, + "mean_token_accuracy": 0.670238604148229, + "num_tokens": 2037978288.0, + "step": 12154 + }, + { + "entropy": 1.6983022093772888, + "epoch": 1.335283293510203, + "grad_norm": 0.5423887968063354, + "learning_rate": 6.731637455190177e-06, + "loss": 1.4855, + "mean_token_accuracy": 0.6436324616273245, + "num_tokens": 2038175684.0, + "step": 12155 + }, + { + "entropy": 1.6970913509527843, + "epoch": 1.335393150421576, + "grad_norm": 0.6358638405799866, + "learning_rate": 6.730227915374414e-06, + "loss": 1.3425, + "mean_token_accuracy": 0.6590030988057455, + "num_tokens": 2038309359.0, + "step": 12156 + }, + { + "entropy": 1.68005574742953, + "epoch": 1.3355030073329488, + "grad_norm": 0.6394631862640381, + "learning_rate": 6.728818510695012e-06, + "loss": 1.4228, + "mean_token_accuracy": 0.6577940632899603, + "num_tokens": 2038463898.0, + "step": 12157 + }, + { + "entropy": 1.7448046207427979, + "epoch": 1.3356128642443217, + "grad_norm": 0.677924394607544, + "learning_rate": 6.7274092411965795e-06, + "loss": 1.3776, + "mean_token_accuracy": 0.6486985584100088, + "num_tokens": 2038682056.0, + "step": 12158 + }, + { + "entropy": 1.706708659728368, + "epoch": 1.3357227211556948, + "grad_norm": 0.6977136731147766, + "learning_rate": 6.7260001069237265e-06, + "loss": 1.4341, + "mean_token_accuracy": 0.650297815601031, + "num_tokens": 2038849719.0, + "step": 12159 + }, + { + "entropy": 1.7140244444211323, + "epoch": 1.3358325780670677, + "grad_norm": 0.6504734754562378, + "learning_rate": 6.7245911079210365e-06, + "loss": 1.4013, + "mean_token_accuracy": 0.6597279409567515, + "num_tokens": 2039024559.0, + "step": 12160 + }, + { + "entropy": 1.7537720998128254, + "epoch": 1.3359424349784406, + "grad_norm": 0.6768696308135986, + "learning_rate": 6.723182244233111e-06, + "loss": 1.4743, + "mean_token_accuracy": 0.6464989334344864, + "num_tokens": 2039186877.0, + "step": 12161 + }, + { + "entropy": 1.7301738758881886, + "epoch": 1.3360522918898134, + "grad_norm": 0.8379589915275574, + "learning_rate": 6.7217735159045434e-06, + "loss": 1.301, + "mean_token_accuracy": 0.6735063940286636, + "num_tokens": 2039339067.0, + "step": 12162 + }, + { + "entropy": 1.6896904309590657, + "epoch": 1.3361621488011863, + "grad_norm": 0.7300477623939514, + "learning_rate": 6.720364922979918e-06, + "loss": 1.3683, + "mean_token_accuracy": 0.6638319989045461, + "num_tokens": 2039499002.0, + "step": 12163 + }, + { + "entropy": 1.7401621341705322, + "epoch": 1.3362720057125594, + "grad_norm": 0.814299464225769, + "learning_rate": 6.71895646550381e-06, + "loss": 1.5958, + "mean_token_accuracy": 0.6319139301776886, + "num_tokens": 2039683860.0, + "step": 12164 + }, + { + "entropy": 1.7155976593494415, + "epoch": 1.3363818626239323, + "grad_norm": 0.614666759967804, + "learning_rate": 6.7175481435208045e-06, + "loss": 1.4577, + "mean_token_accuracy": 0.6343758553266525, + "num_tokens": 2039861354.0, + "step": 12165 + }, + { + "entropy": 1.6503975987434387, + "epoch": 1.3364917195353052, + "grad_norm": 0.6941211223602295, + "learning_rate": 6.716139957075466e-06, + "loss": 1.262, + "mean_token_accuracy": 0.669905404249827, + "num_tokens": 2039997906.0, + "step": 12166 + }, + { + "entropy": 1.7294440964857738, + "epoch": 1.3366015764466783, + "grad_norm": 0.860774040222168, + "learning_rate": 6.71473190621237e-06, + "loss": 1.4486, + "mean_token_accuracy": 0.6619910101095835, + "num_tokens": 2040154720.0, + "step": 12167 + }, + { + "entropy": 1.7141015529632568, + "epoch": 1.3367114333580512, + "grad_norm": 0.6437616348266602, + "learning_rate": 6.7133239909760815e-06, + "loss": 1.3718, + "mean_token_accuracy": 0.6447582989931107, + "num_tokens": 2040342386.0, + "step": 12168 + }, + { + "entropy": 1.7256827255090077, + "epoch": 1.336821290269424, + "grad_norm": 0.8925187587738037, + "learning_rate": 6.711916211411151e-06, + "loss": 1.5247, + "mean_token_accuracy": 0.6567995399236679, + "num_tokens": 2040464956.0, + "step": 12169 + }, + { + "entropy": 1.6982427140076954, + "epoch": 1.336931147180797, + "grad_norm": 0.709115207195282, + "learning_rate": 6.710508567562142e-06, + "loss": 1.3701, + "mean_token_accuracy": 0.6482947717110316, + "num_tokens": 2040624818.0, + "step": 12170 + }, + { + "entropy": 1.7100390096505482, + "epoch": 1.3370410040921699, + "grad_norm": 0.6098870038986206, + "learning_rate": 6.7091010594736096e-06, + "loss": 1.2835, + "mean_token_accuracy": 0.6711164067188898, + "num_tokens": 2040777685.0, + "step": 12171 + }, + { + "entropy": 1.7272491256395976, + "epoch": 1.337150861003543, + "grad_norm": 0.6299756169319153, + "learning_rate": 6.7076936871900876e-06, + "loss": 1.3556, + "mean_token_accuracy": 0.6612934718529383, + "num_tokens": 2040945251.0, + "step": 12172 + }, + { + "entropy": 1.7412489652633667, + "epoch": 1.3372607179149159, + "grad_norm": 0.6856157183647156, + "learning_rate": 6.706286450756129e-06, + "loss": 1.2422, + "mean_token_accuracy": 0.6719231804211935, + "num_tokens": 2041059005.0, + "step": 12173 + }, + { + "entropy": 1.6607161959012349, + "epoch": 1.3373705748262887, + "grad_norm": 0.6099095940589905, + "learning_rate": 6.70487935021627e-06, + "loss": 1.3921, + "mean_token_accuracy": 0.6561809430519739, + "num_tokens": 2041292263.0, + "step": 12174 + }, + { + "entropy": 1.661214272181193, + "epoch": 1.3374804317376616, + "grad_norm": 0.6299693584442139, + "learning_rate": 6.703472385615045e-06, + "loss": 1.2867, + "mean_token_accuracy": 0.6732292920351028, + "num_tokens": 2041428050.0, + "step": 12175 + }, + { + "entropy": 1.7231532831986744, + "epoch": 1.3375902886490345, + "grad_norm": 0.7081091403961182, + "learning_rate": 6.7020655569969795e-06, + "loss": 1.3649, + "mean_token_accuracy": 0.6584912836551666, + "num_tokens": 2041583936.0, + "step": 12176 + }, + { + "entropy": 1.7170475920041401, + "epoch": 1.3377001455604076, + "grad_norm": 0.7208677530288696, + "learning_rate": 6.700658864406607e-06, + "loss": 1.3288, + "mean_token_accuracy": 0.6651655087868372, + "num_tokens": 2041738235.0, + "step": 12177 + }, + { + "entropy": 1.6539806723594666, + "epoch": 1.3378100024717805, + "grad_norm": 0.5469607710838318, + "learning_rate": 6.69925230788844e-06, + "loss": 1.3708, + "mean_token_accuracy": 0.6626506249109904, + "num_tokens": 2041949745.0, + "step": 12178 + }, + { + "entropy": 1.6960765818754833, + "epoch": 1.3379198593831534, + "grad_norm": 0.6942289471626282, + "learning_rate": 6.697845887487002e-06, + "loss": 1.3427, + "mean_token_accuracy": 0.666606068611145, + "num_tokens": 2042104158.0, + "step": 12179 + }, + { + "entropy": 1.7386765678723652, + "epoch": 1.3380297162945265, + "grad_norm": 0.7499749064445496, + "learning_rate": 6.696439603246805e-06, + "loss": 1.355, + "mean_token_accuracy": 0.6540759851535162, + "num_tokens": 2042257471.0, + "step": 12180 + }, + { + "entropy": 1.6771236062049866, + "epoch": 1.3381395732058994, + "grad_norm": 0.6517854928970337, + "learning_rate": 6.69503345521235e-06, + "loss": 1.4844, + "mean_token_accuracy": 0.6465798964103063, + "num_tokens": 2042424318.0, + "step": 12181 + }, + { + "entropy": 1.6540433168411255, + "epoch": 1.3382494301172723, + "grad_norm": 0.7354924082756042, + "learning_rate": 6.693627443428146e-06, + "loss": 1.2878, + "mean_token_accuracy": 0.6684385339419047, + "num_tokens": 2042554398.0, + "step": 12182 + }, + { + "entropy": 1.7346096734205882, + "epoch": 1.3383592870286452, + "grad_norm": 0.6490511894226074, + "learning_rate": 6.6922215679387014e-06, + "loss": 1.4537, + "mean_token_accuracy": 0.6384791831175486, + "num_tokens": 2042753232.0, + "step": 12183 + }, + { + "entropy": 1.6197856763998668, + "epoch": 1.338469143940018, + "grad_norm": 0.6182257533073425, + "learning_rate": 6.690815828788495e-06, + "loss": 1.3176, + "mean_token_accuracy": 0.6738310505946478, + "num_tokens": 2042931082.0, + "step": 12184 + }, + { + "entropy": 1.6874169707298279, + "epoch": 1.3385790008513911, + "grad_norm": 0.6227688193321228, + "learning_rate": 6.6894102260220266e-06, + "loss": 1.3646, + "mean_token_accuracy": 0.6637938221295675, + "num_tokens": 2043074729.0, + "step": 12185 + }, + { + "entropy": 1.7578496237595875, + "epoch": 1.338688857762764, + "grad_norm": 0.6879784464836121, + "learning_rate": 6.688004759683784e-06, + "loss": 1.4202, + "mean_token_accuracy": 0.6445033997297287, + "num_tokens": 2043229062.0, + "step": 12186 + }, + { + "entropy": 1.7419428924719493, + "epoch": 1.338798714674137, + "grad_norm": 0.6606770157814026, + "learning_rate": 6.68659942981825e-06, + "loss": 1.3376, + "mean_token_accuracy": 0.6617419819037119, + "num_tokens": 2043347488.0, + "step": 12187 + }, + { + "entropy": 1.7223469018936157, + "epoch": 1.3389085715855098, + "grad_norm": 0.6497092247009277, + "learning_rate": 6.685194236469896e-06, + "loss": 1.5492, + "mean_token_accuracy": 0.6567690620819727, + "num_tokens": 2043535592.0, + "step": 12188 + }, + { + "entropy": 1.748223255077998, + "epoch": 1.3390184284968827, + "grad_norm": 1.2254005670547485, + "learning_rate": 6.683789179683203e-06, + "loss": 1.4608, + "mean_token_accuracy": 0.6514624655246735, + "num_tokens": 2043684053.0, + "step": 12189 + }, + { + "entropy": 1.6679157416025798, + "epoch": 1.3391282854082558, + "grad_norm": 0.5853722095489502, + "learning_rate": 6.682384259502635e-06, + "loss": 1.3211, + "mean_token_accuracy": 0.6666586250066757, + "num_tokens": 2043841336.0, + "step": 12190 + }, + { + "entropy": 1.6924639145533245, + "epoch": 1.3392381423196287, + "grad_norm": 0.7584408521652222, + "learning_rate": 6.680979475972664e-06, + "loss": 1.3216, + "mean_token_accuracy": 0.6670918017625809, + "num_tokens": 2043997000.0, + "step": 12191 + }, + { + "entropy": 1.7606268525123596, + "epoch": 1.3393479992310016, + "grad_norm": 0.7066436409950256, + "learning_rate": 6.679574829137744e-06, + "loss": 1.4732, + "mean_token_accuracy": 0.6344574143489202, + "num_tokens": 2044206455.0, + "step": 12192 + }, + { + "entropy": 1.7649494012196858, + "epoch": 1.3394578561423747, + "grad_norm": 0.7756333947181702, + "learning_rate": 6.678170319042332e-06, + "loss": 1.5191, + "mean_token_accuracy": 0.6414674917856852, + "num_tokens": 2044379695.0, + "step": 12193 + }, + { + "entropy": 1.7187654276688893, + "epoch": 1.3395677130537476, + "grad_norm": 0.7068918347358704, + "learning_rate": 6.676765945730881e-06, + "loss": 1.4654, + "mean_token_accuracy": 0.6458430662751198, + "num_tokens": 2044525734.0, + "step": 12194 + }, + { + "entropy": 1.7495672305425007, + "epoch": 1.3396775699651204, + "grad_norm": 0.6518183350563049, + "learning_rate": 6.675361709247847e-06, + "loss": 1.4416, + "mean_token_accuracy": 0.6462061703205109, + "num_tokens": 2044703188.0, + "step": 12195 + }, + { + "entropy": 1.7593078414599101, + "epoch": 1.3397874268764933, + "grad_norm": 0.8291671276092529, + "learning_rate": 6.673957609637659e-06, + "loss": 1.2821, + "mean_token_accuracy": 0.6755085190137228, + "num_tokens": 2044844945.0, + "step": 12196 + }, + { + "entropy": 1.6297094126542409, + "epoch": 1.3398972837878662, + "grad_norm": 0.6503934264183044, + "learning_rate": 6.672553646944764e-06, + "loss": 1.5341, + "mean_token_accuracy": 0.6331368734439214, + "num_tokens": 2045065342.0, + "step": 12197 + }, + { + "entropy": 1.730812023083369, + "epoch": 1.3400071406992393, + "grad_norm": 0.6852719187736511, + "learning_rate": 6.6711498212135994e-06, + "loss": 1.3495, + "mean_token_accuracy": 0.6606137305498123, + "num_tokens": 2045262146.0, + "step": 12198 + }, + { + "entropy": 1.6728038688500722, + "epoch": 1.3401169976106122, + "grad_norm": 0.6521802544593811, + "learning_rate": 6.669746132488591e-06, + "loss": 1.3743, + "mean_token_accuracy": 0.6557741363843282, + "num_tokens": 2045501023.0, + "step": 12199 + }, + { + "entropy": 1.7133564253648121, + "epoch": 1.340226854521985, + "grad_norm": 0.8919302821159363, + "learning_rate": 6.668342580814165e-06, + "loss": 1.3303, + "mean_token_accuracy": 0.6722137182950974, + "num_tokens": 2045671599.0, + "step": 12200 + }, + { + "entropy": 1.776214490334193, + "epoch": 1.340336711433358, + "grad_norm": 0.8077658414840698, + "learning_rate": 6.666939166234747e-06, + "loss": 1.4297, + "mean_token_accuracy": 0.6601533641417822, + "num_tokens": 2045889947.0, + "step": 12201 + }, + { + "entropy": 1.7369691332181294, + "epoch": 1.3404465683447309, + "grad_norm": 0.6759545207023621, + "learning_rate": 6.665535888794748e-06, + "loss": 1.2795, + "mean_token_accuracy": 0.664157842596372, + "num_tokens": 2046039114.0, + "step": 12202 + }, + { + "entropy": 1.7115701337655385, + "epoch": 1.340556425256104, + "grad_norm": 0.8110707402229309, + "learning_rate": 6.664132748538588e-06, + "loss": 1.3392, + "mean_token_accuracy": 0.6724221408367157, + "num_tokens": 2046210860.0, + "step": 12203 + }, + { + "entropy": 1.6262700359026592, + "epoch": 1.3406662821674769, + "grad_norm": 0.6628835201263428, + "learning_rate": 6.662729745510674e-06, + "loss": 1.1864, + "mean_token_accuracy": 0.6867125034332275, + "num_tokens": 2046348059.0, + "step": 12204 + }, + { + "entropy": 1.7018965184688568, + "epoch": 1.3407761390788497, + "grad_norm": 0.8943867683410645, + "learning_rate": 6.661326879755403e-06, + "loss": 1.2828, + "mean_token_accuracy": 0.6711592872937521, + "num_tokens": 2046465860.0, + "step": 12205 + }, + { + "entropy": 1.7263304789861043, + "epoch": 1.3408859959902228, + "grad_norm": 0.6487671136856079, + "learning_rate": 6.659924151317184e-06, + "loss": 1.3756, + "mean_token_accuracy": 0.6614161481459936, + "num_tokens": 2046623387.0, + "step": 12206 + }, + { + "entropy": 1.6342344085375469, + "epoch": 1.3409958529015957, + "grad_norm": 0.5863806009292603, + "learning_rate": 6.658521560240416e-06, + "loss": 1.4543, + "mean_token_accuracy": 0.6475448707739512, + "num_tokens": 2046836646.0, + "step": 12207 + }, + { + "entropy": 1.6563426355520885, + "epoch": 1.3411057098129686, + "grad_norm": 0.7072530388832092, + "learning_rate": 6.657119106569477e-06, + "loss": 1.2196, + "mean_token_accuracy": 0.6976286470890045, + "num_tokens": 2046975202.0, + "step": 12208 + }, + { + "entropy": 1.7593169311682384, + "epoch": 1.3412155667243415, + "grad_norm": 0.64771568775177, + "learning_rate": 6.655716790348763e-06, + "loss": 1.5249, + "mean_token_accuracy": 0.6462835719188055, + "num_tokens": 2047176210.0, + "step": 12209 + }, + { + "entropy": 1.7476372917493184, + "epoch": 1.3413254236357144, + "grad_norm": 0.6780056953430176, + "learning_rate": 6.654314611622656e-06, + "loss": 1.3844, + "mean_token_accuracy": 0.6559838354587555, + "num_tokens": 2047355243.0, + "step": 12210 + }, + { + "entropy": 1.7375625570615132, + "epoch": 1.3414352805470875, + "grad_norm": 0.6932122707366943, + "learning_rate": 6.652912570435536e-06, + "loss": 1.2896, + "mean_token_accuracy": 0.6642061273256937, + "num_tokens": 2047471435.0, + "step": 12211 + }, + { + "entropy": 1.7208791573842366, + "epoch": 1.3415451374584604, + "grad_norm": 0.6981452107429504, + "learning_rate": 6.651510666831772e-06, + "loss": 1.3985, + "mean_token_accuracy": 0.6554392377535502, + "num_tokens": 2047653436.0, + "step": 12212 + }, + { + "entropy": 1.703304221232732, + "epoch": 1.3416549943698333, + "grad_norm": 0.7472629547119141, + "learning_rate": 6.650108900855734e-06, + "loss": 1.405, + "mean_token_accuracy": 0.662998785575231, + "num_tokens": 2047810436.0, + "step": 12213 + }, + { + "entropy": 1.6598742206891377, + "epoch": 1.3417648512812062, + "grad_norm": 0.6906598806381226, + "learning_rate": 6.6487072725517874e-06, + "loss": 1.3876, + "mean_token_accuracy": 0.669889286160469, + "num_tokens": 2047947809.0, + "step": 12214 + }, + { + "entropy": 1.7231373190879822, + "epoch": 1.341874708192579, + "grad_norm": 0.7631545662879944, + "learning_rate": 6.647305781964304e-06, + "loss": 1.4282, + "mean_token_accuracy": 0.665252481897672, + "num_tokens": 2048165206.0, + "step": 12215 + }, + { + "entropy": 1.693508545557658, + "epoch": 1.3419845651039521, + "grad_norm": 0.6549314260482788, + "learning_rate": 6.645904429137622e-06, + "loss": 1.3312, + "mean_token_accuracy": 0.6650498857100805, + "num_tokens": 2048331616.0, + "step": 12216 + }, + { + "entropy": 1.6807125707467396, + "epoch": 1.342094422015325, + "grad_norm": 0.6334341168403625, + "learning_rate": 6.644503214116105e-06, + "loss": 1.5014, + "mean_token_accuracy": 0.6361768593390783, + "num_tokens": 2048541247.0, + "step": 12217 + }, + { + "entropy": 1.7305749952793121, + "epoch": 1.342204278926698, + "grad_norm": 0.7204241752624512, + "learning_rate": 6.6431021369441005e-06, + "loss": 1.3633, + "mean_token_accuracy": 0.6571338723103205, + "num_tokens": 2048708995.0, + "step": 12218 + }, + { + "entropy": 1.688479075829188, + "epoch": 1.342314135838071, + "grad_norm": 0.65947026014328, + "learning_rate": 6.64170119766595e-06, + "loss": 1.4483, + "mean_token_accuracy": 0.6561633894840876, + "num_tokens": 2048890366.0, + "step": 12219 + }, + { + "entropy": 1.669929713010788, + "epoch": 1.342423992749444, + "grad_norm": 0.702551007270813, + "learning_rate": 6.640300396325991e-06, + "loss": 1.3, + "mean_token_accuracy": 0.6672399689753851, + "num_tokens": 2049007667.0, + "step": 12220 + }, + { + "entropy": 1.7009166479110718, + "epoch": 1.3425338496608168, + "grad_norm": 0.6616597175598145, + "learning_rate": 6.638899732968562e-06, + "loss": 1.3765, + "mean_token_accuracy": 0.6643947462240855, + "num_tokens": 2049182073.0, + "step": 12221 + }, + { + "entropy": 1.6636148790518444, + "epoch": 1.3426437065721897, + "grad_norm": 0.6758390665054321, + "learning_rate": 6.637499207637988e-06, + "loss": 1.2797, + "mean_token_accuracy": 0.686865970492363, + "num_tokens": 2049322365.0, + "step": 12222 + }, + { + "entropy": 1.729952871799469, + "epoch": 1.3427535634835626, + "grad_norm": 0.6957722902297974, + "learning_rate": 6.636098820378603e-06, + "loss": 1.4878, + "mean_token_accuracy": 0.6524218966563543, + "num_tokens": 2049500723.0, + "step": 12223 + }, + { + "entropy": 1.68058975537618, + "epoch": 1.3428634203949357, + "grad_norm": 0.677686870098114, + "learning_rate": 6.6346985712347215e-06, + "loss": 1.2679, + "mean_token_accuracy": 0.6779388835032781, + "num_tokens": 2049657923.0, + "step": 12224 + }, + { + "entropy": 1.629438002904256, + "epoch": 1.3429732773063086, + "grad_norm": 0.7106833457946777, + "learning_rate": 6.633298460250661e-06, + "loss": 1.363, + "mean_token_accuracy": 0.6710018614927927, + "num_tokens": 2049895914.0, + "step": 12225 + }, + { + "entropy": 1.692349870999654, + "epoch": 1.3430831342176814, + "grad_norm": 0.6746785044670105, + "learning_rate": 6.631898487470736e-06, + "loss": 1.4901, + "mean_token_accuracy": 0.6519258220990499, + "num_tokens": 2050061436.0, + "step": 12226 + }, + { + "entropy": 1.7352541486422222, + "epoch": 1.3431929911290543, + "grad_norm": 0.6847538948059082, + "learning_rate": 6.630498652939263e-06, + "loss": 1.4904, + "mean_token_accuracy": 0.6462793598572413, + "num_tokens": 2050247189.0, + "step": 12227 + }, + { + "entropy": 1.697327196598053, + "epoch": 1.3433028480404272, + "grad_norm": 0.9095365405082703, + "learning_rate": 6.6290989567005325e-06, + "loss": 1.8281, + "mean_token_accuracy": 0.6107906103134155, + "num_tokens": 2050449478.0, + "step": 12228 + }, + { + "entropy": 1.6943837304910023, + "epoch": 1.3434127049518003, + "grad_norm": 0.7068896293640137, + "learning_rate": 6.627699398798849e-06, + "loss": 1.3645, + "mean_token_accuracy": 0.6637044797341028, + "num_tokens": 2050599011.0, + "step": 12229 + }, + { + "entropy": 1.7469304104646046, + "epoch": 1.3435225618631732, + "grad_norm": 0.6389604210853577, + "learning_rate": 6.626299979278514e-06, + "loss": 1.3339, + "mean_token_accuracy": 0.6636842538913091, + "num_tokens": 2050749646.0, + "step": 12230 + }, + { + "entropy": 1.7814926008383434, + "epoch": 1.343632418774546, + "grad_norm": 0.6876501441001892, + "learning_rate": 6.6249006981838134e-06, + "loss": 1.4588, + "mean_token_accuracy": 0.6435671746730804, + "num_tokens": 2050899001.0, + "step": 12231 + }, + { + "entropy": 1.6366635859012604, + "epoch": 1.3437422756859192, + "grad_norm": 0.6466518044471741, + "learning_rate": 6.623501555559031e-06, + "loss": 1.2687, + "mean_token_accuracy": 0.6803223540385565, + "num_tokens": 2051066927.0, + "step": 12232 + }, + { + "entropy": 1.6806427439053853, + "epoch": 1.343852132597292, + "grad_norm": 0.6225482225418091, + "learning_rate": 6.622102551448456e-06, + "loss": 1.2683, + "mean_token_accuracy": 0.6693209211031595, + "num_tokens": 2051200497.0, + "step": 12233 + }, + { + "entropy": 1.7055828273296356, + "epoch": 1.343961989508665, + "grad_norm": 0.6730872988700867, + "learning_rate": 6.620703685896358e-06, + "loss": 1.4317, + "mean_token_accuracy": 0.6518972416718801, + "num_tokens": 2051333834.0, + "step": 12234 + }, + { + "entropy": 1.7186238567034404, + "epoch": 1.3440718464200379, + "grad_norm": 0.6576967835426331, + "learning_rate": 6.619304958947019e-06, + "loss": 1.4835, + "mean_token_accuracy": 0.6451779951651891, + "num_tokens": 2051514481.0, + "step": 12235 + }, + { + "entropy": 1.7354322572549183, + "epoch": 1.3441817033314107, + "grad_norm": 0.609137237071991, + "learning_rate": 6.617906370644704e-06, + "loss": 1.3636, + "mean_token_accuracy": 0.6611924121777216, + "num_tokens": 2051685341.0, + "step": 12236 + }, + { + "entropy": 1.696060409148534, + "epoch": 1.3442915602427838, + "grad_norm": 0.6814700961112976, + "learning_rate": 6.616507921033673e-06, + "loss": 1.1382, + "mean_token_accuracy": 0.6968227724234263, + "num_tokens": 2051839522.0, + "step": 12237 + }, + { + "entropy": 1.7019359568754833, + "epoch": 1.3444014171541567, + "grad_norm": 0.7261495590209961, + "learning_rate": 6.615109610158194e-06, + "loss": 1.3654, + "mean_token_accuracy": 0.6753996213277181, + "num_tokens": 2051985272.0, + "step": 12238 + }, + { + "entropy": 1.727369596560796, + "epoch": 1.3445112740655296, + "grad_norm": 0.6944707632064819, + "learning_rate": 6.6137114380625255e-06, + "loss": 1.349, + "mean_token_accuracy": 0.6564425776402155, + "num_tokens": 2052119479.0, + "step": 12239 + }, + { + "entropy": 1.7554010450839996, + "epoch": 1.3446211309769027, + "grad_norm": 0.8773122429847717, + "learning_rate": 6.612313404790907e-06, + "loss": 1.3218, + "mean_token_accuracy": 0.6642026404539744, + "num_tokens": 2052246785.0, + "step": 12240 + }, + { + "entropy": 1.6962561508019764, + "epoch": 1.3447309878882754, + "grad_norm": 0.6583256721496582, + "learning_rate": 6.61091551038759e-06, + "loss": 1.3261, + "mean_token_accuracy": 0.6596464316050211, + "num_tokens": 2052401863.0, + "step": 12241 + }, + { + "entropy": 1.6963837146759033, + "epoch": 1.3448408447996485, + "grad_norm": 0.6508731842041016, + "learning_rate": 6.609517754896824e-06, + "loss": 1.3197, + "mean_token_accuracy": 0.6548064053058624, + "num_tokens": 2052537089.0, + "step": 12242 + }, + { + "entropy": 1.6928088863690693, + "epoch": 1.3449507017110214, + "grad_norm": 0.6507278084754944, + "learning_rate": 6.608120138362844e-06, + "loss": 1.4328, + "mean_token_accuracy": 0.6521178285280863, + "num_tokens": 2052697099.0, + "step": 12243 + }, + { + "entropy": 1.743686467409134, + "epoch": 1.3450605586223943, + "grad_norm": 0.6717689037322998, + "learning_rate": 6.6067226608298765e-06, + "loss": 1.4188, + "mean_token_accuracy": 0.6479152143001556, + "num_tokens": 2052886501.0, + "step": 12244 + }, + { + "entropy": 1.6980459491411846, + "epoch": 1.3451704155337674, + "grad_norm": 0.7341813445091248, + "learning_rate": 6.605325322342162e-06, + "loss": 1.3154, + "mean_token_accuracy": 0.6584825615088145, + "num_tokens": 2053021245.0, + "step": 12245 + }, + { + "entropy": 1.7826583683490753, + "epoch": 1.3452802724451403, + "grad_norm": 0.770753026008606, + "learning_rate": 6.603928122943918e-06, + "loss": 1.3637, + "mean_token_accuracy": 0.6621987770001093, + "num_tokens": 2053162265.0, + "step": 12246 + }, + { + "entropy": 1.7585231363773346, + "epoch": 1.3453901293565131, + "grad_norm": 0.7951369881629944, + "learning_rate": 6.602531062679371e-06, + "loss": 1.426, + "mean_token_accuracy": 0.6497030705213547, + "num_tokens": 2053305821.0, + "step": 12247 + }, + { + "entropy": 1.6882805128892262, + "epoch": 1.345499986267886, + "grad_norm": 0.7582414150238037, + "learning_rate": 6.6011341415927345e-06, + "loss": 1.4527, + "mean_token_accuracy": 0.6527374486128489, + "num_tokens": 2053475006.0, + "step": 12248 + }, + { + "entropy": 1.7408703466256459, + "epoch": 1.345609843179259, + "grad_norm": 0.621370255947113, + "learning_rate": 6.599737359728216e-06, + "loss": 1.4883, + "mean_token_accuracy": 0.6453725149234136, + "num_tokens": 2053652199.0, + "step": 12249 + }, + { + "entropy": 1.6780038674672444, + "epoch": 1.345719700090632, + "grad_norm": 0.7224386930465698, + "learning_rate": 6.598340717130027e-06, + "loss": 1.3707, + "mean_token_accuracy": 0.6695211380720139, + "num_tokens": 2053819186.0, + "step": 12250 + }, + { + "entropy": 1.7335948844750722, + "epoch": 1.345829557002005, + "grad_norm": 0.6624288558959961, + "learning_rate": 6.59694421384238e-06, + "loss": 1.4288, + "mean_token_accuracy": 0.6465139786402384, + "num_tokens": 2053967739.0, + "step": 12251 + }, + { + "entropy": 1.7141142686208088, + "epoch": 1.3459394139133778, + "grad_norm": 0.6660525798797607, + "learning_rate": 6.595547849909456e-06, + "loss": 1.55, + "mean_token_accuracy": 0.6478944619496664, + "num_tokens": 2054156120.0, + "step": 12252 + }, + { + "entropy": 1.756292422612508, + "epoch": 1.346049270824751, + "grad_norm": 0.781925618648529, + "learning_rate": 6.594151625375458e-06, + "loss": 1.2002, + "mean_token_accuracy": 0.6815067678689957, + "num_tokens": 2054258490.0, + "step": 12253 + }, + { + "entropy": 1.7494585911432903, + "epoch": 1.3461591277361236, + "grad_norm": 0.8068336844444275, + "learning_rate": 6.5927555402845775e-06, + "loss": 1.4026, + "mean_token_accuracy": 0.6626935104529063, + "num_tokens": 2054424651.0, + "step": 12254 + }, + { + "entropy": 1.715184877316157, + "epoch": 1.3462689846474967, + "grad_norm": 0.7686038613319397, + "learning_rate": 6.591359594681001e-06, + "loss": 1.2625, + "mean_token_accuracy": 0.6788710653781891, + "num_tokens": 2054553349.0, + "step": 12255 + }, + { + "entropy": 1.7815559605757396, + "epoch": 1.3463788415588696, + "grad_norm": 0.8140032291412354, + "learning_rate": 6.5899637886089014e-06, + "loss": 1.3976, + "mean_token_accuracy": 0.6547159850597382, + "num_tokens": 2054696873.0, + "step": 12256 + }, + { + "entropy": 1.671088566382726, + "epoch": 1.3464886984702424, + "grad_norm": 0.6825757026672363, + "learning_rate": 6.588568122112464e-06, + "loss": 1.5188, + "mean_token_accuracy": 0.6364674071470896, + "num_tokens": 2054865237.0, + "step": 12257 + }, + { + "entropy": 1.7114405731360118, + "epoch": 1.3465985553816155, + "grad_norm": 0.7471262216567993, + "learning_rate": 6.587172595235856e-06, + "loss": 1.426, + "mean_token_accuracy": 0.6726455787817637, + "num_tokens": 2054979860.0, + "step": 12258 + }, + { + "entropy": 1.708991785844167, + "epoch": 1.3467084122929884, + "grad_norm": 0.641041100025177, + "learning_rate": 6.585777208023249e-06, + "loss": 1.4844, + "mean_token_accuracy": 0.65794704357783, + "num_tokens": 2055155113.0, + "step": 12259 + }, + { + "entropy": 1.7066246469815571, + "epoch": 1.3468182692043613, + "grad_norm": 0.6625202298164368, + "learning_rate": 6.584381960518805e-06, + "loss": 1.3399, + "mean_token_accuracy": 0.6542827486991882, + "num_tokens": 2055292902.0, + "step": 12260 + }, + { + "entropy": 1.7277962068716686, + "epoch": 1.3469281261157342, + "grad_norm": 0.7618191838264465, + "learning_rate": 6.58298685276668e-06, + "loss": 1.5263, + "mean_token_accuracy": 0.6597974797089895, + "num_tokens": 2055475600.0, + "step": 12261 + }, + { + "entropy": 1.7162013947963715, + "epoch": 1.347037983027107, + "grad_norm": 0.6941340565681458, + "learning_rate": 6.581591884811029e-06, + "loss": 1.3457, + "mean_token_accuracy": 0.653758093714714, + "num_tokens": 2055605034.0, + "step": 12262 + }, + { + "entropy": 1.6504423717657726, + "epoch": 1.3471478399384802, + "grad_norm": 0.6448864340782166, + "learning_rate": 6.580197056696009e-06, + "loss": 1.3639, + "mean_token_accuracy": 0.6632355799277624, + "num_tokens": 2055763298.0, + "step": 12263 + }, + { + "entropy": 1.7163707713286083, + "epoch": 1.347257696849853, + "grad_norm": 0.740750253200531, + "learning_rate": 6.578802368465758e-06, + "loss": 1.318, + "mean_token_accuracy": 0.6563950031995773, + "num_tokens": 2055913303.0, + "step": 12264 + }, + { + "entropy": 1.6848769783973694, + "epoch": 1.347367553761226, + "grad_norm": 0.7327452301979065, + "learning_rate": 6.577407820164417e-06, + "loss": 1.2855, + "mean_token_accuracy": 0.6678441762924194, + "num_tokens": 2056037104.0, + "step": 12265 + }, + { + "entropy": 1.731307754913966, + "epoch": 1.347477410672599, + "grad_norm": 0.7852592468261719, + "learning_rate": 6.576013411836128e-06, + "loss": 1.3271, + "mean_token_accuracy": 0.6711703638235728, + "num_tokens": 2056238659.0, + "step": 12266 + }, + { + "entropy": 1.640326350927353, + "epoch": 1.347587267583972, + "grad_norm": 0.6604642271995544, + "learning_rate": 6.5746191435250226e-06, + "loss": 1.4065, + "mean_token_accuracy": 0.6636442442735037, + "num_tokens": 2056418652.0, + "step": 12267 + }, + { + "entropy": 1.6644433339436848, + "epoch": 1.3476971244953448, + "grad_norm": 0.7689099311828613, + "learning_rate": 6.5732250152752245e-06, + "loss": 1.252, + "mean_token_accuracy": 0.6750207046667734, + "num_tokens": 2056524847.0, + "step": 12268 + }, + { + "entropy": 1.6737519601980846, + "epoch": 1.3478069814067177, + "grad_norm": 0.665513813495636, + "learning_rate": 6.5718310271308635e-06, + "loss": 1.3628, + "mean_token_accuracy": 0.6609103033939997, + "num_tokens": 2056685770.0, + "step": 12269 + }, + { + "entropy": 1.705015589793523, + "epoch": 1.3479168383180906, + "grad_norm": 0.5894766449928284, + "learning_rate": 6.57043717913605e-06, + "loss": 1.5859, + "mean_token_accuracy": 0.6079653153816859, + "num_tokens": 2056913511.0, + "step": 12270 + }, + { + "entropy": 1.7268774608771007, + "epoch": 1.3480266952294637, + "grad_norm": 0.7870163321495056, + "learning_rate": 6.569043471334908e-06, + "loss": 1.6112, + "mean_token_accuracy": 0.6265291919310888, + "num_tokens": 2057089554.0, + "step": 12271 + }, + { + "entropy": 1.7062111397584279, + "epoch": 1.3481365521408366, + "grad_norm": 0.7420828342437744, + "learning_rate": 6.567649903771543e-06, + "loss": 1.4877, + "mean_token_accuracy": 0.638950581351916, + "num_tokens": 2057257100.0, + "step": 12272 + }, + { + "entropy": 1.6089070041974385, + "epoch": 1.3482464090522095, + "grad_norm": 0.6262128949165344, + "learning_rate": 6.56625647649006e-06, + "loss": 1.2779, + "mean_token_accuracy": 0.6737044056256613, + "num_tokens": 2057420018.0, + "step": 12273 + }, + { + "entropy": 1.706796109676361, + "epoch": 1.3483562659635824, + "grad_norm": 0.6636627912521362, + "learning_rate": 6.564863189534562e-06, + "loss": 1.2494, + "mean_token_accuracy": 0.6729203015565872, + "num_tokens": 2057522688.0, + "step": 12274 + }, + { + "entropy": 1.6989140808582306, + "epoch": 1.3484661228749553, + "grad_norm": 0.6211623549461365, + "learning_rate": 6.563470042949147e-06, + "loss": 1.3388, + "mean_token_accuracy": 0.6737065613269806, + "num_tokens": 2057711110.0, + "step": 12275 + }, + { + "entropy": 1.6851005852222443, + "epoch": 1.3485759797863284, + "grad_norm": 0.7660583257675171, + "learning_rate": 6.562077036777902e-06, + "loss": 1.4126, + "mean_token_accuracy": 0.6566516806681951, + "num_tokens": 2057882659.0, + "step": 12276 + }, + { + "entropy": 1.7562197347482045, + "epoch": 1.3486858366977013, + "grad_norm": 0.6725447177886963, + "learning_rate": 6.560684171064924e-06, + "loss": 1.354, + "mean_token_accuracy": 0.6481892019510269, + "num_tokens": 2058055824.0, + "step": 12277 + }, + { + "entropy": 1.7564709782600403, + "epoch": 1.3487956936090741, + "grad_norm": 0.7089916467666626, + "learning_rate": 6.5592914458542855e-06, + "loss": 1.5148, + "mean_token_accuracy": 0.6371178328990936, + "num_tokens": 2058221303.0, + "step": 12278 + }, + { + "entropy": 1.7744645377000172, + "epoch": 1.3489055505204472, + "grad_norm": 0.785156786441803, + "learning_rate": 6.557898861190077e-06, + "loss": 1.4629, + "mean_token_accuracy": 0.6464930176734924, + "num_tokens": 2058427514.0, + "step": 12279 + }, + { + "entropy": 1.7491690417130787, + "epoch": 1.3490154074318201, + "grad_norm": 0.7399646043777466, + "learning_rate": 6.556506417116368e-06, + "loss": 1.3963, + "mean_token_accuracy": 0.6607019901275635, + "num_tokens": 2058610955.0, + "step": 12280 + }, + { + "entropy": 1.7057534257570903, + "epoch": 1.349125264343193, + "grad_norm": 0.8085160255432129, + "learning_rate": 6.555114113677226e-06, + "loss": 1.4088, + "mean_token_accuracy": 0.6513328750928243, + "num_tokens": 2058770075.0, + "step": 12281 + }, + { + "entropy": 1.7024572590986888, + "epoch": 1.349235121254566, + "grad_norm": 0.6303825378417969, + "learning_rate": 6.553721950916717e-06, + "loss": 1.334, + "mean_token_accuracy": 0.6583151618639628, + "num_tokens": 2058936370.0, + "step": 12282 + }, + { + "entropy": 1.7161445319652557, + "epoch": 1.3493449781659388, + "grad_norm": 0.7127066254615784, + "learning_rate": 6.552329928878914e-06, + "loss": 1.4262, + "mean_token_accuracy": 0.6654284497102102, + "num_tokens": 2059051247.0, + "step": 12283 + }, + { + "entropy": 1.7131555875142415, + "epoch": 1.349454835077312, + "grad_norm": 0.6429823637008667, + "learning_rate": 6.550938047607855e-06, + "loss": 1.3437, + "mean_token_accuracy": 0.6701266666253408, + "num_tokens": 2059187258.0, + "step": 12284 + }, + { + "entropy": 1.6841067373752594, + "epoch": 1.3495646919886848, + "grad_norm": 0.623063862323761, + "learning_rate": 6.549546307147604e-06, + "loss": 1.4499, + "mean_token_accuracy": 0.6565073132514954, + "num_tokens": 2059370768.0, + "step": 12285 + }, + { + "entropy": 1.7391627728939056, + "epoch": 1.3496745489000577, + "grad_norm": 0.5836269855499268, + "learning_rate": 6.548154707542209e-06, + "loss": 1.5104, + "mean_token_accuracy": 0.6366867274045944, + "num_tokens": 2059615803.0, + "step": 12286 + }, + { + "entropy": 1.725864330927531, + "epoch": 1.3497844058114306, + "grad_norm": 0.6897220015525818, + "learning_rate": 6.546763248835713e-06, + "loss": 1.3016, + "mean_token_accuracy": 0.659923846522967, + "num_tokens": 2059736379.0, + "step": 12287 + }, + { + "entropy": 1.6832166115442913, + "epoch": 1.3498942627228034, + "grad_norm": 0.7074958086013794, + "learning_rate": 6.5453719310721485e-06, + "loss": 1.2413, + "mean_token_accuracy": 0.6812171091636022, + "num_tokens": 2059877019.0, + "step": 12288 + }, + { + "entropy": 1.7349075376987457, + "epoch": 1.3500041196341765, + "grad_norm": 0.7303879857063293, + "learning_rate": 6.543980754295559e-06, + "loss": 1.2653, + "mean_token_accuracy": 0.6705899288256963, + "num_tokens": 2060007355.0, + "step": 12289 + }, + { + "entropy": 1.6935294369856517, + "epoch": 1.3501139765455494, + "grad_norm": 0.9410924911499023, + "learning_rate": 6.542589718549968e-06, + "loss": 1.5074, + "mean_token_accuracy": 0.6501014828681946, + "num_tokens": 2060194841.0, + "step": 12290 + }, + { + "entropy": 1.649632195631663, + "epoch": 1.3502238334569223, + "grad_norm": 0.5783915519714355, + "learning_rate": 6.541198823879406e-06, + "loss": 1.2677, + "mean_token_accuracy": 0.6630587677160898, + "num_tokens": 2060380983.0, + "step": 12291 + }, + { + "entropy": 1.7274872958660126, + "epoch": 1.3503336903682954, + "grad_norm": 0.67799311876297, + "learning_rate": 6.5398080703278935e-06, + "loss": 1.3454, + "mean_token_accuracy": 0.6603363305330276, + "num_tokens": 2060548780.0, + "step": 12292 + }, + { + "entropy": 1.7195112307866414, + "epoch": 1.3504435472796683, + "grad_norm": 0.7613741755485535, + "learning_rate": 6.5384174579394435e-06, + "loss": 1.2624, + "mean_token_accuracy": 0.6771682302157084, + "num_tokens": 2060657304.0, + "step": 12293 + }, + { + "entropy": 1.7164626916249592, + "epoch": 1.3505534041910412, + "grad_norm": 0.8549966216087341, + "learning_rate": 6.537026986758068e-06, + "loss": 1.3898, + "mean_token_accuracy": 0.650303453207016, + "num_tokens": 2060815540.0, + "step": 12294 + }, + { + "entropy": 1.7116701900959015, + "epoch": 1.350663261102414, + "grad_norm": 0.6703973412513733, + "learning_rate": 6.5356366568277855e-06, + "loss": 1.4342, + "mean_token_accuracy": 0.6575676451126734, + "num_tokens": 2060986234.0, + "step": 12295 + }, + { + "entropy": 1.6897992591063182, + "epoch": 1.350773118013787, + "grad_norm": 0.6119180917739868, + "learning_rate": 6.534246468192582e-06, + "loss": 1.3418, + "mean_token_accuracy": 0.6577243904272715, + "num_tokens": 2061159321.0, + "step": 12296 + }, + { + "entropy": 1.6780081788698833, + "epoch": 1.35088297492516, + "grad_norm": 0.6298530697822571, + "learning_rate": 6.532856420896469e-06, + "loss": 1.3248, + "mean_token_accuracy": 0.6540986796220144, + "num_tokens": 2061293448.0, + "step": 12297 + }, + { + "entropy": 1.701556493838628, + "epoch": 1.350992831836533, + "grad_norm": 0.6996129155158997, + "learning_rate": 6.531466514983438e-06, + "loss": 1.4905, + "mean_token_accuracy": 0.6383817990620931, + "num_tokens": 2061518965.0, + "step": 12298 + }, + { + "entropy": 1.665728211402893, + "epoch": 1.3511026887479058, + "grad_norm": 0.8420900106430054, + "learning_rate": 6.530076750497479e-06, + "loss": 1.4345, + "mean_token_accuracy": 0.6632737889885902, + "num_tokens": 2061659464.0, + "step": 12299 + }, + { + "entropy": 1.680874894062678, + "epoch": 1.3512125456592787, + "grad_norm": 0.669554591178894, + "learning_rate": 6.5286871274825736e-06, + "loss": 1.5223, + "mean_token_accuracy": 0.642878438035647, + "num_tokens": 2061862373.0, + "step": 12300 + }, + { + "entropy": 1.7252130707105, + "epoch": 1.3513224025706516, + "grad_norm": 0.6238853335380554, + "learning_rate": 6.527297645982709e-06, + "loss": 1.4893, + "mean_token_accuracy": 0.6513131509224573, + "num_tokens": 2062013384.0, + "step": 12301 + }, + { + "entropy": 1.6984285215536754, + "epoch": 1.3514322594820247, + "grad_norm": 0.7303968667984009, + "learning_rate": 6.525908306041855e-06, + "loss": 1.2639, + "mean_token_accuracy": 0.6747083564599355, + "num_tokens": 2062142981.0, + "step": 12302 + }, + { + "entropy": 1.685420423746109, + "epoch": 1.3515421163933976, + "grad_norm": 0.7917304039001465, + "learning_rate": 6.52451910770399e-06, + "loss": 1.3632, + "mean_token_accuracy": 0.6628880898157755, + "num_tokens": 2062320185.0, + "step": 12303 + }, + { + "entropy": 1.7349448402722676, + "epoch": 1.3516519733047705, + "grad_norm": 0.732818067073822, + "learning_rate": 6.52313005101308e-06, + "loss": 1.3378, + "mean_token_accuracy": 0.661628877123197, + "num_tokens": 2062477090.0, + "step": 12304 + }, + { + "entropy": 1.7262056469917297, + "epoch": 1.3517618302161436, + "grad_norm": 0.8141718506813049, + "learning_rate": 6.5217411360130815e-06, + "loss": 1.5927, + "mean_token_accuracy": 0.642752543091774, + "num_tokens": 2062658383.0, + "step": 12305 + }, + { + "entropy": 1.697360982497533, + "epoch": 1.3518716871275165, + "grad_norm": 0.685117244720459, + "learning_rate": 6.520352362747959e-06, + "loss": 1.2822, + "mean_token_accuracy": 0.6733687619368235, + "num_tokens": 2062789276.0, + "step": 12306 + }, + { + "entropy": 1.6366734206676483, + "epoch": 1.3519815440388894, + "grad_norm": 0.7046248316764832, + "learning_rate": 6.518963731261673e-06, + "loss": 1.3198, + "mean_token_accuracy": 0.6753099660078684, + "num_tokens": 2062938946.0, + "step": 12307 + }, + { + "entropy": 1.662851224342982, + "epoch": 1.3520914009502623, + "grad_norm": 0.6730368137359619, + "learning_rate": 6.517575241598157e-06, + "loss": 1.3058, + "mean_token_accuracy": 0.6809868812561035, + "num_tokens": 2063102608.0, + "step": 12308 + }, + { + "entropy": 1.6642492314179738, + "epoch": 1.3522012578616351, + "grad_norm": 0.6425415873527527, + "learning_rate": 6.516186893801366e-06, + "loss": 1.4789, + "mean_token_accuracy": 0.6525690505901972, + "num_tokens": 2063301963.0, + "step": 12309 + }, + { + "entropy": 1.6880824367205303, + "epoch": 1.3523111147730082, + "grad_norm": 0.6970797181129456, + "learning_rate": 6.514798687915243e-06, + "loss": 1.3369, + "mean_token_accuracy": 0.6655599971612295, + "num_tokens": 2063467828.0, + "step": 12310 + }, + { + "entropy": 1.7415729264418285, + "epoch": 1.3524209716843811, + "grad_norm": 0.7570353746414185, + "learning_rate": 6.513410623983719e-06, + "loss": 1.4903, + "mean_token_accuracy": 0.652079368631045, + "num_tokens": 2063628984.0, + "step": 12311 + }, + { + "entropy": 1.7562313973903656, + "epoch": 1.352530828595754, + "grad_norm": 0.834516167640686, + "learning_rate": 6.512022702050726e-06, + "loss": 1.2405, + "mean_token_accuracy": 0.6728113840023676, + "num_tokens": 2063738934.0, + "step": 12312 + }, + { + "entropy": 1.7031614283720653, + "epoch": 1.352640685507127, + "grad_norm": 0.6265947222709656, + "learning_rate": 6.510634922160194e-06, + "loss": 1.3499, + "mean_token_accuracy": 0.6655804167191187, + "num_tokens": 2063945293.0, + "step": 12313 + }, + { + "entropy": 1.750719130039215, + "epoch": 1.3527505424184998, + "grad_norm": 0.7226284742355347, + "learning_rate": 6.5092472843560404e-06, + "loss": 1.4446, + "mean_token_accuracy": 0.6544534166653951, + "num_tokens": 2064107640.0, + "step": 12314 + }, + { + "entropy": 1.6810857057571411, + "epoch": 1.352860399329873, + "grad_norm": 0.7839952111244202, + "learning_rate": 6.507859788682191e-06, + "loss": 1.3484, + "mean_token_accuracy": 0.6710902700821558, + "num_tokens": 2064260066.0, + "step": 12315 + }, + { + "entropy": 1.7289798359076183, + "epoch": 1.3529702562412458, + "grad_norm": 0.7496806979179382, + "learning_rate": 6.506472435182555e-06, + "loss": 1.417, + "mean_token_accuracy": 0.656493753194809, + "num_tokens": 2064413871.0, + "step": 12316 + }, + { + "entropy": 1.724411557118098, + "epoch": 1.3530801131526187, + "grad_norm": 0.7686552405357361, + "learning_rate": 6.505085223901037e-06, + "loss": 1.4302, + "mean_token_accuracy": 0.6589695413907369, + "num_tokens": 2064584214.0, + "step": 12317 + }, + { + "entropy": 1.7191261947154999, + "epoch": 1.3531899700639918, + "grad_norm": 0.65711510181427, + "learning_rate": 6.503698154881547e-06, + "loss": 1.5165, + "mean_token_accuracy": 0.6463207254807154, + "num_tokens": 2064812066.0, + "step": 12318 + }, + { + "entropy": 1.7270474930604298, + "epoch": 1.3532998269753647, + "grad_norm": 0.7155986428260803, + "learning_rate": 6.50231122816799e-06, + "loss": 1.4462, + "mean_token_accuracy": 0.6558532069126765, + "num_tokens": 2064987126.0, + "step": 12319 + }, + { + "entropy": 1.6704954504966736, + "epoch": 1.3534096838867375, + "grad_norm": 0.7355462312698364, + "learning_rate": 6.500924443804251e-06, + "loss": 1.4153, + "mean_token_accuracy": 0.6554108460744222, + "num_tokens": 2065163540.0, + "step": 12320 + }, + { + "entropy": 1.7073476314544678, + "epoch": 1.3535195407981104, + "grad_norm": 0.6781120896339417, + "learning_rate": 6.499537801834224e-06, + "loss": 1.4084, + "mean_token_accuracy": 0.645026778181394, + "num_tokens": 2065344154.0, + "step": 12321 + }, + { + "entropy": 1.6700172821680705, + "epoch": 1.3536293977094833, + "grad_norm": 0.6056855320930481, + "learning_rate": 6.4981513023018026e-06, + "loss": 1.4415, + "mean_token_accuracy": 0.6474265257517496, + "num_tokens": 2065592601.0, + "step": 12322 + }, + { + "entropy": 1.7153640190760295, + "epoch": 1.3537392546208564, + "grad_norm": 0.6348044276237488, + "learning_rate": 6.4967649452508645e-06, + "loss": 1.3697, + "mean_token_accuracy": 0.6543361097574234, + "num_tokens": 2065767977.0, + "step": 12323 + }, + { + "entropy": 1.714322954416275, + "epoch": 1.3538491115322293, + "grad_norm": 0.6365450024604797, + "learning_rate": 6.4953787307252815e-06, + "loss": 1.442, + "mean_token_accuracy": 0.635169451435407, + "num_tokens": 2065967643.0, + "step": 12324 + }, + { + "entropy": 1.7047211130460103, + "epoch": 1.3539589684436022, + "grad_norm": 0.8873185515403748, + "learning_rate": 6.493992658768935e-06, + "loss": 1.418, + "mean_token_accuracy": 0.6530811885992686, + "num_tokens": 2066111579.0, + "step": 12325 + }, + { + "entropy": 1.7306146423021953, + "epoch": 1.354068825354975, + "grad_norm": 0.7051663398742676, + "learning_rate": 6.492606729425688e-06, + "loss": 1.4218, + "mean_token_accuracy": 0.6628289421399435, + "num_tokens": 2066301945.0, + "step": 12326 + }, + { + "entropy": 1.7398844460646312, + "epoch": 1.354178682266348, + "grad_norm": 0.6335932612419128, + "learning_rate": 6.491220942739411e-06, + "loss": 1.276, + "mean_token_accuracy": 0.6720243046681086, + "num_tokens": 2066426077.0, + "step": 12327 + }, + { + "entropy": 1.707278350989024, + "epoch": 1.354288539177721, + "grad_norm": 0.6095598936080933, + "learning_rate": 6.489835298753959e-06, + "loss": 1.4107, + "mean_token_accuracy": 0.6501167962948481, + "num_tokens": 2066577706.0, + "step": 12328 + }, + { + "entropy": 1.7580445806185405, + "epoch": 1.354398396089094, + "grad_norm": 0.6501827836036682, + "learning_rate": 6.488449797513183e-06, + "loss": 1.4603, + "mean_token_accuracy": 0.6521537154912949, + "num_tokens": 2066736674.0, + "step": 12329 + }, + { + "entropy": 1.7793205082416534, + "epoch": 1.3545082530004668, + "grad_norm": 0.8058743476867676, + "learning_rate": 6.487064439060939e-06, + "loss": 1.4942, + "mean_token_accuracy": 0.649241695801417, + "num_tokens": 2066929361.0, + "step": 12330 + }, + { + "entropy": 1.6442664166291554, + "epoch": 1.35461810991184, + "grad_norm": 0.7431530356407166, + "learning_rate": 6.485679223441079e-06, + "loss": 1.2053, + "mean_token_accuracy": 0.6872799694538116, + "num_tokens": 2067059964.0, + "step": 12331 + }, + { + "entropy": 1.6199305057525635, + "epoch": 1.3547279668232128, + "grad_norm": 0.6863206624984741, + "learning_rate": 6.48429415069743e-06, + "loss": 1.3179, + "mean_token_accuracy": 0.6764807005723318, + "num_tokens": 2067198007.0, + "step": 12332 + }, + { + "entropy": 1.6564082105954487, + "epoch": 1.3548378237345857, + "grad_norm": 0.6196966767311096, + "learning_rate": 6.482909220873838e-06, + "loss": 1.447, + "mean_token_accuracy": 0.6547676275173823, + "num_tokens": 2067377864.0, + "step": 12333 + }, + { + "entropy": 1.7091114819049835, + "epoch": 1.3549476806459586, + "grad_norm": 0.8706151843070984, + "learning_rate": 6.481524434014134e-06, + "loss": 1.4952, + "mean_token_accuracy": 0.6537403712670008, + "num_tokens": 2067518216.0, + "step": 12334 + }, + { + "entropy": 1.685113827387492, + "epoch": 1.3550575375573315, + "grad_norm": 0.7392619848251343, + "learning_rate": 6.480139790162146e-06, + "loss": 1.4484, + "mean_token_accuracy": 0.652221699555715, + "num_tokens": 2067650159.0, + "step": 12335 + }, + { + "entropy": 1.7211529811223347, + "epoch": 1.3551673944687046, + "grad_norm": 0.7249420285224915, + "learning_rate": 6.478755289361698e-06, + "loss": 1.4059, + "mean_token_accuracy": 0.642576590180397, + "num_tokens": 2067855074.0, + "step": 12336 + }, + { + "entropy": 1.7155493001143138, + "epoch": 1.3552772513800775, + "grad_norm": 1.159257411956787, + "learning_rate": 6.4773709316566036e-06, + "loss": 1.237, + "mean_token_accuracy": 0.6643270750840505, + "num_tokens": 2068013117.0, + "step": 12337 + }, + { + "entropy": 1.693147212266922, + "epoch": 1.3553871082914504, + "grad_norm": 0.692810595035553, + "learning_rate": 6.475986717090683e-06, + "loss": 1.3471, + "mean_token_accuracy": 0.6661613335212072, + "num_tokens": 2068174867.0, + "step": 12338 + }, + { + "entropy": 1.66440216700236, + "epoch": 1.3554969652028233, + "grad_norm": 0.6912005543708801, + "learning_rate": 6.474602645707746e-06, + "loss": 1.3643, + "mean_token_accuracy": 0.660924697915713, + "num_tokens": 2068363011.0, + "step": 12339 + }, + { + "entropy": 1.7170608242352803, + "epoch": 1.3556068221141961, + "grad_norm": 0.6402304172515869, + "learning_rate": 6.473218717551597e-06, + "loss": 1.5135, + "mean_token_accuracy": 0.6523879170417786, + "num_tokens": 2068560209.0, + "step": 12340 + }, + { + "entropy": 1.7328025897343953, + "epoch": 1.3557166790255692, + "grad_norm": 0.6300782561302185, + "learning_rate": 6.471834932666033e-06, + "loss": 1.3726, + "mean_token_accuracy": 0.6494940121968588, + "num_tokens": 2068759721.0, + "step": 12341 + }, + { + "entropy": 1.7307655314604442, + "epoch": 1.3558265359369421, + "grad_norm": 0.716905951499939, + "learning_rate": 6.470451291094855e-06, + "loss": 1.3454, + "mean_token_accuracy": 0.6711088915665945, + "num_tokens": 2068930911.0, + "step": 12342 + }, + { + "entropy": 1.7667767107486725, + "epoch": 1.355936392848315, + "grad_norm": 0.6788172721862793, + "learning_rate": 6.469067792881853e-06, + "loss": 1.4581, + "mean_token_accuracy": 0.6369956433773041, + "num_tokens": 2069122750.0, + "step": 12343 + }, + { + "entropy": 1.7475673854351044, + "epoch": 1.3560462497596881, + "grad_norm": 0.744155764579773, + "learning_rate": 6.467684438070809e-06, + "loss": 1.3626, + "mean_token_accuracy": 0.6463822772105535, + "num_tokens": 2069264289.0, + "step": 12344 + }, + { + "entropy": 1.7614603241284688, + "epoch": 1.356156106671061, + "grad_norm": 0.6432300209999084, + "learning_rate": 6.466301226705516e-06, + "loss": 1.4698, + "mean_token_accuracy": 0.6485941559076309, + "num_tokens": 2069414632.0, + "step": 12345 + }, + { + "entropy": 1.6905387043952942, + "epoch": 1.356265963582434, + "grad_norm": 0.6607070565223694, + "learning_rate": 6.464918158829741e-06, + "loss": 1.4342, + "mean_token_accuracy": 0.6547519415616989, + "num_tokens": 2069600554.0, + "step": 12346 + }, + { + "entropy": 1.660075883070628, + "epoch": 1.3563758204938068, + "grad_norm": 0.6849450469017029, + "learning_rate": 6.463535234487267e-06, + "loss": 1.3493, + "mean_token_accuracy": 0.6670071085294088, + "num_tokens": 2069751438.0, + "step": 12347 + }, + { + "entropy": 1.7046632369359334, + "epoch": 1.3564856774051797, + "grad_norm": 0.6302372217178345, + "learning_rate": 6.462152453721859e-06, + "loss": 1.3905, + "mean_token_accuracy": 0.6431731383005778, + "num_tokens": 2069882559.0, + "step": 12348 + }, + { + "entropy": 1.7553166151046753, + "epoch": 1.3565955343165528, + "grad_norm": 0.7600220441818237, + "learning_rate": 6.460769816577277e-06, + "loss": 1.3602, + "mean_token_accuracy": 0.6480604211489359, + "num_tokens": 2070014200.0, + "step": 12349 + }, + { + "entropy": 1.6376391152540843, + "epoch": 1.3567053912279257, + "grad_norm": 0.7972891926765442, + "learning_rate": 6.4593873230972845e-06, + "loss": 1.4161, + "mean_token_accuracy": 0.6628619134426117, + "num_tokens": 2070220462.0, + "step": 12350 + }, + { + "entropy": 1.6964278320471446, + "epoch": 1.3568152481392985, + "grad_norm": 0.6905115842819214, + "learning_rate": 6.458004973325643e-06, + "loss": 1.2696, + "mean_token_accuracy": 0.6703715721766154, + "num_tokens": 2070379579.0, + "step": 12351 + }, + { + "entropy": 1.695211390654246, + "epoch": 1.3569251050506714, + "grad_norm": 0.6479082107543945, + "learning_rate": 6.456622767306093e-06, + "loss": 1.3873, + "mean_token_accuracy": 0.6564722607533137, + "num_tokens": 2070508597.0, + "step": 12352 + }, + { + "entropy": 1.6002402206261952, + "epoch": 1.3570349619620443, + "grad_norm": 0.6399569511413574, + "learning_rate": 6.455240705082386e-06, + "loss": 1.1791, + "mean_token_accuracy": 0.6922868291536967, + "num_tokens": 2070662900.0, + "step": 12353 + }, + { + "entropy": 1.667330761750539, + "epoch": 1.3571448188734174, + "grad_norm": 0.6267159581184387, + "learning_rate": 6.453858786698264e-06, + "loss": 1.4196, + "mean_token_accuracy": 0.6513769179582596, + "num_tokens": 2070864178.0, + "step": 12354 + }, + { + "entropy": 1.697418709595998, + "epoch": 1.3572546757847903, + "grad_norm": 0.615899920463562, + "learning_rate": 6.4524770121974625e-06, + "loss": 1.4321, + "mean_token_accuracy": 0.6468136459589005, + "num_tokens": 2071051168.0, + "step": 12355 + }, + { + "entropy": 1.7128571271896362, + "epoch": 1.3573645326961632, + "grad_norm": 0.6929111480712891, + "learning_rate": 6.451095381623711e-06, + "loss": 1.3526, + "mean_token_accuracy": 0.6634085973103842, + "num_tokens": 2071226809.0, + "step": 12356 + }, + { + "entropy": 1.7381273806095123, + "epoch": 1.3574743896075363, + "grad_norm": 0.6834197640419006, + "learning_rate": 6.449713895020746e-06, + "loss": 1.4679, + "mean_token_accuracy": 0.6420772125323614, + "num_tokens": 2071396186.0, + "step": 12357 + }, + { + "entropy": 1.722596416870753, + "epoch": 1.3575842465189092, + "grad_norm": 0.6959985494613647, + "learning_rate": 6.448332552432282e-06, + "loss": 1.4207, + "mean_token_accuracy": 0.6518757989009222, + "num_tokens": 2071595004.0, + "step": 12358 + }, + { + "entropy": 1.6781906882921855, + "epoch": 1.357694103430282, + "grad_norm": 0.63628751039505, + "learning_rate": 6.446951353902045e-06, + "loss": 1.4942, + "mean_token_accuracy": 0.663748636841774, + "num_tokens": 2071740366.0, + "step": 12359 + }, + { + "entropy": 1.6715950568517048, + "epoch": 1.357803960341655, + "grad_norm": 0.6627793312072754, + "learning_rate": 6.445570299473744e-06, + "loss": 1.4144, + "mean_token_accuracy": 0.660917063554128, + "num_tokens": 2071913108.0, + "step": 12360 + }, + { + "entropy": 1.803322861591975, + "epoch": 1.3579138172530278, + "grad_norm": 0.7100440859794617, + "learning_rate": 6.4441893891910885e-06, + "loss": 1.6403, + "mean_token_accuracy": 0.6258356620868047, + "num_tokens": 2072140175.0, + "step": 12361 + }, + { + "entropy": 1.7187660336494446, + "epoch": 1.358023674164401, + "grad_norm": 0.648104727268219, + "learning_rate": 6.442808623097787e-06, + "loss": 1.3935, + "mean_token_accuracy": 0.6531741370757421, + "num_tokens": 2072329640.0, + "step": 12362 + }, + { + "entropy": 1.6625087360541027, + "epoch": 1.3581335310757738, + "grad_norm": 0.6913976669311523, + "learning_rate": 6.441428001237546e-06, + "loss": 1.3703, + "mean_token_accuracy": 0.6623245229323705, + "num_tokens": 2072470021.0, + "step": 12363 + }, + { + "entropy": 1.6901950438817341, + "epoch": 1.3582433879871467, + "grad_norm": 0.6735898852348328, + "learning_rate": 6.440047523654047e-06, + "loss": 1.5483, + "mean_token_accuracy": 0.6314966926972071, + "num_tokens": 2072671483.0, + "step": 12364 + }, + { + "entropy": 1.7260485688845317, + "epoch": 1.3583532448985196, + "grad_norm": 0.7406774163246155, + "learning_rate": 6.438667190390989e-06, + "loss": 1.2643, + "mean_token_accuracy": 0.676031157374382, + "num_tokens": 2072824287.0, + "step": 12365 + }, + { + "entropy": 1.6700426439444225, + "epoch": 1.3584631018098925, + "grad_norm": 0.6413915157318115, + "learning_rate": 6.437287001492063e-06, + "loss": 1.3453, + "mean_token_accuracy": 0.6662431508302689, + "num_tokens": 2072998417.0, + "step": 12366 + }, + { + "entropy": 1.717922439177831, + "epoch": 1.3585729587212656, + "grad_norm": 0.7316557765007019, + "learning_rate": 6.4359069570009455e-06, + "loss": 1.5969, + "mean_token_accuracy": 0.620139608780543, + "num_tokens": 2073209496.0, + "step": 12367 + }, + { + "entropy": 1.6824163496494293, + "epoch": 1.3586828156326385, + "grad_norm": 0.667999804019928, + "learning_rate": 6.434527056961315e-06, + "loss": 1.3104, + "mean_token_accuracy": 0.6806386361519495, + "num_tokens": 2073399215.0, + "step": 12368 + }, + { + "entropy": 1.6601607302824657, + "epoch": 1.3587926725440114, + "grad_norm": 0.5929083228111267, + "learning_rate": 6.4331473014168485e-06, + "loss": 1.2936, + "mean_token_accuracy": 0.6646314362684885, + "num_tokens": 2073573586.0, + "step": 12369 + }, + { + "entropy": 1.8227445185184479, + "epoch": 1.3589025294553845, + "grad_norm": 0.6329379677772522, + "learning_rate": 6.431767690411208e-06, + "loss": 1.4407, + "mean_token_accuracy": 0.6459532777468363, + "num_tokens": 2073708654.0, + "step": 12370 + }, + { + "entropy": 1.6659250060717266, + "epoch": 1.3590123863667574, + "grad_norm": 0.6623415946960449, + "learning_rate": 6.430388223988067e-06, + "loss": 1.6464, + "mean_token_accuracy": 0.6338375359773636, + "num_tokens": 2073930206.0, + "step": 12371 + }, + { + "entropy": 1.730675846338272, + "epoch": 1.3591222432781302, + "grad_norm": 0.6786333918571472, + "learning_rate": 6.429008902191077e-06, + "loss": 1.3172, + "mean_token_accuracy": 0.676612580815951, + "num_tokens": 2074090430.0, + "step": 12372 + }, + { + "entropy": 1.723184158404668, + "epoch": 1.3592321001895031, + "grad_norm": 0.9140381217002869, + "learning_rate": 6.4276297250638945e-06, + "loss": 1.3079, + "mean_token_accuracy": 0.6676105012496313, + "num_tokens": 2074265822.0, + "step": 12373 + }, + { + "entropy": 1.6886428495248158, + "epoch": 1.359341957100876, + "grad_norm": 0.696064293384552, + "learning_rate": 6.426250692650169e-06, + "loss": 1.2555, + "mean_token_accuracy": 0.6780825853347778, + "num_tokens": 2074394539.0, + "step": 12374 + }, + { + "entropy": 1.7155614097913106, + "epoch": 1.3594518140122491, + "grad_norm": 0.751995861530304, + "learning_rate": 6.424871804993555e-06, + "loss": 1.4351, + "mean_token_accuracy": 0.6506524682044983, + "num_tokens": 2074589651.0, + "step": 12375 + }, + { + "entropy": 1.7856918176015217, + "epoch": 1.359561670923622, + "grad_norm": 0.8634830117225647, + "learning_rate": 6.423493062137683e-06, + "loss": 1.4097, + "mean_token_accuracy": 0.6395214746395746, + "num_tokens": 2074752173.0, + "step": 12376 + }, + { + "entropy": 1.6887954970200856, + "epoch": 1.359671527834995, + "grad_norm": 0.686133623123169, + "learning_rate": 6.42211446412619e-06, + "loss": 1.3559, + "mean_token_accuracy": 0.665522962808609, + "num_tokens": 2074914932.0, + "step": 12377 + }, + { + "entropy": 1.6841253538926442, + "epoch": 1.3597813847463678, + "grad_norm": 0.6983356475830078, + "learning_rate": 6.420736011002715e-06, + "loss": 1.3099, + "mean_token_accuracy": 0.674403061469396, + "num_tokens": 2075065681.0, + "step": 12378 + }, + { + "entropy": 1.7410068213939667, + "epoch": 1.3598912416577407, + "grad_norm": 0.798302173614502, + "learning_rate": 6.419357702810882e-06, + "loss": 1.4741, + "mean_token_accuracy": 0.6387134939432144, + "num_tokens": 2075284738.0, + "step": 12379 + }, + { + "entropy": 1.711126794417699, + "epoch": 1.3600010985691138, + "grad_norm": 0.6752367615699768, + "learning_rate": 6.417979539594311e-06, + "loss": 1.4566, + "mean_token_accuracy": 0.6450911511977514, + "num_tokens": 2075459304.0, + "step": 12380 + }, + { + "entropy": 1.6090798874696095, + "epoch": 1.3601109554804867, + "grad_norm": 0.7679362297058105, + "learning_rate": 6.416601521396626e-06, + "loss": 1.453, + "mean_token_accuracy": 0.6455042411883672, + "num_tokens": 2075657038.0, + "step": 12381 + }, + { + "entropy": 1.7894425988197327, + "epoch": 1.3602208123918595, + "grad_norm": 0.685226559638977, + "learning_rate": 6.4152236482614336e-06, + "loss": 1.6363, + "mean_token_accuracy": 0.6196721792221069, + "num_tokens": 2075889197.0, + "step": 12382 + }, + { + "entropy": 1.6688521007696788, + "epoch": 1.3603306693032327, + "grad_norm": 0.606177031993866, + "learning_rate": 6.413845920232351e-06, + "loss": 1.5137, + "mean_token_accuracy": 0.641996776064237, + "num_tokens": 2076111995.0, + "step": 12383 + }, + { + "entropy": 1.7345014313856761, + "epoch": 1.3604405262146055, + "grad_norm": 0.7717848420143127, + "learning_rate": 6.41246833735298e-06, + "loss": 1.4069, + "mean_token_accuracy": 0.6651813685894012, + "num_tokens": 2076293865.0, + "step": 12384 + }, + { + "entropy": 1.6873282094796498, + "epoch": 1.3605503831259784, + "grad_norm": 0.6491485238075256, + "learning_rate": 6.411090899666912e-06, + "loss": 1.4593, + "mean_token_accuracy": 0.6515095929304758, + "num_tokens": 2076503891.0, + "step": 12385 + }, + { + "entropy": 1.6362064977486928, + "epoch": 1.3606602400373513, + "grad_norm": 0.669855535030365, + "learning_rate": 6.4097136072177516e-06, + "loss": 1.4198, + "mean_token_accuracy": 0.6423494170109431, + "num_tokens": 2076635813.0, + "step": 12386 + }, + { + "entropy": 1.7514414191246033, + "epoch": 1.3607700969487242, + "grad_norm": 0.683935284614563, + "learning_rate": 6.408336460049091e-06, + "loss": 1.3794, + "mean_token_accuracy": 0.6483311802148819, + "num_tokens": 2076766414.0, + "step": 12387 + }, + { + "entropy": 1.7777518530686696, + "epoch": 1.3608799538600973, + "grad_norm": 0.7260268926620483, + "learning_rate": 6.406959458204509e-06, + "loss": 1.4164, + "mean_token_accuracy": 0.6536405185858408, + "num_tokens": 2076906861.0, + "step": 12388 + }, + { + "entropy": 1.7372454206148784, + "epoch": 1.3609898107714702, + "grad_norm": 0.6218958497047424, + "learning_rate": 6.4055826017275895e-06, + "loss": 1.4287, + "mean_token_accuracy": 0.6501857141653696, + "num_tokens": 2077064213.0, + "step": 12389 + }, + { + "entropy": 1.6530345578988392, + "epoch": 1.361099667682843, + "grad_norm": 0.6436760425567627, + "learning_rate": 6.404205890661914e-06, + "loss": 1.4417, + "mean_token_accuracy": 0.6399639348189036, + "num_tokens": 2077282573.0, + "step": 12390 + }, + { + "entropy": 1.7672642767429352, + "epoch": 1.361209524594216, + "grad_norm": 0.6963381767272949, + "learning_rate": 6.40282932505105e-06, + "loss": 1.388, + "mean_token_accuracy": 0.6588474710782369, + "num_tokens": 2077494073.0, + "step": 12391 + }, + { + "entropy": 1.6798830231030781, + "epoch": 1.3613193815055888, + "grad_norm": 0.7534324526786804, + "learning_rate": 6.4014529049385674e-06, + "loss": 1.2571, + "mean_token_accuracy": 0.6731600165367126, + "num_tokens": 2077645291.0, + "step": 12392 + }, + { + "entropy": 1.6944253742694855, + "epoch": 1.361429238416962, + "grad_norm": 0.7895978093147278, + "learning_rate": 6.400076630368024e-06, + "loss": 1.3938, + "mean_token_accuracy": 0.6565722674131393, + "num_tokens": 2077779765.0, + "step": 12393 + }, + { + "entropy": 1.732394814491272, + "epoch": 1.3615390953283348, + "grad_norm": 0.7902349233627319, + "learning_rate": 6.398700501382983e-06, + "loss": 1.5484, + "mean_token_accuracy": 0.6475964114069939, + "num_tokens": 2077979044.0, + "step": 12394 + }, + { + "entropy": 1.6490447123845418, + "epoch": 1.3616489522397077, + "grad_norm": 0.6817672252655029, + "learning_rate": 6.397324518027002e-06, + "loss": 1.3966, + "mean_token_accuracy": 0.6636393964290619, + "num_tokens": 2078130750.0, + "step": 12395 + }, + { + "entropy": 1.7045618295669556, + "epoch": 1.3617588091510808, + "grad_norm": 0.6113632917404175, + "learning_rate": 6.395948680343625e-06, + "loss": 1.5267, + "mean_token_accuracy": 0.6461210399866104, + "num_tokens": 2078334424.0, + "step": 12396 + }, + { + "entropy": 1.677074631055196, + "epoch": 1.3618686660624537, + "grad_norm": 0.6405321955680847, + "learning_rate": 6.394572988376393e-06, + "loss": 1.3298, + "mean_token_accuracy": 0.6814304739236832, + "num_tokens": 2078483301.0, + "step": 12397 + }, + { + "entropy": 1.6795497337977092, + "epoch": 1.3619785229738266, + "grad_norm": 0.6742625832557678, + "learning_rate": 6.393197442168856e-06, + "loss": 1.4616, + "mean_token_accuracy": 0.6637395819028219, + "num_tokens": 2078651932.0, + "step": 12398 + }, + { + "entropy": 1.6983890235424042, + "epoch": 1.3620883798851995, + "grad_norm": 0.8591446280479431, + "learning_rate": 6.391822041764542e-06, + "loss": 1.3586, + "mean_token_accuracy": 0.6608823935190836, + "num_tokens": 2078783927.0, + "step": 12399 + }, + { + "entropy": 1.7535987198352814, + "epoch": 1.3621982367965724, + "grad_norm": 0.6175576448440552, + "learning_rate": 6.390446787206983e-06, + "loss": 1.5121, + "mean_token_accuracy": 0.6477769613265991, + "num_tokens": 2078957176.0, + "step": 12400 + }, + { + "entropy": 1.7856710652510326, + "epoch": 1.3623080937079455, + "grad_norm": 0.6982350945472717, + "learning_rate": 6.389071678539708e-06, + "loss": 1.4667, + "mean_token_accuracy": 0.6387214660644531, + "num_tokens": 2079129907.0, + "step": 12401 + }, + { + "entropy": 1.7471899092197418, + "epoch": 1.3624179506193184, + "grad_norm": 0.895235002040863, + "learning_rate": 6.387696715806233e-06, + "loss": 1.5033, + "mean_token_accuracy": 0.6521243900060654, + "num_tokens": 2079292376.0, + "step": 12402 + }, + { + "entropy": 1.702579249938329, + "epoch": 1.3625278075306912, + "grad_norm": 0.80966717004776, + "learning_rate": 6.3863218990500835e-06, + "loss": 1.424, + "mean_token_accuracy": 0.6629576434691747, + "num_tokens": 2079480359.0, + "step": 12403 + }, + { + "entropy": 1.6714877784252167, + "epoch": 1.3626376644420641, + "grad_norm": 0.7189324498176575, + "learning_rate": 6.384947228314765e-06, + "loss": 1.239, + "mean_token_accuracy": 0.6766127347946167, + "num_tokens": 2079633790.0, + "step": 12404 + }, + { + "entropy": 1.6770406166712444, + "epoch": 1.362747521353437, + "grad_norm": 0.5395998358726501, + "learning_rate": 6.383572703643786e-06, + "loss": 1.4526, + "mean_token_accuracy": 0.6433726151784261, + "num_tokens": 2079890149.0, + "step": 12405 + }, + { + "entropy": 1.6809902389844258, + "epoch": 1.3628573782648101, + "grad_norm": 0.6718010306358337, + "learning_rate": 6.382198325080649e-06, + "loss": 1.5027, + "mean_token_accuracy": 0.636824240287145, + "num_tokens": 2080094393.0, + "step": 12406 + }, + { + "entropy": 1.6974414388338726, + "epoch": 1.362967235176183, + "grad_norm": 0.6535385847091675, + "learning_rate": 6.380824092668857e-06, + "loss": 1.3319, + "mean_token_accuracy": 0.6718876659870148, + "num_tokens": 2080234231.0, + "step": 12407 + }, + { + "entropy": 1.701567719380061, + "epoch": 1.363077092087556, + "grad_norm": 0.5973226428031921, + "learning_rate": 6.379450006451902e-06, + "loss": 1.5466, + "mean_token_accuracy": 0.6344873458147049, + "num_tokens": 2080439315.0, + "step": 12408 + }, + { + "entropy": 1.6704054077466328, + "epoch": 1.363186948998929, + "grad_norm": 0.6961952447891235, + "learning_rate": 6.378076066473269e-06, + "loss": 1.4765, + "mean_token_accuracy": 0.6445471247037252, + "num_tokens": 2080636110.0, + "step": 12409 + }, + { + "entropy": 1.7136572698752086, + "epoch": 1.363296805910302, + "grad_norm": 0.8393598198890686, + "learning_rate": 6.37670227277645e-06, + "loss": 1.4695, + "mean_token_accuracy": 0.6775861183802286, + "num_tokens": 2080766358.0, + "step": 12410 + }, + { + "entropy": 1.6980823675791423, + "epoch": 1.3634066628216748, + "grad_norm": 0.7727437019348145, + "learning_rate": 6.37532862540492e-06, + "loss": 1.4852, + "mean_token_accuracy": 0.6424992879231771, + "num_tokens": 2080940544.0, + "step": 12411 + }, + { + "entropy": 1.7293777863184612, + "epoch": 1.3635165197330477, + "grad_norm": 0.7608976364135742, + "learning_rate": 6.3739551244021515e-06, + "loss": 1.4743, + "mean_token_accuracy": 0.6436664660771688, + "num_tokens": 2081098947.0, + "step": 12412 + }, + { + "entropy": 1.7463851571083069, + "epoch": 1.3636263766444205, + "grad_norm": 0.6330917477607727, + "learning_rate": 6.372581769811621e-06, + "loss": 1.3279, + "mean_token_accuracy": 0.6707935730616251, + "num_tokens": 2081249943.0, + "step": 12413 + }, + { + "entropy": 1.744843582312266, + "epoch": 1.3637362335557937, + "grad_norm": 0.8132315874099731, + "learning_rate": 6.37120856167679e-06, + "loss": 1.6554, + "mean_token_accuracy": 0.635198379556338, + "num_tokens": 2081409375.0, + "step": 12414 + }, + { + "entropy": 1.6739271680514018, + "epoch": 1.3638460904671665, + "grad_norm": 0.7317605018615723, + "learning_rate": 6.369835500041126e-06, + "loss": 1.3227, + "mean_token_accuracy": 0.663427397608757, + "num_tokens": 2081563302.0, + "step": 12415 + }, + { + "entropy": 1.6746556460857391, + "epoch": 1.3639559473785394, + "grad_norm": 0.7247527837753296, + "learning_rate": 6.368462584948082e-06, + "loss": 1.3398, + "mean_token_accuracy": 0.6720200031995773, + "num_tokens": 2081684223.0, + "step": 12416 + }, + { + "entropy": 1.6810146073500316, + "epoch": 1.3640658042899123, + "grad_norm": 0.7046148777008057, + "learning_rate": 6.367089816441106e-06, + "loss": 1.3198, + "mean_token_accuracy": 0.6763695975144705, + "num_tokens": 2081804676.0, + "step": 12417 + }, + { + "entropy": 1.7369506259759266, + "epoch": 1.3641756612012852, + "grad_norm": 0.679513156414032, + "learning_rate": 6.36571719456365e-06, + "loss": 1.4399, + "mean_token_accuracy": 0.652540922164917, + "num_tokens": 2081966586.0, + "step": 12418 + }, + { + "entropy": 1.7097415924072266, + "epoch": 1.3642855181126583, + "grad_norm": 0.6426488161087036, + "learning_rate": 6.364344719359161e-06, + "loss": 1.4498, + "mean_token_accuracy": 0.6472660650809606, + "num_tokens": 2082178544.0, + "step": 12419 + }, + { + "entropy": 1.7177700698375702, + "epoch": 1.3643953750240312, + "grad_norm": 0.6975800395011902, + "learning_rate": 6.362972390871072e-06, + "loss": 1.4556, + "mean_token_accuracy": 0.6420897940794627, + "num_tokens": 2082354325.0, + "step": 12420 + }, + { + "entropy": 1.6956178347269695, + "epoch": 1.364505231935404, + "grad_norm": 0.6548015475273132, + "learning_rate": 6.361600209142813e-06, + "loss": 1.2944, + "mean_token_accuracy": 0.6646904796361923, + "num_tokens": 2082470336.0, + "step": 12421 + }, + { + "entropy": 1.6302022536595662, + "epoch": 1.3646150888467772, + "grad_norm": 1.402173638343811, + "learning_rate": 6.360228174217822e-06, + "loss": 1.3346, + "mean_token_accuracy": 0.6607320159673691, + "num_tokens": 2082670245.0, + "step": 12422 + }, + { + "entropy": 1.7581724623839061, + "epoch": 1.36472494575815, + "grad_norm": 0.7247641682624817, + "learning_rate": 6.358856286139517e-06, + "loss": 1.3864, + "mean_token_accuracy": 0.65654323498408, + "num_tokens": 2082832020.0, + "step": 12423 + }, + { + "entropy": 1.7525269190470378, + "epoch": 1.364834802669523, + "grad_norm": 0.7007389664649963, + "learning_rate": 6.3574845449513175e-06, + "loss": 1.5072, + "mean_token_accuracy": 0.6481544723113378, + "num_tokens": 2082991463.0, + "step": 12424 + }, + { + "entropy": 1.693600445985794, + "epoch": 1.3649446595808958, + "grad_norm": 0.7632626891136169, + "learning_rate": 6.356112950696642e-06, + "loss": 1.3861, + "mean_token_accuracy": 0.6590569615364075, + "num_tokens": 2083136636.0, + "step": 12425 + }, + { + "entropy": 1.680637151002884, + "epoch": 1.3650545164922687, + "grad_norm": 0.6591139435768127, + "learning_rate": 6.354741503418897e-06, + "loss": 1.4859, + "mean_token_accuracy": 0.6472095102071762, + "num_tokens": 2083318799.0, + "step": 12426 + }, + { + "entropy": 1.7445678512255351, + "epoch": 1.3651643734036418, + "grad_norm": 0.8773960471153259, + "learning_rate": 6.353370203161493e-06, + "loss": 1.5806, + "mean_token_accuracy": 0.6431012004613876, + "num_tokens": 2083489179.0, + "step": 12427 + }, + { + "entropy": 1.6873707075913746, + "epoch": 1.3652742303150147, + "grad_norm": 0.6120126843452454, + "learning_rate": 6.351999049967829e-06, + "loss": 1.4179, + "mean_token_accuracy": 0.6483838905890783, + "num_tokens": 2083675182.0, + "step": 12428 + }, + { + "entropy": 1.7138080596923828, + "epoch": 1.3653840872263876, + "grad_norm": 0.7926364541053772, + "learning_rate": 6.350628043881296e-06, + "loss": 1.3874, + "mean_token_accuracy": 0.6502122531334559, + "num_tokens": 2083802000.0, + "step": 12429 + }, + { + "entropy": 1.6688072582085927, + "epoch": 1.3654939441377605, + "grad_norm": 0.6590139865875244, + "learning_rate": 6.349257184945291e-06, + "loss": 1.3461, + "mean_token_accuracy": 0.6638127416372299, + "num_tokens": 2083975602.0, + "step": 12430 + }, + { + "entropy": 1.6825914184252422, + "epoch": 1.3656038010491334, + "grad_norm": 0.6948336958885193, + "learning_rate": 6.347886473203204e-06, + "loss": 1.4358, + "mean_token_accuracy": 0.6588354905446371, + "num_tokens": 2084152778.0, + "step": 12431 + }, + { + "entropy": 1.6966508328914642, + "epoch": 1.3657136579605065, + "grad_norm": 0.6842400431632996, + "learning_rate": 6.346515908698414e-06, + "loss": 1.4629, + "mean_token_accuracy": 0.6462537546952566, + "num_tokens": 2084319697.0, + "step": 12432 + }, + { + "entropy": 1.7046682834625244, + "epoch": 1.3658235148718794, + "grad_norm": 0.6207724213600159, + "learning_rate": 6.345145491474295e-06, + "loss": 1.3735, + "mean_token_accuracy": 0.6687126259009043, + "num_tokens": 2084458870.0, + "step": 12433 + }, + { + "entropy": 1.6672801077365875, + "epoch": 1.3659333717832522, + "grad_norm": 0.7119844555854797, + "learning_rate": 6.3437752215742264e-06, + "loss": 1.4564, + "mean_token_accuracy": 0.672510157028834, + "num_tokens": 2084603570.0, + "step": 12434 + }, + { + "entropy": 1.6629460255304973, + "epoch": 1.3660432286946254, + "grad_norm": 0.7983689904212952, + "learning_rate": 6.3424050990415745e-06, + "loss": 1.4319, + "mean_token_accuracy": 0.6575490633646647, + "num_tokens": 2084752897.0, + "step": 12435 + }, + { + "entropy": 1.6838939388593037, + "epoch": 1.3661530856059982, + "grad_norm": 0.6658820509910583, + "learning_rate": 6.341035123919699e-06, + "loss": 1.6706, + "mean_token_accuracy": 0.6166095087925593, + "num_tokens": 2084981969.0, + "step": 12436 + }, + { + "entropy": 1.6812881429990132, + "epoch": 1.3662629425173711, + "grad_norm": 0.6590875387191772, + "learning_rate": 6.339665296251966e-06, + "loss": 1.4998, + "mean_token_accuracy": 0.6573397864898046, + "num_tokens": 2085171039.0, + "step": 12437 + }, + { + "entropy": 1.706734577814738, + "epoch": 1.366372799428744, + "grad_norm": 0.6647880673408508, + "learning_rate": 6.338295616081722e-06, + "loss": 1.4314, + "mean_token_accuracy": 0.6501033653815588, + "num_tokens": 2085335853.0, + "step": 12438 + }, + { + "entropy": 1.6170736749966939, + "epoch": 1.366482656340117, + "grad_norm": 0.7858694195747375, + "learning_rate": 6.336926083452326e-06, + "loss": 1.2863, + "mean_token_accuracy": 0.6720435669024786, + "num_tokens": 2085498251.0, + "step": 12439 + }, + { + "entropy": 1.6985772848129272, + "epoch": 1.36659251325149, + "grad_norm": 0.6605982184410095, + "learning_rate": 6.335556698407117e-06, + "loss": 1.3983, + "mean_token_accuracy": 0.6527110983928045, + "num_tokens": 2085671772.0, + "step": 12440 + }, + { + "entropy": 1.6974250773588817, + "epoch": 1.366702370162863, + "grad_norm": 0.7720880508422852, + "learning_rate": 6.334187460989434e-06, + "loss": 1.3258, + "mean_token_accuracy": 0.6786187787850698, + "num_tokens": 2085791125.0, + "step": 12441 + }, + { + "entropy": 1.7007473905881245, + "epoch": 1.3668122270742358, + "grad_norm": 0.6617766618728638, + "learning_rate": 6.332818371242615e-06, + "loss": 1.3439, + "mean_token_accuracy": 0.6567811866601309, + "num_tokens": 2085914564.0, + "step": 12442 + }, + { + "entropy": 1.6779978076616924, + "epoch": 1.3669220839856089, + "grad_norm": 0.6167494058609009, + "learning_rate": 6.331449429209998e-06, + "loss": 1.5439, + "mean_token_accuracy": 0.6489314138889313, + "num_tokens": 2086101783.0, + "step": 12443 + }, + { + "entropy": 1.6638370255629222, + "epoch": 1.3670319408969815, + "grad_norm": 0.6931138634681702, + "learning_rate": 6.330080634934896e-06, + "loss": 1.245, + "mean_token_accuracy": 0.6787913938363394, + "num_tokens": 2086270110.0, + "step": 12444 + }, + { + "entropy": 1.7775676747163136, + "epoch": 1.3671417978083547, + "grad_norm": 0.673394501209259, + "learning_rate": 6.3287119884606385e-06, + "loss": 1.3892, + "mean_token_accuracy": 0.6486051231622696, + "num_tokens": 2086442552.0, + "step": 12445 + }, + { + "entropy": 1.7234329481919606, + "epoch": 1.3672516547197275, + "grad_norm": 0.6274213194847107, + "learning_rate": 6.327343489830544e-06, + "loss": 1.409, + "mean_token_accuracy": 0.6496559977531433, + "num_tokens": 2086600998.0, + "step": 12446 + }, + { + "entropy": 1.6764400204022725, + "epoch": 1.3673615116311004, + "grad_norm": 0.5889531970024109, + "learning_rate": 6.3259751390879235e-06, + "loss": 1.4925, + "mean_token_accuracy": 0.6441396176815033, + "num_tokens": 2086809020.0, + "step": 12447 + }, + { + "entropy": 1.7784501016139984, + "epoch": 1.3674713685424735, + "grad_norm": 0.6351875066757202, + "learning_rate": 6.324606936276081e-06, + "loss": 1.3461, + "mean_token_accuracy": 0.6528228173653284, + "num_tokens": 2086958944.0, + "step": 12448 + }, + { + "entropy": 1.7382746239503224, + "epoch": 1.3675812254538464, + "grad_norm": 0.7826247811317444, + "learning_rate": 6.323238881438322e-06, + "loss": 1.2748, + "mean_token_accuracy": 0.6688608477512995, + "num_tokens": 2087071918.0, + "step": 12449 + }, + { + "entropy": 1.6849194665749867, + "epoch": 1.3676910823652193, + "grad_norm": 0.8119662404060364, + "learning_rate": 6.321870974617945e-06, + "loss": 1.4217, + "mean_token_accuracy": 0.6616232146819433, + "num_tokens": 2087246848.0, + "step": 12450 + }, + { + "entropy": 1.6627297898133595, + "epoch": 1.3678009392765922, + "grad_norm": 0.7588232755661011, + "learning_rate": 6.320503215858247e-06, + "loss": 1.3605, + "mean_token_accuracy": 0.6651882280906042, + "num_tokens": 2087390127.0, + "step": 12451 + }, + { + "entropy": 1.6607940097649891, + "epoch": 1.367910796187965, + "grad_norm": 0.6578659415245056, + "learning_rate": 6.3191356052025125e-06, + "loss": 1.3467, + "mean_token_accuracy": 0.663345048824946, + "num_tokens": 2087593548.0, + "step": 12452 + }, + { + "entropy": 1.7427105605602264, + "epoch": 1.3680206530993382, + "grad_norm": 0.6504858136177063, + "learning_rate": 6.317768142694023e-06, + "loss": 1.5592, + "mean_token_accuracy": 0.6351048996051153, + "num_tokens": 2087811467.0, + "step": 12453 + }, + { + "entropy": 1.7341377437114716, + "epoch": 1.368130510010711, + "grad_norm": 0.7811003923416138, + "learning_rate": 6.316400828376067e-06, + "loss": 1.3086, + "mean_token_accuracy": 0.6740827312072118, + "num_tokens": 2087971627.0, + "step": 12454 + }, + { + "entropy": 1.73456209897995, + "epoch": 1.368240366922084, + "grad_norm": 0.8186246156692505, + "learning_rate": 6.315033662291913e-06, + "loss": 1.3032, + "mean_token_accuracy": 0.6766839226086935, + "num_tokens": 2088104619.0, + "step": 12455 + }, + { + "entropy": 1.745054046312968, + "epoch": 1.368350223833457, + "grad_norm": 0.6494616270065308, + "learning_rate": 6.31366664448483e-06, + "loss": 1.4788, + "mean_token_accuracy": 0.6535368661085764, + "num_tokens": 2088264333.0, + "step": 12456 + }, + { + "entropy": 1.6955593327681224, + "epoch": 1.3684600807448297, + "grad_norm": 0.6850633025169373, + "learning_rate": 6.312299774998088e-06, + "loss": 1.3775, + "mean_token_accuracy": 0.6563636163870493, + "num_tokens": 2088428326.0, + "step": 12457 + }, + { + "entropy": 1.6658681730429332, + "epoch": 1.3685699376562028, + "grad_norm": 0.7219941020011902, + "learning_rate": 6.310933053874944e-06, + "loss": 1.3378, + "mean_token_accuracy": 0.6682392458120981, + "num_tokens": 2088585255.0, + "step": 12458 + }, + { + "entropy": 1.6764575441678364, + "epoch": 1.3686797945675757, + "grad_norm": 0.6834865212440491, + "learning_rate": 6.309566481158657e-06, + "loss": 1.4419, + "mean_token_accuracy": 0.6662019590536753, + "num_tokens": 2088742531.0, + "step": 12459 + }, + { + "entropy": 1.6608708600203197, + "epoch": 1.3687896514789486, + "grad_norm": 0.5805539488792419, + "learning_rate": 6.30820005689248e-06, + "loss": 1.3691, + "mean_token_accuracy": 0.667238692442576, + "num_tokens": 2088931283.0, + "step": 12460 + }, + { + "entropy": 1.6612081130345662, + "epoch": 1.3688995083903217, + "grad_norm": 0.6132814288139343, + "learning_rate": 6.306833781119653e-06, + "loss": 1.3991, + "mean_token_accuracy": 0.6704057107369105, + "num_tokens": 2089145116.0, + "step": 12461 + }, + { + "entropy": 1.656867245833079, + "epoch": 1.3690093653016946, + "grad_norm": 0.587921142578125, + "learning_rate": 6.305467653883419e-06, + "loss": 1.3241, + "mean_token_accuracy": 0.6596115976572037, + "num_tokens": 2089314770.0, + "step": 12462 + }, + { + "entropy": 1.6916143695513408, + "epoch": 1.3691192222130675, + "grad_norm": 0.6570534110069275, + "learning_rate": 6.304101675227025e-06, + "loss": 1.2452, + "mean_token_accuracy": 0.6774038225412369, + "num_tokens": 2089454204.0, + "step": 12463 + }, + { + "entropy": 1.655198593934377, + "epoch": 1.3692290791244404, + "grad_norm": 0.7079626321792603, + "learning_rate": 6.3027358451936945e-06, + "loss": 1.4054, + "mean_token_accuracy": 0.6695781598488489, + "num_tokens": 2089589977.0, + "step": 12464 + }, + { + "entropy": 1.7478280862172444, + "epoch": 1.3693389360358132, + "grad_norm": 0.8134309649467468, + "learning_rate": 6.301370163826657e-06, + "loss": 1.3172, + "mean_token_accuracy": 0.6582539429267248, + "num_tokens": 2089722453.0, + "step": 12465 + }, + { + "entropy": 1.7540696163972218, + "epoch": 1.3694487929471864, + "grad_norm": 0.8033037781715393, + "learning_rate": 6.30000463116914e-06, + "loss": 1.4302, + "mean_token_accuracy": 0.6428679327170054, + "num_tokens": 2089859787.0, + "step": 12466 + }, + { + "entropy": 1.7167806526025136, + "epoch": 1.3695586498585592, + "grad_norm": 0.9351958632469177, + "learning_rate": 6.298639247264356e-06, + "loss": 1.3128, + "mean_token_accuracy": 0.6571259746948878, + "num_tokens": 2090018785.0, + "step": 12467 + }, + { + "entropy": 1.6641875207424164, + "epoch": 1.3696685067699321, + "grad_norm": 0.6932764053344727, + "learning_rate": 6.297274012155521e-06, + "loss": 1.4692, + "mean_token_accuracy": 0.6500126868486404, + "num_tokens": 2090182955.0, + "step": 12468 + }, + { + "entropy": 1.6753845711549122, + "epoch": 1.3697783636813052, + "grad_norm": 0.6691131591796875, + "learning_rate": 6.295908925885845e-06, + "loss": 1.3916, + "mean_token_accuracy": 0.6713538318872452, + "num_tokens": 2090346994.0, + "step": 12469 + }, + { + "entropy": 1.715299944082896, + "epoch": 1.3698882205926781, + "grad_norm": 0.7135533690452576, + "learning_rate": 6.294543988498529e-06, + "loss": 1.2943, + "mean_token_accuracy": 0.6677762617667516, + "num_tokens": 2090522173.0, + "step": 12470 + }, + { + "entropy": 1.7170192102591197, + "epoch": 1.369998077504051, + "grad_norm": 0.60918790102005, + "learning_rate": 6.293179200036781e-06, + "loss": 1.4456, + "mean_token_accuracy": 0.6431434949239095, + "num_tokens": 2090703501.0, + "step": 12471 + }, + { + "entropy": 1.7095450460910797, + "epoch": 1.370107934415424, + "grad_norm": 0.8031734824180603, + "learning_rate": 6.29181456054379e-06, + "loss": 1.3702, + "mean_token_accuracy": 0.653916930158933, + "num_tokens": 2090852167.0, + "step": 12472 + }, + { + "entropy": 1.6358346939086914, + "epoch": 1.3702177913267968, + "grad_norm": 0.7461312413215637, + "learning_rate": 6.290450070062741e-06, + "loss": 1.3405, + "mean_token_accuracy": 0.6676389326651891, + "num_tokens": 2091034902.0, + "step": 12473 + }, + { + "entropy": 1.6393639147281647, + "epoch": 1.3703276482381699, + "grad_norm": 0.7326330542564392, + "learning_rate": 6.289085728636827e-06, + "loss": 1.4606, + "mean_token_accuracy": 0.6608719974756241, + "num_tokens": 2091197060.0, + "step": 12474 + }, + { + "entropy": 1.702725499868393, + "epoch": 1.3704375051495428, + "grad_norm": 0.7266789674758911, + "learning_rate": 6.287721536309228e-06, + "loss": 1.468, + "mean_token_accuracy": 0.6516217837731043, + "num_tokens": 2091354267.0, + "step": 12475 + }, + { + "entropy": 1.7397367060184479, + "epoch": 1.3705473620609157, + "grad_norm": 0.6959664821624756, + "learning_rate": 6.286357493123121e-06, + "loss": 1.3982, + "mean_token_accuracy": 0.644005666176478, + "num_tokens": 2091481641.0, + "step": 12476 + }, + { + "entropy": 1.6641556123892467, + "epoch": 1.3706572189722885, + "grad_norm": 0.5910245776176453, + "learning_rate": 6.284993599121671e-06, + "loss": 1.4441, + "mean_token_accuracy": 0.6538594514131546, + "num_tokens": 2091691502.0, + "step": 12477 + }, + { + "entropy": 1.6857453385988872, + "epoch": 1.3707670758836614, + "grad_norm": 0.7743722796440125, + "learning_rate": 6.283629854348053e-06, + "loss": 1.5228, + "mean_token_accuracy": 0.6551050196091334, + "num_tokens": 2091850943.0, + "step": 12478 + }, + { + "entropy": 1.7026999294757843, + "epoch": 1.3708769327950345, + "grad_norm": 0.8539242744445801, + "learning_rate": 6.2822662588454255e-06, + "loss": 1.2969, + "mean_token_accuracy": 0.6824150482813517, + "num_tokens": 2091973000.0, + "step": 12479 + }, + { + "entropy": 1.650414725144704, + "epoch": 1.3709867897064074, + "grad_norm": 0.6949166059494019, + "learning_rate": 6.280902812656941e-06, + "loss": 1.3121, + "mean_token_accuracy": 0.668070966998736, + "num_tokens": 2092112881.0, + "step": 12480 + }, + { + "entropy": 1.7248587508996327, + "epoch": 1.3710966466177803, + "grad_norm": 0.843644917011261, + "learning_rate": 6.279539515825759e-06, + "loss": 1.5992, + "mean_token_accuracy": 0.6491927405198415, + "num_tokens": 2092288051.0, + "step": 12481 + }, + { + "entropy": 1.6901381611824036, + "epoch": 1.3712065035291534, + "grad_norm": 0.6566561460494995, + "learning_rate": 6.2781763683950216e-06, + "loss": 1.4618, + "mean_token_accuracy": 0.6334747324387232, + "num_tokens": 2092497839.0, + "step": 12482 + }, + { + "entropy": 1.6996184488137562, + "epoch": 1.3713163604405263, + "grad_norm": 0.6987338066101074, + "learning_rate": 6.276813370407876e-06, + "loss": 1.476, + "mean_token_accuracy": 0.6629880964756012, + "num_tokens": 2092636759.0, + "step": 12483 + }, + { + "entropy": 1.750564714272817, + "epoch": 1.3714262173518992, + "grad_norm": 0.8554201722145081, + "learning_rate": 6.27545052190746e-06, + "loss": 1.4742, + "mean_token_accuracy": 0.6432334631681442, + "num_tokens": 2092770538.0, + "step": 12484 + }, + { + "entropy": 1.6747083564599354, + "epoch": 1.371536074263272, + "grad_norm": 0.6590608358383179, + "learning_rate": 6.274087822936904e-06, + "loss": 1.3891, + "mean_token_accuracy": 0.6730438470840454, + "num_tokens": 2092951129.0, + "step": 12485 + }, + { + "entropy": 1.6601560016473134, + "epoch": 1.371645931174645, + "grad_norm": 0.7627508640289307, + "learning_rate": 6.272725273539337e-06, + "loss": 1.3777, + "mean_token_accuracy": 0.6717945039272308, + "num_tokens": 2093112437.0, + "step": 12486 + }, + { + "entropy": 1.6760172843933105, + "epoch": 1.371755788086018, + "grad_norm": 0.6052493453025818, + "learning_rate": 6.271362873757889e-06, + "loss": 1.3659, + "mean_token_accuracy": 0.6492635011672974, + "num_tokens": 2093284948.0, + "step": 12487 + }, + { + "entropy": 1.7177870472272236, + "epoch": 1.371865644997391, + "grad_norm": 0.6940590739250183, + "learning_rate": 6.270000623635675e-06, + "loss": 1.3116, + "mean_token_accuracy": 0.6777070065339407, + "num_tokens": 2093413636.0, + "step": 12488 + }, + { + "entropy": 1.7334169745445251, + "epoch": 1.3719755019087638, + "grad_norm": 0.7038564682006836, + "learning_rate": 6.268638523215807e-06, + "loss": 1.5389, + "mean_token_accuracy": 0.6339151461919149, + "num_tokens": 2093610103.0, + "step": 12489 + }, + { + "entropy": 1.7048886219660442, + "epoch": 1.3720853588201367, + "grad_norm": 0.7182838916778564, + "learning_rate": 6.267276572541401e-06, + "loss": 1.4323, + "mean_token_accuracy": 0.6515081773201624, + "num_tokens": 2093782215.0, + "step": 12490 + }, + { + "entropy": 1.6638799210389454, + "epoch": 1.3721952157315096, + "grad_norm": 0.6247413754463196, + "learning_rate": 6.265914771655559e-06, + "loss": 1.4979, + "mean_token_accuracy": 0.649769072731336, + "num_tokens": 2094006255.0, + "step": 12491 + }, + { + "entropy": 1.684423953294754, + "epoch": 1.3723050726428827, + "grad_norm": 0.6764265894889832, + "learning_rate": 6.264553120601378e-06, + "loss": 1.44, + "mean_token_accuracy": 0.6405173540115356, + "num_tokens": 2094154650.0, + "step": 12492 + }, + { + "entropy": 1.707674354314804, + "epoch": 1.3724149295542556, + "grad_norm": 0.6965126395225525, + "learning_rate": 6.26319161942196e-06, + "loss": 1.321, + "mean_token_accuracy": 0.6638032595316569, + "num_tokens": 2094304169.0, + "step": 12493 + }, + { + "entropy": 1.7017245789368947, + "epoch": 1.3725247864656285, + "grad_norm": 0.687353253364563, + "learning_rate": 6.261830268160388e-06, + "loss": 1.2905, + "mean_token_accuracy": 0.6623808195193609, + "num_tokens": 2094463463.0, + "step": 12494 + }, + { + "entropy": 1.7572371661663055, + "epoch": 1.3726346433770016, + "grad_norm": 0.6422104239463806, + "learning_rate": 6.260469066859758e-06, + "loss": 1.5449, + "mean_token_accuracy": 0.6423913687467575, + "num_tokens": 2094645342.0, + "step": 12495 + }, + { + "entropy": 1.7151329219341278, + "epoch": 1.3727445002883745, + "grad_norm": 0.7359181046485901, + "learning_rate": 6.259108015563146e-06, + "loss": 1.4197, + "mean_token_accuracy": 0.6493860632181168, + "num_tokens": 2094828348.0, + "step": 12496 + }, + { + "entropy": 1.71349502603213, + "epoch": 1.3728543571997474, + "grad_norm": 0.6822761297225952, + "learning_rate": 6.257747114313626e-06, + "loss": 1.4804, + "mean_token_accuracy": 0.6557674954334894, + "num_tokens": 2095006520.0, + "step": 12497 + }, + { + "entropy": 1.688800722360611, + "epoch": 1.3729642141111202, + "grad_norm": 0.5929533839225769, + "learning_rate": 6.256386363154272e-06, + "loss": 1.3362, + "mean_token_accuracy": 0.6658134708801905, + "num_tokens": 2095165179.0, + "step": 12498 + }, + { + "entropy": 1.7582799593607585, + "epoch": 1.3730740710224931, + "grad_norm": 0.6980983018875122, + "learning_rate": 6.255025762128156e-06, + "loss": 1.5299, + "mean_token_accuracy": 0.6501601040363312, + "num_tokens": 2095338814.0, + "step": 12499 + }, + { + "entropy": 1.6549382110436757, + "epoch": 1.3731839279338662, + "grad_norm": 0.6574755907058716, + "learning_rate": 6.253665311278337e-06, + "loss": 1.4851, + "mean_token_accuracy": 0.6519963542620341, + "num_tokens": 2095554250.0, + "step": 12500 + }, + { + "entropy": 1.7362759113311768, + "epoch": 1.3732937848452391, + "grad_norm": 0.6475224494934082, + "learning_rate": 6.252305010647868e-06, + "loss": 1.3406, + "mean_token_accuracy": 0.6704058945178986, + "num_tokens": 2095687229.0, + "step": 12501 + }, + { + "entropy": 1.683371404806773, + "epoch": 1.373403641756612, + "grad_norm": 0.6420804858207703, + "learning_rate": 6.250944860279809e-06, + "loss": 1.344, + "mean_token_accuracy": 0.6628765761852264, + "num_tokens": 2095919210.0, + "step": 12502 + }, + { + "entropy": 1.7301292022069295, + "epoch": 1.3735134986679849, + "grad_norm": 0.8406617641448975, + "learning_rate": 6.249584860217206e-06, + "loss": 1.4015, + "mean_token_accuracy": 0.6547163327534994, + "num_tokens": 2096052146.0, + "step": 12503 + }, + { + "entropy": 1.6722088654836018, + "epoch": 1.3736233555793578, + "grad_norm": 0.7558131814002991, + "learning_rate": 6.248225010503098e-06, + "loss": 1.4738, + "mean_token_accuracy": 0.6520429998636246, + "num_tokens": 2096209673.0, + "step": 12504 + }, + { + "entropy": 1.684027413527171, + "epoch": 1.3737332124907309, + "grad_norm": 0.8138048648834229, + "learning_rate": 6.246865311180532e-06, + "loss": 1.5581, + "mean_token_accuracy": 0.6619268457094828, + "num_tokens": 2096376074.0, + "step": 12505 + }, + { + "entropy": 1.666286826133728, + "epoch": 1.3738430694021038, + "grad_norm": 0.6682919859886169, + "learning_rate": 6.245505762292532e-06, + "loss": 1.3127, + "mean_token_accuracy": 0.6639792720476786, + "num_tokens": 2096564264.0, + "step": 12506 + }, + { + "entropy": 1.760203758875529, + "epoch": 1.3739529263134767, + "grad_norm": 0.7202064990997314, + "learning_rate": 6.2441463638821355e-06, + "loss": 1.5849, + "mean_token_accuracy": 0.6438749060034752, + "num_tokens": 2096724407.0, + "step": 12507 + }, + { + "entropy": 1.758449226617813, + "epoch": 1.3740627832248498, + "grad_norm": 0.6769862771034241, + "learning_rate": 6.242787115992364e-06, + "loss": 1.5019, + "mean_token_accuracy": 0.6401093502839407, + "num_tokens": 2096904566.0, + "step": 12508 + }, + { + "entropy": 1.6738151510556538, + "epoch": 1.3741726401362226, + "grad_norm": 0.5517680644989014, + "learning_rate": 6.241428018666234e-06, + "loss": 1.3892, + "mean_token_accuracy": 0.655499001344045, + "num_tokens": 2097086758.0, + "step": 12509 + }, + { + "entropy": 1.714376340309779, + "epoch": 1.3742824970475955, + "grad_norm": 0.6948719620704651, + "learning_rate": 6.240069071946762e-06, + "loss": 1.4616, + "mean_token_accuracy": 0.6460278133551279, + "num_tokens": 2097254724.0, + "step": 12510 + }, + { + "entropy": 1.7263469596703847, + "epoch": 1.3743923539589684, + "grad_norm": 0.6415128707885742, + "learning_rate": 6.238710275876962e-06, + "loss": 1.3862, + "mean_token_accuracy": 0.6575345396995544, + "num_tokens": 2097417417.0, + "step": 12511 + }, + { + "entropy": 1.698015163342158, + "epoch": 1.3745022108703413, + "grad_norm": 0.6114696264266968, + "learning_rate": 6.237351630499837e-06, + "loss": 1.4891, + "mean_token_accuracy": 0.6359892090161642, + "num_tokens": 2097646264.0, + "step": 12512 + }, + { + "entropy": 1.764273762702942, + "epoch": 1.3746120677817144, + "grad_norm": 0.9009761810302734, + "learning_rate": 6.235993135858387e-06, + "loss": 1.4376, + "mean_token_accuracy": 0.6558701246976852, + "num_tokens": 2097847731.0, + "step": 12513 + }, + { + "entropy": 1.7166384359200795, + "epoch": 1.3747219246930873, + "grad_norm": 0.7256246209144592, + "learning_rate": 6.234634791995603e-06, + "loss": 1.2758, + "mean_token_accuracy": 0.6786874433358511, + "num_tokens": 2097976363.0, + "step": 12514 + }, + { + "entropy": 1.7404154340426128, + "epoch": 1.3748317816044602, + "grad_norm": 0.7519568800926208, + "learning_rate": 6.233276598954485e-06, + "loss": 1.4151, + "mean_token_accuracy": 0.6605635484059652, + "num_tokens": 2098120603.0, + "step": 12515 + }, + { + "entropy": 1.6740521490573883, + "epoch": 1.374941638515833, + "grad_norm": 0.7059155106544495, + "learning_rate": 6.231918556778014e-06, + "loss": 1.3228, + "mean_token_accuracy": 0.6625839471817017, + "num_tokens": 2098259613.0, + "step": 12516 + }, + { + "entropy": 1.7056198716163635, + "epoch": 1.375051495427206, + "grad_norm": 0.6181541085243225, + "learning_rate": 6.2305606655091685e-06, + "loss": 1.347, + "mean_token_accuracy": 0.6637411365906397, + "num_tokens": 2098407106.0, + "step": 12517 + }, + { + "entropy": 1.7412952582041423, + "epoch": 1.375161352338579, + "grad_norm": 0.645773708820343, + "learning_rate": 6.229202925190931e-06, + "loss": 1.5052, + "mean_token_accuracy": 0.6412904262542725, + "num_tokens": 2098599802.0, + "step": 12518 + }, + { + "entropy": 1.7033185958862305, + "epoch": 1.375271209249952, + "grad_norm": 0.7069703936576843, + "learning_rate": 6.227845335866271e-06, + "loss": 1.3859, + "mean_token_accuracy": 0.6646387676397959, + "num_tokens": 2098760048.0, + "step": 12519 + }, + { + "entropy": 1.7417400777339935, + "epoch": 1.3753810661613248, + "grad_norm": 0.8793759346008301, + "learning_rate": 6.226487897578159e-06, + "loss": 1.3239, + "mean_token_accuracy": 0.665963664650917, + "num_tokens": 2098930969.0, + "step": 12520 + }, + { + "entropy": 1.6477711200714111, + "epoch": 1.375490923072698, + "grad_norm": 0.8466112017631531, + "learning_rate": 6.22513061036955e-06, + "loss": 1.3336, + "mean_token_accuracy": 0.6694683879613876, + "num_tokens": 2099084185.0, + "step": 12521 + }, + { + "entropy": 1.7149950762589772, + "epoch": 1.3756007799840708, + "grad_norm": 0.8220770359039307, + "learning_rate": 6.223773474283408e-06, + "loss": 1.387, + "mean_token_accuracy": 0.6811329424381256, + "num_tokens": 2099253890.0, + "step": 12522 + }, + { + "entropy": 1.6823686361312866, + "epoch": 1.3757106368954437, + "grad_norm": 0.6571083664894104, + "learning_rate": 6.222416489362683e-06, + "loss": 1.2217, + "mean_token_accuracy": 0.6786747376124064, + "num_tokens": 2099388467.0, + "step": 12523 + }, + { + "entropy": 1.6723161041736603, + "epoch": 1.3758204938068166, + "grad_norm": 0.6704456806182861, + "learning_rate": 6.221059655650321e-06, + "loss": 1.369, + "mean_token_accuracy": 0.6669376641511917, + "num_tokens": 2099559332.0, + "step": 12524 + }, + { + "entropy": 1.7238895495732625, + "epoch": 1.3759303507181895, + "grad_norm": 0.846031904220581, + "learning_rate": 6.21970297318927e-06, + "loss": 1.5058, + "mean_token_accuracy": 0.6613429884115855, + "num_tokens": 2099730332.0, + "step": 12525 + }, + { + "entropy": 1.6958635946114857, + "epoch": 1.3760402076295626, + "grad_norm": 0.6999905705451965, + "learning_rate": 6.218346442022462e-06, + "loss": 1.2999, + "mean_token_accuracy": 0.6632877240578333, + "num_tokens": 2099900875.0, + "step": 12526 + }, + { + "entropy": 1.6386962433656056, + "epoch": 1.3761500645409355, + "grad_norm": 0.5973513126373291, + "learning_rate": 6.2169900621928394e-06, + "loss": 1.4151, + "mean_token_accuracy": 0.6484298954407374, + "num_tokens": 2100076851.0, + "step": 12527 + }, + { + "entropy": 1.6838893989721935, + "epoch": 1.3762599214523084, + "grad_norm": 0.7201468348503113, + "learning_rate": 6.215633833743325e-06, + "loss": 1.2795, + "mean_token_accuracy": 0.6828643282254537, + "num_tokens": 2100209931.0, + "step": 12528 + }, + { + "entropy": 1.653613954782486, + "epoch": 1.3763697783636812, + "grad_norm": 0.6516929268836975, + "learning_rate": 6.214277756716841e-06, + "loss": 1.3982, + "mean_token_accuracy": 0.6462257554133733, + "num_tokens": 2100421383.0, + "step": 12529 + }, + { + "entropy": 1.5921331147352855, + "epoch": 1.3764796352750541, + "grad_norm": 0.6113102436065674, + "learning_rate": 6.212921831156309e-06, + "loss": 1.3048, + "mean_token_accuracy": 0.6834805657466253, + "num_tokens": 2100566416.0, + "step": 12530 + }, + { + "entropy": 1.6861707468827565, + "epoch": 1.3765894921864272, + "grad_norm": 0.744138240814209, + "learning_rate": 6.2115660571046475e-06, + "loss": 1.4114, + "mean_token_accuracy": 0.6627868016560873, + "num_tokens": 2100707648.0, + "step": 12531 + }, + { + "entropy": 1.7084755897521973, + "epoch": 1.3766993490978001, + "grad_norm": 0.613015353679657, + "learning_rate": 6.2102104346047635e-06, + "loss": 1.4212, + "mean_token_accuracy": 0.659169336160024, + "num_tokens": 2100885511.0, + "step": 12532 + }, + { + "entropy": 1.7335072060426076, + "epoch": 1.376809206009173, + "grad_norm": 0.6884053945541382, + "learning_rate": 6.208854963699555e-06, + "loss": 1.3903, + "mean_token_accuracy": 0.6577414770921072, + "num_tokens": 2101034010.0, + "step": 12533 + }, + { + "entropy": 1.6870386103789012, + "epoch": 1.376919062920546, + "grad_norm": 0.6219229102134705, + "learning_rate": 6.207499644431935e-06, + "loss": 1.3897, + "mean_token_accuracy": 0.6502025226751963, + "num_tokens": 2101186648.0, + "step": 12534 + }, + { + "entropy": 1.7138726909955342, + "epoch": 1.377028919831919, + "grad_norm": 0.5587577819824219, + "learning_rate": 6.206144476844789e-06, + "loss": 1.4165, + "mean_token_accuracy": 0.6410103340943655, + "num_tokens": 2101425465.0, + "step": 12535 + }, + { + "entropy": 1.7444292902946472, + "epoch": 1.3771387767432919, + "grad_norm": 0.6542041301727295, + "learning_rate": 6.204789460981008e-06, + "loss": 1.5301, + "mean_token_accuracy": 0.6419963190952936, + "num_tokens": 2101626304.0, + "step": 12536 + }, + { + "entropy": 1.6845300594965618, + "epoch": 1.3772486336546648, + "grad_norm": 0.7498136758804321, + "learning_rate": 6.203434596883482e-06, + "loss": 1.3899, + "mean_token_accuracy": 0.6670129199822744, + "num_tokens": 2101765786.0, + "step": 12537 + }, + { + "entropy": 1.8169357279936473, + "epoch": 1.3773584905660377, + "grad_norm": 0.7087510824203491, + "learning_rate": 6.202079884595088e-06, + "loss": 1.3182, + "mean_token_accuracy": 0.6673329919576645, + "num_tokens": 2101886833.0, + "step": 12538 + }, + { + "entropy": 1.7348575592041016, + "epoch": 1.3774683474774108, + "grad_norm": 0.7547774910926819, + "learning_rate": 6.200725324158705e-06, + "loss": 1.3981, + "mean_token_accuracy": 0.6532778888940811, + "num_tokens": 2102006067.0, + "step": 12539 + }, + { + "entropy": 1.695908526579539, + "epoch": 1.3775782043887836, + "grad_norm": 0.7535329461097717, + "learning_rate": 6.199370915617204e-06, + "loss": 1.4789, + "mean_token_accuracy": 0.665493423740069, + "num_tokens": 2102171012.0, + "step": 12540 + }, + { + "entropy": 1.715345323085785, + "epoch": 1.3776880613001565, + "grad_norm": 0.6196267604827881, + "learning_rate": 6.198016659013447e-06, + "loss": 1.3696, + "mean_token_accuracy": 0.6550077845652899, + "num_tokens": 2102338771.0, + "step": 12541 + }, + { + "entropy": 1.689347783724467, + "epoch": 1.3777979182115294, + "grad_norm": 0.7949912548065186, + "learning_rate": 6.196662554390298e-06, + "loss": 1.2438, + "mean_token_accuracy": 0.6799779733022054, + "num_tokens": 2102449380.0, + "step": 12542 + }, + { + "entropy": 1.717601974805196, + "epoch": 1.3779077751229023, + "grad_norm": 0.682672381401062, + "learning_rate": 6.19530860179062e-06, + "loss": 1.303, + "mean_token_accuracy": 0.6659560054540634, + "num_tokens": 2102580923.0, + "step": 12543 + }, + { + "entropy": 1.6912067731221516, + "epoch": 1.3780176320342754, + "grad_norm": 0.6183836460113525, + "learning_rate": 6.1939548012572585e-06, + "loss": 1.5116, + "mean_token_accuracy": 0.6377200831969579, + "num_tokens": 2102835634.0, + "step": 12544 + }, + { + "entropy": 1.7292577226956685, + "epoch": 1.3781274889456483, + "grad_norm": 0.7108417749404907, + "learning_rate": 6.1926011528330575e-06, + "loss": 1.4302, + "mean_token_accuracy": 0.6581776638825735, + "num_tokens": 2102968731.0, + "step": 12545 + }, + { + "entropy": 1.663769433895747, + "epoch": 1.3782373458570212, + "grad_norm": 0.5661031603813171, + "learning_rate": 6.191247656560868e-06, + "loss": 1.3358, + "mean_token_accuracy": 0.6701933294534683, + "num_tokens": 2103135710.0, + "step": 12546 + }, + { + "entropy": 1.7361170947551727, + "epoch": 1.3783472027683943, + "grad_norm": 0.7165773510932922, + "learning_rate": 6.189894312483524e-06, + "loss": 1.4254, + "mean_token_accuracy": 0.6522279679775238, + "num_tokens": 2103314119.0, + "step": 12547 + }, + { + "entropy": 1.7245140473047893, + "epoch": 1.3784570596797672, + "grad_norm": 0.6944742202758789, + "learning_rate": 6.188541120643854e-06, + "loss": 1.2226, + "mean_token_accuracy": 0.6783800820509592, + "num_tokens": 2103440751.0, + "step": 12548 + }, + { + "entropy": 1.7549363176027934, + "epoch": 1.37856691659114, + "grad_norm": 0.7486315369606018, + "learning_rate": 6.1871880810846915e-06, + "loss": 1.3902, + "mean_token_accuracy": 0.6532481958468755, + "num_tokens": 2103581713.0, + "step": 12549 + }, + { + "entropy": 1.616852581501007, + "epoch": 1.378676773502513, + "grad_norm": 0.8074557781219482, + "learning_rate": 6.185835193848856e-06, + "loss": 1.2921, + "mean_token_accuracy": 0.6893499394257864, + "num_tokens": 2103724805.0, + "step": 12550 + }, + { + "entropy": 1.7497636179129283, + "epoch": 1.3787866304138858, + "grad_norm": 0.8509400486946106, + "learning_rate": 6.184482458979169e-06, + "loss": 1.4539, + "mean_token_accuracy": 0.6395279069741567, + "num_tokens": 2103893055.0, + "step": 12551 + }, + { + "entropy": 1.6811266740163167, + "epoch": 1.378896487325259, + "grad_norm": 0.7023653388023376, + "learning_rate": 6.183129876518443e-06, + "loss": 1.3276, + "mean_token_accuracy": 0.666248674194018, + "num_tokens": 2104081495.0, + "step": 12552 + }, + { + "entropy": 1.7035264372825623, + "epoch": 1.3790063442366318, + "grad_norm": 0.5721656084060669, + "learning_rate": 6.181777446509482e-06, + "loss": 1.4066, + "mean_token_accuracy": 0.6570123036702474, + "num_tokens": 2104292769.0, + "step": 12553 + }, + { + "entropy": 1.6963448226451874, + "epoch": 1.3791162011480047, + "grad_norm": 0.6713624596595764, + "learning_rate": 6.180425168995094e-06, + "loss": 1.4223, + "mean_token_accuracy": 0.6561168730258942, + "num_tokens": 2104442926.0, + "step": 12554 + }, + { + "entropy": 1.6694329380989075, + "epoch": 1.3792260580593776, + "grad_norm": 0.6325618624687195, + "learning_rate": 6.179073044018082e-06, + "loss": 1.4122, + "mean_token_accuracy": 0.6522913922866186, + "num_tokens": 2104676130.0, + "step": 12555 + }, + { + "entropy": 1.7309764126936595, + "epoch": 1.3793359149707505, + "grad_norm": 0.7151079773902893, + "learning_rate": 6.177721071621234e-06, + "loss": 1.3119, + "mean_token_accuracy": 0.6660696119070053, + "num_tokens": 2104807634.0, + "step": 12556 + }, + { + "entropy": 1.6221238374710083, + "epoch": 1.3794457718821236, + "grad_norm": 0.7269033193588257, + "learning_rate": 6.176369251847341e-06, + "loss": 1.343, + "mean_token_accuracy": 0.6632204552491506, + "num_tokens": 2104975691.0, + "step": 12557 + }, + { + "entropy": 1.7174046039581299, + "epoch": 1.3795556287934965, + "grad_norm": 0.7088026404380798, + "learning_rate": 6.175017584739187e-06, + "loss": 1.2995, + "mean_token_accuracy": 0.6678259124358495, + "num_tokens": 2105128915.0, + "step": 12558 + }, + { + "entropy": 1.7887861529986064, + "epoch": 1.3796654857048694, + "grad_norm": 0.696653425693512, + "learning_rate": 6.173666070339554e-06, + "loss": 1.4658, + "mean_token_accuracy": 0.6444426278273264, + "num_tokens": 2105294149.0, + "step": 12559 + }, + { + "entropy": 1.7095742324988048, + "epoch": 1.3797753426162425, + "grad_norm": 0.6750052571296692, + "learning_rate": 6.172314708691212e-06, + "loss": 1.3382, + "mean_token_accuracy": 0.6658424387375513, + "num_tokens": 2105438713.0, + "step": 12560 + }, + { + "entropy": 1.694052904844284, + "epoch": 1.3798851995276153, + "grad_norm": 0.5075100064277649, + "learning_rate": 6.170963499836937e-06, + "loss": 1.3667, + "mean_token_accuracy": 0.6514973640441895, + "num_tokens": 2105612254.0, + "step": 12561 + }, + { + "entropy": 1.6640310784180958, + "epoch": 1.3799950564389882, + "grad_norm": 0.5934505462646484, + "learning_rate": 6.169612443819488e-06, + "loss": 1.3747, + "mean_token_accuracy": 0.6665078550577164, + "num_tokens": 2105797489.0, + "step": 12562 + }, + { + "entropy": 1.6789606213569641, + "epoch": 1.3801049133503611, + "grad_norm": 0.6959724426269531, + "learning_rate": 6.1682615406816325e-06, + "loss": 1.2852, + "mean_token_accuracy": 0.6706260542074839, + "num_tokens": 2105957291.0, + "step": 12563 + }, + { + "entropy": 1.7314012149969737, + "epoch": 1.380214770261734, + "grad_norm": 0.7916790246963501, + "learning_rate": 6.166910790466121e-06, + "loss": 1.3361, + "mean_token_accuracy": 0.6655499537785848, + "num_tokens": 2106101795.0, + "step": 12564 + }, + { + "entropy": 1.6863299409548442, + "epoch": 1.380324627173107, + "grad_norm": 0.8303491473197937, + "learning_rate": 6.165560193215702e-06, + "loss": 1.2178, + "mean_token_accuracy": 0.6846508930126826, + "num_tokens": 2106228777.0, + "step": 12565 + }, + { + "entropy": 1.6824834048748016, + "epoch": 1.38043448408448, + "grad_norm": 0.6064619421958923, + "learning_rate": 6.164209748973124e-06, + "loss": 1.4575, + "mean_token_accuracy": 0.6451242392261823, + "num_tokens": 2106442223.0, + "step": 12566 + }, + { + "entropy": 1.6857933203379314, + "epoch": 1.3805443409958529, + "grad_norm": 0.6482805609703064, + "learning_rate": 6.162859457781132e-06, + "loss": 1.4251, + "mean_token_accuracy": 0.6491700212160746, + "num_tokens": 2106642621.0, + "step": 12567 + }, + { + "entropy": 1.746164898077647, + "epoch": 1.3806541979072258, + "grad_norm": 0.7709128260612488, + "learning_rate": 6.161509319682459e-06, + "loss": 1.3878, + "mean_token_accuracy": 0.6586853563785553, + "num_tokens": 2106813233.0, + "step": 12568 + }, + { + "entropy": 1.7627079784870148, + "epoch": 1.3807640548185987, + "grad_norm": 1.2081003189086914, + "learning_rate": 6.160159334719833e-06, + "loss": 1.4917, + "mean_token_accuracy": 0.6570613433917364, + "num_tokens": 2106996455.0, + "step": 12569 + }, + { + "entropy": 1.6405859887599945, + "epoch": 1.3808739117299718, + "grad_norm": 0.703881561756134, + "learning_rate": 6.158809502935985e-06, + "loss": 1.2482, + "mean_token_accuracy": 0.6670927107334137, + "num_tokens": 2107209001.0, + "step": 12570 + }, + { + "entropy": 1.6598562101523082, + "epoch": 1.3809837686413446, + "grad_norm": 0.7052212953567505, + "learning_rate": 6.1574598243736346e-06, + "loss": 1.3986, + "mean_token_accuracy": 0.662861779332161, + "num_tokens": 2107392456.0, + "step": 12571 + }, + { + "entropy": 1.7908701996008556, + "epoch": 1.3810936255527175, + "grad_norm": 0.6938340663909912, + "learning_rate": 6.156110299075501e-06, + "loss": 1.3683, + "mean_token_accuracy": 0.6564254115025202, + "num_tokens": 2107530671.0, + "step": 12572 + }, + { + "entropy": 1.7378122210502625, + "epoch": 1.3812034824640906, + "grad_norm": 0.8076921701431274, + "learning_rate": 6.154760927084289e-06, + "loss": 1.5943, + "mean_token_accuracy": 0.6301184669137001, + "num_tokens": 2107734369.0, + "step": 12573 + }, + { + "entropy": 1.6900312105814617, + "epoch": 1.3813133393754635, + "grad_norm": 0.7580331563949585, + "learning_rate": 6.153411708442709e-06, + "loss": 1.4061, + "mean_token_accuracy": 0.6511543840169907, + "num_tokens": 2107933013.0, + "step": 12574 + }, + { + "entropy": 1.7424448728561401, + "epoch": 1.3814231962868364, + "grad_norm": 0.8853403329849243, + "learning_rate": 6.152062643193469e-06, + "loss": 1.395, + "mean_token_accuracy": 0.6606808751821518, + "num_tokens": 2108052983.0, + "step": 12575 + }, + { + "entropy": 1.6669080555438995, + "epoch": 1.3815330531982093, + "grad_norm": 0.7201902866363525, + "learning_rate": 6.150713731379262e-06, + "loss": 1.3757, + "mean_token_accuracy": 0.6702162722746531, + "num_tokens": 2108200984.0, + "step": 12576 + }, + { + "entropy": 1.6895977854728699, + "epoch": 1.3816429101095822, + "grad_norm": 0.6631450653076172, + "learning_rate": 6.1493649730427775e-06, + "loss": 1.4566, + "mean_token_accuracy": 0.6551151325305303, + "num_tokens": 2108383996.0, + "step": 12577 + }, + { + "entropy": 1.8024785220623016, + "epoch": 1.3817527670209553, + "grad_norm": 0.758419394493103, + "learning_rate": 6.148016368226708e-06, + "loss": 1.4561, + "mean_token_accuracy": 0.652512788772583, + "num_tokens": 2108632948.0, + "step": 12578 + }, + { + "entropy": 1.6716083586215973, + "epoch": 1.3818626239323282, + "grad_norm": 0.6724756360054016, + "learning_rate": 6.1466679169737305e-06, + "loss": 1.2769, + "mean_token_accuracy": 0.6760951578617096, + "num_tokens": 2108779464.0, + "step": 12579 + }, + { + "entropy": 1.7235155701637268, + "epoch": 1.381972480843701, + "grad_norm": 0.6784364581108093, + "learning_rate": 6.145319619326531e-06, + "loss": 1.4671, + "mean_token_accuracy": 0.6498638937870661, + "num_tokens": 2108948254.0, + "step": 12580 + }, + { + "entropy": 1.763847251733144, + "epoch": 1.382082337755074, + "grad_norm": 0.8038673400878906, + "learning_rate": 6.143971475327777e-06, + "loss": 1.5473, + "mean_token_accuracy": 0.6444573253393173, + "num_tokens": 2109131964.0, + "step": 12581 + }, + { + "entropy": 1.7665770848592122, + "epoch": 1.3821921946664468, + "grad_norm": 0.6910974979400635, + "learning_rate": 6.142623485020135e-06, + "loss": 1.4179, + "mean_token_accuracy": 0.649805506070455, + "num_tokens": 2109315655.0, + "step": 12582 + }, + { + "entropy": 1.6560252110163372, + "epoch": 1.38230205157782, + "grad_norm": 0.6830186247825623, + "learning_rate": 6.141275648446274e-06, + "loss": 1.3571, + "mean_token_accuracy": 0.6550218462944031, + "num_tokens": 2109499004.0, + "step": 12583 + }, + { + "entropy": 1.7072090804576874, + "epoch": 1.3824119084891928, + "grad_norm": 0.7160013318061829, + "learning_rate": 6.139927965648848e-06, + "loss": 1.3447, + "mean_token_accuracy": 0.6548336744308472, + "num_tokens": 2109659932.0, + "step": 12584 + }, + { + "entropy": 1.686892330646515, + "epoch": 1.3825217654005657, + "grad_norm": 0.6150763034820557, + "learning_rate": 6.138580436670512e-06, + "loss": 1.4549, + "mean_token_accuracy": 0.635261004169782, + "num_tokens": 2109900953.0, + "step": 12585 + }, + { + "entropy": 1.75520854194959, + "epoch": 1.3826316223119388, + "grad_norm": 0.6944630146026611, + "learning_rate": 6.137233061553914e-06, + "loss": 1.5959, + "mean_token_accuracy": 0.6286770751078924, + "num_tokens": 2110108815.0, + "step": 12586 + }, + { + "entropy": 1.720687488714854, + "epoch": 1.3827414792233117, + "grad_norm": 0.6678749322891235, + "learning_rate": 6.1358858403416985e-06, + "loss": 1.3596, + "mean_token_accuracy": 0.6810717135667801, + "num_tokens": 2110296447.0, + "step": 12587 + }, + { + "entropy": 1.7100276947021484, + "epoch": 1.3828513361346846, + "grad_norm": 0.6869692206382751, + "learning_rate": 6.134538773076506e-06, + "loss": 1.3972, + "mean_token_accuracy": 0.6752725392580032, + "num_tokens": 2110501100.0, + "step": 12588 + }, + { + "entropy": 1.7093996107578278, + "epoch": 1.3829611930460575, + "grad_norm": 0.7765591144561768, + "learning_rate": 6.1331918598009664e-06, + "loss": 1.2499, + "mean_token_accuracy": 0.6684116174777349, + "num_tokens": 2110646493.0, + "step": 12589 + }, + { + "entropy": 1.7051993509133656, + "epoch": 1.3830710499574304, + "grad_norm": 0.7620055079460144, + "learning_rate": 6.131845100557713e-06, + "loss": 1.3419, + "mean_token_accuracy": 0.6666328310966492, + "num_tokens": 2110816819.0, + "step": 12590 + }, + { + "entropy": 1.7090232570966084, + "epoch": 1.3831809068688035, + "grad_norm": 0.6869613528251648, + "learning_rate": 6.130498495389365e-06, + "loss": 1.4943, + "mean_token_accuracy": 0.6450514495372772, + "num_tokens": 2110994559.0, + "step": 12591 + }, + { + "entropy": 1.6619928280512493, + "epoch": 1.3832907637801763, + "grad_norm": 0.6001034379005432, + "learning_rate": 6.129152044338551e-06, + "loss": 1.2886, + "mean_token_accuracy": 0.6643996685743332, + "num_tokens": 2111140891.0, + "step": 12592 + }, + { + "entropy": 1.7544625401496887, + "epoch": 1.3834006206915492, + "grad_norm": 0.7446973919868469, + "learning_rate": 6.1278057474478795e-06, + "loss": 1.2745, + "mean_token_accuracy": 0.676040510336558, + "num_tokens": 2111325676.0, + "step": 12593 + }, + { + "entropy": 1.6300282776355743, + "epoch": 1.3835104776029221, + "grad_norm": 0.7586323022842407, + "learning_rate": 6.1264596047599555e-06, + "loss": 1.4867, + "mean_token_accuracy": 0.6626367469628652, + "num_tokens": 2111503103.0, + "step": 12594 + }, + { + "entropy": 1.6632478535175323, + "epoch": 1.383620334514295, + "grad_norm": 0.6314704418182373, + "learning_rate": 6.125113616317394e-06, + "loss": 1.4226, + "mean_token_accuracy": 0.6542702714602152, + "num_tokens": 2111687155.0, + "step": 12595 + }, + { + "entropy": 1.7528144121170044, + "epoch": 1.383730191425668, + "grad_norm": 0.7653163075447083, + "learning_rate": 6.123767782162789e-06, + "loss": 1.2804, + "mean_token_accuracy": 0.672723392645518, + "num_tokens": 2111806165.0, + "step": 12596 + }, + { + "entropy": 1.7398656606674194, + "epoch": 1.383840048337041, + "grad_norm": 0.7522697448730469, + "learning_rate": 6.1224221023387335e-06, + "loss": 1.426, + "mean_token_accuracy": 0.6618303308884302, + "num_tokens": 2111943423.0, + "step": 12597 + }, + { + "entropy": 1.7423573831717174, + "epoch": 1.3839499052484139, + "grad_norm": 0.8257996439933777, + "learning_rate": 6.121076576887821e-06, + "loss": 1.4182, + "mean_token_accuracy": 0.678856705625852, + "num_tokens": 2112103993.0, + "step": 12598 + }, + { + "entropy": 1.7207870781421661, + "epoch": 1.384059762159787, + "grad_norm": 0.6698588728904724, + "learning_rate": 6.119731205852638e-06, + "loss": 1.5969, + "mean_token_accuracy": 0.6185207416613897, + "num_tokens": 2112323075.0, + "step": 12599 + }, + { + "entropy": 1.7429817418257396, + "epoch": 1.3841696190711599, + "grad_norm": 0.6996835470199585, + "learning_rate": 6.118385989275766e-06, + "loss": 1.4458, + "mean_token_accuracy": 0.6526039093732834, + "num_tokens": 2112446359.0, + "step": 12600 + }, + { + "entropy": 1.7398698528607686, + "epoch": 1.3842794759825328, + "grad_norm": 0.8282992839813232, + "learning_rate": 6.117040927199771e-06, + "loss": 1.5379, + "mean_token_accuracy": 0.648671011130015, + "num_tokens": 2112590838.0, + "step": 12601 + }, + { + "entropy": 1.6970640818277996, + "epoch": 1.3843893328939056, + "grad_norm": 0.7883489727973938, + "learning_rate": 6.115696019667236e-06, + "loss": 1.5544, + "mean_token_accuracy": 0.6379801481962204, + "num_tokens": 2112806234.0, + "step": 12602 + }, + { + "entropy": 1.7620785633722942, + "epoch": 1.3844991898052785, + "grad_norm": 0.8957377672195435, + "learning_rate": 6.1143512667207195e-06, + "loss": 1.4253, + "mean_token_accuracy": 0.6558419863382975, + "num_tokens": 2112951869.0, + "step": 12603 + }, + { + "entropy": 1.6944806178410847, + "epoch": 1.3846090467166516, + "grad_norm": 0.591116726398468, + "learning_rate": 6.113006668402783e-06, + "loss": 1.5106, + "mean_token_accuracy": 0.6455500026543936, + "num_tokens": 2113186421.0, + "step": 12604 + }, + { + "entropy": 1.6918245454629262, + "epoch": 1.3847189036280245, + "grad_norm": 0.9420191645622253, + "learning_rate": 6.111662224755984e-06, + "loss": 1.0811, + "mean_token_accuracy": 0.6826836367448171, + "num_tokens": 2113359446.0, + "step": 12605 + }, + { + "entropy": 1.7305250068505604, + "epoch": 1.3848287605393974, + "grad_norm": 0.7896611094474792, + "learning_rate": 6.110317935822871e-06, + "loss": 1.4241, + "mean_token_accuracy": 0.657659446199735, + "num_tokens": 2113523537.0, + "step": 12606 + }, + { + "entropy": 1.6499216953913372, + "epoch": 1.3849386174507703, + "grad_norm": 0.7420812845230103, + "learning_rate": 6.108973801645994e-06, + "loss": 1.3184, + "mean_token_accuracy": 0.6785426884889603, + "num_tokens": 2113662311.0, + "step": 12607 + }, + { + "entropy": 1.7228143910566966, + "epoch": 1.3850484743621432, + "grad_norm": 0.7056854367256165, + "learning_rate": 6.107629822267894e-06, + "loss": 1.3208, + "mean_token_accuracy": 0.6636027296384176, + "num_tokens": 2113793614.0, + "step": 12608 + }, + { + "entropy": 1.795301725467046, + "epoch": 1.3851583312735163, + "grad_norm": 0.9566717147827148, + "learning_rate": 6.106285997731101e-06, + "loss": 1.4437, + "mean_token_accuracy": 0.6586398979028066, + "num_tokens": 2113942725.0, + "step": 12609 + }, + { + "entropy": 1.679547091325124, + "epoch": 1.3852681881848892, + "grad_norm": 0.58782958984375, + "learning_rate": 6.1049423280781515e-06, + "loss": 1.3413, + "mean_token_accuracy": 0.6591441829999288, + "num_tokens": 2114102494.0, + "step": 12610 + }, + { + "entropy": 1.7034885783990223, + "epoch": 1.385378045096262, + "grad_norm": 0.7246780395507812, + "learning_rate": 6.103598813351575e-06, + "loss": 1.3381, + "mean_token_accuracy": 0.6627988219261169, + "num_tokens": 2114234854.0, + "step": 12611 + }, + { + "entropy": 1.7090636690457661, + "epoch": 1.3854879020076352, + "grad_norm": 0.6690557599067688, + "learning_rate": 6.10225545359389e-06, + "loss": 1.4635, + "mean_token_accuracy": 0.6498597512642542, + "num_tokens": 2114454147.0, + "step": 12612 + }, + { + "entropy": 1.7583944102128346, + "epoch": 1.385597758919008, + "grad_norm": 0.8286144137382507, + "learning_rate": 6.100912248847608e-06, + "loss": 1.3619, + "mean_token_accuracy": 0.6575885117053986, + "num_tokens": 2114634934.0, + "step": 12613 + }, + { + "entropy": 1.6706369022528331, + "epoch": 1.385707615830381, + "grad_norm": 0.6204984188079834, + "learning_rate": 6.099569199155251e-06, + "loss": 1.3126, + "mean_token_accuracy": 0.6611980448166529, + "num_tokens": 2114792957.0, + "step": 12614 + }, + { + "entropy": 1.6743928492069244, + "epoch": 1.3858174727417538, + "grad_norm": 0.6785169243812561, + "learning_rate": 6.09822630455932e-06, + "loss": 1.3939, + "mean_token_accuracy": 0.658115471402804, + "num_tokens": 2114962832.0, + "step": 12615 + }, + { + "entropy": 1.6953681409358978, + "epoch": 1.3859273296531267, + "grad_norm": 0.6881593465805054, + "learning_rate": 6.0968835651023135e-06, + "loss": 1.389, + "mean_token_accuracy": 0.6494457125663757, + "num_tokens": 2115136453.0, + "step": 12616 + }, + { + "entropy": 1.7425408363342285, + "epoch": 1.3860371865644998, + "grad_norm": 0.7663772702217102, + "learning_rate": 6.0955409808267375e-06, + "loss": 1.4875, + "mean_token_accuracy": 0.6456399957338969, + "num_tokens": 2115377261.0, + "step": 12617 + }, + { + "entropy": 1.6987970372041066, + "epoch": 1.3861470434758727, + "grad_norm": 0.6070899963378906, + "learning_rate": 6.0941985517750745e-06, + "loss": 1.4082, + "mean_token_accuracy": 0.6523675471544266, + "num_tokens": 2115581762.0, + "step": 12618 + }, + { + "entropy": 1.7355947196483612, + "epoch": 1.3862569003872456, + "grad_norm": 0.7134828567504883, + "learning_rate": 6.092856277989822e-06, + "loss": 1.1913, + "mean_token_accuracy": 0.687599798043569, + "num_tokens": 2115710252.0, + "step": 12619 + }, + { + "entropy": 1.6785525679588318, + "epoch": 1.3863667572986185, + "grad_norm": 0.6055605411529541, + "learning_rate": 6.0915141595134555e-06, + "loss": 1.3615, + "mean_token_accuracy": 0.6554200698932012, + "num_tokens": 2115893536.0, + "step": 12620 + }, + { + "entropy": 1.7625056405862172, + "epoch": 1.3864766142099914, + "grad_norm": 0.9016237854957581, + "learning_rate": 6.090172196388451e-06, + "loss": 1.5013, + "mean_token_accuracy": 0.6534651468197504, + "num_tokens": 2116052986.0, + "step": 12621 + }, + { + "entropy": 1.7133546868960063, + "epoch": 1.3865864711213645, + "grad_norm": 0.6940526366233826, + "learning_rate": 6.088830388657284e-06, + "loss": 1.3231, + "mean_token_accuracy": 0.6634116520484289, + "num_tokens": 2116218287.0, + "step": 12622 + }, + { + "entropy": 1.6911606689294179, + "epoch": 1.3866963280327373, + "grad_norm": 0.6579228639602661, + "learning_rate": 6.0874887363624255e-06, + "loss": 1.2912, + "mean_token_accuracy": 0.6736189971367518, + "num_tokens": 2116348102.0, + "step": 12623 + }, + { + "entropy": 1.6526914338270824, + "epoch": 1.3868061849441102, + "grad_norm": 0.6540764570236206, + "learning_rate": 6.086147239546336e-06, + "loss": 1.418, + "mean_token_accuracy": 0.6521534671386083, + "num_tokens": 2116528433.0, + "step": 12624 + }, + { + "entropy": 1.6569193700949352, + "epoch": 1.3869160418554833, + "grad_norm": 0.5886544585227966, + "learning_rate": 6.084805898251468e-06, + "loss": 1.4071, + "mean_token_accuracy": 0.660218303402265, + "num_tokens": 2116774255.0, + "step": 12625 + }, + { + "entropy": 1.7367678980032604, + "epoch": 1.3870258987668562, + "grad_norm": 0.7575657367706299, + "learning_rate": 6.083464712520282e-06, + "loss": 1.415, + "mean_token_accuracy": 0.6493276755015055, + "num_tokens": 2116913832.0, + "step": 12626 + }, + { + "entropy": 1.7625746925671895, + "epoch": 1.387135755678229, + "grad_norm": 0.723961353302002, + "learning_rate": 6.082123682395222e-06, + "loss": 1.5134, + "mean_token_accuracy": 0.6588109185298284, + "num_tokens": 2117102538.0, + "step": 12627 + }, + { + "entropy": 1.6678134202957153, + "epoch": 1.387245612589602, + "grad_norm": 0.7065550088882446, + "learning_rate": 6.080782807918728e-06, + "loss": 1.3057, + "mean_token_accuracy": 0.676655059059461, + "num_tokens": 2117230523.0, + "step": 12628 + }, + { + "entropy": 1.742838462193807, + "epoch": 1.3873554695009749, + "grad_norm": 0.7316716313362122, + "learning_rate": 6.079442089133245e-06, + "loss": 1.3569, + "mean_token_accuracy": 0.6540696074565252, + "num_tokens": 2117359500.0, + "step": 12629 + }, + { + "entropy": 1.7375612556934357, + "epoch": 1.387465326412348, + "grad_norm": 0.756300687789917, + "learning_rate": 6.078101526081199e-06, + "loss": 1.5052, + "mean_token_accuracy": 0.6618654529253641, + "num_tokens": 2117537412.0, + "step": 12630 + }, + { + "entropy": 1.6921890676021576, + "epoch": 1.3875751833237209, + "grad_norm": 0.6773094534873962, + "learning_rate": 6.076761118805026e-06, + "loss": 1.3607, + "mean_token_accuracy": 0.661697601278623, + "num_tokens": 2117723137.0, + "step": 12631 + }, + { + "entropy": 1.68595223625501, + "epoch": 1.3876850402350938, + "grad_norm": 0.6630276441574097, + "learning_rate": 6.075420867347144e-06, + "loss": 1.3329, + "mean_token_accuracy": 0.6754196931918462, + "num_tokens": 2117863313.0, + "step": 12632 + }, + { + "entropy": 1.7486995458602905, + "epoch": 1.3877948971464666, + "grad_norm": 0.8930343985557556, + "learning_rate": 6.07408077174997e-06, + "loss": 1.4725, + "mean_token_accuracy": 0.660398542881012, + "num_tokens": 2118000107.0, + "step": 12633 + }, + { + "entropy": 1.6790860096613567, + "epoch": 1.3879047540578395, + "grad_norm": 0.633882462978363, + "learning_rate": 6.072740832055923e-06, + "loss": 1.402, + "mean_token_accuracy": 0.6450261523326238, + "num_tokens": 2118213587.0, + "step": 12634 + }, + { + "entropy": 1.7514410018920898, + "epoch": 1.3880146109692126, + "grad_norm": 0.7302515506744385, + "learning_rate": 6.071401048307406e-06, + "loss": 1.3367, + "mean_token_accuracy": 0.6589195132255554, + "num_tokens": 2118373897.0, + "step": 12635 + }, + { + "entropy": 1.734345058600108, + "epoch": 1.3881244678805855, + "grad_norm": 0.6358147859573364, + "learning_rate": 6.070061420546827e-06, + "loss": 1.4476, + "mean_token_accuracy": 0.6477800408999125, + "num_tokens": 2118531537.0, + "step": 12636 + }, + { + "entropy": 1.7529782156149547, + "epoch": 1.3882343247919584, + "grad_norm": 0.6754707098007202, + "learning_rate": 6.0687219488165826e-06, + "loss": 1.3373, + "mean_token_accuracy": 0.6639518241087595, + "num_tokens": 2118681904.0, + "step": 12637 + }, + { + "entropy": 1.674676090478897, + "epoch": 1.3883441817033315, + "grad_norm": 0.8263653516769409, + "learning_rate": 6.067382633159062e-06, + "loss": 1.2034, + "mean_token_accuracy": 0.6820022811492285, + "num_tokens": 2118787859.0, + "step": 12638 + }, + { + "entropy": 1.7267674307028453, + "epoch": 1.3884540386147044, + "grad_norm": 0.6797496676445007, + "learning_rate": 6.066043473616665e-06, + "loss": 1.3251, + "mean_token_accuracy": 0.6689668297767639, + "num_tokens": 2118933401.0, + "step": 12639 + }, + { + "entropy": 1.6904515027999878, + "epoch": 1.3885638955260773, + "grad_norm": 0.7103528380393982, + "learning_rate": 6.064704470231766e-06, + "loss": 1.3393, + "mean_token_accuracy": 0.6714362452427546, + "num_tokens": 2119096794.0, + "step": 12640 + }, + { + "entropy": 1.729119469722112, + "epoch": 1.3886737524374502, + "grad_norm": 0.7350696921348572, + "learning_rate": 6.063365623046744e-06, + "loss": 1.4866, + "mean_token_accuracy": 0.6382670154174169, + "num_tokens": 2119289765.0, + "step": 12641 + }, + { + "entropy": 1.6505240897337596, + "epoch": 1.388783609348823, + "grad_norm": 0.6928836107254028, + "learning_rate": 6.062026932103976e-06, + "loss": 1.3396, + "mean_token_accuracy": 0.6640495459238688, + "num_tokens": 2119449525.0, + "step": 12642 + }, + { + "entropy": 1.6739195088545482, + "epoch": 1.3888934662601962, + "grad_norm": 0.6808801293373108, + "learning_rate": 6.0606883974458345e-06, + "loss": 1.4049, + "mean_token_accuracy": 0.6502687732378641, + "num_tokens": 2119607013.0, + "step": 12643 + }, + { + "entropy": 1.7042246758937836, + "epoch": 1.389003323171569, + "grad_norm": 0.9757330417633057, + "learning_rate": 6.059350019114678e-06, + "loss": 1.5106, + "mean_token_accuracy": 0.6467631061871847, + "num_tokens": 2119764218.0, + "step": 12644 + }, + { + "entropy": 1.7424436310927074, + "epoch": 1.389113180082942, + "grad_norm": 0.7235627770423889, + "learning_rate": 6.0580117971528655e-06, + "loss": 1.3913, + "mean_token_accuracy": 0.6477436472972234, + "num_tokens": 2119946466.0, + "step": 12645 + }, + { + "entropy": 1.7050765951474507, + "epoch": 1.389223036994315, + "grad_norm": 0.6872043609619141, + "learning_rate": 6.056673731602753e-06, + "loss": 1.2568, + "mean_token_accuracy": 0.6747584690650305, + "num_tokens": 2120094895.0, + "step": 12646 + }, + { + "entropy": 1.667133589585622, + "epoch": 1.3893328939056877, + "grad_norm": 0.7159779071807861, + "learning_rate": 6.055335822506688e-06, + "loss": 1.2612, + "mean_token_accuracy": 0.6703705290953318, + "num_tokens": 2120234938.0, + "step": 12647 + }, + { + "entropy": 1.7026324371496837, + "epoch": 1.3894427508170608, + "grad_norm": 0.6149495244026184, + "learning_rate": 6.053998069907019e-06, + "loss": 1.3952, + "mean_token_accuracy": 0.6657251864671707, + "num_tokens": 2120393921.0, + "step": 12648 + }, + { + "entropy": 1.7234665950139363, + "epoch": 1.3895526077284337, + "grad_norm": 0.6455737352371216, + "learning_rate": 6.052660473846084e-06, + "loss": 1.5308, + "mean_token_accuracy": 0.626121923327446, + "num_tokens": 2120594349.0, + "step": 12649 + }, + { + "entropy": 1.6969818969567616, + "epoch": 1.3896624646398066, + "grad_norm": 0.7119026184082031, + "learning_rate": 6.05132303436621e-06, + "loss": 1.4113, + "mean_token_accuracy": 0.6501194735368093, + "num_tokens": 2120773080.0, + "step": 12650 + }, + { + "entropy": 1.7361374100049336, + "epoch": 1.3897723215511797, + "grad_norm": 0.7004039287567139, + "learning_rate": 6.049985751509737e-06, + "loss": 1.3639, + "mean_token_accuracy": 0.6526622970898946, + "num_tokens": 2120923549.0, + "step": 12651 + }, + { + "entropy": 1.737576534350713, + "epoch": 1.3898821784625526, + "grad_norm": 0.7022482752799988, + "learning_rate": 6.048648625318984e-06, + "loss": 1.4327, + "mean_token_accuracy": 0.6477037022511164, + "num_tokens": 2121135737.0, + "step": 12652 + }, + { + "entropy": 1.6698547104994457, + "epoch": 1.3899920353739255, + "grad_norm": 0.6836487054824829, + "learning_rate": 6.0473116558362664e-06, + "loss": 1.2327, + "mean_token_accuracy": 0.6811383267243704, + "num_tokens": 2121273006.0, + "step": 12653 + }, + { + "entropy": 1.696809043486913, + "epoch": 1.3901018922852983, + "grad_norm": 0.820905327796936, + "learning_rate": 6.045974843103905e-06, + "loss": 1.3486, + "mean_token_accuracy": 0.6534950186808904, + "num_tokens": 2121413071.0, + "step": 12654 + }, + { + "entropy": 1.6631783346335094, + "epoch": 1.3902117491966712, + "grad_norm": 0.6575664281845093, + "learning_rate": 6.0446381871642094e-06, + "loss": 1.4987, + "mean_token_accuracy": 0.6349124858776728, + "num_tokens": 2121632157.0, + "step": 12655 + }, + { + "entropy": 1.6863780121008556, + "epoch": 1.3903216061080443, + "grad_norm": 0.6468070149421692, + "learning_rate": 6.043301688059482e-06, + "loss": 1.4576, + "mean_token_accuracy": 0.6422467132409414, + "num_tokens": 2121782067.0, + "step": 12656 + }, + { + "entropy": 1.7445188562075298, + "epoch": 1.3904314630194172, + "grad_norm": 0.6437369585037231, + "learning_rate": 6.04196534583202e-06, + "loss": 1.3062, + "mean_token_accuracy": 0.6756186882654825, + "num_tokens": 2121955217.0, + "step": 12657 + }, + { + "entropy": 1.7488488654295604, + "epoch": 1.39054131993079, + "grad_norm": 0.7552010416984558, + "learning_rate": 6.0406291605241255e-06, + "loss": 1.321, + "mean_token_accuracy": 0.6689753333727518, + "num_tokens": 2122074461.0, + "step": 12658 + }, + { + "entropy": 1.6862552265326183, + "epoch": 1.3906511768421632, + "grad_norm": 0.7533565759658813, + "learning_rate": 6.039293132178078e-06, + "loss": 1.4313, + "mean_token_accuracy": 0.6706264317035675, + "num_tokens": 2122270613.0, + "step": 12659 + }, + { + "entropy": 1.668715238571167, + "epoch": 1.390761033753536, + "grad_norm": 0.6359433531761169, + "learning_rate": 6.0379572608361715e-06, + "loss": 1.2886, + "mean_token_accuracy": 0.6727031916379929, + "num_tokens": 2122418030.0, + "step": 12660 + }, + { + "entropy": 1.7360176543394725, + "epoch": 1.390870890664909, + "grad_norm": 0.6393101215362549, + "learning_rate": 6.036621546540682e-06, + "loss": 1.4723, + "mean_token_accuracy": 0.6463060726722082, + "num_tokens": 2122658480.0, + "step": 12661 + }, + { + "entropy": 1.6931299567222595, + "epoch": 1.3909807475762819, + "grad_norm": 0.6674166321754456, + "learning_rate": 6.035285989333879e-06, + "loss": 1.2776, + "mean_token_accuracy": 0.6711171269416809, + "num_tokens": 2122775840.0, + "step": 12662 + }, + { + "entropy": 1.6795523365338643, + "epoch": 1.3910906044876548, + "grad_norm": 0.6074432134628296, + "learning_rate": 6.033950589258042e-06, + "loss": 1.3652, + "mean_token_accuracy": 0.6604388256867727, + "num_tokens": 2122943493.0, + "step": 12663 + }, + { + "entropy": 1.6648745040098827, + "epoch": 1.3912004613990279, + "grad_norm": 0.6981958150863647, + "learning_rate": 6.032615346355431e-06, + "loss": 1.5333, + "mean_token_accuracy": 0.6418487280607224, + "num_tokens": 2123127314.0, + "step": 12664 + }, + { + "entropy": 1.6763863563537598, + "epoch": 1.3913103183104008, + "grad_norm": 0.6888807415962219, + "learning_rate": 6.031280260668304e-06, + "loss": 1.3952, + "mean_token_accuracy": 0.6572959423065186, + "num_tokens": 2123323822.0, + "step": 12665 + }, + { + "entropy": 1.6850894292195637, + "epoch": 1.3914201752217736, + "grad_norm": 0.6015814542770386, + "learning_rate": 6.029945332238916e-06, + "loss": 1.5196, + "mean_token_accuracy": 0.6445889174938202, + "num_tokens": 2123520705.0, + "step": 12666 + }, + { + "entropy": 1.7734043498833973, + "epoch": 1.3915300321331465, + "grad_norm": 0.6743616461753845, + "learning_rate": 6.028610561109522e-06, + "loss": 1.5488, + "mean_token_accuracy": 0.6452811906735102, + "num_tokens": 2123672858.0, + "step": 12667 + }, + { + "entropy": 1.7352135578791301, + "epoch": 1.3916398890445194, + "grad_norm": 0.73172527551651, + "learning_rate": 6.027275947322364e-06, + "loss": 1.3727, + "mean_token_accuracy": 0.6599378883838654, + "num_tokens": 2123804318.0, + "step": 12668 + }, + { + "entropy": 1.7475886444250743, + "epoch": 1.3917497459558925, + "grad_norm": 0.6466162800788879, + "learning_rate": 6.025941490919678e-06, + "loss": 1.4018, + "mean_token_accuracy": 0.6429226100444794, + "num_tokens": 2123975562.0, + "step": 12669 + }, + { + "entropy": 1.7205195526281993, + "epoch": 1.3918596028672654, + "grad_norm": 0.6437305808067322, + "learning_rate": 6.024607191943707e-06, + "loss": 1.3518, + "mean_token_accuracy": 0.6662203172842661, + "num_tokens": 2124119989.0, + "step": 12670 + }, + { + "entropy": 1.6926952401796977, + "epoch": 1.3919694597786383, + "grad_norm": 0.6400312185287476, + "learning_rate": 6.023273050436671e-06, + "loss": 1.3766, + "mean_token_accuracy": 0.6680413832267126, + "num_tokens": 2124270078.0, + "step": 12671 + }, + { + "entropy": 1.7688163717587788, + "epoch": 1.3920793166900114, + "grad_norm": 0.7608019113540649, + "learning_rate": 6.021939066440805e-06, + "loss": 1.3084, + "mean_token_accuracy": 0.6677152961492538, + "num_tokens": 2124413626.0, + "step": 12672 + }, + { + "entropy": 1.6955342292785645, + "epoch": 1.3921891736013843, + "grad_norm": 0.669330358505249, + "learning_rate": 6.020605239998325e-06, + "loss": 1.4574, + "mean_token_accuracy": 0.6318171223004659, + "num_tokens": 2124604904.0, + "step": 12673 + }, + { + "entropy": 1.7541027069091797, + "epoch": 1.3922990305127572, + "grad_norm": 0.7224034667015076, + "learning_rate": 6.0192715711514415e-06, + "loss": 1.3589, + "mean_token_accuracy": 0.6613028347492218, + "num_tokens": 2124738170.0, + "step": 12674 + }, + { + "entropy": 1.623542954524358, + "epoch": 1.39240888742413, + "grad_norm": 0.7347180247306824, + "learning_rate": 6.01793805994237e-06, + "loss": 1.4068, + "mean_token_accuracy": 0.6547928502162298, + "num_tokens": 2124955515.0, + "step": 12675 + }, + { + "entropy": 1.7191152274608612, + "epoch": 1.392518744335503, + "grad_norm": 0.6667714715003967, + "learning_rate": 6.016604706413316e-06, + "loss": 1.3162, + "mean_token_accuracy": 0.6564339945713679, + "num_tokens": 2125103505.0, + "step": 12676 + }, + { + "entropy": 1.7070033649603527, + "epoch": 1.392628601246876, + "grad_norm": 0.868321418762207, + "learning_rate": 6.015271510606473e-06, + "loss": 1.4041, + "mean_token_accuracy": 0.6572008927663168, + "num_tokens": 2125269047.0, + "step": 12677 + }, + { + "entropy": 1.7142626245816548, + "epoch": 1.392738458158249, + "grad_norm": 0.6363254189491272, + "learning_rate": 6.01393847256404e-06, + "loss": 1.5588, + "mean_token_accuracy": 0.6368576760093371, + "num_tokens": 2125441325.0, + "step": 12678 + }, + { + "entropy": 1.7593932350476582, + "epoch": 1.3928483150696218, + "grad_norm": 0.5922143459320068, + "learning_rate": 6.012605592328213e-06, + "loss": 1.4497, + "mean_token_accuracy": 0.6469251116116842, + "num_tokens": 2125633983.0, + "step": 12679 + }, + { + "entropy": 1.699719746907552, + "epoch": 1.3929581719809947, + "grad_norm": 0.7195116877555847, + "learning_rate": 6.0112728699411714e-06, + "loss": 1.4665, + "mean_token_accuracy": 0.6510594636201859, + "num_tokens": 2125807954.0, + "step": 12680 + }, + { + "entropy": 1.76499076684316, + "epoch": 1.3930680288923676, + "grad_norm": 0.7974780797958374, + "learning_rate": 6.009940305445091e-06, + "loss": 1.3902, + "mean_token_accuracy": 0.6549367159605026, + "num_tokens": 2125928186.0, + "step": 12681 + }, + { + "entropy": 1.6431426803270976, + "epoch": 1.3931778858037407, + "grad_norm": 0.6491580605506897, + "learning_rate": 6.008607898882155e-06, + "loss": 1.3716, + "mean_token_accuracy": 0.6630857636531194, + "num_tokens": 2126146596.0, + "step": 12682 + }, + { + "entropy": 1.6817518671353657, + "epoch": 1.3932877427151136, + "grad_norm": 0.7032451629638672, + "learning_rate": 6.00727565029453e-06, + "loss": 1.4448, + "mean_token_accuracy": 0.6501007825136185, + "num_tokens": 2126341206.0, + "step": 12683 + }, + { + "entropy": 1.6864906052748363, + "epoch": 1.3933975996264865, + "grad_norm": 0.8053936958312988, + "learning_rate": 6.005943559724376e-06, + "loss": 1.5758, + "mean_token_accuracy": 0.6425473292668661, + "num_tokens": 2126538622.0, + "step": 12684 + }, + { + "entropy": 1.6943072477976482, + "epoch": 1.3935074565378596, + "grad_norm": 0.6725866198539734, + "learning_rate": 6.004611627213863e-06, + "loss": 1.3835, + "mean_token_accuracy": 0.6750175058841705, + "num_tokens": 2126712924.0, + "step": 12685 + }, + { + "entropy": 1.7044211824735005, + "epoch": 1.3936173134492325, + "grad_norm": 0.6494740843772888, + "learning_rate": 6.003279852805137e-06, + "loss": 1.4474, + "mean_token_accuracy": 0.6646173646052679, + "num_tokens": 2126862241.0, + "step": 12686 + }, + { + "entropy": 1.7204302748044331, + "epoch": 1.3937271703606053, + "grad_norm": 0.6521239876747131, + "learning_rate": 6.001948236540357e-06, + "loss": 1.4459, + "mean_token_accuracy": 0.6400475154320399, + "num_tokens": 2127025520.0, + "step": 12687 + }, + { + "entropy": 1.716679612795512, + "epoch": 1.3938370272719782, + "grad_norm": 0.7173079252243042, + "learning_rate": 6.000616778461661e-06, + "loss": 1.3788, + "mean_token_accuracy": 0.6761051565408707, + "num_tokens": 2127282244.0, + "step": 12688 + }, + { + "entropy": 1.7010992169380188, + "epoch": 1.393946884183351, + "grad_norm": 19.059829711914062, + "learning_rate": 5.99928547861119e-06, + "loss": 1.3966, + "mean_token_accuracy": 0.6659414370854696, + "num_tokens": 2127448006.0, + "step": 12689 + }, + { + "entropy": 1.657021979490916, + "epoch": 1.3940567410947242, + "grad_norm": 0.5839347839355469, + "learning_rate": 5.9979543370310775e-06, + "loss": 1.4324, + "mean_token_accuracy": 0.636112704873085, + "num_tokens": 2127644165.0, + "step": 12690 + }, + { + "entropy": 1.714819739262263, + "epoch": 1.394166598006097, + "grad_norm": 0.6099923849105835, + "learning_rate": 5.996623353763462e-06, + "loss": 1.4218, + "mean_token_accuracy": 0.6461069136857986, + "num_tokens": 2127789572.0, + "step": 12691 + }, + { + "entropy": 1.7752255698045094, + "epoch": 1.39427645491747, + "grad_norm": 0.722017228603363, + "learning_rate": 5.995292528850462e-06, + "loss": 1.5213, + "mean_token_accuracy": 0.6427341798941294, + "num_tokens": 2127974615.0, + "step": 12692 + }, + { + "entropy": 1.7523868183294933, + "epoch": 1.3943863118288429, + "grad_norm": 0.6684828996658325, + "learning_rate": 5.993961862334197e-06, + "loss": 1.4147, + "mean_token_accuracy": 0.653435617685318, + "num_tokens": 2128141442.0, + "step": 12693 + }, + { + "entropy": 1.7046211461226146, + "epoch": 1.3944961687402158, + "grad_norm": 0.7684405446052551, + "learning_rate": 5.9926313542567815e-06, + "loss": 1.2746, + "mean_token_accuracy": 0.6661444703737894, + "num_tokens": 2128311566.0, + "step": 12694 + }, + { + "entropy": 1.7271720071633656, + "epoch": 1.3946060256515889, + "grad_norm": 0.6236696839332581, + "learning_rate": 5.99130100466033e-06, + "loss": 1.3441, + "mean_token_accuracy": 0.663600504398346, + "num_tokens": 2128500803.0, + "step": 12695 + }, + { + "entropy": 1.7542118628819783, + "epoch": 1.3947158825629618, + "grad_norm": 0.8479838371276855, + "learning_rate": 5.989970813586945e-06, + "loss": 1.4227, + "mean_token_accuracy": 0.6497650593519211, + "num_tokens": 2128680857.0, + "step": 12696 + }, + { + "entropy": 1.66288094719251, + "epoch": 1.3948257394743346, + "grad_norm": 0.6915057897567749, + "learning_rate": 5.988640781078724e-06, + "loss": 1.3693, + "mean_token_accuracy": 0.6525774498780569, + "num_tokens": 2128866827.0, + "step": 12697 + }, + { + "entropy": 1.729894479115804, + "epoch": 1.3949355963857077, + "grad_norm": 0.6445484161376953, + "learning_rate": 5.987310907177763e-06, + "loss": 1.4009, + "mean_token_accuracy": 0.6387731532255808, + "num_tokens": 2129076102.0, + "step": 12698 + }, + { + "entropy": 1.6647725601991017, + "epoch": 1.3950454532970806, + "grad_norm": 0.7019267678260803, + "learning_rate": 5.985981191926156e-06, + "loss": 1.4318, + "mean_token_accuracy": 0.6519081046183904, + "num_tokens": 2129283963.0, + "step": 12699 + }, + { + "entropy": 1.6667829751968384, + "epoch": 1.3951553102084535, + "grad_norm": 0.6425153017044067, + "learning_rate": 5.984651635365985e-06, + "loss": 1.4025, + "mean_token_accuracy": 0.6508398950099945, + "num_tokens": 2129458498.0, + "step": 12700 + }, + { + "entropy": 1.6813570360342662, + "epoch": 1.3952651671198264, + "grad_norm": 0.7219659090042114, + "learning_rate": 5.983322237539326e-06, + "loss": 1.331, + "mean_token_accuracy": 0.6687901417414347, + "num_tokens": 2129633109.0, + "step": 12701 + }, + { + "entropy": 1.7440830767154694, + "epoch": 1.3953750240311993, + "grad_norm": 0.7922948598861694, + "learning_rate": 5.981992998488262e-06, + "loss": 1.3723, + "mean_token_accuracy": 0.6505018224318823, + "num_tokens": 2129786689.0, + "step": 12702 + }, + { + "entropy": 1.6904495855172474, + "epoch": 1.3954848809425724, + "grad_norm": 0.7381642460823059, + "learning_rate": 5.980663918254854e-06, + "loss": 1.449, + "mean_token_accuracy": 0.6513757407665253, + "num_tokens": 2129959663.0, + "step": 12703 + }, + { + "entropy": 1.594289908806483, + "epoch": 1.3955947378539453, + "grad_norm": 0.7294413447380066, + "learning_rate": 5.979334996881177e-06, + "loss": 1.2396, + "mean_token_accuracy": 0.6839832961559296, + "num_tokens": 2130119921.0, + "step": 12704 + }, + { + "entropy": 1.6758387287457783, + "epoch": 1.3957045947653182, + "grad_norm": 0.6613010168075562, + "learning_rate": 5.978006234409282e-06, + "loss": 1.4366, + "mean_token_accuracy": 0.6469068974256516, + "num_tokens": 2130320202.0, + "step": 12705 + }, + { + "entropy": 1.6659991939862568, + "epoch": 1.395814451676691, + "grad_norm": 0.6377636790275574, + "learning_rate": 5.9766776308812245e-06, + "loss": 1.3725, + "mean_token_accuracy": 0.6629499892393748, + "num_tokens": 2130524111.0, + "step": 12706 + }, + { + "entropy": 1.6998618841171265, + "epoch": 1.395924308588064, + "grad_norm": 0.6595562100410461, + "learning_rate": 5.9753491863390585e-06, + "loss": 1.5279, + "mean_token_accuracy": 0.6366192599137624, + "num_tokens": 2130767602.0, + "step": 12707 + }, + { + "entropy": 1.7020623286565144, + "epoch": 1.396034165499437, + "grad_norm": 0.8012373447418213, + "learning_rate": 5.974020900824829e-06, + "loss": 1.4119, + "mean_token_accuracy": 0.675724262992541, + "num_tokens": 2130889858.0, + "step": 12708 + }, + { + "entropy": 1.7753592034180958, + "epoch": 1.39614402241081, + "grad_norm": 0.7129668593406677, + "learning_rate": 5.972692774380568e-06, + "loss": 1.3992, + "mean_token_accuracy": 0.669427881638209, + "num_tokens": 2131043830.0, + "step": 12709 + }, + { + "entropy": 1.67575670282046, + "epoch": 1.3962538793221828, + "grad_norm": 0.7088383436203003, + "learning_rate": 5.9713648070483165e-06, + "loss": 1.5448, + "mean_token_accuracy": 0.6411570161581039, + "num_tokens": 2131241286.0, + "step": 12710 + }, + { + "entropy": 1.6525772909323375, + "epoch": 1.396363736233556, + "grad_norm": 0.7682273983955383, + "learning_rate": 5.9700369988701055e-06, + "loss": 1.3177, + "mean_token_accuracy": 0.6682016005118688, + "num_tokens": 2131431791.0, + "step": 12711 + }, + { + "entropy": 1.6430234909057617, + "epoch": 1.3964735931449288, + "grad_norm": 0.5847803950309753, + "learning_rate": 5.968709349887957e-06, + "loss": 1.3363, + "mean_token_accuracy": 0.676411454876264, + "num_tokens": 2131614057.0, + "step": 12712 + }, + { + "entropy": 1.679671843846639, + "epoch": 1.3965834500563017, + "grad_norm": 0.7472701072692871, + "learning_rate": 5.9673818601438885e-06, + "loss": 1.2549, + "mean_token_accuracy": 0.6748977800210317, + "num_tokens": 2131726410.0, + "step": 12713 + }, + { + "entropy": 1.7021221121152241, + "epoch": 1.3966933069676746, + "grad_norm": 0.8775181174278259, + "learning_rate": 5.9660545296799185e-06, + "loss": 1.3966, + "mean_token_accuracy": 0.6520507534344991, + "num_tokens": 2131916348.0, + "step": 12714 + }, + { + "entropy": 1.6914891302585602, + "epoch": 1.3968031638790475, + "grad_norm": 0.6840558648109436, + "learning_rate": 5.964727358538049e-06, + "loss": 1.2845, + "mean_token_accuracy": 0.6717403084039688, + "num_tokens": 2132060214.0, + "step": 12715 + }, + { + "entropy": 1.7304686605930328, + "epoch": 1.3969130207904206, + "grad_norm": 0.7312942147254944, + "learning_rate": 5.963400346760297e-06, + "loss": 1.5967, + "mean_token_accuracy": 0.6488021487991015, + "num_tokens": 2132217864.0, + "step": 12716 + }, + { + "entropy": 1.7533520062764485, + "epoch": 1.3970228777017935, + "grad_norm": 0.6745875477790833, + "learning_rate": 5.962073494388652e-06, + "loss": 1.4307, + "mean_token_accuracy": 0.6536213358243307, + "num_tokens": 2132385245.0, + "step": 12717 + }, + { + "entropy": 1.7508942981561024, + "epoch": 1.3971327346131663, + "grad_norm": 0.7504527568817139, + "learning_rate": 5.9607468014651085e-06, + "loss": 1.5074, + "mean_token_accuracy": 0.6408179601033529, + "num_tokens": 2132551692.0, + "step": 12718 + }, + { + "entropy": 1.7008541425069172, + "epoch": 1.3972425915245392, + "grad_norm": 0.6057212352752686, + "learning_rate": 5.959420268031661e-06, + "loss": 1.5229, + "mean_token_accuracy": 0.6457130114237467, + "num_tokens": 2132768679.0, + "step": 12719 + }, + { + "entropy": 1.6575111548105876, + "epoch": 1.397352448435912, + "grad_norm": 0.8810737133026123, + "learning_rate": 5.9580938941302905e-06, + "loss": 1.2997, + "mean_token_accuracy": 0.6577588965495428, + "num_tokens": 2132902521.0, + "step": 12720 + }, + { + "entropy": 1.7134600281715393, + "epoch": 1.3974623053472852, + "grad_norm": 0.6999838948249817, + "learning_rate": 5.956767679802972e-06, + "loss": 1.4848, + "mean_token_accuracy": 0.6436127026875814, + "num_tokens": 2133091286.0, + "step": 12721 + }, + { + "entropy": 1.6435298323631287, + "epoch": 1.397572162258658, + "grad_norm": 0.7796320915222168, + "learning_rate": 5.955441625091685e-06, + "loss": 1.5166, + "mean_token_accuracy": 0.6519753734270731, + "num_tokens": 2133271963.0, + "step": 12722 + }, + { + "entropy": 1.7259367903073628, + "epoch": 1.397682019170031, + "grad_norm": 0.7645683288574219, + "learning_rate": 5.9541157300384015e-06, + "loss": 1.501, + "mean_token_accuracy": 0.6502551784118017, + "num_tokens": 2133415532.0, + "step": 12723 + }, + { + "entropy": 1.693211168050766, + "epoch": 1.397791876081404, + "grad_norm": 0.7891166806221008, + "learning_rate": 5.95278999468508e-06, + "loss": 1.2687, + "mean_token_accuracy": 0.6685374329487482, + "num_tokens": 2133555494.0, + "step": 12724 + }, + { + "entropy": 1.6511625250180562, + "epoch": 1.397901732992777, + "grad_norm": 0.5752436518669128, + "learning_rate": 5.951464419073677e-06, + "loss": 1.441, + "mean_token_accuracy": 0.6449030637741089, + "num_tokens": 2133777556.0, + "step": 12725 + }, + { + "entropy": 1.6687390704949696, + "epoch": 1.3980115899041499, + "grad_norm": 0.6337757110595703, + "learning_rate": 5.9501390032461555e-06, + "loss": 1.3054, + "mean_token_accuracy": 0.6698885361353556, + "num_tokens": 2134009899.0, + "step": 12726 + }, + { + "entropy": 1.7128651042779286, + "epoch": 1.3981214468155228, + "grad_norm": 0.6634067296981812, + "learning_rate": 5.9488137472444526e-06, + "loss": 1.4062, + "mean_token_accuracy": 0.6472688515981039, + "num_tokens": 2134211243.0, + "step": 12727 + }, + { + "entropy": 1.7509056230386097, + "epoch": 1.3982313037268956, + "grad_norm": 0.691646933555603, + "learning_rate": 5.947488651110525e-06, + "loss": 1.4227, + "mean_token_accuracy": 0.6611177225907644, + "num_tokens": 2134348640.0, + "step": 12728 + }, + { + "entropy": 1.7512084345022838, + "epoch": 1.3983411606382687, + "grad_norm": 0.8137237429618835, + "learning_rate": 5.946163714886304e-06, + "loss": 1.3646, + "mean_token_accuracy": 0.6650453756252924, + "num_tokens": 2134516069.0, + "step": 12729 + }, + { + "entropy": 1.702260931332906, + "epoch": 1.3984510175496416, + "grad_norm": 0.6533256769180298, + "learning_rate": 5.944838938613722e-06, + "loss": 1.4827, + "mean_token_accuracy": 0.6493832468986511, + "num_tokens": 2134719967.0, + "step": 12730 + }, + { + "entropy": 1.720413823922475, + "epoch": 1.3985608744610145, + "grad_norm": 0.7293774485588074, + "learning_rate": 5.94351432233471e-06, + "loss": 1.3079, + "mean_token_accuracy": 0.6654588927825292, + "num_tokens": 2134840334.0, + "step": 12731 + }, + { + "entropy": 1.6680286626021068, + "epoch": 1.3986707313723874, + "grad_norm": 0.7952906489372253, + "learning_rate": 5.942189866091192e-06, + "loss": 1.4333, + "mean_token_accuracy": 0.6534133901198705, + "num_tokens": 2134991028.0, + "step": 12732 + }, + { + "entropy": 1.6684763828913372, + "epoch": 1.3987805882837603, + "grad_norm": 0.6839401721954346, + "learning_rate": 5.940865569925084e-06, + "loss": 1.5594, + "mean_token_accuracy": 0.6263647129138311, + "num_tokens": 2135260443.0, + "step": 12733 + }, + { + "entropy": 1.763797640800476, + "epoch": 1.3988904451951334, + "grad_norm": 0.6452272534370422, + "learning_rate": 5.9395414338783e-06, + "loss": 1.4462, + "mean_token_accuracy": 0.6526133120059967, + "num_tokens": 2135421632.0, + "step": 12734 + }, + { + "entropy": 1.7352626224358876, + "epoch": 1.3990003021065063, + "grad_norm": 0.6591407060623169, + "learning_rate": 5.938217457992752e-06, + "loss": 1.3205, + "mean_token_accuracy": 0.6572297314802805, + "num_tokens": 2135570456.0, + "step": 12735 + }, + { + "entropy": 1.6932378311951954, + "epoch": 1.3991101590178792, + "grad_norm": 1.0089200735092163, + "learning_rate": 5.936893642310342e-06, + "loss": 1.4389, + "mean_token_accuracy": 0.6600636690855026, + "num_tokens": 2135747412.0, + "step": 12736 + }, + { + "entropy": 1.6893216868241627, + "epoch": 1.3992200159292523, + "grad_norm": 0.5760171413421631, + "learning_rate": 5.935569986872962e-06, + "loss": 1.4425, + "mean_token_accuracy": 0.6468855142593384, + "num_tokens": 2135980426.0, + "step": 12737 + }, + { + "entropy": 1.7061325411001842, + "epoch": 1.3993298728406252, + "grad_norm": 0.6980313062667847, + "learning_rate": 5.934246491722515e-06, + "loss": 1.3273, + "mean_token_accuracy": 0.6672591865062714, + "num_tokens": 2136158512.0, + "step": 12738 + }, + { + "entropy": 1.734901487827301, + "epoch": 1.399439729751998, + "grad_norm": 0.8329048752784729, + "learning_rate": 5.93292315690088e-06, + "loss": 1.3188, + "mean_token_accuracy": 0.6712016463279724, + "num_tokens": 2136318374.0, + "step": 12739 + }, + { + "entropy": 1.7213096022605896, + "epoch": 1.399549586663371, + "grad_norm": 0.8984243273735046, + "learning_rate": 5.931599982449945e-06, + "loss": 1.5536, + "mean_token_accuracy": 0.648496687412262, + "num_tokens": 2136472657.0, + "step": 12740 + }, + { + "entropy": 1.6307465930779774, + "epoch": 1.3996594435747438, + "grad_norm": 0.6590262651443481, + "learning_rate": 5.930276968411589e-06, + "loss": 1.3478, + "mean_token_accuracy": 0.6669967323541641, + "num_tokens": 2136625484.0, + "step": 12741 + }, + { + "entropy": 1.744086354970932, + "epoch": 1.399769300486117, + "grad_norm": 0.794403612613678, + "learning_rate": 5.928954114827679e-06, + "loss": 1.2884, + "mean_token_accuracy": 0.6704124808311462, + "num_tokens": 2136783736.0, + "step": 12742 + }, + { + "entropy": 1.7087414264678955, + "epoch": 1.3998791573974898, + "grad_norm": 0.6517627239227295, + "learning_rate": 5.927631421740088e-06, + "loss": 1.4211, + "mean_token_accuracy": 0.6420366764068604, + "num_tokens": 2136965121.0, + "step": 12743 + }, + { + "entropy": 1.7388703723748524, + "epoch": 1.3999890143088627, + "grad_norm": 0.5937987565994263, + "learning_rate": 5.926308889190677e-06, + "loss": 1.3561, + "mean_token_accuracy": 0.6579962919155756, + "num_tokens": 2137139051.0, + "step": 12744 + }, + { + "entropy": 1.710933009783427, + "epoch": 1.4000988712202356, + "grad_norm": 0.653157651424408, + "learning_rate": 5.9249865172213e-06, + "loss": 1.4606, + "mean_token_accuracy": 0.647930254538854, + "num_tokens": 2137320154.0, + "step": 12745 + }, + { + "entropy": 1.6999001502990723, + "epoch": 1.4002087281316085, + "grad_norm": 0.7102558612823486, + "learning_rate": 5.9236643058738154e-06, + "loss": 1.4033, + "mean_token_accuracy": 0.6542644649744034, + "num_tokens": 2137500878.0, + "step": 12746 + }, + { + "entropy": 1.7328318357467651, + "epoch": 1.4003185850429816, + "grad_norm": 0.6837024092674255, + "learning_rate": 5.922342255190069e-06, + "loss": 1.346, + "mean_token_accuracy": 0.66578309237957, + "num_tokens": 2137655692.0, + "step": 12747 + }, + { + "entropy": 1.6835198104381561, + "epoch": 1.4004284419543545, + "grad_norm": 0.6370250582695007, + "learning_rate": 5.921020365211904e-06, + "loss": 1.5254, + "mean_token_accuracy": 0.6239050130049387, + "num_tokens": 2137823214.0, + "step": 12748 + }, + { + "entropy": 1.7602061529954274, + "epoch": 1.4005382988657273, + "grad_norm": 0.6429856419563293, + "learning_rate": 5.91969863598115e-06, + "loss": 1.4799, + "mean_token_accuracy": 0.6393208205699921, + "num_tokens": 2138018064.0, + "step": 12749 + }, + { + "entropy": 1.745541363954544, + "epoch": 1.4006481557771004, + "grad_norm": 0.8476991653442383, + "learning_rate": 5.918377067539649e-06, + "loss": 1.1879, + "mean_token_accuracy": 0.6587680826584498, + "num_tokens": 2138210800.0, + "step": 12750 + }, + { + "entropy": 1.7145410180091858, + "epoch": 1.4007580126884733, + "grad_norm": 0.7389444708824158, + "learning_rate": 5.917055659929226e-06, + "loss": 1.4971, + "mean_token_accuracy": 0.6424557218949, + "num_tokens": 2138401395.0, + "step": 12751 + }, + { + "entropy": 1.7683631479740143, + "epoch": 1.4008678695998462, + "grad_norm": 0.7572634816169739, + "learning_rate": 5.9157344131916964e-06, + "loss": 1.2822, + "mean_token_accuracy": 0.6673834770917892, + "num_tokens": 2138510935.0, + "step": 12752 + }, + { + "entropy": 1.7613183856010437, + "epoch": 1.400977726511219, + "grad_norm": 0.7066530585289001, + "learning_rate": 5.914413327368884e-06, + "loss": 1.4304, + "mean_token_accuracy": 0.6482445945342382, + "num_tokens": 2138700696.0, + "step": 12753 + }, + { + "entropy": 1.7241238355636597, + "epoch": 1.401087583422592, + "grad_norm": 0.6761777997016907, + "learning_rate": 5.913092402502596e-06, + "loss": 1.4649, + "mean_token_accuracy": 0.632220983505249, + "num_tokens": 2138879164.0, + "step": 12754 + }, + { + "entropy": 1.7183875143527985, + "epoch": 1.401197440333965, + "grad_norm": 0.6765937209129333, + "learning_rate": 5.911771638634645e-06, + "loss": 1.3318, + "mean_token_accuracy": 0.6607321550448736, + "num_tokens": 2138999181.0, + "step": 12755 + }, + { + "entropy": 1.7736754318078358, + "epoch": 1.401307297245338, + "grad_norm": 6.098966598510742, + "learning_rate": 5.910451035806827e-06, + "loss": 1.3586, + "mean_token_accuracy": 0.6717801292737325, + "num_tokens": 2139163745.0, + "step": 12756 + }, + { + "entropy": 1.617441564798355, + "epoch": 1.4014171541567109, + "grad_norm": 0.6018511652946472, + "learning_rate": 5.909130594060937e-06, + "loss": 1.497, + "mean_token_accuracy": 0.6571058879295985, + "num_tokens": 2139356692.0, + "step": 12757 + }, + { + "entropy": 1.7085582911968231, + "epoch": 1.4015270110680838, + "grad_norm": 0.7229043245315552, + "learning_rate": 5.907810313438773e-06, + "loss": 1.2965, + "mean_token_accuracy": 0.6663492073615392, + "num_tokens": 2139499979.0, + "step": 12758 + }, + { + "entropy": 1.7085819641749065, + "epoch": 1.4016368679794566, + "grad_norm": 0.6772550344467163, + "learning_rate": 5.906490193982117e-06, + "loss": 1.4769, + "mean_token_accuracy": 0.6481290062268575, + "num_tokens": 2139690644.0, + "step": 12759 + }, + { + "entropy": 1.7135390937328339, + "epoch": 1.4017467248908297, + "grad_norm": 0.7282260060310364, + "learning_rate": 5.905170235732753e-06, + "loss": 1.3773, + "mean_token_accuracy": 0.6570235292116801, + "num_tokens": 2139868571.0, + "step": 12760 + }, + { + "entropy": 1.7161648571491241, + "epoch": 1.4018565818022026, + "grad_norm": 0.820698082447052, + "learning_rate": 5.903850438732454e-06, + "loss": 1.6134, + "mean_token_accuracy": 0.6403177628914515, + "num_tokens": 2140033198.0, + "step": 12761 + }, + { + "entropy": 1.6414225101470947, + "epoch": 1.4019664387135755, + "grad_norm": 0.7905219197273254, + "learning_rate": 5.9025308030229926e-06, + "loss": 1.334, + "mean_token_accuracy": 0.6737157901128134, + "num_tokens": 2140175099.0, + "step": 12762 + }, + { + "entropy": 1.6699174046516418, + "epoch": 1.4020762956249486, + "grad_norm": 0.7612943053245544, + "learning_rate": 5.901211328646134e-06, + "loss": 1.3, + "mean_token_accuracy": 0.6618338972330093, + "num_tokens": 2140342013.0, + "step": 12763 + }, + { + "entropy": 1.721044272184372, + "epoch": 1.4021861525363215, + "grad_norm": 0.6190001964569092, + "learning_rate": 5.899892015643641e-06, + "loss": 1.3738, + "mean_token_accuracy": 0.6535343378782272, + "num_tokens": 2140523914.0, + "step": 12764 + }, + { + "entropy": 1.7023292283217113, + "epoch": 1.4022960094476944, + "grad_norm": 0.6998386979103088, + "learning_rate": 5.898572864057264e-06, + "loss": 1.2795, + "mean_token_accuracy": 0.6628076682488123, + "num_tokens": 2140648869.0, + "step": 12765 + }, + { + "entropy": 1.684864302476247, + "epoch": 1.4024058663590673, + "grad_norm": 0.7491025328636169, + "learning_rate": 5.8972538739287565e-06, + "loss": 1.3828, + "mean_token_accuracy": 0.6604053676128387, + "num_tokens": 2140791080.0, + "step": 12766 + }, + { + "entropy": 1.6907603442668915, + "epoch": 1.4025157232704402, + "grad_norm": 0.7660679221153259, + "learning_rate": 5.895935045299868e-06, + "loss": 1.4723, + "mean_token_accuracy": 0.6463464796543121, + "num_tokens": 2140950257.0, + "step": 12767 + }, + { + "entropy": 1.7172885537147522, + "epoch": 1.4026255801818133, + "grad_norm": 0.6535598635673523, + "learning_rate": 5.894616378212335e-06, + "loss": 1.6173, + "mean_token_accuracy": 0.6273392041524252, + "num_tokens": 2141170697.0, + "step": 12768 + }, + { + "entropy": 1.6735565066337585, + "epoch": 1.4027354370931862, + "grad_norm": 0.7289633750915527, + "learning_rate": 5.8932978727078916e-06, + "loss": 1.5432, + "mean_token_accuracy": 0.6520901521046957, + "num_tokens": 2141348090.0, + "step": 12769 + }, + { + "entropy": 1.7456530233224232, + "epoch": 1.402845294004559, + "grad_norm": 0.6960586905479431, + "learning_rate": 5.891979528828271e-06, + "loss": 1.3654, + "mean_token_accuracy": 0.6549131870269775, + "num_tokens": 2141509964.0, + "step": 12770 + }, + { + "entropy": 1.7111040155092876, + "epoch": 1.402955150915932, + "grad_norm": 0.660510241985321, + "learning_rate": 5.8906613466151945e-06, + "loss": 1.5111, + "mean_token_accuracy": 0.6399530122677485, + "num_tokens": 2141692058.0, + "step": 12771 + }, + { + "entropy": 1.64168119430542, + "epoch": 1.4030650078273048, + "grad_norm": 0.6687092185020447, + "learning_rate": 5.889343326110386e-06, + "loss": 1.3046, + "mean_token_accuracy": 0.6702596594889959, + "num_tokens": 2141843417.0, + "step": 12772 + }, + { + "entropy": 1.6783235470453899, + "epoch": 1.403174864738678, + "grad_norm": 0.6634986996650696, + "learning_rate": 5.8880254673555585e-06, + "loss": 1.3643, + "mean_token_accuracy": 0.6527419487635294, + "num_tokens": 2142029490.0, + "step": 12773 + }, + { + "entropy": 1.8106913566589355, + "epoch": 1.4032847216500508, + "grad_norm": 0.9083941578865051, + "learning_rate": 5.886707770392419e-06, + "loss": 1.3996, + "mean_token_accuracy": 0.6511979649464289, + "num_tokens": 2142270554.0, + "step": 12774 + }, + { + "entropy": 1.673154612382253, + "epoch": 1.4033945785614237, + "grad_norm": 0.8272859454154968, + "learning_rate": 5.885390235262678e-06, + "loss": 1.3946, + "mean_token_accuracy": 0.6538991828759512, + "num_tokens": 2142445460.0, + "step": 12775 + }, + { + "entropy": 1.7107236782709758, + "epoch": 1.4035044354727968, + "grad_norm": 0.7277297377586365, + "learning_rate": 5.88407286200803e-06, + "loss": 1.359, + "mean_token_accuracy": 0.6583436330159506, + "num_tokens": 2142591637.0, + "step": 12776 + }, + { + "entropy": 1.7707445522149403, + "epoch": 1.4036142923841697, + "grad_norm": 0.8745039701461792, + "learning_rate": 5.882755650670168e-06, + "loss": 1.3564, + "mean_token_accuracy": 0.6674359192450842, + "num_tokens": 2142733811.0, + "step": 12777 + }, + { + "entropy": 1.7235571146011353, + "epoch": 1.4037241492955426, + "grad_norm": 0.644283652305603, + "learning_rate": 5.881438601290783e-06, + "loss": 1.3158, + "mean_token_accuracy": 0.6760291904211044, + "num_tokens": 2142913022.0, + "step": 12778 + }, + { + "entropy": 1.7330328822135925, + "epoch": 1.4038340062069155, + "grad_norm": 0.8673796057701111, + "learning_rate": 5.880121713911564e-06, + "loss": 1.2657, + "mean_token_accuracy": 0.6716126203536987, + "num_tokens": 2143050557.0, + "step": 12779 + }, + { + "entropy": 1.6988015472888947, + "epoch": 1.4039438631182883, + "grad_norm": 0.6786276698112488, + "learning_rate": 5.878804988574187e-06, + "loss": 1.4185, + "mean_token_accuracy": 0.6512501438458761, + "num_tokens": 2143287497.0, + "step": 12780 + }, + { + "entropy": 1.691178212563197, + "epoch": 1.4040537200296614, + "grad_norm": 0.6749993562698364, + "learning_rate": 5.877488425320319e-06, + "loss": 1.555, + "mean_token_accuracy": 0.6484788060188293, + "num_tokens": 2143479455.0, + "step": 12781 + }, + { + "entropy": 1.7058672209580739, + "epoch": 1.4041635769410343, + "grad_norm": 0.7358183264732361, + "learning_rate": 5.876172024191638e-06, + "loss": 1.4368, + "mean_token_accuracy": 0.6451542327801386, + "num_tokens": 2143664368.0, + "step": 12782 + }, + { + "entropy": 1.7063271800676982, + "epoch": 1.4042734338524072, + "grad_norm": 0.5937331318855286, + "learning_rate": 5.8748557852298e-06, + "loss": 1.4229, + "mean_token_accuracy": 0.6448936760425568, + "num_tokens": 2143865254.0, + "step": 12783 + }, + { + "entropy": 1.6791508595148723, + "epoch": 1.40438329076378, + "grad_norm": 0.6096206903457642, + "learning_rate": 5.8735397084764715e-06, + "loss": 1.5288, + "mean_token_accuracy": 0.6363308951258659, + "num_tokens": 2144054655.0, + "step": 12784 + }, + { + "entropy": 1.6694469451904297, + "epoch": 1.404493147675153, + "grad_norm": 0.6919692754745483, + "learning_rate": 5.8722237939733e-06, + "loss": 1.3743, + "mean_token_accuracy": 0.6567158748706182, + "num_tokens": 2144227659.0, + "step": 12785 + }, + { + "entropy": 1.69756019115448, + "epoch": 1.404603004586526, + "grad_norm": 0.5972253680229187, + "learning_rate": 5.870908041761931e-06, + "loss": 1.4428, + "mean_token_accuracy": 0.6423445741335551, + "num_tokens": 2144422290.0, + "step": 12786 + }, + { + "entropy": 1.7097438871860504, + "epoch": 1.404712861497899, + "grad_norm": 0.7439383268356323, + "learning_rate": 5.869592451884016e-06, + "loss": 1.5138, + "mean_token_accuracy": 0.6612397755185763, + "num_tokens": 2144562143.0, + "step": 12787 + }, + { + "entropy": 1.6832230985164642, + "epoch": 1.4048227184092719, + "grad_norm": 0.6270433664321899, + "learning_rate": 5.868277024381188e-06, + "loss": 1.4087, + "mean_token_accuracy": 0.6563821186621984, + "num_tokens": 2144747093.0, + "step": 12788 + }, + { + "entropy": 1.6605586012204487, + "epoch": 1.404932575320645, + "grad_norm": 0.5849980711936951, + "learning_rate": 5.8669617592950756e-06, + "loss": 1.4643, + "mean_token_accuracy": 0.6427861303091049, + "num_tokens": 2144974369.0, + "step": 12789 + }, + { + "entropy": 1.754450609286626, + "epoch": 1.4050424322320179, + "grad_norm": 0.6792782545089722, + "learning_rate": 5.8656466566673096e-06, + "loss": 1.5626, + "mean_token_accuracy": 0.6303362647692362, + "num_tokens": 2145155907.0, + "step": 12790 + }, + { + "entropy": 1.7363029321034749, + "epoch": 1.4051522891433907, + "grad_norm": 0.7463129162788391, + "learning_rate": 5.864331716539519e-06, + "loss": 1.5308, + "mean_token_accuracy": 0.6494456827640533, + "num_tokens": 2145329791.0, + "step": 12791 + }, + { + "entropy": 1.665359725554784, + "epoch": 1.4052621460547636, + "grad_norm": 0.681336522102356, + "learning_rate": 5.863016938953313e-06, + "loss": 1.4401, + "mean_token_accuracy": 0.6599595348040262, + "num_tokens": 2145484550.0, + "step": 12792 + }, + { + "entropy": 1.6739614307880402, + "epoch": 1.4053720029661365, + "grad_norm": 0.6140089631080627, + "learning_rate": 5.861702323950304e-06, + "loss": 1.366, + "mean_token_accuracy": 0.654315322637558, + "num_tokens": 2145641851.0, + "step": 12793 + }, + { + "entropy": 1.7016875247160594, + "epoch": 1.4054818598775096, + "grad_norm": 0.6950314044952393, + "learning_rate": 5.860387871572105e-06, + "loss": 1.3112, + "mean_token_accuracy": 0.6771847307682037, + "num_tokens": 2145795212.0, + "step": 12794 + }, + { + "entropy": 1.7205670773983002, + "epoch": 1.4055917167888825, + "grad_norm": 0.6614289879798889, + "learning_rate": 5.85907358186031e-06, + "loss": 1.4641, + "mean_token_accuracy": 0.646242747704188, + "num_tokens": 2146010519.0, + "step": 12795 + }, + { + "entropy": 1.6690000593662262, + "epoch": 1.4057015737002554, + "grad_norm": 0.7087666988372803, + "learning_rate": 5.857759454856522e-06, + "loss": 1.2666, + "mean_token_accuracy": 0.6690100828806559, + "num_tokens": 2146162066.0, + "step": 12796 + }, + { + "entropy": 1.7213394542535145, + "epoch": 1.4058114306116283, + "grad_norm": 0.6396216750144958, + "learning_rate": 5.856445490602332e-06, + "loss": 1.4768, + "mean_token_accuracy": 0.642402172088623, + "num_tokens": 2146343869.0, + "step": 12797 + }, + { + "entropy": 1.7456571360429127, + "epoch": 1.4059212875230012, + "grad_norm": 0.775600016117096, + "learning_rate": 5.855131689139319e-06, + "loss": 1.5141, + "mean_token_accuracy": 0.6424583395322164, + "num_tokens": 2146511314.0, + "step": 12798 + }, + { + "entropy": 1.6875923077265422, + "epoch": 1.4060311444343743, + "grad_norm": 0.8226394653320312, + "learning_rate": 5.853818050509075e-06, + "loss": 1.3774, + "mean_token_accuracy": 0.6524281054735184, + "num_tokens": 2146732048.0, + "step": 12799 + }, + { + "entropy": 1.6746432185173035, + "epoch": 1.4061410013457472, + "grad_norm": 0.8111725449562073, + "learning_rate": 5.852504574753171e-06, + "loss": 1.3547, + "mean_token_accuracy": 0.670900379618009, + "num_tokens": 2146891070.0, + "step": 12800 + }, + { + "entropy": 1.7106841901938121, + "epoch": 1.40625085825712, + "grad_norm": 0.8684120178222656, + "learning_rate": 5.851191261913173e-06, + "loss": 1.4345, + "mean_token_accuracy": 0.6579526364803314, + "num_tokens": 2147074096.0, + "step": 12801 + }, + { + "entropy": 1.6633965174357097, + "epoch": 1.4063607151684931, + "grad_norm": 0.6919369101524353, + "learning_rate": 5.8498781120306515e-06, + "loss": 1.5774, + "mean_token_accuracy": 0.6280744473139445, + "num_tokens": 2147378563.0, + "step": 12802 + }, + { + "entropy": 1.7567805548508961, + "epoch": 1.406470572079866, + "grad_norm": 0.6097819805145264, + "learning_rate": 5.84856512514717e-06, + "loss": 1.5855, + "mean_token_accuracy": 0.6411197036504745, + "num_tokens": 2147542328.0, + "step": 12803 + }, + { + "entropy": 1.7062805791695912, + "epoch": 1.406580428991239, + "grad_norm": 0.7504722476005554, + "learning_rate": 5.847252301304283e-06, + "loss": 1.4771, + "mean_token_accuracy": 0.6363706986109415, + "num_tokens": 2147776239.0, + "step": 12804 + }, + { + "entropy": 1.6895011464754741, + "epoch": 1.4066902859026118, + "grad_norm": 0.7538740038871765, + "learning_rate": 5.845939640543532e-06, + "loss": 1.161, + "mean_token_accuracy": 0.6999478687842687, + "num_tokens": 2147905569.0, + "step": 12805 + }, + { + "entropy": 1.742385983467102, + "epoch": 1.4068001428139847, + "grad_norm": 0.7203928828239441, + "learning_rate": 5.844627142906476e-06, + "loss": 1.3725, + "mean_token_accuracy": 0.65119768679142, + "num_tokens": 2148091661.0, + "step": 12806 + }, + { + "entropy": 1.6920853157838185, + "epoch": 1.4069099997253578, + "grad_norm": 0.9913017153739929, + "learning_rate": 5.843314808434642e-06, + "loss": 1.2324, + "mean_token_accuracy": 0.6863707005977631, + "num_tokens": 2148224018.0, + "step": 12807 + }, + { + "entropy": 1.7018676499525707, + "epoch": 1.4070198566367307, + "grad_norm": 0.6461707949638367, + "learning_rate": 5.842002637169575e-06, + "loss": 1.4177, + "mean_token_accuracy": 0.6471654524405798, + "num_tokens": 2148433747.0, + "step": 12808 + }, + { + "entropy": 1.6787349085013072, + "epoch": 1.4071297135481036, + "grad_norm": 0.7102311253547668, + "learning_rate": 5.840690629152801e-06, + "loss": 1.6121, + "mean_token_accuracy": 0.6307234813769659, + "num_tokens": 2148624255.0, + "step": 12809 + }, + { + "entropy": 1.7247712711493175, + "epoch": 1.4072395704594765, + "grad_norm": 0.7935449481010437, + "learning_rate": 5.8393787844258395e-06, + "loss": 1.4452, + "mean_token_accuracy": 0.6595299392938614, + "num_tokens": 2148761952.0, + "step": 12810 + }, + { + "entropy": 1.7299401660760243, + "epoch": 1.4073494273708493, + "grad_norm": 0.7766920328140259, + "learning_rate": 5.838067103030216e-06, + "loss": 1.4923, + "mean_token_accuracy": 0.6479160586992899, + "num_tokens": 2148884392.0, + "step": 12811 + }, + { + "entropy": 1.658048282066981, + "epoch": 1.4074592842822224, + "grad_norm": 0.6866027116775513, + "learning_rate": 5.836755585007445e-06, + "loss": 1.3419, + "mean_token_accuracy": 0.6578021794557571, + "num_tokens": 2149045505.0, + "step": 12812 + }, + { + "entropy": 1.6724791725476582, + "epoch": 1.4075691411935953, + "grad_norm": 0.8036855459213257, + "learning_rate": 5.8354442303990285e-06, + "loss": 1.3986, + "mean_token_accuracy": 0.6611756682395935, + "num_tokens": 2149207385.0, + "step": 12813 + }, + { + "entropy": 1.7076423863569896, + "epoch": 1.4076789981049682, + "grad_norm": 0.7133153676986694, + "learning_rate": 5.834133039246479e-06, + "loss": 1.4558, + "mean_token_accuracy": 0.6625415583451589, + "num_tokens": 2149386316.0, + "step": 12814 + }, + { + "entropy": 1.6813267568747203, + "epoch": 1.4077888550163413, + "grad_norm": 0.697750985622406, + "learning_rate": 5.832822011591287e-06, + "loss": 1.2656, + "mean_token_accuracy": 0.6713636467854182, + "num_tokens": 2149506203.0, + "step": 12815 + }, + { + "entropy": 1.7380880614121754, + "epoch": 1.4078987119277142, + "grad_norm": 0.8035324215888977, + "learning_rate": 5.831511147474953e-06, + "loss": 1.3003, + "mean_token_accuracy": 0.6692630002895991, + "num_tokens": 2149633124.0, + "step": 12816 + }, + { + "entropy": 1.6894714534282684, + "epoch": 1.408008568839087, + "grad_norm": 0.918594241142273, + "learning_rate": 5.830200446938963e-06, + "loss": 1.3384, + "mean_token_accuracy": 0.6638317654530207, + "num_tokens": 2149829653.0, + "step": 12817 + }, + { + "entropy": 1.6925647656122844, + "epoch": 1.40811842575046, + "grad_norm": 0.7484959959983826, + "learning_rate": 5.828889910024796e-06, + "loss": 1.3594, + "mean_token_accuracy": 0.670293723543485, + "num_tokens": 2149990639.0, + "step": 12818 + }, + { + "entropy": 1.7760498821735382, + "epoch": 1.4082282826618329, + "grad_norm": 0.890577495098114, + "learning_rate": 5.827579536773933e-06, + "loss": 1.559, + "mean_token_accuracy": 0.6545114864905676, + "num_tokens": 2150162739.0, + "step": 12819 + }, + { + "entropy": 1.7027121881643932, + "epoch": 1.408338139573206, + "grad_norm": 0.625486433506012, + "learning_rate": 5.826269327227853e-06, + "loss": 1.5667, + "mean_token_accuracy": 0.6334756761789322, + "num_tokens": 2150347461.0, + "step": 12820 + }, + { + "entropy": 1.7147199014822643, + "epoch": 1.4084479964845789, + "grad_norm": 0.7005285024642944, + "learning_rate": 5.824959281428012e-06, + "loss": 1.281, + "mean_token_accuracy": 0.6745847860972086, + "num_tokens": 2150493583.0, + "step": 12821 + }, + { + "entropy": 1.6792974670728047, + "epoch": 1.4085578533959517, + "grad_norm": 0.6783992648124695, + "learning_rate": 5.823649399415876e-06, + "loss": 1.3088, + "mean_token_accuracy": 0.671634684006373, + "num_tokens": 2150622072.0, + "step": 12822 + }, + { + "entropy": 1.7010674675305684, + "epoch": 1.4086677103073246, + "grad_norm": 0.6391741037368774, + "learning_rate": 5.822339681232909e-06, + "loss": 1.3342, + "mean_token_accuracy": 0.6632145543893179, + "num_tokens": 2150816027.0, + "step": 12823 + }, + { + "entropy": 1.7505244612693787, + "epoch": 1.4087775672186975, + "grad_norm": 0.7569530606269836, + "learning_rate": 5.821030126920558e-06, + "loss": 1.5322, + "mean_token_accuracy": 0.6407630940278372, + "num_tokens": 2150992201.0, + "step": 12824 + }, + { + "entropy": 1.724602570136388, + "epoch": 1.4088874241300706, + "grad_norm": 0.7671657800674438, + "learning_rate": 5.819720736520265e-06, + "loss": 1.3978, + "mean_token_accuracy": 0.6648318469524384, + "num_tokens": 2151147887.0, + "step": 12825 + }, + { + "entropy": 1.6906703511873882, + "epoch": 1.4089972810414435, + "grad_norm": 0.688035249710083, + "learning_rate": 5.818411510073481e-06, + "loss": 1.3757, + "mean_token_accuracy": 0.6557995080947876, + "num_tokens": 2151322253.0, + "step": 12826 + }, + { + "entropy": 1.6956557631492615, + "epoch": 1.4091071379528164, + "grad_norm": 0.5983976125717163, + "learning_rate": 5.817102447621634e-06, + "loss": 1.5676, + "mean_token_accuracy": 0.639850397904714, + "num_tokens": 2151538705.0, + "step": 12827 + }, + { + "entropy": 1.7294293542702992, + "epoch": 1.4092169948641895, + "grad_norm": 0.7593173384666443, + "learning_rate": 5.815793549206163e-06, + "loss": 1.4723, + "mean_token_accuracy": 0.6482956012090048, + "num_tokens": 2151684485.0, + "step": 12828 + }, + { + "entropy": 1.6686496635278065, + "epoch": 1.4093268517755624, + "grad_norm": 0.7154465317726135, + "learning_rate": 5.8144848148684885e-06, + "loss": 1.2243, + "mean_token_accuracy": 0.6809603323539098, + "num_tokens": 2151805051.0, + "step": 12829 + }, + { + "entropy": 1.685718337694804, + "epoch": 1.4094367086869353, + "grad_norm": 0.6651716232299805, + "learning_rate": 5.813176244650032e-06, + "loss": 1.2744, + "mean_token_accuracy": 0.6674046268065771, + "num_tokens": 2151913172.0, + "step": 12830 + }, + { + "entropy": 1.738152305285136, + "epoch": 1.4095465655983082, + "grad_norm": 0.7076160907745361, + "learning_rate": 5.811867838592211e-06, + "loss": 1.477, + "mean_token_accuracy": 0.6419303814570109, + "num_tokens": 2152093205.0, + "step": 12831 + }, + { + "entropy": 1.7107795576254528, + "epoch": 1.409656422509681, + "grad_norm": 0.6455994844436646, + "learning_rate": 5.810559596736437e-06, + "loss": 1.344, + "mean_token_accuracy": 0.6550297737121582, + "num_tokens": 2152241280.0, + "step": 12832 + }, + { + "entropy": 1.717966268459956, + "epoch": 1.4097662794210541, + "grad_norm": 0.6599206924438477, + "learning_rate": 5.809251519124109e-06, + "loss": 1.4948, + "mean_token_accuracy": 0.6381760487953821, + "num_tokens": 2152459746.0, + "step": 12833 + }, + { + "entropy": 1.7183611194292705, + "epoch": 1.409876136332427, + "grad_norm": 0.5863080024719238, + "learning_rate": 5.807943605796631e-06, + "loss": 1.4689, + "mean_token_accuracy": 0.6355762084325155, + "num_tokens": 2152659000.0, + "step": 12834 + }, + { + "entropy": 1.6961732705434163, + "epoch": 1.4099859932438, + "grad_norm": 0.6659945249557495, + "learning_rate": 5.806635856795404e-06, + "loss": 1.3066, + "mean_token_accuracy": 0.6640375355879465, + "num_tokens": 2152798280.0, + "step": 12835 + }, + { + "entropy": 1.7171707153320312, + "epoch": 1.410095850155173, + "grad_norm": 0.5799298286437988, + "learning_rate": 5.80532827216181e-06, + "loss": 1.458, + "mean_token_accuracy": 0.6387546559174856, + "num_tokens": 2153031793.0, + "step": 12836 + }, + { + "entropy": 1.7386144399642944, + "epoch": 1.4102057070665457, + "grad_norm": 0.6916424632072449, + "learning_rate": 5.804020851937231e-06, + "loss": 1.4831, + "mean_token_accuracy": 0.6459663063287735, + "num_tokens": 2153245122.0, + "step": 12837 + }, + { + "entropy": 1.7278761863708496, + "epoch": 1.4103155639779188, + "grad_norm": 0.657574474811554, + "learning_rate": 5.8027135961630565e-06, + "loss": 1.4642, + "mean_token_accuracy": 0.6525082488854727, + "num_tokens": 2153416789.0, + "step": 12838 + }, + { + "entropy": 1.7336822350819905, + "epoch": 1.4104254208892917, + "grad_norm": 0.7622162103652954, + "learning_rate": 5.801406504880649e-06, + "loss": 1.3134, + "mean_token_accuracy": 0.6629678755998611, + "num_tokens": 2153535980.0, + "step": 12839 + }, + { + "entropy": 1.654565433661143, + "epoch": 1.4105352778006646, + "grad_norm": 0.6035882830619812, + "learning_rate": 5.800099578131388e-06, + "loss": 1.3029, + "mean_token_accuracy": 0.680802529056867, + "num_tokens": 2153683959.0, + "step": 12840 + }, + { + "entropy": 1.6937087972958882, + "epoch": 1.4106451347120377, + "grad_norm": 0.6252169609069824, + "learning_rate": 5.798792815956632e-06, + "loss": 1.3357, + "mean_token_accuracy": 0.6599215567111969, + "num_tokens": 2153878650.0, + "step": 12841 + }, + { + "entropy": 1.7652093668778737, + "epoch": 1.4107549916234106, + "grad_norm": 0.715120255947113, + "learning_rate": 5.797486218397737e-06, + "loss": 1.4924, + "mean_token_accuracy": 0.6500881711641947, + "num_tokens": 2154048004.0, + "step": 12842 + }, + { + "entropy": 1.6499028007189434, + "epoch": 1.4108648485347834, + "grad_norm": 0.8363161683082581, + "learning_rate": 5.796179785496061e-06, + "loss": 1.3537, + "mean_token_accuracy": 0.6692218035459518, + "num_tokens": 2154206593.0, + "step": 12843 + }, + { + "entropy": 1.6193082729975383, + "epoch": 1.4109747054461563, + "grad_norm": 0.6517688035964966, + "learning_rate": 5.7948735172929495e-06, + "loss": 1.2561, + "mean_token_accuracy": 0.6783891270558039, + "num_tokens": 2154358170.0, + "step": 12844 + }, + { + "entropy": 1.685720553000768, + "epoch": 1.4110845623575292, + "grad_norm": 0.6175381541252136, + "learning_rate": 5.7935674138297435e-06, + "loss": 1.3547, + "mean_token_accuracy": 0.653323769569397, + "num_tokens": 2154528515.0, + "step": 12845 + }, + { + "entropy": 1.6989250282446544, + "epoch": 1.4111944192689023, + "grad_norm": 0.8476423025131226, + "learning_rate": 5.792261475147782e-06, + "loss": 1.5093, + "mean_token_accuracy": 0.6390935728947321, + "num_tokens": 2154727006.0, + "step": 12846 + }, + { + "entropy": 1.6973857482274373, + "epoch": 1.4113042761802752, + "grad_norm": 0.7867861986160278, + "learning_rate": 5.790955701288402e-06, + "loss": 1.4885, + "mean_token_accuracy": 0.6534365713596344, + "num_tokens": 2154880048.0, + "step": 12847 + }, + { + "entropy": 1.7213109532992046, + "epoch": 1.411414133091648, + "grad_norm": 0.6492405533790588, + "learning_rate": 5.7896500922929265e-06, + "loss": 1.3551, + "mean_token_accuracy": 0.6515356749296188, + "num_tokens": 2155041717.0, + "step": 12848 + }, + { + "entropy": 1.7319613297780354, + "epoch": 1.4115239900030212, + "grad_norm": 0.6068700551986694, + "learning_rate": 5.788344648202675e-06, + "loss": 1.327, + "mean_token_accuracy": 0.6601489931344986, + "num_tokens": 2155192456.0, + "step": 12849 + }, + { + "entropy": 1.6868494153022766, + "epoch": 1.4116338469143939, + "grad_norm": 0.6022621393203735, + "learning_rate": 5.78703936905897e-06, + "loss": 1.4006, + "mean_token_accuracy": 0.6591216822465261, + "num_tokens": 2155389011.0, + "step": 12850 + }, + { + "entropy": 1.6397359669208527, + "epoch": 1.411743703825767, + "grad_norm": 0.7067409753799438, + "learning_rate": 5.785734254903117e-06, + "loss": 1.346, + "mean_token_accuracy": 0.6666657626628876, + "num_tokens": 2155528733.0, + "step": 12851 + }, + { + "entropy": 1.6686400373776753, + "epoch": 1.4118535607371399, + "grad_norm": 0.6087329387664795, + "learning_rate": 5.784429305776427e-06, + "loss": 1.3229, + "mean_token_accuracy": 0.6637933800617853, + "num_tokens": 2155688378.0, + "step": 12852 + }, + { + "entropy": 1.7409396668275197, + "epoch": 1.4119634176485127, + "grad_norm": 0.7018166184425354, + "learning_rate": 5.7831245217202e-06, + "loss": 1.5291, + "mean_token_accuracy": 0.6422918488581976, + "num_tokens": 2155840989.0, + "step": 12853 + }, + { + "entropy": 1.7613717218240101, + "epoch": 1.4120732745598858, + "grad_norm": 0.6511824727058411, + "learning_rate": 5.7818199027757296e-06, + "loss": 1.4611, + "mean_token_accuracy": 0.6367582231760025, + "num_tokens": 2156025021.0, + "step": 12854 + }, + { + "entropy": 1.6414049168427784, + "epoch": 1.4121831314712587, + "grad_norm": 0.82599276304245, + "learning_rate": 5.78051544898431e-06, + "loss": 1.2513, + "mean_token_accuracy": 0.6847147146860758, + "num_tokens": 2156184157.0, + "step": 12855 + }, + { + "entropy": 1.6162588596343994, + "epoch": 1.4122929883826316, + "grad_norm": 0.6030210256576538, + "learning_rate": 5.779211160387224e-06, + "loss": 1.2715, + "mean_token_accuracy": 0.681743452946345, + "num_tokens": 2156337483.0, + "step": 12856 + }, + { + "entropy": 1.7061065038045247, + "epoch": 1.4124028452940045, + "grad_norm": 0.7851901054382324, + "learning_rate": 5.777907037025748e-06, + "loss": 1.2282, + "mean_token_accuracy": 0.6925330509742101, + "num_tokens": 2156434763.0, + "step": 12857 + }, + { + "entropy": 1.6331138213475545, + "epoch": 1.4125127022053774, + "grad_norm": 0.6462249755859375, + "learning_rate": 5.776603078941163e-06, + "loss": 1.3081, + "mean_token_accuracy": 0.6729962974786758, + "num_tokens": 2156592213.0, + "step": 12858 + }, + { + "entropy": 1.7185394763946533, + "epoch": 1.4126225591167505, + "grad_norm": 0.7172744274139404, + "learning_rate": 5.775299286174739e-06, + "loss": 1.5015, + "mean_token_accuracy": 0.6441001494725546, + "num_tokens": 2156745527.0, + "step": 12859 + }, + { + "entropy": 1.743662456671397, + "epoch": 1.4127324160281234, + "grad_norm": 0.6882581114768982, + "learning_rate": 5.773995658767739e-06, + "loss": 1.4154, + "mean_token_accuracy": 0.6491503864526749, + "num_tokens": 2156898386.0, + "step": 12860 + }, + { + "entropy": 1.7289847433567047, + "epoch": 1.4128422729394963, + "grad_norm": 0.6334021687507629, + "learning_rate": 5.772692196761418e-06, + "loss": 1.3968, + "mean_token_accuracy": 0.64505868156751, + "num_tokens": 2157089076.0, + "step": 12861 + }, + { + "entropy": 1.6975704431533813, + "epoch": 1.4129521298508694, + "grad_norm": 0.6869586706161499, + "learning_rate": 5.771388900197037e-06, + "loss": 1.4119, + "mean_token_accuracy": 0.6508818864822388, + "num_tokens": 2157300620.0, + "step": 12862 + }, + { + "entropy": 1.6993489861488342, + "epoch": 1.4130619867622423, + "grad_norm": 0.5923440456390381, + "learning_rate": 5.770085769115836e-06, + "loss": 1.4502, + "mean_token_accuracy": 0.6494368265072504, + "num_tokens": 2157490825.0, + "step": 12863 + }, + { + "entropy": 1.6607881089051564, + "epoch": 1.4131718436736151, + "grad_norm": 0.6591402292251587, + "learning_rate": 5.76878280355907e-06, + "loss": 1.4262, + "mean_token_accuracy": 0.6494799305995306, + "num_tokens": 2157695141.0, + "step": 12864 + }, + { + "entropy": 1.6793291966120403, + "epoch": 1.413281700584988, + "grad_norm": 0.6479206085205078, + "learning_rate": 5.76748000356797e-06, + "loss": 1.2806, + "mean_token_accuracy": 0.6759979277849197, + "num_tokens": 2157865996.0, + "step": 12865 + }, + { + "entropy": 1.7249796688556671, + "epoch": 1.413391557496361, + "grad_norm": 0.7696998715400696, + "learning_rate": 5.766177369183767e-06, + "loss": 1.4357, + "mean_token_accuracy": 0.642528717716535, + "num_tokens": 2158053561.0, + "step": 12866 + }, + { + "entropy": 1.760029007991155, + "epoch": 1.413501414407734, + "grad_norm": 0.8501371145248413, + "learning_rate": 5.764874900447693e-06, + "loss": 1.6135, + "mean_token_accuracy": 0.6389002650976181, + "num_tokens": 2158203865.0, + "step": 12867 + }, + { + "entropy": 1.7189124127229054, + "epoch": 1.413611271319107, + "grad_norm": 0.723200261592865, + "learning_rate": 5.763572597400972e-06, + "loss": 1.3901, + "mean_token_accuracy": 0.6513624439636866, + "num_tokens": 2158354427.0, + "step": 12868 + }, + { + "entropy": 1.6675411363442738, + "epoch": 1.4137211282304798, + "grad_norm": 0.6228331923484802, + "learning_rate": 5.762270460084813e-06, + "loss": 1.3496, + "mean_token_accuracy": 0.6633824755748113, + "num_tokens": 2158523511.0, + "step": 12869 + }, + { + "entropy": 1.8182001411914825, + "epoch": 1.4138309851418527, + "grad_norm": 0.7652587890625, + "learning_rate": 5.760968488540437e-06, + "loss": 1.7234, + "mean_token_accuracy": 0.6275846213102341, + "num_tokens": 2158704933.0, + "step": 12870 + }, + { + "entropy": 1.7456133862336476, + "epoch": 1.4139408420532256, + "grad_norm": 0.9210031628608704, + "learning_rate": 5.759666682809049e-06, + "loss": 1.3865, + "mean_token_accuracy": 0.662195548415184, + "num_tokens": 2158826778.0, + "step": 12871 + }, + { + "entropy": 1.7603688438733418, + "epoch": 1.4140506989645987, + "grad_norm": 1.4208470582962036, + "learning_rate": 5.758365042931848e-06, + "loss": 1.4263, + "mean_token_accuracy": 0.6728880008061727, + "num_tokens": 2158989638.0, + "step": 12872 + }, + { + "entropy": 1.616513580083847, + "epoch": 1.4141605558759716, + "grad_norm": 0.6693636775016785, + "learning_rate": 5.75706356895003e-06, + "loss": 1.1544, + "mean_token_accuracy": 0.6959843138853709, + "num_tokens": 2159102023.0, + "step": 12873 + }, + { + "entropy": 1.7250353495279949, + "epoch": 1.4142704127873444, + "grad_norm": 0.7106985449790955, + "learning_rate": 5.75576226090479e-06, + "loss": 1.3421, + "mean_token_accuracy": 0.6622584760189056, + "num_tokens": 2159251112.0, + "step": 12874 + }, + { + "entropy": 1.6478977501392365, + "epoch": 1.4143802696987176, + "grad_norm": 0.7380810379981995, + "learning_rate": 5.754461118837309e-06, + "loss": 1.498, + "mean_token_accuracy": 0.6418175796667734, + "num_tokens": 2159440534.0, + "step": 12875 + }, + { + "entropy": 1.691566934188207, + "epoch": 1.4144901266100904, + "grad_norm": 0.6885169744491577, + "learning_rate": 5.753160142788775e-06, + "loss": 1.3672, + "mean_token_accuracy": 0.6527943164110184, + "num_tokens": 2159592449.0, + "step": 12876 + }, + { + "entropy": 1.718112548192342, + "epoch": 1.4145999835214633, + "grad_norm": 3.1608479022979736, + "learning_rate": 5.7518593328003515e-06, + "loss": 1.1454, + "mean_token_accuracy": 0.6821771760781606, + "num_tokens": 2159796095.0, + "step": 12877 + }, + { + "entropy": 1.6949720482031505, + "epoch": 1.4147098404328362, + "grad_norm": 0.6075740456581116, + "learning_rate": 5.750558688913217e-06, + "loss": 1.3461, + "mean_token_accuracy": 0.6663214464982351, + "num_tokens": 2159968669.0, + "step": 12878 + }, + { + "entropy": 1.654084712266922, + "epoch": 1.414819697344209, + "grad_norm": 0.5908815264701843, + "learning_rate": 5.749258211168536e-06, + "loss": 1.2713, + "mean_token_accuracy": 0.6685434530178705, + "num_tokens": 2160112122.0, + "step": 12879 + }, + { + "entropy": 1.7059245606263478, + "epoch": 1.4149295542555822, + "grad_norm": 0.6774670481681824, + "learning_rate": 5.747957899607468e-06, + "loss": 1.5032, + "mean_token_accuracy": 0.6411937922239304, + "num_tokens": 2160296569.0, + "step": 12880 + }, + { + "entropy": 1.6509164174397786, + "epoch": 1.415039411166955, + "grad_norm": 0.6976071000099182, + "learning_rate": 5.7466577542711634e-06, + "loss": 1.5301, + "mean_token_accuracy": 0.6424646973609924, + "num_tokens": 2160522783.0, + "step": 12881 + }, + { + "entropy": 1.5837687651316326, + "epoch": 1.415149268078328, + "grad_norm": 0.5916568040847778, + "learning_rate": 5.745357775200775e-06, + "loss": 1.3209, + "mean_token_accuracy": 0.6775771975517273, + "num_tokens": 2160705969.0, + "step": 12882 + }, + { + "entropy": 1.7270430326461792, + "epoch": 1.4152591249897009, + "grad_norm": 0.6653161644935608, + "learning_rate": 5.744057962437441e-06, + "loss": 1.4435, + "mean_token_accuracy": 0.6445691585540771, + "num_tokens": 2160854814.0, + "step": 12883 + }, + { + "entropy": 1.7269325355688732, + "epoch": 1.4153689819010737, + "grad_norm": 0.6765090227127075, + "learning_rate": 5.74275831602231e-06, + "loss": 1.317, + "mean_token_accuracy": 0.6648717721303304, + "num_tokens": 2160986318.0, + "step": 12884 + }, + { + "entropy": 1.7668460508187611, + "epoch": 1.4154788388124468, + "grad_norm": 0.8804817199707031, + "learning_rate": 5.741458835996507e-06, + "loss": 1.5703, + "mean_token_accuracy": 0.6425085514783859, + "num_tokens": 2161164540.0, + "step": 12885 + }, + { + "entropy": 1.719743698835373, + "epoch": 1.4155886957238197, + "grad_norm": 0.8075534105300903, + "learning_rate": 5.740159522401161e-06, + "loss": 1.3203, + "mean_token_accuracy": 0.6848004907369614, + "num_tokens": 2161294773.0, + "step": 12886 + }, + { + "entropy": 1.726130078236262, + "epoch": 1.4156985526351926, + "grad_norm": 0.6449326276779175, + "learning_rate": 5.738860375277395e-06, + "loss": 1.4198, + "mean_token_accuracy": 0.630332425236702, + "num_tokens": 2161463323.0, + "step": 12887 + }, + { + "entropy": 1.712664246559143, + "epoch": 1.4158084095465657, + "grad_norm": 0.6622018218040466, + "learning_rate": 5.737561394666336e-06, + "loss": 1.3093, + "mean_token_accuracy": 0.6683304011821747, + "num_tokens": 2161633350.0, + "step": 12888 + }, + { + "entropy": 1.6602988839149475, + "epoch": 1.4159182664579386, + "grad_norm": 0.6974785923957825, + "learning_rate": 5.7362625806090775e-06, + "loss": 1.3417, + "mean_token_accuracy": 0.6663461575905482, + "num_tokens": 2161813061.0, + "step": 12889 + }, + { + "entropy": 1.651836782693863, + "epoch": 1.4160281233693115, + "grad_norm": 0.6534989476203918, + "learning_rate": 5.734963933146739e-06, + "loss": 1.4203, + "mean_token_accuracy": 0.6599469731251398, + "num_tokens": 2161992435.0, + "step": 12890 + }, + { + "entropy": 1.709712266921997, + "epoch": 1.4161379802806844, + "grad_norm": 0.7406985759735107, + "learning_rate": 5.733665452320422e-06, + "loss": 1.4721, + "mean_token_accuracy": 0.6705115636189779, + "num_tokens": 2162170219.0, + "step": 12891 + }, + { + "entropy": 1.669614851474762, + "epoch": 1.4162478371920573, + "grad_norm": 0.6635571122169495, + "learning_rate": 5.73236713817122e-06, + "loss": 1.3549, + "mean_token_accuracy": 0.6629445304473242, + "num_tokens": 2162337356.0, + "step": 12892 + }, + { + "entropy": 1.6756293376286824, + "epoch": 1.4163576941034304, + "grad_norm": 0.6038379073143005, + "learning_rate": 5.731068990740222e-06, + "loss": 1.4527, + "mean_token_accuracy": 0.633764331539472, + "num_tokens": 2162534176.0, + "step": 12893 + }, + { + "entropy": 1.6719582378864288, + "epoch": 1.4164675510148033, + "grad_norm": 0.7196714282035828, + "learning_rate": 5.729771010068518e-06, + "loss": 1.2758, + "mean_token_accuracy": 0.6750404040018717, + "num_tokens": 2162683391.0, + "step": 12894 + }, + { + "entropy": 1.7006418605645497, + "epoch": 1.4165774079261761, + "grad_norm": 0.6588510274887085, + "learning_rate": 5.728473196197184e-06, + "loss": 1.3755, + "mean_token_accuracy": 0.6491716603438059, + "num_tokens": 2162858449.0, + "step": 12895 + }, + { + "entropy": 1.704519013563792, + "epoch": 1.416687264837549, + "grad_norm": 0.7052327990531921, + "learning_rate": 5.7271755491673035e-06, + "loss": 1.2329, + "mean_token_accuracy": 0.6732802291711172, + "num_tokens": 2162964910.0, + "step": 12896 + }, + { + "entropy": 1.695040076971054, + "epoch": 1.416797121748922, + "grad_norm": 0.6275352835655212, + "learning_rate": 5.725878069019937e-06, + "loss": 1.538, + "mean_token_accuracy": 0.6412870685259501, + "num_tokens": 2163208900.0, + "step": 12897 + }, + { + "entropy": 1.6921038031578064, + "epoch": 1.416906978660295, + "grad_norm": 0.7399893999099731, + "learning_rate": 5.724580755796152e-06, + "loss": 1.4541, + "mean_token_accuracy": 0.6728978330890337, + "num_tokens": 2163332942.0, + "step": 12898 + }, + { + "entropy": 1.708950052658717, + "epoch": 1.417016835571668, + "grad_norm": 0.7712686657905579, + "learning_rate": 5.72328360953701e-06, + "loss": 1.4268, + "mean_token_accuracy": 0.6555162221193314, + "num_tokens": 2163493726.0, + "step": 12899 + }, + { + "entropy": 1.714707463979721, + "epoch": 1.4171266924830408, + "grad_norm": 0.6765271425247192, + "learning_rate": 5.7219866302835684e-06, + "loss": 1.4633, + "mean_token_accuracy": 0.6470478971799215, + "num_tokens": 2163663872.0, + "step": 12900 + }, + { + "entropy": 1.6449009974797566, + "epoch": 1.417236549394414, + "grad_norm": 0.6789788603782654, + "learning_rate": 5.720689818076864e-06, + "loss": 1.3217, + "mean_token_accuracy": 0.6745987633864085, + "num_tokens": 2163845660.0, + "step": 12901 + }, + { + "entropy": 1.6422028144200642, + "epoch": 1.4173464063057868, + "grad_norm": 0.8197759389877319, + "learning_rate": 5.719393172957951e-06, + "loss": 1.3709, + "mean_token_accuracy": 0.670257086555163, + "num_tokens": 2163970180.0, + "step": 12902 + }, + { + "entropy": 1.6707020998001099, + "epoch": 1.4174562632171597, + "grad_norm": 0.8020114302635193, + "learning_rate": 5.718096694967866e-06, + "loss": 1.4755, + "mean_token_accuracy": 0.6537665476401647, + "num_tokens": 2164098025.0, + "step": 12903 + }, + { + "entropy": 1.723763604958852, + "epoch": 1.4175661201285326, + "grad_norm": 0.8109487295150757, + "learning_rate": 5.716800384147642e-06, + "loss": 1.5173, + "mean_token_accuracy": 0.6525298108657202, + "num_tokens": 2164325693.0, + "step": 12904 + }, + { + "entropy": 1.6953211824099224, + "epoch": 1.4176759770399054, + "grad_norm": 0.7380589842796326, + "learning_rate": 5.715504240538301e-06, + "loss": 1.2802, + "mean_token_accuracy": 0.6700010697046915, + "num_tokens": 2164459673.0, + "step": 12905 + }, + { + "entropy": 1.735133836666743, + "epoch": 1.4177858339512786, + "grad_norm": 0.8321533799171448, + "learning_rate": 5.714208264180872e-06, + "loss": 1.5847, + "mean_token_accuracy": 0.6274262269337972, + "num_tokens": 2164662503.0, + "step": 12906 + }, + { + "entropy": 1.6269804338614147, + "epoch": 1.4178956908626514, + "grad_norm": 0.7073882818222046, + "learning_rate": 5.712912455116367e-06, + "loss": 1.3359, + "mean_token_accuracy": 0.6600817640622457, + "num_tokens": 2164855340.0, + "step": 12907 + }, + { + "entropy": 1.7148225208123524, + "epoch": 1.4180055477740243, + "grad_norm": 0.700375497341156, + "learning_rate": 5.7116168133858044e-06, + "loss": 1.3533, + "mean_token_accuracy": 0.6706081926822662, + "num_tokens": 2165023645.0, + "step": 12908 + }, + { + "entropy": 1.7189152439435322, + "epoch": 1.4181154046853972, + "grad_norm": 0.7371551394462585, + "learning_rate": 5.710321339030186e-06, + "loss": 1.43, + "mean_token_accuracy": 0.6535715262095133, + "num_tokens": 2165227184.0, + "step": 12909 + }, + { + "entropy": 1.6818451484044392, + "epoch": 1.41822526159677, + "grad_norm": 0.660900354385376, + "learning_rate": 5.70902603209051e-06, + "loss": 1.184, + "mean_token_accuracy": 0.6832146992286047, + "num_tokens": 2165339873.0, + "step": 12910 + }, + { + "entropy": 1.6560562153657277, + "epoch": 1.4183351185081432, + "grad_norm": 0.6540271043777466, + "learning_rate": 5.70773089260778e-06, + "loss": 1.2773, + "mean_token_accuracy": 0.6754108965396881, + "num_tokens": 2165478503.0, + "step": 12911 + }, + { + "entropy": 1.644927054643631, + "epoch": 1.418444975419516, + "grad_norm": 0.6257344484329224, + "learning_rate": 5.7064359206229825e-06, + "loss": 1.3435, + "mean_token_accuracy": 0.6578503449757894, + "num_tokens": 2165658626.0, + "step": 12912 + }, + { + "entropy": 1.678837110598882, + "epoch": 1.418554832330889, + "grad_norm": 0.7022602558135986, + "learning_rate": 5.7051411161771e-06, + "loss": 1.5779, + "mean_token_accuracy": 0.6389969835678736, + "num_tokens": 2165860011.0, + "step": 12913 + }, + { + "entropy": 1.671900063753128, + "epoch": 1.418664689242262, + "grad_norm": 0.7610450983047485, + "learning_rate": 5.703846479311113e-06, + "loss": 1.3848, + "mean_token_accuracy": 0.6613888889551163, + "num_tokens": 2166057539.0, + "step": 12914 + }, + { + "entropy": 1.732055425643921, + "epoch": 1.418774546153635, + "grad_norm": 0.8880397081375122, + "learning_rate": 5.702552010066004e-06, + "loss": 1.2981, + "mean_token_accuracy": 0.6812852670749029, + "num_tokens": 2166212303.0, + "step": 12915 + }, + { + "entropy": 1.6862863500912983, + "epoch": 1.4188844030650078, + "grad_norm": 0.7152805924415588, + "learning_rate": 5.701257708482736e-06, + "loss": 1.3078, + "mean_token_accuracy": 0.6692292392253876, + "num_tokens": 2166363658.0, + "step": 12916 + }, + { + "entropy": 1.7482119103272755, + "epoch": 1.4189942599763807, + "grad_norm": 0.7135628461837769, + "learning_rate": 5.69996357460227e-06, + "loss": 1.6073, + "mean_token_accuracy": 0.6363287791609764, + "num_tokens": 2166531130.0, + "step": 12917 + }, + { + "entropy": 1.6944365203380585, + "epoch": 1.4191041168877536, + "grad_norm": 0.8478591442108154, + "learning_rate": 5.6986696084655725e-06, + "loss": 1.2922, + "mean_token_accuracy": 0.6734979252020518, + "num_tokens": 2166657623.0, + "step": 12918 + }, + { + "entropy": 1.6972975830237071, + "epoch": 1.4192139737991267, + "grad_norm": 0.5631718039512634, + "learning_rate": 5.6973758101135905e-06, + "loss": 1.3744, + "mean_token_accuracy": 0.6703929851452509, + "num_tokens": 2166856825.0, + "step": 12919 + }, + { + "entropy": 1.6998174389203389, + "epoch": 1.4193238307104996, + "grad_norm": 0.7097121477127075, + "learning_rate": 5.696082179587275e-06, + "loss": 1.4455, + "mean_token_accuracy": 0.6481041411558787, + "num_tokens": 2167059792.0, + "step": 12920 + }, + { + "entropy": 1.6798253854115803, + "epoch": 1.4194336876218725, + "grad_norm": 0.6257836818695068, + "learning_rate": 5.694788716927571e-06, + "loss": 1.4738, + "mean_token_accuracy": 0.64958456158638, + "num_tokens": 2167257982.0, + "step": 12921 + }, + { + "entropy": 1.673070341348648, + "epoch": 1.4195435445332454, + "grad_norm": 0.6729440093040466, + "learning_rate": 5.69349542217541e-06, + "loss": 1.3327, + "mean_token_accuracy": 0.6657520681619644, + "num_tokens": 2167412027.0, + "step": 12922 + }, + { + "entropy": 1.7135821183522542, + "epoch": 1.4196534014446183, + "grad_norm": 0.7069577574729919, + "learning_rate": 5.692202295371731e-06, + "loss": 1.4568, + "mean_token_accuracy": 0.6652841120958328, + "num_tokens": 2167572724.0, + "step": 12923 + }, + { + "entropy": 1.7391641736030579, + "epoch": 1.4197632583559914, + "grad_norm": 0.6086916923522949, + "learning_rate": 5.690909336557458e-06, + "loss": 1.4022, + "mean_token_accuracy": 0.6388307412465414, + "num_tokens": 2167724978.0, + "step": 12924 + }, + { + "entropy": 1.6861611207326253, + "epoch": 1.4198731152673643, + "grad_norm": 0.6505236625671387, + "learning_rate": 5.689616545773508e-06, + "loss": 1.4473, + "mean_token_accuracy": 0.652971088886261, + "num_tokens": 2167860519.0, + "step": 12925 + }, + { + "entropy": 1.7772870361804962, + "epoch": 1.4199829721787371, + "grad_norm": 0.7045353055000305, + "learning_rate": 5.6883239230608024e-06, + "loss": 1.3123, + "mean_token_accuracy": 0.6595732072989146, + "num_tokens": 2167959886.0, + "step": 12926 + }, + { + "entropy": 1.7017023464043934, + "epoch": 1.4200928290901103, + "grad_norm": 0.8636213541030884, + "learning_rate": 5.687031468460253e-06, + "loss": 1.3478, + "mean_token_accuracy": 0.6783096243937811, + "num_tokens": 2168139313.0, + "step": 12927 + }, + { + "entropy": 1.680685927470525, + "epoch": 1.4202026860014831, + "grad_norm": 0.7073819637298584, + "learning_rate": 5.685739182012764e-06, + "loss": 1.3014, + "mean_token_accuracy": 0.6693469732999802, + "num_tokens": 2168261083.0, + "step": 12928 + }, + { + "entropy": 1.7441634833812714, + "epoch": 1.420312542912856, + "grad_norm": 0.6499477624893188, + "learning_rate": 5.684447063759233e-06, + "loss": 1.4158, + "mean_token_accuracy": 0.6491926809151968, + "num_tokens": 2168426919.0, + "step": 12929 + }, + { + "entropy": 1.7324085434277852, + "epoch": 1.420422399824229, + "grad_norm": 0.7667383551597595, + "learning_rate": 5.683155113740559e-06, + "loss": 1.2891, + "mean_token_accuracy": 0.6760394672552744, + "num_tokens": 2168542110.0, + "step": 12930 + }, + { + "entropy": 1.7055922349294026, + "epoch": 1.4205322567356018, + "grad_norm": 0.674475908279419, + "learning_rate": 5.681863331997628e-06, + "loss": 1.4863, + "mean_token_accuracy": 0.6470987647771835, + "num_tokens": 2168739330.0, + "step": 12931 + }, + { + "entropy": 1.698354721069336, + "epoch": 1.420642113646975, + "grad_norm": 0.6338586807250977, + "learning_rate": 5.680571718571328e-06, + "loss": 1.3027, + "mean_token_accuracy": 0.6713970800240835, + "num_tokens": 2168880859.0, + "step": 12932 + }, + { + "entropy": 1.645394931236903, + "epoch": 1.4207519705583478, + "grad_norm": 0.67153000831604, + "learning_rate": 5.679280273502537e-06, + "loss": 1.33, + "mean_token_accuracy": 0.6692242324352264, + "num_tokens": 2169052718.0, + "step": 12933 + }, + { + "entropy": 1.7089401880900066, + "epoch": 1.4208618274697207, + "grad_norm": 0.6924530863761902, + "learning_rate": 5.677988996832124e-06, + "loss": 1.4324, + "mean_token_accuracy": 0.6542358994483948, + "num_tokens": 2169198381.0, + "step": 12934 + }, + { + "entropy": 1.7337498764197032, + "epoch": 1.4209716843810936, + "grad_norm": 0.7746621370315552, + "learning_rate": 5.676697888600965e-06, + "loss": 1.3295, + "mean_token_accuracy": 0.6665536761283875, + "num_tokens": 2169349542.0, + "step": 12935 + }, + { + "entropy": 1.6929566363493602, + "epoch": 1.4210815412924664, + "grad_norm": 0.6887615323066711, + "learning_rate": 5.675406948849919e-06, + "loss": 1.6089, + "mean_token_accuracy": 0.6433848490317663, + "num_tokens": 2169537168.0, + "step": 12936 + }, + { + "entropy": 1.7130916615327199, + "epoch": 1.4211913982038396, + "grad_norm": 1.0259429216384888, + "learning_rate": 5.67411617761984e-06, + "loss": 1.4418, + "mean_token_accuracy": 0.650916631023089, + "num_tokens": 2169701884.0, + "step": 12937 + }, + { + "entropy": 1.6535163124402363, + "epoch": 1.4213012551152124, + "grad_norm": 0.6167465448379517, + "learning_rate": 5.672825574951588e-06, + "loss": 1.3627, + "mean_token_accuracy": 0.6604279528061548, + "num_tokens": 2169914597.0, + "step": 12938 + }, + { + "entropy": 1.6423707405726116, + "epoch": 1.4214111120265853, + "grad_norm": 0.6382650136947632, + "learning_rate": 5.671535140886002e-06, + "loss": 1.3769, + "mean_token_accuracy": 0.6672868579626083, + "num_tokens": 2170071581.0, + "step": 12939 + }, + { + "entropy": 1.7335049013296764, + "epoch": 1.4215209689379584, + "grad_norm": 0.8444647192955017, + "learning_rate": 5.670244875463931e-06, + "loss": 1.4357, + "mean_token_accuracy": 0.6631718277931213, + "num_tokens": 2170211841.0, + "step": 12940 + }, + { + "entropy": 1.732320378224055, + "epoch": 1.4216308258493313, + "grad_norm": 0.7674136757850647, + "learning_rate": 5.668954778726209e-06, + "loss": 1.2571, + "mean_token_accuracy": 0.681780661145846, + "num_tokens": 2170337013.0, + "step": 12941 + }, + { + "entropy": 1.6558412313461304, + "epoch": 1.4217406827607042, + "grad_norm": 0.5625472068786621, + "learning_rate": 5.667664850713662e-06, + "loss": 1.3564, + "mean_token_accuracy": 0.6685200929641724, + "num_tokens": 2170505477.0, + "step": 12942 + }, + { + "entropy": 1.6879661480585735, + "epoch": 1.421850539672077, + "grad_norm": 0.6796611547470093, + "learning_rate": 5.66637509146712e-06, + "loss": 1.5171, + "mean_token_accuracy": 0.646856447060903, + "num_tokens": 2170703823.0, + "step": 12943 + }, + { + "entropy": 1.7009165585041046, + "epoch": 1.42196039658345, + "grad_norm": 0.7441216111183167, + "learning_rate": 5.66508550102741e-06, + "loss": 1.5151, + "mean_token_accuracy": 0.6342372844616572, + "num_tokens": 2170899789.0, + "step": 12944 + }, + { + "entropy": 1.7801036536693573, + "epoch": 1.422070253494823, + "grad_norm": 0.7327330112457275, + "learning_rate": 5.663796079435331e-06, + "loss": 1.3925, + "mean_token_accuracy": 0.6534372419118881, + "num_tokens": 2171035224.0, + "step": 12945 + }, + { + "entropy": 1.6900883217652638, + "epoch": 1.422180110406196, + "grad_norm": 0.7192649245262146, + "learning_rate": 5.662506826731704e-06, + "loss": 1.2754, + "mean_token_accuracy": 0.6802859654029211, + "num_tokens": 2171203629.0, + "step": 12946 + }, + { + "entropy": 1.7146925528844197, + "epoch": 1.4222899673175688, + "grad_norm": 0.8082349896430969, + "learning_rate": 5.661217742957333e-06, + "loss": 1.6062, + "mean_token_accuracy": 0.6461126953363419, + "num_tokens": 2171370122.0, + "step": 12947 + }, + { + "entropy": 1.7389146387577057, + "epoch": 1.4223998242289417, + "grad_norm": 0.7392411828041077, + "learning_rate": 5.659928828153015e-06, + "loss": 1.3126, + "mean_token_accuracy": 0.667231614391009, + "num_tokens": 2171482880.0, + "step": 12948 + }, + { + "entropy": 1.6697891255219777, + "epoch": 1.4225096811403146, + "grad_norm": 0.6810927987098694, + "learning_rate": 5.658640082359541e-06, + "loss": 1.2989, + "mean_token_accuracy": 0.6649407347043356, + "num_tokens": 2171621173.0, + "step": 12949 + }, + { + "entropy": 1.7280420064926147, + "epoch": 1.4226195380516877, + "grad_norm": 0.6208683252334595, + "learning_rate": 5.657351505617703e-06, + "loss": 1.458, + "mean_token_accuracy": 0.6525483777125677, + "num_tokens": 2171892873.0, + "step": 12950 + }, + { + "entropy": 1.7015692094961803, + "epoch": 1.4227293949630606, + "grad_norm": 0.7659103274345398, + "learning_rate": 5.656063097968281e-06, + "loss": 1.4574, + "mean_token_accuracy": 0.6513445029656092, + "num_tokens": 2172057108.0, + "step": 12951 + }, + { + "entropy": 1.7544087767601013, + "epoch": 1.4228392518744335, + "grad_norm": 0.7193136215209961, + "learning_rate": 5.6547748594520556e-06, + "loss": 1.3002, + "mean_token_accuracy": 0.6638988107442856, + "num_tokens": 2172162991.0, + "step": 12952 + }, + { + "entropy": 1.6446592311064403, + "epoch": 1.4229491087858066, + "grad_norm": 0.6459053754806519, + "learning_rate": 5.653486790109798e-06, + "loss": 1.319, + "mean_token_accuracy": 0.6649145980676016, + "num_tokens": 2172326151.0, + "step": 12953 + }, + { + "entropy": 1.6824187239011128, + "epoch": 1.4230589656971795, + "grad_norm": 0.6160148978233337, + "learning_rate": 5.65219888998227e-06, + "loss": 1.4223, + "mean_token_accuracy": 0.6440077473719915, + "num_tokens": 2172475596.0, + "step": 12954 + }, + { + "entropy": 1.6969729959964752, + "epoch": 1.4231688226085524, + "grad_norm": 0.748254120349884, + "learning_rate": 5.650911159110239e-06, + "loss": 1.2247, + "mean_token_accuracy": 0.6879066576560339, + "num_tokens": 2172577164.0, + "step": 12955 + }, + { + "entropy": 1.71368607878685, + "epoch": 1.4232786795199253, + "grad_norm": 0.7107913494110107, + "learning_rate": 5.649623597534466e-06, + "loss": 1.3242, + "mean_token_accuracy": 0.6704561958710352, + "num_tokens": 2172735492.0, + "step": 12956 + }, + { + "entropy": 1.6840336819489796, + "epoch": 1.4233885364312981, + "grad_norm": 0.6022214889526367, + "learning_rate": 5.648336205295687e-06, + "loss": 1.3555, + "mean_token_accuracy": 0.6485221783320109, + "num_tokens": 2172911336.0, + "step": 12957 + }, + { + "entropy": 1.7092094123363495, + "epoch": 1.4234983933426713, + "grad_norm": 0.8171796202659607, + "learning_rate": 5.647048982434656e-06, + "loss": 1.4057, + "mean_token_accuracy": 0.65643543501695, + "num_tokens": 2173068228.0, + "step": 12958 + }, + { + "entropy": 1.762761503458023, + "epoch": 1.4236082502540441, + "grad_norm": 0.7240894436836243, + "learning_rate": 5.645761928992117e-06, + "loss": 1.3219, + "mean_token_accuracy": 0.6557150532801946, + "num_tokens": 2173184380.0, + "step": 12959 + }, + { + "entropy": 1.721298485994339, + "epoch": 1.423718107165417, + "grad_norm": 0.7749879360198975, + "learning_rate": 5.644475045008799e-06, + "loss": 1.5254, + "mean_token_accuracy": 0.6502954959869385, + "num_tokens": 2173328706.0, + "step": 12960 + }, + { + "entropy": 1.7307699620723724, + "epoch": 1.42382796407679, + "grad_norm": 0.7057486772537231, + "learning_rate": 5.643188330525431e-06, + "loss": 1.2917, + "mean_token_accuracy": 0.6716625094413757, + "num_tokens": 2173441079.0, + "step": 12961 + }, + { + "entropy": 1.716073344151179, + "epoch": 1.4239378209881628, + "grad_norm": 0.6514110565185547, + "learning_rate": 5.641901785582739e-06, + "loss": 1.3905, + "mean_token_accuracy": 0.6514262358347574, + "num_tokens": 2173586081.0, + "step": 12962 + }, + { + "entropy": 1.655029982328415, + "epoch": 1.424047677899536, + "grad_norm": 0.6521716117858887, + "learning_rate": 5.640615410221442e-06, + "loss": 1.3778, + "mean_token_accuracy": 0.6559510032335917, + "num_tokens": 2173819782.0, + "step": 12963 + }, + { + "entropy": 1.6636370718479156, + "epoch": 1.4241575348109088, + "grad_norm": 0.6372007727622986, + "learning_rate": 5.639329204482252e-06, + "loss": 1.3485, + "mean_token_accuracy": 0.6604503045479456, + "num_tokens": 2173978689.0, + "step": 12964 + }, + { + "entropy": 1.6833548645178478, + "epoch": 1.4242673917222817, + "grad_norm": 0.7769525051116943, + "learning_rate": 5.638043168405878e-06, + "loss": 1.4396, + "mean_token_accuracy": 0.6549781362215678, + "num_tokens": 2174216659.0, + "step": 12965 + }, + { + "entropy": 1.7020526230335236, + "epoch": 1.4243772486336548, + "grad_norm": 0.6384181380271912, + "learning_rate": 5.636757302033018e-06, + "loss": 1.321, + "mean_token_accuracy": 0.6678778429826101, + "num_tokens": 2174383271.0, + "step": 12966 + }, + { + "entropy": 1.6687651177247365, + "epoch": 1.4244871055450277, + "grad_norm": 0.7364367246627808, + "learning_rate": 5.6354716054043726e-06, + "loss": 1.5467, + "mean_token_accuracy": 0.6513969451189041, + "num_tokens": 2174578775.0, + "step": 12967 + }, + { + "entropy": 1.6899695893128712, + "epoch": 1.4245969624564006, + "grad_norm": 0.6350962519645691, + "learning_rate": 5.634186078560641e-06, + "loss": 1.339, + "mean_token_accuracy": 0.6744259546200434, + "num_tokens": 2174745441.0, + "step": 12968 + }, + { + "entropy": 1.6761878331502278, + "epoch": 1.4247068193677734, + "grad_norm": 0.726622998714447, + "learning_rate": 5.632900721542496e-06, + "loss": 1.5962, + "mean_token_accuracy": 0.6452071170012156, + "num_tokens": 2174967033.0, + "step": 12969 + }, + { + "entropy": 1.6440961559613545, + "epoch": 1.4248166762791463, + "grad_norm": 0.7337450385093689, + "learning_rate": 5.631615534390623e-06, + "loss": 1.4722, + "mean_token_accuracy": 0.6409422506888708, + "num_tokens": 2175205554.0, + "step": 12970 + }, + { + "entropy": 1.720875859260559, + "epoch": 1.4249265331905194, + "grad_norm": 0.7657408118247986, + "learning_rate": 5.630330517145704e-06, + "loss": 1.5809, + "mean_token_accuracy": 0.6256515284379324, + "num_tokens": 2175426782.0, + "step": 12971 + }, + { + "entropy": 1.740049531062444, + "epoch": 1.4250363901018923, + "grad_norm": 0.6672664880752563, + "learning_rate": 5.6290456698484045e-06, + "loss": 1.5588, + "mean_token_accuracy": 0.6511749972899755, + "num_tokens": 2175633316.0, + "step": 12972 + }, + { + "entropy": 1.7396978239218395, + "epoch": 1.4251462470132652, + "grad_norm": 0.6931610107421875, + "learning_rate": 5.627760992539384e-06, + "loss": 1.3684, + "mean_token_accuracy": 0.6563472002744675, + "num_tokens": 2175780067.0, + "step": 12973 + }, + { + "entropy": 1.6736437479654949, + "epoch": 1.425256103924638, + "grad_norm": 0.574825644493103, + "learning_rate": 5.626476485259314e-06, + "loss": 1.6164, + "mean_token_accuracy": 0.6455618888139725, + "num_tokens": 2175994610.0, + "step": 12974 + }, + { + "entropy": 1.6988399227460225, + "epoch": 1.425365960836011, + "grad_norm": 0.828632652759552, + "learning_rate": 5.6251921480488355e-06, + "loss": 1.3082, + "mean_token_accuracy": 0.6656528313954672, + "num_tokens": 2176110701.0, + "step": 12975 + }, + { + "entropy": 1.7076418995857239, + "epoch": 1.425475817747384, + "grad_norm": 0.5862886309623718, + "learning_rate": 5.623907980948608e-06, + "loss": 1.3982, + "mean_token_accuracy": 0.662509153286616, + "num_tokens": 2176301670.0, + "step": 12976 + }, + { + "entropy": 1.7174928188323975, + "epoch": 1.425585674658757, + "grad_norm": 0.8851239681243896, + "learning_rate": 5.6226239839992715e-06, + "loss": 1.4016, + "mean_token_accuracy": 0.6703563729921976, + "num_tokens": 2176456456.0, + "step": 12977 + }, + { + "entropy": 1.7496284544467926, + "epoch": 1.4256955315701298, + "grad_norm": 0.6377162337303162, + "learning_rate": 5.6213401572414575e-06, + "loss": 1.3347, + "mean_token_accuracy": 0.6575185209512711, + "num_tokens": 2176602131.0, + "step": 12978 + }, + { + "entropy": 1.766124387582143, + "epoch": 1.425805388481503, + "grad_norm": 0.667473316192627, + "learning_rate": 5.620056500715805e-06, + "loss": 1.3978, + "mean_token_accuracy": 0.654146542151769, + "num_tokens": 2176788934.0, + "step": 12979 + }, + { + "entropy": 1.7000204424063365, + "epoch": 1.4259152453928758, + "grad_norm": 0.7213659286499023, + "learning_rate": 5.618773014462946e-06, + "loss": 1.1064, + "mean_token_accuracy": 0.7038625578085581, + "num_tokens": 2176882325.0, + "step": 12980 + }, + { + "entropy": 1.7640029390652974, + "epoch": 1.4260251023042487, + "grad_norm": 0.6563233733177185, + "learning_rate": 5.617489698523491e-06, + "loss": 1.4325, + "mean_token_accuracy": 0.6590708047151566, + "num_tokens": 2177014004.0, + "step": 12981 + }, + { + "entropy": 1.6775661706924438, + "epoch": 1.4261349592156216, + "grad_norm": 0.5699209570884705, + "learning_rate": 5.616206552938059e-06, + "loss": 1.3725, + "mean_token_accuracy": 0.6618246585130692, + "num_tokens": 2177185937.0, + "step": 12982 + }, + { + "entropy": 1.6460503935813904, + "epoch": 1.4262448161269945, + "grad_norm": 0.7296600937843323, + "learning_rate": 5.614923577747269e-06, + "loss": 1.3168, + "mean_token_accuracy": 0.6661859899759293, + "num_tokens": 2177345935.0, + "step": 12983 + }, + { + "entropy": 1.6979570190111797, + "epoch": 1.4263546730383676, + "grad_norm": 0.6132034063339233, + "learning_rate": 5.613640772991721e-06, + "loss": 1.4773, + "mean_token_accuracy": 0.6433689743280411, + "num_tokens": 2177602730.0, + "step": 12984 + }, + { + "entropy": 1.6999173959096272, + "epoch": 1.4264645299497405, + "grad_norm": 0.6948098540306091, + "learning_rate": 5.612358138712011e-06, + "loss": 1.4101, + "mean_token_accuracy": 0.6698167969783148, + "num_tokens": 2177759033.0, + "step": 12985 + }, + { + "entropy": 1.7339690029621124, + "epoch": 1.4265743868611134, + "grad_norm": 0.7029469013214111, + "learning_rate": 5.611075674948743e-06, + "loss": 1.2782, + "mean_token_accuracy": 0.6747192790110906, + "num_tokens": 2177875280.0, + "step": 12986 + }, + { + "entropy": 1.6518168846766155, + "epoch": 1.4266842437724863, + "grad_norm": 0.7279475331306458, + "learning_rate": 5.609793381742497e-06, + "loss": 1.1832, + "mean_token_accuracy": 0.6870991041262945, + "num_tokens": 2177996766.0, + "step": 12987 + }, + { + "entropy": 1.7683051228523254, + "epoch": 1.4267941006838591, + "grad_norm": 0.7115809917449951, + "learning_rate": 5.608511259133867e-06, + "loss": 1.4565, + "mean_token_accuracy": 0.6595746825138727, + "num_tokens": 2178140721.0, + "step": 12988 + }, + { + "entropy": 1.6959002912044525, + "epoch": 1.4269039575952323, + "grad_norm": 0.7604736089706421, + "learning_rate": 5.607229307163423e-06, + "loss": 1.2443, + "mean_token_accuracy": 0.6789979139963785, + "num_tokens": 2178285786.0, + "step": 12989 + }, + { + "entropy": 1.751099556684494, + "epoch": 1.4270138145066051, + "grad_norm": 0.6957893967628479, + "learning_rate": 5.60594752587174e-06, + "loss": 1.3528, + "mean_token_accuracy": 0.6624589115381241, + "num_tokens": 2178418812.0, + "step": 12990 + }, + { + "entropy": 1.7143315374851227, + "epoch": 1.427123671417978, + "grad_norm": 0.6933035850524902, + "learning_rate": 5.60466591529939e-06, + "loss": 1.3594, + "mean_token_accuracy": 0.6720022509495417, + "num_tokens": 2178586498.0, + "step": 12991 + }, + { + "entropy": 1.7343165179093678, + "epoch": 1.4272335283293511, + "grad_norm": 0.6081417798995972, + "learning_rate": 5.603384475486932e-06, + "loss": 1.4883, + "mean_token_accuracy": 0.6346796850363413, + "num_tokens": 2178832189.0, + "step": 12992 + }, + { + "entropy": 1.7000845571358998, + "epoch": 1.427343385240724, + "grad_norm": 0.6261142492294312, + "learning_rate": 5.602103206474922e-06, + "loss": 1.3748, + "mean_token_accuracy": 0.6570387482643127, + "num_tokens": 2179026605.0, + "step": 12993 + }, + { + "entropy": 1.6828905642032623, + "epoch": 1.427453242152097, + "grad_norm": 0.7044478058815002, + "learning_rate": 5.600822108303916e-06, + "loss": 1.296, + "mean_token_accuracy": 0.6614427367846171, + "num_tokens": 2179195579.0, + "step": 12994 + }, + { + "entropy": 1.6979553401470184, + "epoch": 1.4275630990634698, + "grad_norm": 0.8138183355331421, + "learning_rate": 5.599541181014453e-06, + "loss": 1.4325, + "mean_token_accuracy": 0.6653849979241689, + "num_tokens": 2179366662.0, + "step": 12995 + }, + { + "entropy": 1.758367915948232, + "epoch": 1.4276729559748427, + "grad_norm": 0.7187701463699341, + "learning_rate": 5.598260424647081e-06, + "loss": 1.3425, + "mean_token_accuracy": 0.656711811820666, + "num_tokens": 2179506900.0, + "step": 12996 + }, + { + "entropy": 1.6735180914402008, + "epoch": 1.4277828128862158, + "grad_norm": 0.7112841606140137, + "learning_rate": 5.596979839242335e-06, + "loss": 1.2976, + "mean_token_accuracy": 0.6648980478445689, + "num_tokens": 2179630864.0, + "step": 12997 + }, + { + "entropy": 1.7127126554648082, + "epoch": 1.4278926697975887, + "grad_norm": 0.6963240504264832, + "learning_rate": 5.595699424840737e-06, + "loss": 1.3437, + "mean_token_accuracy": 0.6695072799921036, + "num_tokens": 2179771664.0, + "step": 12998 + }, + { + "entropy": 1.6788500547409058, + "epoch": 1.4280025267089616, + "grad_norm": 0.8437992334365845, + "learning_rate": 5.5944191814828174e-06, + "loss": 1.346, + "mean_token_accuracy": 0.6613343755404154, + "num_tokens": 2179961091.0, + "step": 12999 + }, + { + "entropy": 1.7473607063293457, + "epoch": 1.4281123836203344, + "grad_norm": 0.7349570393562317, + "learning_rate": 5.593139109209102e-06, + "loss": 1.4735, + "mean_token_accuracy": 0.6576576034228007, + "num_tokens": 2180151428.0, + "step": 13000 + }, + { + "entropy": 1.6567772924900055, + "epoch": 1.4282222405317073, + "grad_norm": 0.7208495736122131, + "learning_rate": 5.591859208060091e-06, + "loss": 1.2619, + "mean_token_accuracy": 0.6733733216921488, + "num_tokens": 2180285719.0, + "step": 13001 + }, + { + "entropy": 1.7362763285636902, + "epoch": 1.4283320974430804, + "grad_norm": 0.725077748298645, + "learning_rate": 5.590579478076298e-06, + "loss": 1.2903, + "mean_token_accuracy": 0.6674645096063614, + "num_tokens": 2180390288.0, + "step": 13002 + }, + { + "entropy": 1.6696288685003917, + "epoch": 1.4284419543544533, + "grad_norm": 0.5948540568351746, + "learning_rate": 5.58929991929823e-06, + "loss": 1.4792, + "mean_token_accuracy": 0.6409497807423273, + "num_tokens": 2180565522.0, + "step": 13003 + }, + { + "entropy": 1.703221042950948, + "epoch": 1.4285518112658262, + "grad_norm": 0.6803807020187378, + "learning_rate": 5.5880205317663824e-06, + "loss": 1.3171, + "mean_token_accuracy": 0.6711372335751852, + "num_tokens": 2180680337.0, + "step": 13004 + }, + { + "entropy": 1.6649436155954997, + "epoch": 1.4286616681771993, + "grad_norm": 0.6614540219306946, + "learning_rate": 5.586741315521245e-06, + "loss": 1.5934, + "mean_token_accuracy": 0.62440458436807, + "num_tokens": 2180939669.0, + "step": 13005 + }, + { + "entropy": 1.6837556461493175, + "epoch": 1.4287715250885722, + "grad_norm": 0.6564772725105286, + "learning_rate": 5.585462270603306e-06, + "loss": 1.4291, + "mean_token_accuracy": 0.662267878651619, + "num_tokens": 2181122104.0, + "step": 13006 + }, + { + "entropy": 1.7078391114870708, + "epoch": 1.428881381999945, + "grad_norm": 0.6484875679016113, + "learning_rate": 5.5841833970530425e-06, + "loss": 1.3659, + "mean_token_accuracy": 0.6654360741376877, + "num_tokens": 2181268418.0, + "step": 13007 + }, + { + "entropy": 1.6462851862112682, + "epoch": 1.428991238911318, + "grad_norm": 0.5821180939674377, + "learning_rate": 5.58290469491094e-06, + "loss": 1.3693, + "mean_token_accuracy": 0.6470950643221537, + "num_tokens": 2181533426.0, + "step": 13008 + }, + { + "entropy": 1.7857622504234314, + "epoch": 1.4291010958226908, + "grad_norm": 0.7754672169685364, + "learning_rate": 5.581626164217461e-06, + "loss": 1.4693, + "mean_token_accuracy": 0.6310462603966395, + "num_tokens": 2181692955.0, + "step": 13009 + }, + { + "entropy": 1.6840496559937794, + "epoch": 1.429210952734064, + "grad_norm": 1.3394370079040527, + "learning_rate": 5.58034780501307e-06, + "loss": 1.4793, + "mean_token_accuracy": 0.6427704244852066, + "num_tokens": 2181887005.0, + "step": 13010 + }, + { + "entropy": 1.7042312423388164, + "epoch": 1.4293208096454368, + "grad_norm": 0.6307734847068787, + "learning_rate": 5.579069617338229e-06, + "loss": 1.4193, + "mean_token_accuracy": 0.6560370475053787, + "num_tokens": 2182086766.0, + "step": 13011 + }, + { + "entropy": 1.7062763075033824, + "epoch": 1.4294306665568097, + "grad_norm": 0.6449376344680786, + "learning_rate": 5.577791601233398e-06, + "loss": 1.5444, + "mean_token_accuracy": 0.6326194703578949, + "num_tokens": 2182269781.0, + "step": 13012 + }, + { + "entropy": 1.6698502898216248, + "epoch": 1.4295405234681826, + "grad_norm": 0.6093115210533142, + "learning_rate": 5.576513756739012e-06, + "loss": 1.2875, + "mean_token_accuracy": 0.6728635678688685, + "num_tokens": 2182442382.0, + "step": 13013 + }, + { + "entropy": 1.7181947231292725, + "epoch": 1.4296503803795555, + "grad_norm": 0.682712733745575, + "learning_rate": 5.5752360838955215e-06, + "loss": 1.3765, + "mean_token_accuracy": 0.6576512654622396, + "num_tokens": 2182589601.0, + "step": 13014 + }, + { + "entropy": 1.6917083462079365, + "epoch": 1.4297602372909286, + "grad_norm": 0.6454870700836182, + "learning_rate": 5.573958582743368e-06, + "loss": 1.4754, + "mean_token_accuracy": 0.6636495043834051, + "num_tokens": 2182779186.0, + "step": 13015 + }, + { + "entropy": 1.7364496489365895, + "epoch": 1.4298700942023015, + "grad_norm": 0.8121635317802429, + "learning_rate": 5.572681253322983e-06, + "loss": 1.4932, + "mean_token_accuracy": 0.6382195055484772, + "num_tokens": 2182965939.0, + "step": 13016 + }, + { + "entropy": 1.705742100874583, + "epoch": 1.4299799511136744, + "grad_norm": 0.6007391810417175, + "learning_rate": 5.571404095674786e-06, + "loss": 1.4925, + "mean_token_accuracy": 0.6454195727904638, + "num_tokens": 2183198308.0, + "step": 13017 + }, + { + "entropy": 1.7828343609968822, + "epoch": 1.4300898080250475, + "grad_norm": 0.857241690158844, + "learning_rate": 5.570127109839205e-06, + "loss": 1.5772, + "mean_token_accuracy": 0.6316992690165838, + "num_tokens": 2183435367.0, + "step": 13018 + }, + { + "entropy": 1.719041536251704, + "epoch": 1.4301996649364204, + "grad_norm": 0.6930078864097595, + "learning_rate": 5.568850295856652e-06, + "loss": 1.434, + "mean_token_accuracy": 0.6500537196795145, + "num_tokens": 2183640896.0, + "step": 13019 + }, + { + "entropy": 1.6936982572078705, + "epoch": 1.4303095218477933, + "grad_norm": 0.8050754070281982, + "learning_rate": 5.567573653767544e-06, + "loss": 1.3243, + "mean_token_accuracy": 0.667048583428065, + "num_tokens": 2183808495.0, + "step": 13020 + }, + { + "entropy": 1.7069650292396545, + "epoch": 1.4304193787591661, + "grad_norm": 0.7172401547431946, + "learning_rate": 5.5662971836122795e-06, + "loss": 1.2996, + "mean_token_accuracy": 0.6751367499430975, + "num_tokens": 2183998179.0, + "step": 13021 + }, + { + "entropy": 1.744606077671051, + "epoch": 1.430529235670539, + "grad_norm": 0.7243566513061523, + "learning_rate": 5.56502088543126e-06, + "loss": 1.3691, + "mean_token_accuracy": 0.6522730439901352, + "num_tokens": 2184155317.0, + "step": 13022 + }, + { + "entropy": 1.6920949220657349, + "epoch": 1.4306390925819121, + "grad_norm": 0.5887582898139954, + "learning_rate": 5.56374475926488e-06, + "loss": 1.4795, + "mean_token_accuracy": 0.6456716706355413, + "num_tokens": 2184358935.0, + "step": 13023 + }, + { + "entropy": 1.7107125322024028, + "epoch": 1.430748949493285, + "grad_norm": 0.6471151113510132, + "learning_rate": 5.562468805153534e-06, + "loss": 1.4389, + "mean_token_accuracy": 0.6550725599129995, + "num_tokens": 2184539771.0, + "step": 13024 + }, + { + "entropy": 1.7102607389291127, + "epoch": 1.430858806404658, + "grad_norm": 0.7383255958557129, + "learning_rate": 5.561193023137595e-06, + "loss": 1.3653, + "mean_token_accuracy": 0.6568672160307566, + "num_tokens": 2184693987.0, + "step": 13025 + }, + { + "entropy": 1.7418889204661052, + "epoch": 1.4309686633160308, + "grad_norm": 0.8502252697944641, + "learning_rate": 5.559917413257444e-06, + "loss": 1.3336, + "mean_token_accuracy": 0.6681547611951828, + "num_tokens": 2184835284.0, + "step": 13026 + }, + { + "entropy": 1.7333911557992299, + "epoch": 1.4310785202274037, + "grad_norm": 0.669732928276062, + "learning_rate": 5.558641975553459e-06, + "loss": 1.2936, + "mean_token_accuracy": 0.664469505349795, + "num_tokens": 2184953631.0, + "step": 13027 + }, + { + "entropy": 1.7422559758027394, + "epoch": 1.4311883771387768, + "grad_norm": 0.7372428178787231, + "learning_rate": 5.557366710066006e-06, + "loss": 1.4934, + "mean_token_accuracy": 0.6541983361045519, + "num_tokens": 2185094275.0, + "step": 13028 + }, + { + "entropy": 1.6865850885709126, + "epoch": 1.4312982340501497, + "grad_norm": 0.7553392648696899, + "learning_rate": 5.556091616835438e-06, + "loss": 1.4829, + "mean_token_accuracy": 0.6615019887685776, + "num_tokens": 2185230818.0, + "step": 13029 + }, + { + "entropy": 1.7182862261931102, + "epoch": 1.4314080909615226, + "grad_norm": 0.6798596382141113, + "learning_rate": 5.554816695902122e-06, + "loss": 1.4433, + "mean_token_accuracy": 0.6591095378001531, + "num_tokens": 2185375644.0, + "step": 13030 + }, + { + "entropy": 1.771297464768092, + "epoch": 1.4315179478728957, + "grad_norm": 0.7370211482048035, + "learning_rate": 5.5535419473064015e-06, + "loss": 1.4662, + "mean_token_accuracy": 0.6467802077531815, + "num_tokens": 2185563893.0, + "step": 13031 + }, + { + "entropy": 1.6960931917031605, + "epoch": 1.4316278047842685, + "grad_norm": 0.7024471163749695, + "learning_rate": 5.552267371088626e-06, + "loss": 1.5382, + "mean_token_accuracy": 0.6351848443349203, + "num_tokens": 2185756274.0, + "step": 13032 + }, + { + "entropy": 1.7461300293604534, + "epoch": 1.4317376616956414, + "grad_norm": 0.7345073223114014, + "learning_rate": 5.550992967289134e-06, + "loss": 1.3774, + "mean_token_accuracy": 0.6503315269947052, + "num_tokens": 2185906994.0, + "step": 13033 + }, + { + "entropy": 1.7313400208950043, + "epoch": 1.4318475186070143, + "grad_norm": 0.6339378952980042, + "learning_rate": 5.549718735948255e-06, + "loss": 1.4133, + "mean_token_accuracy": 0.6485500335693359, + "num_tokens": 2186094480.0, + "step": 13034 + }, + { + "entropy": 1.6995634138584137, + "epoch": 1.4319573755183872, + "grad_norm": 0.6970425248146057, + "learning_rate": 5.548444677106324e-06, + "loss": 1.3832, + "mean_token_accuracy": 0.6507101853688558, + "num_tokens": 2186266968.0, + "step": 13035 + }, + { + "entropy": 1.6785328388214111, + "epoch": 1.4320672324297603, + "grad_norm": 0.8331180214881897, + "learning_rate": 5.547170790803667e-06, + "loss": 1.394, + "mean_token_accuracy": 0.6583549777666727, + "num_tokens": 2186422477.0, + "step": 13036 + }, + { + "entropy": 1.7474220593770344, + "epoch": 1.4321770893411332, + "grad_norm": 0.7904669642448425, + "learning_rate": 5.545897077080591e-06, + "loss": 1.5484, + "mean_token_accuracy": 0.6391499191522598, + "num_tokens": 2186658720.0, + "step": 13037 + }, + { + "entropy": 1.745592087507248, + "epoch": 1.432286946252506, + "grad_norm": 0.688362181186676, + "learning_rate": 5.544623535977416e-06, + "loss": 1.4482, + "mean_token_accuracy": 0.6570040633281072, + "num_tokens": 2186849614.0, + "step": 13038 + }, + { + "entropy": 1.683472563823064, + "epoch": 1.4323968031638792, + "grad_norm": 0.6357275247573853, + "learning_rate": 5.543350167534451e-06, + "loss": 1.3715, + "mean_token_accuracy": 0.6654830276966095, + "num_tokens": 2187013867.0, + "step": 13039 + }, + { + "entropy": 1.6940331260363262, + "epoch": 1.4325066600752518, + "grad_norm": 0.621670126914978, + "learning_rate": 5.542076971791994e-06, + "loss": 1.3792, + "mean_token_accuracy": 0.6595076471567154, + "num_tokens": 2187182576.0, + "step": 13040 + }, + { + "entropy": 1.7123170693715413, + "epoch": 1.432616516986625, + "grad_norm": 0.9452431797981262, + "learning_rate": 5.5408039487903375e-06, + "loss": 1.4066, + "mean_token_accuracy": 0.6636346479256948, + "num_tokens": 2187310672.0, + "step": 13041 + }, + { + "entropy": 1.6607805689175923, + "epoch": 1.4327263738979978, + "grad_norm": 0.6522439122200012, + "learning_rate": 5.5395310985697804e-06, + "loss": 1.1924, + "mean_token_accuracy": 0.6833398044109344, + "num_tokens": 2187434823.0, + "step": 13042 + }, + { + "entropy": 1.709082802136739, + "epoch": 1.4328362308093707, + "grad_norm": 0.6486063003540039, + "learning_rate": 5.538258421170599e-06, + "loss": 1.4724, + "mean_token_accuracy": 0.6308440069357554, + "num_tokens": 2187628693.0, + "step": 13043 + }, + { + "entropy": 1.6720016201337178, + "epoch": 1.4329460877207438, + "grad_norm": 0.790734052658081, + "learning_rate": 5.5369859166330816e-06, + "loss": 1.456, + "mean_token_accuracy": 0.6650431652863821, + "num_tokens": 2187795256.0, + "step": 13044 + }, + { + "entropy": 1.7333543697992961, + "epoch": 1.4330559446321167, + "grad_norm": 0.6807409524917603, + "learning_rate": 5.535713584997498e-06, + "loss": 1.4672, + "mean_token_accuracy": 0.6574084411064783, + "num_tokens": 2187948169.0, + "step": 13045 + }, + { + "entropy": 1.7210048735141754, + "epoch": 1.4331658015434896, + "grad_norm": 0.7557775974273682, + "learning_rate": 5.5344414263041145e-06, + "loss": 1.3155, + "mean_token_accuracy": 0.6705209712187449, + "num_tokens": 2188074869.0, + "step": 13046 + }, + { + "entropy": 1.679331550995509, + "epoch": 1.4332756584548625, + "grad_norm": 0.6121296286582947, + "learning_rate": 5.5331694405931966e-06, + "loss": 1.553, + "mean_token_accuracy": 0.629560798406601, + "num_tokens": 2188272724.0, + "step": 13047 + }, + { + "entropy": 1.6509125630060832, + "epoch": 1.4333855153662354, + "grad_norm": 0.764921247959137, + "learning_rate": 5.531897627905009e-06, + "loss": 1.2867, + "mean_token_accuracy": 0.6722366611162821, + "num_tokens": 2188438097.0, + "step": 13048 + }, + { + "entropy": 1.668609122435252, + "epoch": 1.4334953722776085, + "grad_norm": 0.6842905282974243, + "learning_rate": 5.530625988279791e-06, + "loss": 1.373, + "mean_token_accuracy": 0.6644268482923508, + "num_tokens": 2188625138.0, + "step": 13049 + }, + { + "entropy": 1.7268520295619965, + "epoch": 1.4336052291889814, + "grad_norm": 0.7898194789886475, + "learning_rate": 5.529354521757796e-06, + "loss": 1.3678, + "mean_token_accuracy": 0.6659293274084727, + "num_tokens": 2188786405.0, + "step": 13050 + }, + { + "entropy": 1.6797465880711873, + "epoch": 1.4337150861003543, + "grad_norm": 0.6213613748550415, + "learning_rate": 5.5280832283792685e-06, + "loss": 1.2378, + "mean_token_accuracy": 0.6838393161694208, + "num_tokens": 2188901389.0, + "step": 13051 + }, + { + "entropy": 1.749913473924001, + "epoch": 1.4338249430117274, + "grad_norm": 0.8360404372215271, + "learning_rate": 5.52681210818444e-06, + "loss": 1.4877, + "mean_token_accuracy": 0.6598167518774668, + "num_tokens": 2189098778.0, + "step": 13052 + }, + { + "entropy": 1.6591697732607524, + "epoch": 1.4339347999231002, + "grad_norm": 0.736677885055542, + "learning_rate": 5.52554116121354e-06, + "loss": 1.292, + "mean_token_accuracy": 0.6754182428121567, + "num_tokens": 2189222964.0, + "step": 13053 + }, + { + "entropy": 1.6895016729831696, + "epoch": 1.4340446568344731, + "grad_norm": 0.6395838260650635, + "learning_rate": 5.5242703875067985e-06, + "loss": 1.3534, + "mean_token_accuracy": 0.6683394263188044, + "num_tokens": 2189399202.0, + "step": 13054 + }, + { + "entropy": 1.7012076675891876, + "epoch": 1.434154513745846, + "grad_norm": 0.5661507844924927, + "learning_rate": 5.522999787104429e-06, + "loss": 1.3654, + "mean_token_accuracy": 0.672503188252449, + "num_tokens": 2189573162.0, + "step": 13055 + }, + { + "entropy": 1.7428718010584514, + "epoch": 1.434264370657219, + "grad_norm": 0.5942409634590149, + "learning_rate": 5.521729360046653e-06, + "loss": 1.3816, + "mean_token_accuracy": 0.6550378203392029, + "num_tokens": 2189775760.0, + "step": 13056 + }, + { + "entropy": 1.7421286702156067, + "epoch": 1.434374227568592, + "grad_norm": 0.6235953569412231, + "learning_rate": 5.52045910637367e-06, + "loss": 1.5425, + "mean_token_accuracy": 0.6408863415320715, + "num_tokens": 2189967802.0, + "step": 13057 + }, + { + "entropy": 1.763216882944107, + "epoch": 1.434484084479965, + "grad_norm": 0.7079127430915833, + "learning_rate": 5.519189026125684e-06, + "loss": 1.3053, + "mean_token_accuracy": 0.6678203245004019, + "num_tokens": 2190080360.0, + "step": 13058 + }, + { + "entropy": 1.6415379345417023, + "epoch": 1.4345939413913378, + "grad_norm": 0.6366350650787354, + "learning_rate": 5.5179191193429015e-06, + "loss": 1.2732, + "mean_token_accuracy": 0.6822443703810374, + "num_tokens": 2190213423.0, + "step": 13059 + }, + { + "entropy": 1.66671418150266, + "epoch": 1.4347037983027107, + "grad_norm": 37.921844482421875, + "learning_rate": 5.516649386065508e-06, + "loss": 1.3524, + "mean_token_accuracy": 0.6727404892444611, + "num_tokens": 2190381575.0, + "step": 13060 + }, + { + "entropy": 1.6547318299611409, + "epoch": 1.4348136552140835, + "grad_norm": 0.6079090237617493, + "learning_rate": 5.515379826333688e-06, + "loss": 1.49, + "mean_token_accuracy": 0.6499229669570923, + "num_tokens": 2190556977.0, + "step": 13061 + }, + { + "entropy": 1.7024961809317272, + "epoch": 1.4349235121254567, + "grad_norm": 0.664475679397583, + "learning_rate": 5.514110440187628e-06, + "loss": 1.5275, + "mean_token_accuracy": 0.6373623659213384, + "num_tokens": 2190773850.0, + "step": 13062 + }, + { + "entropy": 1.7388477126757305, + "epoch": 1.4350333690368295, + "grad_norm": 0.7594343423843384, + "learning_rate": 5.5128412276674955e-06, + "loss": 1.448, + "mean_token_accuracy": 0.6530890514453253, + "num_tokens": 2190944059.0, + "step": 13063 + }, + { + "entropy": 1.761966496706009, + "epoch": 1.4351432259482024, + "grad_norm": 0.7008910775184631, + "learning_rate": 5.5115721888134695e-06, + "loss": 1.5423, + "mean_token_accuracy": 0.6320922573407491, + "num_tokens": 2191128206.0, + "step": 13064 + }, + { + "entropy": 1.7421748340129852, + "epoch": 1.4352530828595755, + "grad_norm": 0.7401275634765625, + "learning_rate": 5.510303323665712e-06, + "loss": 1.4024, + "mean_token_accuracy": 0.6560095548629761, + "num_tokens": 2191241600.0, + "step": 13065 + }, + { + "entropy": 1.7223056654135387, + "epoch": 1.4353629397709484, + "grad_norm": 0.6929873824119568, + "learning_rate": 5.509034632264376e-06, + "loss": 1.2268, + "mean_token_accuracy": 0.6787229428688685, + "num_tokens": 2191357856.0, + "step": 13066 + }, + { + "entropy": 1.7451776067415874, + "epoch": 1.4354727966823213, + "grad_norm": 0.7561651468276978, + "learning_rate": 5.507766114649622e-06, + "loss": 1.445, + "mean_token_accuracy": 0.6532369504372278, + "num_tokens": 2191496045.0, + "step": 13067 + }, + { + "entropy": 1.7426794469356537, + "epoch": 1.4355826535936942, + "grad_norm": 0.6168361306190491, + "learning_rate": 5.506497770861598e-06, + "loss": 1.4346, + "mean_token_accuracy": 0.6491942703723907, + "num_tokens": 2191669650.0, + "step": 13068 + }, + { + "entropy": 1.679451435804367, + "epoch": 1.435692510505067, + "grad_norm": 0.7383124232292175, + "learning_rate": 5.50522960094044e-06, + "loss": 1.3998, + "mean_token_accuracy": 0.6661538481712341, + "num_tokens": 2191842559.0, + "step": 13069 + }, + { + "entropy": 1.723546991745631, + "epoch": 1.4358023674164402, + "grad_norm": 0.8791068196296692, + "learning_rate": 5.503961604926291e-06, + "loss": 1.4383, + "mean_token_accuracy": 0.6561855375766754, + "num_tokens": 2192029229.0, + "step": 13070 + }, + { + "entropy": 1.7081499894460042, + "epoch": 1.435912224327813, + "grad_norm": 0.7823132872581482, + "learning_rate": 5.502693782859282e-06, + "loss": 1.4804, + "mean_token_accuracy": 0.6432801336050034, + "num_tokens": 2192197284.0, + "step": 13071 + }, + { + "entropy": 1.7025318245093028, + "epoch": 1.436022081239186, + "grad_norm": 0.655196487903595, + "learning_rate": 5.501426134779538e-06, + "loss": 1.3445, + "mean_token_accuracy": 0.6672158092260361, + "num_tokens": 2192338396.0, + "step": 13072 + }, + { + "entropy": 1.7241312563419342, + "epoch": 1.4361319381505588, + "grad_norm": 0.7512596845626831, + "learning_rate": 5.500158660727175e-06, + "loss": 1.2377, + "mean_token_accuracy": 0.6815748860438665, + "num_tokens": 2192441821.0, + "step": 13073 + }, + { + "entropy": 1.683451513449351, + "epoch": 1.4362417950619317, + "grad_norm": 0.6415113210678101, + "learning_rate": 5.498891360742316e-06, + "loss": 1.3612, + "mean_token_accuracy": 0.6760003666083018, + "num_tokens": 2192589743.0, + "step": 13074 + }, + { + "entropy": 1.6738179723421733, + "epoch": 1.4363516519733048, + "grad_norm": 0.6926242709159851, + "learning_rate": 5.497624234865062e-06, + "loss": 1.3332, + "mean_token_accuracy": 0.6816667566696802, + "num_tokens": 2192738312.0, + "step": 13075 + }, + { + "entropy": 1.71518008907636, + "epoch": 1.4364615088846777, + "grad_norm": 0.6560258865356445, + "learning_rate": 5.496357283135526e-06, + "loss": 1.5321, + "mean_token_accuracy": 0.6293992896874746, + "num_tokens": 2192940324.0, + "step": 13076 + }, + { + "entropy": 1.6770286560058594, + "epoch": 1.4365713657960506, + "grad_norm": 0.6862941980361938, + "learning_rate": 5.495090505593802e-06, + "loss": 1.3098, + "mean_token_accuracy": 0.6722962707281113, + "num_tokens": 2193087527.0, + "step": 13077 + }, + { + "entropy": 1.7614335318406422, + "epoch": 1.4366812227074237, + "grad_norm": 0.6682018637657166, + "learning_rate": 5.49382390227998e-06, + "loss": 1.3978, + "mean_token_accuracy": 0.6473137189944586, + "num_tokens": 2193276538.0, + "step": 13078 + }, + { + "entropy": 1.6820653875668843, + "epoch": 1.4367910796187966, + "grad_norm": 0.6352739334106445, + "learning_rate": 5.49255747323415e-06, + "loss": 1.3586, + "mean_token_accuracy": 0.6693233251571655, + "num_tokens": 2193450625.0, + "step": 13079 + }, + { + "entropy": 1.7216021815935771, + "epoch": 1.4369009365301695, + "grad_norm": 0.6901304721832275, + "learning_rate": 5.4912912184964e-06, + "loss": 1.2759, + "mean_token_accuracy": 0.6686844925085703, + "num_tokens": 2193553352.0, + "step": 13080 + }, + { + "entropy": 1.7992511590321858, + "epoch": 1.4370107934415424, + "grad_norm": 0.6699839234352112, + "learning_rate": 5.490025138106795e-06, + "loss": 1.445, + "mean_token_accuracy": 0.6500095178683599, + "num_tokens": 2193705121.0, + "step": 13081 + }, + { + "entropy": 1.7488137980302174, + "epoch": 1.4371206503529153, + "grad_norm": 0.7710155844688416, + "learning_rate": 5.488759232105412e-06, + "loss": 1.5234, + "mean_token_accuracy": 0.6365531980991364, + "num_tokens": 2193866132.0, + "step": 13082 + }, + { + "entropy": 1.7383404672145844, + "epoch": 1.4372305072642884, + "grad_norm": 0.7229591608047485, + "learning_rate": 5.487493500532318e-06, + "loss": 1.5024, + "mean_token_accuracy": 0.6449161618947983, + "num_tokens": 2194048183.0, + "step": 13083 + }, + { + "entropy": 1.74971208969752, + "epoch": 1.4373403641756612, + "grad_norm": 0.8270702362060547, + "learning_rate": 5.4862279434275716e-06, + "loss": 1.5017, + "mean_token_accuracy": 0.6444364488124847, + "num_tokens": 2194201146.0, + "step": 13084 + }, + { + "entropy": 1.710715075333913, + "epoch": 1.4374502210870341, + "grad_norm": 0.8305548429489136, + "learning_rate": 5.484962560831223e-06, + "loss": 1.5135, + "mean_token_accuracy": 0.6555256595214208, + "num_tokens": 2194373810.0, + "step": 13085 + }, + { + "entropy": 1.740293820699056, + "epoch": 1.437560077998407, + "grad_norm": 0.6739172339439392, + "learning_rate": 5.483697352783326e-06, + "loss": 1.48, + "mean_token_accuracy": 0.6560692836840948, + "num_tokens": 2194537495.0, + "step": 13086 + }, + { + "entropy": 1.687047153711319, + "epoch": 1.43766993490978, + "grad_norm": 0.6256750226020813, + "learning_rate": 5.48243231932392e-06, + "loss": 1.3393, + "mean_token_accuracy": 0.6665924340486526, + "num_tokens": 2194693871.0, + "step": 13087 + }, + { + "entropy": 1.6996191541353862, + "epoch": 1.437779791821153, + "grad_norm": 0.7012233734130859, + "learning_rate": 5.481167460493049e-06, + "loss": 1.3996, + "mean_token_accuracy": 0.653436486919721, + "num_tokens": 2194841359.0, + "step": 13088 + }, + { + "entropy": 1.683770517508189, + "epoch": 1.437889648732526, + "grad_norm": 0.7458353042602539, + "learning_rate": 5.479902776330739e-06, + "loss": 1.2305, + "mean_token_accuracy": 0.674822653333346, + "num_tokens": 2194979501.0, + "step": 13089 + }, + { + "entropy": 1.7308926284313202, + "epoch": 1.4379995056438988, + "grad_norm": 0.8598765134811401, + "learning_rate": 5.478638266877016e-06, + "loss": 1.5589, + "mean_token_accuracy": 0.6545391033093134, + "num_tokens": 2195155093.0, + "step": 13090 + }, + { + "entropy": 1.774180034796397, + "epoch": 1.4381093625552719, + "grad_norm": 0.6300092935562134, + "learning_rate": 5.4773739321719055e-06, + "loss": 1.4823, + "mean_token_accuracy": 0.6337632189194361, + "num_tokens": 2195363872.0, + "step": 13091 + }, + { + "entropy": 1.686318536599477, + "epoch": 1.4382192194666448, + "grad_norm": 0.591356098651886, + "learning_rate": 5.4761097722554264e-06, + "loss": 1.3622, + "mean_token_accuracy": 0.6576072623332342, + "num_tokens": 2195569789.0, + "step": 13092 + }, + { + "entropy": 1.7126056949297588, + "epoch": 1.4383290763780177, + "grad_norm": 0.6735844016075134, + "learning_rate": 5.474845787167578e-06, + "loss": 1.433, + "mean_token_accuracy": 0.6552617400884628, + "num_tokens": 2195731935.0, + "step": 13093 + }, + { + "entropy": 1.6593547960122426, + "epoch": 1.4384389332893905, + "grad_norm": 0.7271912097930908, + "learning_rate": 5.47358197694837e-06, + "loss": 1.3903, + "mean_token_accuracy": 0.6405983914931616, + "num_tokens": 2195915850.0, + "step": 13094 + }, + { + "entropy": 1.7243566314379375, + "epoch": 1.4385487902007634, + "grad_norm": 0.7322264909744263, + "learning_rate": 5.472318341637805e-06, + "loss": 1.3498, + "mean_token_accuracy": 0.6661138186852137, + "num_tokens": 2196067340.0, + "step": 13095 + }, + { + "entropy": 1.7625857293605804, + "epoch": 1.4386586471121365, + "grad_norm": 0.7507118582725525, + "learning_rate": 5.471054881275875e-06, + "loss": 1.3823, + "mean_token_accuracy": 0.6484930912653605, + "num_tokens": 2196190131.0, + "step": 13096 + }, + { + "entropy": 1.6906941831111908, + "epoch": 1.4387685040235094, + "grad_norm": 0.6131132245063782, + "learning_rate": 5.4697915959025625e-06, + "loss": 1.465, + "mean_token_accuracy": 0.6531191219886144, + "num_tokens": 2196349851.0, + "step": 13097 + }, + { + "entropy": 1.6890461246172588, + "epoch": 1.4388783609348823, + "grad_norm": 0.6666757464408875, + "learning_rate": 5.468528485557858e-06, + "loss": 1.312, + "mean_token_accuracy": 0.6747940282026926, + "num_tokens": 2196519592.0, + "step": 13098 + }, + { + "entropy": 1.645443985859553, + "epoch": 1.4389882178462552, + "grad_norm": 0.617806613445282, + "learning_rate": 5.4672655502817315e-06, + "loss": 1.3039, + "mean_token_accuracy": 0.6676177283128103, + "num_tokens": 2196708869.0, + "step": 13099 + }, + { + "entropy": 1.728934407234192, + "epoch": 1.439098074757628, + "grad_norm": 0.6053014993667603, + "learning_rate": 5.46600279011416e-06, + "loss": 1.4976, + "mean_token_accuracy": 0.632400318980217, + "num_tokens": 2196936983.0, + "step": 13100 + }, + { + "entropy": 1.6949416001637776, + "epoch": 1.4392079316690012, + "grad_norm": 0.719120442867279, + "learning_rate": 5.464740205095106e-06, + "loss": 1.4367, + "mean_token_accuracy": 0.6461255997419357, + "num_tokens": 2197102049.0, + "step": 13101 + }, + { + "entropy": 1.6866117616494496, + "epoch": 1.439317788580374, + "grad_norm": 0.6704388856887817, + "learning_rate": 5.463477795264527e-06, + "loss": 1.4006, + "mean_token_accuracy": 0.6509098261594772, + "num_tokens": 2197273218.0, + "step": 13102 + }, + { + "entropy": 1.6496765514214833, + "epoch": 1.439427645491747, + "grad_norm": 0.6062201261520386, + "learning_rate": 5.462215560662383e-06, + "loss": 1.3943, + "mean_token_accuracy": 0.6622524907191595, + "num_tokens": 2197423275.0, + "step": 13103 + }, + { + "entropy": 1.68293896317482, + "epoch": 1.43953750240312, + "grad_norm": 0.7043601870536804, + "learning_rate": 5.460953501328626e-06, + "loss": 1.3067, + "mean_token_accuracy": 0.6639659106731415, + "num_tokens": 2197540176.0, + "step": 13104 + }, + { + "entropy": 1.7241731981436412, + "epoch": 1.439647359314493, + "grad_norm": 0.810967743396759, + "learning_rate": 5.459691617303187e-06, + "loss": 1.6757, + "mean_token_accuracy": 0.6301688055197397, + "num_tokens": 2197770837.0, + "step": 13105 + }, + { + "entropy": 1.7493579188982646, + "epoch": 1.4397572162258658, + "grad_norm": 0.6431688070297241, + "learning_rate": 5.458429908626013e-06, + "loss": 1.4976, + "mean_token_accuracy": 0.6429360012213389, + "num_tokens": 2197961177.0, + "step": 13106 + }, + { + "entropy": 1.7181467115879059, + "epoch": 1.4398670731372387, + "grad_norm": 0.8091310262680054, + "learning_rate": 5.457168375337039e-06, + "loss": 1.4059, + "mean_token_accuracy": 0.662377749880155, + "num_tokens": 2198144542.0, + "step": 13107 + }, + { + "entropy": 1.7004645963509877, + "epoch": 1.4399769300486116, + "grad_norm": 0.6322354078292847, + "learning_rate": 5.455907017476188e-06, + "loss": 1.3193, + "mean_token_accuracy": 0.6652690172195435, + "num_tokens": 2198283451.0, + "step": 13108 + }, + { + "entropy": 1.7281455794970195, + "epoch": 1.4400867869599847, + "grad_norm": 0.7495424747467041, + "learning_rate": 5.4546458350833775e-06, + "loss": 1.4623, + "mean_token_accuracy": 0.6501360982656479, + "num_tokens": 2198417040.0, + "step": 13109 + }, + { + "entropy": 1.6602643132209778, + "epoch": 1.4401966438713576, + "grad_norm": 0.6429694294929504, + "learning_rate": 5.453384828198532e-06, + "loss": 1.5415, + "mean_token_accuracy": 0.642531914015611, + "num_tokens": 2198623547.0, + "step": 13110 + }, + { + "entropy": 1.6596784790356953, + "epoch": 1.4403065007827305, + "grad_norm": 0.597550630569458, + "learning_rate": 5.452123996861554e-06, + "loss": 1.5658, + "mean_token_accuracy": 0.6343776235977808, + "num_tokens": 2198853878.0, + "step": 13111 + }, + { + "entropy": 1.6523742377758026, + "epoch": 1.4404163576941034, + "grad_norm": 0.812057614326477, + "learning_rate": 5.4508633411123535e-06, + "loss": 1.259, + "mean_token_accuracy": 0.6862892160813013, + "num_tokens": 2198973379.0, + "step": 13112 + }, + { + "entropy": 1.7168854574362438, + "epoch": 1.4405262146054763, + "grad_norm": 0.6416419148445129, + "learning_rate": 5.449602860990828e-06, + "loss": 1.4105, + "mean_token_accuracy": 0.6478788256645203, + "num_tokens": 2199129783.0, + "step": 13113 + }, + { + "entropy": 1.6594553391138713, + "epoch": 1.4406360715168494, + "grad_norm": 0.681231677532196, + "learning_rate": 5.448342556536869e-06, + "loss": 1.4884, + "mean_token_accuracy": 0.639866515994072, + "num_tokens": 2199387460.0, + "step": 13114 + }, + { + "entropy": 1.6886393030484517, + "epoch": 1.4407459284282222, + "grad_norm": 0.6803062558174133, + "learning_rate": 5.447082427790368e-06, + "loss": 1.3325, + "mean_token_accuracy": 0.677370235323906, + "num_tokens": 2199547310.0, + "step": 13115 + }, + { + "entropy": 1.709013928969701, + "epoch": 1.4408557853395951, + "grad_norm": 0.5610901713371277, + "learning_rate": 5.445822474791207e-06, + "loss": 1.4096, + "mean_token_accuracy": 0.6584896892309189, + "num_tokens": 2199735361.0, + "step": 13116 + }, + { + "entropy": 1.7390046020348866, + "epoch": 1.4409656422509682, + "grad_norm": 0.6881232261657715, + "learning_rate": 5.444562697579259e-06, + "loss": 1.388, + "mean_token_accuracy": 0.6504150678714117, + "num_tokens": 2199928625.0, + "step": 13117 + }, + { + "entropy": 1.7665140330791473, + "epoch": 1.4410754991623411, + "grad_norm": 0.6938253045082092, + "learning_rate": 5.443303096194401e-06, + "loss": 1.3693, + "mean_token_accuracy": 0.66404556731383, + "num_tokens": 2200069167.0, + "step": 13118 + }, + { + "entropy": 1.7453898986180623, + "epoch": 1.441185356073714, + "grad_norm": 0.6774733662605286, + "learning_rate": 5.442043670676494e-06, + "loss": 1.5307, + "mean_token_accuracy": 0.6523572206497192, + "num_tokens": 2200247454.0, + "step": 13119 + }, + { + "entropy": 1.7234888970851898, + "epoch": 1.441295212985087, + "grad_norm": 0.6913623213768005, + "learning_rate": 5.440784421065402e-06, + "loss": 1.2941, + "mean_token_accuracy": 0.6711312582095464, + "num_tokens": 2200358346.0, + "step": 13120 + }, + { + "entropy": 1.6904946466286976, + "epoch": 1.4414050698964598, + "grad_norm": 0.6734454035758972, + "learning_rate": 5.439525347400978e-06, + "loss": 1.2959, + "mean_token_accuracy": 0.6673662761847178, + "num_tokens": 2200517442.0, + "step": 13121 + }, + { + "entropy": 1.6776606639226277, + "epoch": 1.4415149268078329, + "grad_norm": 0.5592838525772095, + "learning_rate": 5.438266449723069e-06, + "loss": 1.4938, + "mean_token_accuracy": 0.634076843659083, + "num_tokens": 2200776827.0, + "step": 13122 + }, + { + "entropy": 1.661406288544337, + "epoch": 1.4416247837192058, + "grad_norm": 0.7140949964523315, + "learning_rate": 5.437007728071519e-06, + "loss": 1.3046, + "mean_token_accuracy": 0.6735624670982361, + "num_tokens": 2200946844.0, + "step": 13123 + }, + { + "entropy": 1.6646969815095265, + "epoch": 1.4417346406305787, + "grad_norm": 0.6813852190971375, + "learning_rate": 5.435749182486175e-06, + "loss": 1.4353, + "mean_token_accuracy": 0.6477916638056437, + "num_tokens": 2201107821.0, + "step": 13124 + }, + { + "entropy": 1.6899653573830922, + "epoch": 1.4418444975419515, + "grad_norm": 0.7384040951728821, + "learning_rate": 5.4344908130068566e-06, + "loss": 1.3519, + "mean_token_accuracy": 0.6741080085436503, + "num_tokens": 2201275870.0, + "step": 13125 + }, + { + "entropy": 1.6645598411560059, + "epoch": 1.4419543544533244, + "grad_norm": 0.6502087712287903, + "learning_rate": 5.433232619673396e-06, + "loss": 1.4388, + "mean_token_accuracy": 0.662499854962031, + "num_tokens": 2201458482.0, + "step": 13126 + }, + { + "entropy": 1.6533268988132477, + "epoch": 1.4420642113646975, + "grad_norm": 0.728032112121582, + "learning_rate": 5.431974602525617e-06, + "loss": 1.2845, + "mean_token_accuracy": 0.6791494935750961, + "num_tokens": 2201632361.0, + "step": 13127 + }, + { + "entropy": 1.7500494917233784, + "epoch": 1.4421740682760704, + "grad_norm": 0.6480644941329956, + "learning_rate": 5.430716761603332e-06, + "loss": 1.3909, + "mean_token_accuracy": 0.6503734489281973, + "num_tokens": 2201814325.0, + "step": 13128 + }, + { + "entropy": 1.7360434929529827, + "epoch": 1.4422839251874433, + "grad_norm": 0.7137820720672607, + "learning_rate": 5.42945909694635e-06, + "loss": 1.4508, + "mean_token_accuracy": 0.6532419472932816, + "num_tokens": 2201964448.0, + "step": 13129 + }, + { + "entropy": 1.6693811416625977, + "epoch": 1.4423937820988164, + "grad_norm": 0.6493667960166931, + "learning_rate": 5.42820160859448e-06, + "loss": 1.3158, + "mean_token_accuracy": 0.6628794223070145, + "num_tokens": 2202094892.0, + "step": 13130 + }, + { + "entropy": 1.738204260667165, + "epoch": 1.4425036390101893, + "grad_norm": 0.8041599988937378, + "learning_rate": 5.426944296587515e-06, + "loss": 1.6111, + "mean_token_accuracy": 0.6330114702383677, + "num_tokens": 2202341847.0, + "step": 13131 + }, + { + "entropy": 1.6739278137683868, + "epoch": 1.4426134959215622, + "grad_norm": 0.7445047497749329, + "learning_rate": 5.425687160965256e-06, + "loss": 1.3705, + "mean_token_accuracy": 0.6565392563740412, + "num_tokens": 2202489083.0, + "step": 13132 + }, + { + "entropy": 1.7505607505639393, + "epoch": 1.442723352832935, + "grad_norm": 0.6818254590034485, + "learning_rate": 5.424430201767486e-06, + "loss": 1.3646, + "mean_token_accuracy": 0.6577309419711431, + "num_tokens": 2202634715.0, + "step": 13133 + }, + { + "entropy": 1.6894071300824482, + "epoch": 1.442833209744308, + "grad_norm": 0.6561980247497559, + "learning_rate": 5.423173419033985e-06, + "loss": 1.2953, + "mean_token_accuracy": 0.6628958334525427, + "num_tokens": 2202768025.0, + "step": 13134 + }, + { + "entropy": 1.6657811105251312, + "epoch": 1.442943066655681, + "grad_norm": 0.753799557685852, + "learning_rate": 5.4219168128045315e-06, + "loss": 1.275, + "mean_token_accuracy": 0.6830256134271622, + "num_tokens": 2202911105.0, + "step": 13135 + }, + { + "entropy": 1.7518598437309265, + "epoch": 1.443052923567054, + "grad_norm": 0.6313562393188477, + "learning_rate": 5.420660383118903e-06, + "loss": 1.3652, + "mean_token_accuracy": 0.6742727309465408, + "num_tokens": 2203082635.0, + "step": 13136 + }, + { + "entropy": 1.6960064272085826, + "epoch": 1.4431627804784268, + "grad_norm": 0.7373610138893127, + "learning_rate": 5.419404130016854e-06, + "loss": 1.3989, + "mean_token_accuracy": 0.6607239097356796, + "num_tokens": 2203239727.0, + "step": 13137 + }, + { + "entropy": 1.6716348230838776, + "epoch": 1.4432726373897997, + "grad_norm": 0.7793363928794861, + "learning_rate": 5.41814805353815e-06, + "loss": 1.484, + "mean_token_accuracy": 0.6447274684906006, + "num_tokens": 2203425558.0, + "step": 13138 + }, + { + "entropy": 1.6881347199281056, + "epoch": 1.4433824943011726, + "grad_norm": 0.6784332394599915, + "learning_rate": 5.416892153722548e-06, + "loss": 1.3327, + "mean_token_accuracy": 0.6610169510046641, + "num_tokens": 2203588235.0, + "step": 13139 + }, + { + "entropy": 1.752338171005249, + "epoch": 1.4434923512125457, + "grad_norm": 0.8655160665512085, + "learning_rate": 5.415636430609792e-06, + "loss": 1.2634, + "mean_token_accuracy": 0.6677990953127543, + "num_tokens": 2203720757.0, + "step": 13140 + }, + { + "entropy": 1.705585926771164, + "epoch": 1.4436022081239186, + "grad_norm": 0.6548242568969727, + "learning_rate": 5.414380884239625e-06, + "loss": 1.5122, + "mean_token_accuracy": 0.64339513083299, + "num_tokens": 2203891963.0, + "step": 13141 + }, + { + "entropy": 1.7506338755289714, + "epoch": 1.4437120650352915, + "grad_norm": 0.6873526573181152, + "learning_rate": 5.413125514651789e-06, + "loss": 1.3258, + "mean_token_accuracy": 0.6546006848414739, + "num_tokens": 2204046733.0, + "step": 13142 + }, + { + "entropy": 1.6650571823120117, + "epoch": 1.4438219219466646, + "grad_norm": 0.8502957820892334, + "learning_rate": 5.411870321886009e-06, + "loss": 1.4231, + "mean_token_accuracy": 0.6577673802773157, + "num_tokens": 2204230937.0, + "step": 13143 + }, + { + "entropy": 1.7141657968362172, + "epoch": 1.4439317788580375, + "grad_norm": 0.6910656690597534, + "learning_rate": 5.410615305982019e-06, + "loss": 1.4016, + "mean_token_accuracy": 0.6469135781129202, + "num_tokens": 2204437250.0, + "step": 13144 + }, + { + "entropy": 1.6625679234663646, + "epoch": 1.4440416357694104, + "grad_norm": 0.6473484039306641, + "learning_rate": 5.409360466979537e-06, + "loss": 1.3574, + "mean_token_accuracy": 0.6670989692211151, + "num_tokens": 2204612238.0, + "step": 13145 + }, + { + "entropy": 1.713254948457082, + "epoch": 1.4441514926807832, + "grad_norm": 0.6853258013725281, + "learning_rate": 5.408105804918271e-06, + "loss": 1.3635, + "mean_token_accuracy": 0.6598356068134308, + "num_tokens": 2204763539.0, + "step": 13146 + }, + { + "entropy": 1.7004669805367787, + "epoch": 1.4442613495921561, + "grad_norm": 0.7649045586585999, + "learning_rate": 5.406851319837938e-06, + "loss": 1.4385, + "mean_token_accuracy": 0.6565716167291006, + "num_tokens": 2204909107.0, + "step": 13147 + }, + { + "entropy": 1.658108522494634, + "epoch": 1.4443712065035292, + "grad_norm": 0.6900636553764343, + "learning_rate": 5.405597011778248e-06, + "loss": 1.3316, + "mean_token_accuracy": 0.6711122145255407, + "num_tokens": 2205065286.0, + "step": 13148 + }, + { + "entropy": 1.7093073030312855, + "epoch": 1.4444810634149021, + "grad_norm": 0.7754839658737183, + "learning_rate": 5.404342880778883e-06, + "loss": 1.2241, + "mean_token_accuracy": 0.6803254435459772, + "num_tokens": 2205185259.0, + "step": 13149 + }, + { + "entropy": 1.743496169646581, + "epoch": 1.444590920326275, + "grad_norm": 0.7815446853637695, + "learning_rate": 5.403088926879546e-06, + "loss": 1.4544, + "mean_token_accuracy": 0.6503811130921046, + "num_tokens": 2205326720.0, + "step": 13150 + }, + { + "entropy": 1.7162681818008423, + "epoch": 1.444700777237648, + "grad_norm": 0.6339192986488342, + "learning_rate": 5.401835150119925e-06, + "loss": 1.4343, + "mean_token_accuracy": 0.6413728495438894, + "num_tokens": 2205487610.0, + "step": 13151 + }, + { + "entropy": 1.7141146957874298, + "epoch": 1.4448106341490208, + "grad_norm": 0.6438986659049988, + "learning_rate": 5.400581550539699e-06, + "loss": 1.4737, + "mean_token_accuracy": 0.655816008647283, + "num_tokens": 2205642144.0, + "step": 13152 + }, + { + "entropy": 1.7310173114140828, + "epoch": 1.4449204910603939, + "grad_norm": 0.7602280974388123, + "learning_rate": 5.3993281281785415e-06, + "loss": 1.4238, + "mean_token_accuracy": 0.6691502779722214, + "num_tokens": 2205789686.0, + "step": 13153 + }, + { + "entropy": 1.720057229200999, + "epoch": 1.4450303479717668, + "grad_norm": 0.6413891911506653, + "learning_rate": 5.398074883076127e-06, + "loss": 1.3896, + "mean_token_accuracy": 0.6561499089002609, + "num_tokens": 2205968385.0, + "step": 13154 + }, + { + "entropy": 1.6746338705221813, + "epoch": 1.4451402048831397, + "grad_norm": 0.6567736864089966, + "learning_rate": 5.396821815272115e-06, + "loss": 1.3772, + "mean_token_accuracy": 0.6559559206167856, + "num_tokens": 2206140553.0, + "step": 13155 + }, + { + "entropy": 1.7500001788139343, + "epoch": 1.4452500617945128, + "grad_norm": 0.7306400537490845, + "learning_rate": 5.395568924806171e-06, + "loss": 1.3747, + "mean_token_accuracy": 0.6527500202258428, + "num_tokens": 2206310921.0, + "step": 13156 + }, + { + "entropy": 1.7621792455514271, + "epoch": 1.4453599187058856, + "grad_norm": 0.6839621067047119, + "learning_rate": 5.394316211717945e-06, + "loss": 1.3053, + "mean_token_accuracy": 0.6592706839243571, + "num_tokens": 2206415388.0, + "step": 13157 + }, + { + "entropy": 1.6396544377009075, + "epoch": 1.4454697756172585, + "grad_norm": 0.64600670337677, + "learning_rate": 5.393063676047083e-06, + "loss": 1.2404, + "mean_token_accuracy": 0.6779094239075979, + "num_tokens": 2206534812.0, + "step": 13158 + }, + { + "entropy": 1.6962849795818329, + "epoch": 1.4455796325286314, + "grad_norm": 0.7013195157051086, + "learning_rate": 5.391811317833229e-06, + "loss": 1.3592, + "mean_token_accuracy": 0.6624196718136469, + "num_tokens": 2206682279.0, + "step": 13159 + }, + { + "entropy": 1.7327560285727184, + "epoch": 1.4456894894400043, + "grad_norm": 0.5949588418006897, + "learning_rate": 5.390559137116025e-06, + "loss": 1.4016, + "mean_token_accuracy": 0.6511549949645996, + "num_tokens": 2206866260.0, + "step": 13160 + }, + { + "entropy": 1.7158630589644115, + "epoch": 1.4457993463513774, + "grad_norm": 0.6915740370750427, + "learning_rate": 5.38930713393509e-06, + "loss": 1.463, + "mean_token_accuracy": 0.6487186849117279, + "num_tokens": 2207049927.0, + "step": 13161 + }, + { + "entropy": 1.675972153743108, + "epoch": 1.4459092032627503, + "grad_norm": 0.622731626033783, + "learning_rate": 5.388055308330057e-06, + "loss": 1.4674, + "mean_token_accuracy": 0.6402994245290756, + "num_tokens": 2207239023.0, + "step": 13162 + }, + { + "entropy": 1.749548574288686, + "epoch": 1.4460190601741232, + "grad_norm": 0.776073694229126, + "learning_rate": 5.386803660340547e-06, + "loss": 1.4152, + "mean_token_accuracy": 0.6454198757807413, + "num_tokens": 2207381870.0, + "step": 13163 + }, + { + "entropy": 1.618944267431895, + "epoch": 1.446128917085496, + "grad_norm": 0.592343807220459, + "learning_rate": 5.3855521900061725e-06, + "loss": 1.4439, + "mean_token_accuracy": 0.6512120515108109, + "num_tokens": 2207569716.0, + "step": 13164 + }, + { + "entropy": 1.6773555179437, + "epoch": 1.446238773996869, + "grad_norm": 0.7094241380691528, + "learning_rate": 5.384300897366537e-06, + "loss": 1.3302, + "mean_token_accuracy": 0.6653115550676981, + "num_tokens": 2207697714.0, + "step": 13165 + }, + { + "entropy": 1.7085080246130626, + "epoch": 1.446348630908242, + "grad_norm": 0.7474088072776794, + "learning_rate": 5.383049782461251e-06, + "loss": 1.5104, + "mean_token_accuracy": 0.6549822489420573, + "num_tokens": 2207875635.0, + "step": 13166 + }, + { + "entropy": 1.6417442560195923, + "epoch": 1.446458487819615, + "grad_norm": 0.7819391489028931, + "learning_rate": 5.3817988453299064e-06, + "loss": 1.2799, + "mean_token_accuracy": 0.6651287525892258, + "num_tokens": 2208032382.0, + "step": 13167 + }, + { + "entropy": 1.7218000292778015, + "epoch": 1.4465683447309878, + "grad_norm": 0.7218011021614075, + "learning_rate": 5.380548086012099e-06, + "loss": 1.4495, + "mean_token_accuracy": 0.6531488001346588, + "num_tokens": 2208182762.0, + "step": 13168 + }, + { + "entropy": 1.710701008637746, + "epoch": 1.446678201642361, + "grad_norm": 1.3408112525939941, + "learning_rate": 5.379297504547412e-06, + "loss": 1.1826, + "mean_token_accuracy": 0.6656645238399506, + "num_tokens": 2208344898.0, + "step": 13169 + }, + { + "entropy": 1.6560823222001393, + "epoch": 1.4467880585537338, + "grad_norm": 0.6145971417427063, + "learning_rate": 5.378047100975424e-06, + "loss": 1.3074, + "mean_token_accuracy": 0.6633361180623373, + "num_tokens": 2208515722.0, + "step": 13170 + }, + { + "entropy": 1.6993583242098491, + "epoch": 1.4468979154651067, + "grad_norm": 0.8017585277557373, + "learning_rate": 5.376796875335713e-06, + "loss": 1.5519, + "mean_token_accuracy": 0.6469237754742304, + "num_tokens": 2208708203.0, + "step": 13171 + }, + { + "entropy": 1.7376844882965088, + "epoch": 1.4470077723764796, + "grad_norm": 0.6851217150688171, + "learning_rate": 5.375546827667851e-06, + "loss": 1.5577, + "mean_token_accuracy": 0.6304005980491638, + "num_tokens": 2208910762.0, + "step": 13172 + }, + { + "entropy": 1.7630626857280731, + "epoch": 1.4471176292878525, + "grad_norm": 0.6860126256942749, + "learning_rate": 5.3742969580113915e-06, + "loss": 1.4536, + "mean_token_accuracy": 0.6431644906600317, + "num_tokens": 2209114237.0, + "step": 13173 + }, + { + "entropy": 1.668500433365504, + "epoch": 1.4472274861992256, + "grad_norm": 0.7579459547996521, + "learning_rate": 5.3730472664059e-06, + "loss": 1.37, + "mean_token_accuracy": 0.6733713150024414, + "num_tokens": 2209269635.0, + "step": 13174 + }, + { + "entropy": 1.6985759337743123, + "epoch": 1.4473373431105985, + "grad_norm": 0.7717159986495972, + "learning_rate": 5.371797752890928e-06, + "loss": 1.4412, + "mean_token_accuracy": 0.6420165300369263, + "num_tokens": 2209444154.0, + "step": 13175 + }, + { + "entropy": 1.6884556114673615, + "epoch": 1.4474472000219714, + "grad_norm": 0.7163543701171875, + "learning_rate": 5.370548417506023e-06, + "loss": 1.3367, + "mean_token_accuracy": 0.6587778131167094, + "num_tokens": 2209553018.0, + "step": 13176 + }, + { + "entropy": 1.6677973469098408, + "epoch": 1.4475570569333442, + "grad_norm": 0.5317003130912781, + "learning_rate": 5.369299260290723e-06, + "loss": 1.5134, + "mean_token_accuracy": 0.636811763048172, + "num_tokens": 2209829266.0, + "step": 13177 + }, + { + "entropy": 1.7124519248803456, + "epoch": 1.4476669138447171, + "grad_norm": 0.704308271408081, + "learning_rate": 5.3680502812845606e-06, + "loss": 1.4137, + "mean_token_accuracy": 0.6537395964066187, + "num_tokens": 2209975703.0, + "step": 13178 + }, + { + "entropy": 1.6899917125701904, + "epoch": 1.4477767707560902, + "grad_norm": 0.7196908593177795, + "learning_rate": 5.366801480527068e-06, + "loss": 1.3503, + "mean_token_accuracy": 0.6669531762599945, + "num_tokens": 2210131799.0, + "step": 13179 + }, + { + "entropy": 1.726241260766983, + "epoch": 1.4478866276674631, + "grad_norm": 0.6660773158073425, + "learning_rate": 5.3655528580577785e-06, + "loss": 1.4985, + "mean_token_accuracy": 0.6414483537276586, + "num_tokens": 2210311687.0, + "step": 13180 + }, + { + "entropy": 1.7482622861862183, + "epoch": 1.447996484578836, + "grad_norm": 0.6919118762016296, + "learning_rate": 5.364304413916195e-06, + "loss": 1.3277, + "mean_token_accuracy": 0.6661824136972427, + "num_tokens": 2210497399.0, + "step": 13181 + }, + { + "entropy": 1.6985487540562947, + "epoch": 1.4481063414902091, + "grad_norm": 0.7943996787071228, + "learning_rate": 5.363056148141838e-06, + "loss": 1.2813, + "mean_token_accuracy": 0.6696479817231497, + "num_tokens": 2210622604.0, + "step": 13182 + }, + { + "entropy": 1.6598475178082783, + "epoch": 1.448216198401582, + "grad_norm": 0.6746430397033691, + "learning_rate": 5.361808060774216e-06, + "loss": 1.3158, + "mean_token_accuracy": 0.6594479928414027, + "num_tokens": 2210777670.0, + "step": 13183 + }, + { + "entropy": 1.7671760121981304, + "epoch": 1.4483260553129549, + "grad_norm": 0.7635937333106995, + "learning_rate": 5.360560151852828e-06, + "loss": 1.4199, + "mean_token_accuracy": 0.6667883445819219, + "num_tokens": 2210926645.0, + "step": 13184 + }, + { + "entropy": 1.690992146730423, + "epoch": 1.4484359122243278, + "grad_norm": 0.7546091675758362, + "learning_rate": 5.359312421417168e-06, + "loss": 1.5281, + "mean_token_accuracy": 0.6310638238986334, + "num_tokens": 2211176157.0, + "step": 13185 + }, + { + "entropy": 1.6956369777520497, + "epoch": 1.4485457691357007, + "grad_norm": 0.6036320328712463, + "learning_rate": 5.358064869506731e-06, + "loss": 1.469, + "mean_token_accuracy": 0.6401687761147817, + "num_tokens": 2211393488.0, + "step": 13186 + }, + { + "entropy": 1.6649706959724426, + "epoch": 1.4486556260470738, + "grad_norm": 0.6010493040084839, + "learning_rate": 5.356817496160994e-06, + "loss": 1.3602, + "mean_token_accuracy": 0.6572969208161036, + "num_tokens": 2211585446.0, + "step": 13187 + }, + { + "entropy": 1.7027298510074615, + "epoch": 1.4487654829584466, + "grad_norm": 0.7195990681648254, + "learning_rate": 5.355570301419446e-06, + "loss": 1.3763, + "mean_token_accuracy": 0.6585088024536768, + "num_tokens": 2211740010.0, + "step": 13188 + }, + { + "entropy": 1.6455882887045543, + "epoch": 1.4488753398698195, + "grad_norm": 0.7932417392730713, + "learning_rate": 5.354323285321552e-06, + "loss": 1.3226, + "mean_token_accuracy": 0.6679457773764929, + "num_tokens": 2211907834.0, + "step": 13189 + }, + { + "entropy": 1.6916816929976146, + "epoch": 1.4489851967811924, + "grad_norm": 2.6084368228912354, + "learning_rate": 5.3530764479067795e-06, + "loss": 1.1454, + "mean_token_accuracy": 0.6937383910020193, + "num_tokens": 2212070495.0, + "step": 13190 + }, + { + "entropy": 1.6980760792891185, + "epoch": 1.4490950536925653, + "grad_norm": 0.7769445776939392, + "learning_rate": 5.3518297892145955e-06, + "loss": 1.463, + "mean_token_accuracy": 0.6506317506233851, + "num_tokens": 2212241884.0, + "step": 13191 + }, + { + "entropy": 1.7203065752983093, + "epoch": 1.4492049106039384, + "grad_norm": 0.671053409576416, + "learning_rate": 5.350583309284456e-06, + "loss": 1.4886, + "mean_token_accuracy": 0.6554620762666067, + "num_tokens": 2212416939.0, + "step": 13192 + }, + { + "entropy": 1.6941389739513397, + "epoch": 1.4493147675153113, + "grad_norm": 0.6860373616218567, + "learning_rate": 5.349337008155805e-06, + "loss": 1.2588, + "mean_token_accuracy": 0.6677055060863495, + "num_tokens": 2212572265.0, + "step": 13193 + }, + { + "entropy": 1.7378354767958324, + "epoch": 1.4494246244266842, + "grad_norm": 0.7001467943191528, + "learning_rate": 5.348090885868091e-06, + "loss": 1.4168, + "mean_token_accuracy": 0.6458163360754648, + "num_tokens": 2212720904.0, + "step": 13194 + }, + { + "entropy": 1.7072244087855022, + "epoch": 1.4495344813380573, + "grad_norm": 0.6630334854125977, + "learning_rate": 5.346844942460756e-06, + "loss": 1.2521, + "mean_token_accuracy": 0.6737230718135834, + "num_tokens": 2212833669.0, + "step": 13195 + }, + { + "entropy": 1.6764145195484161, + "epoch": 1.4496443382494302, + "grad_norm": 0.6521428823471069, + "learning_rate": 5.345599177973233e-06, + "loss": 1.3123, + "mean_token_accuracy": 0.6774703562259674, + "num_tokens": 2213000587.0, + "step": 13196 + }, + { + "entropy": 1.767681509256363, + "epoch": 1.449754195160803, + "grad_norm": 0.7817487120628357, + "learning_rate": 5.344353592444943e-06, + "loss": 1.2971, + "mean_token_accuracy": 0.6568180421988169, + "num_tokens": 2213145438.0, + "step": 13197 + }, + { + "entropy": 1.663481096426646, + "epoch": 1.449864052072176, + "grad_norm": 0.5956518650054932, + "learning_rate": 5.3431081859153174e-06, + "loss": 1.3152, + "mean_token_accuracy": 0.6751365313927332, + "num_tokens": 2213305808.0, + "step": 13198 + }, + { + "entropy": 1.6766654352347057, + "epoch": 1.4499739089835488, + "grad_norm": 0.6852260231971741, + "learning_rate": 5.341862958423765e-06, + "loss": 1.3912, + "mean_token_accuracy": 0.6689743250608444, + "num_tokens": 2213454974.0, + "step": 13199 + }, + { + "entropy": 1.7198993066946666, + "epoch": 1.450083765894922, + "grad_norm": 0.7866743206977844, + "learning_rate": 5.340617910009705e-06, + "loss": 1.4372, + "mean_token_accuracy": 0.6474678864081701, + "num_tokens": 2213626936.0, + "step": 13200 + }, + { + "entropy": 1.6825834314028423, + "epoch": 1.4501936228062948, + "grad_norm": 0.8059370517730713, + "learning_rate": 5.3393730407125365e-06, + "loss": 1.2714, + "mean_token_accuracy": 0.6706172774235407, + "num_tokens": 2213775434.0, + "step": 13201 + }, + { + "entropy": 1.6637190183003743, + "epoch": 1.4503034797176677, + "grad_norm": 14.398298263549805, + "learning_rate": 5.338128350571659e-06, + "loss": 1.3967, + "mean_token_accuracy": 0.6558897644281387, + "num_tokens": 2214009223.0, + "step": 13202 + }, + { + "entropy": 1.7083501815795898, + "epoch": 1.4504133366290406, + "grad_norm": 0.7195196747779846, + "learning_rate": 5.336883839626466e-06, + "loss": 1.4648, + "mean_token_accuracy": 0.6481581528981527, + "num_tokens": 2214193907.0, + "step": 13203 + }, + { + "entropy": 1.708481788635254, + "epoch": 1.4505231935404135, + "grad_norm": 0.815433919429779, + "learning_rate": 5.335639507916354e-06, + "loss": 1.4208, + "mean_token_accuracy": 0.6755526115496954, + "num_tokens": 2214358384.0, + "step": 13204 + }, + { + "entropy": 1.7456343571345012, + "epoch": 1.4506330504517866, + "grad_norm": 0.653662919998169, + "learning_rate": 5.334395355480692e-06, + "loss": 1.4671, + "mean_token_accuracy": 0.6519175618886948, + "num_tokens": 2214500150.0, + "step": 13205 + }, + { + "entropy": 1.683765749136607, + "epoch": 1.4507429073631595, + "grad_norm": 1.1602147817611694, + "learning_rate": 5.333151382358867e-06, + "loss": 1.3494, + "mean_token_accuracy": 0.6610773354768753, + "num_tokens": 2214625247.0, + "step": 13206 + }, + { + "entropy": 1.7299201289812725, + "epoch": 1.4508527642745324, + "grad_norm": 0.6895291805267334, + "learning_rate": 5.331907588590248e-06, + "loss": 1.4314, + "mean_token_accuracy": 0.6494416346152624, + "num_tokens": 2214811498.0, + "step": 13207 + }, + { + "entropy": 1.658159464597702, + "epoch": 1.4509626211859055, + "grad_norm": 0.6312925219535828, + "learning_rate": 5.3306639742142015e-06, + "loss": 1.36, + "mean_token_accuracy": 0.6664116680622101, + "num_tokens": 2214974047.0, + "step": 13208 + }, + { + "entropy": 1.7353888948758442, + "epoch": 1.4510724780972784, + "grad_norm": 0.6342319846153259, + "learning_rate": 5.329420539270082e-06, + "loss": 1.3164, + "mean_token_accuracy": 0.6630458980798721, + "num_tokens": 2215193942.0, + "step": 13209 + }, + { + "entropy": 1.7596177558104198, + "epoch": 1.4511823350086512, + "grad_norm": 0.6443141102790833, + "learning_rate": 5.328177283797249e-06, + "loss": 1.5036, + "mean_token_accuracy": 0.6435278157393137, + "num_tokens": 2215360444.0, + "step": 13210 + }, + { + "entropy": 1.6973899205525715, + "epoch": 1.4512921919200241, + "grad_norm": 0.6556523442268372, + "learning_rate": 5.3269342078350465e-06, + "loss": 1.4033, + "mean_token_accuracy": 0.6569543530543646, + "num_tokens": 2215502761.0, + "step": 13211 + }, + { + "entropy": 1.7224473754564922, + "epoch": 1.451402048831397, + "grad_norm": 0.6724802851676941, + "learning_rate": 5.325691311422824e-06, + "loss": 1.4342, + "mean_token_accuracy": 0.6421651244163513, + "num_tokens": 2215672804.0, + "step": 13212 + }, + { + "entropy": 1.6691329777240753, + "epoch": 1.4515119057427701, + "grad_norm": 0.6420386433601379, + "learning_rate": 5.324448594599914e-06, + "loss": 1.5087, + "mean_token_accuracy": 0.6413849592208862, + "num_tokens": 2215874049.0, + "step": 13213 + }, + { + "entropy": 1.624968518813451, + "epoch": 1.451621762654143, + "grad_norm": 0.7023099064826965, + "learning_rate": 5.323206057405645e-06, + "loss": 1.3814, + "mean_token_accuracy": 0.6555198530356089, + "num_tokens": 2216024759.0, + "step": 13214 + }, + { + "entropy": 1.6852657397588093, + "epoch": 1.4517316195655159, + "grad_norm": 0.7389397621154785, + "learning_rate": 5.321963699879347e-06, + "loss": 1.4723, + "mean_token_accuracy": 0.646138941248258, + "num_tokens": 2216188508.0, + "step": 13215 + }, + { + "entropy": 1.6744611859321594, + "epoch": 1.4518414764768888, + "grad_norm": 0.6439229846000671, + "learning_rate": 5.320721522060346e-06, + "loss": 1.4371, + "mean_token_accuracy": 0.6564580400784811, + "num_tokens": 2216352452.0, + "step": 13216 + }, + { + "entropy": 1.683128794034322, + "epoch": 1.4519513333882617, + "grad_norm": 0.7556068301200867, + "learning_rate": 5.319479523987943e-06, + "loss": 1.2867, + "mean_token_accuracy": 0.6663307448228201, + "num_tokens": 2216522725.0, + "step": 13217 + }, + { + "entropy": 1.7548390924930573, + "epoch": 1.4520611902996348, + "grad_norm": 0.6996464133262634, + "learning_rate": 5.318237705701451e-06, + "loss": 1.3232, + "mean_token_accuracy": 0.6729239821434021, + "num_tokens": 2216664562.0, + "step": 13218 + }, + { + "entropy": 1.7093205749988556, + "epoch": 1.4521710472110076, + "grad_norm": 0.7138844728469849, + "learning_rate": 5.316996067240181e-06, + "loss": 1.2359, + "mean_token_accuracy": 0.6787795623143514, + "num_tokens": 2216772838.0, + "step": 13219 + }, + { + "entropy": 1.6784123480319977, + "epoch": 1.4522809041223805, + "grad_norm": 0.6939015984535217, + "learning_rate": 5.3157546086434245e-06, + "loss": 1.2327, + "mean_token_accuracy": 0.6807336856921514, + "num_tokens": 2216919138.0, + "step": 13220 + }, + { + "entropy": 1.7018550237019856, + "epoch": 1.4523907610337536, + "grad_norm": 0.8771721124649048, + "learning_rate": 5.314513329950469e-06, + "loss": 1.3378, + "mean_token_accuracy": 0.6704970449209213, + "num_tokens": 2217068295.0, + "step": 13221 + }, + { + "entropy": 1.7389337023099263, + "epoch": 1.4525006179451265, + "grad_norm": 0.6254299879074097, + "learning_rate": 5.313272231200609e-06, + "loss": 1.4183, + "mean_token_accuracy": 0.6511034518480301, + "num_tokens": 2217280813.0, + "step": 13222 + }, + { + "entropy": 1.783752590417862, + "epoch": 1.4526104748564994, + "grad_norm": 0.6950295567512512, + "learning_rate": 5.312031312433117e-06, + "loss": 1.3607, + "mean_token_accuracy": 0.6620252877473831, + "num_tokens": 2217419842.0, + "step": 13223 + }, + { + "entropy": 1.7052730023860931, + "epoch": 1.4527203317678723, + "grad_norm": 0.6519191861152649, + "learning_rate": 5.3107905736872745e-06, + "loss": 1.4891, + "mean_token_accuracy": 0.6575401375691096, + "num_tokens": 2217599361.0, + "step": 13224 + }, + { + "entropy": 1.7535746296246846, + "epoch": 1.4528301886792452, + "grad_norm": 0.8356174826622009, + "learning_rate": 5.309550015002346e-06, + "loss": 1.2754, + "mean_token_accuracy": 0.6758607228597006, + "num_tokens": 2217738156.0, + "step": 13225 + }, + { + "entropy": 1.692326823870341, + "epoch": 1.4529400455906183, + "grad_norm": 0.6344167590141296, + "learning_rate": 5.308309636417593e-06, + "loss": 1.4251, + "mean_token_accuracy": 0.6521053711573283, + "num_tokens": 2217913111.0, + "step": 13226 + }, + { + "entropy": 1.68477068344752, + "epoch": 1.4530499025019912, + "grad_norm": 0.722823977470398, + "learning_rate": 5.307069437972274e-06, + "loss": 1.3475, + "mean_token_accuracy": 0.6687405457099279, + "num_tokens": 2218072811.0, + "step": 13227 + }, + { + "entropy": 1.6299297511577606, + "epoch": 1.453159759413364, + "grad_norm": 0.7247095704078674, + "learning_rate": 5.305829419705648e-06, + "loss": 1.2793, + "mean_token_accuracy": 0.6687569071849188, + "num_tokens": 2218206884.0, + "step": 13228 + }, + { + "entropy": 1.726058046023051, + "epoch": 1.4532696163247372, + "grad_norm": 0.6222012639045715, + "learning_rate": 5.30458958165695e-06, + "loss": 1.4566, + "mean_token_accuracy": 0.6559246480464935, + "num_tokens": 2218388967.0, + "step": 13229 + }, + { + "entropy": 1.7035260498523712, + "epoch": 1.4533794732361098, + "grad_norm": 0.6351275444030762, + "learning_rate": 5.303349923865425e-06, + "loss": 1.4242, + "mean_token_accuracy": 0.6489528665939966, + "num_tokens": 2218568637.0, + "step": 13230 + }, + { + "entropy": 1.6851453681786854, + "epoch": 1.453489330147483, + "grad_norm": 0.7717143297195435, + "learning_rate": 5.30211044637031e-06, + "loss": 1.2123, + "mean_token_accuracy": 0.6865204274654388, + "num_tokens": 2218672479.0, + "step": 13231 + }, + { + "entropy": 1.6864939232667286, + "epoch": 1.4535991870588558, + "grad_norm": 0.7055935859680176, + "learning_rate": 5.300871149210833e-06, + "loss": 1.3668, + "mean_token_accuracy": 0.6618408660093943, + "num_tokens": 2218894075.0, + "step": 13232 + }, + { + "entropy": 1.651742806037267, + "epoch": 1.4537090439702287, + "grad_norm": 0.7507491707801819, + "learning_rate": 5.299632032426213e-06, + "loss": 1.3153, + "mean_token_accuracy": 0.6831634243329366, + "num_tokens": 2219006787.0, + "step": 13233 + }, + { + "entropy": 1.772430956363678, + "epoch": 1.4538189008816018, + "grad_norm": 0.7279871702194214, + "learning_rate": 5.298393096055674e-06, + "loss": 1.4669, + "mean_token_accuracy": 0.6384557783603668, + "num_tokens": 2219193713.0, + "step": 13234 + }, + { + "entropy": 1.6615471144517262, + "epoch": 1.4539287577929747, + "grad_norm": 0.7124606966972351, + "learning_rate": 5.297154340138419e-06, + "loss": 1.5806, + "mean_token_accuracy": 0.6216800361871719, + "num_tokens": 2219441667.0, + "step": 13235 + }, + { + "entropy": 1.6834536989529927, + "epoch": 1.4540386147043476, + "grad_norm": 0.6642992496490479, + "learning_rate": 5.295915764713666e-06, + "loss": 1.22, + "mean_token_accuracy": 0.673203244805336, + "num_tokens": 2219605960.0, + "step": 13236 + }, + { + "entropy": 1.7114764948685963, + "epoch": 1.4541484716157205, + "grad_norm": 0.6740455627441406, + "learning_rate": 5.294677369820605e-06, + "loss": 1.433, + "mean_token_accuracy": 0.6435778339703878, + "num_tokens": 2219763478.0, + "step": 13237 + }, + { + "entropy": 1.66608660419782, + "epoch": 1.4542583285270934, + "grad_norm": 0.6613836288452148, + "learning_rate": 5.293439155498435e-06, + "loss": 1.507, + "mean_token_accuracy": 0.6401470750570297, + "num_tokens": 2219939231.0, + "step": 13238 + }, + { + "entropy": 1.7152255574862163, + "epoch": 1.4543681854384665, + "grad_norm": 0.799233615398407, + "learning_rate": 5.292201121786345e-06, + "loss": 1.3541, + "mean_token_accuracy": 0.6578367203474045, + "num_tokens": 2220093449.0, + "step": 13239 + }, + { + "entropy": 1.6978593568007152, + "epoch": 1.4544780423498394, + "grad_norm": 0.6676912903785706, + "learning_rate": 5.290963268723517e-06, + "loss": 1.367, + "mean_token_accuracy": 0.6562477846940359, + "num_tokens": 2220241070.0, + "step": 13240 + }, + { + "entropy": 1.726622184117635, + "epoch": 1.4545878992612122, + "grad_norm": 0.804278552532196, + "learning_rate": 5.289725596349128e-06, + "loss": 1.3472, + "mean_token_accuracy": 0.6571770707766215, + "num_tokens": 2220390433.0, + "step": 13241 + }, + { + "entropy": 1.6938590904076893, + "epoch": 1.4546977561725853, + "grad_norm": 0.6081349849700928, + "learning_rate": 5.2884881047023516e-06, + "loss": 1.4959, + "mean_token_accuracy": 0.64292544623216, + "num_tokens": 2220584496.0, + "step": 13242 + }, + { + "entropy": 1.6884879171848297, + "epoch": 1.454807613083958, + "grad_norm": 0.7846350073814392, + "learning_rate": 5.287250793822352e-06, + "loss": 1.4016, + "mean_token_accuracy": 0.6731372624635696, + "num_tokens": 2220768356.0, + "step": 13243 + }, + { + "entropy": 1.6507751047611237, + "epoch": 1.4549174699953311, + "grad_norm": 0.5410248637199402, + "learning_rate": 5.286013663748292e-06, + "loss": 1.4474, + "mean_token_accuracy": 0.6416066288948059, + "num_tokens": 2220995309.0, + "step": 13244 + }, + { + "entropy": 1.7409183184305828, + "epoch": 1.455027326906704, + "grad_norm": 0.7013614773750305, + "learning_rate": 5.284776714519326e-06, + "loss": 1.4582, + "mean_token_accuracy": 0.6478712111711502, + "num_tokens": 2221198221.0, + "step": 13245 + }, + { + "entropy": 1.7032929261525471, + "epoch": 1.4551371838180769, + "grad_norm": 0.6887391209602356, + "learning_rate": 5.2835399461745965e-06, + "loss": 1.4032, + "mean_token_accuracy": 0.6849873264630636, + "num_tokens": 2221400687.0, + "step": 13246 + }, + { + "entropy": 1.6999091704686482, + "epoch": 1.45524704072945, + "grad_norm": 0.7940466403961182, + "learning_rate": 5.2823033587532545e-06, + "loss": 1.2728, + "mean_token_accuracy": 0.675998126467069, + "num_tokens": 2221549840.0, + "step": 13247 + }, + { + "entropy": 1.657990833123525, + "epoch": 1.4553568976408229, + "grad_norm": 0.6163055896759033, + "learning_rate": 5.281066952294436e-06, + "loss": 1.4401, + "mean_token_accuracy": 0.6552244772513708, + "num_tokens": 2221785884.0, + "step": 13248 + }, + { + "entropy": 1.7675037880738576, + "epoch": 1.4554667545521958, + "grad_norm": 0.6566433310508728, + "learning_rate": 5.2798307268372714e-06, + "loss": 1.453, + "mean_token_accuracy": 0.6563322295745214, + "num_tokens": 2221930165.0, + "step": 13249 + }, + { + "entropy": 1.7534189720948536, + "epoch": 1.4555766114635686, + "grad_norm": 0.659052848815918, + "learning_rate": 5.2785946824208845e-06, + "loss": 1.4248, + "mean_token_accuracy": 0.6439740558465322, + "num_tokens": 2222126390.0, + "step": 13250 + }, + { + "entropy": 1.693113644917806, + "epoch": 1.4556864683749415, + "grad_norm": 0.7174281477928162, + "learning_rate": 5.277358819084401e-06, + "loss": 1.4873, + "mean_token_accuracy": 0.6578061381975809, + "num_tokens": 2222313824.0, + "step": 13251 + }, + { + "entropy": 1.6609934270381927, + "epoch": 1.4557963252863146, + "grad_norm": 0.5869874954223633, + "learning_rate": 5.276123136866931e-06, + "loss": 1.3664, + "mean_token_accuracy": 0.6591756095488867, + "num_tokens": 2222498021.0, + "step": 13252 + }, + { + "entropy": 1.6955777903397877, + "epoch": 1.4559061821976875, + "grad_norm": 0.736538290977478, + "learning_rate": 5.274887635807584e-06, + "loss": 1.3698, + "mean_token_accuracy": 0.6608079870541891, + "num_tokens": 2222666538.0, + "step": 13253 + }, + { + "entropy": 1.7050454417864482, + "epoch": 1.4560160391090604, + "grad_norm": 0.7196376323699951, + "learning_rate": 5.273652315945464e-06, + "loss": 1.3256, + "mean_token_accuracy": 0.6665113717317581, + "num_tokens": 2222791127.0, + "step": 13254 + }, + { + "entropy": 1.7047736942768097, + "epoch": 1.4561258960204335, + "grad_norm": 0.787895917892456, + "learning_rate": 5.2724171773196665e-06, + "loss": 1.4962, + "mean_token_accuracy": 0.6398163984219233, + "num_tokens": 2222997665.0, + "step": 13255 + }, + { + "entropy": 1.7137849926948547, + "epoch": 1.4562357529318064, + "grad_norm": 0.8692581057548523, + "learning_rate": 5.271182219969286e-06, + "loss": 1.3989, + "mean_token_accuracy": 0.6480956127246221, + "num_tokens": 2223183484.0, + "step": 13256 + }, + { + "entropy": 1.7420273820559184, + "epoch": 1.4563456098431793, + "grad_norm": 0.7221806049346924, + "learning_rate": 5.269947443933408e-06, + "loss": 1.3607, + "mean_token_accuracy": 0.6581288725137711, + "num_tokens": 2223324598.0, + "step": 13257 + }, + { + "entropy": 1.624452531337738, + "epoch": 1.4564554667545522, + "grad_norm": 0.6961126327514648, + "learning_rate": 5.2687128492511075e-06, + "loss": 1.4168, + "mean_token_accuracy": 0.6610220770041147, + "num_tokens": 2223496057.0, + "step": 13258 + }, + { + "entropy": 1.6884084045886993, + "epoch": 1.456565323665925, + "grad_norm": 1.0178797245025635, + "learning_rate": 5.267478435961462e-06, + "loss": 1.3582, + "mean_token_accuracy": 0.665848026672999, + "num_tokens": 2223654632.0, + "step": 13259 + }, + { + "entropy": 1.6696566045284271, + "epoch": 1.4566751805772982, + "grad_norm": 0.6774824261665344, + "learning_rate": 5.266244204103548e-06, + "loss": 1.343, + "mean_token_accuracy": 0.6636465241511663, + "num_tokens": 2223806523.0, + "step": 13260 + }, + { + "entropy": 1.7208555539449055, + "epoch": 1.456785037488671, + "grad_norm": 0.6381753087043762, + "learning_rate": 5.265010153716415e-06, + "loss": 1.3636, + "mean_token_accuracy": 0.6624864041805267, + "num_tokens": 2223993713.0, + "step": 13261 + }, + { + "entropy": 1.697748472293218, + "epoch": 1.456894894400044, + "grad_norm": 0.7066287994384766, + "learning_rate": 5.263776284839126e-06, + "loss": 1.2882, + "mean_token_accuracy": 0.6692610581715902, + "num_tokens": 2224121187.0, + "step": 13262 + }, + { + "entropy": 1.6702364484469097, + "epoch": 1.4570047513114168, + "grad_norm": 0.6550009250640869, + "learning_rate": 5.2625425975107366e-06, + "loss": 1.5535, + "mean_token_accuracy": 0.6461095362901688, + "num_tokens": 2224294928.0, + "step": 13263 + }, + { + "entropy": 1.671968440214793, + "epoch": 1.4571146082227897, + "grad_norm": 0.7919005751609802, + "learning_rate": 5.261309091770288e-06, + "loss": 1.3144, + "mean_token_accuracy": 0.6730043093363444, + "num_tokens": 2224442529.0, + "step": 13264 + }, + { + "entropy": 1.6805502672990162, + "epoch": 1.4572244651341628, + "grad_norm": 0.6840505599975586, + "learning_rate": 5.260075767656818e-06, + "loss": 1.3058, + "mean_token_accuracy": 0.6673836757739385, + "num_tokens": 2224580676.0, + "step": 13265 + }, + { + "entropy": 1.723142812649409, + "epoch": 1.4573343220455357, + "grad_norm": 0.7580272555351257, + "learning_rate": 5.258842625209367e-06, + "loss": 1.4996, + "mean_token_accuracy": 0.641799122095108, + "num_tokens": 2224774246.0, + "step": 13266 + }, + { + "entropy": 1.702703317006429, + "epoch": 1.4574441789569086, + "grad_norm": 1.6131107807159424, + "learning_rate": 5.257609664466956e-06, + "loss": 0.9713, + "mean_token_accuracy": 0.6902973006169001, + "num_tokens": 2224908967.0, + "step": 13267 + }, + { + "entropy": 1.7422963480154674, + "epoch": 1.4575540358682817, + "grad_norm": 0.830781102180481, + "learning_rate": 5.256376885468615e-06, + "loss": 1.5733, + "mean_token_accuracy": 0.6498822967211405, + "num_tokens": 2225102321.0, + "step": 13268 + }, + { + "entropy": 1.7461797297000885, + "epoch": 1.4576638927796546, + "grad_norm": 0.7430237531661987, + "learning_rate": 5.255144288253357e-06, + "loss": 1.408, + "mean_token_accuracy": 0.6493665178616842, + "num_tokens": 2225238306.0, + "step": 13269 + }, + { + "entropy": 1.6967601478099823, + "epoch": 1.4577737496910275, + "grad_norm": 0.7158797979354858, + "learning_rate": 5.253911872860191e-06, + "loss": 1.2596, + "mean_token_accuracy": 0.6785516838232676, + "num_tokens": 2225363858.0, + "step": 13270 + }, + { + "entropy": 1.7182322641213734, + "epoch": 1.4578836066024004, + "grad_norm": 0.5696946978569031, + "learning_rate": 5.252679639328125e-06, + "loss": 1.5107, + "mean_token_accuracy": 0.6287727604309717, + "num_tokens": 2225608182.0, + "step": 13271 + }, + { + "entropy": 1.7183633248011272, + "epoch": 1.4579934635137732, + "grad_norm": 0.8003261685371399, + "learning_rate": 5.2514475876961655e-06, + "loss": 1.3841, + "mean_token_accuracy": 0.6599841763575872, + "num_tokens": 2225734061.0, + "step": 13272 + }, + { + "entropy": 1.6641955971717834, + "epoch": 1.4581033204251463, + "grad_norm": 0.6576728820800781, + "learning_rate": 5.250215718003293e-06, + "loss": 1.2564, + "mean_token_accuracy": 0.6735943456490835, + "num_tokens": 2225892115.0, + "step": 13273 + }, + { + "entropy": 1.6959330240885417, + "epoch": 1.4582131773365192, + "grad_norm": 0.7016428112983704, + "learning_rate": 5.2489840302885e-06, + "loss": 1.2863, + "mean_token_accuracy": 0.665631502866745, + "num_tokens": 2226025946.0, + "step": 13274 + }, + { + "entropy": 1.6741726001103718, + "epoch": 1.4583230342478921, + "grad_norm": 0.7208593487739563, + "learning_rate": 5.247752524590776e-06, + "loss": 1.4776, + "mean_token_accuracy": 0.6564379036426544, + "num_tokens": 2226179358.0, + "step": 13275 + }, + { + "entropy": 1.682859222094218, + "epoch": 1.458432891159265, + "grad_norm": 0.7038945555686951, + "learning_rate": 5.246521200949093e-06, + "loss": 1.3761, + "mean_token_accuracy": 0.6612346222003301, + "num_tokens": 2226343715.0, + "step": 13276 + }, + { + "entropy": 1.7394512792428334, + "epoch": 1.4585427480706379, + "grad_norm": 0.778741180896759, + "learning_rate": 5.245290059402417e-06, + "loss": 1.3302, + "mean_token_accuracy": 0.6651216298341751, + "num_tokens": 2226515575.0, + "step": 13277 + }, + { + "entropy": 1.7190166016419728, + "epoch": 1.458652604982011, + "grad_norm": 0.6520856022834778, + "learning_rate": 5.24405909998972e-06, + "loss": 1.2736, + "mean_token_accuracy": 0.6683216094970703, + "num_tokens": 2226636398.0, + "step": 13278 + }, + { + "entropy": 1.736037790775299, + "epoch": 1.4587624618933839, + "grad_norm": 0.7920129299163818, + "learning_rate": 5.242828322749958e-06, + "loss": 1.4525, + "mean_token_accuracy": 0.6551641374826431, + "num_tokens": 2226803388.0, + "step": 13279 + }, + { + "entropy": 1.693130115667979, + "epoch": 1.4588723188047568, + "grad_norm": 0.6134016513824463, + "learning_rate": 5.241597727722088e-06, + "loss": 1.3696, + "mean_token_accuracy": 0.6638036072254181, + "num_tokens": 2226951444.0, + "step": 13280 + }, + { + "entropy": 1.7208605806032817, + "epoch": 1.4589821757161299, + "grad_norm": 0.7502411603927612, + "learning_rate": 5.240367314945054e-06, + "loss": 1.4216, + "mean_token_accuracy": 0.6602864662806193, + "num_tokens": 2227074694.0, + "step": 13281 + }, + { + "entropy": 1.6609876056512196, + "epoch": 1.4590920326275028, + "grad_norm": 0.6450250744819641, + "learning_rate": 5.239137084457795e-06, + "loss": 1.3909, + "mean_token_accuracy": 0.656602198878924, + "num_tokens": 2227238549.0, + "step": 13282 + }, + { + "entropy": 1.6838933726151784, + "epoch": 1.4592018895388756, + "grad_norm": 0.6704497337341309, + "learning_rate": 5.2379070362992525e-06, + "loss": 1.2862, + "mean_token_accuracy": 0.6715071648359299, + "num_tokens": 2227393583.0, + "step": 13283 + }, + { + "entropy": 1.7411844432353973, + "epoch": 1.4593117464502485, + "grad_norm": 0.7020705342292786, + "learning_rate": 5.236677170508363e-06, + "loss": 1.7397, + "mean_token_accuracy": 0.6055120974779129, + "num_tokens": 2227595673.0, + "step": 13284 + }, + { + "entropy": 1.6689063012599945, + "epoch": 1.4594216033616214, + "grad_norm": 0.7527252435684204, + "learning_rate": 5.235447487124037e-06, + "loss": 1.296, + "mean_token_accuracy": 0.6730232934157053, + "num_tokens": 2227767400.0, + "step": 13285 + }, + { + "entropy": 1.6576413909594219, + "epoch": 1.4595314602729945, + "grad_norm": 0.7640280723571777, + "learning_rate": 5.234217986185201e-06, + "loss": 1.4514, + "mean_token_accuracy": 0.6499234984318415, + "num_tokens": 2227956387.0, + "step": 13286 + }, + { + "entropy": 1.7037740747133892, + "epoch": 1.4596413171843674, + "grad_norm": 0.6326615214347839, + "learning_rate": 5.23298866773077e-06, + "loss": 1.4418, + "mean_token_accuracy": 0.6500868995984396, + "num_tokens": 2228170702.0, + "step": 13287 + }, + { + "entropy": 1.7462695737679799, + "epoch": 1.4597511740957403, + "grad_norm": 0.8364010453224182, + "learning_rate": 5.231759531799649e-06, + "loss": 1.4275, + "mean_token_accuracy": 0.6656624972820282, + "num_tokens": 2228377715.0, + "step": 13288 + }, + { + "entropy": 1.707568754752477, + "epoch": 1.4598610310071132, + "grad_norm": 0.820871889591217, + "learning_rate": 5.230530578430737e-06, + "loss": 1.284, + "mean_token_accuracy": 0.6757774303356806, + "num_tokens": 2228529705.0, + "step": 13289 + }, + { + "entropy": 1.7350817521413167, + "epoch": 1.459970887918486, + "grad_norm": 0.7010106444358826, + "learning_rate": 5.229301807662937e-06, + "loss": 1.3151, + "mean_token_accuracy": 0.6694407761096954, + "num_tokens": 2228686318.0, + "step": 13290 + }, + { + "entropy": 1.7383721967538197, + "epoch": 1.4600807448298592, + "grad_norm": 0.5950272679328918, + "learning_rate": 5.228073219535128e-06, + "loss": 1.6986, + "mean_token_accuracy": 0.6260428552826246, + "num_tokens": 2228943488.0, + "step": 13291 + }, + { + "entropy": 1.7025316456953685, + "epoch": 1.460190601741232, + "grad_norm": 0.7957791090011597, + "learning_rate": 5.226844814086206e-06, + "loss": 1.3464, + "mean_token_accuracy": 0.6687312970558802, + "num_tokens": 2229072202.0, + "step": 13292 + }, + { + "entropy": 1.671481430530548, + "epoch": 1.460300458652605, + "grad_norm": 0.7400625348091125, + "learning_rate": 5.2256165913550425e-06, + "loss": 1.3572, + "mean_token_accuracy": 0.6515611658493677, + "num_tokens": 2229265397.0, + "step": 13293 + }, + { + "entropy": 1.6869538923104603, + "epoch": 1.460410315563978, + "grad_norm": 0.7097235321998596, + "learning_rate": 5.22438855138051e-06, + "loss": 1.3306, + "mean_token_accuracy": 0.6697370956341425, + "num_tokens": 2229437081.0, + "step": 13294 + }, + { + "entropy": 1.7623671690622966, + "epoch": 1.460520172475351, + "grad_norm": 0.7999326586723328, + "learning_rate": 5.223160694201477e-06, + "loss": 1.4252, + "mean_token_accuracy": 0.6673903316259384, + "num_tokens": 2229599506.0, + "step": 13295 + }, + { + "entropy": 1.699026753505071, + "epoch": 1.4606300293867238, + "grad_norm": 0.6532884240150452, + "learning_rate": 5.221933019856813e-06, + "loss": 1.2955, + "mean_token_accuracy": 0.6673917869726816, + "num_tokens": 2229733052.0, + "step": 13296 + }, + { + "entropy": 1.6547558307647705, + "epoch": 1.4607398862980967, + "grad_norm": 0.6670539975166321, + "learning_rate": 5.220705528385357e-06, + "loss": 1.2823, + "mean_token_accuracy": 0.669133797287941, + "num_tokens": 2229887116.0, + "step": 13297 + }, + { + "entropy": 1.6800335148970287, + "epoch": 1.4608497432094696, + "grad_norm": 0.7013092041015625, + "learning_rate": 5.219478219825969e-06, + "loss": 1.2742, + "mean_token_accuracy": 0.6806422223647436, + "num_tokens": 2230068473.0, + "step": 13298 + }, + { + "entropy": 1.7160185774167378, + "epoch": 1.4609596001208427, + "grad_norm": 0.706506073474884, + "learning_rate": 5.2182510942174904e-06, + "loss": 1.3858, + "mean_token_accuracy": 0.6637303580840429, + "num_tokens": 2230208267.0, + "step": 13299 + }, + { + "entropy": 1.7366038858890533, + "epoch": 1.4610694570322156, + "grad_norm": 0.7497095465660095, + "learning_rate": 5.217024151598759e-06, + "loss": 1.656, + "mean_token_accuracy": 0.6435926059881846, + "num_tokens": 2230359536.0, + "step": 13300 + }, + { + "entropy": 1.6858433783054352, + "epoch": 1.4611793139435885, + "grad_norm": 0.6425523161888123, + "learning_rate": 5.21579739200861e-06, + "loss": 1.3171, + "mean_token_accuracy": 0.6678670247395834, + "num_tokens": 2230496567.0, + "step": 13301 + }, + { + "entropy": 1.712745487689972, + "epoch": 1.4612891708549614, + "grad_norm": 0.746553897857666, + "learning_rate": 5.214570815485865e-06, + "loss": 1.3764, + "mean_token_accuracy": 0.6616001923878988, + "num_tokens": 2230655773.0, + "step": 13302 + }, + { + "entropy": 1.681189884742101, + "epoch": 1.4613990277663342, + "grad_norm": 0.6803275346755981, + "learning_rate": 5.213344422069344e-06, + "loss": 1.4087, + "mean_token_accuracy": 0.6556661377350489, + "num_tokens": 2230883918.0, + "step": 13303 + }, + { + "entropy": 1.7151753803094227, + "epoch": 1.4615088846777073, + "grad_norm": 0.6623923778533936, + "learning_rate": 5.212118211797868e-06, + "loss": 1.5692, + "mean_token_accuracy": 0.6383712540070215, + "num_tokens": 2231057143.0, + "step": 13304 + }, + { + "entropy": 1.7497306366761525, + "epoch": 1.4616187415890802, + "grad_norm": 0.682961106300354, + "learning_rate": 5.210892184710243e-06, + "loss": 1.2886, + "mean_token_accuracy": 0.6717896262804667, + "num_tokens": 2231220320.0, + "step": 13305 + }, + { + "entropy": 1.6684472461541493, + "epoch": 1.4617285985004531, + "grad_norm": 0.8259005546569824, + "learning_rate": 5.209666340845268e-06, + "loss": 1.5261, + "mean_token_accuracy": 0.6499257162213326, + "num_tokens": 2231385621.0, + "step": 13306 + }, + { + "entropy": 1.644069214661916, + "epoch": 1.4618384554118262, + "grad_norm": 0.6260018944740295, + "learning_rate": 5.2084406802417484e-06, + "loss": 1.4294, + "mean_token_accuracy": 0.6403475701808929, + "num_tokens": 2231582756.0, + "step": 13307 + }, + { + "entropy": 1.7100238502025604, + "epoch": 1.461948312323199, + "grad_norm": 0.7612260580062866, + "learning_rate": 5.207215202938471e-06, + "loss": 1.4929, + "mean_token_accuracy": 0.6612754563490549, + "num_tokens": 2231709892.0, + "step": 13308 + }, + { + "entropy": 1.6849770645300548, + "epoch": 1.462058169234572, + "grad_norm": 0.7276026606559753, + "learning_rate": 5.205989908974218e-06, + "loss": 1.4184, + "mean_token_accuracy": 0.6592111438512802, + "num_tokens": 2231854359.0, + "step": 13309 + }, + { + "entropy": 1.698674072821935, + "epoch": 1.4621680261459449, + "grad_norm": 0.6991817951202393, + "learning_rate": 5.204764798387778e-06, + "loss": 1.4016, + "mean_token_accuracy": 0.6530411044756571, + "num_tokens": 2232053954.0, + "step": 13310 + }, + { + "entropy": 1.6900160908699036, + "epoch": 1.4622778830573178, + "grad_norm": 0.6570863127708435, + "learning_rate": 5.203539871217918e-06, + "loss": 1.4676, + "mean_token_accuracy": 0.6459223727385203, + "num_tokens": 2232234666.0, + "step": 13311 + }, + { + "entropy": 1.69076007604599, + "epoch": 1.4623877399686909, + "grad_norm": 0.8549068570137024, + "learning_rate": 5.202315127503411e-06, + "loss": 1.1945, + "mean_token_accuracy": 0.6830791085958481, + "num_tokens": 2232351276.0, + "step": 13312 + }, + { + "entropy": 1.7095185021559398, + "epoch": 1.4624975968800638, + "grad_norm": 0.7216442823410034, + "learning_rate": 5.201090567283019e-06, + "loss": 1.3842, + "mean_token_accuracy": 0.6556618362665176, + "num_tokens": 2232487699.0, + "step": 13313 + }, + { + "entropy": 1.7185616195201874, + "epoch": 1.4626074537914366, + "grad_norm": 0.6242141723632812, + "learning_rate": 5.1998661905954925e-06, + "loss": 1.2984, + "mean_token_accuracy": 0.6604965478181839, + "num_tokens": 2232640944.0, + "step": 13314 + }, + { + "entropy": 1.7047783136367798, + "epoch": 1.4627173107028095, + "grad_norm": 0.7990993857383728, + "learning_rate": 5.1986419974795895e-06, + "loss": 1.3772, + "mean_token_accuracy": 0.6700956672430038, + "num_tokens": 2232790937.0, + "step": 13315 + }, + { + "entropy": 1.6940909028053284, + "epoch": 1.4628271676141824, + "grad_norm": 0.6506087779998779, + "learning_rate": 5.197417987974056e-06, + "loss": 1.3113, + "mean_token_accuracy": 0.6701582570870718, + "num_tokens": 2232959603.0, + "step": 13316 + }, + { + "entropy": 1.7363331615924835, + "epoch": 1.4629370245255555, + "grad_norm": 0.668000340461731, + "learning_rate": 5.196194162117627e-06, + "loss": 1.4191, + "mean_token_accuracy": 0.6679625312487284, + "num_tokens": 2233101967.0, + "step": 13317 + }, + { + "entropy": 1.64047638575236, + "epoch": 1.4630468814369284, + "grad_norm": 0.7590168714523315, + "learning_rate": 5.194970519949035e-06, + "loss": 1.3215, + "mean_token_accuracy": 0.6645925690730413, + "num_tokens": 2233240156.0, + "step": 13318 + }, + { + "entropy": 1.64529550075531, + "epoch": 1.4631567383483013, + "grad_norm": 0.6108586192131042, + "learning_rate": 5.193747061507015e-06, + "loss": 1.4469, + "mean_token_accuracy": 0.65325299402078, + "num_tokens": 2233428355.0, + "step": 13319 + }, + { + "entropy": 1.7181050678094227, + "epoch": 1.4632665952596744, + "grad_norm": 0.6974697113037109, + "learning_rate": 5.1925237868302815e-06, + "loss": 1.4742, + "mean_token_accuracy": 0.6386492003997167, + "num_tokens": 2233622238.0, + "step": 13320 + }, + { + "entropy": 1.6982758343219757, + "epoch": 1.4633764521710473, + "grad_norm": 0.6342235207557678, + "learning_rate": 5.1913006959575515e-06, + "loss": 1.4225, + "mean_token_accuracy": 0.6377789328495661, + "num_tokens": 2233805645.0, + "step": 13321 + }, + { + "entropy": 1.6800893247127533, + "epoch": 1.4634863090824202, + "grad_norm": 0.7242743372917175, + "learning_rate": 5.19007778892754e-06, + "loss": 1.2808, + "mean_token_accuracy": 0.6725705116987228, + "num_tokens": 2233968192.0, + "step": 13322 + }, + { + "entropy": 1.7239407698313396, + "epoch": 1.463596165993793, + "grad_norm": 0.6715902090072632, + "learning_rate": 5.188855065778946e-06, + "loss": 1.4202, + "mean_token_accuracy": 0.6526324351628622, + "num_tokens": 2234137698.0, + "step": 13323 + }, + { + "entropy": 1.7070810496807098, + "epoch": 1.463706022905166, + "grad_norm": 0.6120285987854004, + "learning_rate": 5.187632526550472e-06, + "loss": 1.3874, + "mean_token_accuracy": 0.6460235466559728, + "num_tokens": 2234325035.0, + "step": 13324 + }, + { + "entropy": 1.738279104232788, + "epoch": 1.463815879816539, + "grad_norm": 0.776631236076355, + "learning_rate": 5.1864101712808115e-06, + "loss": 1.4277, + "mean_token_accuracy": 0.6572244515021642, + "num_tokens": 2234503600.0, + "step": 13325 + }, + { + "entropy": 1.6941909690697987, + "epoch": 1.463925736727912, + "grad_norm": 0.6817474961280823, + "learning_rate": 5.185188000008645e-06, + "loss": 1.251, + "mean_token_accuracy": 0.6787453691164652, + "num_tokens": 2234640034.0, + "step": 13326 + }, + { + "entropy": 1.7105824053287506, + "epoch": 1.4640355936392848, + "grad_norm": 0.6769583821296692, + "learning_rate": 5.183966012772657e-06, + "loss": 1.3502, + "mean_token_accuracy": 0.6620890498161316, + "num_tokens": 2234800322.0, + "step": 13327 + }, + { + "entropy": 1.7065203487873077, + "epoch": 1.4641454505506577, + "grad_norm": 0.7230082154273987, + "learning_rate": 5.18274420961153e-06, + "loss": 1.2583, + "mean_token_accuracy": 0.6677148640155792, + "num_tokens": 2234902867.0, + "step": 13328 + }, + { + "entropy": 1.6912482976913452, + "epoch": 1.4642553074620306, + "grad_norm": 0.6191965937614441, + "learning_rate": 5.181522590563925e-06, + "loss": 1.3609, + "mean_token_accuracy": 0.6615066925684611, + "num_tokens": 2235083263.0, + "step": 13329 + }, + { + "entropy": 1.7631979684034984, + "epoch": 1.4643651643734037, + "grad_norm": 0.6107144355773926, + "learning_rate": 5.180301155668506e-06, + "loss": 1.5027, + "mean_token_accuracy": 0.634604016939799, + "num_tokens": 2235264330.0, + "step": 13330 + }, + { + "entropy": 1.641640196243922, + "epoch": 1.4644750212847766, + "grad_norm": 0.5961340665817261, + "learning_rate": 5.179079904963936e-06, + "loss": 1.2884, + "mean_token_accuracy": 0.6740356385707855, + "num_tokens": 2235492587.0, + "step": 13331 + }, + { + "entropy": 1.68045578400294, + "epoch": 1.4645848781961495, + "grad_norm": 0.774403989315033, + "learning_rate": 5.177858838488864e-06, + "loss": 1.3224, + "mean_token_accuracy": 0.6614306718111038, + "num_tokens": 2235653770.0, + "step": 13332 + }, + { + "entropy": 1.6718662977218628, + "epoch": 1.4646947351075226, + "grad_norm": 0.730610728263855, + "learning_rate": 5.176637956281934e-06, + "loss": 1.4181, + "mean_token_accuracy": 0.6523456772168478, + "num_tokens": 2235821839.0, + "step": 13333 + }, + { + "entropy": 1.7122483650843303, + "epoch": 1.4648045920188955, + "grad_norm": 0.7947407960891724, + "learning_rate": 5.175417258381789e-06, + "loss": 1.2752, + "mean_token_accuracy": 0.6747554838657379, + "num_tokens": 2235967588.0, + "step": 13334 + }, + { + "entropy": 1.692185898621877, + "epoch": 1.4649144489302683, + "grad_norm": 0.7685208320617676, + "learning_rate": 5.174196744827063e-06, + "loss": 1.5189, + "mean_token_accuracy": 0.6478336552778879, + "num_tokens": 2236139430.0, + "step": 13335 + }, + { + "entropy": 1.7424322664737701, + "epoch": 1.4650243058416412, + "grad_norm": 0.8064534068107605, + "learning_rate": 5.172976415656385e-06, + "loss": 1.3833, + "mean_token_accuracy": 0.6672340482473373, + "num_tokens": 2236303607.0, + "step": 13336 + }, + { + "entropy": 1.7538822293281555, + "epoch": 1.4651341627530141, + "grad_norm": 0.6886154413223267, + "learning_rate": 5.171756270908381e-06, + "loss": 1.4997, + "mean_token_accuracy": 0.6409474760293961, + "num_tokens": 2236475795.0, + "step": 13337 + }, + { + "entropy": 1.6770086487134297, + "epoch": 1.4652440196643872, + "grad_norm": 0.7982631325721741, + "learning_rate": 5.170536310621661e-06, + "loss": 1.2743, + "mean_token_accuracy": 0.6710592210292816, + "num_tokens": 2236606141.0, + "step": 13338 + }, + { + "entropy": 1.7379637956619263, + "epoch": 1.46535387657576, + "grad_norm": 0.7204148173332214, + "learning_rate": 5.169316534834838e-06, + "loss": 1.39, + "mean_token_accuracy": 0.6521616876125336, + "num_tokens": 2236743945.0, + "step": 13339 + }, + { + "entropy": 1.764588902393977, + "epoch": 1.465463733487133, + "grad_norm": 0.7058348655700684, + "learning_rate": 5.168096943586527e-06, + "loss": 1.3477, + "mean_token_accuracy": 0.6545776476462682, + "num_tokens": 2236858423.0, + "step": 13340 + }, + { + "entropy": 1.7272200087706249, + "epoch": 1.4655735903985059, + "grad_norm": 0.7121959924697876, + "learning_rate": 5.166877536915313e-06, + "loss": 1.2527, + "mean_token_accuracy": 0.6780747969945272, + "num_tokens": 2236992537.0, + "step": 13341 + }, + { + "entropy": 1.7803178131580353, + "epoch": 1.4656834473098788, + "grad_norm": 0.7428368926048279, + "learning_rate": 5.165658314859798e-06, + "loss": 1.3522, + "mean_token_accuracy": 0.6523331006368002, + "num_tokens": 2237127144.0, + "step": 13342 + }, + { + "entropy": 1.7277946869532268, + "epoch": 1.4657933042212519, + "grad_norm": 0.7097252607345581, + "learning_rate": 5.164439277458569e-06, + "loss": 1.3009, + "mean_token_accuracy": 0.6694452812274297, + "num_tokens": 2237262457.0, + "step": 13343 + }, + { + "entropy": 1.6825427611668904, + "epoch": 1.4659031611326248, + "grad_norm": 0.723095178604126, + "learning_rate": 5.163220424750209e-06, + "loss": 1.5455, + "mean_token_accuracy": 0.6467806448539098, + "num_tokens": 2237409509.0, + "step": 13344 + }, + { + "entropy": 1.6955235799153645, + "epoch": 1.4660130180439976, + "grad_norm": 0.6675035357475281, + "learning_rate": 5.162001756773289e-06, + "loss": 1.5552, + "mean_token_accuracy": 0.6430306434631348, + "num_tokens": 2237551260.0, + "step": 13345 + }, + { + "entropy": 1.7177290419737499, + "epoch": 1.4661228749553707, + "grad_norm": 0.76436448097229, + "learning_rate": 5.160783273566385e-06, + "loss": 1.3634, + "mean_token_accuracy": 0.6647897511720657, + "num_tokens": 2237675393.0, + "step": 13346 + }, + { + "entropy": 1.6530766189098358, + "epoch": 1.4662327318667436, + "grad_norm": 0.613264799118042, + "learning_rate": 5.1595649751680575e-06, + "loss": 1.414, + "mean_token_accuracy": 0.6538712580998739, + "num_tokens": 2237881038.0, + "step": 13347 + }, + { + "entropy": 1.7521977821985881, + "epoch": 1.4663425887781165, + "grad_norm": 0.5653010010719299, + "learning_rate": 5.1583468616168685e-06, + "loss": 1.5067, + "mean_token_accuracy": 0.6414316246906916, + "num_tokens": 2238070216.0, + "step": 13348 + }, + { + "entropy": 1.7145592470963795, + "epoch": 1.4664524456894894, + "grad_norm": 0.6285973787307739, + "learning_rate": 5.157128932951369e-06, + "loss": 1.344, + "mean_token_accuracy": 0.668559322754542, + "num_tokens": 2238222293.0, + "step": 13349 + }, + { + "entropy": 1.7896797955036163, + "epoch": 1.4665623026008623, + "grad_norm": 0.6855942606925964, + "learning_rate": 5.155911189210105e-06, + "loss": 1.3821, + "mean_token_accuracy": 0.6510206758975983, + "num_tokens": 2238356636.0, + "step": 13350 + }, + { + "entropy": 1.6923208236694336, + "epoch": 1.4666721595122354, + "grad_norm": 0.6504180431365967, + "learning_rate": 5.154693630431617e-06, + "loss": 1.4542, + "mean_token_accuracy": 0.6632367918888727, + "num_tokens": 2238572585.0, + "step": 13351 + }, + { + "entropy": 1.7046404878298442, + "epoch": 1.4667820164236083, + "grad_norm": 0.6475574970245361, + "learning_rate": 5.153476256654448e-06, + "loss": 1.4873, + "mean_token_accuracy": 0.6398185839255651, + "num_tokens": 2238780003.0, + "step": 13352 + }, + { + "entropy": 1.6447254419326782, + "epoch": 1.4668918733349812, + "grad_norm": 0.6468961834907532, + "learning_rate": 5.1522590679171135e-06, + "loss": 1.4823, + "mean_token_accuracy": 0.6522268503904343, + "num_tokens": 2238954299.0, + "step": 13353 + }, + { + "entropy": 1.7484534879525502, + "epoch": 1.467001730246354, + "grad_norm": 0.8634352684020996, + "learning_rate": 5.151042064258145e-06, + "loss": 1.4664, + "mean_token_accuracy": 0.6500384410222372, + "num_tokens": 2239151290.0, + "step": 13354 + }, + { + "entropy": 1.7502192457516987, + "epoch": 1.467111587157727, + "grad_norm": 0.6729628443717957, + "learning_rate": 5.149825245716063e-06, + "loss": 1.421, + "mean_token_accuracy": 0.6504283597071966, + "num_tokens": 2239313609.0, + "step": 13355 + }, + { + "entropy": 1.658990353345871, + "epoch": 1.4672214440691, + "grad_norm": 0.7221643328666687, + "learning_rate": 5.148608612329378e-06, + "loss": 1.3597, + "mean_token_accuracy": 0.658984954158465, + "num_tokens": 2239519569.0, + "step": 13356 + }, + { + "entropy": 1.6480421324570973, + "epoch": 1.467331300980473, + "grad_norm": 0.578301727771759, + "learning_rate": 5.147392164136591e-06, + "loss": 1.3677, + "mean_token_accuracy": 0.6631327817837397, + "num_tokens": 2239722966.0, + "step": 13357 + }, + { + "entropy": 1.6934454341729481, + "epoch": 1.4674411578918458, + "grad_norm": 0.65192711353302, + "learning_rate": 5.146175901176203e-06, + "loss": 1.3089, + "mean_token_accuracy": 0.6639690001805624, + "num_tokens": 2239902756.0, + "step": 13358 + }, + { + "entropy": 1.7277617851893108, + "epoch": 1.467551014803219, + "grad_norm": 0.617236316204071, + "learning_rate": 5.144959823486708e-06, + "loss": 1.5418, + "mean_token_accuracy": 0.6341453293959299, + "num_tokens": 2240145800.0, + "step": 13359 + }, + { + "entropy": 1.6705620487531025, + "epoch": 1.4676608717145918, + "grad_norm": 0.6375599503517151, + "learning_rate": 5.1437439311066006e-06, + "loss": 1.3709, + "mean_token_accuracy": 0.6495741556088129, + "num_tokens": 2240318927.0, + "step": 13360 + }, + { + "entropy": 1.658727725346883, + "epoch": 1.4677707286259647, + "grad_norm": 0.6619511842727661, + "learning_rate": 5.142528224074359e-06, + "loss": 1.4811, + "mean_token_accuracy": 0.6575326571861903, + "num_tokens": 2240495823.0, + "step": 13361 + }, + { + "entropy": 1.669166515270869, + "epoch": 1.4678805855373376, + "grad_norm": 0.7321708798408508, + "learning_rate": 5.141312702428456e-06, + "loss": 1.3142, + "mean_token_accuracy": 0.6671041746934255, + "num_tokens": 2240668352.0, + "step": 13362 + }, + { + "entropy": 1.7291592756907146, + "epoch": 1.4679904424487105, + "grad_norm": 0.7490743398666382, + "learning_rate": 5.140097366207371e-06, + "loss": 1.3883, + "mean_token_accuracy": 0.6642330040534338, + "num_tokens": 2240837521.0, + "step": 13363 + }, + { + "entropy": 1.6821561257044475, + "epoch": 1.4681002993600836, + "grad_norm": 0.6379314661026001, + "learning_rate": 5.138882215449561e-06, + "loss": 1.2651, + "mean_token_accuracy": 0.6746839582920074, + "num_tokens": 2240975191.0, + "step": 13364 + }, + { + "entropy": 1.699595848719279, + "epoch": 1.4682101562714565, + "grad_norm": 0.7495248913764954, + "learning_rate": 5.137667250193487e-06, + "loss": 1.3034, + "mean_token_accuracy": 0.6714814802010854, + "num_tokens": 2241136736.0, + "step": 13365 + }, + { + "entropy": 1.6805048982302349, + "epoch": 1.4683200131828293, + "grad_norm": 0.7307020425796509, + "learning_rate": 5.136452470477605e-06, + "loss": 1.5607, + "mean_token_accuracy": 0.6429369499286016, + "num_tokens": 2241302570.0, + "step": 13366 + }, + { + "entropy": 1.6858853499094646, + "epoch": 1.4684298700942022, + "grad_norm": 0.7530251741409302, + "learning_rate": 5.135237876340357e-06, + "loss": 1.4323, + "mean_token_accuracy": 0.6499339739481608, + "num_tokens": 2241459406.0, + "step": 13367 + }, + { + "entropy": 1.7072277665138245, + "epoch": 1.4685397270055751, + "grad_norm": 0.7187586426734924, + "learning_rate": 5.1340234678201905e-06, + "loss": 1.4593, + "mean_token_accuracy": 0.6429513593514761, + "num_tokens": 2241601911.0, + "step": 13368 + }, + { + "entropy": 1.7173350950082142, + "epoch": 1.4686495839169482, + "grad_norm": 0.6327357888221741, + "learning_rate": 5.132809244955538e-06, + "loss": 1.3817, + "mean_token_accuracy": 0.6499977658192316, + "num_tokens": 2241787986.0, + "step": 13369 + }, + { + "entropy": 1.7320161958535512, + "epoch": 1.468759440828321, + "grad_norm": 0.5910692811012268, + "learning_rate": 5.131595207784826e-06, + "loss": 1.5099, + "mean_token_accuracy": 0.6356032888094584, + "num_tokens": 2241965938.0, + "step": 13370 + }, + { + "entropy": 1.6744110186894734, + "epoch": 1.468869297739694, + "grad_norm": 0.6589808464050293, + "learning_rate": 5.130381356346482e-06, + "loss": 1.4489, + "mean_token_accuracy": 0.6553682386875153, + "num_tokens": 2242170006.0, + "step": 13371 + }, + { + "entropy": 1.6842441360155742, + "epoch": 1.468979154651067, + "grad_norm": 0.6548722386360168, + "learning_rate": 5.129167690678926e-06, + "loss": 1.4434, + "mean_token_accuracy": 0.6569918642441431, + "num_tokens": 2242349906.0, + "step": 13372 + }, + { + "entropy": 1.6717151006062825, + "epoch": 1.46908901156244, + "grad_norm": 0.7034960985183716, + "learning_rate": 5.127954210820566e-06, + "loss": 1.2131, + "mean_token_accuracy": 0.6854620377222697, + "num_tokens": 2242479249.0, + "step": 13373 + }, + { + "entropy": 1.7261487344900768, + "epoch": 1.4691988684738129, + "grad_norm": 0.6977003812789917, + "learning_rate": 5.126740916809807e-06, + "loss": 1.4922, + "mean_token_accuracy": 0.6372009714444479, + "num_tokens": 2242681993.0, + "step": 13374 + }, + { + "entropy": 1.6832520266373951, + "epoch": 1.4693087253851858, + "grad_norm": 0.6861147284507751, + "learning_rate": 5.125527808685054e-06, + "loss": 1.4987, + "mean_token_accuracy": 0.6465383569399515, + "num_tokens": 2242854318.0, + "step": 13375 + }, + { + "entropy": 1.6995068192481995, + "epoch": 1.4694185822965586, + "grad_norm": 0.6299582123756409, + "learning_rate": 5.1243148864847e-06, + "loss": 1.3544, + "mean_token_accuracy": 0.6593434164921442, + "num_tokens": 2243028642.0, + "step": 13376 + }, + { + "entropy": 1.6980251967906952, + "epoch": 1.4695284392079317, + "grad_norm": 0.7787138223648071, + "learning_rate": 5.1231021502471275e-06, + "loss": 1.4421, + "mean_token_accuracy": 0.6528284599383672, + "num_tokens": 2243237493.0, + "step": 13377 + }, + { + "entropy": 1.717189719279607, + "epoch": 1.4696382961193046, + "grad_norm": 0.6779033541679382, + "learning_rate": 5.121889600010727e-06, + "loss": 1.5139, + "mean_token_accuracy": 0.6445932437976202, + "num_tokens": 2243438772.0, + "step": 13378 + }, + { + "entropy": 1.7422145505746205, + "epoch": 1.4697481530306775, + "grad_norm": 0.6294360756874084, + "learning_rate": 5.120677235813871e-06, + "loss": 1.3356, + "mean_token_accuracy": 0.6672088205814362, + "num_tokens": 2243605382.0, + "step": 13379 + }, + { + "entropy": 1.72358504931132, + "epoch": 1.4698580099420504, + "grad_norm": 0.7390589118003845, + "learning_rate": 5.1194650576949326e-06, + "loss": 1.3306, + "mean_token_accuracy": 0.6668838312228521, + "num_tokens": 2243725776.0, + "step": 13380 + }, + { + "entropy": 1.7053417166074116, + "epoch": 1.4699678668534233, + "grad_norm": 0.7105720043182373, + "learning_rate": 5.118253065692276e-06, + "loss": 1.4912, + "mean_token_accuracy": 0.6499655246734619, + "num_tokens": 2243901685.0, + "step": 13381 + }, + { + "entropy": 1.7427148222923279, + "epoch": 1.4700777237647964, + "grad_norm": 0.640266478061676, + "learning_rate": 5.117041259844256e-06, + "loss": 1.4751, + "mean_token_accuracy": 0.6555547416210175, + "num_tokens": 2244055138.0, + "step": 13382 + }, + { + "entropy": 1.7193391521771748, + "epoch": 1.4701875806761693, + "grad_norm": 0.7918505668640137, + "learning_rate": 5.115829640189229e-06, + "loss": 1.3312, + "mean_token_accuracy": 0.6624412635962168, + "num_tokens": 2244239813.0, + "step": 13383 + }, + { + "entropy": 1.6076705555121105, + "epoch": 1.4702974375875422, + "grad_norm": 0.700372040271759, + "learning_rate": 5.1146182067655445e-06, + "loss": 1.2775, + "mean_token_accuracy": 0.6678995142380396, + "num_tokens": 2244452998.0, + "step": 13384 + }, + { + "entropy": 1.7358313500881195, + "epoch": 1.4704072944989153, + "grad_norm": 0.6558439135551453, + "learning_rate": 5.113406959611545e-06, + "loss": 1.4388, + "mean_token_accuracy": 0.6488740295171738, + "num_tokens": 2244601173.0, + "step": 13385 + }, + { + "entropy": 1.7473020454247792, + "epoch": 1.4705171514102882, + "grad_norm": 0.704651415348053, + "learning_rate": 5.112195898765557e-06, + "loss": 1.5605, + "mean_token_accuracy": 0.6439760675032934, + "num_tokens": 2244776293.0, + "step": 13386 + }, + { + "entropy": 1.702320804198583, + "epoch": 1.470627008321661, + "grad_norm": 0.7008829712867737, + "learning_rate": 5.110985024265917e-06, + "loss": 1.4391, + "mean_token_accuracy": 0.6583433995644251, + "num_tokens": 2244939458.0, + "step": 13387 + }, + { + "entropy": 1.700803816318512, + "epoch": 1.470736865233034, + "grad_norm": 0.7770166993141174, + "learning_rate": 5.109774336150951e-06, + "loss": 1.4417, + "mean_token_accuracy": 0.6354402701059977, + "num_tokens": 2245145754.0, + "step": 13388 + }, + { + "entropy": 1.7102882862091064, + "epoch": 1.4708467221444068, + "grad_norm": 0.7285779118537903, + "learning_rate": 5.108563834458969e-06, + "loss": 1.4532, + "mean_token_accuracy": 0.6507706940174103, + "num_tokens": 2245356512.0, + "step": 13389 + }, + { + "entropy": 1.7527295649051666, + "epoch": 1.47095657905578, + "grad_norm": 0.793194055557251, + "learning_rate": 5.107353519228289e-06, + "loss": 1.2389, + "mean_token_accuracy": 0.6625605672597885, + "num_tokens": 2245496613.0, + "step": 13390 + }, + { + "entropy": 1.723008652528127, + "epoch": 1.4710664359671528, + "grad_norm": 0.6528907418251038, + "learning_rate": 5.106143390497211e-06, + "loss": 1.2856, + "mean_token_accuracy": 0.6656887034575144, + "num_tokens": 2245622811.0, + "step": 13391 + }, + { + "entropy": 1.6672236522038777, + "epoch": 1.4711762928785257, + "grad_norm": 0.5492684245109558, + "learning_rate": 5.1049334483040436e-06, + "loss": 1.3844, + "mean_token_accuracy": 0.6572863310575485, + "num_tokens": 2245825018.0, + "step": 13392 + }, + { + "entropy": 1.695264220237732, + "epoch": 1.4712861497898986, + "grad_norm": 0.6142525672912598, + "learning_rate": 5.103723692687076e-06, + "loss": 1.3716, + "mean_token_accuracy": 0.6567343175411224, + "num_tokens": 2246014688.0, + "step": 13393 + }, + { + "entropy": 1.7013497749964397, + "epoch": 1.4713960067012715, + "grad_norm": 0.6703829765319824, + "learning_rate": 5.102514123684594e-06, + "loss": 1.3899, + "mean_token_accuracy": 0.6593191623687744, + "num_tokens": 2246182122.0, + "step": 13394 + }, + { + "entropy": 1.7167363564173381, + "epoch": 1.4715058636126446, + "grad_norm": 0.7111037373542786, + "learning_rate": 5.101304741334883e-06, + "loss": 1.4901, + "mean_token_accuracy": 0.6491454988718033, + "num_tokens": 2246416059.0, + "step": 13395 + }, + { + "entropy": 1.6614757577578227, + "epoch": 1.4716157205240175, + "grad_norm": 0.6759634017944336, + "learning_rate": 5.10009554567622e-06, + "loss": 1.2934, + "mean_token_accuracy": 0.6705714662869772, + "num_tokens": 2246560395.0, + "step": 13396 + }, + { + "entropy": 1.6646903554598491, + "epoch": 1.4717255774353903, + "grad_norm": 0.6592537760734558, + "learning_rate": 5.0988865367468746e-06, + "loss": 1.2862, + "mean_token_accuracy": 0.6767180810372034, + "num_tokens": 2246720000.0, + "step": 13397 + }, + { + "entropy": 1.7495815654595692, + "epoch": 1.4718354343467634, + "grad_norm": 0.8151704668998718, + "learning_rate": 5.09767771458511e-06, + "loss": 1.5063, + "mean_token_accuracy": 0.6574457635482153, + "num_tokens": 2246873756.0, + "step": 13398 + }, + { + "entropy": 1.6735199590524037, + "epoch": 1.4719452912581363, + "grad_norm": 0.6688457131385803, + "learning_rate": 5.096469079229187e-06, + "loss": 1.3796, + "mean_token_accuracy": 0.6598214159409205, + "num_tokens": 2247046154.0, + "step": 13399 + }, + { + "entropy": 1.6267732282479603, + "epoch": 1.4720551481695092, + "grad_norm": 0.6082208156585693, + "learning_rate": 5.095260630717358e-06, + "loss": 1.3601, + "mean_token_accuracy": 0.6803909589846929, + "num_tokens": 2247214410.0, + "step": 13400 + }, + { + "entropy": 1.7172918021678925, + "epoch": 1.472165005080882, + "grad_norm": 0.7851716876029968, + "learning_rate": 5.0940523690878665e-06, + "loss": 1.2889, + "mean_token_accuracy": 0.6563605020443598, + "num_tokens": 2247347179.0, + "step": 13401 + }, + { + "entropy": 1.674240271250407, + "epoch": 1.472274861992255, + "grad_norm": 0.7133161425590515, + "learning_rate": 5.092844294378959e-06, + "loss": 1.6189, + "mean_token_accuracy": 0.65032958984375, + "num_tokens": 2247510287.0, + "step": 13402 + }, + { + "entropy": 1.7008503377437592, + "epoch": 1.472384718903628, + "grad_norm": 0.7033988237380981, + "learning_rate": 5.091636406628866e-06, + "loss": 1.4201, + "mean_token_accuracy": 0.6558716595172882, + "num_tokens": 2247651329.0, + "step": 13403 + }, + { + "entropy": 1.6965941190719604, + "epoch": 1.472494575815001, + "grad_norm": 0.7866081595420837, + "learning_rate": 5.090428705875821e-06, + "loss": 1.465, + "mean_token_accuracy": 0.6466370224952698, + "num_tokens": 2247825351.0, + "step": 13404 + }, + { + "entropy": 1.656824787457784, + "epoch": 1.4726044327263739, + "grad_norm": 0.7088666558265686, + "learning_rate": 5.089221192158043e-06, + "loss": 1.3121, + "mean_token_accuracy": 0.6727252850929896, + "num_tokens": 2248030695.0, + "step": 13405 + }, + { + "entropy": 1.7407319247722626, + "epoch": 1.4727142896377468, + "grad_norm": 0.7755722999572754, + "learning_rate": 5.088013865513749e-06, + "loss": 1.5342, + "mean_token_accuracy": 0.6439951807260513, + "num_tokens": 2248239120.0, + "step": 13406 + }, + { + "entropy": 1.6667874654134114, + "epoch": 1.4728241465491196, + "grad_norm": 0.6196835041046143, + "learning_rate": 5.086806725981153e-06, + "loss": 1.388, + "mean_token_accuracy": 0.667399138212204, + "num_tokens": 2248406198.0, + "step": 13407 + }, + { + "entropy": 1.7022119263807933, + "epoch": 1.4729340034604927, + "grad_norm": 0.6414451599121094, + "learning_rate": 5.08559977359846e-06, + "loss": 1.3475, + "mean_token_accuracy": 0.6679307371377945, + "num_tokens": 2248546706.0, + "step": 13408 + }, + { + "entropy": 1.67433958252271, + "epoch": 1.4730438603718656, + "grad_norm": 0.7130979895591736, + "learning_rate": 5.0843930084038696e-06, + "loss": 1.2636, + "mean_token_accuracy": 0.6698357065518697, + "num_tokens": 2248687310.0, + "step": 13409 + }, + { + "entropy": 1.658446768919627, + "epoch": 1.4731537172832385, + "grad_norm": 0.7370253801345825, + "learning_rate": 5.083186430435574e-06, + "loss": 1.4569, + "mean_token_accuracy": 0.6521903574466705, + "num_tokens": 2248871430.0, + "step": 13410 + }, + { + "entropy": 1.725239743789037, + "epoch": 1.4732635741946116, + "grad_norm": 0.6639283299446106, + "learning_rate": 5.0819800397317635e-06, + "loss": 1.3176, + "mean_token_accuracy": 0.6676472028096517, + "num_tokens": 2249035961.0, + "step": 13411 + }, + { + "entropy": 1.7703583141167958, + "epoch": 1.4733734311059845, + "grad_norm": 0.6325090527534485, + "learning_rate": 5.0807738363306165e-06, + "loss": 1.4219, + "mean_token_accuracy": 0.6441073268651962, + "num_tokens": 2249163917.0, + "step": 13412 + }, + { + "entropy": 1.7586402297019958, + "epoch": 1.4734832880173574, + "grad_norm": 0.6557300686836243, + "learning_rate": 5.0795678202703104e-06, + "loss": 1.4752, + "mean_token_accuracy": 0.6431727459033331, + "num_tokens": 2249359791.0, + "step": 13413 + }, + { + "entropy": 1.7578627566496532, + "epoch": 1.4735931449287303, + "grad_norm": 0.7130923271179199, + "learning_rate": 5.078361991589016e-06, + "loss": 1.3846, + "mean_token_accuracy": 0.6526891241470972, + "num_tokens": 2249514293.0, + "step": 13414 + }, + { + "entropy": 1.7255384922027588, + "epoch": 1.4737030018401032, + "grad_norm": 0.6839941740036011, + "learning_rate": 5.0771563503248944e-06, + "loss": 1.3951, + "mean_token_accuracy": 0.6575382997592291, + "num_tokens": 2249650382.0, + "step": 13415 + }, + { + "entropy": 1.6533535917599995, + "epoch": 1.4738128587514763, + "grad_norm": 0.6555220484733582, + "learning_rate": 5.075950896516107e-06, + "loss": 1.2708, + "mean_token_accuracy": 0.6787864863872528, + "num_tokens": 2249800802.0, + "step": 13416 + }, + { + "entropy": 1.7047683497269948, + "epoch": 1.4739227156628492, + "grad_norm": 0.7065283060073853, + "learning_rate": 5.074745630200806e-06, + "loss": 1.3563, + "mean_token_accuracy": 0.673611119389534, + "num_tokens": 2249944260.0, + "step": 13417 + }, + { + "entropy": 1.7077392141024272, + "epoch": 1.474032572574222, + "grad_norm": 0.7069240212440491, + "learning_rate": 5.073540551417131e-06, + "loss": 1.2927, + "mean_token_accuracy": 0.664387916525205, + "num_tokens": 2250053815.0, + "step": 13418 + }, + { + "entropy": 1.675834854443868, + "epoch": 1.474142429485595, + "grad_norm": 0.6132168173789978, + "learning_rate": 5.072335660203231e-06, + "loss": 1.5704, + "mean_token_accuracy": 0.6342288305362066, + "num_tokens": 2250337503.0, + "step": 13419 + }, + { + "entropy": 1.7231556475162506, + "epoch": 1.4742522863969678, + "grad_norm": 0.8251075744628906, + "learning_rate": 5.071130956597236e-06, + "loss": 1.3726, + "mean_token_accuracy": 0.6538362056016922, + "num_tokens": 2250483346.0, + "step": 13420 + }, + { + "entropy": 1.7046967844168346, + "epoch": 1.474362143308341, + "grad_norm": 0.7439585328102112, + "learning_rate": 5.069926440637272e-06, + "loss": 1.4672, + "mean_token_accuracy": 0.6513397047917048, + "num_tokens": 2250658255.0, + "step": 13421 + }, + { + "entropy": 1.756682167450587, + "epoch": 1.4744720002197138, + "grad_norm": 0.7163110375404358, + "learning_rate": 5.068722112361466e-06, + "loss": 1.5754, + "mean_token_accuracy": 0.6236685266097387, + "num_tokens": 2250872770.0, + "step": 13422 + }, + { + "entropy": 1.7206588784853618, + "epoch": 1.4745818571310867, + "grad_norm": 0.6744549870491028, + "learning_rate": 5.067517971807931e-06, + "loss": 1.4174, + "mean_token_accuracy": 0.6684413055578867, + "num_tokens": 2251059371.0, + "step": 13423 + }, + { + "entropy": 1.6995096405347188, + "epoch": 1.4746917140424598, + "grad_norm": 0.8165592551231384, + "learning_rate": 5.066314019014781e-06, + "loss": 1.1724, + "mean_token_accuracy": 0.6960208316644033, + "num_tokens": 2251185914.0, + "step": 13424 + }, + { + "entropy": 1.6548355321089427, + "epoch": 1.4748015709538327, + "grad_norm": 0.6455921530723572, + "learning_rate": 5.065110254020118e-06, + "loss": 1.5029, + "mean_token_accuracy": 0.6502973834673563, + "num_tokens": 2251393117.0, + "step": 13425 + }, + { + "entropy": 1.6629098852475483, + "epoch": 1.4749114278652056, + "grad_norm": 0.8761639595031738, + "learning_rate": 5.063906676862039e-06, + "loss": 1.3805, + "mean_token_accuracy": 0.6616864850123724, + "num_tokens": 2251558769.0, + "step": 13426 + }, + { + "entropy": 1.7066173553466797, + "epoch": 1.4750212847765785, + "grad_norm": 0.7723012566566467, + "learning_rate": 5.062703287578638e-06, + "loss": 1.3899, + "mean_token_accuracy": 0.65970512231191, + "num_tokens": 2251741724.0, + "step": 13427 + }, + { + "entropy": 1.6936110059420268, + "epoch": 1.4751311416879513, + "grad_norm": 0.5787246823310852, + "learning_rate": 5.061500086208007e-06, + "loss": 1.3812, + "mean_token_accuracy": 0.662451446056366, + "num_tokens": 2251977302.0, + "step": 13428 + }, + { + "entropy": 1.721141795317332, + "epoch": 1.4752409985993244, + "grad_norm": 0.5804960131645203, + "learning_rate": 5.060297072788221e-06, + "loss": 1.5953, + "mean_token_accuracy": 0.6185376693805059, + "num_tokens": 2252212559.0, + "step": 13429 + }, + { + "entropy": 1.7097849945227306, + "epoch": 1.4753508555106973, + "grad_norm": 0.7100227475166321, + "learning_rate": 5.059094247357354e-06, + "loss": 1.4159, + "mean_token_accuracy": 0.6612443824609121, + "num_tokens": 2252360518.0, + "step": 13430 + }, + { + "entropy": 1.6556785504023235, + "epoch": 1.4754607124220702, + "grad_norm": 0.6368470788002014, + "learning_rate": 5.05789160995348e-06, + "loss": 1.5139, + "mean_token_accuracy": 0.65046127140522, + "num_tokens": 2252576094.0, + "step": 13431 + }, + { + "entropy": 1.7037680546442668, + "epoch": 1.4755705693334433, + "grad_norm": 0.7826297879219055, + "learning_rate": 5.056689160614659e-06, + "loss": 1.3208, + "mean_token_accuracy": 0.6799842069546381, + "num_tokens": 2252703056.0, + "step": 13432 + }, + { + "entropy": 1.7210968534151714, + "epoch": 1.475680426244816, + "grad_norm": 0.8060481548309326, + "learning_rate": 5.055486899378944e-06, + "loss": 1.419, + "mean_token_accuracy": 0.6568311204512914, + "num_tokens": 2252857460.0, + "step": 13433 + }, + { + "entropy": 1.677445928255717, + "epoch": 1.475790283156189, + "grad_norm": 0.5863375067710876, + "learning_rate": 5.054284826284393e-06, + "loss": 1.4257, + "mean_token_accuracy": 0.6570474654436111, + "num_tokens": 2253067045.0, + "step": 13434 + }, + { + "entropy": 1.7341270844141643, + "epoch": 1.475900140067562, + "grad_norm": 0.594789445400238, + "learning_rate": 5.053082941369045e-06, + "loss": 1.3536, + "mean_token_accuracy": 0.6686640679836273, + "num_tokens": 2253265121.0, + "step": 13435 + }, + { + "entropy": 1.7422856092453003, + "epoch": 1.4760099969789349, + "grad_norm": 0.7513555884361267, + "learning_rate": 5.051881244670947e-06, + "loss": 1.4024, + "mean_token_accuracy": 0.6600150018930435, + "num_tokens": 2253440176.0, + "step": 13436 + }, + { + "entropy": 1.6602600614229839, + "epoch": 1.476119853890308, + "grad_norm": 0.6317045092582703, + "learning_rate": 5.050679736228125e-06, + "loss": 1.5157, + "mean_token_accuracy": 0.6511089901129404, + "num_tokens": 2253643777.0, + "step": 13437 + }, + { + "entropy": 1.7466975152492523, + "epoch": 1.4762297108016809, + "grad_norm": 0.647018313407898, + "learning_rate": 5.049478416078608e-06, + "loss": 1.3602, + "mean_token_accuracy": 0.6553884297609329, + "num_tokens": 2253771315.0, + "step": 13438 + }, + { + "entropy": 1.682116021712621, + "epoch": 1.4763395677130537, + "grad_norm": 0.8301473259925842, + "learning_rate": 5.048277284260416e-06, + "loss": 1.3466, + "mean_token_accuracy": 0.6662062009175619, + "num_tokens": 2253950173.0, + "step": 13439 + }, + { + "entropy": 1.7308276693026226, + "epoch": 1.4764494246244266, + "grad_norm": 0.8368297219276428, + "learning_rate": 5.047076340811569e-06, + "loss": 1.3698, + "mean_token_accuracy": 0.6646293699741364, + "num_tokens": 2254076894.0, + "step": 13440 + }, + { + "entropy": 1.7010807593663533, + "epoch": 1.4765592815357995, + "grad_norm": 0.6222649216651917, + "learning_rate": 5.0458755857700725e-06, + "loss": 1.3895, + "mean_token_accuracy": 0.6515608131885529, + "num_tokens": 2254301385.0, + "step": 13441 + }, + { + "entropy": 1.6313113868236542, + "epoch": 1.4766691384471726, + "grad_norm": 0.6975681185722351, + "learning_rate": 5.04467501917393e-06, + "loss": 1.1905, + "mean_token_accuracy": 0.6868863999843597, + "num_tokens": 2254455186.0, + "step": 13442 + }, + { + "entropy": 1.6619862020015717, + "epoch": 1.4767789953585455, + "grad_norm": 0.5563586354255676, + "learning_rate": 5.043474641061141e-06, + "loss": 1.4175, + "mean_token_accuracy": 0.6464668810367584, + "num_tokens": 2254700519.0, + "step": 13443 + }, + { + "entropy": 1.6394928991794586, + "epoch": 1.4768888522699184, + "grad_norm": 0.6185762882232666, + "learning_rate": 5.042274451469696e-06, + "loss": 1.3622, + "mean_token_accuracy": 0.6612973709901174, + "num_tokens": 2254900856.0, + "step": 13444 + }, + { + "entropy": 1.7395354708035786, + "epoch": 1.4769987091812915, + "grad_norm": 0.7197256684303284, + "learning_rate": 5.041074450437577e-06, + "loss": 1.3647, + "mean_token_accuracy": 0.6561250587304434, + "num_tokens": 2255066819.0, + "step": 13445 + }, + { + "entropy": 1.7001774807771046, + "epoch": 1.4771085660926644, + "grad_norm": 0.753657341003418, + "learning_rate": 5.039874638002771e-06, + "loss": 1.3652, + "mean_token_accuracy": 0.6524695505698522, + "num_tokens": 2255201358.0, + "step": 13446 + }, + { + "entropy": 1.6709985435009003, + "epoch": 1.4772184230040373, + "grad_norm": 0.6185214519500732, + "learning_rate": 5.038675014203243e-06, + "loss": 1.1932, + "mean_token_accuracy": 0.6734795669714609, + "num_tokens": 2255383642.0, + "step": 13447 + }, + { + "entropy": 1.7464906374613445, + "epoch": 1.4773282799154102, + "grad_norm": 0.6656374931335449, + "learning_rate": 5.037475579076966e-06, + "loss": 1.3962, + "mean_token_accuracy": 0.6644060959418615, + "num_tokens": 2255534586.0, + "step": 13448 + }, + { + "entropy": 1.685450941324234, + "epoch": 1.477438136826783, + "grad_norm": 0.808623731136322, + "learning_rate": 5.0362763326619e-06, + "loss": 1.4986, + "mean_token_accuracy": 0.6441106796264648, + "num_tokens": 2255683821.0, + "step": 13449 + }, + { + "entropy": 1.7671352128187816, + "epoch": 1.4775479937381562, + "grad_norm": 0.7077687382698059, + "learning_rate": 5.0350772749960004e-06, + "loss": 1.1849, + "mean_token_accuracy": 0.6855556517839432, + "num_tokens": 2255778360.0, + "step": 13450 + }, + { + "entropy": 1.633953034877777, + "epoch": 1.477657850649529, + "grad_norm": 0.6457007527351379, + "learning_rate": 5.033878406117215e-06, + "loss": 1.6397, + "mean_token_accuracy": 0.6252560267845789, + "num_tokens": 2256008403.0, + "step": 13451 + }, + { + "entropy": 1.6935044626394908, + "epoch": 1.477767707560902, + "grad_norm": 0.7718963623046875, + "learning_rate": 5.032679726063494e-06, + "loss": 1.2658, + "mean_token_accuracy": 0.6831946323315302, + "num_tokens": 2256131286.0, + "step": 13452 + }, + { + "entropy": 1.6343218088150024, + "epoch": 1.4778775644722748, + "grad_norm": 0.6215181350708008, + "learning_rate": 5.03148123487277e-06, + "loss": 1.3287, + "mean_token_accuracy": 0.6735940128564835, + "num_tokens": 2256326690.0, + "step": 13453 + }, + { + "entropy": 1.7170305450757344, + "epoch": 1.4779874213836477, + "grad_norm": 0.6218499541282654, + "learning_rate": 5.030282932582972e-06, + "loss": 1.4629, + "mean_token_accuracy": 0.6319693426291147, + "num_tokens": 2256500661.0, + "step": 13454 + }, + { + "entropy": 1.6966708103815715, + "epoch": 1.4780972782950208, + "grad_norm": 0.7736793756484985, + "learning_rate": 5.0290848192320344e-06, + "loss": 1.465, + "mean_token_accuracy": 0.6512386153141657, + "num_tokens": 2256690681.0, + "step": 13455 + }, + { + "entropy": 1.6986188689867656, + "epoch": 1.4782071352063937, + "grad_norm": 0.8579681515693665, + "learning_rate": 5.02788689485787e-06, + "loss": 1.4793, + "mean_token_accuracy": 0.6540632620453835, + "num_tokens": 2256859278.0, + "step": 13456 + }, + { + "entropy": 1.7905776103337605, + "epoch": 1.4783169921177666, + "grad_norm": 0.6369954347610474, + "learning_rate": 5.02668915949839e-06, + "loss": 1.5357, + "mean_token_accuracy": 0.651649167140325, + "num_tokens": 2257015967.0, + "step": 13457 + }, + { + "entropy": 1.7184994022051494, + "epoch": 1.4784268490291397, + "grad_norm": 0.5851972103118896, + "learning_rate": 5.025491613191511e-06, + "loss": 1.4093, + "mean_token_accuracy": 0.6455397953589758, + "num_tokens": 2257178746.0, + "step": 13458 + }, + { + "entropy": 1.694368600845337, + "epoch": 1.4785367059405126, + "grad_norm": 0.7007179856300354, + "learning_rate": 5.0242942559751275e-06, + "loss": 1.3101, + "mean_token_accuracy": 0.6777728994687399, + "num_tokens": 2257309719.0, + "step": 13459 + }, + { + "entropy": 1.741744190454483, + "epoch": 1.4786465628518854, + "grad_norm": 0.6834388971328735, + "learning_rate": 5.023097087887141e-06, + "loss": 1.3221, + "mean_token_accuracy": 0.6692759493986765, + "num_tokens": 2257454575.0, + "step": 13460 + }, + { + "entropy": 1.7196588615576427, + "epoch": 1.4787564197632583, + "grad_norm": 0.7268269658088684, + "learning_rate": 5.021900108965438e-06, + "loss": 1.2486, + "mean_token_accuracy": 0.6762056102355322, + "num_tokens": 2257600219.0, + "step": 13461 + }, + { + "entropy": 1.722573568423589, + "epoch": 1.4788662766746312, + "grad_norm": 0.7268519997596741, + "learning_rate": 5.0207033192479e-06, + "loss": 1.6339, + "mean_token_accuracy": 0.6259458661079407, + "num_tokens": 2257841931.0, + "step": 13462 + }, + { + "entropy": 1.6968311369419098, + "epoch": 1.4789761335860043, + "grad_norm": 0.6865484714508057, + "learning_rate": 5.019506718772407e-06, + "loss": 1.3771, + "mean_token_accuracy": 0.6622037986914316, + "num_tokens": 2258003647.0, + "step": 13463 + }, + { + "entropy": 1.6797509094079335, + "epoch": 1.4790859904973772, + "grad_norm": 0.6517234444618225, + "learning_rate": 5.018310307576835e-06, + "loss": 1.2848, + "mean_token_accuracy": 0.6743231564760208, + "num_tokens": 2258187580.0, + "step": 13464 + }, + { + "entropy": 1.7106235921382904, + "epoch": 1.47919584740875, + "grad_norm": 0.6443414092063904, + "learning_rate": 5.017114085699046e-06, + "loss": 1.4221, + "mean_token_accuracy": 0.6623661716779073, + "num_tokens": 2258378553.0, + "step": 13465 + }, + { + "entropy": 1.715536544720332, + "epoch": 1.479305704320123, + "grad_norm": 0.7782425284385681, + "learning_rate": 5.0159180531768985e-06, + "loss": 1.3477, + "mean_token_accuracy": 0.6619761238495508, + "num_tokens": 2258508761.0, + "step": 13466 + }, + { + "entropy": 1.7270130614439647, + "epoch": 1.4794155612314959, + "grad_norm": 0.6732892990112305, + "learning_rate": 5.014722210048251e-06, + "loss": 1.3856, + "mean_token_accuracy": 0.6509933620691299, + "num_tokens": 2258685957.0, + "step": 13467 + }, + { + "entropy": 1.73613902926445, + "epoch": 1.479525418142869, + "grad_norm": 0.6732542514801025, + "learning_rate": 5.0135265563509475e-06, + "loss": 1.3947, + "mean_token_accuracy": 0.6510529269774755, + "num_tokens": 2258863473.0, + "step": 13468 + }, + { + "entropy": 1.7343490421772003, + "epoch": 1.4796352750542419, + "grad_norm": 0.7667641639709473, + "learning_rate": 5.0123310921228265e-06, + "loss": 1.2903, + "mean_token_accuracy": 0.6681475838025411, + "num_tokens": 2259013861.0, + "step": 13469 + }, + { + "entropy": 1.7270642916361492, + "epoch": 1.4797451319656147, + "grad_norm": 0.7105488777160645, + "learning_rate": 5.011135817401733e-06, + "loss": 1.2928, + "mean_token_accuracy": 0.6620072424411774, + "num_tokens": 2259178477.0, + "step": 13470 + }, + { + "entropy": 1.7137524485588074, + "epoch": 1.4798549888769879, + "grad_norm": 0.7805687785148621, + "learning_rate": 5.009940732225489e-06, + "loss": 1.3888, + "mean_token_accuracy": 0.6767902622620264, + "num_tokens": 2259334869.0, + "step": 13471 + }, + { + "entropy": 1.7165914575258892, + "epoch": 1.4799648457883607, + "grad_norm": 0.7019093632698059, + "learning_rate": 5.008745836631925e-06, + "loss": 1.4593, + "mean_token_accuracy": 0.6598990907271703, + "num_tokens": 2259474030.0, + "step": 13472 + }, + { + "entropy": 1.6991265912850697, + "epoch": 1.4800747026997336, + "grad_norm": 0.6628867387771606, + "learning_rate": 5.007551130658857e-06, + "loss": 1.5422, + "mean_token_accuracy": 0.6464388569196066, + "num_tokens": 2259650297.0, + "step": 13473 + }, + { + "entropy": 1.6515828371047974, + "epoch": 1.4801845596111065, + "grad_norm": 0.622098982334137, + "learning_rate": 5.00635661434409e-06, + "loss": 1.3175, + "mean_token_accuracy": 0.6653083264827728, + "num_tokens": 2259822214.0, + "step": 13474 + }, + { + "entropy": 1.6867812772591908, + "epoch": 1.4802944165224794, + "grad_norm": 0.6378466486930847, + "learning_rate": 5.0051622877254355e-06, + "loss": 1.4009, + "mean_token_accuracy": 0.6632012327512106, + "num_tokens": 2259989102.0, + "step": 13475 + }, + { + "entropy": 1.683827131986618, + "epoch": 1.4804042734338525, + "grad_norm": 0.6706616282463074, + "learning_rate": 5.003968150840697e-06, + "loss": 1.4421, + "mean_token_accuracy": 0.6411708742380142, + "num_tokens": 2260172218.0, + "step": 13476 + }, + { + "entropy": 1.7033619185288746, + "epoch": 1.4805141303452254, + "grad_norm": 0.6120789051055908, + "learning_rate": 5.002774203727665e-06, + "loss": 1.3766, + "mean_token_accuracy": 0.6580955187479655, + "num_tokens": 2260328696.0, + "step": 13477 + }, + { + "entropy": 1.6510994335015614, + "epoch": 1.4806239872565983, + "grad_norm": 0.6583324670791626, + "learning_rate": 5.001580446424126e-06, + "loss": 1.2989, + "mean_token_accuracy": 0.6703228702147802, + "num_tokens": 2260527309.0, + "step": 13478 + }, + { + "entropy": 1.7263270119826, + "epoch": 1.4807338441679712, + "grad_norm": 0.8199454545974731, + "learning_rate": 5.00038687896786e-06, + "loss": 1.5701, + "mean_token_accuracy": 0.6611626545588175, + "num_tokens": 2260654170.0, + "step": 13479 + }, + { + "entropy": 1.6092917223771412, + "epoch": 1.480843701079344, + "grad_norm": 0.649434506893158, + "learning_rate": 4.999193501396651e-06, + "loss": 1.3777, + "mean_token_accuracy": 0.6647194971640905, + "num_tokens": 2260850430.0, + "step": 13480 + }, + { + "entropy": 1.7251697679360707, + "epoch": 1.4809535579907172, + "grad_norm": 0.6333536505699158, + "learning_rate": 4.998000313748261e-06, + "loss": 1.36, + "mean_token_accuracy": 0.6603184541066488, + "num_tokens": 2260977793.0, + "step": 13481 + }, + { + "entropy": 1.757962852716446, + "epoch": 1.48106341490209, + "grad_norm": 0.8047628402709961, + "learning_rate": 4.9968073160604545e-06, + "loss": 1.3915, + "mean_token_accuracy": 0.6611624906460444, + "num_tokens": 2261094020.0, + "step": 13482 + }, + { + "entropy": 1.6559306979179382, + "epoch": 1.481173271813463, + "grad_norm": 0.6404219269752502, + "learning_rate": 4.995614508370992e-06, + "loss": 1.3355, + "mean_token_accuracy": 0.6575894902149836, + "num_tokens": 2261227135.0, + "step": 13483 + }, + { + "entropy": 1.7070954938729603, + "epoch": 1.481283128724836, + "grad_norm": 0.749505341053009, + "learning_rate": 4.994421890717627e-06, + "loss": 1.2544, + "mean_token_accuracy": 0.6670717298984528, + "num_tokens": 2261352833.0, + "step": 13484 + }, + { + "entropy": 1.694403092066447, + "epoch": 1.481392985636209, + "grad_norm": 0.7458823919296265, + "learning_rate": 4.9932294631381025e-06, + "loss": 1.273, + "mean_token_accuracy": 0.6751887102921804, + "num_tokens": 2261469789.0, + "step": 13485 + }, + { + "entropy": 1.767996261517207, + "epoch": 1.4815028425475818, + "grad_norm": 0.7070764899253845, + "learning_rate": 4.992037225670156e-06, + "loss": 1.2761, + "mean_token_accuracy": 0.6802639961242676, + "num_tokens": 2261593206.0, + "step": 13486 + }, + { + "entropy": 1.6416256129741669, + "epoch": 1.4816126994589547, + "grad_norm": 0.6489446759223938, + "learning_rate": 4.990845178351528e-06, + "loss": 1.3192, + "mean_token_accuracy": 0.6602905988693237, + "num_tokens": 2261754373.0, + "step": 13487 + }, + { + "entropy": 1.7270208696524303, + "epoch": 1.4817225563703276, + "grad_norm": 0.5677620768547058, + "learning_rate": 4.989653321219938e-06, + "loss": 1.4977, + "mean_token_accuracy": 0.65739672879378, + "num_tokens": 2261959159.0, + "step": 13488 + }, + { + "entropy": 1.6949149171511333, + "epoch": 1.4818324132817007, + "grad_norm": 0.6971380114555359, + "learning_rate": 4.988461654313116e-06, + "loss": 1.3033, + "mean_token_accuracy": 0.6612110733985901, + "num_tokens": 2262083678.0, + "step": 13489 + }, + { + "entropy": 1.7387464841206868, + "epoch": 1.4819422701930736, + "grad_norm": 0.7100080251693726, + "learning_rate": 4.987270177668773e-06, + "loss": 1.2981, + "mean_token_accuracy": 0.6643347293138504, + "num_tokens": 2262239288.0, + "step": 13490 + }, + { + "entropy": 1.6716001530488331, + "epoch": 1.4820521271044464, + "grad_norm": 0.5574256181716919, + "learning_rate": 4.986078891324617e-06, + "loss": 1.4879, + "mean_token_accuracy": 0.6372141987085342, + "num_tokens": 2262514560.0, + "step": 13491 + }, + { + "entropy": 1.6284152368704479, + "epoch": 1.4821619840158193, + "grad_norm": 0.598142683506012, + "learning_rate": 4.9848877953183575e-06, + "loss": 1.3574, + "mean_token_accuracy": 0.6729972014824549, + "num_tokens": 2262727556.0, + "step": 13492 + }, + { + "entropy": 1.7167203028996785, + "epoch": 1.4822718409271922, + "grad_norm": 0.6333439946174622, + "learning_rate": 4.9836968896876885e-06, + "loss": 1.4435, + "mean_token_accuracy": 0.6489839653174082, + "num_tokens": 2262893662.0, + "step": 13493 + }, + { + "entropy": 1.72333358724912, + "epoch": 1.4823816978385653, + "grad_norm": 0.7966588139533997, + "learning_rate": 4.982506174470299e-06, + "loss": 1.6081, + "mean_token_accuracy": 0.6349592606226603, + "num_tokens": 2263116005.0, + "step": 13494 + }, + { + "entropy": 1.673676609992981, + "epoch": 1.4824915547499382, + "grad_norm": 0.6574962139129639, + "learning_rate": 4.981315649703877e-06, + "loss": 1.4094, + "mean_token_accuracy": 0.6473542600870132, + "num_tokens": 2263312879.0, + "step": 13495 + }, + { + "entropy": 1.6873856385548909, + "epoch": 1.482601411661311, + "grad_norm": 0.7907306551933289, + "learning_rate": 4.980125315426106e-06, + "loss": 1.4349, + "mean_token_accuracy": 0.6571491559346517, + "num_tokens": 2263522149.0, + "step": 13496 + }, + { + "entropy": 1.6957029402256012, + "epoch": 1.4827112685726842, + "grad_norm": 0.6600527763366699, + "learning_rate": 4.9789351716746555e-06, + "loss": 1.3383, + "mean_token_accuracy": 0.6599696377913157, + "num_tokens": 2263680544.0, + "step": 13497 + }, + { + "entropy": 1.716781238714854, + "epoch": 1.482821125484057, + "grad_norm": 0.7186543345451355, + "learning_rate": 4.9777452184871915e-06, + "loss": 1.299, + "mean_token_accuracy": 0.6676995903253555, + "num_tokens": 2263800785.0, + "step": 13498 + }, + { + "entropy": 1.7211474776268005, + "epoch": 1.48293098239543, + "grad_norm": 0.719422459602356, + "learning_rate": 4.97655545590138e-06, + "loss": 1.4209, + "mean_token_accuracy": 0.6576556066672007, + "num_tokens": 2263984167.0, + "step": 13499 + }, + { + "entropy": 1.7104551792144775, + "epoch": 1.4830408393068029, + "grad_norm": 0.7389053702354431, + "learning_rate": 4.9753658839548745e-06, + "loss": 1.4894, + "mean_token_accuracy": 0.6423831830422083, + "num_tokens": 2264212746.0, + "step": 13500 + }, + { + "entropy": 1.7473791042963664, + "epoch": 1.4831506962181757, + "grad_norm": 0.8776116967201233, + "learning_rate": 4.97417650268532e-06, + "loss": 1.3598, + "mean_token_accuracy": 0.6644879480202993, + "num_tokens": 2264366316.0, + "step": 13501 + }, + { + "entropy": 1.7517230312029521, + "epoch": 1.4832605531295489, + "grad_norm": 0.6201359033584595, + "learning_rate": 4.972987312130369e-06, + "loss": 1.4633, + "mean_token_accuracy": 0.6426295389731725, + "num_tokens": 2264580750.0, + "step": 13502 + }, + { + "entropy": 1.7220464249451954, + "epoch": 1.4833704100409217, + "grad_norm": 0.6405351758003235, + "learning_rate": 4.97179831232765e-06, + "loss": 1.4245, + "mean_token_accuracy": 0.6534823377927145, + "num_tokens": 2264733000.0, + "step": 13503 + }, + { + "entropy": 1.69864288965861, + "epoch": 1.4834802669522946, + "grad_norm": 0.624673068523407, + "learning_rate": 4.9706095033148e-06, + "loss": 1.3807, + "mean_token_accuracy": 0.6559500147898992, + "num_tokens": 2264898277.0, + "step": 13504 + }, + { + "entropy": 1.7481873134771984, + "epoch": 1.4835901238636675, + "grad_norm": 0.7108025550842285, + "learning_rate": 4.969420885129443e-06, + "loss": 1.4226, + "mean_token_accuracy": 0.6583486298720042, + "num_tokens": 2265061391.0, + "step": 13505 + }, + { + "entropy": 1.65755029519399, + "epoch": 1.4836999807750404, + "grad_norm": 0.6649439930915833, + "learning_rate": 4.968232457809195e-06, + "loss": 1.3684, + "mean_token_accuracy": 0.6610806783040365, + "num_tokens": 2265255559.0, + "step": 13506 + }, + { + "entropy": 1.7579138378302257, + "epoch": 1.4838098376864135, + "grad_norm": 0.686124324798584, + "learning_rate": 4.967044221391671e-06, + "loss": 1.438, + "mean_token_accuracy": 0.6544395188490549, + "num_tokens": 2265466450.0, + "step": 13507 + }, + { + "entropy": 1.743057797352473, + "epoch": 1.4839196945977864, + "grad_norm": 0.8767577409744263, + "learning_rate": 4.9658561759144815e-06, + "loss": 1.3654, + "mean_token_accuracy": 0.65755066772302, + "num_tokens": 2265596110.0, + "step": 13508 + }, + { + "entropy": 1.6500622431437175, + "epoch": 1.4840295515091593, + "grad_norm": 0.6735820174217224, + "learning_rate": 4.964668321415226e-06, + "loss": 1.2918, + "mean_token_accuracy": 0.6772434115409851, + "num_tokens": 2265768382.0, + "step": 13509 + }, + { + "entropy": 1.631582687298457, + "epoch": 1.4841394084205324, + "grad_norm": 0.6203559637069702, + "learning_rate": 4.963480657931496e-06, + "loss": 1.4507, + "mean_token_accuracy": 0.6736873388290405, + "num_tokens": 2265997734.0, + "step": 13510 + }, + { + "entropy": 1.7016975184281666, + "epoch": 1.4842492653319053, + "grad_norm": 0.8607683777809143, + "learning_rate": 4.9622931855008845e-06, + "loss": 1.2962, + "mean_token_accuracy": 0.6669759303331375, + "num_tokens": 2266140837.0, + "step": 13511 + }, + { + "entropy": 1.7488416135311127, + "epoch": 1.4843591222432782, + "grad_norm": 0.70814049243927, + "learning_rate": 4.961105904160974e-06, + "loss": 1.4504, + "mean_token_accuracy": 0.6467891732851664, + "num_tokens": 2266334509.0, + "step": 13512 + }, + { + "entropy": 1.6617756883303325, + "epoch": 1.484468979154651, + "grad_norm": 0.6905792355537415, + "learning_rate": 4.959918813949338e-06, + "loss": 1.4075, + "mean_token_accuracy": 0.6694660286108652, + "num_tokens": 2266497729.0, + "step": 13513 + }, + { + "entropy": 1.7315144042174022, + "epoch": 1.484578836066024, + "grad_norm": 0.6920166015625, + "learning_rate": 4.958731914903551e-06, + "loss": 1.302, + "mean_token_accuracy": 0.6789048910140991, + "num_tokens": 2266671543.0, + "step": 13514 + }, + { + "entropy": 1.6354870001475017, + "epoch": 1.484688692977397, + "grad_norm": 0.6212617754936218, + "learning_rate": 4.957545207061175e-06, + "loss": 1.3765, + "mean_token_accuracy": 0.6648024767637253, + "num_tokens": 2266861233.0, + "step": 13515 + }, + { + "entropy": 1.6495544612407684, + "epoch": 1.48479854988877, + "grad_norm": 0.7335532307624817, + "learning_rate": 4.956358690459772e-06, + "loss": 1.3946, + "mean_token_accuracy": 0.6647347460190455, + "num_tokens": 2267046550.0, + "step": 13516 + }, + { + "entropy": 1.704295853773753, + "epoch": 1.4849084068001428, + "grad_norm": 0.6387439966201782, + "learning_rate": 4.955172365136894e-06, + "loss": 1.4945, + "mean_token_accuracy": 0.6511774758497874, + "num_tokens": 2267211866.0, + "step": 13517 + }, + { + "entropy": 1.6961783468723297, + "epoch": 1.4850182637115157, + "grad_norm": 0.6863455176353455, + "learning_rate": 4.953986231130084e-06, + "loss": 1.2193, + "mean_token_accuracy": 0.6830638696750005, + "num_tokens": 2267318169.0, + "step": 13518 + }, + { + "entropy": 1.7797902425130208, + "epoch": 1.4851281206228886, + "grad_norm": 0.7058371305465698, + "learning_rate": 4.952800288476886e-06, + "loss": 1.5065, + "mean_token_accuracy": 0.6423899084329605, + "num_tokens": 2267468527.0, + "step": 13519 + }, + { + "entropy": 1.7239426573117573, + "epoch": 1.4852379775342617, + "grad_norm": 0.6466138958930969, + "learning_rate": 4.951614537214837e-06, + "loss": 1.4439, + "mean_token_accuracy": 0.6584398398796717, + "num_tokens": 2267672535.0, + "step": 13520 + }, + { + "entropy": 1.761413335800171, + "epoch": 1.4853478344456346, + "grad_norm": 0.7396242022514343, + "learning_rate": 4.950428977381461e-06, + "loss": 1.5382, + "mean_token_accuracy": 0.645782599846522, + "num_tokens": 2267851281.0, + "step": 13521 + }, + { + "entropy": 1.7479993800322216, + "epoch": 1.4854576913570074, + "grad_norm": 0.6907133460044861, + "learning_rate": 4.94924360901428e-06, + "loss": 1.5213, + "mean_token_accuracy": 0.6497216572364172, + "num_tokens": 2268048660.0, + "step": 13522 + }, + { + "entropy": 1.6639278034369152, + "epoch": 1.4855675482683806, + "grad_norm": 0.6215951442718506, + "learning_rate": 4.948058432150814e-06, + "loss": 1.4487, + "mean_token_accuracy": 0.6463829030593237, + "num_tokens": 2268254892.0, + "step": 13523 + }, + { + "entropy": 1.6794349352518718, + "epoch": 1.4856774051797534, + "grad_norm": 0.7702376842498779, + "learning_rate": 4.946873446828572e-06, + "loss": 1.4576, + "mean_token_accuracy": 0.6624968846638998, + "num_tokens": 2268433753.0, + "step": 13524 + }, + { + "entropy": 1.6901362140973408, + "epoch": 1.4857872620911263, + "grad_norm": 0.6038414239883423, + "learning_rate": 4.945688653085055e-06, + "loss": 1.4401, + "mean_token_accuracy": 0.657305101553599, + "num_tokens": 2268603211.0, + "step": 13525 + }, + { + "entropy": 1.7209720313549042, + "epoch": 1.4858971190024992, + "grad_norm": 0.6884058117866516, + "learning_rate": 4.944504050957767e-06, + "loss": 1.3502, + "mean_token_accuracy": 0.6485557009776434, + "num_tokens": 2268803883.0, + "step": 13526 + }, + { + "entropy": 1.6918534139792125, + "epoch": 1.486006975913872, + "grad_norm": 0.6835710406303406, + "learning_rate": 4.943319640484195e-06, + "loss": 1.4551, + "mean_token_accuracy": 0.6516717125972112, + "num_tokens": 2268996894.0, + "step": 13527 + }, + { + "entropy": 1.7772463758786519, + "epoch": 1.4861168328252452, + "grad_norm": 0.7287479639053345, + "learning_rate": 4.942135421701829e-06, + "loss": 1.5589, + "mean_token_accuracy": 0.6285470475753149, + "num_tokens": 2269206030.0, + "step": 13528 + }, + { + "entropy": 1.7186577022075653, + "epoch": 1.486226689736618, + "grad_norm": 0.6499691009521484, + "learning_rate": 4.940951394648148e-06, + "loss": 1.4773, + "mean_token_accuracy": 0.6425957729419073, + "num_tokens": 2269406402.0, + "step": 13529 + }, + { + "entropy": 1.7541224757830303, + "epoch": 1.486336546647991, + "grad_norm": 0.6786013841629028, + "learning_rate": 4.939767559360621e-06, + "loss": 1.4135, + "mean_token_accuracy": 0.6587019910415014, + "num_tokens": 2269547356.0, + "step": 13530 + }, + { + "entropy": 1.650489757458369, + "epoch": 1.4864464035593639, + "grad_norm": 0.7876176238059998, + "learning_rate": 4.938583915876721e-06, + "loss": 1.3363, + "mean_token_accuracy": 0.663826659321785, + "num_tokens": 2269723665.0, + "step": 13531 + }, + { + "entropy": 1.7275867958863576, + "epoch": 1.4865562604707367, + "grad_norm": 0.809974193572998, + "learning_rate": 4.937400464233911e-06, + "loss": 1.4697, + "mean_token_accuracy": 0.6594187666972479, + "num_tokens": 2269860339.0, + "step": 13532 + }, + { + "entropy": 1.6629600922266643, + "epoch": 1.4866661173821099, + "grad_norm": 0.7391853332519531, + "learning_rate": 4.936217204469645e-06, + "loss": 1.3979, + "mean_token_accuracy": 0.6491153140862783, + "num_tokens": 2270029166.0, + "step": 13533 + }, + { + "entropy": 1.6912591656049092, + "epoch": 1.4867759742934827, + "grad_norm": 0.6969987750053406, + "learning_rate": 4.9350341366213685e-06, + "loss": 1.3193, + "mean_token_accuracy": 0.6674131552378336, + "num_tokens": 2270166752.0, + "step": 13534 + }, + { + "entropy": 1.6451220214366913, + "epoch": 1.4868858312048556, + "grad_norm": 0.6137616634368896, + "learning_rate": 4.9338512607265325e-06, + "loss": 1.235, + "mean_token_accuracy": 0.6780912727117538, + "num_tokens": 2270320590.0, + "step": 13535 + }, + { + "entropy": 1.7086876134077709, + "epoch": 1.4869956881162287, + "grad_norm": 0.6514938473701477, + "learning_rate": 4.9326685768225695e-06, + "loss": 1.5391, + "mean_token_accuracy": 0.6244403074185053, + "num_tokens": 2270503385.0, + "step": 13536 + }, + { + "entropy": 1.6874873240788777, + "epoch": 1.4871055450276016, + "grad_norm": 0.6039011478424072, + "learning_rate": 4.9314860849469134e-06, + "loss": 1.4688, + "mean_token_accuracy": 0.6564833472172419, + "num_tokens": 2270710407.0, + "step": 13537 + }, + { + "entropy": 1.6665807565053303, + "epoch": 1.4872154019389745, + "grad_norm": 0.7873986959457397, + "learning_rate": 4.9303037851369836e-06, + "loss": 1.3184, + "mean_token_accuracy": 0.6774963239828745, + "num_tokens": 2270843801.0, + "step": 13538 + }, + { + "entropy": 1.700388679901759, + "epoch": 1.4873252588503474, + "grad_norm": 0.627056896686554, + "learning_rate": 4.929121677430204e-06, + "loss": 1.5074, + "mean_token_accuracy": 0.6504200547933578, + "num_tokens": 2271024231.0, + "step": 13539 + }, + { + "entropy": 1.6944973468780518, + "epoch": 1.4874351157617203, + "grad_norm": 0.6724776029586792, + "learning_rate": 4.927939761863993e-06, + "loss": 1.347, + "mean_token_accuracy": 0.6622181783119837, + "num_tokens": 2271156544.0, + "step": 13540 + }, + { + "entropy": 1.6311760048071544, + "epoch": 1.4875449726730934, + "grad_norm": 0.6557187438011169, + "learning_rate": 4.926758038475751e-06, + "loss": 1.4786, + "mean_token_accuracy": 0.6390694926182429, + "num_tokens": 2271409977.0, + "step": 13541 + }, + { + "entropy": 1.7377532819906871, + "epoch": 1.4876548295844663, + "grad_norm": 0.7433091998100281, + "learning_rate": 4.9255765073028764e-06, + "loss": 1.4261, + "mean_token_accuracy": 0.6504714637994766, + "num_tokens": 2271551261.0, + "step": 13542 + }, + { + "entropy": 1.6590198477109273, + "epoch": 1.4877646864958392, + "grad_norm": 0.6879268288612366, + "learning_rate": 4.924395168382772e-06, + "loss": 1.3696, + "mean_token_accuracy": 0.6614403426647186, + "num_tokens": 2271705504.0, + "step": 13543 + }, + { + "entropy": 1.716480682293574, + "epoch": 1.487874543407212, + "grad_norm": 0.7290632128715515, + "learning_rate": 4.9232140217528205e-06, + "loss": 1.4055, + "mean_token_accuracy": 0.6647356102863947, + "num_tokens": 2271842583.0, + "step": 13544 + }, + { + "entropy": 1.7306012709935505, + "epoch": 1.487984400318585, + "grad_norm": 0.7125338912010193, + "learning_rate": 4.922033067450408e-06, + "loss": 1.3186, + "mean_token_accuracy": 0.6647218614816666, + "num_tokens": 2271986650.0, + "step": 13545 + }, + { + "entropy": 1.6511406401793163, + "epoch": 1.488094257229958, + "grad_norm": 0.6755971908569336, + "learning_rate": 4.920852305512911e-06, + "loss": 1.2941, + "mean_token_accuracy": 0.677680104970932, + "num_tokens": 2272131530.0, + "step": 13546 + }, + { + "entropy": 1.6403106550375621, + "epoch": 1.488204114141331, + "grad_norm": 0.6104917526245117, + "learning_rate": 4.919671735977698e-06, + "loss": 1.2273, + "mean_token_accuracy": 0.677567387620608, + "num_tokens": 2272258498.0, + "step": 13547 + }, + { + "entropy": 1.6783630152543385, + "epoch": 1.4883139710527038, + "grad_norm": 0.7859485745429993, + "learning_rate": 4.9184913588821355e-06, + "loss": 1.3253, + "mean_token_accuracy": 0.6634241938591003, + "num_tokens": 2272400247.0, + "step": 13548 + }, + { + "entropy": 1.6718779901663463, + "epoch": 1.488423827964077, + "grad_norm": 0.6356991529464722, + "learning_rate": 4.917311174263582e-06, + "loss": 1.3806, + "mean_token_accuracy": 0.6621431310971578, + "num_tokens": 2272578243.0, + "step": 13549 + }, + { + "entropy": 1.6918590764204662, + "epoch": 1.4885336848754498, + "grad_norm": 0.6709624528884888, + "learning_rate": 4.916131182159385e-06, + "loss": 1.4217, + "mean_token_accuracy": 0.6508203744888306, + "num_tokens": 2272739851.0, + "step": 13550 + }, + { + "entropy": 1.6766121685504913, + "epoch": 1.4886435417868227, + "grad_norm": 0.6813299655914307, + "learning_rate": 4.914951382606896e-06, + "loss": 1.4075, + "mean_token_accuracy": 0.6498910933732986, + "num_tokens": 2272979858.0, + "step": 13551 + }, + { + "entropy": 1.6789940396944683, + "epoch": 1.4887533986981956, + "grad_norm": 0.7328557968139648, + "learning_rate": 4.913771775643456e-06, + "loss": 1.3464, + "mean_token_accuracy": 0.6630066633224487, + "num_tokens": 2273127557.0, + "step": 13552 + }, + { + "entropy": 1.7166900932788849, + "epoch": 1.4888632556095684, + "grad_norm": 0.7445768117904663, + "learning_rate": 4.912592361306397e-06, + "loss": 1.4081, + "mean_token_accuracy": 0.6634646505117416, + "num_tokens": 2273281855.0, + "step": 13553 + }, + { + "entropy": 1.7078879574934642, + "epoch": 1.4889731125209416, + "grad_norm": 0.7372060418128967, + "learning_rate": 4.911413139633044e-06, + "loss": 1.5168, + "mean_token_accuracy": 0.6509969532489777, + "num_tokens": 2273451264.0, + "step": 13554 + }, + { + "entropy": 1.650009383757909, + "epoch": 1.4890829694323144, + "grad_norm": 0.5854784250259399, + "learning_rate": 4.910234110660724e-06, + "loss": 1.3294, + "mean_token_accuracy": 0.6673146585623423, + "num_tokens": 2273598602.0, + "step": 13555 + }, + { + "entropy": 1.6960578461488087, + "epoch": 1.4891928263436873, + "grad_norm": 0.7097931504249573, + "learning_rate": 4.909055274426747e-06, + "loss": 1.3523, + "mean_token_accuracy": 0.6700321088234583, + "num_tokens": 2273761448.0, + "step": 13556 + }, + { + "entropy": 1.739314426978429, + "epoch": 1.4893026832550602, + "grad_norm": 0.7183867692947388, + "learning_rate": 4.907876630968429e-06, + "loss": 1.4509, + "mean_token_accuracy": 0.651702399055163, + "num_tokens": 2273921418.0, + "step": 13557 + }, + { + "entropy": 1.7305655578772228, + "epoch": 1.489412540166433, + "grad_norm": 0.6984145045280457, + "learning_rate": 4.906698180323072e-06, + "loss": 1.3913, + "mean_token_accuracy": 0.668786495923996, + "num_tokens": 2274158793.0, + "step": 13558 + }, + { + "entropy": 1.7573081453641255, + "epoch": 1.4895223970778062, + "grad_norm": 0.7101246118545532, + "learning_rate": 4.9055199225279674e-06, + "loss": 1.4446, + "mean_token_accuracy": 0.6512223184108734, + "num_tokens": 2274343919.0, + "step": 13559 + }, + { + "entropy": 1.7201645970344543, + "epoch": 1.489632253989179, + "grad_norm": 0.7223523855209351, + "learning_rate": 4.904341857620415e-06, + "loss": 1.3581, + "mean_token_accuracy": 0.6549399097760519, + "num_tokens": 2274495539.0, + "step": 13560 + }, + { + "entropy": 1.7697237531344097, + "epoch": 1.489742110900552, + "grad_norm": 0.7736477255821228, + "learning_rate": 4.903163985637695e-06, + "loss": 1.3867, + "mean_token_accuracy": 0.6666910151640574, + "num_tokens": 2274674058.0, + "step": 13561 + }, + { + "entropy": 1.7430163423220317, + "epoch": 1.489851967811925, + "grad_norm": 0.6027993559837341, + "learning_rate": 4.901986306617085e-06, + "loss": 1.4971, + "mean_token_accuracy": 0.6381118098894755, + "num_tokens": 2274879632.0, + "step": 13562 + }, + { + "entropy": 1.677459180355072, + "epoch": 1.489961824723298, + "grad_norm": 0.8948626518249512, + "learning_rate": 4.9008088205958605e-06, + "loss": 1.2984, + "mean_token_accuracy": 0.6767031649748484, + "num_tokens": 2275027702.0, + "step": 13563 + }, + { + "entropy": 1.7206537226835887, + "epoch": 1.4900716816346709, + "grad_norm": 0.6878028512001038, + "learning_rate": 4.89963152761129e-06, + "loss": 1.4045, + "mean_token_accuracy": 0.6523149311542511, + "num_tokens": 2275187790.0, + "step": 13564 + }, + { + "entropy": 1.6853440205256145, + "epoch": 1.4901815385460437, + "grad_norm": 0.6887045502662659, + "learning_rate": 4.898454427700635e-06, + "loss": 1.3973, + "mean_token_accuracy": 0.6593828996022543, + "num_tokens": 2275344735.0, + "step": 13565 + }, + { + "entropy": 1.708954284588496, + "epoch": 1.4902913954574166, + "grad_norm": 0.7596457600593567, + "learning_rate": 4.897277520901144e-06, + "loss": 1.4333, + "mean_token_accuracy": 0.6570628980795542, + "num_tokens": 2275476549.0, + "step": 13566 + }, + { + "entropy": 1.6877670685450237, + "epoch": 1.4904012523687897, + "grad_norm": 0.7278691530227661, + "learning_rate": 4.896100807250073e-06, + "loss": 1.3566, + "mean_token_accuracy": 0.663535346587499, + "num_tokens": 2275637937.0, + "step": 13567 + }, + { + "entropy": 1.6145448486010234, + "epoch": 1.4905111092801626, + "grad_norm": 0.6225070953369141, + "learning_rate": 4.894924286784657e-06, + "loss": 1.1552, + "mean_token_accuracy": 0.6904433170954386, + "num_tokens": 2275760835.0, + "step": 13568 + }, + { + "entropy": 1.717700292666753, + "epoch": 1.4906209661915355, + "grad_norm": 0.6022528409957886, + "learning_rate": 4.89374795954214e-06, + "loss": 1.3925, + "mean_token_accuracy": 0.664552241563797, + "num_tokens": 2275936821.0, + "step": 13569 + }, + { + "entropy": 1.7442820469538372, + "epoch": 1.4907308231029084, + "grad_norm": 0.708743691444397, + "learning_rate": 4.892571825559749e-06, + "loss": 1.3119, + "mean_token_accuracy": 0.6691465228796005, + "num_tokens": 2276075486.0, + "step": 13570 + }, + { + "entropy": 1.7310235400994618, + "epoch": 1.4908406800142813, + "grad_norm": 0.6880050301551819, + "learning_rate": 4.891395884874705e-06, + "loss": 1.4393, + "mean_token_accuracy": 0.6460549732049307, + "num_tokens": 2276292408.0, + "step": 13571 + }, + { + "entropy": 1.6751873592535655, + "epoch": 1.4909505369256544, + "grad_norm": 0.6142544746398926, + "learning_rate": 4.890220137524229e-06, + "loss": 1.3366, + "mean_token_accuracy": 0.6657705307006836, + "num_tokens": 2276454867.0, + "step": 13572 + }, + { + "entropy": 1.6691831449667613, + "epoch": 1.4910603938370273, + "grad_norm": 0.7860159873962402, + "learning_rate": 4.889044583545535e-06, + "loss": 1.4698, + "mean_token_accuracy": 0.6658653269211451, + "num_tokens": 2276631139.0, + "step": 13573 + }, + { + "entropy": 1.6530809303124745, + "epoch": 1.4911702507484002, + "grad_norm": 0.6658622622489929, + "learning_rate": 4.887869222975823e-06, + "loss": 1.3893, + "mean_token_accuracy": 0.6663869768381119, + "num_tokens": 2276811087.0, + "step": 13574 + }, + { + "entropy": 1.6668502887090046, + "epoch": 1.4912801076597733, + "grad_norm": 0.7318044900894165, + "learning_rate": 4.886694055852295e-06, + "loss": 1.2348, + "mean_token_accuracy": 0.6809373696645101, + "num_tokens": 2276936016.0, + "step": 13575 + }, + { + "entropy": 1.7107643485069275, + "epoch": 1.4913899645711461, + "grad_norm": 0.618091881275177, + "learning_rate": 4.885519082212148e-06, + "loss": 1.4674, + "mean_token_accuracy": 0.6436606744925181, + "num_tokens": 2277114355.0, + "step": 13576 + }, + { + "entropy": 1.716734250386556, + "epoch": 1.491499821482519, + "grad_norm": 0.6230522990226746, + "learning_rate": 4.884344302092569e-06, + "loss": 1.4011, + "mean_token_accuracy": 0.6655239959557852, + "num_tokens": 2277275543.0, + "step": 13577 + }, + { + "entropy": 1.710130383570989, + "epoch": 1.491609678393892, + "grad_norm": 0.6273171305656433, + "learning_rate": 4.883169715530732e-06, + "loss": 1.4362, + "mean_token_accuracy": 0.638149564464887, + "num_tokens": 2277452793.0, + "step": 13578 + }, + { + "entropy": 1.7487995425860088, + "epoch": 1.4917195353052648, + "grad_norm": 0.7999676465988159, + "learning_rate": 4.881995322563821e-06, + "loss": 1.2994, + "mean_token_accuracy": 0.671253576874733, + "num_tokens": 2277600260.0, + "step": 13579 + }, + { + "entropy": 1.6912379463513691, + "epoch": 1.491829392216638, + "grad_norm": 0.6340394616127014, + "learning_rate": 4.880821123229002e-06, + "loss": 1.295, + "mean_token_accuracy": 0.6654437184333801, + "num_tokens": 2277753911.0, + "step": 13580 + }, + { + "entropy": 1.6957992414633434, + "epoch": 1.4919392491280108, + "grad_norm": 0.778068482875824, + "learning_rate": 4.879647117563432e-06, + "loss": 1.4732, + "mean_token_accuracy": 0.6568873922030131, + "num_tokens": 2277934943.0, + "step": 13581 + }, + { + "entropy": 1.6554166972637177, + "epoch": 1.4920491060393837, + "grad_norm": 0.6689861416816711, + "learning_rate": 4.8784733056042775e-06, + "loss": 1.2573, + "mean_token_accuracy": 0.67509492735068, + "num_tokens": 2278061278.0, + "step": 13582 + }, + { + "entropy": 1.754564344882965, + "epoch": 1.4921589629507566, + "grad_norm": 0.7395191788673401, + "learning_rate": 4.877299687388681e-06, + "loss": 1.3777, + "mean_token_accuracy": 0.6584471513827642, + "num_tokens": 2278186060.0, + "step": 13583 + }, + { + "entropy": 1.7302316029866536, + "epoch": 1.4922688198621294, + "grad_norm": 0.6741985082626343, + "learning_rate": 4.876126262953793e-06, + "loss": 1.3478, + "mean_token_accuracy": 0.6610042850176493, + "num_tokens": 2278357055.0, + "step": 13584 + }, + { + "entropy": 1.6924872994422913, + "epoch": 1.4923786767735026, + "grad_norm": 1.0467135906219482, + "learning_rate": 4.87495303233675e-06, + "loss": 1.2743, + "mean_token_accuracy": 0.6736592451731364, + "num_tokens": 2278554005.0, + "step": 13585 + }, + { + "entropy": 1.6124165058135986, + "epoch": 1.4924885336848754, + "grad_norm": 0.6575326919555664, + "learning_rate": 4.87377999557468e-06, + "loss": 1.3924, + "mean_token_accuracy": 0.6724445472160975, + "num_tokens": 2278713474.0, + "step": 13586 + }, + { + "entropy": 1.7254461348056793, + "epoch": 1.4925983905962483, + "grad_norm": 0.6653977632522583, + "learning_rate": 4.872607152704713e-06, + "loss": 1.3827, + "mean_token_accuracy": 0.6479282975196838, + "num_tokens": 2278875327.0, + "step": 13587 + }, + { + "entropy": 1.7107085188229878, + "epoch": 1.4927082475076214, + "grad_norm": 0.7601565718650818, + "learning_rate": 4.871434503763971e-06, + "loss": 1.2622, + "mean_token_accuracy": 0.6768785417079926, + "num_tokens": 2278986177.0, + "step": 13588 + }, + { + "entropy": 1.7034134566783905, + "epoch": 1.4928181044189943, + "grad_norm": 0.8460795879364014, + "learning_rate": 4.870262048789566e-06, + "loss": 1.3049, + "mean_token_accuracy": 0.6756768574317297, + "num_tokens": 2279104088.0, + "step": 13589 + }, + { + "entropy": 1.7182818551858265, + "epoch": 1.4929279613303672, + "grad_norm": 0.6793023347854614, + "learning_rate": 4.869089787818602e-06, + "loss": 1.2894, + "mean_token_accuracy": 0.666498064994812, + "num_tokens": 2279231469.0, + "step": 13590 + }, + { + "entropy": 1.6521180470784504, + "epoch": 1.49303781824174, + "grad_norm": 0.7240891456604004, + "learning_rate": 4.8679177208881855e-06, + "loss": 1.4182, + "mean_token_accuracy": 0.6724280714988708, + "num_tokens": 2279385523.0, + "step": 13591 + }, + { + "entropy": 1.7422040899594624, + "epoch": 1.493147675153113, + "grad_norm": 0.623020589351654, + "learning_rate": 4.866745848035412e-06, + "loss": 1.3963, + "mean_token_accuracy": 0.6499710033337275, + "num_tokens": 2279561897.0, + "step": 13592 + }, + { + "entropy": 1.6989782949288685, + "epoch": 1.493257532064486, + "grad_norm": 0.6899892091751099, + "learning_rate": 4.865574169297364e-06, + "loss": 1.5596, + "mean_token_accuracy": 0.6443688968817393, + "num_tokens": 2279737279.0, + "step": 13593 + }, + { + "entropy": 1.6587830980618794, + "epoch": 1.493367388975859, + "grad_norm": 0.7197852730751038, + "learning_rate": 4.864402684711133e-06, + "loss": 1.5172, + "mean_token_accuracy": 0.6575819750626882, + "num_tokens": 2279901224.0, + "step": 13594 + }, + { + "entropy": 1.7458328505357106, + "epoch": 1.4934772458872319, + "grad_norm": 0.7012457847595215, + "learning_rate": 4.863231394313789e-06, + "loss": 1.4828, + "mean_token_accuracy": 0.6473122785488764, + "num_tokens": 2280096468.0, + "step": 13595 + }, + { + "entropy": 1.700047234694163, + "epoch": 1.4935871027986047, + "grad_norm": 0.705938994884491, + "learning_rate": 4.8620602981424085e-06, + "loss": 1.364, + "mean_token_accuracy": 0.6618212511142095, + "num_tokens": 2280279827.0, + "step": 13596 + }, + { + "entropy": 1.7099564174811046, + "epoch": 1.4936969597099776, + "grad_norm": 0.8156763315200806, + "learning_rate": 4.860889396234055e-06, + "loss": 1.3052, + "mean_token_accuracy": 0.6710335661967596, + "num_tokens": 2280416224.0, + "step": 13597 + }, + { + "entropy": 1.7591630319754283, + "epoch": 1.4938068166213507, + "grad_norm": 0.7641903162002563, + "learning_rate": 4.859718688625782e-06, + "loss": 1.3536, + "mean_token_accuracy": 0.6579357037941614, + "num_tokens": 2280551404.0, + "step": 13598 + }, + { + "entropy": 1.684128353993098, + "epoch": 1.4939166735327236, + "grad_norm": 0.6860581040382385, + "learning_rate": 4.8585481753546486e-06, + "loss": 1.3339, + "mean_token_accuracy": 0.6609357595443726, + "num_tokens": 2280670594.0, + "step": 13599 + }, + { + "entropy": 1.7321399648984273, + "epoch": 1.4940265304440965, + "grad_norm": 0.7905336618423462, + "learning_rate": 4.8573778564576955e-06, + "loss": 1.362, + "mean_token_accuracy": 0.6644478042920431, + "num_tokens": 2280802125.0, + "step": 13600 + }, + { + "entropy": 1.7287112375100453, + "epoch": 1.4941363873554696, + "grad_norm": 0.842991054058075, + "learning_rate": 4.856207731971968e-06, + "loss": 1.2462, + "mean_token_accuracy": 0.6818788150946299, + "num_tokens": 2280911360.0, + "step": 13601 + }, + { + "entropy": 1.7669661144415538, + "epoch": 1.4942462442668425, + "grad_norm": 0.6029599905014038, + "learning_rate": 4.855037801934497e-06, + "loss": 1.5295, + "mean_token_accuracy": 0.6283855885267258, + "num_tokens": 2281119023.0, + "step": 13602 + }, + { + "entropy": 1.6890939672787983, + "epoch": 1.4943561011782154, + "grad_norm": 0.6715998649597168, + "learning_rate": 4.853868066382308e-06, + "loss": 1.3511, + "mean_token_accuracy": 0.6587308794260025, + "num_tokens": 2281311484.0, + "step": 13603 + }, + { + "entropy": 1.7232101559638977, + "epoch": 1.4944659580895883, + "grad_norm": 0.6396613717079163, + "learning_rate": 4.852698525352427e-06, + "loss": 1.4827, + "mean_token_accuracy": 0.649900938073794, + "num_tokens": 2281526052.0, + "step": 13604 + }, + { + "entropy": 1.7208843032519023, + "epoch": 1.4945758150009611, + "grad_norm": 0.6806747913360596, + "learning_rate": 4.8515291788818695e-06, + "loss": 1.3837, + "mean_token_accuracy": 0.6432156016429266, + "num_tokens": 2281723549.0, + "step": 13605 + }, + { + "entropy": 1.6507877906163533, + "epoch": 1.4946856719123343, + "grad_norm": 0.829356849193573, + "learning_rate": 4.850360027007639e-06, + "loss": 1.257, + "mean_token_accuracy": 0.677089735865593, + "num_tokens": 2281892258.0, + "step": 13606 + }, + { + "entropy": 1.7317637304464977, + "epoch": 1.4947955288237071, + "grad_norm": 0.6524384021759033, + "learning_rate": 4.8491910697667425e-06, + "loss": 1.5544, + "mean_token_accuracy": 0.6447519858678182, + "num_tokens": 2282084102.0, + "step": 13607 + }, + { + "entropy": 1.7073566317558289, + "epoch": 1.49490538573508, + "grad_norm": 0.5912280678749084, + "learning_rate": 4.848022307196181e-06, + "loss": 1.368, + "mean_token_accuracy": 0.6703123350938162, + "num_tokens": 2282240654.0, + "step": 13608 + }, + { + "entropy": 1.7419903775056202, + "epoch": 1.495015242646453, + "grad_norm": 0.6560501456260681, + "learning_rate": 4.84685373933294e-06, + "loss": 1.2839, + "mean_token_accuracy": 0.6740489850441614, + "num_tokens": 2282376898.0, + "step": 13609 + }, + { + "entropy": 1.6600103080272675, + "epoch": 1.4951250995578258, + "grad_norm": 0.6871337890625, + "learning_rate": 4.845685366214003e-06, + "loss": 1.51, + "mean_token_accuracy": 0.6429749627908071, + "num_tokens": 2282552047.0, + "step": 13610 + }, + { + "entropy": 1.7339093486467998, + "epoch": 1.495234956469199, + "grad_norm": 0.7348884344100952, + "learning_rate": 4.8445171878763536e-06, + "loss": 1.2774, + "mean_token_accuracy": 0.6790765027205149, + "num_tokens": 2282690136.0, + "step": 13611 + }, + { + "entropy": 1.7065096199512482, + "epoch": 1.4953448133805718, + "grad_norm": 0.548129141330719, + "learning_rate": 4.84334920435696e-06, + "loss": 1.3133, + "mean_token_accuracy": 0.6773122251033783, + "num_tokens": 2282868665.0, + "step": 13612 + }, + { + "entropy": 1.7016609410444896, + "epoch": 1.4954546702919447, + "grad_norm": 0.6310706734657288, + "learning_rate": 4.842181415692791e-06, + "loss": 1.4762, + "mean_token_accuracy": 0.6426133811473846, + "num_tokens": 2283027370.0, + "step": 13613 + }, + { + "entropy": 1.7180238564809163, + "epoch": 1.4955645272033178, + "grad_norm": 0.6619656682014465, + "learning_rate": 4.841013821920805e-06, + "loss": 1.289, + "mean_token_accuracy": 0.6721247384945551, + "num_tokens": 2283197730.0, + "step": 13614 + }, + { + "entropy": 1.6656270027160645, + "epoch": 1.4956743841146907, + "grad_norm": 0.6908462047576904, + "learning_rate": 4.839846423077955e-06, + "loss": 1.2454, + "mean_token_accuracy": 0.6713667859633764, + "num_tokens": 2283312940.0, + "step": 13615 + }, + { + "entropy": 1.7119423349698384, + "epoch": 1.4957842410260636, + "grad_norm": 0.7182555198669434, + "learning_rate": 4.838679219201192e-06, + "loss": 1.5939, + "mean_token_accuracy": 0.6521507352590561, + "num_tokens": 2283454962.0, + "step": 13616 + }, + { + "entropy": 1.7009160617987316, + "epoch": 1.4958940979374364, + "grad_norm": 0.5092278718948364, + "learning_rate": 4.837512210327456e-06, + "loss": 1.493, + "mean_token_accuracy": 0.624845340847969, + "num_tokens": 2283755025.0, + "step": 13617 + }, + { + "entropy": 1.6453999876976013, + "epoch": 1.4960039548488093, + "grad_norm": 0.6449650526046753, + "learning_rate": 4.836345396493678e-06, + "loss": 1.3157, + "mean_token_accuracy": 0.6672974874575933, + "num_tokens": 2283966223.0, + "step": 13618 + }, + { + "entropy": 1.7281176149845123, + "epoch": 1.4961138117601824, + "grad_norm": 0.6479874849319458, + "learning_rate": 4.835178777736791e-06, + "loss": 1.4652, + "mean_token_accuracy": 0.6569635172684988, + "num_tokens": 2284138007.0, + "step": 13619 + }, + { + "entropy": 1.7001449863115947, + "epoch": 1.4962236686715553, + "grad_norm": 0.7589164972305298, + "learning_rate": 4.83401235409372e-06, + "loss": 1.4462, + "mean_token_accuracy": 0.6550010740756989, + "num_tokens": 2284287063.0, + "step": 13620 + }, + { + "entropy": 1.7176421483357747, + "epoch": 1.4963335255829282, + "grad_norm": 0.6610986590385437, + "learning_rate": 4.832846125601381e-06, + "loss": 1.4037, + "mean_token_accuracy": 0.6466071307659149, + "num_tokens": 2284436041.0, + "step": 13621 + }, + { + "entropy": 1.682494064172109, + "epoch": 1.4964433824943013, + "grad_norm": 0.7629041075706482, + "learning_rate": 4.831680092296679e-06, + "loss": 1.3422, + "mean_token_accuracy": 0.6677883863449097, + "num_tokens": 2284602710.0, + "step": 13622 + }, + { + "entropy": 1.6760949591795604, + "epoch": 1.496553239405674, + "grad_norm": 0.73377525806427, + "learning_rate": 4.830514254216527e-06, + "loss": 1.3821, + "mean_token_accuracy": 0.6497271855672201, + "num_tokens": 2284767266.0, + "step": 13623 + }, + { + "entropy": 1.6992427905400593, + "epoch": 1.496663096317047, + "grad_norm": 0.6510926485061646, + "learning_rate": 4.829348611397815e-06, + "loss": 1.3355, + "mean_token_accuracy": 0.653320108850797, + "num_tokens": 2284923786.0, + "step": 13624 + }, + { + "entropy": 1.6941948135693867, + "epoch": 1.49677295322842, + "grad_norm": 0.7802711725234985, + "learning_rate": 4.828183163877441e-06, + "loss": 1.2977, + "mean_token_accuracy": 0.6674275199572245, + "num_tokens": 2285069673.0, + "step": 13625 + }, + { + "entropy": 1.7570783694585164, + "epoch": 1.4968828101397929, + "grad_norm": 0.7150521278381348, + "learning_rate": 4.82701791169229e-06, + "loss": 1.4907, + "mean_token_accuracy": 0.6550498505433401, + "num_tokens": 2285211769.0, + "step": 13626 + }, + { + "entropy": 1.711845616499583, + "epoch": 1.496992667051166, + "grad_norm": 0.8639612793922424, + "learning_rate": 4.825852854879236e-06, + "loss": 1.3726, + "mean_token_accuracy": 0.6677491863568624, + "num_tokens": 2285352305.0, + "step": 13627 + }, + { + "entropy": 1.703562508026759, + "epoch": 1.4971025239625388, + "grad_norm": 0.6228059530258179, + "learning_rate": 4.8246879934751615e-06, + "loss": 1.2863, + "mean_token_accuracy": 0.662003293633461, + "num_tokens": 2285514680.0, + "step": 13628 + }, + { + "entropy": 1.661176194747289, + "epoch": 1.4972123808739117, + "grad_norm": 0.651232123374939, + "learning_rate": 4.823523327516929e-06, + "loss": 1.4186, + "mean_token_accuracy": 0.6596865157286326, + "num_tokens": 2285640894.0, + "step": 13629 + }, + { + "entropy": 1.7480360170205433, + "epoch": 1.4973222377852846, + "grad_norm": 0.6672519445419312, + "learning_rate": 4.822358857041396e-06, + "loss": 1.338, + "mean_token_accuracy": 0.6738019635279974, + "num_tokens": 2285786016.0, + "step": 13630 + }, + { + "entropy": 1.728183130423228, + "epoch": 1.4974320946966575, + "grad_norm": 0.6858686208724976, + "learning_rate": 4.821194582085423e-06, + "loss": 1.5963, + "mean_token_accuracy": 0.6362834026416143, + "num_tokens": 2285993402.0, + "step": 13631 + }, + { + "entropy": 1.7002196311950684, + "epoch": 1.4975419516080306, + "grad_norm": 0.7614853382110596, + "learning_rate": 4.82003050268586e-06, + "loss": 1.2961, + "mean_token_accuracy": 0.6599853783845901, + "num_tokens": 2286130357.0, + "step": 13632 + }, + { + "entropy": 1.6387390891710918, + "epoch": 1.4976518085194035, + "grad_norm": 0.6776584982872009, + "learning_rate": 4.818866618879546e-06, + "loss": 1.3764, + "mean_token_accuracy": 0.6690385490655899, + "num_tokens": 2286305495.0, + "step": 13633 + }, + { + "entropy": 1.7022567987442017, + "epoch": 1.4977616654307764, + "grad_norm": 0.6763585209846497, + "learning_rate": 4.817702930703316e-06, + "loss": 1.3917, + "mean_token_accuracy": 0.6675192614396414, + "num_tokens": 2286436486.0, + "step": 13634 + }, + { + "entropy": 1.7336170276006062, + "epoch": 1.4978715223421495, + "grad_norm": 0.5927034020423889, + "learning_rate": 4.816539438194004e-06, + "loss": 1.3455, + "mean_token_accuracy": 0.6625049064556757, + "num_tokens": 2286591633.0, + "step": 13635 + }, + { + "entropy": 1.7186884780724843, + "epoch": 1.4979813792535221, + "grad_norm": 0.6435331106185913, + "learning_rate": 4.815376141388432e-06, + "loss": 1.2864, + "mean_token_accuracy": 0.6589618921279907, + "num_tokens": 2286735105.0, + "step": 13636 + }, + { + "entropy": 1.7794620990753174, + "epoch": 1.4980912361648953, + "grad_norm": 0.6708908677101135, + "learning_rate": 4.814213040323419e-06, + "loss": 1.2498, + "mean_token_accuracy": 0.6653878291447958, + "num_tokens": 2286838400.0, + "step": 13637 + }, + { + "entropy": 1.6840010782082875, + "epoch": 1.4982010930762681, + "grad_norm": 0.6124277114868164, + "learning_rate": 4.813050135035776e-06, + "loss": 1.4343, + "mean_token_accuracy": 0.6520940760771433, + "num_tokens": 2287052287.0, + "step": 13638 + }, + { + "entropy": 1.6653833985328674, + "epoch": 1.498310949987641, + "grad_norm": 0.582930862903595, + "learning_rate": 4.811887425562305e-06, + "loss": 1.4394, + "mean_token_accuracy": 0.6390776584545771, + "num_tokens": 2287256232.0, + "step": 13639 + }, + { + "entropy": 1.7373347878456116, + "epoch": 1.4984208068990141, + "grad_norm": 0.7825767397880554, + "learning_rate": 4.810724911939813e-06, + "loss": 1.3372, + "mean_token_accuracy": 0.6598907858133316, + "num_tokens": 2287415673.0, + "step": 13640 + }, + { + "entropy": 1.6686055858929951, + "epoch": 1.498530663810387, + "grad_norm": 0.5538728833198547, + "learning_rate": 4.809562594205088e-06, + "loss": 1.4187, + "mean_token_accuracy": 0.6480937798817953, + "num_tokens": 2287619130.0, + "step": 13641 + }, + { + "entropy": 1.7331142822901409, + "epoch": 1.49864052072176, + "grad_norm": 0.7984141111373901, + "learning_rate": 4.808400472394915e-06, + "loss": 1.5261, + "mean_token_accuracy": 0.6605927993853887, + "num_tokens": 2287777134.0, + "step": 13642 + }, + { + "entropy": 1.7319643298784893, + "epoch": 1.4987503776331328, + "grad_norm": 0.7856102585792542, + "learning_rate": 4.807238546546077e-06, + "loss": 1.4331, + "mean_token_accuracy": 0.6482569624980291, + "num_tokens": 2287929596.0, + "step": 13643 + }, + { + "entropy": 1.7171655396620433, + "epoch": 1.4988602345445057, + "grad_norm": 0.7845726609230042, + "learning_rate": 4.806076816695351e-06, + "loss": 1.515, + "mean_token_accuracy": 0.6387662142515182, + "num_tokens": 2288091381.0, + "step": 13644 + }, + { + "entropy": 1.6830125947793324, + "epoch": 1.4989700914558788, + "grad_norm": 0.6584486365318298, + "learning_rate": 4.804915282879503e-06, + "loss": 1.2917, + "mean_token_accuracy": 0.6716272433598837, + "num_tokens": 2288232915.0, + "step": 13645 + }, + { + "entropy": 1.7228589157263439, + "epoch": 1.4990799483672517, + "grad_norm": 0.7010536193847656, + "learning_rate": 4.80375394513529e-06, + "loss": 1.4135, + "mean_token_accuracy": 0.6520048628250757, + "num_tokens": 2288398947.0, + "step": 13646 + }, + { + "entropy": 1.7012008130550385, + "epoch": 1.4991898052786246, + "grad_norm": 0.7033513784408569, + "learning_rate": 4.802592803499477e-06, + "loss": 1.2313, + "mean_token_accuracy": 0.6781880507866541, + "num_tokens": 2288506315.0, + "step": 13647 + }, + { + "entropy": 1.6403611302375793, + "epoch": 1.4992996621899977, + "grad_norm": 0.6568671464920044, + "learning_rate": 4.80143185800881e-06, + "loss": 1.2665, + "mean_token_accuracy": 0.6699175884326299, + "num_tokens": 2288658086.0, + "step": 13648 + }, + { + "entropy": 1.7293101052443187, + "epoch": 1.4994095191013705, + "grad_norm": 0.7070812582969666, + "learning_rate": 4.800271108700027e-06, + "loss": 1.623, + "mean_token_accuracy": 0.626306434472402, + "num_tokens": 2288892470.0, + "step": 13649 + }, + { + "entropy": 1.687088151772817, + "epoch": 1.4995193760127434, + "grad_norm": 0.5417489409446716, + "learning_rate": 4.799110555609874e-06, + "loss": 1.1432, + "mean_token_accuracy": 0.6710360199213028, + "num_tokens": 2289111747.0, + "step": 13650 + }, + { + "entropy": 1.7193756500879924, + "epoch": 1.4996292329241163, + "grad_norm": 0.590350329875946, + "learning_rate": 4.797950198775074e-06, + "loss": 1.3533, + "mean_token_accuracy": 0.6762384523948034, + "num_tokens": 2289293601.0, + "step": 13651 + }, + { + "entropy": 1.6460750301678975, + "epoch": 1.4997390898354892, + "grad_norm": 0.617831826210022, + "learning_rate": 4.796790038232359e-06, + "loss": 1.4126, + "mean_token_accuracy": 0.6668714483579, + "num_tokens": 2289481612.0, + "step": 13652 + }, + { + "entropy": 1.6675611039002736, + "epoch": 1.4998489467468623, + "grad_norm": 0.6598964333534241, + "learning_rate": 4.795630074018443e-06, + "loss": 1.541, + "mean_token_accuracy": 0.6393274962902069, + "num_tokens": 2289662266.0, + "step": 13653 + }, + { + "entropy": 1.6732101341088612, + "epoch": 1.4999588036582352, + "grad_norm": 0.837954580783844, + "learning_rate": 4.794470306170038e-06, + "loss": 1.5671, + "mean_token_accuracy": 0.6427912364403406, + "num_tokens": 2289836806.0, + "step": 13654 + }, + { + "entropy": 1.7142541805903118, + "epoch": 1.500068660569608, + "grad_norm": 0.7065649032592773, + "learning_rate": 4.79331073472385e-06, + "loss": 1.6684, + "mean_token_accuracy": 0.6320697516202927, + "num_tokens": 2290122996.0, + "step": 13655 + }, + { + "entropy": 1.7274170815944672, + "epoch": 1.5001785174809812, + "grad_norm": 0.6198680996894836, + "learning_rate": 4.792151359716585e-06, + "loss": 1.383, + "mean_token_accuracy": 0.6606019486983618, + "num_tokens": 2290319640.0, + "step": 13656 + }, + { + "entropy": 1.7504333357016246, + "epoch": 1.5002883743923539, + "grad_norm": 0.6796494126319885, + "learning_rate": 4.79099218118493e-06, + "loss": 1.3203, + "mean_token_accuracy": 0.6651289115349451, + "num_tokens": 2290446566.0, + "step": 13657 + }, + { + "entropy": 1.6876095831394196, + "epoch": 1.500398231303727, + "grad_norm": 0.6724913716316223, + "learning_rate": 4.7898331991655764e-06, + "loss": 1.6636, + "mean_token_accuracy": 0.6144607166449229, + "num_tokens": 2290691085.0, + "step": 13658 + }, + { + "entropy": 1.7187560300032299, + "epoch": 1.5005080882150998, + "grad_norm": 0.6489282250404358, + "learning_rate": 4.7886744136951996e-06, + "loss": 1.5508, + "mean_token_accuracy": 0.6420976668596268, + "num_tokens": 2290903404.0, + "step": 13659 + }, + { + "entropy": 1.6801285644372304, + "epoch": 1.5006179451264727, + "grad_norm": 0.6258361339569092, + "learning_rate": 4.787515824810483e-06, + "loss": 1.3144, + "mean_token_accuracy": 0.6689251512289047, + "num_tokens": 2291106588.0, + "step": 13660 + }, + { + "entropy": 1.7926131387551625, + "epoch": 1.5007278020378458, + "grad_norm": 0.7377539277076721, + "learning_rate": 4.78635743254809e-06, + "loss": 1.3222, + "mean_token_accuracy": 0.6706551959117254, + "num_tokens": 2291214514.0, + "step": 13661 + }, + { + "entropy": 1.708193560441335, + "epoch": 1.5008376589492185, + "grad_norm": 0.6403104066848755, + "learning_rate": 4.785199236944681e-06, + "loss": 1.4077, + "mean_token_accuracy": 0.644023617108663, + "num_tokens": 2291407970.0, + "step": 13662 + }, + { + "entropy": 1.6744596858819325, + "epoch": 1.5009475158605916, + "grad_norm": 0.751727819442749, + "learning_rate": 4.784041238036917e-06, + "loss": 1.3342, + "mean_token_accuracy": 0.6770372043053309, + "num_tokens": 2291571695.0, + "step": 13663 + }, + { + "entropy": 1.7439305186271667, + "epoch": 1.5010573727719645, + "grad_norm": 0.824898362159729, + "learning_rate": 4.782883435861449e-06, + "loss": 1.4083, + "mean_token_accuracy": 0.6511137386163076, + "num_tokens": 2291720500.0, + "step": 13664 + }, + { + "entropy": 1.7442241807778676, + "epoch": 1.5011672296833374, + "grad_norm": 0.8911228179931641, + "learning_rate": 4.781725830454919e-06, + "loss": 1.4769, + "mean_token_accuracy": 0.6623394538958868, + "num_tokens": 2291875701.0, + "step": 13665 + }, + { + "entropy": 1.6710403362909954, + "epoch": 1.5012770865947105, + "grad_norm": 0.6459485292434692, + "learning_rate": 4.780568421853962e-06, + "loss": 1.3773, + "mean_token_accuracy": 0.6579457273085912, + "num_tokens": 2292060235.0, + "step": 13666 + }, + { + "entropy": 1.7651425302028656, + "epoch": 1.5013869435060834, + "grad_norm": 0.8423007130622864, + "learning_rate": 4.779411210095214e-06, + "loss": 1.4055, + "mean_token_accuracy": 0.6594301611185074, + "num_tokens": 2292192597.0, + "step": 13667 + }, + { + "entropy": 1.6896374821662903, + "epoch": 1.5014968004174563, + "grad_norm": 0.7431081533432007, + "learning_rate": 4.778254195215295e-06, + "loss": 1.3766, + "mean_token_accuracy": 0.6599178363879522, + "num_tokens": 2292354541.0, + "step": 13668 + }, + { + "entropy": 1.6598813434441884, + "epoch": 1.5016066573288294, + "grad_norm": 0.8162408471107483, + "learning_rate": 4.777097377250831e-06, + "loss": 1.3112, + "mean_token_accuracy": 0.677242711186409, + "num_tokens": 2292495597.0, + "step": 13669 + }, + { + "entropy": 1.7197925249735515, + "epoch": 1.501716514240202, + "grad_norm": 0.649800181388855, + "learning_rate": 4.775940756238431e-06, + "loss": 1.5137, + "mean_token_accuracy": 0.6383554091056188, + "num_tokens": 2292653072.0, + "step": 13670 + }, + { + "entropy": 1.6852848728497822, + "epoch": 1.5018263711515751, + "grad_norm": 0.7410378456115723, + "learning_rate": 4.774784332214697e-06, + "loss": 1.4563, + "mean_token_accuracy": 0.6579979757467905, + "num_tokens": 2292806988.0, + "step": 13671 + }, + { + "entropy": 1.746435950199763, + "epoch": 1.501936228062948, + "grad_norm": 0.6971200108528137, + "learning_rate": 4.773628105216238e-06, + "loss": 1.3093, + "mean_token_accuracy": 0.6615460316340128, + "num_tokens": 2292939408.0, + "step": 13672 + }, + { + "entropy": 1.6766025920708973, + "epoch": 1.502046084974321, + "grad_norm": 0.6413285732269287, + "learning_rate": 4.772472075279643e-06, + "loss": 1.3751, + "mean_token_accuracy": 0.6748560518026352, + "num_tokens": 2293083000.0, + "step": 13673 + }, + { + "entropy": 1.6996668179829915, + "epoch": 1.502155941885694, + "grad_norm": 0.7790882587432861, + "learning_rate": 4.771316242441498e-06, + "loss": 1.327, + "mean_token_accuracy": 0.661717543999354, + "num_tokens": 2293233616.0, + "step": 13674 + }, + { + "entropy": 1.6642019947369893, + "epoch": 1.5022657987970667, + "grad_norm": 0.7140267491340637, + "learning_rate": 4.7701606067383875e-06, + "loss": 1.2386, + "mean_token_accuracy": 0.6947644750277201, + "num_tokens": 2293363011.0, + "step": 13675 + }, + { + "entropy": 1.7092399597167969, + "epoch": 1.5023756557084398, + "grad_norm": 0.6863964796066284, + "learning_rate": 4.76900516820689e-06, + "loss": 1.3732, + "mean_token_accuracy": 0.6574582954247793, + "num_tokens": 2293515091.0, + "step": 13676 + }, + { + "entropy": 1.6853445172309875, + "epoch": 1.5024855126198127, + "grad_norm": 0.7969051599502563, + "learning_rate": 4.76784992688357e-06, + "loss": 1.4371, + "mean_token_accuracy": 0.6546561618645986, + "num_tokens": 2293667579.0, + "step": 13677 + }, + { + "entropy": 1.6716053783893585, + "epoch": 1.5025953695311856, + "grad_norm": 8.106036186218262, + "learning_rate": 4.76669488280499e-06, + "loss": 1.2827, + "mean_token_accuracy": 0.6833441058794657, + "num_tokens": 2293876244.0, + "step": 13678 + }, + { + "entropy": 1.6749347150325775, + "epoch": 1.5027052264425587, + "grad_norm": 0.7246283292770386, + "learning_rate": 4.76554003600771e-06, + "loss": 1.4164, + "mean_token_accuracy": 0.6572469621896744, + "num_tokens": 2294045369.0, + "step": 13679 + }, + { + "entropy": 1.6581110556920369, + "epoch": 1.5028150833539315, + "grad_norm": 0.684249997138977, + "learning_rate": 4.764385386528276e-06, + "loss": 1.3176, + "mean_token_accuracy": 0.6688032547632853, + "num_tokens": 2294188555.0, + "step": 13680 + }, + { + "entropy": 1.8024831314881642, + "epoch": 1.5029249402653044, + "grad_norm": 0.7943997979164124, + "learning_rate": 4.763230934403237e-06, + "loss": 1.44, + "mean_token_accuracy": 0.6461151192585627, + "num_tokens": 2294350821.0, + "step": 13681 + }, + { + "entropy": 1.6654022733370464, + "epoch": 1.5030347971766775, + "grad_norm": 0.6539124250411987, + "learning_rate": 4.762076679669128e-06, + "loss": 1.4501, + "mean_token_accuracy": 0.6525823424259821, + "num_tokens": 2294535256.0, + "step": 13682 + }, + { + "entropy": 1.6741309265295665, + "epoch": 1.5031446540880502, + "grad_norm": 0.7052878737449646, + "learning_rate": 4.760922622362481e-06, + "loss": 1.3265, + "mean_token_accuracy": 0.6755642145872116, + "num_tokens": 2294715316.0, + "step": 13683 + }, + { + "entropy": 1.7677266299724579, + "epoch": 1.5032545109994233, + "grad_norm": 0.6491983532905579, + "learning_rate": 4.759768762519822e-06, + "loss": 1.4475, + "mean_token_accuracy": 0.6450400104125341, + "num_tokens": 2294970194.0, + "step": 13684 + }, + { + "entropy": 1.706259439388911, + "epoch": 1.5033643679107962, + "grad_norm": 0.6732988953590393, + "learning_rate": 4.75861510017767e-06, + "loss": 1.291, + "mean_token_accuracy": 0.6787703533967336, + "num_tokens": 2295122089.0, + "step": 13685 + }, + { + "entropy": 1.7781917651494343, + "epoch": 1.503474224822169, + "grad_norm": 0.7497197985649109, + "learning_rate": 4.757461635372536e-06, + "loss": 1.3894, + "mean_token_accuracy": 0.6516469866037369, + "num_tokens": 2295249519.0, + "step": 13686 + }, + { + "entropy": 1.6908331016699474, + "epoch": 1.5035840817335422, + "grad_norm": 0.6211578845977783, + "learning_rate": 4.756308368140927e-06, + "loss": 1.4081, + "mean_token_accuracy": 0.6597599039475123, + "num_tokens": 2295417437.0, + "step": 13687 + }, + { + "entropy": 1.738340864578883, + "epoch": 1.5036939386449149, + "grad_norm": 0.7150505781173706, + "learning_rate": 4.755155298519349e-06, + "loss": 1.4526, + "mean_token_accuracy": 0.6501429776350657, + "num_tokens": 2295603865.0, + "step": 13688 + }, + { + "entropy": 1.7665735979874928, + "epoch": 1.503803795556288, + "grad_norm": 0.7368733286857605, + "learning_rate": 4.7540024265442905e-06, + "loss": 1.5544, + "mean_token_accuracy": 0.6462236990531286, + "num_tokens": 2295751755.0, + "step": 13689 + }, + { + "entropy": 1.7254281441370647, + "epoch": 1.5039136524676608, + "grad_norm": 0.7418520450592041, + "learning_rate": 4.7528497522522385e-06, + "loss": 1.4659, + "mean_token_accuracy": 0.6408105889956156, + "num_tokens": 2295946046.0, + "step": 13690 + }, + { + "entropy": 1.7010563015937805, + "epoch": 1.5040235093790337, + "grad_norm": 0.6515077948570251, + "learning_rate": 4.75169727567968e-06, + "loss": 1.448, + "mean_token_accuracy": 0.6640833069880804, + "num_tokens": 2296143617.0, + "step": 13691 + }, + { + "entropy": 1.693510760863622, + "epoch": 1.5041333662904068, + "grad_norm": 0.650558590888977, + "learning_rate": 4.750544996863083e-06, + "loss": 1.4015, + "mean_token_accuracy": 0.6564928144216537, + "num_tokens": 2296290922.0, + "step": 13692 + }, + { + "entropy": 1.7150411407152812, + "epoch": 1.5042432232017797, + "grad_norm": 0.6943185329437256, + "learning_rate": 4.749392915838925e-06, + "loss": 1.3855, + "mean_token_accuracy": 0.6770055890083313, + "num_tokens": 2296481790.0, + "step": 13693 + }, + { + "entropy": 1.7828082740306854, + "epoch": 1.5043530801131526, + "grad_norm": 0.7607249617576599, + "learning_rate": 4.748241032643664e-06, + "loss": 1.4255, + "mean_token_accuracy": 0.6515330821275711, + "num_tokens": 2296600354.0, + "step": 13694 + }, + { + "entropy": 1.6512650549411774, + "epoch": 1.5044629370245257, + "grad_norm": 0.6560864448547363, + "learning_rate": 4.747089347313755e-06, + "loss": 1.2883, + "mean_token_accuracy": 0.6671764502922694, + "num_tokens": 2296774491.0, + "step": 13695 + }, + { + "entropy": 1.7189783950646718, + "epoch": 1.5045727939358984, + "grad_norm": 0.7325725555419922, + "learning_rate": 4.7459378598856525e-06, + "loss": 1.3908, + "mean_token_accuracy": 0.6567257990439733, + "num_tokens": 2296914415.0, + "step": 13696 + }, + { + "entropy": 1.729427436987559, + "epoch": 1.5046826508472715, + "grad_norm": 0.751825749874115, + "learning_rate": 4.744786570395798e-06, + "loss": 1.3657, + "mean_token_accuracy": 0.6634038190046946, + "num_tokens": 2297093221.0, + "step": 13697 + }, + { + "entropy": 1.7347900966803234, + "epoch": 1.5047925077586444, + "grad_norm": 0.6090309619903564, + "learning_rate": 4.743635478880628e-06, + "loss": 1.462, + "mean_token_accuracy": 0.6348727444807688, + "num_tokens": 2297310033.0, + "step": 13698 + }, + { + "entropy": 1.7234489421049755, + "epoch": 1.5049023646700173, + "grad_norm": 0.6301156878471375, + "learning_rate": 4.742484585376576e-06, + "loss": 1.4262, + "mean_token_accuracy": 0.6584373613198599, + "num_tokens": 2297493384.0, + "step": 13699 + }, + { + "entropy": 1.6984918216864269, + "epoch": 1.5050122215813904, + "grad_norm": 0.6758148074150085, + "learning_rate": 4.74133388992007e-06, + "loss": 1.4503, + "mean_token_accuracy": 0.6438490003347397, + "num_tokens": 2297684605.0, + "step": 13700 + }, + { + "entropy": 1.7280223667621613, + "epoch": 1.505122078492763, + "grad_norm": 0.5646396279335022, + "learning_rate": 4.740183392547526e-06, + "loss": 1.4605, + "mean_token_accuracy": 0.6355783194303513, + "num_tokens": 2297901393.0, + "step": 13701 + }, + { + "entropy": 1.6897727847099304, + "epoch": 1.5052319354041361, + "grad_norm": 0.7985787987709045, + "learning_rate": 4.739033093295354e-06, + "loss": 1.3079, + "mean_token_accuracy": 0.6786791036526362, + "num_tokens": 2298041421.0, + "step": 13702 + }, + { + "entropy": 1.6934953530629475, + "epoch": 1.505341792315509, + "grad_norm": 0.6441080570220947, + "learning_rate": 4.737882992199966e-06, + "loss": 1.4262, + "mean_token_accuracy": 0.6507407377163569, + "num_tokens": 2298251682.0, + "step": 13703 + }, + { + "entropy": 1.7239458660284679, + "epoch": 1.505451649226882, + "grad_norm": 0.6519790291786194, + "learning_rate": 4.7367330892977575e-06, + "loss": 1.5049, + "mean_token_accuracy": 0.6607875376939774, + "num_tokens": 2298459653.0, + "step": 13704 + }, + { + "entropy": 1.669982651869456, + "epoch": 1.505561506138255, + "grad_norm": 0.7108668684959412, + "learning_rate": 4.735583384625126e-06, + "loss": 1.4123, + "mean_token_accuracy": 0.6692193200190862, + "num_tokens": 2298597296.0, + "step": 13705 + }, + { + "entropy": 1.6570941507816315, + "epoch": 1.505671363049628, + "grad_norm": 0.7075870633125305, + "learning_rate": 4.734433878218458e-06, + "loss": 1.4478, + "mean_token_accuracy": 0.6687259177366892, + "num_tokens": 2298807746.0, + "step": 13706 + }, + { + "entropy": 1.733141968647639, + "epoch": 1.5057812199610008, + "grad_norm": 0.7741393446922302, + "learning_rate": 4.733284570114132e-06, + "loss": 1.4514, + "mean_token_accuracy": 0.6441124876340231, + "num_tokens": 2298973558.0, + "step": 13707 + }, + { + "entropy": 1.7128113210201263, + "epoch": 1.505891076872374, + "grad_norm": 0.7485846877098083, + "learning_rate": 4.732135460348528e-06, + "loss": 1.362, + "mean_token_accuracy": 0.6596867889165878, + "num_tokens": 2299133295.0, + "step": 13708 + }, + { + "entropy": 1.695679912964503, + "epoch": 1.5060009337837466, + "grad_norm": 0.7252331376075745, + "learning_rate": 4.730986548958013e-06, + "loss": 1.5427, + "mean_token_accuracy": 0.6482022255659103, + "num_tokens": 2299358470.0, + "step": 13709 + }, + { + "entropy": 1.6781230370203655, + "epoch": 1.5061107906951197, + "grad_norm": 0.6633203029632568, + "learning_rate": 4.729837835978946e-06, + "loss": 1.2652, + "mean_token_accuracy": 0.6692434151967367, + "num_tokens": 2299493276.0, + "step": 13710 + }, + { + "entropy": 1.7279355724652607, + "epoch": 1.5062206476064925, + "grad_norm": 0.9011074900627136, + "learning_rate": 4.728689321447685e-06, + "loss": 1.4516, + "mean_token_accuracy": 0.660559723774592, + "num_tokens": 2299656386.0, + "step": 13711 + }, + { + "entropy": 1.6402093569437664, + "epoch": 1.5063305045178654, + "grad_norm": 0.7499086260795593, + "learning_rate": 4.727541005400584e-06, + "loss": 1.4408, + "mean_token_accuracy": 0.6577077358961105, + "num_tokens": 2299799422.0, + "step": 13712 + }, + { + "entropy": 1.70639768242836, + "epoch": 1.5064403614292385, + "grad_norm": 0.7071443200111389, + "learning_rate": 4.726392887873984e-06, + "loss": 1.4686, + "mean_token_accuracy": 0.6468622287114462, + "num_tokens": 2299983882.0, + "step": 13713 + }, + { + "entropy": 1.6494310796260834, + "epoch": 1.5065502183406112, + "grad_norm": 0.7178316116333008, + "learning_rate": 4.725244968904219e-06, + "loss": 1.3299, + "mean_token_accuracy": 0.6607310126225153, + "num_tokens": 2300124046.0, + "step": 13714 + }, + { + "entropy": 1.6953360736370087, + "epoch": 1.5066600752519843, + "grad_norm": 0.861855685710907, + "learning_rate": 4.724097248527627e-06, + "loss": 1.4663, + "mean_token_accuracy": 0.6720197945833206, + "num_tokens": 2300287184.0, + "step": 13715 + }, + { + "entropy": 1.7422000865141551, + "epoch": 1.5067699321633572, + "grad_norm": 0.7510853409767151, + "learning_rate": 4.722949726780526e-06, + "loss": 1.318, + "mean_token_accuracy": 0.6652670900026957, + "num_tokens": 2300394163.0, + "step": 13716 + }, + { + "entropy": 1.7209021250406902, + "epoch": 1.50687978907473, + "grad_norm": 0.8294375538825989, + "learning_rate": 4.721802403699244e-06, + "loss": 1.4164, + "mean_token_accuracy": 0.6530443280935287, + "num_tokens": 2300530554.0, + "step": 13717 + }, + { + "entropy": 1.7540078063805897, + "epoch": 1.5069896459861032, + "grad_norm": 0.6351104378700256, + "learning_rate": 4.720655279320079e-06, + "loss": 1.4425, + "mean_token_accuracy": 0.6411314457654953, + "num_tokens": 2300742561.0, + "step": 13718 + }, + { + "entropy": 1.665945549805959, + "epoch": 1.507099502897476, + "grad_norm": 0.6579576134681702, + "learning_rate": 4.719508353679347e-06, + "loss": 1.4437, + "mean_token_accuracy": 0.6524225821097692, + "num_tokens": 2300910840.0, + "step": 13719 + }, + { + "entropy": 1.6633223791917164, + "epoch": 1.507209359808849, + "grad_norm": 0.6341217756271362, + "learning_rate": 4.718361626813347e-06, + "loss": 1.3326, + "mean_token_accuracy": 0.6552438537279764, + "num_tokens": 2301079382.0, + "step": 13720 + }, + { + "entropy": 1.722684770822525, + "epoch": 1.507319216720222, + "grad_norm": 0.6889148354530334, + "learning_rate": 4.717215098758373e-06, + "loss": 1.4923, + "mean_token_accuracy": 0.6483738025029501, + "num_tokens": 2301291490.0, + "step": 13721 + }, + { + "entropy": 1.728617916504542, + "epoch": 1.5074290736315947, + "grad_norm": 0.811132550239563, + "learning_rate": 4.716068769550705e-06, + "loss": 1.4803, + "mean_token_accuracy": 0.6466376930475235, + "num_tokens": 2301465815.0, + "step": 13722 + }, + { + "entropy": 1.717352608839671, + "epoch": 1.5075389305429678, + "grad_norm": 0.7719509601593018, + "learning_rate": 4.714922639226632e-06, + "loss": 1.4298, + "mean_token_accuracy": 0.65669085085392, + "num_tokens": 2301627400.0, + "step": 13723 + }, + { + "entropy": 1.6580866078535716, + "epoch": 1.5076487874543407, + "grad_norm": 0.6603320240974426, + "learning_rate": 4.713776707822424e-06, + "loss": 1.4617, + "mean_token_accuracy": 0.6624788045883179, + "num_tokens": 2301832465.0, + "step": 13724 + }, + { + "entropy": 1.7385274867216747, + "epoch": 1.5077586443657136, + "grad_norm": 0.7888380885124207, + "learning_rate": 4.712630975374352e-06, + "loss": 1.3048, + "mean_token_accuracy": 0.6638933221499125, + "num_tokens": 2301979094.0, + "step": 13725 + }, + { + "entropy": 1.7239436507225037, + "epoch": 1.5078685012770867, + "grad_norm": 0.8936779499053955, + "learning_rate": 4.711485441918676e-06, + "loss": 1.2566, + "mean_token_accuracy": 0.678350642323494, + "num_tokens": 2302110862.0, + "step": 13726 + }, + { + "entropy": 1.61548513174057, + "epoch": 1.5079783581884594, + "grad_norm": 0.5887323021888733, + "learning_rate": 4.7103401074916505e-06, + "loss": 1.3824, + "mean_token_accuracy": 0.6588394343852997, + "num_tokens": 2302319582.0, + "step": 13727 + }, + { + "entropy": 1.7819407383600872, + "epoch": 1.5080882150998325, + "grad_norm": 0.7063882946968079, + "learning_rate": 4.70919497212953e-06, + "loss": 1.699, + "mean_token_accuracy": 0.610893577337265, + "num_tokens": 2302538990.0, + "step": 13728 + }, + { + "entropy": 1.7448661824067433, + "epoch": 1.5081980720112054, + "grad_norm": 0.6677849888801575, + "learning_rate": 4.708050035868552e-06, + "loss": 1.5426, + "mean_token_accuracy": 0.6525959322849909, + "num_tokens": 2302738791.0, + "step": 13729 + }, + { + "entropy": 1.6619132260481517, + "epoch": 1.5083079289225783, + "grad_norm": 0.7120320200920105, + "learning_rate": 4.706905298744953e-06, + "loss": 1.3273, + "mean_token_accuracy": 0.660373717546463, + "num_tokens": 2302885338.0, + "step": 13730 + }, + { + "entropy": 1.7256252070267994, + "epoch": 1.5084177858339514, + "grad_norm": 0.7116526961326599, + "learning_rate": 4.705760760794966e-06, + "loss": 1.4432, + "mean_token_accuracy": 0.6434483329455057, + "num_tokens": 2303047678.0, + "step": 13731 + }, + { + "entropy": 1.727946698665619, + "epoch": 1.5085276427453242, + "grad_norm": 0.8816052079200745, + "learning_rate": 4.704616422054816e-06, + "loss": 1.2795, + "mean_token_accuracy": 0.6723741243282954, + "num_tokens": 2303171804.0, + "step": 13732 + }, + { + "entropy": 1.7127221127351124, + "epoch": 1.5086374996566971, + "grad_norm": 0.6119446754455566, + "learning_rate": 4.7034722825607205e-06, + "loss": 1.3865, + "mean_token_accuracy": 0.652864803870519, + "num_tokens": 2303339544.0, + "step": 13733 + }, + { + "entropy": 1.6904581189155579, + "epoch": 1.5087473565680702, + "grad_norm": 0.7307989001274109, + "learning_rate": 4.702328342348888e-06, + "loss": 1.4435, + "mean_token_accuracy": 0.6472090234359106, + "num_tokens": 2303523201.0, + "step": 13734 + }, + { + "entropy": 1.7181105117003124, + "epoch": 1.508857213479443, + "grad_norm": 0.6447550654411316, + "learning_rate": 4.701184601455527e-06, + "loss": 1.4236, + "mean_token_accuracy": 0.6380515098571777, + "num_tokens": 2303765509.0, + "step": 13735 + }, + { + "entropy": 1.727871795495351, + "epoch": 1.508967070390816, + "grad_norm": 0.7264792919158936, + "learning_rate": 4.700041059916833e-06, + "loss": 1.4861, + "mean_token_accuracy": 0.6438801288604736, + "num_tokens": 2303938570.0, + "step": 13736 + }, + { + "entropy": 1.6387112736701965, + "epoch": 1.509076927302189, + "grad_norm": 0.6865224838256836, + "learning_rate": 4.6988977177690035e-06, + "loss": 1.4266, + "mean_token_accuracy": 0.6634560376405716, + "num_tokens": 2304111132.0, + "step": 13737 + }, + { + "entropy": 1.6926419138908386, + "epoch": 1.5091867842135618, + "grad_norm": 0.6934045553207397, + "learning_rate": 4.697754575048223e-06, + "loss": 1.4896, + "mean_token_accuracy": 0.6502173244953156, + "num_tokens": 2304281050.0, + "step": 13738 + }, + { + "entropy": 1.7066146433353424, + "epoch": 1.509296641124935, + "grad_norm": 0.7495198249816895, + "learning_rate": 4.696611631790665e-06, + "loss": 1.3706, + "mean_token_accuracy": 0.6793716996908188, + "num_tokens": 2304436350.0, + "step": 13739 + }, + { + "entropy": 1.7890824973583221, + "epoch": 1.5094064980363076, + "grad_norm": 0.7411239743232727, + "learning_rate": 4.695468888032513e-06, + "loss": 1.4913, + "mean_token_accuracy": 0.6336054851611456, + "num_tokens": 2304575159.0, + "step": 13740 + }, + { + "entropy": 1.7109392881393433, + "epoch": 1.5095163549476807, + "grad_norm": 0.6763020157814026, + "learning_rate": 4.694326343809929e-06, + "loss": 1.4573, + "mean_token_accuracy": 0.6470163067181905, + "num_tokens": 2304755316.0, + "step": 13741 + }, + { + "entropy": 1.6757989923159282, + "epoch": 1.5096262118590535, + "grad_norm": 0.6875401139259338, + "learning_rate": 4.693183999159073e-06, + "loss": 1.4078, + "mean_token_accuracy": 0.6494525969028473, + "num_tokens": 2304937028.0, + "step": 13742 + }, + { + "entropy": 1.7914471526940663, + "epoch": 1.5097360687704264, + "grad_norm": 0.8657746911048889, + "learning_rate": 4.692041854116101e-06, + "loss": 1.4989, + "mean_token_accuracy": 0.6594147632519404, + "num_tokens": 2305111526.0, + "step": 13743 + }, + { + "entropy": 1.7403202851613362, + "epoch": 1.5098459256817995, + "grad_norm": 0.6036834716796875, + "learning_rate": 4.6908999087171645e-06, + "loss": 1.3853, + "mean_token_accuracy": 0.6570370892683665, + "num_tokens": 2305273638.0, + "step": 13744 + }, + { + "entropy": 1.699168711900711, + "epoch": 1.5099557825931724, + "grad_norm": 0.6418320536613464, + "learning_rate": 4.689758162998403e-06, + "loss": 1.4483, + "mean_token_accuracy": 0.6439376026391983, + "num_tokens": 2305466484.0, + "step": 13745 + }, + { + "entropy": 1.7933777173360188, + "epoch": 1.5100656395045453, + "grad_norm": 0.7398175001144409, + "learning_rate": 4.688616616995949e-06, + "loss": 1.3478, + "mean_token_accuracy": 0.6565447101990382, + "num_tokens": 2305625923.0, + "step": 13746 + }, + { + "entropy": 1.712761531273524, + "epoch": 1.5101754964159184, + "grad_norm": 0.5988736152648926, + "learning_rate": 4.687475270745939e-06, + "loss": 1.4272, + "mean_token_accuracy": 0.6635372291008631, + "num_tokens": 2305795495.0, + "step": 13747 + }, + { + "entropy": 1.6717861990133922, + "epoch": 1.510285353327291, + "grad_norm": 0.6338712573051453, + "learning_rate": 4.686334124284489e-06, + "loss": 1.4542, + "mean_token_accuracy": 0.6616056164105734, + "num_tokens": 2306022578.0, + "step": 13748 + }, + { + "entropy": 1.6783255338668823, + "epoch": 1.5103952102386642, + "grad_norm": 0.7062821984291077, + "learning_rate": 4.685193177647721e-06, + "loss": 1.4289, + "mean_token_accuracy": 0.6619517654180527, + "num_tokens": 2306208025.0, + "step": 13749 + }, + { + "entropy": 1.7133217950661976, + "epoch": 1.510505067150037, + "grad_norm": 0.8360413312911987, + "learning_rate": 4.684052430871744e-06, + "loss": 1.3356, + "mean_token_accuracy": 0.6822609504063925, + "num_tokens": 2306394670.0, + "step": 13750 + }, + { + "entropy": 1.7764535943667095, + "epoch": 1.51061492406141, + "grad_norm": 0.7038290500640869, + "learning_rate": 4.682911883992659e-06, + "loss": 1.3857, + "mean_token_accuracy": 0.6598734309275945, + "num_tokens": 2306545028.0, + "step": 13751 + }, + { + "entropy": 1.7245705723762512, + "epoch": 1.510724780972783, + "grad_norm": 0.6591954231262207, + "learning_rate": 4.681771537046568e-06, + "loss": 1.3675, + "mean_token_accuracy": 0.6651880691448847, + "num_tokens": 2306708834.0, + "step": 13752 + }, + { + "entropy": 1.7273605664571126, + "epoch": 1.5108346378841557, + "grad_norm": 0.754412055015564, + "learning_rate": 4.680631390069561e-06, + "loss": 1.3327, + "mean_token_accuracy": 0.665368507305781, + "num_tokens": 2306860973.0, + "step": 13753 + }, + { + "entropy": 1.6641030311584473, + "epoch": 1.5109444947955288, + "grad_norm": 0.8478244543075562, + "learning_rate": 4.679491443097721e-06, + "loss": 1.3195, + "mean_token_accuracy": 0.6591992974281311, + "num_tokens": 2307024826.0, + "step": 13754 + }, + { + "entropy": 1.7281495829423268, + "epoch": 1.5110543517069017, + "grad_norm": 0.8327801823616028, + "learning_rate": 4.678351696167129e-06, + "loss": 1.2827, + "mean_token_accuracy": 0.66518135368824, + "num_tokens": 2307155483.0, + "step": 13755 + }, + { + "entropy": 1.676687588294347, + "epoch": 1.5111642086182746, + "grad_norm": 0.6652231812477112, + "learning_rate": 4.677212149313859e-06, + "loss": 1.3258, + "mean_token_accuracy": 0.6645462463299433, + "num_tokens": 2307328046.0, + "step": 13756 + }, + { + "entropy": 1.7564114332199097, + "epoch": 1.5112740655296477, + "grad_norm": 0.6835631132125854, + "learning_rate": 4.676072802573976e-06, + "loss": 1.3074, + "mean_token_accuracy": 0.6630533536275228, + "num_tokens": 2307471442.0, + "step": 13757 + }, + { + "entropy": 1.7643627524375916, + "epoch": 1.5113839224410206, + "grad_norm": 0.7151616215705872, + "learning_rate": 4.674933655983535e-06, + "loss": 1.4011, + "mean_token_accuracy": 0.6479361802339554, + "num_tokens": 2307601581.0, + "step": 13758 + }, + { + "entropy": 1.7416835725307465, + "epoch": 1.5114937793523935, + "grad_norm": 0.6428912878036499, + "learning_rate": 4.673794709578598e-06, + "loss": 1.5211, + "mean_token_accuracy": 0.6297041177749634, + "num_tokens": 2307869204.0, + "step": 13759 + }, + { + "entropy": 1.7405222256978352, + "epoch": 1.5116036362637666, + "grad_norm": 0.7016375660896301, + "learning_rate": 4.672655963395205e-06, + "loss": 1.3975, + "mean_token_accuracy": 0.6664147426684698, + "num_tokens": 2308073195.0, + "step": 13760 + }, + { + "entropy": 1.6804000735282898, + "epoch": 1.5117134931751393, + "grad_norm": 0.6481202840805054, + "learning_rate": 4.671517417469402e-06, + "loss": 1.496, + "mean_token_accuracy": 0.6482650935649872, + "num_tokens": 2308239309.0, + "step": 13761 + }, + { + "entropy": 1.7526653309663136, + "epoch": 1.5118233500865124, + "grad_norm": 0.7293574213981628, + "learning_rate": 4.670379071837221e-06, + "loss": 1.4853, + "mean_token_accuracy": 0.6492472440004349, + "num_tokens": 2308401820.0, + "step": 13762 + }, + { + "entropy": 1.7497336467107136, + "epoch": 1.5119332069978852, + "grad_norm": 0.6734301447868347, + "learning_rate": 4.6692409265346876e-06, + "loss": 1.3734, + "mean_token_accuracy": 0.6600970327854156, + "num_tokens": 2308542755.0, + "step": 13763 + }, + { + "entropy": 1.701109786828359, + "epoch": 1.5120430639092581, + "grad_norm": 0.8500663042068481, + "learning_rate": 4.668102981597828e-06, + "loss": 1.5474, + "mean_token_accuracy": 0.6489445865154266, + "num_tokens": 2308734003.0, + "step": 13764 + }, + { + "entropy": 1.723372757434845, + "epoch": 1.5121529208206312, + "grad_norm": 0.6357081532478333, + "learning_rate": 4.666965237062657e-06, + "loss": 1.3514, + "mean_token_accuracy": 0.6554248780012131, + "num_tokens": 2308878086.0, + "step": 13765 + }, + { + "entropy": 1.702651709318161, + "epoch": 1.512262777732004, + "grad_norm": 0.7335965633392334, + "learning_rate": 4.66582769296518e-06, + "loss": 1.3079, + "mean_token_accuracy": 0.6714355101188024, + "num_tokens": 2308988685.0, + "step": 13766 + }, + { + "entropy": 1.710147311290105, + "epoch": 1.512372634643377, + "grad_norm": 0.7795404195785522, + "learning_rate": 4.664690349341402e-06, + "loss": 1.4638, + "mean_token_accuracy": 0.659597784280777, + "num_tokens": 2309151111.0, + "step": 13767 + }, + { + "entropy": 1.7290584842363994, + "epoch": 1.51248249155475, + "grad_norm": 0.6968294382095337, + "learning_rate": 4.663553206227321e-06, + "loss": 1.3245, + "mean_token_accuracy": 0.6631123870611191, + "num_tokens": 2309280184.0, + "step": 13768 + }, + { + "entropy": 1.7158755660057068, + "epoch": 1.5125923484661228, + "grad_norm": 0.6981979608535767, + "learning_rate": 4.662416263658927e-06, + "loss": 1.3123, + "mean_token_accuracy": 0.6700327694416046, + "num_tokens": 2309436766.0, + "step": 13769 + }, + { + "entropy": 1.7104793687661488, + "epoch": 1.512702205377496, + "grad_norm": 0.7425520420074463, + "learning_rate": 4.661279521672199e-06, + "loss": 1.4128, + "mean_token_accuracy": 0.6677025308211645, + "num_tokens": 2309588572.0, + "step": 13770 + }, + { + "entropy": 1.610454837481181, + "epoch": 1.5128120622888688, + "grad_norm": 0.6415528655052185, + "learning_rate": 4.660142980303121e-06, + "loss": 1.2953, + "mean_token_accuracy": 0.6645541985829672, + "num_tokens": 2309754449.0, + "step": 13771 + }, + { + "entropy": 1.6928213934103649, + "epoch": 1.5129219192002417, + "grad_norm": 0.6795879602432251, + "learning_rate": 4.659006639587659e-06, + "loss": 1.4469, + "mean_token_accuracy": 0.6411414295434952, + "num_tokens": 2309947529.0, + "step": 13772 + }, + { + "entropy": 1.7953710655371349, + "epoch": 1.5130317761116148, + "grad_norm": 0.6941425800323486, + "learning_rate": 4.657870499561781e-06, + "loss": 1.5126, + "mean_token_accuracy": 0.629363218943278, + "num_tokens": 2310124867.0, + "step": 13773 + }, + { + "entropy": 1.6879153450330098, + "epoch": 1.5131416330229874, + "grad_norm": 0.6987261772155762, + "learning_rate": 4.656734560261445e-06, + "loss": 1.2105, + "mean_token_accuracy": 0.6808223128318787, + "num_tokens": 2310246484.0, + "step": 13774 + }, + { + "entropy": 1.6733311613400776, + "epoch": 1.5132514899343605, + "grad_norm": 0.6962072253227234, + "learning_rate": 4.655598821722597e-06, + "loss": 1.3406, + "mean_token_accuracy": 0.6601444731156031, + "num_tokens": 2310381379.0, + "step": 13775 + }, + { + "entropy": 1.6864116390546162, + "epoch": 1.5133613468457334, + "grad_norm": 0.6691348552703857, + "learning_rate": 4.654463283981193e-06, + "loss": 1.3422, + "mean_token_accuracy": 0.6588364889224371, + "num_tokens": 2310624298.0, + "step": 13776 + }, + { + "entropy": 1.6842545072237651, + "epoch": 1.5134712037571063, + "grad_norm": 0.7775861024856567, + "learning_rate": 4.653327947073165e-06, + "loss": 1.3162, + "mean_token_accuracy": 0.665345624089241, + "num_tokens": 2310810645.0, + "step": 13777 + }, + { + "entropy": 1.6352218687534332, + "epoch": 1.5135810606684794, + "grad_norm": 0.626237690448761, + "learning_rate": 4.652192811034445e-06, + "loss": 1.3978, + "mean_token_accuracy": 0.6582045257091522, + "num_tokens": 2311014180.0, + "step": 13778 + }, + { + "entropy": 1.642144391934077, + "epoch": 1.513690917579852, + "grad_norm": 0.7304378747940063, + "learning_rate": 4.651057875900964e-06, + "loss": 1.4529, + "mean_token_accuracy": 0.6429022600253423, + "num_tokens": 2311188278.0, + "step": 13779 + }, + { + "entropy": 1.7175097266832988, + "epoch": 1.5138007744912252, + "grad_norm": 0.6320850253105164, + "learning_rate": 4.649923141708639e-06, + "loss": 1.4223, + "mean_token_accuracy": 0.6590802123149236, + "num_tokens": 2311368476.0, + "step": 13780 + }, + { + "entropy": 1.6938395003477733, + "epoch": 1.513910631402598, + "grad_norm": 0.6381179690361023, + "learning_rate": 4.648788608493388e-06, + "loss": 1.3746, + "mean_token_accuracy": 0.6710842897494634, + "num_tokens": 2311553528.0, + "step": 13781 + }, + { + "entropy": 1.7335260311762493, + "epoch": 1.514020488313971, + "grad_norm": 0.7260196805000305, + "learning_rate": 4.647654276291114e-06, + "loss": 1.3556, + "mean_token_accuracy": 0.654750128587087, + "num_tokens": 2311740346.0, + "step": 13782 + }, + { + "entropy": 1.793048232793808, + "epoch": 1.514130345225344, + "grad_norm": 0.7554465532302856, + "learning_rate": 4.646520145137719e-06, + "loss": 1.5026, + "mean_token_accuracy": 0.6503528704245886, + "num_tokens": 2311938873.0, + "step": 13783 + }, + { + "entropy": 1.7190141479174297, + "epoch": 1.514240202136717, + "grad_norm": 0.7326686382293701, + "learning_rate": 4.645386215069097e-06, + "loss": 1.423, + "mean_token_accuracy": 0.6645475178956985, + "num_tokens": 2312120733.0, + "step": 13784 + }, + { + "entropy": 1.676575392484665, + "epoch": 1.5143500590480898, + "grad_norm": 0.6733846068382263, + "learning_rate": 4.644252486121145e-06, + "loss": 1.3577, + "mean_token_accuracy": 0.6720403631528219, + "num_tokens": 2312307679.0, + "step": 13785 + }, + { + "entropy": 1.7140410840511322, + "epoch": 1.514459915959463, + "grad_norm": 0.6526421904563904, + "learning_rate": 4.643118958329731e-06, + "loss": 1.3907, + "mean_token_accuracy": 0.6629425088564554, + "num_tokens": 2312492173.0, + "step": 13786 + }, + { + "entropy": 1.6425736447175343, + "epoch": 1.5145697728708356, + "grad_norm": 0.7509266138076782, + "learning_rate": 4.641985631730737e-06, + "loss": 1.4446, + "mean_token_accuracy": 0.6570387085278829, + "num_tokens": 2312697867.0, + "step": 13787 + }, + { + "entropy": 1.730595697959264, + "epoch": 1.5146796297822087, + "grad_norm": 10.333463668823242, + "learning_rate": 4.640852506360037e-06, + "loss": 1.1871, + "mean_token_accuracy": 0.6863798399766287, + "num_tokens": 2312869978.0, + "step": 13788 + }, + { + "entropy": 1.7071086366971333, + "epoch": 1.5147894866935816, + "grad_norm": 0.7106054425239563, + "learning_rate": 4.639719582253489e-06, + "loss": 1.3772, + "mean_token_accuracy": 0.6516473690668741, + "num_tokens": 2313025921.0, + "step": 13789 + }, + { + "entropy": 1.6998887260754902, + "epoch": 1.5148993436049545, + "grad_norm": 0.536662757396698, + "learning_rate": 4.638586859446947e-06, + "loss": 1.4427, + "mean_token_accuracy": 0.6548623442649841, + "num_tokens": 2313202721.0, + "step": 13790 + }, + { + "entropy": 1.7144613564014435, + "epoch": 1.5150092005163276, + "grad_norm": 0.8173153400421143, + "learning_rate": 4.637454337976267e-06, + "loss": 1.3728, + "mean_token_accuracy": 0.6525384138027827, + "num_tokens": 2313375827.0, + "step": 13791 + }, + { + "entropy": 1.668924331665039, + "epoch": 1.5151190574277003, + "grad_norm": 0.6728224754333496, + "learning_rate": 4.636322017877289e-06, + "loss": 1.3425, + "mean_token_accuracy": 0.6699813405672709, + "num_tokens": 2313559355.0, + "step": 13792 + }, + { + "entropy": 1.6745288372039795, + "epoch": 1.5152289143390734, + "grad_norm": 0.6349695920944214, + "learning_rate": 4.6351898991858526e-06, + "loss": 1.274, + "mean_token_accuracy": 0.6697569986184438, + "num_tokens": 2313698496.0, + "step": 13793 + }, + { + "entropy": 1.7029302318890889, + "epoch": 1.5153387712504462, + "grad_norm": 0.6972919702529907, + "learning_rate": 4.6340579819377885e-06, + "loss": 1.4831, + "mean_token_accuracy": 0.6308980584144592, + "num_tokens": 2313906683.0, + "step": 13794 + }, + { + "entropy": 1.701465239127477, + "epoch": 1.5154486281618191, + "grad_norm": 0.6877496242523193, + "learning_rate": 4.632926266168918e-06, + "loss": 1.2802, + "mean_token_accuracy": 0.6722718824942907, + "num_tokens": 2314039947.0, + "step": 13795 + }, + { + "entropy": 1.755535235007604, + "epoch": 1.5155584850731922, + "grad_norm": 0.6645422577857971, + "learning_rate": 4.631794751915063e-06, + "loss": 1.5432, + "mean_token_accuracy": 0.6427832990884781, + "num_tokens": 2314236598.0, + "step": 13796 + }, + { + "entropy": 1.8075012763341267, + "epoch": 1.5156683419845651, + "grad_norm": 0.7589188814163208, + "learning_rate": 4.630663439212039e-06, + "loss": 1.6916, + "mean_token_accuracy": 0.6184907828768095, + "num_tokens": 2314460621.0, + "step": 13797 + }, + { + "entropy": 1.6989454329013824, + "epoch": 1.515778198895938, + "grad_norm": 0.604321300983429, + "learning_rate": 4.629532328095641e-06, + "loss": 1.3933, + "mean_token_accuracy": 0.6470302095015844, + "num_tokens": 2314660461.0, + "step": 13798 + }, + { + "entropy": 1.6902850965658824, + "epoch": 1.5158880558073111, + "grad_norm": 0.7460362911224365, + "learning_rate": 4.628401418601675e-06, + "loss": 1.4371, + "mean_token_accuracy": 0.671076680223147, + "num_tokens": 2314804887.0, + "step": 13799 + }, + { + "entropy": 1.7061065534750621, + "epoch": 1.5159979127186838, + "grad_norm": 0.6932515501976013, + "learning_rate": 4.627270710765935e-06, + "loss": 1.2638, + "mean_token_accuracy": 0.6705667823553085, + "num_tokens": 2314930379.0, + "step": 13800 + }, + { + "entropy": 1.6936753690242767, + "epoch": 1.516107769630057, + "grad_norm": 0.6244261860847473, + "learning_rate": 4.626140204624207e-06, + "loss": 1.4434, + "mean_token_accuracy": 0.6529469887415568, + "num_tokens": 2315130329.0, + "step": 13801 + }, + { + "entropy": 1.7133234739303589, + "epoch": 1.5162176265414298, + "grad_norm": 0.7487544417381287, + "learning_rate": 4.625009900212265e-06, + "loss": 1.3369, + "mean_token_accuracy": 0.6595756113529205, + "num_tokens": 2315276874.0, + "step": 13802 + }, + { + "entropy": 1.6450997491677601, + "epoch": 1.5163274834528027, + "grad_norm": 0.7113902568817139, + "learning_rate": 4.62387979756589e-06, + "loss": 1.3556, + "mean_token_accuracy": 0.662777175505956, + "num_tokens": 2315477438.0, + "step": 13803 + }, + { + "entropy": 1.7127596934636433, + "epoch": 1.5164373403641758, + "grad_norm": 0.7134985327720642, + "learning_rate": 4.622749896720845e-06, + "loss": 1.4482, + "mean_token_accuracy": 0.6417205582062403, + "num_tokens": 2315669497.0, + "step": 13804 + }, + { + "entropy": 1.659109354019165, + "epoch": 1.5165471972755487, + "grad_norm": 0.7004081010818481, + "learning_rate": 4.621620197712894e-06, + "loss": 1.4047, + "mean_token_accuracy": 0.6536833544572195, + "num_tokens": 2315841211.0, + "step": 13805 + }, + { + "entropy": 1.7210937837759654, + "epoch": 1.5166570541869215, + "grad_norm": 0.6302258372306824, + "learning_rate": 4.620490700577788e-06, + "loss": 1.6054, + "mean_token_accuracy": 0.6396430979172388, + "num_tokens": 2316082178.0, + "step": 13806 + }, + { + "entropy": 1.7338979343573253, + "epoch": 1.5167669110982944, + "grad_norm": 0.6911170482635498, + "learning_rate": 4.619361405351276e-06, + "loss": 1.3438, + "mean_token_accuracy": 0.657525877157847, + "num_tokens": 2316281274.0, + "step": 13807 + }, + { + "entropy": 1.6707657376925151, + "epoch": 1.5168767680096673, + "grad_norm": 0.5436812043190002, + "learning_rate": 4.618232312069102e-06, + "loss": 1.3353, + "mean_token_accuracy": 0.6588641007741293, + "num_tokens": 2316456603.0, + "step": 13808 + }, + { + "entropy": 1.7034710347652435, + "epoch": 1.5169866249210404, + "grad_norm": 0.8173933625221252, + "learning_rate": 4.6171034207670005e-06, + "loss": 1.2925, + "mean_token_accuracy": 0.6720810929934183, + "num_tokens": 2316592576.0, + "step": 13809 + }, + { + "entropy": 1.6645126938819885, + "epoch": 1.5170964818324133, + "grad_norm": 0.6110033392906189, + "learning_rate": 4.615974731480695e-06, + "loss": 1.3464, + "mean_token_accuracy": 0.6670532127221426, + "num_tokens": 2316743979.0, + "step": 13810 + }, + { + "entropy": 1.7283632159233093, + "epoch": 1.5172063387437862, + "grad_norm": 0.5693283081054688, + "learning_rate": 4.614846244245914e-06, + "loss": 1.3587, + "mean_token_accuracy": 0.661205435792605, + "num_tokens": 2316928848.0, + "step": 13811 + }, + { + "entropy": 1.7009834845860798, + "epoch": 1.5173161956551593, + "grad_norm": 0.6659391522407532, + "learning_rate": 4.613717959098374e-06, + "loss": 1.4906, + "mean_token_accuracy": 0.6506659984588623, + "num_tokens": 2317098556.0, + "step": 13812 + }, + { + "entropy": 1.7381359835465748, + "epoch": 1.517426052566532, + "grad_norm": 0.8102354407310486, + "learning_rate": 4.612589876073785e-06, + "loss": 1.3489, + "mean_token_accuracy": 0.6604036937157313, + "num_tokens": 2317270868.0, + "step": 13813 + }, + { + "entropy": 1.7617174784342449, + "epoch": 1.517535909477905, + "grad_norm": 0.816353440284729, + "learning_rate": 4.611461995207843e-06, + "loss": 1.4868, + "mean_token_accuracy": 0.6646532714366913, + "num_tokens": 2317428278.0, + "step": 13814 + }, + { + "entropy": 1.7236855427424114, + "epoch": 1.517645766389278, + "grad_norm": 0.6324769258499146, + "learning_rate": 4.610334316536255e-06, + "loss": 1.4888, + "mean_token_accuracy": 0.6348064343134562, + "num_tokens": 2317659506.0, + "step": 13815 + }, + { + "entropy": 1.6722245911757152, + "epoch": 1.5177556233006508, + "grad_norm": 0.737623393535614, + "learning_rate": 4.609206840094702e-06, + "loss": 1.3835, + "mean_token_accuracy": 0.6555936386187872, + "num_tokens": 2317836071.0, + "step": 13816 + }, + { + "entropy": 1.700161616007487, + "epoch": 1.517865480212024, + "grad_norm": 0.7109667658805847, + "learning_rate": 4.608079565918877e-06, + "loss": 1.3775, + "mean_token_accuracy": 0.6652699112892151, + "num_tokens": 2317969020.0, + "step": 13817 + }, + { + "entropy": 1.7235205272833507, + "epoch": 1.5179753371233968, + "grad_norm": 0.6419909000396729, + "learning_rate": 4.606952494044452e-06, + "loss": 1.4529, + "mean_token_accuracy": 0.6587186654408773, + "num_tokens": 2318125925.0, + "step": 13818 + }, + { + "entropy": 1.7614581386248271, + "epoch": 1.5180851940347697, + "grad_norm": 0.7397036552429199, + "learning_rate": 4.605825624507097e-06, + "loss": 1.2682, + "mean_token_accuracy": 0.6707401523987452, + "num_tokens": 2318235404.0, + "step": 13819 + }, + { + "entropy": 1.6948228081067402, + "epoch": 1.5181950509461426, + "grad_norm": 0.6969819068908691, + "learning_rate": 4.604698957342484e-06, + "loss": 1.3792, + "mean_token_accuracy": 0.6615195969740549, + "num_tokens": 2318404543.0, + "step": 13820 + }, + { + "entropy": 1.6588495473066966, + "epoch": 1.5183049078575155, + "grad_norm": 0.7565116286277771, + "learning_rate": 4.603572492586266e-06, + "loss": 1.4351, + "mean_token_accuracy": 0.6644074221452078, + "num_tokens": 2318598937.0, + "step": 13821 + }, + { + "entropy": 1.6587398151556652, + "epoch": 1.5184147647688886, + "grad_norm": 0.7786286473274231, + "learning_rate": 4.602446230274094e-06, + "loss": 1.3448, + "mean_token_accuracy": 0.6546828200419744, + "num_tokens": 2318769448.0, + "step": 13822 + }, + { + "entropy": 1.6828916768232982, + "epoch": 1.5185246216802615, + "grad_norm": 0.6554774045944214, + "learning_rate": 4.601320170441616e-06, + "loss": 1.3457, + "mean_token_accuracy": 0.6564191430807114, + "num_tokens": 2318939196.0, + "step": 13823 + }, + { + "entropy": 1.7403921981652577, + "epoch": 1.5186344785916344, + "grad_norm": 0.6817828416824341, + "learning_rate": 4.6001943131244745e-06, + "loss": 1.4085, + "mean_token_accuracy": 0.650087426106135, + "num_tokens": 2319099808.0, + "step": 13824 + }, + { + "entropy": 1.7434031864007313, + "epoch": 1.5187443355030075, + "grad_norm": 0.753669798374176, + "learning_rate": 4.5990686583582985e-06, + "loss": 1.3568, + "mean_token_accuracy": 0.6551444629828135, + "num_tokens": 2319261658.0, + "step": 13825 + }, + { + "entropy": 1.7295333445072174, + "epoch": 1.5188541924143801, + "grad_norm": 0.6525160670280457, + "learning_rate": 4.597943206178712e-06, + "loss": 1.3787, + "mean_token_accuracy": 0.6602396667003632, + "num_tokens": 2319392048.0, + "step": 13826 + }, + { + "entropy": 1.6922452350457509, + "epoch": 1.5189640493257532, + "grad_norm": 0.6596596837043762, + "learning_rate": 4.596817956621342e-06, + "loss": 1.4606, + "mean_token_accuracy": 0.6499723295370737, + "num_tokens": 2319591828.0, + "step": 13827 + }, + { + "entropy": 1.7007356186707814, + "epoch": 1.5190739062371261, + "grad_norm": 0.675512433052063, + "learning_rate": 4.595692909721794e-06, + "loss": 1.4131, + "mean_token_accuracy": 0.6725195000569025, + "num_tokens": 2319750099.0, + "step": 13828 + }, + { + "entropy": 1.7956644495328267, + "epoch": 1.519183763148499, + "grad_norm": 0.7226914763450623, + "learning_rate": 4.5945680655156835e-06, + "loss": 1.5228, + "mean_token_accuracy": 0.6371510376532873, + "num_tokens": 2319933057.0, + "step": 13829 + }, + { + "entropy": 1.7390548785527546, + "epoch": 1.5192936200598721, + "grad_norm": 0.7265375852584839, + "learning_rate": 4.593443424038608e-06, + "loss": 1.1547, + "mean_token_accuracy": 0.6949647714694341, + "num_tokens": 2320025210.0, + "step": 13830 + }, + { + "entropy": 1.6455240448315938, + "epoch": 1.519403476971245, + "grad_norm": 0.6959369778633118, + "learning_rate": 4.592318985326158e-06, + "loss": 1.2745, + "mean_token_accuracy": 0.6683288365602493, + "num_tokens": 2320188621.0, + "step": 13831 + }, + { + "entropy": 1.7331445614496868, + "epoch": 1.519513333882618, + "grad_norm": 0.7264030575752258, + "learning_rate": 4.591194749413927e-06, + "loss": 1.4339, + "mean_token_accuracy": 0.6577616731325785, + "num_tokens": 2320334001.0, + "step": 13832 + }, + { + "entropy": 1.6974613467852275, + "epoch": 1.5196231907939908, + "grad_norm": 0.709572970867157, + "learning_rate": 4.590070716337495e-06, + "loss": 1.3339, + "mean_token_accuracy": 0.6570964654286703, + "num_tokens": 2320480424.0, + "step": 13833 + }, + { + "entropy": 1.6866892476876576, + "epoch": 1.5197330477053637, + "grad_norm": 0.6560536623001099, + "learning_rate": 4.588946886132433e-06, + "loss": 1.3631, + "mean_token_accuracy": 0.6670355498790741, + "num_tokens": 2320674360.0, + "step": 13834 + }, + { + "entropy": 1.7293624182542164, + "epoch": 1.5198429046167368, + "grad_norm": 0.758017897605896, + "learning_rate": 4.587823258834313e-06, + "loss": 1.3618, + "mean_token_accuracy": 0.6667436609665552, + "num_tokens": 2320819919.0, + "step": 13835 + }, + { + "entropy": 1.6839136183261871, + "epoch": 1.5199527615281097, + "grad_norm": 0.6840667724609375, + "learning_rate": 4.5866998344787e-06, + "loss": 1.2936, + "mean_token_accuracy": 0.6722035010655721, + "num_tokens": 2321002330.0, + "step": 13836 + }, + { + "entropy": 1.6908271114031475, + "epoch": 1.5200626184394825, + "grad_norm": 0.7176492810249329, + "learning_rate": 4.585576613101149e-06, + "loss": 1.3865, + "mean_token_accuracy": 0.6679946233828863, + "num_tokens": 2321179746.0, + "step": 13837 + }, + { + "entropy": 1.6899594763914745, + "epoch": 1.5201724753508556, + "grad_norm": 0.6602792739868164, + "learning_rate": 4.5844535947372066e-06, + "loss": 1.3103, + "mean_token_accuracy": 0.6629238277673721, + "num_tokens": 2321320167.0, + "step": 13838 + }, + { + "entropy": 1.6667678654193878, + "epoch": 1.5202823322622283, + "grad_norm": 0.6572140455245972, + "learning_rate": 4.583330779422415e-06, + "loss": 1.2763, + "mean_token_accuracy": 0.678403819600741, + "num_tokens": 2321461364.0, + "step": 13839 + }, + { + "entropy": 1.6754031876722972, + "epoch": 1.5203921891736014, + "grad_norm": 0.7893751859664917, + "learning_rate": 4.582208167192312e-06, + "loss": 1.4581, + "mean_token_accuracy": 0.6384941240151724, + "num_tokens": 2321661021.0, + "step": 13840 + }, + { + "entropy": 1.7845016022523243, + "epoch": 1.5205020460849743, + "grad_norm": 0.7893801331520081, + "learning_rate": 4.581085758082434e-06, + "loss": 1.4384, + "mean_token_accuracy": 0.6567031691471735, + "num_tokens": 2321784876.0, + "step": 13841 + }, + { + "entropy": 1.6785170336564381, + "epoch": 1.5206119029963472, + "grad_norm": 0.6259093284606934, + "learning_rate": 4.579963552128294e-06, + "loss": 1.3312, + "mean_token_accuracy": 0.6632338911294937, + "num_tokens": 2321935360.0, + "step": 13842 + }, + { + "entropy": 1.696050186951955, + "epoch": 1.5207217599077203, + "grad_norm": 0.6764148473739624, + "learning_rate": 4.578841549365415e-06, + "loss": 1.3908, + "mean_token_accuracy": 0.6596755584081014, + "num_tokens": 2322083641.0, + "step": 13843 + }, + { + "entropy": 1.6868244409561157, + "epoch": 1.5208316168190932, + "grad_norm": 0.7002372741699219, + "learning_rate": 4.57771974982931e-06, + "loss": 1.3793, + "mean_token_accuracy": 0.6510612765947977, + "num_tokens": 2322237455.0, + "step": 13844 + }, + { + "entropy": 1.7343276540438335, + "epoch": 1.520941473730466, + "grad_norm": 0.8684859275817871, + "learning_rate": 4.576598153555481e-06, + "loss": 1.436, + "mean_token_accuracy": 0.6581882784763972, + "num_tokens": 2322398245.0, + "step": 13845 + }, + { + "entropy": 1.658086081345876, + "epoch": 1.521051330641839, + "grad_norm": 0.7486425042152405, + "learning_rate": 4.575476760579422e-06, + "loss": 1.4714, + "mean_token_accuracy": 0.6543067147334417, + "num_tokens": 2322602433.0, + "step": 13846 + }, + { + "entropy": 1.7451417048772175, + "epoch": 1.5211611875532118, + "grad_norm": 0.6304759979248047, + "learning_rate": 4.574355570936633e-06, + "loss": 1.4442, + "mean_token_accuracy": 0.6402058055003484, + "num_tokens": 2322787006.0, + "step": 13847 + }, + { + "entropy": 1.7013601462046306, + "epoch": 1.521271044464585, + "grad_norm": 0.8568814396858215, + "learning_rate": 4.573234584662592e-06, + "loss": 1.3864, + "mean_token_accuracy": 0.6570970316727957, + "num_tokens": 2322948041.0, + "step": 13848 + }, + { + "entropy": 1.699530432621638, + "epoch": 1.5213809013759578, + "grad_norm": 0.8710819482803345, + "learning_rate": 4.572113801792783e-06, + "loss": 1.4918, + "mean_token_accuracy": 0.6583302120367686, + "num_tokens": 2323150595.0, + "step": 13849 + }, + { + "entropy": 1.6952558259169261, + "epoch": 1.5214907582873307, + "grad_norm": 0.6915019154548645, + "learning_rate": 4.570993222362674e-06, + "loss": 1.3737, + "mean_token_accuracy": 0.6718220909436544, + "num_tokens": 2323287484.0, + "step": 13850 + }, + { + "entropy": 1.7063826123873393, + "epoch": 1.5216006151987038, + "grad_norm": 0.740079939365387, + "learning_rate": 4.569872846407732e-06, + "loss": 1.4068, + "mean_token_accuracy": 0.6677322387695312, + "num_tokens": 2323439617.0, + "step": 13851 + }, + { + "entropy": 1.7466611762841542, + "epoch": 1.5217104721100765, + "grad_norm": 0.6274285912513733, + "learning_rate": 4.568752673963416e-06, + "loss": 1.4659, + "mean_token_accuracy": 0.6516986141602198, + "num_tokens": 2323617361.0, + "step": 13852 + }, + { + "entropy": 1.7027616401513417, + "epoch": 1.5218203290214496, + "grad_norm": 0.57676762342453, + "learning_rate": 4.567632705065186e-06, + "loss": 1.5237, + "mean_token_accuracy": 0.6254571576913198, + "num_tokens": 2323812936.0, + "step": 13853 + }, + { + "entropy": 1.7192297577857971, + "epoch": 1.5219301859328225, + "grad_norm": 0.7230188846588135, + "learning_rate": 4.566512939748476e-06, + "loss": 1.4566, + "mean_token_accuracy": 0.6502372076114019, + "num_tokens": 2323949563.0, + "step": 13854 + }, + { + "entropy": 1.6509900987148285, + "epoch": 1.5220400428441954, + "grad_norm": 0.5848521590232849, + "learning_rate": 4.565393378048733e-06, + "loss": 1.5132, + "mean_token_accuracy": 0.6319101750850677, + "num_tokens": 2324180972.0, + "step": 13855 + }, + { + "entropy": 1.7650962670644124, + "epoch": 1.5221498997555685, + "grad_norm": 0.720257580280304, + "learning_rate": 4.564274020001393e-06, + "loss": 1.3704, + "mean_token_accuracy": 0.6599539568026861, + "num_tokens": 2324303927.0, + "step": 13856 + }, + { + "entropy": 1.7729399303595226, + "epoch": 1.5222597566669414, + "grad_norm": 0.6432046294212341, + "learning_rate": 4.56315486564188e-06, + "loss": 1.4875, + "mean_token_accuracy": 0.6416818896929423, + "num_tokens": 2324492636.0, + "step": 13857 + }, + { + "entropy": 1.752627670764923, + "epoch": 1.5223696135783142, + "grad_norm": 0.6898931264877319, + "learning_rate": 4.562035915005611e-06, + "loss": 1.3701, + "mean_token_accuracy": 0.659583792090416, + "num_tokens": 2324610878.0, + "step": 13858 + }, + { + "entropy": 1.730319658915202, + "epoch": 1.5224794704896873, + "grad_norm": 0.754682183265686, + "learning_rate": 4.560917168128009e-06, + "loss": 1.4008, + "mean_token_accuracy": 0.6601613610982895, + "num_tokens": 2324771008.0, + "step": 13859 + }, + { + "entropy": 1.6441446642080944, + "epoch": 1.52258932740106, + "grad_norm": 0.6233265399932861, + "learning_rate": 4.559798625044473e-06, + "loss": 1.3951, + "mean_token_accuracy": 0.6597211956977844, + "num_tokens": 2324942810.0, + "step": 13860 + }, + { + "entropy": 1.7084046204884846, + "epoch": 1.5226991843124331, + "grad_norm": 0.6157132983207703, + "learning_rate": 4.558680285790413e-06, + "loss": 1.2911, + "mean_token_accuracy": 0.6648881336053213, + "num_tokens": 2325106728.0, + "step": 13861 + }, + { + "entropy": 1.6908452014128368, + "epoch": 1.522809041223806, + "grad_norm": 0.6794646382331848, + "learning_rate": 4.557562150401218e-06, + "loss": 1.438, + "mean_token_accuracy": 0.6520710190137228, + "num_tokens": 2325273843.0, + "step": 13862 + }, + { + "entropy": 1.7190004388491313, + "epoch": 1.5229188981351789, + "grad_norm": 0.7677698731422424, + "learning_rate": 4.556444218912275e-06, + "loss": 1.4297, + "mean_token_accuracy": 0.6583776374657949, + "num_tokens": 2325414563.0, + "step": 13863 + }, + { + "entropy": 1.7089968224366505, + "epoch": 1.523028755046552, + "grad_norm": 0.6367853283882141, + "learning_rate": 4.55532649135897e-06, + "loss": 1.4434, + "mean_token_accuracy": 0.6484352995951971, + "num_tokens": 2325584116.0, + "step": 13864 + }, + { + "entropy": 1.6946298082669575, + "epoch": 1.5231386119579247, + "grad_norm": 0.7599850296974182, + "learning_rate": 4.554208967776681e-06, + "loss": 1.2423, + "mean_token_accuracy": 0.6795276800791422, + "num_tokens": 2325727142.0, + "step": 13865 + }, + { + "entropy": 1.6953876912593842, + "epoch": 1.5232484688692978, + "grad_norm": 0.7022786736488342, + "learning_rate": 4.553091648200771e-06, + "loss": 1.443, + "mean_token_accuracy": 0.6698449452718099, + "num_tokens": 2325866024.0, + "step": 13866 + }, + { + "entropy": 1.786557177702586, + "epoch": 1.5233583257806707, + "grad_norm": 0.8300307393074036, + "learning_rate": 4.551974532666602e-06, + "loss": 1.5473, + "mean_token_accuracy": 0.62337193886439, + "num_tokens": 2326045071.0, + "step": 13867 + }, + { + "entropy": 1.7563962737719219, + "epoch": 1.5234681826920435, + "grad_norm": 0.8392931222915649, + "learning_rate": 4.550857621209538e-06, + "loss": 1.4791, + "mean_token_accuracy": 0.6514745354652405, + "num_tokens": 2326224405.0, + "step": 13868 + }, + { + "entropy": 1.6940257251262665, + "epoch": 1.5235780396034166, + "grad_norm": 0.6167490482330322, + "learning_rate": 4.549740913864926e-06, + "loss": 1.3881, + "mean_token_accuracy": 0.659416675567627, + "num_tokens": 2326372794.0, + "step": 13869 + }, + { + "entropy": 1.7137708365917206, + "epoch": 1.5236878965147895, + "grad_norm": 0.7310816645622253, + "learning_rate": 4.5486244106681025e-06, + "loss": 1.4028, + "mean_token_accuracy": 0.6513861964146296, + "num_tokens": 2326531861.0, + "step": 13870 + }, + { + "entropy": 1.6824666062990825, + "epoch": 1.5237977534261624, + "grad_norm": 0.7571474313735962, + "learning_rate": 4.547508111654412e-06, + "loss": 1.4133, + "mean_token_accuracy": 0.6456713875134786, + "num_tokens": 2326753188.0, + "step": 13871 + }, + { + "entropy": 1.6938877006371815, + "epoch": 1.5239076103375355, + "grad_norm": 0.6758849620819092, + "learning_rate": 4.546392016859181e-06, + "loss": 1.3251, + "mean_token_accuracy": 0.6616143981615702, + "num_tokens": 2326905195.0, + "step": 13872 + }, + { + "entropy": 1.6427725851535797, + "epoch": 1.5240174672489082, + "grad_norm": 0.7181432843208313, + "learning_rate": 4.545276126317736e-06, + "loss": 1.3627, + "mean_token_accuracy": 0.6573081215222677, + "num_tokens": 2327094426.0, + "step": 13873 + }, + { + "entropy": 1.7088764309883118, + "epoch": 1.5241273241602813, + "grad_norm": 0.7042227387428284, + "learning_rate": 4.544160440065394e-06, + "loss": 1.5437, + "mean_token_accuracy": 0.6491079305609068, + "num_tokens": 2327227303.0, + "step": 13874 + }, + { + "entropy": 1.7235973974068959, + "epoch": 1.5242371810716542, + "grad_norm": 0.6834602952003479, + "learning_rate": 4.54304495813746e-06, + "loss": 1.2941, + "mean_token_accuracy": 0.6685907791058222, + "num_tokens": 2327358886.0, + "step": 13875 + }, + { + "entropy": 1.6720999280611675, + "epoch": 1.524347037983027, + "grad_norm": 0.5712348818778992, + "learning_rate": 4.541929680569246e-06, + "loss": 1.4281, + "mean_token_accuracy": 0.6634454180796941, + "num_tokens": 2327567108.0, + "step": 13876 + }, + { + "entropy": 1.6583287914594014, + "epoch": 1.5244568948944002, + "grad_norm": 0.7057250738143921, + "learning_rate": 4.540814607396052e-06, + "loss": 1.4087, + "mean_token_accuracy": 0.6564200818538666, + "num_tokens": 2327755776.0, + "step": 13877 + }, + { + "entropy": 1.6734488407770793, + "epoch": 1.5245667518057728, + "grad_norm": 0.667782187461853, + "learning_rate": 4.53969973865316e-06, + "loss": 1.328, + "mean_token_accuracy": 0.6641414314508438, + "num_tokens": 2327934192.0, + "step": 13878 + }, + { + "entropy": 1.704680899779002, + "epoch": 1.524676608717146, + "grad_norm": 0.662611722946167, + "learning_rate": 4.538585074375861e-06, + "loss": 1.4161, + "mean_token_accuracy": 0.6625035852193832, + "num_tokens": 2328111156.0, + "step": 13879 + }, + { + "entropy": 1.7344763179620106, + "epoch": 1.5247864656285188, + "grad_norm": 0.6415066123008728, + "learning_rate": 4.537470614599434e-06, + "loss": 1.3515, + "mean_token_accuracy": 0.6630072891712189, + "num_tokens": 2328262464.0, + "step": 13880 + }, + { + "entropy": 1.7009705603122711, + "epoch": 1.5248963225398917, + "grad_norm": 0.7482420802116394, + "learning_rate": 4.5363563593591505e-06, + "loss": 1.4322, + "mean_token_accuracy": 0.6513066440820694, + "num_tokens": 2328427937.0, + "step": 13881 + }, + { + "entropy": 1.69928045074145, + "epoch": 1.5250061794512648, + "grad_norm": 0.6484189629554749, + "learning_rate": 4.5352423086902725e-06, + "loss": 1.4215, + "mean_token_accuracy": 0.6485116630792618, + "num_tokens": 2328606277.0, + "step": 13882 + }, + { + "entropy": 1.6935794452826183, + "epoch": 1.5251160363626377, + "grad_norm": 0.6256346106529236, + "learning_rate": 4.534128462628066e-06, + "loss": 1.288, + "mean_token_accuracy": 0.6697153101364771, + "num_tokens": 2328753230.0, + "step": 13883 + }, + { + "entropy": 1.684336672226588, + "epoch": 1.5252258932740106, + "grad_norm": 0.6663857102394104, + "learning_rate": 4.533014821207776e-06, + "loss": 1.5602, + "mean_token_accuracy": 0.6558093825976054, + "num_tokens": 2329026991.0, + "step": 13884 + }, + { + "entropy": 1.772289623816808, + "epoch": 1.5253357501853837, + "grad_norm": 0.722428023815155, + "learning_rate": 4.531901384464657e-06, + "loss": 1.4207, + "mean_token_accuracy": 0.6482864121596018, + "num_tokens": 2329184506.0, + "step": 13885 + }, + { + "entropy": 1.6704554855823517, + "epoch": 1.5254456070967564, + "grad_norm": 0.7864761352539062, + "learning_rate": 4.5307881524339436e-06, + "loss": 1.5056, + "mean_token_accuracy": 0.6587265928586324, + "num_tokens": 2329322566.0, + "step": 13886 + }, + { + "entropy": 1.7044276495774586, + "epoch": 1.5255554640081295, + "grad_norm": 1.0024720430374146, + "learning_rate": 4.529675125150868e-06, + "loss": 1.2254, + "mean_token_accuracy": 0.686733677983284, + "num_tokens": 2329467035.0, + "step": 13887 + }, + { + "entropy": 1.744888146718343, + "epoch": 1.5256653209195024, + "grad_norm": 0.9798945784568787, + "learning_rate": 4.528562302650661e-06, + "loss": 1.4146, + "mean_token_accuracy": 0.6556217769781748, + "num_tokens": 2329619365.0, + "step": 13888 + }, + { + "entropy": 1.5887063244978588, + "epoch": 1.5257751778308752, + "grad_norm": 0.6329157948493958, + "learning_rate": 4.527449684968542e-06, + "loss": 1.3162, + "mean_token_accuracy": 0.6746849020322164, + "num_tokens": 2329831176.0, + "step": 13889 + }, + { + "entropy": 1.7707992394765217, + "epoch": 1.5258850347422483, + "grad_norm": 0.7070748805999756, + "learning_rate": 4.5263372721397205e-06, + "loss": 1.4715, + "mean_token_accuracy": 0.6502692202727, + "num_tokens": 2330045897.0, + "step": 13890 + }, + { + "entropy": 1.6835823158423107, + "epoch": 1.525994891653621, + "grad_norm": 0.6027015447616577, + "learning_rate": 4.5252250641994066e-06, + "loss": 1.4211, + "mean_token_accuracy": 0.6563113729159037, + "num_tokens": 2330194857.0, + "step": 13891 + }, + { + "entropy": 1.6718124349912007, + "epoch": 1.5261047485649941, + "grad_norm": 0.7228647470474243, + "learning_rate": 4.524113061182806e-06, + "loss": 1.3283, + "mean_token_accuracy": 0.6579829454421997, + "num_tokens": 2330352101.0, + "step": 13892 + }, + { + "entropy": 1.703097979227702, + "epoch": 1.526214605476367, + "grad_norm": 0.7070342302322388, + "learning_rate": 4.523001263125108e-06, + "loss": 1.3875, + "mean_token_accuracy": 0.6609033346176147, + "num_tokens": 2330512728.0, + "step": 13893 + }, + { + "entropy": 1.6350172857443492, + "epoch": 1.5263244623877399, + "grad_norm": 0.6093044877052307, + "learning_rate": 4.5218896700614995e-06, + "loss": 1.4077, + "mean_token_accuracy": 0.6465565909941992, + "num_tokens": 2330771542.0, + "step": 13894 + }, + { + "entropy": 1.7364859382311504, + "epoch": 1.526434319299113, + "grad_norm": 0.7398340106010437, + "learning_rate": 4.520778282027166e-06, + "loss": 1.4541, + "mean_token_accuracy": 0.6436052819093069, + "num_tokens": 2330944232.0, + "step": 13895 + }, + { + "entropy": 1.6555716196695964, + "epoch": 1.5265441762104859, + "grad_norm": 0.7023627758026123, + "learning_rate": 4.5196670990572775e-06, + "loss": 1.3531, + "mean_token_accuracy": 0.6626766125361124, + "num_tokens": 2331077782.0, + "step": 13896 + }, + { + "entropy": 1.6921831766764324, + "epoch": 1.5266540331218588, + "grad_norm": 0.7476381063461304, + "learning_rate": 4.518556121187008e-06, + "loss": 1.2434, + "mean_token_accuracy": 0.6709140290816625, + "num_tokens": 2331204207.0, + "step": 13897 + }, + { + "entropy": 1.656636933485667, + "epoch": 1.5267638900332319, + "grad_norm": 0.7082569003105164, + "learning_rate": 4.517445348451517e-06, + "loss": 1.3313, + "mean_token_accuracy": 0.6678289026021957, + "num_tokens": 2331349418.0, + "step": 13898 + }, + { + "entropy": 1.6638062099615734, + "epoch": 1.5268737469446045, + "grad_norm": 0.929499626159668, + "learning_rate": 4.516334780885956e-06, + "loss": 1.4912, + "mean_token_accuracy": 0.6647545297940572, + "num_tokens": 2331514470.0, + "step": 13899 + }, + { + "entropy": 1.7199692924817402, + "epoch": 1.5269836038559776, + "grad_norm": 0.7336742877960205, + "learning_rate": 4.515224418525481e-06, + "loss": 1.4878, + "mean_token_accuracy": 0.6436636795600256, + "num_tokens": 2331681866.0, + "step": 13900 + }, + { + "entropy": 1.706304907798767, + "epoch": 1.5270934607673505, + "grad_norm": 0.6889383792877197, + "learning_rate": 4.51411426140523e-06, + "loss": 1.3244, + "mean_token_accuracy": 0.6611945678790411, + "num_tokens": 2331824203.0, + "step": 13901 + }, + { + "entropy": 1.6809849242369335, + "epoch": 1.5272033176787234, + "grad_norm": 0.7422223687171936, + "learning_rate": 4.513004309560339e-06, + "loss": 1.2971, + "mean_token_accuracy": 0.6715661436319351, + "num_tokens": 2332002531.0, + "step": 13902 + }, + { + "entropy": 1.7066124081611633, + "epoch": 1.5273131745900965, + "grad_norm": 0.7259396910667419, + "learning_rate": 4.511894563025941e-06, + "loss": 1.4524, + "mean_token_accuracy": 0.656074732542038, + "num_tokens": 2332142038.0, + "step": 13903 + }, + { + "entropy": 1.668474902709325, + "epoch": 1.5274230315014692, + "grad_norm": 0.6683173179626465, + "learning_rate": 4.510785021837152e-06, + "loss": 1.3409, + "mean_token_accuracy": 0.6598901102940241, + "num_tokens": 2332282595.0, + "step": 13904 + }, + { + "entropy": 1.6978270014127095, + "epoch": 1.5275328884128423, + "grad_norm": 0.7189866900444031, + "learning_rate": 4.509675686029098e-06, + "loss": 1.2363, + "mean_token_accuracy": 0.6774458686510721, + "num_tokens": 2332402578.0, + "step": 13905 + }, + { + "entropy": 1.7065897683302562, + "epoch": 1.5276427453242152, + "grad_norm": 0.6405203342437744, + "learning_rate": 4.508566555636883e-06, + "loss": 1.4826, + "mean_token_accuracy": 0.643202950557073, + "num_tokens": 2332591335.0, + "step": 13906 + }, + { + "entropy": 1.6972811818122864, + "epoch": 1.527752602235588, + "grad_norm": 0.9096667170524597, + "learning_rate": 4.507457630695608e-06, + "loss": 1.2485, + "mean_token_accuracy": 0.6712992439667383, + "num_tokens": 2332720592.0, + "step": 13907 + }, + { + "entropy": 1.747529496749242, + "epoch": 1.5278624591469612, + "grad_norm": 0.6828884482383728, + "learning_rate": 4.506348911240373e-06, + "loss": 1.3577, + "mean_token_accuracy": 0.6520382066567739, + "num_tokens": 2332905147.0, + "step": 13908 + }, + { + "entropy": 1.7126144965489705, + "epoch": 1.527972316058334, + "grad_norm": 0.7815736532211304, + "learning_rate": 4.505240397306276e-06, + "loss": 1.3709, + "mean_token_accuracy": 0.6534913231929144, + "num_tokens": 2333089144.0, + "step": 13909 + }, + { + "entropy": 1.6652606030305226, + "epoch": 1.528082172969707, + "grad_norm": 0.619626522064209, + "learning_rate": 4.504132088928387e-06, + "loss": 1.3865, + "mean_token_accuracy": 0.6694531738758087, + "num_tokens": 2333339666.0, + "step": 13910 + }, + { + "entropy": 1.6987803876399994, + "epoch": 1.52819202988108, + "grad_norm": 0.670711874961853, + "learning_rate": 4.50302398614179e-06, + "loss": 1.3458, + "mean_token_accuracy": 0.6516731629769007, + "num_tokens": 2333501206.0, + "step": 13911 + }, + { + "entropy": 1.7137329777081807, + "epoch": 1.5283018867924527, + "grad_norm": 0.6769323945045471, + "learning_rate": 4.50191608898156e-06, + "loss": 1.3155, + "mean_token_accuracy": 0.6620151499907175, + "num_tokens": 2333669854.0, + "step": 13912 + }, + { + "entropy": 1.7096583346525829, + "epoch": 1.5284117437038258, + "grad_norm": 1.8156622648239136, + "learning_rate": 4.500808397482758e-06, + "loss": 1.4023, + "mean_token_accuracy": 0.6582985719045004, + "num_tokens": 2333802912.0, + "step": 13913 + }, + { + "entropy": 1.7230294446150463, + "epoch": 1.5285216006151987, + "grad_norm": 0.8509101867675781, + "learning_rate": 4.499700911680438e-06, + "loss": 1.3916, + "mean_token_accuracy": 0.6567439685265223, + "num_tokens": 2333964505.0, + "step": 13914 + }, + { + "entropy": 1.6359948416550953, + "epoch": 1.5286314575265716, + "grad_norm": 0.5980956554412842, + "learning_rate": 4.498593631609659e-06, + "loss": 1.3394, + "mean_token_accuracy": 0.655903235077858, + "num_tokens": 2334153055.0, + "step": 13915 + }, + { + "entropy": 1.6873148282368977, + "epoch": 1.5287413144379447, + "grad_norm": 0.7259606719017029, + "learning_rate": 4.497486557305457e-06, + "loss": 1.4662, + "mean_token_accuracy": 0.649679829676946, + "num_tokens": 2334300460.0, + "step": 13916 + }, + { + "entropy": 1.717822104692459, + "epoch": 1.5288511713493174, + "grad_norm": 0.6588619947433472, + "learning_rate": 4.4963796888028795e-06, + "loss": 1.3461, + "mean_token_accuracy": 0.6689090430736542, + "num_tokens": 2334450903.0, + "step": 13917 + }, + { + "entropy": 1.7424447536468506, + "epoch": 1.5289610282606905, + "grad_norm": 0.6434611082077026, + "learning_rate": 4.495273026136955e-06, + "loss": 1.4264, + "mean_token_accuracy": 0.6540406395991644, + "num_tokens": 2334682613.0, + "step": 13918 + }, + { + "entropy": 1.701530744632085, + "epoch": 1.5290708851720634, + "grad_norm": 0.6814815402030945, + "learning_rate": 4.494166569342703e-06, + "loss": 1.3523, + "mean_token_accuracy": 0.6607520679632822, + "num_tokens": 2334831018.0, + "step": 13919 + }, + { + "entropy": 1.7284032305081685, + "epoch": 1.5291807420834362, + "grad_norm": 0.6312406063079834, + "learning_rate": 4.493060318455149e-06, + "loss": 1.3887, + "mean_token_accuracy": 0.6543530275424322, + "num_tokens": 2334984799.0, + "step": 13920 + }, + { + "entropy": 1.6821238696575165, + "epoch": 1.5292905989948093, + "grad_norm": 0.8389406204223633, + "learning_rate": 4.49195427350931e-06, + "loss": 1.5546, + "mean_token_accuracy": 0.6384274909893671, + "num_tokens": 2335183329.0, + "step": 13921 + }, + { + "entropy": 1.6805053154627483, + "epoch": 1.5294004559061822, + "grad_norm": 0.6420016884803772, + "learning_rate": 4.49084843454018e-06, + "loss": 1.4169, + "mean_token_accuracy": 0.6547876248757044, + "num_tokens": 2335394113.0, + "step": 13922 + }, + { + "entropy": 1.7125700910886128, + "epoch": 1.5295103128175551, + "grad_norm": 0.6581193804740906, + "learning_rate": 4.489742801582763e-06, + "loss": 1.4741, + "mean_token_accuracy": 0.6349633236726125, + "num_tokens": 2335586212.0, + "step": 13923 + }, + { + "entropy": 1.6752095818519592, + "epoch": 1.5296201697289282, + "grad_norm": 0.6447793841362, + "learning_rate": 4.488637374672055e-06, + "loss": 1.3054, + "mean_token_accuracy": 0.6653489669164022, + "num_tokens": 2335744802.0, + "step": 13924 + }, + { + "entropy": 1.7021776934464772, + "epoch": 1.5297300266403009, + "grad_norm": 0.7589120864868164, + "learning_rate": 4.487532153843042e-06, + "loss": 1.3662, + "mean_token_accuracy": 0.6518423855304718, + "num_tokens": 2335891904.0, + "step": 13925 + }, + { + "entropy": 1.7558875183264415, + "epoch": 1.529839883551674, + "grad_norm": 0.7773360013961792, + "learning_rate": 4.4864271391306966e-06, + "loss": 1.587, + "mean_token_accuracy": 0.6268220792214075, + "num_tokens": 2336077877.0, + "step": 13926 + }, + { + "entropy": 1.7166087726751964, + "epoch": 1.5299497404630469, + "grad_norm": 0.7485700845718384, + "learning_rate": 4.485322330570001e-06, + "loss": 1.5295, + "mean_token_accuracy": 0.6402496894200643, + "num_tokens": 2336277354.0, + "step": 13927 + }, + { + "entropy": 1.6612951358159382, + "epoch": 1.5300595973744198, + "grad_norm": 0.6849234104156494, + "learning_rate": 4.484217728195916e-06, + "loss": 1.4499, + "mean_token_accuracy": 0.6464580297470093, + "num_tokens": 2336483683.0, + "step": 13928 + }, + { + "entropy": 1.7225702504316966, + "epoch": 1.5301694542857929, + "grad_norm": 0.7260335087776184, + "learning_rate": 4.483113332043406e-06, + "loss": 1.5129, + "mean_token_accuracy": 0.6519743303457896, + "num_tokens": 2336649224.0, + "step": 13929 + }, + { + "entropy": 1.7422150870164235, + "epoch": 1.5302793111971655, + "grad_norm": 0.7213874459266663, + "learning_rate": 4.482009142147423e-06, + "loss": 1.4632, + "mean_token_accuracy": 0.6494071384270986, + "num_tokens": 2336842981.0, + "step": 13930 + }, + { + "entropy": 1.7239739795525868, + "epoch": 1.5303891681085386, + "grad_norm": 1.510428786277771, + "learning_rate": 4.48090515854291e-06, + "loss": 1.2778, + "mean_token_accuracy": 0.6638440688451132, + "num_tokens": 2337021650.0, + "step": 13931 + }, + { + "entropy": 1.6931169827779133, + "epoch": 1.5304990250199115, + "grad_norm": 0.6036139130592346, + "learning_rate": 4.479801381264812e-06, + "loss": 1.3952, + "mean_token_accuracy": 0.6582480867703756, + "num_tokens": 2337184063.0, + "step": 13932 + }, + { + "entropy": 1.7547406653563182, + "epoch": 1.5306088819312844, + "grad_norm": 0.7975739240646362, + "learning_rate": 4.478697810348067e-06, + "loss": 1.4196, + "mean_token_accuracy": 0.6527342349290848, + "num_tokens": 2337334775.0, + "step": 13933 + }, + { + "entropy": 1.7274678846200306, + "epoch": 1.5307187388426575, + "grad_norm": 0.6878901124000549, + "learning_rate": 4.477594445827593e-06, + "loss": 1.2836, + "mean_token_accuracy": 0.6727159321308136, + "num_tokens": 2337481459.0, + "step": 13934 + }, + { + "entropy": 1.7022731602191925, + "epoch": 1.5308285957540304, + "grad_norm": 0.5858879685401917, + "learning_rate": 4.476491287738315e-06, + "loss": 1.4707, + "mean_token_accuracy": 0.6447849820057551, + "num_tokens": 2337670469.0, + "step": 13935 + }, + { + "entropy": 1.7229323883851368, + "epoch": 1.5309384526654033, + "grad_norm": 0.7864375710487366, + "learning_rate": 4.47538833611515e-06, + "loss": 1.4633, + "mean_token_accuracy": 0.660116657614708, + "num_tokens": 2337834873.0, + "step": 13936 + }, + { + "entropy": 1.6673548420270283, + "epoch": 1.5310483095767764, + "grad_norm": 0.6428960561752319, + "learning_rate": 4.474285590993006e-06, + "loss": 1.2677, + "mean_token_accuracy": 0.6740302940209707, + "num_tokens": 2337956224.0, + "step": 13937 + }, + { + "entropy": 1.710586170355479, + "epoch": 1.531158166488149, + "grad_norm": 0.7057774066925049, + "learning_rate": 4.473183052406779e-06, + "loss": 1.5108, + "mean_token_accuracy": 0.6455043405294418, + "num_tokens": 2338114195.0, + "step": 13938 + }, + { + "entropy": 1.8051166435082753, + "epoch": 1.5312680233995222, + "grad_norm": 0.7137125134468079, + "learning_rate": 4.47208072039137e-06, + "loss": 1.3484, + "mean_token_accuracy": 0.6553170531988144, + "num_tokens": 2338255652.0, + "step": 13939 + }, + { + "entropy": 1.7347020109494526, + "epoch": 1.531377880310895, + "grad_norm": 0.6780955195426941, + "learning_rate": 4.470978594981662e-06, + "loss": 1.2638, + "mean_token_accuracy": 0.6686906566222509, + "num_tokens": 2338407717.0, + "step": 13940 + }, + { + "entropy": 1.7669414679209392, + "epoch": 1.531487737222268, + "grad_norm": 0.6672480702400208, + "learning_rate": 4.4698766762125424e-06, + "loss": 1.5253, + "mean_token_accuracy": 0.6340082536141077, + "num_tokens": 2338614879.0, + "step": 13941 + }, + { + "entropy": 1.7260774771372478, + "epoch": 1.531597594133641, + "grad_norm": 0.7122741937637329, + "learning_rate": 4.4687749641188825e-06, + "loss": 1.1811, + "mean_token_accuracy": 0.6872084339459738, + "num_tokens": 2338713200.0, + "step": 13942 + }, + { + "entropy": 1.7119547426700592, + "epoch": 1.5317074510450137, + "grad_norm": 0.6557315587997437, + "learning_rate": 4.4676734587355495e-06, + "loss": 1.4215, + "mean_token_accuracy": 0.6668926427761713, + "num_tokens": 2338895649.0, + "step": 13943 + }, + { + "entropy": 1.7337224682172139, + "epoch": 1.5318173079563868, + "grad_norm": 0.688232421875, + "learning_rate": 4.466572160097409e-06, + "loss": 1.312, + "mean_token_accuracy": 0.6711380928754807, + "num_tokens": 2339057198.0, + "step": 13944 + }, + { + "entropy": 1.735473394393921, + "epoch": 1.5319271648677597, + "grad_norm": 0.7628278732299805, + "learning_rate": 4.46547106823932e-06, + "loss": 1.372, + "mean_token_accuracy": 0.6484810014565786, + "num_tokens": 2339218108.0, + "step": 13945 + }, + { + "entropy": 1.6890499293804169, + "epoch": 1.5320370217791326, + "grad_norm": 0.9208077788352966, + "learning_rate": 4.464370183196122e-06, + "loss": 1.479, + "mean_token_accuracy": 0.6497959345579147, + "num_tokens": 2339408501.0, + "step": 13946 + }, + { + "entropy": 1.7463324666023254, + "epoch": 1.5321468786905057, + "grad_norm": 0.8276370167732239, + "learning_rate": 4.463269505002663e-06, + "loss": 1.4574, + "mean_token_accuracy": 0.6638544549544653, + "num_tokens": 2339542985.0, + "step": 13947 + }, + { + "entropy": 1.671470006306966, + "epoch": 1.5322567356018786, + "grad_norm": 0.6009271740913391, + "learning_rate": 4.462169033693782e-06, + "loss": 1.3789, + "mean_token_accuracy": 0.6519037485122681, + "num_tokens": 2339734500.0, + "step": 13948 + }, + { + "entropy": 1.7509864171346028, + "epoch": 1.5323665925132515, + "grad_norm": 0.7692368626594543, + "learning_rate": 4.461068769304303e-06, + "loss": 1.3132, + "mean_token_accuracy": 0.6650248964627584, + "num_tokens": 2339859100.0, + "step": 13949 + }, + { + "entropy": 1.7392914295196533, + "epoch": 1.5324764494246246, + "grad_norm": 0.683042049407959, + "learning_rate": 4.45996871186905e-06, + "loss": 1.373, + "mean_token_accuracy": 0.65072533984979, + "num_tokens": 2339994657.0, + "step": 13950 + }, + { + "entropy": 1.709182192881902, + "epoch": 1.5325863063359972, + "grad_norm": 0.6510934829711914, + "learning_rate": 4.4588688614228425e-06, + "loss": 1.3961, + "mean_token_accuracy": 0.654055600365003, + "num_tokens": 2340139163.0, + "step": 13951 + }, + { + "entropy": 1.6690978010495503, + "epoch": 1.5326961632473703, + "grad_norm": 0.6553038358688354, + "learning_rate": 4.457769218000485e-06, + "loss": 1.3819, + "mean_token_accuracy": 0.6699864417314529, + "num_tokens": 2340285613.0, + "step": 13952 + }, + { + "entropy": 1.7530159155527751, + "epoch": 1.5328060201587432, + "grad_norm": 0.7141207456588745, + "learning_rate": 4.456669781636787e-06, + "loss": 1.4391, + "mean_token_accuracy": 0.6459661523501078, + "num_tokens": 2340410082.0, + "step": 13953 + }, + { + "entropy": 1.7613732715447743, + "epoch": 1.5329158770701161, + "grad_norm": 0.6994073987007141, + "learning_rate": 4.455570552366541e-06, + "loss": 1.4204, + "mean_token_accuracy": 0.6491125027338663, + "num_tokens": 2340593994.0, + "step": 13954 + }, + { + "entropy": 1.642663260300954, + "epoch": 1.5330257339814892, + "grad_norm": 0.6598994731903076, + "learning_rate": 4.454471530224536e-06, + "loss": 1.3548, + "mean_token_accuracy": 0.6538633108139038, + "num_tokens": 2340770191.0, + "step": 13955 + }, + { + "entropy": 1.6616478463013966, + "epoch": 1.5331355908928619, + "grad_norm": 0.6697360277175903, + "learning_rate": 4.453372715245557e-06, + "loss": 1.3995, + "mean_token_accuracy": 0.6621593882640203, + "num_tokens": 2340949740.0, + "step": 13956 + }, + { + "entropy": 1.723496437072754, + "epoch": 1.533245447804235, + "grad_norm": 0.7380031943321228, + "learning_rate": 4.452274107464388e-06, + "loss": 1.4068, + "mean_token_accuracy": 0.6533598005771637, + "num_tokens": 2341134749.0, + "step": 13957 + }, + { + "entropy": 1.7315025826295216, + "epoch": 1.5333553047156079, + "grad_norm": 0.6608657240867615, + "learning_rate": 4.451175706915787e-06, + "loss": 1.5382, + "mean_token_accuracy": 0.6362046400705973, + "num_tokens": 2341397004.0, + "step": 13958 + }, + { + "entropy": 1.7039255797863007, + "epoch": 1.5334651616269808, + "grad_norm": 0.6795163154602051, + "learning_rate": 4.450077513634527e-06, + "loss": 1.3976, + "mean_token_accuracy": 0.6478537817796072, + "num_tokens": 2341604517.0, + "step": 13959 + }, + { + "entropy": 1.6687067747116089, + "epoch": 1.5335750185383539, + "grad_norm": 0.7309846878051758, + "learning_rate": 4.44897952765536e-06, + "loss": 1.5048, + "mean_token_accuracy": 0.655499721566836, + "num_tokens": 2341781573.0, + "step": 13960 + }, + { + "entropy": 1.723157713810603, + "epoch": 1.5336848754497268, + "grad_norm": 0.7755676507949829, + "learning_rate": 4.44788174901304e-06, + "loss": 1.411, + "mean_token_accuracy": 0.6578606764475504, + "num_tokens": 2341911128.0, + "step": 13961 + }, + { + "entropy": 1.6915369133154552, + "epoch": 1.5337947323610996, + "grad_norm": 0.7471572160720825, + "learning_rate": 4.446784177742312e-06, + "loss": 1.2909, + "mean_token_accuracy": 0.6779088576634725, + "num_tokens": 2342089929.0, + "step": 13962 + }, + { + "entropy": 1.679351806640625, + "epoch": 1.5339045892724728, + "grad_norm": 0.5526012778282166, + "learning_rate": 4.445686813877907e-06, + "loss": 1.485, + "mean_token_accuracy": 0.6317119797070821, + "num_tokens": 2342319068.0, + "step": 13963 + }, + { + "entropy": 1.6245819826920826, + "epoch": 1.5340144461838454, + "grad_norm": 0.6974883079528809, + "learning_rate": 4.444589657454562e-06, + "loss": 1.4006, + "mean_token_accuracy": 0.6653065234422684, + "num_tokens": 2342464403.0, + "step": 13964 + }, + { + "entropy": 1.6968108018239338, + "epoch": 1.5341243030952185, + "grad_norm": 0.5962358713150024, + "learning_rate": 4.443492708507007e-06, + "loss": 1.4658, + "mean_token_accuracy": 0.6390899419784546, + "num_tokens": 2342734419.0, + "step": 13965 + }, + { + "entropy": 1.7429889142513275, + "epoch": 1.5342341600065914, + "grad_norm": 0.6705568432807922, + "learning_rate": 4.442395967069947e-06, + "loss": 1.4232, + "mean_token_accuracy": 0.643167644739151, + "num_tokens": 2342906016.0, + "step": 13966 + }, + { + "entropy": 1.7325883607069652, + "epoch": 1.5343440169179643, + "grad_norm": 0.7288616895675659, + "learning_rate": 4.441299433178099e-06, + "loss": 1.4707, + "mean_token_accuracy": 0.6572986940542856, + "num_tokens": 2343042540.0, + "step": 13967 + }, + { + "entropy": 1.6961339712142944, + "epoch": 1.5344538738293374, + "grad_norm": 0.7040743827819824, + "learning_rate": 4.440203106866172e-06, + "loss": 1.4501, + "mean_token_accuracy": 0.6512836913267771, + "num_tokens": 2343232269.0, + "step": 13968 + }, + { + "entropy": 1.6818045775095622, + "epoch": 1.53456373074071, + "grad_norm": 0.6538490653038025, + "learning_rate": 4.439106988168861e-06, + "loss": 1.3832, + "mean_token_accuracy": 0.6568224181731542, + "num_tokens": 2343424419.0, + "step": 13969 + }, + { + "entropy": 1.7116661369800568, + "epoch": 1.5346735876520832, + "grad_norm": 0.6646193265914917, + "learning_rate": 4.438011077120854e-06, + "loss": 1.4384, + "mean_token_accuracy": 0.633455440402031, + "num_tokens": 2343599348.0, + "step": 13970 + }, + { + "entropy": 1.6582121352354686, + "epoch": 1.534783444563456, + "grad_norm": 0.5454766154289246, + "learning_rate": 4.436915373756843e-06, + "loss": 1.3379, + "mean_token_accuracy": 0.6576869090398153, + "num_tokens": 2343772491.0, + "step": 13971 + }, + { + "entropy": 1.6670493185520172, + "epoch": 1.534893301474829, + "grad_norm": 0.6243618130683899, + "learning_rate": 4.4358198781114995e-06, + "loss": 1.3609, + "mean_token_accuracy": 0.6600524286429087, + "num_tokens": 2343939801.0, + "step": 13972 + }, + { + "entropy": 1.6370833118756611, + "epoch": 1.535003158386202, + "grad_norm": 0.6674981713294983, + "learning_rate": 4.434724590219502e-06, + "loss": 1.3437, + "mean_token_accuracy": 0.6636191656192144, + "num_tokens": 2344139798.0, + "step": 13973 + }, + { + "entropy": 1.7490037282307942, + "epoch": 1.535113015297575, + "grad_norm": 0.7179242372512817, + "learning_rate": 4.433629510115512e-06, + "loss": 1.3214, + "mean_token_accuracy": 0.6645645598570505, + "num_tokens": 2344267913.0, + "step": 13974 + }, + { + "entropy": 1.7178294559319813, + "epoch": 1.5352228722089478, + "grad_norm": 0.6131008267402649, + "learning_rate": 4.432534637834188e-06, + "loss": 1.5438, + "mean_token_accuracy": 0.6416826993227005, + "num_tokens": 2344466156.0, + "step": 13975 + }, + { + "entropy": 1.6926537454128265, + "epoch": 1.535332729120321, + "grad_norm": 0.673001766204834, + "learning_rate": 4.431439973410183e-06, + "loss": 1.5398, + "mean_token_accuracy": 0.6400438646475474, + "num_tokens": 2344663909.0, + "step": 13976 + }, + { + "entropy": 1.6792364219824474, + "epoch": 1.5354425860316936, + "grad_norm": 0.6953542828559875, + "learning_rate": 4.430345516878147e-06, + "loss": 1.4084, + "mean_token_accuracy": 0.6717520505189896, + "num_tokens": 2344803361.0, + "step": 13977 + }, + { + "entropy": 1.6765713791052501, + "epoch": 1.5355524429430667, + "grad_norm": 0.6225120425224304, + "learning_rate": 4.4292512682727115e-06, + "loss": 1.3553, + "mean_token_accuracy": 0.6637383997440338, + "num_tokens": 2345013367.0, + "step": 13978 + }, + { + "entropy": 1.7007201512654622, + "epoch": 1.5356622998544396, + "grad_norm": 0.7577117085456848, + "learning_rate": 4.428157227628511e-06, + "loss": 1.4322, + "mean_token_accuracy": 0.6638199587663015, + "num_tokens": 2345180653.0, + "step": 13979 + }, + { + "entropy": 1.6954215864340465, + "epoch": 1.5357721567658125, + "grad_norm": 0.6490945816040039, + "learning_rate": 4.427063394980177e-06, + "loss": 1.3916, + "mean_token_accuracy": 0.6532481213410696, + "num_tokens": 2345343665.0, + "step": 13980 + }, + { + "entropy": 1.6678927838802338, + "epoch": 1.5358820136771856, + "grad_norm": 0.6423428058624268, + "learning_rate": 4.425969770362323e-06, + "loss": 1.2533, + "mean_token_accuracy": 0.6784861932198206, + "num_tokens": 2345473755.0, + "step": 13981 + }, + { + "entropy": 1.6874541540940602, + "epoch": 1.5359918705885582, + "grad_norm": 0.6954723000526428, + "learning_rate": 4.424876353809563e-06, + "loss": 1.3609, + "mean_token_accuracy": 0.6523398011922836, + "num_tokens": 2345647433.0, + "step": 13982 + }, + { + "entropy": 1.7374683419863384, + "epoch": 1.5361017274999313, + "grad_norm": 0.6964279413223267, + "learning_rate": 4.4237831453565035e-06, + "loss": 1.3207, + "mean_token_accuracy": 0.6694158862034479, + "num_tokens": 2345753042.0, + "step": 13983 + }, + { + "entropy": 1.7551298042138417, + "epoch": 1.5362115844113042, + "grad_norm": 0.6724309921264648, + "learning_rate": 4.422690145037743e-06, + "loss": 1.3518, + "mean_token_accuracy": 0.6594842871030172, + "num_tokens": 2345889659.0, + "step": 13984 + }, + { + "entropy": 1.7073955833911896, + "epoch": 1.5363214413226771, + "grad_norm": 0.7073147296905518, + "learning_rate": 4.421597352887879e-06, + "loss": 1.4573, + "mean_token_accuracy": 0.6550180613994598, + "num_tokens": 2346072026.0, + "step": 13985 + }, + { + "entropy": 1.6307064195473988, + "epoch": 1.5364312982340502, + "grad_norm": 0.6468693017959595, + "learning_rate": 4.420504768941493e-06, + "loss": 1.4167, + "mean_token_accuracy": 0.6737861136595408, + "num_tokens": 2346229976.0, + "step": 13986 + }, + { + "entropy": 1.7437759339809418, + "epoch": 1.536541155145423, + "grad_norm": 0.7528170943260193, + "learning_rate": 4.419412393233164e-06, + "loss": 1.4906, + "mean_token_accuracy": 0.6429897795120875, + "num_tokens": 2346443997.0, + "step": 13987 + }, + { + "entropy": 1.6810493369897206, + "epoch": 1.536651012056796, + "grad_norm": 0.6314464807510376, + "learning_rate": 4.4183202257974685e-06, + "loss": 1.4204, + "mean_token_accuracy": 0.6624763359626135, + "num_tokens": 2346619458.0, + "step": 13988 + }, + { + "entropy": 1.7033787270387013, + "epoch": 1.536760868968169, + "grad_norm": 0.742909848690033, + "learning_rate": 4.417228266668976e-06, + "loss": 1.3033, + "mean_token_accuracy": 0.6721209386984507, + "num_tokens": 2346763776.0, + "step": 13989 + }, + { + "entropy": 1.7384433150291443, + "epoch": 1.5368707258795418, + "grad_norm": 0.688539445400238, + "learning_rate": 4.4161365158822386e-06, + "loss": 1.3789, + "mean_token_accuracy": 0.6550500591595968, + "num_tokens": 2346925583.0, + "step": 13990 + }, + { + "entropy": 1.7031288743019104, + "epoch": 1.5369805827909149, + "grad_norm": 0.7073594331741333, + "learning_rate": 4.415044973471812e-06, + "loss": 1.3824, + "mean_token_accuracy": 0.6627199401458105, + "num_tokens": 2347097785.0, + "step": 13991 + }, + { + "entropy": 1.674454540014267, + "epoch": 1.5370904397022878, + "grad_norm": 0.889224648475647, + "learning_rate": 4.413953639472249e-06, + "loss": 1.2342, + "mean_token_accuracy": 0.6825215369462967, + "num_tokens": 2347220132.0, + "step": 13992 + }, + { + "entropy": 1.6853329439957936, + "epoch": 1.5372002966136606, + "grad_norm": 0.6673226952552795, + "learning_rate": 4.412862513918085e-06, + "loss": 1.301, + "mean_token_accuracy": 0.6678221076726913, + "num_tokens": 2347354586.0, + "step": 13993 + }, + { + "entropy": 1.6549971401691437, + "epoch": 1.5373101535250338, + "grad_norm": 0.6486871242523193, + "learning_rate": 4.411771596843852e-06, + "loss": 1.4182, + "mean_token_accuracy": 0.6577600389719009, + "num_tokens": 2347518232.0, + "step": 13994 + }, + { + "entropy": 1.6686547100543976, + "epoch": 1.5374200104364064, + "grad_norm": 0.7365626096725464, + "learning_rate": 4.410680888284081e-06, + "loss": 1.428, + "mean_token_accuracy": 0.6594243546326956, + "num_tokens": 2347704419.0, + "step": 13995 + }, + { + "entropy": 1.697216699520747, + "epoch": 1.5375298673477795, + "grad_norm": 0.6379204392433167, + "learning_rate": 4.409590388273288e-06, + "loss": 1.4551, + "mean_token_accuracy": 0.6572363177935282, + "num_tokens": 2347903688.0, + "step": 13996 + }, + { + "entropy": 1.7163971066474915, + "epoch": 1.5376397242591524, + "grad_norm": 0.7252550721168518, + "learning_rate": 4.4085000968459925e-06, + "loss": 1.4578, + "mean_token_accuracy": 0.6468437065680822, + "num_tokens": 2348059580.0, + "step": 13997 + }, + { + "entropy": 1.6949025789896648, + "epoch": 1.5377495811705253, + "grad_norm": 0.7134789824485779, + "learning_rate": 4.407410014036699e-06, + "loss": 1.4531, + "mean_token_accuracy": 0.6462388386329015, + "num_tokens": 2348228716.0, + "step": 13998 + }, + { + "entropy": 1.705321490764618, + "epoch": 1.5378594380818984, + "grad_norm": 0.8609808087348938, + "learning_rate": 4.406320139879906e-06, + "loss": 1.4904, + "mean_token_accuracy": 0.6510612418254217, + "num_tokens": 2348400900.0, + "step": 13999 + }, + { + "entropy": 1.7144020994504292, + "epoch": 1.5379692949932713, + "grad_norm": 0.7137477993965149, + "learning_rate": 4.405230474410108e-06, + "loss": 1.4114, + "mean_token_accuracy": 0.6484651267528534, + "num_tokens": 2348613516.0, + "step": 14000 + }, + { + "entropy": 1.7028910517692566, + "epoch": 1.5380791519046442, + "grad_norm": 0.562455415725708, + "learning_rate": 4.4041410176618e-06, + "loss": 1.417, + "mean_token_accuracy": 0.6450492938359579, + "num_tokens": 2348799990.0, + "step": 14001 + }, + { + "entropy": 1.688763548930486, + "epoch": 1.5381890088160173, + "grad_norm": 0.7308263182640076, + "learning_rate": 4.403051769669451e-06, + "loss": 1.2927, + "mean_token_accuracy": 0.6648479749759039, + "num_tokens": 2348918497.0, + "step": 14002 + }, + { + "entropy": 1.738109012444814, + "epoch": 1.53829886572739, + "grad_norm": 0.6443284153938293, + "learning_rate": 4.40196273046754e-06, + "loss": 1.3536, + "mean_token_accuracy": 0.6667558401823044, + "num_tokens": 2349071410.0, + "step": 14003 + }, + { + "entropy": 1.702557345231374, + "epoch": 1.538408722638763, + "grad_norm": 0.8023272156715393, + "learning_rate": 4.40087390009054e-06, + "loss": 1.3405, + "mean_token_accuracy": 0.676914319396019, + "num_tokens": 2349234865.0, + "step": 14004 + }, + { + "entropy": 1.711880385875702, + "epoch": 1.538518579550136, + "grad_norm": 0.80071622133255, + "learning_rate": 4.399785278572906e-06, + "loss": 1.336, + "mean_token_accuracy": 0.6736197620630264, + "num_tokens": 2349415753.0, + "step": 14005 + }, + { + "entropy": 1.687764436006546, + "epoch": 1.5386284364615088, + "grad_norm": 0.7393195033073425, + "learning_rate": 4.39869686594909e-06, + "loss": 1.3066, + "mean_token_accuracy": 0.6780485212802887, + "num_tokens": 2349536443.0, + "step": 14006 + }, + { + "entropy": 1.6960475146770477, + "epoch": 1.538738293372882, + "grad_norm": 0.7331027984619141, + "learning_rate": 4.397608662253548e-06, + "loss": 1.2219, + "mean_token_accuracy": 0.6794477055470148, + "num_tokens": 2349691007.0, + "step": 14007 + }, + { + "entropy": 1.753263344367345, + "epoch": 1.5388481502842548, + "grad_norm": 0.6345871686935425, + "learning_rate": 4.396520667520714e-06, + "loss": 1.4942, + "mean_token_accuracy": 0.639187882343928, + "num_tokens": 2349883106.0, + "step": 14008 + }, + { + "entropy": 1.7145345509052277, + "epoch": 1.5389580071956277, + "grad_norm": 0.6685234308242798, + "learning_rate": 4.395432881785028e-06, + "loss": 1.438, + "mean_token_accuracy": 0.6546867787837982, + "num_tokens": 2350060890.0, + "step": 14009 + }, + { + "entropy": 1.7457486589749653, + "epoch": 1.5390678641070006, + "grad_norm": 0.6667968034744263, + "learning_rate": 4.3943453050809144e-06, + "loss": 1.4756, + "mean_token_accuracy": 0.6380013773838679, + "num_tokens": 2350245230.0, + "step": 14010 + }, + { + "entropy": 1.6892044444878895, + "epoch": 1.5391777210183735, + "grad_norm": 0.6611399054527283, + "learning_rate": 4.393257937442793e-06, + "loss": 1.2343, + "mean_token_accuracy": 0.6810240397850672, + "num_tokens": 2350353525.0, + "step": 14011 + }, + { + "entropy": 1.7096824645996094, + "epoch": 1.5392875779297466, + "grad_norm": 0.6070430278778076, + "learning_rate": 4.392170778905081e-06, + "loss": 1.3684, + "mean_token_accuracy": 0.6506505062182745, + "num_tokens": 2350592197.0, + "step": 14012 + }, + { + "entropy": 1.6973026096820831, + "epoch": 1.5393974348411195, + "grad_norm": 0.7121232748031616, + "learning_rate": 4.3910838295021905e-06, + "loss": 1.25, + "mean_token_accuracy": 0.6867650945981344, + "num_tokens": 2350738599.0, + "step": 14013 + }, + { + "entropy": 1.794579843680064, + "epoch": 1.5395072917524923, + "grad_norm": 0.7066651582717896, + "learning_rate": 4.389997089268516e-06, + "loss": 1.3076, + "mean_token_accuracy": 0.6670825928449631, + "num_tokens": 2350881742.0, + "step": 14014 + }, + { + "entropy": 1.6792183915774028, + "epoch": 1.5396171486638655, + "grad_norm": 0.6690120697021484, + "learning_rate": 4.3889105582384525e-06, + "loss": 1.3286, + "mean_token_accuracy": 0.6693990727265676, + "num_tokens": 2351073624.0, + "step": 14015 + }, + { + "entropy": 1.7471397519111633, + "epoch": 1.5397270055752381, + "grad_norm": 0.6697660684585571, + "learning_rate": 4.387824236446395e-06, + "loss": 1.5972, + "mean_token_accuracy": 0.616848016778628, + "num_tokens": 2351288596.0, + "step": 14016 + }, + { + "entropy": 1.7265910804271698, + "epoch": 1.5398368624866112, + "grad_norm": 0.6318051218986511, + "learning_rate": 4.38673812392672e-06, + "loss": 1.3871, + "mean_token_accuracy": 0.6530221005280813, + "num_tokens": 2351452744.0, + "step": 14017 + }, + { + "entropy": 1.6270829439163208, + "epoch": 1.539946719397984, + "grad_norm": 0.7931689023971558, + "learning_rate": 4.385652220713801e-06, + "loss": 1.4203, + "mean_token_accuracy": 0.6615447551012039, + "num_tokens": 2351609983.0, + "step": 14018 + }, + { + "entropy": 1.6672570705413818, + "epoch": 1.540056576309357, + "grad_norm": 0.7277114391326904, + "learning_rate": 4.384566526842011e-06, + "loss": 1.3248, + "mean_token_accuracy": 0.6881892184416453, + "num_tokens": 2351740970.0, + "step": 14019 + }, + { + "entropy": 1.6719843447208405, + "epoch": 1.54016643322073, + "grad_norm": 0.6636250615119934, + "learning_rate": 4.383481042345707e-06, + "loss": 1.334, + "mean_token_accuracy": 0.669186050693194, + "num_tokens": 2351887213.0, + "step": 14020 + }, + { + "entropy": 1.7347903450330098, + "epoch": 1.540276290132103, + "grad_norm": 0.7051345109939575, + "learning_rate": 4.382395767259252e-06, + "loss": 1.3205, + "mean_token_accuracy": 0.6803038567304611, + "num_tokens": 2352004799.0, + "step": 14021 + }, + { + "entropy": 1.658490777015686, + "epoch": 1.5403861470434759, + "grad_norm": 0.7130881547927856, + "learning_rate": 4.381310701616985e-06, + "loss": 1.368, + "mean_token_accuracy": 0.6594650596380234, + "num_tokens": 2352206971.0, + "step": 14022 + }, + { + "entropy": 1.7292596499125164, + "epoch": 1.5404960039548488, + "grad_norm": 0.6630394458770752, + "learning_rate": 4.3802258454532495e-06, + "loss": 1.3735, + "mean_token_accuracy": 0.6567981640497843, + "num_tokens": 2352400744.0, + "step": 14023 + }, + { + "entropy": 1.7222495377063751, + "epoch": 1.5406058608662216, + "grad_norm": 0.6828321814537048, + "learning_rate": 4.379141198802388e-06, + "loss": 1.4488, + "mean_token_accuracy": 0.6561195055643717, + "num_tokens": 2352562667.0, + "step": 14024 + }, + { + "entropy": 1.646317849556605, + "epoch": 1.5407157177775948, + "grad_norm": 0.7620025277137756, + "learning_rate": 4.378056761698722e-06, + "loss": 1.3757, + "mean_token_accuracy": 0.6582992623249689, + "num_tokens": 2352756006.0, + "step": 14025 + }, + { + "entropy": 1.6566996177037556, + "epoch": 1.5408255746889676, + "grad_norm": 0.6729485988616943, + "learning_rate": 4.3769725341765745e-06, + "loss": 1.4421, + "mean_token_accuracy": 0.6499632894992828, + "num_tokens": 2352952033.0, + "step": 14026 + }, + { + "entropy": 1.7033120195070903, + "epoch": 1.5409354316003405, + "grad_norm": 0.6999285817146301, + "learning_rate": 4.375888516270264e-06, + "loss": 1.3622, + "mean_token_accuracy": 0.6672971496979395, + "num_tokens": 2353094627.0, + "step": 14027 + }, + { + "entropy": 1.7603072027365367, + "epoch": 1.5410452885117136, + "grad_norm": 0.7112701535224915, + "learning_rate": 4.3748047080140935e-06, + "loss": 1.4744, + "mean_token_accuracy": 0.6439538995424906, + "num_tokens": 2353238406.0, + "step": 14028 + }, + { + "entropy": 1.691114326318105, + "epoch": 1.5411551454230863, + "grad_norm": 0.6865422129631042, + "learning_rate": 4.373721109442373e-06, + "loss": 1.3223, + "mean_token_accuracy": 0.6618143618106842, + "num_tokens": 2353364805.0, + "step": 14029 + }, + { + "entropy": 1.7279209593931835, + "epoch": 1.5412650023344594, + "grad_norm": 0.6755762100219727, + "learning_rate": 4.3726377205893925e-06, + "loss": 1.6106, + "mean_token_accuracy": 0.6444238399465879, + "num_tokens": 2353566096.0, + "step": 14030 + }, + { + "entropy": 1.7265671888987224, + "epoch": 1.5413748592458323, + "grad_norm": 0.7081911563873291, + "learning_rate": 4.371554541489439e-06, + "loss": 1.4627, + "mean_token_accuracy": 0.658619354168574, + "num_tokens": 2353736452.0, + "step": 14031 + }, + { + "entropy": 1.679870496193568, + "epoch": 1.5414847161572052, + "grad_norm": 0.6351853013038635, + "learning_rate": 4.370471572176797e-06, + "loss": 1.4357, + "mean_token_accuracy": 0.651072566707929, + "num_tokens": 2353911720.0, + "step": 14032 + }, + { + "entropy": 1.72749329606692, + "epoch": 1.5415945730685783, + "grad_norm": 0.7487029433250427, + "learning_rate": 4.369388812685748e-06, + "loss": 1.3973, + "mean_token_accuracy": 0.6484367648760477, + "num_tokens": 2354086543.0, + "step": 14033 + }, + { + "entropy": 1.662652164697647, + "epoch": 1.5417044299799512, + "grad_norm": 0.7096850872039795, + "learning_rate": 4.3683062630505515e-06, + "loss": 1.4054, + "mean_token_accuracy": 0.6659991989533106, + "num_tokens": 2354250773.0, + "step": 14034 + }, + { + "entropy": 1.680237591266632, + "epoch": 1.541814286891324, + "grad_norm": 0.8764726519584656, + "learning_rate": 4.367223923305471e-06, + "loss": 1.4226, + "mean_token_accuracy": 0.6559638977050781, + "num_tokens": 2354385839.0, + "step": 14035 + }, + { + "entropy": 1.6535289386908214, + "epoch": 1.541924143802697, + "grad_norm": 0.651672899723053, + "learning_rate": 4.366141793484769e-06, + "loss": 1.2837, + "mean_token_accuracy": 0.6699432631333669, + "num_tokens": 2354550541.0, + "step": 14036 + }, + { + "entropy": 1.706730951865514, + "epoch": 1.5420340007140698, + "grad_norm": 0.7192301154136658, + "learning_rate": 4.365059873622689e-06, + "loss": 1.2958, + "mean_token_accuracy": 0.6683982561031977, + "num_tokens": 2354690354.0, + "step": 14037 + }, + { + "entropy": 1.7195076942443848, + "epoch": 1.542143857625443, + "grad_norm": 0.6625783443450928, + "learning_rate": 4.363978163753472e-06, + "loss": 1.5496, + "mean_token_accuracy": 0.6505264093478521, + "num_tokens": 2354936587.0, + "step": 14038 + }, + { + "entropy": 1.7424062093098958, + "epoch": 1.5422537145368158, + "grad_norm": 0.7096309661865234, + "learning_rate": 4.362896663911359e-06, + "loss": 1.4185, + "mean_token_accuracy": 0.6444617807865143, + "num_tokens": 2355169144.0, + "step": 14039 + }, + { + "entropy": 1.6861979564030964, + "epoch": 1.5423635714481887, + "grad_norm": 0.6280454397201538, + "learning_rate": 4.361815374130572e-06, + "loss": 1.3581, + "mean_token_accuracy": 0.6629171371459961, + "num_tokens": 2355322642.0, + "step": 14040 + }, + { + "entropy": 1.7170844674110413, + "epoch": 1.5424734283595618, + "grad_norm": 0.6770321726799011, + "learning_rate": 4.360734294445341e-06, + "loss": 1.4312, + "mean_token_accuracy": 0.650084396203359, + "num_tokens": 2355507572.0, + "step": 14041 + }, + { + "entropy": 1.688788741827011, + "epoch": 1.5425832852709345, + "grad_norm": 0.8258582949638367, + "learning_rate": 4.359653424889877e-06, + "loss": 1.3963, + "mean_token_accuracy": 0.6617969572544098, + "num_tokens": 2355672193.0, + "step": 14042 + }, + { + "entropy": 1.6985827187697093, + "epoch": 1.5426931421823076, + "grad_norm": 0.6929754614830017, + "learning_rate": 4.358572765498388e-06, + "loss": 1.3762, + "mean_token_accuracy": 0.6613185753424963, + "num_tokens": 2355849560.0, + "step": 14043 + }, + { + "entropy": 1.7190652589003246, + "epoch": 1.5428029990936805, + "grad_norm": 0.6878488659858704, + "learning_rate": 4.357492316305078e-06, + "loss": 1.3982, + "mean_token_accuracy": 0.659120962023735, + "num_tokens": 2356025745.0, + "step": 14044 + }, + { + "entropy": 1.638331522544225, + "epoch": 1.5429128560050533, + "grad_norm": 0.5437892079353333, + "learning_rate": 4.356412077344148e-06, + "loss": 1.4459, + "mean_token_accuracy": 0.6453542610009512, + "num_tokens": 2356249597.0, + "step": 14045 + }, + { + "entropy": 1.6433692872524261, + "epoch": 1.5430227129164265, + "grad_norm": 0.7393973469734192, + "learning_rate": 4.355332048649777e-06, + "loss": 1.3323, + "mean_token_accuracy": 0.6734596192836761, + "num_tokens": 2356435811.0, + "step": 14046 + }, + { + "entropy": 1.6932270030180614, + "epoch": 1.5431325698277993, + "grad_norm": 0.7015334367752075, + "learning_rate": 4.354252230256152e-06, + "loss": 1.5119, + "mean_token_accuracy": 0.6505384395519892, + "num_tokens": 2356616344.0, + "step": 14047 + }, + { + "entropy": 1.722126583258311, + "epoch": 1.5432424267391722, + "grad_norm": 0.8697376251220703, + "learning_rate": 4.353172622197453e-06, + "loss": 1.593, + "mean_token_accuracy": 0.6482644279797872, + "num_tokens": 2356757894.0, + "step": 14048 + }, + { + "entropy": 1.7142964998881023, + "epoch": 1.5433522836505453, + "grad_norm": 0.8031371235847473, + "learning_rate": 4.352093224507844e-06, + "loss": 1.4184, + "mean_token_accuracy": 0.6482335776090622, + "num_tokens": 2356905299.0, + "step": 14049 + }, + { + "entropy": 1.7244684199492137, + "epoch": 1.543462140561918, + "grad_norm": 0.8741907477378845, + "learning_rate": 4.351014037221487e-06, + "loss": 1.163, + "mean_token_accuracy": 0.687325323621432, + "num_tokens": 2357051562.0, + "step": 14050 + }, + { + "entropy": 1.733912189801534, + "epoch": 1.543571997473291, + "grad_norm": 0.7576857209205627, + "learning_rate": 4.349935060372542e-06, + "loss": 1.4504, + "mean_token_accuracy": 0.6613880942265192, + "num_tokens": 2357268692.0, + "step": 14051 + }, + { + "entropy": 1.711068868637085, + "epoch": 1.543681854384664, + "grad_norm": 0.8415870666503906, + "learning_rate": 4.348856293995154e-06, + "loss": 1.3542, + "mean_token_accuracy": 0.6686215748389562, + "num_tokens": 2357397384.0, + "step": 14052 + }, + { + "entropy": 1.7130617996056874, + "epoch": 1.5437917112960369, + "grad_norm": 0.6683608889579773, + "learning_rate": 4.347777738123469e-06, + "loss": 1.4191, + "mean_token_accuracy": 0.6496846874554952, + "num_tokens": 2357561125.0, + "step": 14053 + }, + { + "entropy": 1.6761254767576854, + "epoch": 1.54390156820741, + "grad_norm": 0.7907574772834778, + "learning_rate": 4.3466993927916215e-06, + "loss": 1.5052, + "mean_token_accuracy": 0.6459088623523712, + "num_tokens": 2357745122.0, + "step": 14054 + }, + { + "entropy": 1.6257309913635254, + "epoch": 1.5440114251187826, + "grad_norm": 0.6445624828338623, + "learning_rate": 4.345621258033737e-06, + "loss": 1.421, + "mean_token_accuracy": 0.6715798825025558, + "num_tokens": 2357927515.0, + "step": 14055 + }, + { + "entropy": 1.6763292849063873, + "epoch": 1.5441212820301558, + "grad_norm": 0.6625300645828247, + "learning_rate": 4.344543333883941e-06, + "loss": 1.3599, + "mean_token_accuracy": 0.6491398314634959, + "num_tokens": 2358124048.0, + "step": 14056 + }, + { + "entropy": 1.7738927900791168, + "epoch": 1.5442311389415286, + "grad_norm": 0.7023770213127136, + "learning_rate": 4.343465620376355e-06, + "loss": 1.3112, + "mean_token_accuracy": 0.6682771146297455, + "num_tokens": 2358256450.0, + "step": 14057 + }, + { + "entropy": 1.685390333334605, + "epoch": 1.5443409958529015, + "grad_norm": 0.7113889455795288, + "learning_rate": 4.342388117545078e-06, + "loss": 1.3734, + "mean_token_accuracy": 0.6664382467667261, + "num_tokens": 2358391277.0, + "step": 14058 + }, + { + "entropy": 1.6166270176569622, + "epoch": 1.5444508527642746, + "grad_norm": 0.6758726835250854, + "learning_rate": 4.341310825424215e-06, + "loss": 1.3477, + "mean_token_accuracy": 0.6600983242193857, + "num_tokens": 2358561825.0, + "step": 14059 + }, + { + "entropy": 1.6522872944672902, + "epoch": 1.5445607096756475, + "grad_norm": 0.596356213092804, + "learning_rate": 4.340233744047868e-06, + "loss": 1.4209, + "mean_token_accuracy": 0.6476166248321533, + "num_tokens": 2358778206.0, + "step": 14060 + }, + { + "entropy": 1.724749763806661, + "epoch": 1.5446705665870204, + "grad_norm": 0.6655259728431702, + "learning_rate": 4.339156873450122e-06, + "loss": 1.3939, + "mean_token_accuracy": 0.6588339308897654, + "num_tokens": 2358936328.0, + "step": 14061 + }, + { + "entropy": 1.7044113278388977, + "epoch": 1.5447804234983935, + "grad_norm": 0.7894936203956604, + "learning_rate": 4.338080213665058e-06, + "loss": 1.362, + "mean_token_accuracy": 0.6574911077817281, + "num_tokens": 2359127677.0, + "step": 14062 + }, + { + "entropy": 1.6766592065493267, + "epoch": 1.5448902804097662, + "grad_norm": 0.7808144688606262, + "learning_rate": 4.337003764726754e-06, + "loss": 1.4714, + "mean_token_accuracy": 0.6505367159843445, + "num_tokens": 2359318644.0, + "step": 14063 + }, + { + "entropy": 1.6718900700410206, + "epoch": 1.5450001373211393, + "grad_norm": 0.7390754222869873, + "learning_rate": 4.335927526669277e-06, + "loss": 1.2496, + "mean_token_accuracy": 0.6705302894115448, + "num_tokens": 2359475453.0, + "step": 14064 + }, + { + "entropy": 1.6547542810440063, + "epoch": 1.5451099942325122, + "grad_norm": 0.716488242149353, + "learning_rate": 4.334851499526693e-06, + "loss": 1.3649, + "mean_token_accuracy": 0.65887650847435, + "num_tokens": 2359637946.0, + "step": 14065 + }, + { + "entropy": 1.6610281368096669, + "epoch": 1.545219851143885, + "grad_norm": 0.636939525604248, + "learning_rate": 4.333775683333056e-06, + "loss": 1.2319, + "mean_token_accuracy": 0.6830330838759741, + "num_tokens": 2359770202.0, + "step": 14066 + }, + { + "entropy": 1.6054012378056843, + "epoch": 1.5453297080552582, + "grad_norm": 0.6318295001983643, + "learning_rate": 4.332700078122411e-06, + "loss": 1.3044, + "mean_token_accuracy": 0.6684317042430242, + "num_tokens": 2359979968.0, + "step": 14067 + }, + { + "entropy": 1.734905183315277, + "epoch": 1.5454395649666308, + "grad_norm": 0.5726717114448547, + "learning_rate": 4.3316246839288055e-06, + "loss": 1.5647, + "mean_token_accuracy": 0.6327670514583588, + "num_tokens": 2360193178.0, + "step": 14068 + }, + { + "entropy": 1.7301185925801594, + "epoch": 1.545549421878004, + "grad_norm": 0.7241100668907166, + "learning_rate": 4.330549500786279e-06, + "loss": 1.5143, + "mean_token_accuracy": 0.6339599887530009, + "num_tokens": 2360414280.0, + "step": 14069 + }, + { + "entropy": 1.6495436231295268, + "epoch": 1.5456592787893768, + "grad_norm": 0.6548447012901306, + "learning_rate": 4.329474528728851e-06, + "loss": 1.3183, + "mean_token_accuracy": 0.6611727774143219, + "num_tokens": 2360550978.0, + "step": 14070 + }, + { + "entropy": 1.733685662349065, + "epoch": 1.5457691357007497, + "grad_norm": 0.7285884618759155, + "learning_rate": 4.328399767790546e-06, + "loss": 1.448, + "mean_token_accuracy": 0.6412131836016973, + "num_tokens": 2360730661.0, + "step": 14071 + }, + { + "entropy": 1.7295173903306325, + "epoch": 1.5458789926121228, + "grad_norm": 0.7899370789527893, + "learning_rate": 4.327325218005386e-06, + "loss": 1.3316, + "mean_token_accuracy": 0.6687415341536204, + "num_tokens": 2360851373.0, + "step": 14072 + }, + { + "entropy": 1.6831912994384766, + "epoch": 1.5459888495234957, + "grad_norm": 0.658294677734375, + "learning_rate": 4.326250879407377e-06, + "loss": 1.249, + "mean_token_accuracy": 0.6833125005165736, + "num_tokens": 2360979800.0, + "step": 14073 + }, + { + "entropy": 1.654470185438792, + "epoch": 1.5460987064348686, + "grad_norm": 0.6260450482368469, + "learning_rate": 4.325176752030516e-06, + "loss": 1.3658, + "mean_token_accuracy": 0.6646214425563812, + "num_tokens": 2361129203.0, + "step": 14074 + }, + { + "entropy": 1.6546796262264252, + "epoch": 1.5462085633462417, + "grad_norm": 0.6665990948677063, + "learning_rate": 4.324102835908807e-06, + "loss": 1.3189, + "mean_token_accuracy": 0.6715873231490453, + "num_tokens": 2361304020.0, + "step": 14075 + }, + { + "entropy": 1.684925526380539, + "epoch": 1.5463184202576143, + "grad_norm": 0.7178765535354614, + "learning_rate": 4.323029131076232e-06, + "loss": 1.3917, + "mean_token_accuracy": 0.654693936308225, + "num_tokens": 2361468876.0, + "step": 14076 + }, + { + "entropy": 1.7206127643585205, + "epoch": 1.5464282771689875, + "grad_norm": 0.7334764003753662, + "learning_rate": 4.321955637566779e-06, + "loss": 1.4133, + "mean_token_accuracy": 0.6604679723580679, + "num_tokens": 2361644981.0, + "step": 14077 + }, + { + "entropy": 1.7566125492254894, + "epoch": 1.5465381340803603, + "grad_norm": 0.9064491391181946, + "learning_rate": 4.320882355414421e-06, + "loss": 1.3328, + "mean_token_accuracy": 0.6624845316012701, + "num_tokens": 2361768215.0, + "step": 14078 + }, + { + "entropy": 1.6920438210169475, + "epoch": 1.5466479909917332, + "grad_norm": 0.8833717107772827, + "learning_rate": 4.319809284653123e-06, + "loss": 1.3987, + "mean_token_accuracy": 0.6700094143549601, + "num_tokens": 2361915240.0, + "step": 14079 + }, + { + "entropy": 1.6645349264144897, + "epoch": 1.5467578479031063, + "grad_norm": 0.5727179646492004, + "learning_rate": 4.318736425316855e-06, + "loss": 1.528, + "mean_token_accuracy": 0.6394601066907247, + "num_tokens": 2362137844.0, + "step": 14080 + }, + { + "entropy": 1.7052935063838959, + "epoch": 1.546867704814479, + "grad_norm": 0.7055914402008057, + "learning_rate": 4.317663777439567e-06, + "loss": 1.3783, + "mean_token_accuracy": 0.6631141553322474, + "num_tokens": 2362348283.0, + "step": 14081 + }, + { + "entropy": 1.6607483228047688, + "epoch": 1.546977561725852, + "grad_norm": 0.6008187532424927, + "learning_rate": 4.316591341055208e-06, + "loss": 1.3967, + "mean_token_accuracy": 0.6505735764900843, + "num_tokens": 2362519677.0, + "step": 14082 + }, + { + "entropy": 1.7044240633646648, + "epoch": 1.547087418637225, + "grad_norm": 0.6458705067634583, + "learning_rate": 4.315519116197724e-06, + "loss": 1.4219, + "mean_token_accuracy": 0.6473012765248617, + "num_tokens": 2362691629.0, + "step": 14083 + }, + { + "entropy": 1.7407875955104828, + "epoch": 1.5471972755485979, + "grad_norm": 0.7819077372550964, + "learning_rate": 4.314447102901045e-06, + "loss": 1.2604, + "mean_token_accuracy": 0.687695175409317, + "num_tokens": 2362816475.0, + "step": 14084 + }, + { + "entropy": 1.6973053614298503, + "epoch": 1.547307132459971, + "grad_norm": 0.6520508527755737, + "learning_rate": 4.3133753011991046e-06, + "loss": 1.3195, + "mean_token_accuracy": 0.6731201509634653, + "num_tokens": 2362972835.0, + "step": 14085 + }, + { + "entropy": 1.732239951690038, + "epoch": 1.5474169893713439, + "grad_norm": 0.8456202149391174, + "learning_rate": 4.312303711125824e-06, + "loss": 1.278, + "mean_token_accuracy": 0.666911373535792, + "num_tokens": 2363081271.0, + "step": 14086 + }, + { + "entropy": 1.644601583480835, + "epoch": 1.5475268462827168, + "grad_norm": 0.8071454763412476, + "learning_rate": 4.311232332715114e-06, + "loss": 1.4276, + "mean_token_accuracy": 0.659256507953008, + "num_tokens": 2363264153.0, + "step": 14087 + }, + { + "entropy": 1.6631284455458324, + "epoch": 1.5476367031940899, + "grad_norm": 0.6700156927108765, + "learning_rate": 4.310161166000887e-06, + "loss": 1.2801, + "mean_token_accuracy": 0.6774795204401016, + "num_tokens": 2363392436.0, + "step": 14088 + }, + { + "entropy": 1.7066160937150319, + "epoch": 1.5477465601054625, + "grad_norm": 0.6427406072616577, + "learning_rate": 4.309090211017049e-06, + "loss": 1.3209, + "mean_token_accuracy": 0.6586870650450388, + "num_tokens": 2363528126.0, + "step": 14089 + }, + { + "entropy": 1.799843966960907, + "epoch": 1.5478564170168356, + "grad_norm": 0.7809091210365295, + "learning_rate": 4.308019467797487e-06, + "loss": 1.4235, + "mean_token_accuracy": 0.6456949164470037, + "num_tokens": 2363671865.0, + "step": 14090 + }, + { + "entropy": 1.6500937740008037, + "epoch": 1.5479662739282085, + "grad_norm": 0.783412516117096, + "learning_rate": 4.306948936376093e-06, + "loss": 1.4475, + "mean_token_accuracy": 0.6537269403537115, + "num_tokens": 2363846696.0, + "step": 14091 + }, + { + "entropy": 1.7201881210009258, + "epoch": 1.5480761308395814, + "grad_norm": 0.6507890224456787, + "learning_rate": 4.3058786167867505e-06, + "loss": 1.349, + "mean_token_accuracy": 0.6581819206476212, + "num_tokens": 2364020827.0, + "step": 14092 + }, + { + "entropy": 1.702020267645518, + "epoch": 1.5481859877509545, + "grad_norm": 0.6586436033248901, + "learning_rate": 4.304808509063335e-06, + "loss": 1.4886, + "mean_token_accuracy": 0.6433099905649821, + "num_tokens": 2364256888.0, + "step": 14093 + }, + { + "entropy": 1.666669249534607, + "epoch": 1.5482958446623272, + "grad_norm": 0.7112491130828857, + "learning_rate": 4.30373861323971e-06, + "loss": 1.3181, + "mean_token_accuracy": 0.6703788836797079, + "num_tokens": 2364409964.0, + "step": 14094 + }, + { + "entropy": 1.702845573425293, + "epoch": 1.5484057015737003, + "grad_norm": 0.5408870577812195, + "learning_rate": 4.302668929349742e-06, + "loss": 1.4346, + "mean_token_accuracy": 0.6451524297396342, + "num_tokens": 2364604014.0, + "step": 14095 + }, + { + "entropy": 1.7024723092714946, + "epoch": 1.5485155584850732, + "grad_norm": 0.709581196308136, + "learning_rate": 4.301599457427284e-06, + "loss": 1.2413, + "mean_token_accuracy": 0.6726260830958685, + "num_tokens": 2364759857.0, + "step": 14096 + }, + { + "entropy": 1.6831977367401123, + "epoch": 1.548625415396446, + "grad_norm": 0.9029605388641357, + "learning_rate": 4.300530197506187e-06, + "loss": 1.4204, + "mean_token_accuracy": 0.6639485061168671, + "num_tokens": 2364941448.0, + "step": 14097 + }, + { + "entropy": 1.6596945226192474, + "epoch": 1.5487352723078192, + "grad_norm": 0.7375509142875671, + "learning_rate": 4.299461149620289e-06, + "loss": 1.2836, + "mean_token_accuracy": 0.6749891887108485, + "num_tokens": 2365057955.0, + "step": 14098 + }, + { + "entropy": 1.732172687848409, + "epoch": 1.548845129219192, + "grad_norm": 0.8708524107933044, + "learning_rate": 4.298392313803423e-06, + "loss": 1.6704, + "mean_token_accuracy": 0.6292888720830282, + "num_tokens": 2365272724.0, + "step": 14099 + }, + { + "entropy": 1.7026624778906505, + "epoch": 1.548954986130565, + "grad_norm": 0.6516265273094177, + "learning_rate": 4.297323690089423e-06, + "loss": 1.3864, + "mean_token_accuracy": 0.6629829307397207, + "num_tokens": 2365445475.0, + "step": 14100 + }, + { + "entropy": 1.757458617289861, + "epoch": 1.549064843041938, + "grad_norm": 0.6837176084518433, + "learning_rate": 4.296255278512112e-06, + "loss": 1.4276, + "mean_token_accuracy": 0.6513576706250509, + "num_tokens": 2365594575.0, + "step": 14101 + }, + { + "entropy": 1.6596699754397075, + "epoch": 1.5491746999533107, + "grad_norm": 0.7006385922431946, + "learning_rate": 4.295187079105296e-06, + "loss": 1.4615, + "mean_token_accuracy": 0.6513244410355886, + "num_tokens": 2365749946.0, + "step": 14102 + }, + { + "entropy": 1.7063644925753276, + "epoch": 1.5492845568646838, + "grad_norm": 0.712912917137146, + "learning_rate": 4.294119091902786e-06, + "loss": 1.4283, + "mean_token_accuracy": 0.6631234188874563, + "num_tokens": 2365907668.0, + "step": 14103 + }, + { + "entropy": 1.6990328629811604, + "epoch": 1.5493944137760567, + "grad_norm": 0.7050018310546875, + "learning_rate": 4.293051316938389e-06, + "loss": 1.2285, + "mean_token_accuracy": 0.6825551042954127, + "num_tokens": 2366041547.0, + "step": 14104 + }, + { + "entropy": 1.6146796643733978, + "epoch": 1.5495042706874296, + "grad_norm": 0.7505760192871094, + "learning_rate": 4.291983754245895e-06, + "loss": 1.3202, + "mean_token_accuracy": 0.6715732961893082, + "num_tokens": 2366209639.0, + "step": 14105 + }, + { + "entropy": 1.793668379386266, + "epoch": 1.5496141275988027, + "grad_norm": 0.7830557823181152, + "learning_rate": 4.2909164038590915e-06, + "loss": 1.3973, + "mean_token_accuracy": 0.6473236183325449, + "num_tokens": 2366333296.0, + "step": 14106 + }, + { + "entropy": 1.6847633024056752, + "epoch": 1.5497239845101753, + "grad_norm": 0.6471010446548462, + "learning_rate": 4.289849265811761e-06, + "loss": 1.3483, + "mean_token_accuracy": 0.6691886434952418, + "num_tokens": 2366478156.0, + "step": 14107 + }, + { + "entropy": 1.6979444523652394, + "epoch": 1.5498338414215485, + "grad_norm": 0.7506217360496521, + "learning_rate": 4.288782340137675e-06, + "loss": 1.3874, + "mean_token_accuracy": 0.6577440400918325, + "num_tokens": 2366636399.0, + "step": 14108 + }, + { + "entropy": 1.6835704545180004, + "epoch": 1.5499436983329213, + "grad_norm": 0.6772280931472778, + "learning_rate": 4.287715626870609e-06, + "loss": 1.318, + "mean_token_accuracy": 0.6703698684771856, + "num_tokens": 2366777293.0, + "step": 14109 + }, + { + "entropy": 1.716042975584666, + "epoch": 1.5500535552442942, + "grad_norm": 0.6948290467262268, + "learning_rate": 4.286649126044316e-06, + "loss": 1.5699, + "mean_token_accuracy": 0.6452557643254598, + "num_tokens": 2366963739.0, + "step": 14110 + }, + { + "entropy": 1.6994237899780273, + "epoch": 1.5501634121556673, + "grad_norm": 0.7720609307289124, + "learning_rate": 4.2855828376925515e-06, + "loss": 1.3042, + "mean_token_accuracy": 0.6619250476360321, + "num_tokens": 2367094397.0, + "step": 14111 + }, + { + "entropy": 1.7140393952528636, + "epoch": 1.5502732690670402, + "grad_norm": 0.6399521827697754, + "learning_rate": 4.2845167618490645e-06, + "loss": 1.4772, + "mean_token_accuracy": 0.6374075512091318, + "num_tokens": 2367332174.0, + "step": 14112 + }, + { + "entropy": 1.6940363347530365, + "epoch": 1.550383125978413, + "grad_norm": 0.723393976688385, + "learning_rate": 4.283450898547601e-06, + "loss": 1.3998, + "mean_token_accuracy": 0.6478741665681204, + "num_tokens": 2367479323.0, + "step": 14113 + }, + { + "entropy": 1.676634858051936, + "epoch": 1.5504929828897862, + "grad_norm": 0.724162220954895, + "learning_rate": 4.282385247821886e-06, + "loss": 1.234, + "mean_token_accuracy": 0.6788554986317953, + "num_tokens": 2367643037.0, + "step": 14114 + }, + { + "entropy": 1.6137764851252239, + "epoch": 1.5506028398011589, + "grad_norm": 0.7134261727333069, + "learning_rate": 4.28131980970565e-06, + "loss": 1.2265, + "mean_token_accuracy": 0.682574192682902, + "num_tokens": 2367828930.0, + "step": 14115 + }, + { + "entropy": 1.7493961155414581, + "epoch": 1.550712696712532, + "grad_norm": 0.6667389273643494, + "learning_rate": 4.280254584232616e-06, + "loss": 1.4779, + "mean_token_accuracy": 0.6394469936688741, + "num_tokens": 2368035180.0, + "step": 14116 + }, + { + "entropy": 1.6516866981983185, + "epoch": 1.5508225536239049, + "grad_norm": 0.7217621803283691, + "learning_rate": 4.279189571436497e-06, + "loss": 1.3961, + "mean_token_accuracy": 0.6504537761211395, + "num_tokens": 2368265315.0, + "step": 14117 + }, + { + "entropy": 1.720879077911377, + "epoch": 1.5509324105352778, + "grad_norm": 0.7750219702720642, + "learning_rate": 4.2781247713509985e-06, + "loss": 1.5135, + "mean_token_accuracy": 0.6509824097156525, + "num_tokens": 2368447962.0, + "step": 14118 + }, + { + "entropy": 1.7540039718151093, + "epoch": 1.5510422674466509, + "grad_norm": 0.768004834651947, + "learning_rate": 4.2770601840098235e-06, + "loss": 1.4708, + "mean_token_accuracy": 0.664596493045489, + "num_tokens": 2368632837.0, + "step": 14119 + }, + { + "entropy": 1.6874784628550212, + "epoch": 1.5511521243580235, + "grad_norm": 0.6328350305557251, + "learning_rate": 4.275995809446661e-06, + "loss": 1.3143, + "mean_token_accuracy": 0.661526824037234, + "num_tokens": 2368796121.0, + "step": 14120 + }, + { + "entropy": 1.7491315305233002, + "epoch": 1.5512619812693966, + "grad_norm": 0.6634995341300964, + "learning_rate": 4.274931647695205e-06, + "loss": 1.4015, + "mean_token_accuracy": 0.6602429201205572, + "num_tokens": 2368954333.0, + "step": 14121 + }, + { + "entropy": 1.7477340896924336, + "epoch": 1.5513718381807695, + "grad_norm": 0.8590699434280396, + "learning_rate": 4.273867698789132e-06, + "loss": 1.4338, + "mean_token_accuracy": 0.6396941244602203, + "num_tokens": 2369138689.0, + "step": 14122 + }, + { + "entropy": 1.671852171421051, + "epoch": 1.5514816950921424, + "grad_norm": 0.63103187084198, + "learning_rate": 4.272803962762112e-06, + "loss": 1.3111, + "mean_token_accuracy": 0.6657995829979578, + "num_tokens": 2369305079.0, + "step": 14123 + }, + { + "entropy": 1.6969023446242015, + "epoch": 1.5515915520035155, + "grad_norm": 0.8352360725402832, + "learning_rate": 4.271740439647815e-06, + "loss": 1.5118, + "mean_token_accuracy": 0.6522821436325709, + "num_tokens": 2369463395.0, + "step": 14124 + }, + { + "entropy": 1.7147069871425629, + "epoch": 1.5517014089148884, + "grad_norm": 0.714336097240448, + "learning_rate": 4.270677129479908e-06, + "loss": 1.3111, + "mean_token_accuracy": 0.6640656888484955, + "num_tokens": 2369580422.0, + "step": 14125 + }, + { + "entropy": 1.6699798206488292, + "epoch": 1.5518112658262613, + "grad_norm": 0.6139026284217834, + "learning_rate": 4.2696140322920305e-06, + "loss": 1.3299, + "mean_token_accuracy": 0.6675732731819153, + "num_tokens": 2369739830.0, + "step": 14126 + }, + { + "entropy": 1.642260581254959, + "epoch": 1.5519211227376344, + "grad_norm": 0.6656533479690552, + "learning_rate": 4.268551148117836e-06, + "loss": 1.4251, + "mean_token_accuracy": 0.651205783089002, + "num_tokens": 2369902180.0, + "step": 14127 + }, + { + "entropy": 1.6300160487492878, + "epoch": 1.552030979649007, + "grad_norm": 0.7406736612319946, + "learning_rate": 4.26748847699097e-06, + "loss": 1.4728, + "mean_token_accuracy": 0.6558305223782858, + "num_tokens": 2370101604.0, + "step": 14128 + }, + { + "entropy": 1.705411930878957, + "epoch": 1.5521408365603802, + "grad_norm": 0.6777219772338867, + "learning_rate": 4.266426018945058e-06, + "loss": 1.3698, + "mean_token_accuracy": 0.6538793096939722, + "num_tokens": 2370260513.0, + "step": 14129 + }, + { + "entropy": 1.6665898859500885, + "epoch": 1.552250693471753, + "grad_norm": 0.7032326459884644, + "learning_rate": 4.265363774013724e-06, + "loss": 1.4292, + "mean_token_accuracy": 0.6513901352882385, + "num_tokens": 2370418589.0, + "step": 14130 + }, + { + "entropy": 1.7191846172014873, + "epoch": 1.552360550383126, + "grad_norm": 0.6671421527862549, + "learning_rate": 4.264301742230597e-06, + "loss": 1.4887, + "mean_token_accuracy": 0.6549615909655889, + "num_tokens": 2370593404.0, + "step": 14131 + }, + { + "entropy": 1.7173360486825306, + "epoch": 1.552470407294499, + "grad_norm": 0.7962349057197571, + "learning_rate": 4.263239923629281e-06, + "loss": 1.4229, + "mean_token_accuracy": 0.6445601582527161, + "num_tokens": 2370764543.0, + "step": 14132 + }, + { + "entropy": 1.7082890371481578, + "epoch": 1.5525802642058717, + "grad_norm": 0.8383502960205078, + "learning_rate": 4.262178318243388e-06, + "loss": 1.1737, + "mean_token_accuracy": 0.6861835420131683, + "num_tokens": 2370883229.0, + "step": 14133 + }, + { + "entropy": 1.7188211580117543, + "epoch": 1.5526901211172448, + "grad_norm": 0.7299179434776306, + "learning_rate": 4.261116926106516e-06, + "loss": 1.3521, + "mean_token_accuracy": 0.6593069980541865, + "num_tokens": 2371046757.0, + "step": 14134 + }, + { + "entropy": 1.6964614987373352, + "epoch": 1.5527999780286177, + "grad_norm": 0.7393060326576233, + "learning_rate": 4.260055747252254e-06, + "loss": 1.5476, + "mean_token_accuracy": 0.6422794361909231, + "num_tokens": 2371236555.0, + "step": 14135 + }, + { + "entropy": 1.7541022598743439, + "epoch": 1.5529098349399906, + "grad_norm": 0.753736674785614, + "learning_rate": 4.25899478171419e-06, + "loss": 1.4018, + "mean_token_accuracy": 0.6527466426293055, + "num_tokens": 2371417078.0, + "step": 14136 + }, + { + "entropy": 1.7315999070803325, + "epoch": 1.5530196918513637, + "grad_norm": 2.197566270828247, + "learning_rate": 4.25793402952591e-06, + "loss": 1.1632, + "mean_token_accuracy": 0.6761045108238856, + "num_tokens": 2371602729.0, + "step": 14137 + }, + { + "entropy": 1.7214235365390778, + "epoch": 1.5531295487627366, + "grad_norm": 0.764700710773468, + "learning_rate": 4.256873490720973e-06, + "loss": 1.6514, + "mean_token_accuracy": 0.631921668847402, + "num_tokens": 2371808926.0, + "step": 14138 + }, + { + "entropy": 1.7141134142875671, + "epoch": 1.5532394056741095, + "grad_norm": 0.5947176814079285, + "learning_rate": 4.2558131653329544e-06, + "loss": 1.3755, + "mean_token_accuracy": 0.6477245340744654, + "num_tokens": 2372023718.0, + "step": 14139 + }, + { + "entropy": 1.692691445350647, + "epoch": 1.5533492625854826, + "grad_norm": 0.6217459440231323, + "learning_rate": 4.254753053395409e-06, + "loss": 1.3618, + "mean_token_accuracy": 0.6693163911501566, + "num_tokens": 2372178895.0, + "step": 14140 + }, + { + "entropy": 1.7047333717346191, + "epoch": 1.5534591194968552, + "grad_norm": 0.6178786158561707, + "learning_rate": 4.2536931549418904e-06, + "loss": 1.4113, + "mean_token_accuracy": 0.6456644187370936, + "num_tokens": 2372352839.0, + "step": 14141 + }, + { + "entropy": 1.6846307615439098, + "epoch": 1.5535689764082283, + "grad_norm": 0.6864225268363953, + "learning_rate": 4.252633470005945e-06, + "loss": 1.3063, + "mean_token_accuracy": 0.6656875361998876, + "num_tokens": 2372488611.0, + "step": 14142 + }, + { + "entropy": 1.7061450779438019, + "epoch": 1.5536788333196012, + "grad_norm": 0.6568386554718018, + "learning_rate": 4.2515739986211055e-06, + "loss": 1.6754, + "mean_token_accuracy": 0.6310825794935226, + "num_tokens": 2372678594.0, + "step": 14143 + }, + { + "entropy": 1.6643461883068085, + "epoch": 1.553788690230974, + "grad_norm": 0.5872832536697388, + "learning_rate": 4.25051474082091e-06, + "loss": 1.4162, + "mean_token_accuracy": 0.6413596421480179, + "num_tokens": 2372892692.0, + "step": 14144 + }, + { + "entropy": 1.7513802250226338, + "epoch": 1.5538985471423472, + "grad_norm": 0.6232224702835083, + "learning_rate": 4.249455696638883e-06, + "loss": 1.2908, + "mean_token_accuracy": 0.6734890739123026, + "num_tokens": 2373012728.0, + "step": 14145 + }, + { + "entropy": 1.7303274869918823, + "epoch": 1.5540084040537199, + "grad_norm": 0.7580487728118896, + "learning_rate": 4.248396866108543e-06, + "loss": 1.3061, + "mean_token_accuracy": 0.6751085370779037, + "num_tokens": 2373133773.0, + "step": 14146 + }, + { + "entropy": 1.7423675457636516, + "epoch": 1.554118260965093, + "grad_norm": 0.7742456197738647, + "learning_rate": 4.247338249263395e-06, + "loss": 1.3927, + "mean_token_accuracy": 0.6640025774637858, + "num_tokens": 2373288121.0, + "step": 14147 + }, + { + "entropy": 1.7087977528572083, + "epoch": 1.5542281178764659, + "grad_norm": 0.6881621479988098, + "learning_rate": 4.246279846136953e-06, + "loss": 1.3576, + "mean_token_accuracy": 0.6579535851875941, + "num_tokens": 2373465928.0, + "step": 14148 + }, + { + "entropy": 1.7393077313899994, + "epoch": 1.5543379747878387, + "grad_norm": 0.7589281797409058, + "learning_rate": 4.24522165676271e-06, + "loss": 1.6051, + "mean_token_accuracy": 0.6269615292549133, + "num_tokens": 2373690820.0, + "step": 14149 + }, + { + "entropy": 1.690729945898056, + "epoch": 1.5544478316992119, + "grad_norm": 0.6652538776397705, + "learning_rate": 4.244163681174155e-06, + "loss": 1.266, + "mean_token_accuracy": 0.6762543171644211, + "num_tokens": 2373802485.0, + "step": 14150 + }, + { + "entropy": 1.6694500545660655, + "epoch": 1.5545576886105847, + "grad_norm": 0.6579715013504028, + "learning_rate": 4.243105919404778e-06, + "loss": 1.298, + "mean_token_accuracy": 0.6703143616517385, + "num_tokens": 2373952556.0, + "step": 14151 + }, + { + "entropy": 1.678593675295512, + "epoch": 1.5546675455219576, + "grad_norm": 0.6812438368797302, + "learning_rate": 4.2420483714880515e-06, + "loss": 1.3788, + "mean_token_accuracy": 0.6640019963184992, + "num_tokens": 2374097793.0, + "step": 14152 + }, + { + "entropy": 1.6968460778395336, + "epoch": 1.5547774024333307, + "grad_norm": 0.6862781047821045, + "learning_rate": 4.2409910374574504e-06, + "loss": 1.4078, + "mean_token_accuracy": 0.6522165536880493, + "num_tokens": 2374255586.0, + "step": 14153 + }, + { + "entropy": 1.7099703550338745, + "epoch": 1.5548872593447034, + "grad_norm": 0.6841778755187988, + "learning_rate": 4.239933917346437e-06, + "loss": 1.4141, + "mean_token_accuracy": 0.6437595734993616, + "num_tokens": 2374450111.0, + "step": 14154 + }, + { + "entropy": 1.8270711302757263, + "epoch": 1.5549971162560765, + "grad_norm": 1.8875739574432373, + "learning_rate": 4.238877011188468e-06, + "loss": 1.5367, + "mean_token_accuracy": 0.6540461281935374, + "num_tokens": 2374596550.0, + "step": 14155 + }, + { + "entropy": 1.6991891662279766, + "epoch": 1.5551069731674494, + "grad_norm": 1.2882972955703735, + "learning_rate": 4.237820319016994e-06, + "loss": 1.2703, + "mean_token_accuracy": 0.6688820570707321, + "num_tokens": 2374811196.0, + "step": 14156 + }, + { + "entropy": 1.7167048652966816, + "epoch": 1.5552168300788223, + "grad_norm": 0.5811730027198792, + "learning_rate": 4.236763840865467e-06, + "loss": 1.508, + "mean_token_accuracy": 0.6333837409814199, + "num_tokens": 2375025686.0, + "step": 14157 + }, + { + "entropy": 1.677632709344228, + "epoch": 1.5553266869901954, + "grad_norm": 0.5716765522956848, + "learning_rate": 4.23570757676731e-06, + "loss": 1.3812, + "mean_token_accuracy": 0.6522747029860815, + "num_tokens": 2375216440.0, + "step": 14158 + }, + { + "entropy": 1.708607812722524, + "epoch": 1.555436543901568, + "grad_norm": 0.7346358299255371, + "learning_rate": 4.23465152675596e-06, + "loss": 1.4194, + "mean_token_accuracy": 0.6497345666090647, + "num_tokens": 2375379987.0, + "step": 14159 + }, + { + "entropy": 1.6388816436131795, + "epoch": 1.5555464008129412, + "grad_norm": 0.7345917820930481, + "learning_rate": 4.2335956908648425e-06, + "loss": 1.2366, + "mean_token_accuracy": 0.6748053133487701, + "num_tokens": 2375590298.0, + "step": 14160 + }, + { + "entropy": 1.7384839057922363, + "epoch": 1.555656257724314, + "grad_norm": 0.6710056066513062, + "learning_rate": 4.2325400691273735e-06, + "loss": 1.5245, + "mean_token_accuracy": 0.6526643683513006, + "num_tokens": 2375744576.0, + "step": 14161 + }, + { + "entropy": 1.6977645556132, + "epoch": 1.555766114635687, + "grad_norm": 0.635971188545227, + "learning_rate": 4.231484661576959e-06, + "loss": 1.3574, + "mean_token_accuracy": 0.6610794266064962, + "num_tokens": 2375881526.0, + "step": 14162 + }, + { + "entropy": 1.6987218856811523, + "epoch": 1.55587597154706, + "grad_norm": 0.6726696491241455, + "learning_rate": 4.2304294682470074e-06, + "loss": 1.2988, + "mean_token_accuracy": 0.673176700870196, + "num_tokens": 2376011240.0, + "step": 14163 + }, + { + "entropy": 1.6941667199134827, + "epoch": 1.555985828458433, + "grad_norm": 0.7962403297424316, + "learning_rate": 4.22937448917091e-06, + "loss": 1.2457, + "mean_token_accuracy": 0.6745900958776474, + "num_tokens": 2376165761.0, + "step": 14164 + }, + { + "entropy": 1.6965778370698292, + "epoch": 1.5560956853698058, + "grad_norm": 0.6440910696983337, + "learning_rate": 4.228319724382062e-06, + "loss": 1.5532, + "mean_token_accuracy": 0.644252801934878, + "num_tokens": 2376349498.0, + "step": 14165 + }, + { + "entropy": 1.7470646401246388, + "epoch": 1.556205542281179, + "grad_norm": 0.789556086063385, + "learning_rate": 4.227265173913843e-06, + "loss": 1.4675, + "mean_token_accuracy": 0.6519816418488821, + "num_tokens": 2376516296.0, + "step": 14166 + }, + { + "entropy": 1.7214811444282532, + "epoch": 1.5563153991925516, + "grad_norm": 0.62434321641922, + "learning_rate": 4.226210837799627e-06, + "loss": 1.4814, + "mean_token_accuracy": 0.6433521310488383, + "num_tokens": 2376722817.0, + "step": 14167 + }, + { + "entropy": 1.684100478887558, + "epoch": 1.5564252561039247, + "grad_norm": 0.7984063029289246, + "learning_rate": 4.2251567160727855e-06, + "loss": 1.3731, + "mean_token_accuracy": 0.6490538815657297, + "num_tokens": 2376895776.0, + "step": 14168 + }, + { + "entropy": 1.7043040891488392, + "epoch": 1.5565351130152976, + "grad_norm": 0.6661989092826843, + "learning_rate": 4.224102808766687e-06, + "loss": 1.381, + "mean_token_accuracy": 0.6549601207176844, + "num_tokens": 2377035427.0, + "step": 14169 + }, + { + "entropy": 1.725882242123286, + "epoch": 1.5566449699266705, + "grad_norm": 0.6910036206245422, + "learning_rate": 4.223049115914676e-06, + "loss": 1.567, + "mean_token_accuracy": 0.6483024209737778, + "num_tokens": 2377217821.0, + "step": 14170 + }, + { + "entropy": 1.665495256582896, + "epoch": 1.5567548268380436, + "grad_norm": 0.5861942172050476, + "learning_rate": 4.221995637550106e-06, + "loss": 1.4559, + "mean_token_accuracy": 0.6493095854918162, + "num_tokens": 2377446515.0, + "step": 14171 + }, + { + "entropy": 1.7102225919564564, + "epoch": 1.5568646837494162, + "grad_norm": 0.8284782767295837, + "learning_rate": 4.220942373706323e-06, + "loss": 1.524, + "mean_token_accuracy": 0.6480698237816492, + "num_tokens": 2377592535.0, + "step": 14172 + }, + { + "entropy": 1.6960971355438232, + "epoch": 1.5569745406607893, + "grad_norm": 0.5749043822288513, + "learning_rate": 4.219889324416659e-06, + "loss": 1.4179, + "mean_token_accuracy": 0.6462828616301218, + "num_tokens": 2377780665.0, + "step": 14173 + }, + { + "entropy": 1.7206454773743947, + "epoch": 1.5570843975721622, + "grad_norm": 0.6680061221122742, + "learning_rate": 4.218836489714439e-06, + "loss": 1.3162, + "mean_token_accuracy": 0.6785482615232468, + "num_tokens": 2377910201.0, + "step": 14174 + }, + { + "entropy": 1.6479010879993439, + "epoch": 1.557194254483535, + "grad_norm": 0.6720148921012878, + "learning_rate": 4.217783869632992e-06, + "loss": 1.3067, + "mean_token_accuracy": 0.6648319562276205, + "num_tokens": 2378062830.0, + "step": 14175 + }, + { + "entropy": 1.6538492143154144, + "epoch": 1.5573041113949082, + "grad_norm": 0.6240495443344116, + "learning_rate": 4.216731464205627e-06, + "loss": 1.5057, + "mean_token_accuracy": 0.6449542989333471, + "num_tokens": 2378278509.0, + "step": 14176 + }, + { + "entropy": 1.6563106775283813, + "epoch": 1.557413968306281, + "grad_norm": 0.6286166906356812, + "learning_rate": 4.215679273465657e-06, + "loss": 1.374, + "mean_token_accuracy": 0.6685599933067957, + "num_tokens": 2378484457.0, + "step": 14177 + }, + { + "entropy": 1.7658939957618713, + "epoch": 1.557523825217654, + "grad_norm": 0.7517053484916687, + "learning_rate": 4.214627297446381e-06, + "loss": 1.3914, + "mean_token_accuracy": 0.6562267889579138, + "num_tokens": 2378641243.0, + "step": 14178 + }, + { + "entropy": 1.7408847510814667, + "epoch": 1.557633682129027, + "grad_norm": 0.7769100666046143, + "learning_rate": 4.2135755361810905e-06, + "loss": 1.4162, + "mean_token_accuracy": 0.6557272672653198, + "num_tokens": 2378783586.0, + "step": 14179 + }, + { + "entropy": 1.634146640698115, + "epoch": 1.5577435390403997, + "grad_norm": 0.695136547088623, + "learning_rate": 4.212523989703077e-06, + "loss": 1.3489, + "mean_token_accuracy": 0.6566696465015411, + "num_tokens": 2378935922.0, + "step": 14180 + }, + { + "entropy": 1.7598876059055328, + "epoch": 1.5578533959517729, + "grad_norm": 0.7562305331230164, + "learning_rate": 4.211472658045625e-06, + "loss": 1.368, + "mean_token_accuracy": 0.6584480106830597, + "num_tokens": 2379071944.0, + "step": 14181 + }, + { + "entropy": 1.6786730587482452, + "epoch": 1.5579632528631457, + "grad_norm": 0.6836439371109009, + "learning_rate": 4.210421541242e-06, + "loss": 1.4001, + "mean_token_accuracy": 0.6513353139162064, + "num_tokens": 2379284248.0, + "step": 14182 + }, + { + "entropy": 1.6920330325762432, + "epoch": 1.5580731097745186, + "grad_norm": 0.9302690029144287, + "learning_rate": 4.209370639325473e-06, + "loss": 1.6559, + "mean_token_accuracy": 0.6281924843788147, + "num_tokens": 2379450051.0, + "step": 14183 + }, + { + "entropy": 1.6634094417095184, + "epoch": 1.5581829666858917, + "grad_norm": 0.6819601655006409, + "learning_rate": 4.208319952329308e-06, + "loss": 1.4073, + "mean_token_accuracy": 0.6675108969211578, + "num_tokens": 2379644676.0, + "step": 14184 + }, + { + "entropy": 1.8039693931738536, + "epoch": 1.5582928235972644, + "grad_norm": 0.7473592758178711, + "learning_rate": 4.207269480286757e-06, + "loss": 1.52, + "mean_token_accuracy": 0.6487270295619965, + "num_tokens": 2379777755.0, + "step": 14185 + }, + { + "entropy": 1.8343103528022766, + "epoch": 1.5584026805086375, + "grad_norm": 0.7583062648773193, + "learning_rate": 4.2062192232310626e-06, + "loss": 1.5376, + "mean_token_accuracy": 0.6354842483997345, + "num_tokens": 2379973389.0, + "step": 14186 + }, + { + "entropy": 1.6272041896979015, + "epoch": 1.5585125374200104, + "grad_norm": 0.6743770837783813, + "learning_rate": 4.205169181195471e-06, + "loss": 1.2484, + "mean_token_accuracy": 0.672362208366394, + "num_tokens": 2380116473.0, + "step": 14187 + }, + { + "entropy": 1.7308520078659058, + "epoch": 1.5586223943313833, + "grad_norm": 0.7099290490150452, + "learning_rate": 4.204119354213211e-06, + "loss": 1.4756, + "mean_token_accuracy": 0.6549335817495981, + "num_tokens": 2380265493.0, + "step": 14188 + }, + { + "entropy": 1.6808405816555023, + "epoch": 1.5587322512427564, + "grad_norm": 0.6774865984916687, + "learning_rate": 4.203069742317514e-06, + "loss": 1.4098, + "mean_token_accuracy": 0.6618853112061819, + "num_tokens": 2380433799.0, + "step": 14189 + }, + { + "entropy": 1.792554259300232, + "epoch": 1.5588421081541293, + "grad_norm": 0.7282685041427612, + "learning_rate": 4.202020345541596e-06, + "loss": 1.3166, + "mean_token_accuracy": 0.6641228546698889, + "num_tokens": 2380605424.0, + "step": 14190 + }, + { + "entropy": 1.718471388022105, + "epoch": 1.5589519650655022, + "grad_norm": 0.6723158359527588, + "learning_rate": 4.200971163918669e-06, + "loss": 1.3729, + "mean_token_accuracy": 0.6559204707543055, + "num_tokens": 2380791222.0, + "step": 14191 + }, + { + "entropy": 1.7398603161176045, + "epoch": 1.5590618219768753, + "grad_norm": 0.7290942668914795, + "learning_rate": 4.199922197481939e-06, + "loss": 1.3562, + "mean_token_accuracy": 0.6515438904364904, + "num_tokens": 2380945915.0, + "step": 14192 + }, + { + "entropy": 1.7121857802073162, + "epoch": 1.559171678888248, + "grad_norm": 0.6564915180206299, + "learning_rate": 4.198873446264615e-06, + "loss": 1.4534, + "mean_token_accuracy": 0.6428688168525696, + "num_tokens": 2381107747.0, + "step": 14193 + }, + { + "entropy": 1.7519804040590923, + "epoch": 1.559281535799621, + "grad_norm": 0.764492392539978, + "learning_rate": 4.197824910299875e-06, + "loss": 1.3467, + "mean_token_accuracy": 0.6621157228946686, + "num_tokens": 2381225043.0, + "step": 14194 + }, + { + "entropy": 1.6727059384187062, + "epoch": 1.559391392710994, + "grad_norm": 0.6586747169494629, + "learning_rate": 4.1967765896209115e-06, + "loss": 1.3698, + "mean_token_accuracy": 0.6521917631228765, + "num_tokens": 2381386769.0, + "step": 14195 + }, + { + "entropy": 1.7378324270248413, + "epoch": 1.5595012496223668, + "grad_norm": 0.7085768580436707, + "learning_rate": 4.195728484260906e-06, + "loss": 1.2976, + "mean_token_accuracy": 0.6643084188302358, + "num_tokens": 2381524930.0, + "step": 14196 + }, + { + "entropy": 1.6326094667116802, + "epoch": 1.55961110653374, + "grad_norm": 1.9627199172973633, + "learning_rate": 4.19468059425303e-06, + "loss": 1.4151, + "mean_token_accuracy": 0.6560747673114141, + "num_tokens": 2381764868.0, + "step": 14197 + }, + { + "entropy": 1.6878510216871898, + "epoch": 1.5597209634451128, + "grad_norm": 0.5996699333190918, + "learning_rate": 4.193632919630441e-06, + "loss": 1.3121, + "mean_token_accuracy": 0.6628714253505071, + "num_tokens": 2381931844.0, + "step": 14198 + }, + { + "entropy": 1.6812229951222737, + "epoch": 1.5598308203564857, + "grad_norm": 0.7253499627113342, + "learning_rate": 4.192585460426307e-06, + "loss": 1.3634, + "mean_token_accuracy": 0.6608059406280518, + "num_tokens": 2382152497.0, + "step": 14199 + }, + { + "entropy": 1.7064985831578572, + "epoch": 1.5599406772678586, + "grad_norm": 0.7470372915267944, + "learning_rate": 4.191538216673774e-06, + "loss": 1.3977, + "mean_token_accuracy": 0.6608186711867651, + "num_tokens": 2382323536.0, + "step": 14200 + }, + { + "entropy": 1.704755167166392, + "epoch": 1.5600505341792315, + "grad_norm": 0.665684700012207, + "learning_rate": 4.190491188405989e-06, + "loss": 1.5564, + "mean_token_accuracy": 0.6435166969895363, + "num_tokens": 2382522953.0, + "step": 14201 + }, + { + "entropy": 1.6741365194320679, + "epoch": 1.5601603910906046, + "grad_norm": 0.6628307104110718, + "learning_rate": 4.189444375656091e-06, + "loss": 1.4985, + "mean_token_accuracy": 0.6512368569771448, + "num_tokens": 2382704610.0, + "step": 14202 + }, + { + "entropy": 1.6700753569602966, + "epoch": 1.5602702480019774, + "grad_norm": 0.678337812423706, + "learning_rate": 4.188397778457207e-06, + "loss": 1.4405, + "mean_token_accuracy": 0.6458247303962708, + "num_tokens": 2382894458.0, + "step": 14203 + }, + { + "entropy": 1.6928566992282867, + "epoch": 1.5603801049133503, + "grad_norm": 0.7208871841430664, + "learning_rate": 4.187351396842466e-06, + "loss": 1.2387, + "mean_token_accuracy": 0.6764888813098272, + "num_tokens": 2383057750.0, + "step": 14204 + }, + { + "entropy": 1.670570929845174, + "epoch": 1.5604899618247234, + "grad_norm": 0.6567087173461914, + "learning_rate": 4.186305230844984e-06, + "loss": 1.3304, + "mean_token_accuracy": 0.6640313764413198, + "num_tokens": 2383237390.0, + "step": 14205 + }, + { + "entropy": 1.7132259011268616, + "epoch": 1.560599818736096, + "grad_norm": 0.7733318209648132, + "learning_rate": 4.185259280497867e-06, + "loss": 1.4444, + "mean_token_accuracy": 0.6487619827191035, + "num_tokens": 2383433523.0, + "step": 14206 + }, + { + "entropy": 1.7129162947336833, + "epoch": 1.5607096756474692, + "grad_norm": 0.8005481362342834, + "learning_rate": 4.184213545834227e-06, + "loss": 1.2981, + "mean_token_accuracy": 0.6748090038696924, + "num_tokens": 2383559455.0, + "step": 14207 + }, + { + "entropy": 1.720715989669164, + "epoch": 1.560819532558842, + "grad_norm": 0.7148941159248352, + "learning_rate": 4.183168026887154e-06, + "loss": 1.5122, + "mean_token_accuracy": 0.6497367918491364, + "num_tokens": 2383724385.0, + "step": 14208 + }, + { + "entropy": 1.647435188293457, + "epoch": 1.560929389470215, + "grad_norm": 0.5823018550872803, + "learning_rate": 4.1821227236897445e-06, + "loss": 1.3786, + "mean_token_accuracy": 0.6537514179944992, + "num_tokens": 2383964981.0, + "step": 14209 + }, + { + "entropy": 1.6917157073815663, + "epoch": 1.561039246381588, + "grad_norm": 0.6810054779052734, + "learning_rate": 4.1810776362750785e-06, + "loss": 1.4568, + "mean_token_accuracy": 0.6532232761383057, + "num_tokens": 2384142302.0, + "step": 14210 + }, + { + "entropy": 1.7271502912044525, + "epoch": 1.561149103292961, + "grad_norm": 0.7245599627494812, + "learning_rate": 4.180032764676228e-06, + "loss": 1.3084, + "mean_token_accuracy": 0.6805372933546702, + "num_tokens": 2384268777.0, + "step": 14211 + }, + { + "entropy": 1.7125717997550964, + "epoch": 1.5612589602043339, + "grad_norm": 0.6059805750846863, + "learning_rate": 4.178988108926269e-06, + "loss": 1.4136, + "mean_token_accuracy": 0.6488851606845856, + "num_tokens": 2384458391.0, + "step": 14212 + }, + { + "entropy": 1.7332377235094707, + "epoch": 1.5613688171157067, + "grad_norm": 0.633551299571991, + "learning_rate": 4.177943669058267e-06, + "loss": 1.4808, + "mean_token_accuracy": 0.6372072199980418, + "num_tokens": 2384688739.0, + "step": 14213 + }, + { + "entropy": 1.7179057399431865, + "epoch": 1.5614786740270796, + "grad_norm": 0.6609451174736023, + "learning_rate": 4.176899445105271e-06, + "loss": 1.4831, + "mean_token_accuracy": 0.6471300423145294, + "num_tokens": 2384873167.0, + "step": 14214 + }, + { + "entropy": 1.672248860200246, + "epoch": 1.5615885309384527, + "grad_norm": 0.6449457406997681, + "learning_rate": 4.175855437100331e-06, + "loss": 1.2892, + "mean_token_accuracy": 0.6665244797865549, + "num_tokens": 2385012942.0, + "step": 14215 + }, + { + "entropy": 1.6491312483946483, + "epoch": 1.5616983878498256, + "grad_norm": 0.7436006665229797, + "learning_rate": 4.174811645076494e-06, + "loss": 1.5262, + "mean_token_accuracy": 0.6568896919488907, + "num_tokens": 2385161630.0, + "step": 14216 + }, + { + "entropy": 1.7142470479011536, + "epoch": 1.5618082447611985, + "grad_norm": 0.732032060623169, + "learning_rate": 4.1737680690667935e-06, + "loss": 1.3078, + "mean_token_accuracy": 0.6583902637163798, + "num_tokens": 2385282234.0, + "step": 14217 + }, + { + "entropy": 1.6656326552232106, + "epoch": 1.5619181016725716, + "grad_norm": 0.6994863152503967, + "learning_rate": 4.172724709104256e-06, + "loss": 1.4121, + "mean_token_accuracy": 0.6581338991721472, + "num_tokens": 2385490081.0, + "step": 14218 + }, + { + "entropy": 1.7145332098007202, + "epoch": 1.5620279585839443, + "grad_norm": 0.7812539339065552, + "learning_rate": 4.171681565221905e-06, + "loss": 1.4355, + "mean_token_accuracy": 0.6510881582895914, + "num_tokens": 2385658398.0, + "step": 14219 + }, + { + "entropy": 1.6682457625865936, + "epoch": 1.5621378154953174, + "grad_norm": 0.8907629251480103, + "learning_rate": 4.170638637452755e-06, + "loss": 1.455, + "mean_token_accuracy": 0.6465255270401636, + "num_tokens": 2385858820.0, + "step": 14220 + }, + { + "entropy": 1.6630571881930034, + "epoch": 1.5622476724066903, + "grad_norm": 0.7714592814445496, + "learning_rate": 4.1695959258298155e-06, + "loss": 1.2204, + "mean_token_accuracy": 0.6844823310772578, + "num_tokens": 2386000558.0, + "step": 14221 + }, + { + "entropy": 1.7163423299789429, + "epoch": 1.5623575293180632, + "grad_norm": 0.7476485371589661, + "learning_rate": 4.1685534303860895e-06, + "loss": 1.4154, + "mean_token_accuracy": 0.6500615924596786, + "num_tokens": 2386147024.0, + "step": 14222 + }, + { + "entropy": 1.7059412399927776, + "epoch": 1.5624673862294363, + "grad_norm": 0.7124969959259033, + "learning_rate": 4.1675111511545655e-06, + "loss": 1.2981, + "mean_token_accuracy": 0.6605760852495829, + "num_tokens": 2386304325.0, + "step": 14223 + }, + { + "entropy": 1.7372475763161976, + "epoch": 1.5625772431408091, + "grad_norm": 0.6242848038673401, + "learning_rate": 4.166469088168235e-06, + "loss": 1.3766, + "mean_token_accuracy": 0.6519166280825933, + "num_tokens": 2386470095.0, + "step": 14224 + }, + { + "entropy": 1.7890447576840718, + "epoch": 1.562687100052182, + "grad_norm": 0.9228449463844299, + "learning_rate": 4.16542724146008e-06, + "loss": 1.5208, + "mean_token_accuracy": 0.6443865299224854, + "num_tokens": 2386662952.0, + "step": 14225 + }, + { + "entropy": 1.7128015756607056, + "epoch": 1.562796956963555, + "grad_norm": 0.585515022277832, + "learning_rate": 4.164385611063074e-06, + "loss": 1.3964, + "mean_token_accuracy": 0.645780528585116, + "num_tokens": 2386869100.0, + "step": 14226 + }, + { + "entropy": 1.6932969292004902, + "epoch": 1.5629068138749278, + "grad_norm": 0.6653163433074951, + "learning_rate": 4.163344197010181e-06, + "loss": 1.3276, + "mean_token_accuracy": 0.674016997218132, + "num_tokens": 2387014196.0, + "step": 14227 + }, + { + "entropy": 1.7202288210391998, + "epoch": 1.563016670786301, + "grad_norm": 0.7792028784751892, + "learning_rate": 4.162302999334366e-06, + "loss": 1.3553, + "mean_token_accuracy": 0.6684650580088297, + "num_tokens": 2387133483.0, + "step": 14228 + }, + { + "entropy": 1.6807933350404103, + "epoch": 1.5631265276976738, + "grad_norm": 0.6629685759544373, + "learning_rate": 4.1612620180685795e-06, + "loss": 1.5153, + "mean_token_accuracy": 0.6433508296807607, + "num_tokens": 2387302924.0, + "step": 14229 + }, + { + "entropy": 1.690634439388911, + "epoch": 1.5632363846090467, + "grad_norm": 0.6891497373580933, + "learning_rate": 4.160221253245765e-06, + "loss": 1.2502, + "mean_token_accuracy": 0.6735278566678365, + "num_tokens": 2387484689.0, + "step": 14230 + }, + { + "entropy": 1.7032279173533122, + "epoch": 1.5633462415204198, + "grad_norm": 0.6984847784042358, + "learning_rate": 4.15918070489887e-06, + "loss": 1.3396, + "mean_token_accuracy": 0.6606222689151764, + "num_tokens": 2387626318.0, + "step": 14231 + }, + { + "entropy": 1.678362290064494, + "epoch": 1.5634560984317925, + "grad_norm": 0.6913422346115112, + "learning_rate": 4.1581403730608185e-06, + "loss": 1.3096, + "mean_token_accuracy": 0.6657731880744299, + "num_tokens": 2387753790.0, + "step": 14232 + }, + { + "entropy": 1.6654574970404308, + "epoch": 1.5635659553431656, + "grad_norm": 0.8690144419670105, + "learning_rate": 4.157100257764545e-06, + "loss": 1.5989, + "mean_token_accuracy": 0.6529507786035538, + "num_tokens": 2387921051.0, + "step": 14233 + }, + { + "entropy": 1.715543528397878, + "epoch": 1.5636758122545384, + "grad_norm": 0.6697078347206116, + "learning_rate": 4.156060359042966e-06, + "loss": 1.4025, + "mean_token_accuracy": 0.6578785429398218, + "num_tokens": 2388071690.0, + "step": 14234 + }, + { + "entropy": 1.66527725259463, + "epoch": 1.5637856691659113, + "grad_norm": 0.7394384145736694, + "learning_rate": 4.1550206769289885e-06, + "loss": 1.3616, + "mean_token_accuracy": 0.6681285699208578, + "num_tokens": 2388294674.0, + "step": 14235 + }, + { + "entropy": 1.7145603199799855, + "epoch": 1.5638955260772844, + "grad_norm": 0.7570204734802246, + "learning_rate": 4.1539812114555225e-06, + "loss": 1.4025, + "mean_token_accuracy": 0.6636460820833842, + "num_tokens": 2388439276.0, + "step": 14236 + }, + { + "entropy": 1.624147782723109, + "epoch": 1.5640053829886573, + "grad_norm": 0.6261785626411438, + "learning_rate": 4.152941962655472e-06, + "loss": 1.3659, + "mean_token_accuracy": 0.6665515998999277, + "num_tokens": 2388609168.0, + "step": 14237 + }, + { + "entropy": 1.7175993124643962, + "epoch": 1.5641152399000302, + "grad_norm": 0.7182031869888306, + "learning_rate": 4.151902930561718e-06, + "loss": 1.3104, + "mean_token_accuracy": 0.6693373173475266, + "num_tokens": 2388741757.0, + "step": 14238 + }, + { + "entropy": 1.638315846522649, + "epoch": 1.564225096811403, + "grad_norm": 0.6741671562194824, + "learning_rate": 4.150864115207149e-06, + "loss": 1.3031, + "mean_token_accuracy": 0.6736795554558436, + "num_tokens": 2388882713.0, + "step": 14239 + }, + { + "entropy": 1.6951660414536793, + "epoch": 1.564334953722776, + "grad_norm": 0.8368586301803589, + "learning_rate": 4.149825516624648e-06, + "loss": 1.4751, + "mean_token_accuracy": 0.6537247101465861, + "num_tokens": 2389032474.0, + "step": 14240 + }, + { + "entropy": 1.7529374957084656, + "epoch": 1.564444810634149, + "grad_norm": 0.781149685382843, + "learning_rate": 4.148787134847083e-06, + "loss": 1.4741, + "mean_token_accuracy": 0.6479151596625646, + "num_tokens": 2389195676.0, + "step": 14241 + }, + { + "entropy": 1.6890328228473663, + "epoch": 1.564554667545522, + "grad_norm": 0.7580424547195435, + "learning_rate": 4.147748969907315e-06, + "loss": 1.5287, + "mean_token_accuracy": 0.6358017772436142, + "num_tokens": 2389402290.0, + "step": 14242 + }, + { + "entropy": 1.6835271914800007, + "epoch": 1.5646645244568949, + "grad_norm": 0.691645085811615, + "learning_rate": 4.1467110218382065e-06, + "loss": 1.3766, + "mean_token_accuracy": 0.6494946281115214, + "num_tokens": 2389597971.0, + "step": 14243 + }, + { + "entropy": 1.7536411086718242, + "epoch": 1.564774381368268, + "grad_norm": 0.7124913334846497, + "learning_rate": 4.145673290672604e-06, + "loss": 1.4361, + "mean_token_accuracy": 0.6526350329319636, + "num_tokens": 2389750552.0, + "step": 14244 + }, + { + "entropy": 1.761474738518397, + "epoch": 1.5648842382796406, + "grad_norm": 0.752509593963623, + "learning_rate": 4.144635776443355e-06, + "loss": 1.6245, + "mean_token_accuracy": 0.6290792971849442, + "num_tokens": 2390004070.0, + "step": 14245 + }, + { + "entropy": 1.6417441566785176, + "epoch": 1.5649940951910137, + "grad_norm": 0.6076568961143494, + "learning_rate": 4.143598479183296e-06, + "loss": 1.2921, + "mean_token_accuracy": 0.6611118962367376, + "num_tokens": 2390155985.0, + "step": 14246 + }, + { + "entropy": 1.7019707262516022, + "epoch": 1.5651039521023866, + "grad_norm": 0.7136411666870117, + "learning_rate": 4.142561398925251e-06, + "loss": 1.3393, + "mean_token_accuracy": 0.6593159635861715, + "num_tokens": 2390306312.0, + "step": 14247 + }, + { + "entropy": 1.676996996005376, + "epoch": 1.5652138090137595, + "grad_norm": 0.6909713745117188, + "learning_rate": 4.14152453570205e-06, + "loss": 1.2317, + "mean_token_accuracy": 0.6901508718729019, + "num_tokens": 2390448153.0, + "step": 14248 + }, + { + "entropy": 1.7591987252235413, + "epoch": 1.5653236659251326, + "grad_norm": 0.6709699630737305, + "learning_rate": 4.140487889546511e-06, + "loss": 1.5019, + "mean_token_accuracy": 0.6351676136255264, + "num_tokens": 2390635144.0, + "step": 14249 + }, + { + "entropy": 1.662870168685913, + "epoch": 1.5654335228365055, + "grad_norm": 0.7932735085487366, + "learning_rate": 4.1394514604914346e-06, + "loss": 1.273, + "mean_token_accuracy": 0.6632749090592066, + "num_tokens": 2390760110.0, + "step": 14250 + }, + { + "entropy": 1.716610203186671, + "epoch": 1.5655433797478784, + "grad_norm": 0.6800168752670288, + "learning_rate": 4.138415248569627e-06, + "loss": 1.5949, + "mean_token_accuracy": 0.6412303000688553, + "num_tokens": 2390971824.0, + "step": 14251 + }, + { + "entropy": 1.7617238660653431, + "epoch": 1.5656532366592515, + "grad_norm": 2.7349250316619873, + "learning_rate": 4.137379253813888e-06, + "loss": 1.3383, + "mean_token_accuracy": 0.6525413393974304, + "num_tokens": 2391201582.0, + "step": 14252 + }, + { + "entropy": 1.734719494978587, + "epoch": 1.5657630935706242, + "grad_norm": 0.6799651384353638, + "learning_rate": 4.136343476257003e-06, + "loss": 1.382, + "mean_token_accuracy": 0.6536955088376999, + "num_tokens": 2391339481.0, + "step": 14253 + }, + { + "entropy": 1.683516263961792, + "epoch": 1.5658729504819973, + "grad_norm": 0.6154446601867676, + "learning_rate": 4.135307915931752e-06, + "loss": 1.2975, + "mean_token_accuracy": 0.6653460214535395, + "num_tokens": 2391495107.0, + "step": 14254 + }, + { + "entropy": 1.6860613723595936, + "epoch": 1.5659828073933701, + "grad_norm": 0.6810904145240784, + "learning_rate": 4.1342725728709155e-06, + "loss": 1.3292, + "mean_token_accuracy": 0.6534279535214106, + "num_tokens": 2391656633.0, + "step": 14255 + }, + { + "entropy": 1.7145483096440632, + "epoch": 1.566092664304743, + "grad_norm": 0.6879013180732727, + "learning_rate": 4.133237447107254e-06, + "loss": 1.4758, + "mean_token_accuracy": 0.6510107268889745, + "num_tokens": 2391830562.0, + "step": 14256 + }, + { + "entropy": 1.716945578654607, + "epoch": 1.5662025212161161, + "grad_norm": 0.7154614329338074, + "learning_rate": 4.1322025386735366e-06, + "loss": 1.2937, + "mean_token_accuracy": 0.6757306108872095, + "num_tokens": 2391949707.0, + "step": 14257 + }, + { + "entropy": 1.684073011080424, + "epoch": 1.5663123781274888, + "grad_norm": 0.6808174252510071, + "learning_rate": 4.131167847602514e-06, + "loss": 1.3949, + "mean_token_accuracy": 0.653533269961675, + "num_tokens": 2392133416.0, + "step": 14258 + }, + { + "entropy": 1.6810634036858876, + "epoch": 1.566422235038862, + "grad_norm": 0.5711365342140198, + "learning_rate": 4.130133373926931e-06, + "loss": 1.2044, + "mean_token_accuracy": 0.672467311223348, + "num_tokens": 2392345107.0, + "step": 14259 + }, + { + "entropy": 1.6876142223676045, + "epoch": 1.5665320919502348, + "grad_norm": 0.6411077976226807, + "learning_rate": 4.129099117679534e-06, + "loss": 1.3813, + "mean_token_accuracy": 0.6480342298746109, + "num_tokens": 2392499285.0, + "step": 14260 + }, + { + "entropy": 1.74393226703008, + "epoch": 1.5666419488616077, + "grad_norm": 0.634601891040802, + "learning_rate": 4.128065078893054e-06, + "loss": 1.4692, + "mean_token_accuracy": 0.6403647114833196, + "num_tokens": 2392663103.0, + "step": 14261 + }, + { + "entropy": 1.7082592248916626, + "epoch": 1.5667518057729808, + "grad_norm": 0.5808596014976501, + "learning_rate": 4.127031257600215e-06, + "loss": 1.4834, + "mean_token_accuracy": 0.6419193595647812, + "num_tokens": 2392871214.0, + "step": 14262 + }, + { + "entropy": 1.7498057583967845, + "epoch": 1.5668616626843537, + "grad_norm": 0.7248362302780151, + "learning_rate": 4.125997653833742e-06, + "loss": 1.4285, + "mean_token_accuracy": 0.6467612981796265, + "num_tokens": 2393024951.0, + "step": 14263 + }, + { + "entropy": 1.6748530566692352, + "epoch": 1.5669715195957266, + "grad_norm": 0.6871564388275146, + "learning_rate": 4.124964267626344e-06, + "loss": 1.3883, + "mean_token_accuracy": 0.657158151268959, + "num_tokens": 2393157081.0, + "step": 14264 + }, + { + "entropy": 1.7383250097433727, + "epoch": 1.5670813765070997, + "grad_norm": 0.7244443893432617, + "learning_rate": 4.123931099010731e-06, + "loss": 1.475, + "mean_token_accuracy": 0.648836076259613, + "num_tokens": 2393358853.0, + "step": 14265 + }, + { + "entropy": 1.7299003303050995, + "epoch": 1.5671912334184723, + "grad_norm": 0.608918309211731, + "learning_rate": 4.1228981480196e-06, + "loss": 1.4105, + "mean_token_accuracy": 0.6505444248517355, + "num_tokens": 2393537058.0, + "step": 14266 + }, + { + "entropy": 1.768057684103648, + "epoch": 1.5673010903298454, + "grad_norm": 0.6040147542953491, + "learning_rate": 4.121865414685641e-06, + "loss": 1.35, + "mean_token_accuracy": 0.6549276908238729, + "num_tokens": 2393697269.0, + "step": 14267 + }, + { + "entropy": 1.7645529806613922, + "epoch": 1.5674109472412183, + "grad_norm": 0.7420253753662109, + "learning_rate": 4.120832899041542e-06, + "loss": 1.3501, + "mean_token_accuracy": 0.6557344893614451, + "num_tokens": 2393879833.0, + "step": 14268 + }, + { + "entropy": 1.685241311788559, + "epoch": 1.5675208041525912, + "grad_norm": 0.6206194758415222, + "learning_rate": 4.1198006011199855e-06, + "loss": 1.3303, + "mean_token_accuracy": 0.6640505194664001, + "num_tokens": 2394034031.0, + "step": 14269 + }, + { + "entropy": 1.664696882168452, + "epoch": 1.5676306610639643, + "grad_norm": 0.6417528986930847, + "learning_rate": 4.118768520953638e-06, + "loss": 1.2617, + "mean_token_accuracy": 0.6767093986272812, + "num_tokens": 2394158033.0, + "step": 14270 + }, + { + "entropy": 1.6648876368999481, + "epoch": 1.567740517975337, + "grad_norm": 0.6877491474151611, + "learning_rate": 4.117736658575165e-06, + "loss": 1.2169, + "mean_token_accuracy": 0.6888188421726227, + "num_tokens": 2394306056.0, + "step": 14271 + }, + { + "entropy": 1.7249459822972615, + "epoch": 1.56785037488671, + "grad_norm": 0.6589949131011963, + "learning_rate": 4.116705014017229e-06, + "loss": 1.3756, + "mean_token_accuracy": 0.6467818568150202, + "num_tokens": 2394459499.0, + "step": 14272 + }, + { + "entropy": 1.6618477304776509, + "epoch": 1.567960231798083, + "grad_norm": 0.7302327156066895, + "learning_rate": 4.115673587312476e-06, + "loss": 1.3993, + "mean_token_accuracy": 0.6579805115858713, + "num_tokens": 2394645406.0, + "step": 14273 + }, + { + "entropy": 1.7453274031480153, + "epoch": 1.5680700887094559, + "grad_norm": 0.6717466115951538, + "learning_rate": 4.114642378493549e-06, + "loss": 1.2956, + "mean_token_accuracy": 0.6660959323247274, + "num_tokens": 2394770158.0, + "step": 14274 + }, + { + "entropy": 1.7191137572129567, + "epoch": 1.568179945620829, + "grad_norm": 0.6410926580429077, + "learning_rate": 4.113611387593091e-06, + "loss": 1.3987, + "mean_token_accuracy": 0.6500120759010315, + "num_tokens": 2394936523.0, + "step": 14275 + }, + { + "entropy": 1.7772869765758514, + "epoch": 1.5682898025322018, + "grad_norm": 0.72120600938797, + "learning_rate": 4.1125806146437285e-06, + "loss": 1.4901, + "mean_token_accuracy": 0.6559230337540308, + "num_tokens": 2395116288.0, + "step": 14276 + }, + { + "entropy": 1.7197218139966328, + "epoch": 1.5683996594435747, + "grad_norm": 0.5482208132743835, + "learning_rate": 4.111550059678087e-06, + "loss": 1.5672, + "mean_token_accuracy": 0.6186061501502991, + "num_tokens": 2395375622.0, + "step": 14277 + }, + { + "entropy": 1.717966765165329, + "epoch": 1.5685095163549478, + "grad_norm": 0.787023663520813, + "learning_rate": 4.110519722728782e-06, + "loss": 1.2755, + "mean_token_accuracy": 0.6763796657323837, + "num_tokens": 2395509047.0, + "step": 14278 + }, + { + "entropy": 1.7471245626608531, + "epoch": 1.5686193732663205, + "grad_norm": 1.0187658071517944, + "learning_rate": 4.109489603828422e-06, + "loss": 1.2793, + "mean_token_accuracy": 0.6738806962966919, + "num_tokens": 2395635521.0, + "step": 14279 + }, + { + "entropy": 1.6846247414747875, + "epoch": 1.5687292301776936, + "grad_norm": 0.8501124382019043, + "learning_rate": 4.10845970300961e-06, + "loss": 1.3052, + "mean_token_accuracy": 0.6631153573592504, + "num_tokens": 2395770509.0, + "step": 14280 + }, + { + "entropy": 1.7312610546747844, + "epoch": 1.5688390870890665, + "grad_norm": 0.7959895133972168, + "learning_rate": 4.107430020304945e-06, + "loss": 1.5674, + "mean_token_accuracy": 0.6618924016753832, + "num_tokens": 2395923852.0, + "step": 14281 + }, + { + "entropy": 1.7540445923805237, + "epoch": 1.5689489440004394, + "grad_norm": 0.7842367887496948, + "learning_rate": 4.106400555747015e-06, + "loss": 1.2894, + "mean_token_accuracy": 0.6724207550287247, + "num_tokens": 2396076155.0, + "step": 14282 + }, + { + "entropy": 1.7668162484963734, + "epoch": 1.5690588009118125, + "grad_norm": 0.6241871118545532, + "learning_rate": 4.105371309368399e-06, + "loss": 1.3579, + "mean_token_accuracy": 0.6569240589936575, + "num_tokens": 2396220925.0, + "step": 14283 + }, + { + "entropy": 1.6470149258772533, + "epoch": 1.5691686578231852, + "grad_norm": 0.6123558878898621, + "learning_rate": 4.104342281201676e-06, + "loss": 1.3126, + "mean_token_accuracy": 0.6688317805528641, + "num_tokens": 2396398381.0, + "step": 14284 + }, + { + "entropy": 1.7287775576114655, + "epoch": 1.5692785147345583, + "grad_norm": 0.6877973675727844, + "learning_rate": 4.103313471279413e-06, + "loss": 1.3715, + "mean_token_accuracy": 0.6662652442852656, + "num_tokens": 2396575584.0, + "step": 14285 + }, + { + "entropy": 1.6712758839130402, + "epoch": 1.5693883716459311, + "grad_norm": 0.6240174174308777, + "learning_rate": 4.102284879634167e-06, + "loss": 1.3257, + "mean_token_accuracy": 0.6696380823850632, + "num_tokens": 2396771081.0, + "step": 14286 + }, + { + "entropy": 1.6926849484443665, + "epoch": 1.569498228557304, + "grad_norm": 0.7340817451477051, + "learning_rate": 4.1012565062985e-06, + "loss": 1.5994, + "mean_token_accuracy": 0.620560958981514, + "num_tokens": 2397011942.0, + "step": 14287 + }, + { + "entropy": 1.6864181657632191, + "epoch": 1.5696080854686771, + "grad_norm": 0.6771414875984192, + "learning_rate": 4.100228351304954e-06, + "loss": 1.3305, + "mean_token_accuracy": 0.6688184440135956, + "num_tokens": 2397167841.0, + "step": 14288 + }, + { + "entropy": 1.691802740097046, + "epoch": 1.56971794238005, + "grad_norm": 0.9737116098403931, + "learning_rate": 4.0992004146860735e-06, + "loss": 1.4348, + "mean_token_accuracy": 0.6667732695738474, + "num_tokens": 2397303533.0, + "step": 14289 + }, + { + "entropy": 1.6837505499521892, + "epoch": 1.569827799291423, + "grad_norm": 0.6902137994766235, + "learning_rate": 4.098172696474389e-06, + "loss": 1.3924, + "mean_token_accuracy": 0.6759957373142242, + "num_tokens": 2397479982.0, + "step": 14290 + }, + { + "entropy": 1.713492174943288, + "epoch": 1.569937656202796, + "grad_norm": 0.6719781160354614, + "learning_rate": 4.097145196702429e-06, + "loss": 1.3084, + "mean_token_accuracy": 0.664943128824234, + "num_tokens": 2397660597.0, + "step": 14291 + }, + { + "entropy": 1.714007943868637, + "epoch": 1.5700475131141687, + "grad_norm": 0.7276617884635925, + "learning_rate": 4.096117915402711e-06, + "loss": 1.4587, + "mean_token_accuracy": 0.642642746369044, + "num_tokens": 2397840863.0, + "step": 14292 + }, + { + "entropy": 1.6744861801465352, + "epoch": 1.5701573700255418, + "grad_norm": 0.7124812006950378, + "learning_rate": 4.095090852607753e-06, + "loss": 1.6228, + "mean_token_accuracy": 0.6388173699378967, + "num_tokens": 2398021773.0, + "step": 14293 + }, + { + "entropy": 1.6718362669150035, + "epoch": 1.5702672269369147, + "grad_norm": 0.6724779009819031, + "learning_rate": 4.094064008350059e-06, + "loss": 1.321, + "mean_token_accuracy": 0.6795742710431417, + "num_tokens": 2398160853.0, + "step": 14294 + }, + { + "entropy": 1.7398958404858906, + "epoch": 1.5703770838482876, + "grad_norm": 0.611190140247345, + "learning_rate": 4.093037382662123e-06, + "loss": 1.3787, + "mean_token_accuracy": 0.6572253008683523, + "num_tokens": 2398302307.0, + "step": 14295 + }, + { + "entropy": 1.6891403396924336, + "epoch": 1.5704869407596607, + "grad_norm": 0.6904042363166809, + "learning_rate": 4.0920109755764445e-06, + "loss": 1.2991, + "mean_token_accuracy": 0.6698387066523234, + "num_tokens": 2398439925.0, + "step": 14296 + }, + { + "entropy": 1.6572466492652893, + "epoch": 1.5705967976710333, + "grad_norm": 0.7035029530525208, + "learning_rate": 4.090984787125506e-06, + "loss": 1.3059, + "mean_token_accuracy": 0.6694705088933309, + "num_tokens": 2398606382.0, + "step": 14297 + }, + { + "entropy": 1.7408252656459808, + "epoch": 1.5707066545824064, + "grad_norm": 0.7190991640090942, + "learning_rate": 4.089958817341783e-06, + "loss": 1.4693, + "mean_token_accuracy": 0.6496324787537257, + "num_tokens": 2398775722.0, + "step": 14298 + }, + { + "entropy": 1.703038364648819, + "epoch": 1.5708165114937793, + "grad_norm": 0.7542386651039124, + "learning_rate": 4.088933066257753e-06, + "loss": 1.2668, + "mean_token_accuracy": 0.6750974357128143, + "num_tokens": 2398914013.0, + "step": 14299 + }, + { + "entropy": 1.6845628917217255, + "epoch": 1.5709263684051522, + "grad_norm": 0.6732815504074097, + "learning_rate": 4.087907533905874e-06, + "loss": 1.3025, + "mean_token_accuracy": 0.6741761565208435, + "num_tokens": 2399094931.0, + "step": 14300 + }, + { + "entropy": 1.7796473304430644, + "epoch": 1.5710362253165253, + "grad_norm": 0.6704933047294617, + "learning_rate": 4.08688222031861e-06, + "loss": 1.4761, + "mean_token_accuracy": 0.6373851150274277, + "num_tokens": 2399264259.0, + "step": 14301 + }, + { + "entropy": 1.7238643169403076, + "epoch": 1.5711460822278982, + "grad_norm": 0.8131667971611023, + "learning_rate": 4.0858571255284075e-06, + "loss": 1.5472, + "mean_token_accuracy": 0.6483301371335983, + "num_tokens": 2399417678.0, + "step": 14302 + }, + { + "entropy": 1.670019308725993, + "epoch": 1.571255939139271, + "grad_norm": 0.6241254210472107, + "learning_rate": 4.084832249567709e-06, + "loss": 1.497, + "mean_token_accuracy": 0.641557107369105, + "num_tokens": 2399615093.0, + "step": 14303 + }, + { + "entropy": 1.7172284523646038, + "epoch": 1.5713657960506442, + "grad_norm": 0.6693923473358154, + "learning_rate": 4.083807592468956e-06, + "loss": 1.2975, + "mean_token_accuracy": 0.6784742772579193, + "num_tokens": 2399759030.0, + "step": 14304 + }, + { + "entropy": 1.7683460513750713, + "epoch": 1.5714756529620169, + "grad_norm": 0.8016403317451477, + "learning_rate": 4.0827831542645764e-06, + "loss": 1.584, + "mean_token_accuracy": 0.6237875620524088, + "num_tokens": 2400010908.0, + "step": 14305 + }, + { + "entropy": 1.669872482617696, + "epoch": 1.57158550987339, + "grad_norm": 0.6497310996055603, + "learning_rate": 4.081758934986993e-06, + "loss": 1.3014, + "mean_token_accuracy": 0.673602357506752, + "num_tokens": 2400130678.0, + "step": 14306 + }, + { + "entropy": 1.7502439816792805, + "epoch": 1.5716953667847628, + "grad_norm": 0.8661501407623291, + "learning_rate": 4.08073493466862e-06, + "loss": 1.4551, + "mean_token_accuracy": 0.6611292113860449, + "num_tokens": 2400270119.0, + "step": 14307 + }, + { + "entropy": 1.7579331596692402, + "epoch": 1.5718052236961357, + "grad_norm": 0.7373813390731812, + "learning_rate": 4.079711153341871e-06, + "loss": 1.2837, + "mean_token_accuracy": 0.6710260063409805, + "num_tokens": 2400381408.0, + "step": 14308 + }, + { + "entropy": 1.7175275286038716, + "epoch": 1.5719150806075088, + "grad_norm": 0.7899195551872253, + "learning_rate": 4.078687591039146e-06, + "loss": 1.4791, + "mean_token_accuracy": 0.6414479861656824, + "num_tokens": 2400555300.0, + "step": 14309 + }, + { + "entropy": 1.7731333871682484, + "epoch": 1.5720249375188815, + "grad_norm": 0.7760249972343445, + "learning_rate": 4.077664247792838e-06, + "loss": 1.5491, + "mean_token_accuracy": 0.6391358077526093, + "num_tokens": 2400692168.0, + "step": 14310 + }, + { + "entropy": 1.658635934193929, + "epoch": 1.5721347944302546, + "grad_norm": 0.7568308115005493, + "learning_rate": 4.076641123635338e-06, + "loss": 1.5812, + "mean_token_accuracy": 0.6311882634957632, + "num_tokens": 2400892665.0, + "step": 14311 + }, + { + "entropy": 1.7353158593177795, + "epoch": 1.5722446513416275, + "grad_norm": 0.7283448576927185, + "learning_rate": 4.0756182185990245e-06, + "loss": 1.3225, + "mean_token_accuracy": 0.6635488321383795, + "num_tokens": 2401050516.0, + "step": 14312 + }, + { + "entropy": 1.605364441871643, + "epoch": 1.5723545082530004, + "grad_norm": 0.7137781977653503, + "learning_rate": 4.0745955327162775e-06, + "loss": 1.4384, + "mean_token_accuracy": 0.6605170965194702, + "num_tokens": 2401223303.0, + "step": 14313 + }, + { + "entropy": 1.7696949640909831, + "epoch": 1.5724643651643735, + "grad_norm": 0.7161397933959961, + "learning_rate": 4.073573066019461e-06, + "loss": 1.5051, + "mean_token_accuracy": 0.6313910136620203, + "num_tokens": 2401398847.0, + "step": 14314 + }, + { + "entropy": 1.7221232950687408, + "epoch": 1.5725742220757464, + "grad_norm": 0.6790109276771545, + "learning_rate": 4.072550818540934e-06, + "loss": 1.3352, + "mean_token_accuracy": 0.6639846116304398, + "num_tokens": 2401545939.0, + "step": 14315 + }, + { + "entropy": 1.7193631132443745, + "epoch": 1.5726840789871193, + "grad_norm": 0.6720796227455139, + "learning_rate": 4.071528790313049e-06, + "loss": 1.4618, + "mean_token_accuracy": 0.6459654122591019, + "num_tokens": 2401728338.0, + "step": 14316 + }, + { + "entropy": 1.6730712354183197, + "epoch": 1.5727939358984924, + "grad_norm": 0.7082514762878418, + "learning_rate": 4.070506981368164e-06, + "loss": 1.4599, + "mean_token_accuracy": 0.6565167158842087, + "num_tokens": 2401904602.0, + "step": 14317 + }, + { + "entropy": 1.7497392197450001, + "epoch": 1.572903792809865, + "grad_norm": 0.7780535817146301, + "learning_rate": 4.069485391738605e-06, + "loss": 1.4053, + "mean_token_accuracy": 0.6494750926891962, + "num_tokens": 2402107235.0, + "step": 14318 + }, + { + "entropy": 1.6548576653003693, + "epoch": 1.5730136497212381, + "grad_norm": 0.6952147483825684, + "learning_rate": 4.068464021456709e-06, + "loss": 1.426, + "mean_token_accuracy": 0.6591857820749283, + "num_tokens": 2402300956.0, + "step": 14319 + }, + { + "entropy": 1.6779307921727498, + "epoch": 1.573123506632611, + "grad_norm": 0.7041146159172058, + "learning_rate": 4.0674428705548075e-06, + "loss": 1.3681, + "mean_token_accuracy": 0.6691206991672516, + "num_tokens": 2402446715.0, + "step": 14320 + }, + { + "entropy": 1.6910231411457062, + "epoch": 1.573233363543984, + "grad_norm": 0.5578975081443787, + "learning_rate": 4.0664219390652146e-06, + "loss": 1.407, + "mean_token_accuracy": 0.6494897405306498, + "num_tokens": 2402638395.0, + "step": 14321 + }, + { + "entropy": 1.6795838276545207, + "epoch": 1.573343220455357, + "grad_norm": 0.6371523141860962, + "learning_rate": 4.065401227020243e-06, + "loss": 1.3769, + "mean_token_accuracy": 0.6553529649972916, + "num_tokens": 2402777742.0, + "step": 14322 + }, + { + "entropy": 1.6951970160007477, + "epoch": 1.5734530773667297, + "grad_norm": 0.7004981637001038, + "learning_rate": 4.064380734452195e-06, + "loss": 1.1673, + "mean_token_accuracy": 0.687474250793457, + "num_tokens": 2402925563.0, + "step": 14323 + }, + { + "entropy": 1.6712155938148499, + "epoch": 1.5735629342781028, + "grad_norm": 0.7162665128707886, + "learning_rate": 4.06336046139337e-06, + "loss": 1.3009, + "mean_token_accuracy": 0.6811398416757584, + "num_tokens": 2403103844.0, + "step": 14324 + }, + { + "entropy": 1.5985010763009389, + "epoch": 1.5736727911894757, + "grad_norm": 0.6248446702957153, + "learning_rate": 4.062340407876066e-06, + "loss": 1.1624, + "mean_token_accuracy": 0.6961935559908549, + "num_tokens": 2403284801.0, + "step": 14325 + }, + { + "entropy": 1.699522962172826, + "epoch": 1.5737826481008486, + "grad_norm": 0.7275028228759766, + "learning_rate": 4.06132057393256e-06, + "loss": 1.5205, + "mean_token_accuracy": 0.6402701983849207, + "num_tokens": 2403451049.0, + "step": 14326 + }, + { + "entropy": 1.717777858177821, + "epoch": 1.5738925050122217, + "grad_norm": 0.6977814435958862, + "learning_rate": 4.060300959595129e-06, + "loss": 1.3162, + "mean_token_accuracy": 0.6749143203099569, + "num_tokens": 2403611909.0, + "step": 14327 + }, + { + "entropy": 1.7565401196479797, + "epoch": 1.5740023619235946, + "grad_norm": 0.7143315076828003, + "learning_rate": 4.059281564896049e-06, + "loss": 1.4808, + "mean_token_accuracy": 0.6375414083401362, + "num_tokens": 2403777557.0, + "step": 14328 + }, + { + "entropy": 1.7590028643608093, + "epoch": 1.5741122188349674, + "grad_norm": 0.7009501457214355, + "learning_rate": 4.058262389867579e-06, + "loss": 1.4635, + "mean_token_accuracy": 0.6421099056800207, + "num_tokens": 2403952220.0, + "step": 14329 + }, + { + "entropy": 1.7278287311395009, + "epoch": 1.5742220757463405, + "grad_norm": 0.6733378767967224, + "learning_rate": 4.0572434345419746e-06, + "loss": 1.5208, + "mean_token_accuracy": 0.6346190224091212, + "num_tokens": 2404211887.0, + "step": 14330 + }, + { + "entropy": 1.7005733052889507, + "epoch": 1.5743319326577132, + "grad_norm": 0.6368502974510193, + "learning_rate": 4.056224698951489e-06, + "loss": 1.2437, + "mean_token_accuracy": 0.6910210798184077, + "num_tokens": 2404370898.0, + "step": 14331 + }, + { + "entropy": 1.7627998689810436, + "epoch": 1.5744417895690863, + "grad_norm": 0.6787126660346985, + "learning_rate": 4.055206183128359e-06, + "loss": 1.4583, + "mean_token_accuracy": 0.645544116695722, + "num_tokens": 2404538240.0, + "step": 14332 + }, + { + "entropy": 1.729688048362732, + "epoch": 1.5745516464804592, + "grad_norm": 0.7091361284255981, + "learning_rate": 4.054187887104829e-06, + "loss": 1.4387, + "mean_token_accuracy": 0.6646422247091929, + "num_tokens": 2404674194.0, + "step": 14333 + }, + { + "entropy": 1.753710041443507, + "epoch": 1.574661503391832, + "grad_norm": 0.9322096705436707, + "learning_rate": 4.053169810913121e-06, + "loss": 1.5747, + "mean_token_accuracy": 0.6433406124512354, + "num_tokens": 2404827461.0, + "step": 14334 + }, + { + "entropy": 1.7040641208489735, + "epoch": 1.5747713603032052, + "grad_norm": 0.6148183941841125, + "learning_rate": 4.0521519545854555e-06, + "loss": 1.4987, + "mean_token_accuracy": 0.6388400246699651, + "num_tokens": 2405061087.0, + "step": 14335 + }, + { + "entropy": 1.7134417394797008, + "epoch": 1.5748812172145779, + "grad_norm": 0.779222846031189, + "learning_rate": 4.051134318154049e-06, + "loss": 1.5683, + "mean_token_accuracy": 0.6611418028672537, + "num_tokens": 2405251828.0, + "step": 14336 + }, + { + "entropy": 1.7143574953079224, + "epoch": 1.574991074125951, + "grad_norm": 0.6745566129684448, + "learning_rate": 4.050116901651116e-06, + "loss": 1.4113, + "mean_token_accuracy": 0.6613113085428873, + "num_tokens": 2405407958.0, + "step": 14337 + }, + { + "entropy": 1.7491061985492706, + "epoch": 1.5751009310373238, + "grad_norm": 0.8248865604400635, + "learning_rate": 4.049099705108849e-06, + "loss": 1.2324, + "mean_token_accuracy": 0.6748235374689102, + "num_tokens": 2405559259.0, + "step": 14338 + }, + { + "entropy": 1.752279927333196, + "epoch": 1.5752107879486967, + "grad_norm": 0.6685039401054382, + "learning_rate": 4.048082728559441e-06, + "loss": 1.4519, + "mean_token_accuracy": 0.6412715241312981, + "num_tokens": 2405734315.0, + "step": 14339 + }, + { + "entropy": 1.680857280890147, + "epoch": 1.5753206448600698, + "grad_norm": 0.8184103965759277, + "learning_rate": 4.047065972035085e-06, + "loss": 1.3274, + "mean_token_accuracy": 0.6744781285524368, + "num_tokens": 2405872010.0, + "step": 14340 + }, + { + "entropy": 1.691278914610545, + "epoch": 1.5754305017714427, + "grad_norm": 0.6790178418159485, + "learning_rate": 4.046049435567959e-06, + "loss": 1.3451, + "mean_token_accuracy": 0.6628639151652654, + "num_tokens": 2406022115.0, + "step": 14341 + }, + { + "entropy": 1.6819725433985393, + "epoch": 1.5755403586828156, + "grad_norm": 0.6255328059196472, + "learning_rate": 4.0450331191902315e-06, + "loss": 1.2874, + "mean_token_accuracy": 0.6705329616864523, + "num_tokens": 2406169222.0, + "step": 14342 + }, + { + "entropy": 1.6964583198229473, + "epoch": 1.5756502155941887, + "grad_norm": 0.7723869681358337, + "learning_rate": 4.044017022934074e-06, + "loss": 1.336, + "mean_token_accuracy": 0.6657444735368093, + "num_tokens": 2406304501.0, + "step": 14343 + }, + { + "entropy": 1.6860394378503163, + "epoch": 1.5757600725055614, + "grad_norm": 0.6104413270950317, + "learning_rate": 4.043001146831642e-06, + "loss": 1.4501, + "mean_token_accuracy": 0.6295960744222006, + "num_tokens": 2406502733.0, + "step": 14344 + }, + { + "entropy": 1.718522051970164, + "epoch": 1.5758699294169345, + "grad_norm": 0.7444132566452026, + "learning_rate": 4.0419854909150905e-06, + "loss": 1.4821, + "mean_token_accuracy": 0.6521776219209036, + "num_tokens": 2406665496.0, + "step": 14345 + }, + { + "entropy": 1.7012490928173065, + "epoch": 1.5759797863283074, + "grad_norm": 0.6309324502944946, + "learning_rate": 4.040970055216562e-06, + "loss": 1.3502, + "mean_token_accuracy": 0.6527341256539027, + "num_tokens": 2406863488.0, + "step": 14346 + }, + { + "entropy": 1.6834677159786224, + "epoch": 1.5760896432396803, + "grad_norm": 0.6865068078041077, + "learning_rate": 4.039954839768194e-06, + "loss": 1.4619, + "mean_token_accuracy": 0.6443218390146891, + "num_tokens": 2407018418.0, + "step": 14347 + }, + { + "entropy": 1.6774198611577351, + "epoch": 1.5761995001510534, + "grad_norm": 0.6005121469497681, + "learning_rate": 4.038939844602119e-06, + "loss": 1.3968, + "mean_token_accuracy": 0.6480477452278137, + "num_tokens": 2407171807.0, + "step": 14348 + }, + { + "entropy": 1.6578874389330547, + "epoch": 1.576309357062426, + "grad_norm": 0.6394762992858887, + "learning_rate": 4.0379250697504645e-06, + "loss": 1.2588, + "mean_token_accuracy": 0.6768279870351156, + "num_tokens": 2407288051.0, + "step": 14349 + }, + { + "entropy": 1.6758925318717957, + "epoch": 1.5764192139737991, + "grad_norm": 0.6489967107772827, + "learning_rate": 4.036910515245343e-06, + "loss": 1.3984, + "mean_token_accuracy": 0.6536735345919927, + "num_tokens": 2407473963.0, + "step": 14350 + }, + { + "entropy": 1.7418983777364094, + "epoch": 1.576529070885172, + "grad_norm": 0.6643052697181702, + "learning_rate": 4.0358961811188635e-06, + "loss": 1.5226, + "mean_token_accuracy": 0.6297204593817393, + "num_tokens": 2407657529.0, + "step": 14351 + }, + { + "entropy": 1.6687126159667969, + "epoch": 1.576638927796545, + "grad_norm": 0.6805570721626282, + "learning_rate": 4.034882067403135e-06, + "loss": 1.4111, + "mean_token_accuracy": 0.6512214044729868, + "num_tokens": 2407830944.0, + "step": 14352 + }, + { + "entropy": 1.6523280044396718, + "epoch": 1.576748784707918, + "grad_norm": 0.6789618134498596, + "learning_rate": 4.0338681741302495e-06, + "loss": 1.4588, + "mean_token_accuracy": 0.6637892872095108, + "num_tokens": 2407974966.0, + "step": 14353 + }, + { + "entropy": 1.6389523049195607, + "epoch": 1.576858641619291, + "grad_norm": 0.6752464771270752, + "learning_rate": 4.032854501332297e-06, + "loss": 1.408, + "mean_token_accuracy": 0.6546342919270197, + "num_tokens": 2408137262.0, + "step": 14354 + }, + { + "entropy": 1.7064806123574574, + "epoch": 1.5769684985306638, + "grad_norm": 0.5836341381072998, + "learning_rate": 4.031841049041361e-06, + "loss": 1.3355, + "mean_token_accuracy": 0.6571672906478246, + "num_tokens": 2408339160.0, + "step": 14355 + }, + { + "entropy": 1.7815323770046234, + "epoch": 1.577078355442037, + "grad_norm": 0.7685762643814087, + "learning_rate": 4.030827817289513e-06, + "loss": 1.372, + "mean_token_accuracy": 0.6627372950315475, + "num_tokens": 2408471295.0, + "step": 14356 + }, + { + "entropy": 1.7619872987270355, + "epoch": 1.5771882123534096, + "grad_norm": 0.7153518795967102, + "learning_rate": 4.029814806108827e-06, + "loss": 1.4392, + "mean_token_accuracy": 0.636270801226298, + "num_tokens": 2408634566.0, + "step": 14357 + }, + { + "entropy": 1.6729972461859386, + "epoch": 1.5772980692647827, + "grad_norm": 0.6680687069892883, + "learning_rate": 4.028802015531362e-06, + "loss": 1.3128, + "mean_token_accuracy": 0.6736362675825754, + "num_tokens": 2408765313.0, + "step": 14358 + }, + { + "entropy": 1.7162837485472362, + "epoch": 1.5774079261761556, + "grad_norm": 0.7068792581558228, + "learning_rate": 4.027789445589169e-06, + "loss": 1.3913, + "mean_token_accuracy": 0.6578982969125112, + "num_tokens": 2408929681.0, + "step": 14359 + }, + { + "entropy": 1.6619195342063904, + "epoch": 1.5775177830875284, + "grad_norm": 0.6548637747764587, + "learning_rate": 4.026777096314298e-06, + "loss": 1.3688, + "mean_token_accuracy": 0.6490531514088312, + "num_tokens": 2409119354.0, + "step": 14360 + }, + { + "entropy": 1.6902973055839539, + "epoch": 1.5776276399989015, + "grad_norm": 0.7334201335906982, + "learning_rate": 4.0257649677387924e-06, + "loss": 1.6172, + "mean_token_accuracy": 0.6320002973079681, + "num_tokens": 2409327791.0, + "step": 14361 + }, + { + "entropy": 1.7276590665181477, + "epoch": 1.5777374969102742, + "grad_norm": 0.8310177326202393, + "learning_rate": 4.024753059894683e-06, + "loss": 1.5126, + "mean_token_accuracy": 0.6533468067646027, + "num_tokens": 2409464553.0, + "step": 14362 + }, + { + "entropy": 1.68667929371198, + "epoch": 1.5778473538216473, + "grad_norm": 0.5872757434844971, + "learning_rate": 4.023741372813994e-06, + "loss": 1.4306, + "mean_token_accuracy": 0.6479167540868124, + "num_tokens": 2409673452.0, + "step": 14363 + }, + { + "entropy": 1.6317294637362163, + "epoch": 1.5779572107330202, + "grad_norm": 0.705324113368988, + "learning_rate": 4.02272990652875e-06, + "loss": 1.4657, + "mean_token_accuracy": 0.669963558514913, + "num_tokens": 2409848784.0, + "step": 14364 + }, + { + "entropy": 1.6726927657922108, + "epoch": 1.578067067644393, + "grad_norm": 0.6846646070480347, + "learning_rate": 4.021718661070959e-06, + "loss": 1.3991, + "mean_token_accuracy": 0.657963847120603, + "num_tokens": 2410027810.0, + "step": 14365 + }, + { + "entropy": 1.6807039578755696, + "epoch": 1.5781769245557662, + "grad_norm": 0.634922981262207, + "learning_rate": 4.020707636472626e-06, + "loss": 1.411, + "mean_token_accuracy": 0.6643590877453486, + "num_tokens": 2410191106.0, + "step": 14366 + }, + { + "entropy": 1.690766880909602, + "epoch": 1.578286781467139, + "grad_norm": 0.6886934638023376, + "learning_rate": 4.019696832765755e-06, + "loss": 1.3997, + "mean_token_accuracy": 0.6652351021766663, + "num_tokens": 2410352396.0, + "step": 14367 + }, + { + "entropy": 1.696602702140808, + "epoch": 1.578396638378512, + "grad_norm": 0.7700260877609253, + "learning_rate": 4.01868624998233e-06, + "loss": 1.3986, + "mean_token_accuracy": 0.654128318031629, + "num_tokens": 2410524627.0, + "step": 14368 + }, + { + "entropy": 1.6783056855201721, + "epoch": 1.578506495289885, + "grad_norm": 0.7106050252914429, + "learning_rate": 4.017675888154341e-06, + "loss": 1.4447, + "mean_token_accuracy": 0.6530628601710001, + "num_tokens": 2410671042.0, + "step": 14369 + }, + { + "entropy": 1.6856454213460286, + "epoch": 1.5786163522012577, + "grad_norm": 0.5910614132881165, + "learning_rate": 4.016665747313765e-06, + "loss": 1.4043, + "mean_token_accuracy": 0.6535949011643728, + "num_tokens": 2410840404.0, + "step": 14370 + }, + { + "entropy": 1.681627740462621, + "epoch": 1.5787262091126308, + "grad_norm": 0.5946781635284424, + "learning_rate": 4.0156558274925695e-06, + "loss": 1.3597, + "mean_token_accuracy": 0.6630496780077616, + "num_tokens": 2410999756.0, + "step": 14371 + }, + { + "entropy": 1.6094493865966797, + "epoch": 1.5788360660240037, + "grad_norm": 0.8007654547691345, + "learning_rate": 4.014646128722719e-06, + "loss": 1.2379, + "mean_token_accuracy": 0.6704281121492386, + "num_tokens": 2411138576.0, + "step": 14372 + }, + { + "entropy": 1.7133605281511943, + "epoch": 1.5789459229353766, + "grad_norm": 0.6362724900245667, + "learning_rate": 4.0136366510361735e-06, + "loss": 1.4868, + "mean_token_accuracy": 0.6381375938653946, + "num_tokens": 2411382890.0, + "step": 14373 + }, + { + "entropy": 1.8057750562826793, + "epoch": 1.5790557798467497, + "grad_norm": 0.7414250373840332, + "learning_rate": 4.01262739446488e-06, + "loss": 1.3424, + "mean_token_accuracy": 0.6564290225505829, + "num_tokens": 2411520175.0, + "step": 14374 + }, + { + "entropy": 1.688672701517741, + "epoch": 1.5791656367581224, + "grad_norm": 0.5574719905853271, + "learning_rate": 4.011618359040778e-06, + "loss": 1.3622, + "mean_token_accuracy": 0.6569213171799978, + "num_tokens": 2411699169.0, + "step": 14375 + }, + { + "entropy": 1.753764549891154, + "epoch": 1.5792754936694955, + "grad_norm": 0.6876077651977539, + "learning_rate": 4.010609544795808e-06, + "loss": 1.472, + "mean_token_accuracy": 0.6483410596847534, + "num_tokens": 2411855045.0, + "step": 14376 + }, + { + "entropy": 1.6760170062383015, + "epoch": 1.5793853505808684, + "grad_norm": 0.670947790145874, + "learning_rate": 4.009600951761896e-06, + "loss": 1.2907, + "mean_token_accuracy": 0.6729481816291809, + "num_tokens": 2412065147.0, + "step": 14377 + }, + { + "entropy": 1.756626029809316, + "epoch": 1.5794952074922413, + "grad_norm": 0.7405815124511719, + "learning_rate": 4.0085925799709635e-06, + "loss": 1.4296, + "mean_token_accuracy": 0.6655129392941793, + "num_tokens": 2412191329.0, + "step": 14378 + }, + { + "entropy": 1.735265185435613, + "epoch": 1.5796050644036144, + "grad_norm": 0.7957320213317871, + "learning_rate": 4.007584429454927e-06, + "loss": 1.2667, + "mean_token_accuracy": 0.6745762477318445, + "num_tokens": 2412318330.0, + "step": 14379 + }, + { + "entropy": 1.7310957809289296, + "epoch": 1.5797149213149873, + "grad_norm": 0.639903724193573, + "learning_rate": 4.006576500245689e-06, + "loss": 1.452, + "mean_token_accuracy": 0.6442805677652359, + "num_tokens": 2412496380.0, + "step": 14380 + }, + { + "entropy": 1.7322147190570831, + "epoch": 1.5798247782263601, + "grad_norm": 0.6460077166557312, + "learning_rate": 4.005568792375157e-06, + "loss": 1.4793, + "mean_token_accuracy": 0.6409311791261038, + "num_tokens": 2412670508.0, + "step": 14381 + }, + { + "entropy": 1.6922193666299183, + "epoch": 1.5799346351377332, + "grad_norm": 0.6333412528038025, + "learning_rate": 4.004561305875221e-06, + "loss": 1.2359, + "mean_token_accuracy": 0.6793088068564733, + "num_tokens": 2412810294.0, + "step": 14382 + }, + { + "entropy": 1.7221355736255646, + "epoch": 1.580044492049106, + "grad_norm": 0.6996757388114929, + "learning_rate": 4.003554040777765e-06, + "loss": 1.5011, + "mean_token_accuracy": 0.6347163567940394, + "num_tokens": 2413041747.0, + "step": 14383 + }, + { + "entropy": 1.6786732574303944, + "epoch": 1.580154348960479, + "grad_norm": 0.6720352172851562, + "learning_rate": 4.0025469971146725e-06, + "loss": 1.4039, + "mean_token_accuracy": 0.6694711993137995, + "num_tokens": 2413203259.0, + "step": 14384 + }, + { + "entropy": 1.7239131430784862, + "epoch": 1.580264205871852, + "grad_norm": 0.6879767775535583, + "learning_rate": 4.001540174917813e-06, + "loss": 1.4192, + "mean_token_accuracy": 0.6383152256409327, + "num_tokens": 2413404931.0, + "step": 14385 + }, + { + "entropy": 1.679287075996399, + "epoch": 1.5803740627832248, + "grad_norm": 0.8257265686988831, + "learning_rate": 4.0005335742190555e-06, + "loss": 1.2133, + "mean_token_accuracy": 0.6854179451862971, + "num_tokens": 2413542889.0, + "step": 14386 + }, + { + "entropy": 1.6591391563415527, + "epoch": 1.580483919694598, + "grad_norm": 0.8294792175292969, + "learning_rate": 3.999527195050255e-06, + "loss": 1.2861, + "mean_token_accuracy": 0.6734772324562073, + "num_tokens": 2413685078.0, + "step": 14387 + }, + { + "entropy": 1.7173769970734913, + "epoch": 1.5805937766059706, + "grad_norm": 0.6364870667457581, + "learning_rate": 3.998521037443264e-06, + "loss": 1.4887, + "mean_token_accuracy": 0.643697996934255, + "num_tokens": 2413867535.0, + "step": 14388 + }, + { + "entropy": 1.7607737878958385, + "epoch": 1.5807036335173437, + "grad_norm": 0.6072537899017334, + "learning_rate": 3.997515101429928e-06, + "loss": 1.5469, + "mean_token_accuracy": 0.6417495807011923, + "num_tokens": 2414078328.0, + "step": 14389 + }, + { + "entropy": 1.7044040362040203, + "epoch": 1.5808134904287166, + "grad_norm": 0.6410809755325317, + "learning_rate": 3.996509387042085e-06, + "loss": 1.5127, + "mean_token_accuracy": 0.6362091799577078, + "num_tokens": 2414267611.0, + "step": 14390 + }, + { + "entropy": 1.6906994581222534, + "epoch": 1.5809233473400894, + "grad_norm": 0.6977923512458801, + "learning_rate": 3.995503894311561e-06, + "loss": 1.4303, + "mean_token_accuracy": 0.6597084701061249, + "num_tokens": 2414429725.0, + "step": 14391 + }, + { + "entropy": 1.7433028519153595, + "epoch": 1.5810332042514625, + "grad_norm": 0.7054868936538696, + "learning_rate": 3.994498623270182e-06, + "loss": 1.4431, + "mean_token_accuracy": 0.6574334055185318, + "num_tokens": 2414610681.0, + "step": 14392 + }, + { + "entropy": 1.739876647790273, + "epoch": 1.5811430611628354, + "grad_norm": 0.8737093806266785, + "learning_rate": 3.993493573949768e-06, + "loss": 1.2597, + "mean_token_accuracy": 0.6745233436425527, + "num_tokens": 2414709082.0, + "step": 14393 + }, + { + "entropy": 1.7829668621222179, + "epoch": 1.5812529180742083, + "grad_norm": 0.8045809864997864, + "learning_rate": 3.992488746382125e-06, + "loss": 1.431, + "mean_token_accuracy": 0.6510123064120611, + "num_tokens": 2414904822.0, + "step": 14394 + }, + { + "entropy": 1.7516865233580272, + "epoch": 1.5813627749855814, + "grad_norm": 0.8814604878425598, + "learning_rate": 3.991484140599053e-06, + "loss": 1.3402, + "mean_token_accuracy": 0.6588575591643652, + "num_tokens": 2415024541.0, + "step": 14395 + }, + { + "entropy": 1.7599789202213287, + "epoch": 1.581472631896954, + "grad_norm": 0.6524580121040344, + "learning_rate": 3.990479756632352e-06, + "loss": 1.458, + "mean_token_accuracy": 0.6501910090446472, + "num_tokens": 2415189620.0, + "step": 14396 + }, + { + "entropy": 1.7022678454717, + "epoch": 1.5815824888083272, + "grad_norm": 0.7698002457618713, + "learning_rate": 3.989475594513808e-06, + "loss": 1.3612, + "mean_token_accuracy": 0.6615101943413416, + "num_tokens": 2415364902.0, + "step": 14397 + }, + { + "entropy": 1.7096090018749237, + "epoch": 1.5816923457197, + "grad_norm": 0.8302357792854309, + "learning_rate": 3.988471654275201e-06, + "loss": 1.2287, + "mean_token_accuracy": 0.6795289516448975, + "num_tokens": 2415465709.0, + "step": 14398 + }, + { + "entropy": 1.727865646282832, + "epoch": 1.581802202631073, + "grad_norm": 0.7796132564544678, + "learning_rate": 3.987467935948307e-06, + "loss": 1.4827, + "mean_token_accuracy": 0.6607057054837545, + "num_tokens": 2415621621.0, + "step": 14399 + }, + { + "entropy": 1.7068704466025035, + "epoch": 1.581912059542446, + "grad_norm": 0.7254052758216858, + "learning_rate": 3.986464439564893e-06, + "loss": 1.5308, + "mean_token_accuracy": 0.6460797290007273, + "num_tokens": 2415798172.0, + "step": 14400 + }, + { + "entropy": 1.7472402950127919, + "epoch": 1.582021916453819, + "grad_norm": 1.3230574131011963, + "learning_rate": 3.9854611651567196e-06, + "loss": 1.4057, + "mean_token_accuracy": 0.6481355031331381, + "num_tokens": 2415960359.0, + "step": 14401 + }, + { + "entropy": 1.7266095876693726, + "epoch": 1.5821317733651918, + "grad_norm": 0.7041482329368591, + "learning_rate": 3.98445811275554e-06, + "loss": 1.3587, + "mean_token_accuracy": 0.6671364406744639, + "num_tokens": 2416113815.0, + "step": 14402 + }, + { + "entropy": 1.7281249165534973, + "epoch": 1.5822416302765647, + "grad_norm": 0.6447805166244507, + "learning_rate": 3.983455282393099e-06, + "loss": 1.412, + "mean_token_accuracy": 0.6491978416840235, + "num_tokens": 2416313418.0, + "step": 14403 + }, + { + "entropy": 1.744963804880778, + "epoch": 1.5823514871879376, + "grad_norm": 0.6686639189720154, + "learning_rate": 3.9824526741011345e-06, + "loss": 1.4133, + "mean_token_accuracy": 0.6531851341327032, + "num_tokens": 2416494715.0, + "step": 14404 + }, + { + "entropy": 1.734292556842168, + "epoch": 1.5824613440993107, + "grad_norm": 0.8086754679679871, + "learning_rate": 3.981450287911385e-06, + "loss": 1.3749, + "mean_token_accuracy": 0.6653833836317062, + "num_tokens": 2416629130.0, + "step": 14405 + }, + { + "entropy": 1.7502967417240143, + "epoch": 1.5825712010106836, + "grad_norm": 0.8233333826065063, + "learning_rate": 3.9804481238555696e-06, + "loss": 1.4493, + "mean_token_accuracy": 0.649358481168747, + "num_tokens": 2416784379.0, + "step": 14406 + }, + { + "entropy": 1.6617790857950847, + "epoch": 1.5826810579220565, + "grad_norm": 0.544765055179596, + "learning_rate": 3.979446181965406e-06, + "loss": 1.4374, + "mean_token_accuracy": 0.6548088242610296, + "num_tokens": 2416981866.0, + "step": 14407 + }, + { + "entropy": 1.6831092139085133, + "epoch": 1.5827909148334296, + "grad_norm": 0.8361105918884277, + "learning_rate": 3.97844446227261e-06, + "loss": 1.621, + "mean_token_accuracy": 0.6430748477578163, + "num_tokens": 2417131866.0, + "step": 14408 + }, + { + "entropy": 1.7041123708089192, + "epoch": 1.5829007717448023, + "grad_norm": 0.8004279732704163, + "learning_rate": 3.977442964808883e-06, + "loss": 1.4952, + "mean_token_accuracy": 0.6577915449937185, + "num_tokens": 2417297788.0, + "step": 14409 + }, + { + "entropy": 1.6733851035435994, + "epoch": 1.5830106286561754, + "grad_norm": 0.6946509480476379, + "learning_rate": 3.976441689605919e-06, + "loss": 1.3204, + "mean_token_accuracy": 0.667379895846049, + "num_tokens": 2417418708.0, + "step": 14410 + }, + { + "entropy": 1.7224521934986115, + "epoch": 1.5831204855675483, + "grad_norm": 0.6349174380302429, + "learning_rate": 3.975440636695412e-06, + "loss": 1.3984, + "mean_token_accuracy": 0.6536268393198649, + "num_tokens": 2417584213.0, + "step": 14411 + }, + { + "entropy": 1.6723372340202332, + "epoch": 1.5832303424789211, + "grad_norm": 0.7625806331634521, + "learning_rate": 3.974439806109043e-06, + "loss": 1.2409, + "mean_token_accuracy": 0.6775754888852438, + "num_tokens": 2417759522.0, + "step": 14412 + }, + { + "entropy": 1.642015775044759, + "epoch": 1.5833401993902942, + "grad_norm": 0.7911476492881775, + "learning_rate": 3.973439197878489e-06, + "loss": 1.2949, + "mean_token_accuracy": 0.6780034005641937, + "num_tokens": 2417912492.0, + "step": 14413 + }, + { + "entropy": 1.7025137146313984, + "epoch": 1.5834500563016671, + "grad_norm": 0.7081897854804993, + "learning_rate": 3.972438812035419e-06, + "loss": 1.5179, + "mean_token_accuracy": 0.6537428398927053, + "num_tokens": 2418109884.0, + "step": 14414 + }, + { + "entropy": 1.768744687239329, + "epoch": 1.58355991321304, + "grad_norm": 0.804851233959198, + "learning_rate": 3.971438648611492e-06, + "loss": 1.482, + "mean_token_accuracy": 0.6415604799985886, + "num_tokens": 2418302598.0, + "step": 14415 + }, + { + "entropy": 1.700428346792857, + "epoch": 1.583669770124413, + "grad_norm": 0.6298617720603943, + "learning_rate": 3.970438707638364e-06, + "loss": 1.4271, + "mean_token_accuracy": 0.6591332703828812, + "num_tokens": 2418483413.0, + "step": 14416 + }, + { + "entropy": 1.7190834681193035, + "epoch": 1.5837796270357858, + "grad_norm": 0.7330955266952515, + "learning_rate": 3.969438989147685e-06, + "loss": 1.5435, + "mean_token_accuracy": 0.642086406548818, + "num_tokens": 2418645090.0, + "step": 14417 + }, + { + "entropy": 1.701053947210312, + "epoch": 1.583889483947159, + "grad_norm": 0.648068904876709, + "learning_rate": 3.9684394931710956e-06, + "loss": 1.282, + "mean_token_accuracy": 0.6661601016918818, + "num_tokens": 2418771132.0, + "step": 14418 + }, + { + "entropy": 1.7536637683709462, + "epoch": 1.5839993408585318, + "grad_norm": 0.7328144907951355, + "learning_rate": 3.967440219740224e-06, + "loss": 1.4416, + "mean_token_accuracy": 0.6516183565060297, + "num_tokens": 2418935915.0, + "step": 14419 + }, + { + "entropy": 1.7374096810817719, + "epoch": 1.5841091977699047, + "grad_norm": 0.6931876540184021, + "learning_rate": 3.966441168886704e-06, + "loss": 1.3514, + "mean_token_accuracy": 0.6611058761676153, + "num_tokens": 2419100654.0, + "step": 14420 + }, + { + "entropy": 1.7090267737706502, + "epoch": 1.5842190546812778, + "grad_norm": 0.7308075428009033, + "learning_rate": 3.96544234064215e-06, + "loss": 1.2949, + "mean_token_accuracy": 0.6706414471069971, + "num_tokens": 2419225860.0, + "step": 14421 + }, + { + "entropy": 1.779475748538971, + "epoch": 1.5843289115926504, + "grad_norm": 0.6749278903007507, + "learning_rate": 3.9644437350381745e-06, + "loss": 1.4604, + "mean_token_accuracy": 0.6398962736129761, + "num_tokens": 2419420436.0, + "step": 14422 + }, + { + "entropy": 1.6845888098080952, + "epoch": 1.5844387685040235, + "grad_norm": 0.6891003251075745, + "learning_rate": 3.9634453521063876e-06, + "loss": 1.3074, + "mean_token_accuracy": 0.6675096352895101, + "num_tokens": 2419590074.0, + "step": 14423 + }, + { + "entropy": 1.6917446851730347, + "epoch": 1.5845486254153964, + "grad_norm": 0.7120577096939087, + "learning_rate": 3.962447191878381e-06, + "loss": 1.4101, + "mean_token_accuracy": 0.6531930317481359, + "num_tokens": 2419762614.0, + "step": 14424 + }, + { + "entropy": 1.6405010223388672, + "epoch": 1.5846584823267693, + "grad_norm": 0.6640504002571106, + "learning_rate": 3.961449254385753e-06, + "loss": 1.3397, + "mean_token_accuracy": 0.67174232006073, + "num_tokens": 2419921978.0, + "step": 14425 + }, + { + "entropy": 1.6967049439748128, + "epoch": 1.5847683392381424, + "grad_norm": 0.6310282349586487, + "learning_rate": 3.960451539660084e-06, + "loss": 1.3417, + "mean_token_accuracy": 0.6760230660438538, + "num_tokens": 2420092987.0, + "step": 14426 + }, + { + "entropy": 1.7303445041179657, + "epoch": 1.5848781961495153, + "grad_norm": 0.6532360911369324, + "learning_rate": 3.959454047732949e-06, + "loss": 1.3444, + "mean_token_accuracy": 0.6584073007106781, + "num_tokens": 2420258161.0, + "step": 14427 + }, + { + "entropy": 1.673458496729533, + "epoch": 1.5849880530608882, + "grad_norm": 0.6821812391281128, + "learning_rate": 3.958456778635922e-06, + "loss": 1.2654, + "mean_token_accuracy": 0.6753448198239008, + "num_tokens": 2420433591.0, + "step": 14428 + }, + { + "entropy": 1.679155856370926, + "epoch": 1.585097909972261, + "grad_norm": 0.5954638123512268, + "learning_rate": 3.957459732400566e-06, + "loss": 1.3344, + "mean_token_accuracy": 0.6624444822470347, + "num_tokens": 2420624387.0, + "step": 14429 + }, + { + "entropy": 1.627536416053772, + "epoch": 1.585207766883634, + "grad_norm": 0.729608952999115, + "learning_rate": 3.956462909058436e-06, + "loss": 1.3367, + "mean_token_accuracy": 0.6631862074136734, + "num_tokens": 2420779608.0, + "step": 14430 + }, + { + "entropy": 1.7547686696052551, + "epoch": 1.585317623795007, + "grad_norm": 0.7445028424263, + "learning_rate": 3.95546630864108e-06, + "loss": 1.3996, + "mean_token_accuracy": 0.6553547183672587, + "num_tokens": 2420925812.0, + "step": 14431 + }, + { + "entropy": 1.7932293613751729, + "epoch": 1.58542748070638, + "grad_norm": 0.7431174516677856, + "learning_rate": 3.954469931180042e-06, + "loss": 1.3638, + "mean_token_accuracy": 0.6611845990022024, + "num_tokens": 2421059861.0, + "step": 14432 + }, + { + "entropy": 1.6480081578095753, + "epoch": 1.5855373376177528, + "grad_norm": 0.6314308643341064, + "learning_rate": 3.953473776706857e-06, + "loss": 1.3646, + "mean_token_accuracy": 0.6595342606306076, + "num_tokens": 2421221397.0, + "step": 14433 + }, + { + "entropy": 1.6911343236764271, + "epoch": 1.585647194529126, + "grad_norm": 0.653496265411377, + "learning_rate": 3.9524778452530476e-06, + "loss": 1.429, + "mean_token_accuracy": 0.6722802569468816, + "num_tokens": 2421386795.0, + "step": 14434 + }, + { + "entropy": 1.6934001346429188, + "epoch": 1.5857570514404986, + "grad_norm": 0.6944672465324402, + "learning_rate": 3.951482136850143e-06, + "loss": 1.35, + "mean_token_accuracy": 0.6598734756310781, + "num_tokens": 2421530857.0, + "step": 14435 + }, + { + "entropy": 1.7472728689511616, + "epoch": 1.5858669083518717, + "grad_norm": 0.6578483581542969, + "learning_rate": 3.950486651529649e-06, + "loss": 1.3643, + "mean_token_accuracy": 0.658632829785347, + "num_tokens": 2421743676.0, + "step": 14436 + }, + { + "entropy": 1.7039150198300679, + "epoch": 1.5859767652632446, + "grad_norm": 0.7169721722602844, + "learning_rate": 3.949491389323079e-06, + "loss": 1.3317, + "mean_token_accuracy": 0.671579380830129, + "num_tokens": 2421913835.0, + "step": 14437 + }, + { + "entropy": 1.7325179874897003, + "epoch": 1.5860866221746175, + "grad_norm": 0.7134926319122314, + "learning_rate": 3.948496350261929e-06, + "loss": 1.5328, + "mean_token_accuracy": 0.6371408551931381, + "num_tokens": 2422078510.0, + "step": 14438 + }, + { + "entropy": 1.6378303567568462, + "epoch": 1.5861964790859906, + "grad_norm": 0.63148432970047, + "learning_rate": 3.94750153437769e-06, + "loss": 1.4693, + "mean_token_accuracy": 0.6584180593490601, + "num_tokens": 2422262293.0, + "step": 14439 + }, + { + "entropy": 1.7055408656597137, + "epoch": 1.5863063359973635, + "grad_norm": 0.6454849243164062, + "learning_rate": 3.94650694170185e-06, + "loss": 1.4238, + "mean_token_accuracy": 0.6490067690610886, + "num_tokens": 2422423182.0, + "step": 14440 + }, + { + "entropy": 1.6568194329738617, + "epoch": 1.5864161929087364, + "grad_norm": 0.61830735206604, + "learning_rate": 3.945512572265888e-06, + "loss": 1.4239, + "mean_token_accuracy": 0.6476400097211202, + "num_tokens": 2422619592.0, + "step": 14441 + }, + { + "entropy": 1.652705987294515, + "epoch": 1.5865260498201095, + "grad_norm": 0.6976563930511475, + "learning_rate": 3.944518426101275e-06, + "loss": 1.3246, + "mean_token_accuracy": 0.6634400536616644, + "num_tokens": 2422744514.0, + "step": 14442 + }, + { + "entropy": 1.7328997552394867, + "epoch": 1.5866359067314821, + "grad_norm": 0.699158251285553, + "learning_rate": 3.943524503239474e-06, + "loss": 1.4902, + "mean_token_accuracy": 0.6478810012340546, + "num_tokens": 2422929691.0, + "step": 14443 + }, + { + "entropy": 1.6880051692326863, + "epoch": 1.5867457636428552, + "grad_norm": 0.6858848333358765, + "learning_rate": 3.942530803711941e-06, + "loss": 1.2967, + "mean_token_accuracy": 0.6767656803131104, + "num_tokens": 2423066587.0, + "step": 14444 + }, + { + "entropy": 1.6842141250769298, + "epoch": 1.5868556205542281, + "grad_norm": 0.6382424831390381, + "learning_rate": 3.941537327550131e-06, + "loss": 1.3361, + "mean_token_accuracy": 0.6656038562456766, + "num_tokens": 2423196266.0, + "step": 14445 + }, + { + "entropy": 1.6592636009057362, + "epoch": 1.586965477465601, + "grad_norm": 0.6557448506355286, + "learning_rate": 3.940544074785483e-06, + "loss": 1.3152, + "mean_token_accuracy": 0.6723695049683253, + "num_tokens": 2423356556.0, + "step": 14446 + }, + { + "entropy": 1.7335790693759918, + "epoch": 1.5870753343769741, + "grad_norm": 0.7295007109642029, + "learning_rate": 3.939551045449432e-06, + "loss": 1.5265, + "mean_token_accuracy": 0.6430895005663236, + "num_tokens": 2423517951.0, + "step": 14447 + }, + { + "entropy": 1.7022380630175273, + "epoch": 1.5871851912883468, + "grad_norm": 0.6990877389907837, + "learning_rate": 3.938558239573408e-06, + "loss": 1.4924, + "mean_token_accuracy": 0.6644400457541147, + "num_tokens": 2423670051.0, + "step": 14448 + }, + { + "entropy": 1.7559547921021779, + "epoch": 1.58729504819972, + "grad_norm": 0.6705808639526367, + "learning_rate": 3.937565657188838e-06, + "loss": 1.5399, + "mean_token_accuracy": 0.6576424241065979, + "num_tokens": 2423823200.0, + "step": 14449 + }, + { + "entropy": 1.6872046788533528, + "epoch": 1.5874049051110928, + "grad_norm": 0.7224948406219482, + "learning_rate": 3.93657329832713e-06, + "loss": 1.261, + "mean_token_accuracy": 0.6700985580682755, + "num_tokens": 2423942075.0, + "step": 14450 + }, + { + "entropy": 1.707588940858841, + "epoch": 1.5875147620224657, + "grad_norm": 0.7525602579116821, + "learning_rate": 3.935581163019694e-06, + "loss": 1.4265, + "mean_token_accuracy": 0.6589339872201284, + "num_tokens": 2424120103.0, + "step": 14451 + }, + { + "entropy": 1.7470983068148296, + "epoch": 1.5876246189338388, + "grad_norm": 0.8434122800827026, + "learning_rate": 3.9345892512979325e-06, + "loss": 1.4399, + "mean_token_accuracy": 0.6524594177802404, + "num_tokens": 2424267471.0, + "step": 14452 + }, + { + "entropy": 1.7276219228903453, + "epoch": 1.5877344758452117, + "grad_norm": 0.6105454564094543, + "learning_rate": 3.933597563193234e-06, + "loss": 1.2936, + "mean_token_accuracy": 0.6729495972394943, + "num_tokens": 2424428446.0, + "step": 14453 + }, + { + "entropy": 1.6018980145454407, + "epoch": 1.5878443327565845, + "grad_norm": 0.6616988182067871, + "learning_rate": 3.932606098736992e-06, + "loss": 1.3709, + "mean_token_accuracy": 0.6708630422751108, + "num_tokens": 2424558299.0, + "step": 14454 + }, + { + "entropy": 1.6875403026739757, + "epoch": 1.5879541896679576, + "grad_norm": 0.7595418691635132, + "learning_rate": 3.931614857960582e-06, + "loss": 1.5471, + "mean_token_accuracy": 0.6460252776741982, + "num_tokens": 2424730586.0, + "step": 14455 + }, + { + "entropy": 1.7008586128552754, + "epoch": 1.5880640465793303, + "grad_norm": 0.7267166972160339, + "learning_rate": 3.930623840895374e-06, + "loss": 1.2598, + "mean_token_accuracy": 0.6835194180409113, + "num_tokens": 2424863946.0, + "step": 14456 + }, + { + "entropy": 1.7498765190442402, + "epoch": 1.5881739034907034, + "grad_norm": 0.7018238306045532, + "learning_rate": 3.92963304757274e-06, + "loss": 1.3719, + "mean_token_accuracy": 0.6424062748750051, + "num_tokens": 2425008200.0, + "step": 14457 + }, + { + "entropy": 1.6699590682983398, + "epoch": 1.5882837604020763, + "grad_norm": 0.660306453704834, + "learning_rate": 3.928642478024032e-06, + "loss": 1.3016, + "mean_token_accuracy": 0.6737691263357798, + "num_tokens": 2425187685.0, + "step": 14458 + }, + { + "entropy": 1.7022275626659393, + "epoch": 1.5883936173134492, + "grad_norm": 0.6317709684371948, + "learning_rate": 3.927652132280601e-06, + "loss": 1.3297, + "mean_token_accuracy": 0.6666052093108495, + "num_tokens": 2425320325.0, + "step": 14459 + }, + { + "entropy": 1.7465975681940715, + "epoch": 1.5885034742248223, + "grad_norm": 0.6987557411193848, + "learning_rate": 3.926662010373794e-06, + "loss": 1.3083, + "mean_token_accuracy": 0.6656803041696548, + "num_tokens": 2425430057.0, + "step": 14460 + }, + { + "entropy": 1.7197660605112712, + "epoch": 1.588613331136195, + "grad_norm": 0.6313250064849854, + "learning_rate": 3.925672112334949e-06, + "loss": 1.3808, + "mean_token_accuracy": 0.6632373382647833, + "num_tokens": 2425565518.0, + "step": 14461 + }, + { + "entropy": 1.6192362904548645, + "epoch": 1.588723188047568, + "grad_norm": 0.6355708241462708, + "learning_rate": 3.924682438195394e-06, + "loss": 1.2976, + "mean_token_accuracy": 0.6806172430515289, + "num_tokens": 2425722341.0, + "step": 14462 + }, + { + "entropy": 1.7012809813022614, + "epoch": 1.588833044958941, + "grad_norm": 0.6677663326263428, + "learning_rate": 3.92369298798645e-06, + "loss": 1.3461, + "mean_token_accuracy": 0.6603120565414429, + "num_tokens": 2425871674.0, + "step": 14463 + }, + { + "entropy": 1.6768560310204823, + "epoch": 1.5889429018703138, + "grad_norm": 0.6279048919677734, + "learning_rate": 3.9227037617394345e-06, + "loss": 1.2773, + "mean_token_accuracy": 0.6672329306602478, + "num_tokens": 2426021027.0, + "step": 14464 + }, + { + "entropy": 1.687048117319743, + "epoch": 1.589052758781687, + "grad_norm": 0.6013140678405762, + "learning_rate": 3.921714759485657e-06, + "loss": 1.4521, + "mean_token_accuracy": 0.6478681514660517, + "num_tokens": 2426196122.0, + "step": 14465 + }, + { + "entropy": 1.6956083178520203, + "epoch": 1.5891626156930598, + "grad_norm": 0.6516452431678772, + "learning_rate": 3.920725981256416e-06, + "loss": 1.4494, + "mean_token_accuracy": 0.6545127183198929, + "num_tokens": 2426380157.0, + "step": 14466 + }, + { + "entropy": 1.6591267784436543, + "epoch": 1.5892724726044327, + "grad_norm": 0.7242151498794556, + "learning_rate": 3.9197374270830095e-06, + "loss": 1.3716, + "mean_token_accuracy": 0.671110580364863, + "num_tokens": 2426507459.0, + "step": 14467 + }, + { + "entropy": 1.6566206415494282, + "epoch": 1.5893823295158058, + "grad_norm": 0.6785814166069031, + "learning_rate": 3.918749096996721e-06, + "loss": 1.4111, + "mean_token_accuracy": 0.6570883542299271, + "num_tokens": 2426673369.0, + "step": 14468 + }, + { + "entropy": 1.7064830362796783, + "epoch": 1.5894921864271785, + "grad_norm": 0.5841078758239746, + "learning_rate": 3.917760991028835e-06, + "loss": 1.5365, + "mean_token_accuracy": 0.6401430120070776, + "num_tokens": 2426884502.0, + "step": 14469 + }, + { + "entropy": 1.7302399973074596, + "epoch": 1.5896020433385516, + "grad_norm": 0.670065701007843, + "learning_rate": 3.9167731092106225e-06, + "loss": 1.2846, + "mean_token_accuracy": 0.66989433268706, + "num_tokens": 2427038641.0, + "step": 14470 + }, + { + "entropy": 1.693581352631251, + "epoch": 1.5897119002499245, + "grad_norm": 0.6922983527183533, + "learning_rate": 3.915785451573346e-06, + "loss": 1.3616, + "mean_token_accuracy": 0.6663850297530493, + "num_tokens": 2427197712.0, + "step": 14471 + }, + { + "entropy": 1.7170843879381816, + "epoch": 1.5898217571612974, + "grad_norm": 0.665672779083252, + "learning_rate": 3.9147980181482685e-06, + "loss": 1.5425, + "mean_token_accuracy": 0.6393137524525324, + "num_tokens": 2427387540.0, + "step": 14472 + }, + { + "entropy": 1.7163531581560771, + "epoch": 1.5899316140726705, + "grad_norm": 0.719980776309967, + "learning_rate": 3.913810808966642e-06, + "loss": 1.3993, + "mean_token_accuracy": 0.6594651788473129, + "num_tokens": 2427546679.0, + "step": 14473 + }, + { + "entropy": 1.739640901486079, + "epoch": 1.5900414709840431, + "grad_norm": 0.7251821756362915, + "learning_rate": 3.9128238240597125e-06, + "loss": 1.5654, + "mean_token_accuracy": 0.6393600652615229, + "num_tokens": 2427761624.0, + "step": 14474 + }, + { + "entropy": 1.6863900522391002, + "epoch": 1.5901513278954162, + "grad_norm": 0.5635016560554504, + "learning_rate": 3.911837063458712e-06, + "loss": 1.3514, + "mean_token_accuracy": 0.663020983338356, + "num_tokens": 2427936133.0, + "step": 14475 + }, + { + "entropy": 1.7012007733186085, + "epoch": 1.5902611848067891, + "grad_norm": 0.617427408695221, + "learning_rate": 3.910850527194878e-06, + "loss": 1.3294, + "mean_token_accuracy": 0.6652300308148066, + "num_tokens": 2428129442.0, + "step": 14476 + }, + { + "entropy": 1.6678180694580078, + "epoch": 1.590371041718162, + "grad_norm": 0.5885007381439209, + "learning_rate": 3.9098642152994295e-06, + "loss": 1.2837, + "mean_token_accuracy": 0.6684055576721827, + "num_tokens": 2428271086.0, + "step": 14477 + }, + { + "entropy": 1.7171485026677449, + "epoch": 1.5904808986295351, + "grad_norm": 0.9225579500198364, + "learning_rate": 3.90887812780358e-06, + "loss": 1.3262, + "mean_token_accuracy": 0.6671409706274668, + "num_tokens": 2428414176.0, + "step": 14478 + }, + { + "entropy": 1.645038495461146, + "epoch": 1.590590755540908, + "grad_norm": 0.6282086372375488, + "learning_rate": 3.907892264738546e-06, + "loss": 1.3399, + "mean_token_accuracy": 0.6648639589548111, + "num_tokens": 2428564196.0, + "step": 14479 + }, + { + "entropy": 1.69185275832812, + "epoch": 1.590700612452281, + "grad_norm": 0.6884506344795227, + "learning_rate": 3.9069066261355235e-06, + "loss": 1.4349, + "mean_token_accuracy": 0.6530876805384954, + "num_tokens": 2428744290.0, + "step": 14480 + }, + { + "entropy": 1.6777120033899944, + "epoch": 1.590810469363654, + "grad_norm": 0.6747894883155823, + "learning_rate": 3.905921212025712e-06, + "loss": 1.2716, + "mean_token_accuracy": 0.6761378745237986, + "num_tokens": 2428886772.0, + "step": 14481 + }, + { + "entropy": 1.7333306272824605, + "epoch": 1.5909203262750267, + "grad_norm": 0.7000220417976379, + "learning_rate": 3.904936022440299e-06, + "loss": 1.4266, + "mean_token_accuracy": 0.6612260987361273, + "num_tokens": 2429022815.0, + "step": 14482 + }, + { + "entropy": 1.7049691180388133, + "epoch": 1.5910301831863998, + "grad_norm": 0.678167462348938, + "learning_rate": 3.90395105741046e-06, + "loss": 1.5999, + "mean_token_accuracy": 0.6353818227847418, + "num_tokens": 2429194580.0, + "step": 14483 + }, + { + "entropy": 1.7223326464494069, + "epoch": 1.5911400400977727, + "grad_norm": 0.6117254495620728, + "learning_rate": 3.9029663169673726e-06, + "loss": 1.4239, + "mean_token_accuracy": 0.6493904888629913, + "num_tokens": 2429410711.0, + "step": 14484 + }, + { + "entropy": 1.721522440512975, + "epoch": 1.5912498970091455, + "grad_norm": 1.1732066869735718, + "learning_rate": 3.901981801142206e-06, + "loss": 1.3452, + "mean_token_accuracy": 0.6608117173115412, + "num_tokens": 2429595212.0, + "step": 14485 + }, + { + "entropy": 1.7386885285377502, + "epoch": 1.5913597539205186, + "grad_norm": 0.7604197263717651, + "learning_rate": 3.900997509966116e-06, + "loss": 1.2881, + "mean_token_accuracy": 0.680141399304072, + "num_tokens": 2429725832.0, + "step": 14486 + }, + { + "entropy": 1.673358937104543, + "epoch": 1.5914696108318913, + "grad_norm": 0.676415741443634, + "learning_rate": 3.9000134434702546e-06, + "loss": 1.3912, + "mean_token_accuracy": 0.6618489970763525, + "num_tokens": 2429905266.0, + "step": 14487 + }, + { + "entropy": 1.7429005404313405, + "epoch": 1.5915794677432644, + "grad_norm": 0.6558493971824646, + "learning_rate": 3.899029601685771e-06, + "loss": 1.4027, + "mean_token_accuracy": 0.658114567399025, + "num_tokens": 2430082783.0, + "step": 14488 + }, + { + "entropy": 1.7437707682450612, + "epoch": 1.5916893246546373, + "grad_norm": 0.7400485277175903, + "learning_rate": 3.8980459846438e-06, + "loss": 1.3969, + "mean_token_accuracy": 0.6684872756401697, + "num_tokens": 2430232038.0, + "step": 14489 + }, + { + "entropy": 1.6967511971791585, + "epoch": 1.5917991815660102, + "grad_norm": 0.9593654870986938, + "learning_rate": 3.89706259237547e-06, + "loss": 1.4267, + "mean_token_accuracy": 0.6613242427508036, + "num_tokens": 2430407393.0, + "step": 14490 + }, + { + "entropy": 1.726237694422404, + "epoch": 1.5919090384773833, + "grad_norm": 0.6871761083602905, + "learning_rate": 3.896079424911913e-06, + "loss": 1.3982, + "mean_token_accuracy": 0.6530379752318064, + "num_tokens": 2430597598.0, + "step": 14491 + }, + { + "entropy": 1.660180926322937, + "epoch": 1.5920188953887562, + "grad_norm": 0.6090943813323975, + "learning_rate": 3.895096482284238e-06, + "loss": 1.3659, + "mean_token_accuracy": 0.6511796166499456, + "num_tokens": 2430787465.0, + "step": 14492 + }, + { + "entropy": 1.7284020980199177, + "epoch": 1.592128752300129, + "grad_norm": 0.7778229713439941, + "learning_rate": 3.89411376452356e-06, + "loss": 1.4237, + "mean_token_accuracy": 0.6680044829845428, + "num_tokens": 2430917537.0, + "step": 14493 + }, + { + "entropy": 1.6784189939498901, + "epoch": 1.5922386092115022, + "grad_norm": 0.5987834334373474, + "learning_rate": 3.8931312716609784e-06, + "loss": 1.494, + "mean_token_accuracy": 0.627329871058464, + "num_tokens": 2431184294.0, + "step": 14494 + }, + { + "entropy": 1.7303737103939056, + "epoch": 1.5923484661228748, + "grad_norm": 0.7138601541519165, + "learning_rate": 3.892149003727589e-06, + "loss": 1.5162, + "mean_token_accuracy": 0.6275907506545385, + "num_tokens": 2431349362.0, + "step": 14495 + }, + { + "entropy": 1.6458158493041992, + "epoch": 1.592458323034248, + "grad_norm": 0.6402990818023682, + "learning_rate": 3.891166960754479e-06, + "loss": 1.2598, + "mean_token_accuracy": 0.6761557509501775, + "num_tokens": 2431473143.0, + "step": 14496 + }, + { + "entropy": 1.6565284033616383, + "epoch": 1.5925681799456208, + "grad_norm": 0.631554126739502, + "learning_rate": 3.890185142772735e-06, + "loss": 1.4001, + "mean_token_accuracy": 0.6604942381381989, + "num_tokens": 2431643673.0, + "step": 14497 + }, + { + "entropy": 1.709353893995285, + "epoch": 1.5926780368569937, + "grad_norm": 0.6308357119560242, + "learning_rate": 3.889203549813426e-06, + "loss": 1.4271, + "mean_token_accuracy": 0.6546412259340286, + "num_tokens": 2431809228.0, + "step": 14498 + }, + { + "entropy": 1.7170674403508503, + "epoch": 1.5927878937683668, + "grad_norm": 0.6483573913574219, + "learning_rate": 3.88822218190762e-06, + "loss": 1.3596, + "mean_token_accuracy": 0.6566579739252726, + "num_tokens": 2431991019.0, + "step": 14499 + }, + { + "entropy": 1.7452252904574077, + "epoch": 1.5928977506797395, + "grad_norm": 0.6205641627311707, + "learning_rate": 3.887241039086378e-06, + "loss": 1.458, + "mean_token_accuracy": 0.655206615726153, + "num_tokens": 2432162864.0, + "step": 14500 + }, + { + "entropy": 1.738767405351003, + "epoch": 1.5930076075911126, + "grad_norm": 0.7384020686149597, + "learning_rate": 3.886260121380752e-06, + "loss": 1.4563, + "mean_token_accuracy": 0.6525509854157766, + "num_tokens": 2432321317.0, + "step": 14501 + }, + { + "entropy": 1.7347382108370464, + "epoch": 1.5931174645024855, + "grad_norm": 0.6322787404060364, + "learning_rate": 3.88527942882179e-06, + "loss": 1.3526, + "mean_token_accuracy": 0.6609780540068945, + "num_tokens": 2432460919.0, + "step": 14502 + }, + { + "entropy": 1.6986276010672252, + "epoch": 1.5932273214138584, + "grad_norm": 0.7408791780471802, + "learning_rate": 3.884298961440523e-06, + "loss": 1.3952, + "mean_token_accuracy": 0.6573623418807983, + "num_tokens": 2432607846.0, + "step": 14503 + }, + { + "entropy": 1.69478377699852, + "epoch": 1.5933371783252315, + "grad_norm": 0.6527783274650574, + "learning_rate": 3.883318719267989e-06, + "loss": 1.3336, + "mean_token_accuracy": 0.6538981248935064, + "num_tokens": 2432812759.0, + "step": 14504 + }, + { + "entropy": 1.6673340400060017, + "epoch": 1.5934470352366044, + "grad_norm": 0.7707885503768921, + "learning_rate": 3.8823387023352125e-06, + "loss": 1.4294, + "mean_token_accuracy": 0.645869846145312, + "num_tokens": 2432981544.0, + "step": 14505 + }, + { + "entropy": 1.634259045124054, + "epoch": 1.5935568921479772, + "grad_norm": 0.7244494557380676, + "learning_rate": 3.881358910673208e-06, + "loss": 1.3827, + "mean_token_accuracy": 0.6569319466749827, + "num_tokens": 2433143702.0, + "step": 14506 + }, + { + "entropy": 1.6752298176288605, + "epoch": 1.5936667490593504, + "grad_norm": 0.6288512349128723, + "learning_rate": 3.880379344312985e-06, + "loss": 1.305, + "mean_token_accuracy": 0.6659407715002695, + "num_tokens": 2433308563.0, + "step": 14507 + }, + { + "entropy": 1.6835540930430095, + "epoch": 1.593776605970723, + "grad_norm": 0.7133122682571411, + "learning_rate": 3.879400003285551e-06, + "loss": 1.4305, + "mean_token_accuracy": 0.663516491651535, + "num_tokens": 2433474529.0, + "step": 14508 + }, + { + "entropy": 1.713885635137558, + "epoch": 1.5938864628820961, + "grad_norm": 0.6772231459617615, + "learning_rate": 3.878420887621894e-06, + "loss": 1.2612, + "mean_token_accuracy": 0.6778315901756287, + "num_tokens": 2433580636.0, + "step": 14509 + }, + { + "entropy": 1.64861661195755, + "epoch": 1.593996319793469, + "grad_norm": 0.6854017972946167, + "learning_rate": 3.8774419973530096e-06, + "loss": 1.2963, + "mean_token_accuracy": 0.6728779971599579, + "num_tokens": 2433711203.0, + "step": 14510 + }, + { + "entropy": 1.6567076245943706, + "epoch": 1.594106176704842, + "grad_norm": 0.6105472445487976, + "learning_rate": 3.876463332509878e-06, + "loss": 1.3891, + "mean_token_accuracy": 0.6490083237489065, + "num_tokens": 2433910114.0, + "step": 14511 + }, + { + "entropy": 1.728460282087326, + "epoch": 1.594216033616215, + "grad_norm": 0.6174923777580261, + "learning_rate": 3.8754848931234675e-06, + "loss": 1.3485, + "mean_token_accuracy": 0.6621581812699636, + "num_tokens": 2434064870.0, + "step": 14512 + }, + { + "entropy": 1.686330407857895, + "epoch": 1.5943258905275877, + "grad_norm": 0.7254580855369568, + "learning_rate": 3.8745066792247535e-06, + "loss": 1.4355, + "mean_token_accuracy": 0.6480583598216375, + "num_tokens": 2434250206.0, + "step": 14513 + }, + { + "entropy": 1.712296078602473, + "epoch": 1.5944357474389608, + "grad_norm": 0.604725182056427, + "learning_rate": 3.873528690844691e-06, + "loss": 1.6318, + "mean_token_accuracy": 0.6211173211534818, + "num_tokens": 2434550589.0, + "step": 14514 + }, + { + "entropy": 1.736443022886912, + "epoch": 1.5945456043503337, + "grad_norm": 0.7887392640113831, + "learning_rate": 3.872550928014233e-06, + "loss": 1.4507, + "mean_token_accuracy": 0.6484967370827993, + "num_tokens": 2434689712.0, + "step": 14515 + }, + { + "entropy": 1.6901069581508636, + "epoch": 1.5946554612617065, + "grad_norm": 0.6353496313095093, + "learning_rate": 3.871573390764326e-06, + "loss": 1.3398, + "mean_token_accuracy": 0.664913609623909, + "num_tokens": 2434821889.0, + "step": 14516 + }, + { + "entropy": 1.737305094798406, + "epoch": 1.5947653181730796, + "grad_norm": 0.7598763704299927, + "learning_rate": 3.870596079125911e-06, + "loss": 1.6029, + "mean_token_accuracy": 0.6363438367843628, + "num_tokens": 2434981702.0, + "step": 14517 + }, + { + "entropy": 1.713841090599696, + "epoch": 1.5948751750844525, + "grad_norm": 0.7490180134773254, + "learning_rate": 3.869618993129919e-06, + "loss": 1.4271, + "mean_token_accuracy": 0.6571142375469208, + "num_tokens": 2435130128.0, + "step": 14518 + }, + { + "entropy": 1.7111783623695374, + "epoch": 1.5949850319958254, + "grad_norm": 0.6395829916000366, + "learning_rate": 3.868642132807268e-06, + "loss": 1.5478, + "mean_token_accuracy": 0.6382935494184494, + "num_tokens": 2435341990.0, + "step": 14519 + }, + { + "entropy": 1.7345607578754425, + "epoch": 1.5950948889071985, + "grad_norm": 0.6980369687080383, + "learning_rate": 3.8676654981888835e-06, + "loss": 1.453, + "mean_token_accuracy": 0.6387426902850469, + "num_tokens": 2435519614.0, + "step": 14520 + }, + { + "entropy": 1.657178282737732, + "epoch": 1.5952047458185712, + "grad_norm": 0.694017231464386, + "learning_rate": 3.866689089305671e-06, + "loss": 1.4342, + "mean_token_accuracy": 0.6631875882546107, + "num_tokens": 2435709231.0, + "step": 14521 + }, + { + "entropy": 1.6879088878631592, + "epoch": 1.5953146027299443, + "grad_norm": 0.7356616854667664, + "learning_rate": 3.865712906188535e-06, + "loss": 1.3698, + "mean_token_accuracy": 0.6680235962073008, + "num_tokens": 2435913341.0, + "step": 14522 + }, + { + "entropy": 1.6702347894509633, + "epoch": 1.5954244596413172, + "grad_norm": 0.6752198338508606, + "learning_rate": 3.8647369488683725e-06, + "loss": 1.3957, + "mean_token_accuracy": 0.6607193052768707, + "num_tokens": 2436123537.0, + "step": 14523 + }, + { + "entropy": 1.7805339296658833, + "epoch": 1.59553431655269, + "grad_norm": 0.6627401113510132, + "learning_rate": 3.863761217376066e-06, + "loss": 1.4888, + "mean_token_accuracy": 0.6396598418553671, + "num_tokens": 2436306893.0, + "step": 14524 + }, + { + "entropy": 1.6623384753863018, + "epoch": 1.5956441734640632, + "grad_norm": 0.702000617980957, + "learning_rate": 3.862785711742505e-06, + "loss": 1.4121, + "mean_token_accuracy": 0.669417624672254, + "num_tokens": 2436502954.0, + "step": 14525 + }, + { + "entropy": 1.6859069367249806, + "epoch": 1.5957540303754358, + "grad_norm": 0.6549301743507385, + "learning_rate": 3.861810431998561e-06, + "loss": 1.3679, + "mean_token_accuracy": 0.6617111215988795, + "num_tokens": 2436663539.0, + "step": 14526 + }, + { + "entropy": 1.7175857424736023, + "epoch": 1.595863887286809, + "grad_norm": 0.6823814511299133, + "learning_rate": 3.860835378175095e-06, + "loss": 1.3735, + "mean_token_accuracy": 0.6544593870639801, + "num_tokens": 2436807795.0, + "step": 14527 + }, + { + "entropy": 1.638688455025355, + "epoch": 1.5959737441981818, + "grad_norm": 0.7116491794586182, + "learning_rate": 3.859860550302975e-06, + "loss": 1.4848, + "mean_token_accuracy": 0.6531452437241873, + "num_tokens": 2436952648.0, + "step": 14528 + }, + { + "entropy": 1.8078931868076324, + "epoch": 1.5960836011095547, + "grad_norm": 0.7018418312072754, + "learning_rate": 3.858885948413053e-06, + "loss": 1.4072, + "mean_token_accuracy": 0.6488349239031473, + "num_tokens": 2437119533.0, + "step": 14529 + }, + { + "entropy": 1.682287057240804, + "epoch": 1.5961934580209278, + "grad_norm": 0.6741816401481628, + "learning_rate": 3.857911572536171e-06, + "loss": 1.4283, + "mean_token_accuracy": 0.6353452255328497, + "num_tokens": 2437396152.0, + "step": 14530 + }, + { + "entropy": 1.7795856694380443, + "epoch": 1.5963033149323007, + "grad_norm": 0.6997506022453308, + "learning_rate": 3.8569374227031685e-06, + "loss": 1.5904, + "mean_token_accuracy": 0.6392437120278677, + "num_tokens": 2437559139.0, + "step": 14531 + }, + { + "entropy": 1.6715736190478008, + "epoch": 1.5964131718436736, + "grad_norm": 0.7329205870628357, + "learning_rate": 3.855963498944881e-06, + "loss": 1.4854, + "mean_token_accuracy": 0.6508821298678716, + "num_tokens": 2437753550.0, + "step": 14532 + }, + { + "entropy": 1.7069322069485982, + "epoch": 1.5965230287550467, + "grad_norm": 0.6997405290603638, + "learning_rate": 3.854989801292126e-06, + "loss": 1.2177, + "mean_token_accuracy": 0.6853279570738474, + "num_tokens": 2437861866.0, + "step": 14533 + }, + { + "entropy": 1.7327150007088978, + "epoch": 1.5966328856664194, + "grad_norm": 0.8832059502601624, + "learning_rate": 3.854016329775727e-06, + "loss": 1.4684, + "mean_token_accuracy": 0.6543268064657847, + "num_tokens": 2438062320.0, + "step": 14534 + }, + { + "entropy": 1.7514649629592896, + "epoch": 1.5967427425777925, + "grad_norm": 0.5511615872383118, + "learning_rate": 3.853043084426491e-06, + "loss": 1.404, + "mean_token_accuracy": 0.6429890592892965, + "num_tokens": 2438275800.0, + "step": 14535 + }, + { + "entropy": 1.7066907584667206, + "epoch": 1.5968525994891654, + "grad_norm": 0.8445066213607788, + "learning_rate": 3.852070065275219e-06, + "loss": 1.1934, + "mean_token_accuracy": 0.6857452293237051, + "num_tokens": 2438398233.0, + "step": 14536 + }, + { + "entropy": 1.7238865693410237, + "epoch": 1.5969624564005382, + "grad_norm": 0.8096691966056824, + "learning_rate": 3.85109727235271e-06, + "loss": 1.3924, + "mean_token_accuracy": 0.6637987395127615, + "num_tokens": 2438542858.0, + "step": 14537 + }, + { + "entropy": 1.6195646623770397, + "epoch": 1.5970723133119114, + "grad_norm": 0.6838819980621338, + "learning_rate": 3.8501247056897516e-06, + "loss": 1.4494, + "mean_token_accuracy": 0.6555087268352509, + "num_tokens": 2438704318.0, + "step": 14538 + }, + { + "entropy": 1.7541901965936024, + "epoch": 1.597182170223284, + "grad_norm": 0.7576407790184021, + "learning_rate": 3.849152365317122e-06, + "loss": 1.4994, + "mean_token_accuracy": 0.6444768408934275, + "num_tokens": 2438875912.0, + "step": 14539 + }, + { + "entropy": 1.7162339687347412, + "epoch": 1.5972920271346571, + "grad_norm": 0.7209724187850952, + "learning_rate": 3.848180251265598e-06, + "loss": 1.509, + "mean_token_accuracy": 0.6356743176778158, + "num_tokens": 2439050609.0, + "step": 14540 + }, + { + "entropy": 1.69766765832901, + "epoch": 1.59740188404603, + "grad_norm": 0.7323725819587708, + "learning_rate": 3.847208363565948e-06, + "loss": 1.2303, + "mean_token_accuracy": 0.681826040148735, + "num_tokens": 2439160195.0, + "step": 14541 + }, + { + "entropy": 1.7064706285794575, + "epoch": 1.597511740957403, + "grad_norm": 0.7907741665840149, + "learning_rate": 3.84623670224893e-06, + "loss": 1.2365, + "mean_token_accuracy": 0.673534115155538, + "num_tokens": 2439296809.0, + "step": 14542 + }, + { + "entropy": 1.7028611103693645, + "epoch": 1.597621597868776, + "grad_norm": 0.6134085059165955, + "learning_rate": 3.845265267345295e-06, + "loss": 1.3897, + "mean_token_accuracy": 0.6668266952037811, + "num_tokens": 2439444265.0, + "step": 14543 + }, + { + "entropy": 1.7380881508191426, + "epoch": 1.5977314547801489, + "grad_norm": 0.668287456035614, + "learning_rate": 3.844294058885793e-06, + "loss": 1.3247, + "mean_token_accuracy": 0.6577390929063162, + "num_tokens": 2439575203.0, + "step": 14544 + }, + { + "entropy": 1.7180090347925823, + "epoch": 1.5978413116915218, + "grad_norm": 0.6699938774108887, + "learning_rate": 3.843323076901159e-06, + "loss": 1.3547, + "mean_token_accuracy": 0.6593746840953827, + "num_tokens": 2439717026.0, + "step": 14545 + }, + { + "entropy": 1.7316296100616455, + "epoch": 1.5979511686028949, + "grad_norm": 0.7291305661201477, + "learning_rate": 3.842352321422122e-06, + "loss": 1.5341, + "mean_token_accuracy": 0.6648634423812231, + "num_tokens": 2439859920.0, + "step": 14546 + }, + { + "entropy": 1.7074800829092662, + "epoch": 1.5980610255142675, + "grad_norm": 0.7385088205337524, + "learning_rate": 3.841381792479412e-06, + "loss": 1.4818, + "mean_token_accuracy": 0.641268327832222, + "num_tokens": 2440079130.0, + "step": 14547 + }, + { + "entropy": 1.7116615772247314, + "epoch": 1.5981708824256406, + "grad_norm": 0.6778597831726074, + "learning_rate": 3.840411490103739e-06, + "loss": 1.4609, + "mean_token_accuracy": 0.6559257407983144, + "num_tokens": 2440223981.0, + "step": 14548 + }, + { + "entropy": 1.7189983030160267, + "epoch": 1.5982807393370135, + "grad_norm": 0.5723182559013367, + "learning_rate": 3.83944141432582e-06, + "loss": 1.437, + "mean_token_accuracy": 0.6461120347181956, + "num_tokens": 2440395440.0, + "step": 14549 + }, + { + "entropy": 1.7937857309977214, + "epoch": 1.5983905962483864, + "grad_norm": 1.1871225833892822, + "learning_rate": 3.838471565176353e-06, + "loss": 1.5759, + "mean_token_accuracy": 0.6486638983090719, + "num_tokens": 2440517170.0, + "step": 14550 + }, + { + "entropy": 1.674616406361262, + "epoch": 1.5985004531597595, + "grad_norm": 0.5636810064315796, + "learning_rate": 3.837501942686031e-06, + "loss": 1.2935, + "mean_token_accuracy": 0.6660540401935577, + "num_tokens": 2440713057.0, + "step": 14551 + }, + { + "entropy": 1.5875491201877594, + "epoch": 1.5986103100711322, + "grad_norm": 0.7079348564147949, + "learning_rate": 3.836532546885546e-06, + "loss": 1.3004, + "mean_token_accuracy": 0.6757344851891199, + "num_tokens": 2440867258.0, + "step": 14552 + }, + { + "entropy": 1.7332369486490886, + "epoch": 1.5987201669825053, + "grad_norm": 0.6799494624137878, + "learning_rate": 3.83556337780558e-06, + "loss": 1.4675, + "mean_token_accuracy": 0.6664691468079885, + "num_tokens": 2441018581.0, + "step": 14553 + }, + { + "entropy": 1.7250041862328847, + "epoch": 1.5988300238938782, + "grad_norm": 0.6802073121070862, + "learning_rate": 3.834594435476805e-06, + "loss": 1.4498, + "mean_token_accuracy": 0.6497951696316401, + "num_tokens": 2441215526.0, + "step": 14554 + }, + { + "entropy": 1.6998497645060222, + "epoch": 1.598939880805251, + "grad_norm": 0.8127340078353882, + "learning_rate": 3.8336257199298845e-06, + "loss": 1.4345, + "mean_token_accuracy": 0.6507293184598287, + "num_tokens": 2441379883.0, + "step": 14555 + }, + { + "entropy": 1.7047206560770671, + "epoch": 1.5990497377166242, + "grad_norm": 0.6116032004356384, + "learning_rate": 3.832657231195483e-06, + "loss": 1.4392, + "mean_token_accuracy": 0.6522220075130463, + "num_tokens": 2441556694.0, + "step": 14556 + }, + { + "entropy": 1.6925352116425831, + "epoch": 1.599159594627997, + "grad_norm": 0.6775454878807068, + "learning_rate": 3.83168896930425e-06, + "loss": 1.3527, + "mean_token_accuracy": 0.6680527776479721, + "num_tokens": 2441706254.0, + "step": 14557 + }, + { + "entropy": 1.695090264081955, + "epoch": 1.59926945153937, + "grad_norm": 0.633358359336853, + "learning_rate": 3.8307209342868294e-06, + "loss": 1.3081, + "mean_token_accuracy": 0.664101724823316, + "num_tokens": 2441841173.0, + "step": 14558 + }, + { + "entropy": 1.70580060283343, + "epoch": 1.599379308450743, + "grad_norm": 0.7494609951972961, + "learning_rate": 3.8297531261738626e-06, + "loss": 1.37, + "mean_token_accuracy": 0.6554321199655533, + "num_tokens": 2441999550.0, + "step": 14559 + }, + { + "entropy": 1.682591011126836, + "epoch": 1.5994891653621157, + "grad_norm": 0.6495206952095032, + "learning_rate": 3.828785544995977e-06, + "loss": 1.4213, + "mean_token_accuracy": 0.6555017977952957, + "num_tokens": 2442176523.0, + "step": 14560 + }, + { + "entropy": 1.7068423926830292, + "epoch": 1.5995990222734888, + "grad_norm": 0.7015541195869446, + "learning_rate": 3.827818190783799e-06, + "loss": 1.3814, + "mean_token_accuracy": 0.6704583317041397, + "num_tokens": 2442314778.0, + "step": 14561 + }, + { + "entropy": 1.6887734134991963, + "epoch": 1.5997088791848617, + "grad_norm": 0.5791016817092896, + "learning_rate": 3.826851063567943e-06, + "loss": 1.4871, + "mean_token_accuracy": 0.6437655538320541, + "num_tokens": 2442514913.0, + "step": 14562 + }, + { + "entropy": 1.7405428489049275, + "epoch": 1.5998187360962346, + "grad_norm": 0.7147680521011353, + "learning_rate": 3.825884163379017e-06, + "loss": 1.3649, + "mean_token_accuracy": 0.6673022856314977, + "num_tokens": 2442648604.0, + "step": 14563 + }, + { + "entropy": 1.7529491583506267, + "epoch": 1.5999285930076077, + "grad_norm": 0.5654240250587463, + "learning_rate": 3.824917490247625e-06, + "loss": 1.4984, + "mean_token_accuracy": 0.6382785141468048, + "num_tokens": 2442869846.0, + "step": 14564 + }, + { + "entropy": 1.6878890097141266, + "epoch": 1.6000384499189804, + "grad_norm": 0.7083843350410461, + "learning_rate": 3.823951044204361e-06, + "loss": 1.3488, + "mean_token_accuracy": 0.6723784406979879, + "num_tokens": 2443030206.0, + "step": 14565 + }, + { + "entropy": 1.7418665091196697, + "epoch": 1.6001483068303535, + "grad_norm": 0.6479423642158508, + "learning_rate": 3.822984825279814e-06, + "loss": 1.3768, + "mean_token_accuracy": 0.6521689047416052, + "num_tokens": 2443196472.0, + "step": 14566 + }, + { + "entropy": 1.7459342181682587, + "epoch": 1.6002581637417264, + "grad_norm": 0.758888304233551, + "learning_rate": 3.822018833504564e-06, + "loss": 1.2715, + "mean_token_accuracy": 0.6687377045551935, + "num_tokens": 2443326770.0, + "step": 14567 + }, + { + "entropy": 1.6872256497542064, + "epoch": 1.6003680206530992, + "grad_norm": 0.5410403609275818, + "learning_rate": 3.821053068909182e-06, + "loss": 1.4807, + "mean_token_accuracy": 0.6485924671093622, + "num_tokens": 2443565031.0, + "step": 14568 + }, + { + "entropy": 1.6879334946473439, + "epoch": 1.6004778775644724, + "grad_norm": 0.68756103515625, + "learning_rate": 3.820087531524236e-06, + "loss": 1.3248, + "mean_token_accuracy": 0.6751703520615896, + "num_tokens": 2443702467.0, + "step": 14569 + }, + { + "entropy": 1.7088148792584736, + "epoch": 1.6005877344758452, + "grad_norm": 0.6238970756530762, + "learning_rate": 3.819122221380284e-06, + "loss": 1.3777, + "mean_token_accuracy": 0.6487255543470383, + "num_tokens": 2443858226.0, + "step": 14570 + }, + { + "entropy": 1.7447780867417653, + "epoch": 1.6006975913872181, + "grad_norm": 0.7313827872276306, + "learning_rate": 3.818157138507878e-06, + "loss": 1.4983, + "mean_token_accuracy": 0.6436322331428528, + "num_tokens": 2444051433.0, + "step": 14571 + }, + { + "entropy": 1.6420711676279705, + "epoch": 1.6008074482985912, + "grad_norm": 0.7198586463928223, + "learning_rate": 3.817192282937561e-06, + "loss": 1.4051, + "mean_token_accuracy": 0.6673461546500524, + "num_tokens": 2444215820.0, + "step": 14572 + }, + { + "entropy": 1.7294553816318512, + "epoch": 1.600917305209964, + "grad_norm": 0.7166528105735779, + "learning_rate": 3.816227654699873e-06, + "loss": 1.5608, + "mean_token_accuracy": 0.6288095712661743, + "num_tokens": 2444425376.0, + "step": 14573 + }, + { + "entropy": 1.750697026650111, + "epoch": 1.601027162121337, + "grad_norm": 0.6765901446342468, + "learning_rate": 3.815263253825344e-06, + "loss": 1.3493, + "mean_token_accuracy": 0.6524553100268046, + "num_tokens": 2444541889.0, + "step": 14574 + }, + { + "entropy": 1.749423881371816, + "epoch": 1.6011370190327099, + "grad_norm": 0.7044322490692139, + "learning_rate": 3.8142990803444935e-06, + "loss": 1.4577, + "mean_token_accuracy": 0.6440982123215994, + "num_tokens": 2444719105.0, + "step": 14575 + }, + { + "entropy": 1.7084167798360188, + "epoch": 1.6012468759440828, + "grad_norm": 0.7361584305763245, + "learning_rate": 3.8133351342878393e-06, + "loss": 1.5912, + "mean_token_accuracy": 0.6365940769513448, + "num_tokens": 2444993446.0, + "step": 14576 + }, + { + "entropy": 1.7147212425867717, + "epoch": 1.6013567328554559, + "grad_norm": 0.6779691576957703, + "learning_rate": 3.8123714156858886e-06, + "loss": 1.2403, + "mean_token_accuracy": 0.6808453897635142, + "num_tokens": 2445124199.0, + "step": 14577 + }, + { + "entropy": 1.671504944562912, + "epoch": 1.6014665897668285, + "grad_norm": 0.6467366814613342, + "learning_rate": 3.8114079245691473e-06, + "loss": 1.3685, + "mean_token_accuracy": 0.6617433130741119, + "num_tokens": 2445279228.0, + "step": 14578 + }, + { + "entropy": 1.7411832610766094, + "epoch": 1.6015764466782016, + "grad_norm": 0.6643005609512329, + "learning_rate": 3.810444660968104e-06, + "loss": 1.4469, + "mean_token_accuracy": 0.6542666604121526, + "num_tokens": 2445408287.0, + "step": 14579 + }, + { + "entropy": 1.605327715476354, + "epoch": 1.6016863035895745, + "grad_norm": 0.6111404299736023, + "learning_rate": 3.809481624913246e-06, + "loss": 1.3451, + "mean_token_accuracy": 0.6663858542839686, + "num_tokens": 2445582266.0, + "step": 14580 + }, + { + "entropy": 1.7256748775641124, + "epoch": 1.6017961605009474, + "grad_norm": 0.6152947545051575, + "learning_rate": 3.8085188164350574e-06, + "loss": 1.3795, + "mean_token_accuracy": 0.651614765326182, + "num_tokens": 2445723827.0, + "step": 14581 + }, + { + "entropy": 1.7151028116544087, + "epoch": 1.6019060174123205, + "grad_norm": 0.6862490773200989, + "learning_rate": 3.8075562355640066e-06, + "loss": 1.3588, + "mean_token_accuracy": 0.6671392222245535, + "num_tokens": 2445884515.0, + "step": 14582 + }, + { + "entropy": 1.7010668416817982, + "epoch": 1.6020158743236934, + "grad_norm": 0.6362306475639343, + "learning_rate": 3.806593882330558e-06, + "loss": 1.5877, + "mean_token_accuracy": 0.6236594518025717, + "num_tokens": 2446102916.0, + "step": 14583 + }, + { + "entropy": 1.7054578860600789, + "epoch": 1.6021257312350663, + "grad_norm": 0.686418890953064, + "learning_rate": 3.8056317567651723e-06, + "loss": 1.4425, + "mean_token_accuracy": 0.6407395700613657, + "num_tokens": 2446306735.0, + "step": 14584 + }, + { + "entropy": 1.720127671957016, + "epoch": 1.6022355881464394, + "grad_norm": 0.706402599811554, + "learning_rate": 3.804669858898301e-06, + "loss": 1.4191, + "mean_token_accuracy": 0.6532514144976934, + "num_tokens": 2446492829.0, + "step": 14585 + }, + { + "entropy": 1.673651397228241, + "epoch": 1.602345445057812, + "grad_norm": 0.6456170678138733, + "learning_rate": 3.803708188760387e-06, + "loss": 1.3936, + "mean_token_accuracy": 0.6565722078084946, + "num_tokens": 2446673307.0, + "step": 14586 + }, + { + "entropy": 1.7640669147173564, + "epoch": 1.6024553019691852, + "grad_norm": 0.7029922604560852, + "learning_rate": 3.8027467463818636e-06, + "loss": 1.4321, + "mean_token_accuracy": 0.6552887161572775, + "num_tokens": 2446854256.0, + "step": 14587 + }, + { + "entropy": 1.7160078982512157, + "epoch": 1.602565158880558, + "grad_norm": 0.6183205842971802, + "learning_rate": 3.801785531793164e-06, + "loss": 1.5031, + "mean_token_accuracy": 0.6445142378409704, + "num_tokens": 2447091822.0, + "step": 14588 + }, + { + "entropy": 1.7458133002122243, + "epoch": 1.602675015791931, + "grad_norm": 0.6344084739685059, + "learning_rate": 3.8008245450247085e-06, + "loss": 1.4217, + "mean_token_accuracy": 0.6480998347202936, + "num_tokens": 2447275539.0, + "step": 14589 + }, + { + "entropy": 1.7621017297108967, + "epoch": 1.602784872703304, + "grad_norm": 0.6531854867935181, + "learning_rate": 3.799863786106912e-06, + "loss": 1.4966, + "mean_token_accuracy": 0.6346394668022791, + "num_tokens": 2447496350.0, + "step": 14590 + }, + { + "entropy": 1.6988888482252757, + "epoch": 1.602894729614677, + "grad_norm": 0.6750460267066956, + "learning_rate": 3.798903255070184e-06, + "loss": 1.4873, + "mean_token_accuracy": 0.6548330287138621, + "num_tokens": 2447678997.0, + "step": 14591 + }, + { + "entropy": 1.6446965634822845, + "epoch": 1.6030045865260498, + "grad_norm": 0.7351519465446472, + "learning_rate": 3.79794295194492e-06, + "loss": 1.3521, + "mean_token_accuracy": 0.6712757696708044, + "num_tokens": 2447824026.0, + "step": 14592 + }, + { + "entropy": 1.701996664206187, + "epoch": 1.6031144434374227, + "grad_norm": 0.6086641550064087, + "learning_rate": 3.796982876761518e-06, + "loss": 1.3741, + "mean_token_accuracy": 0.6470549603303274, + "num_tokens": 2448040359.0, + "step": 14593 + }, + { + "entropy": 1.7051210800806682, + "epoch": 1.6032243003487956, + "grad_norm": 0.6576172113418579, + "learning_rate": 3.7960230295503636e-06, + "loss": 1.4443, + "mean_token_accuracy": 0.6446033616860708, + "num_tokens": 2448249961.0, + "step": 14594 + }, + { + "entropy": 1.6408964693546295, + "epoch": 1.6033341572601687, + "grad_norm": 0.6409726738929749, + "learning_rate": 3.7950634103418307e-06, + "loss": 1.3604, + "mean_token_accuracy": 0.6675408234198889, + "num_tokens": 2448420205.0, + "step": 14595 + }, + { + "entropy": 1.7251879175504048, + "epoch": 1.6034440141715416, + "grad_norm": 0.5918386578559875, + "learning_rate": 3.7941040191662943e-06, + "loss": 1.5234, + "mean_token_accuracy": 0.6395444025595983, + "num_tokens": 2448625696.0, + "step": 14596 + }, + { + "entropy": 1.6319251755873363, + "epoch": 1.6035538710829145, + "grad_norm": 0.9376781582832336, + "learning_rate": 3.793144856054122e-06, + "loss": 1.3813, + "mean_token_accuracy": 0.6574216683705648, + "num_tokens": 2448787918.0, + "step": 14597 + }, + { + "entropy": 1.6945286591847737, + "epoch": 1.6036637279942876, + "grad_norm": 0.7240238785743713, + "learning_rate": 3.7921859210356664e-06, + "loss": 1.4856, + "mean_token_accuracy": 0.6500385651985804, + "num_tokens": 2448973971.0, + "step": 14598 + }, + { + "entropy": 1.6881266335646312, + "epoch": 1.6037735849056602, + "grad_norm": 0.6214616298675537, + "learning_rate": 3.7912272141412767e-06, + "loss": 1.3828, + "mean_token_accuracy": 0.6629806409279505, + "num_tokens": 2449114059.0, + "step": 14599 + }, + { + "entropy": 1.6800266206264496, + "epoch": 1.6038834418170334, + "grad_norm": 0.8281370401382446, + "learning_rate": 3.7902687354012998e-06, + "loss": 1.4063, + "mean_token_accuracy": 0.6595364113648733, + "num_tokens": 2449278084.0, + "step": 14600 + }, + { + "entropy": 1.739363302787145, + "epoch": 1.6039932987284062, + "grad_norm": 0.6671850085258484, + "learning_rate": 3.789310484846065e-06, + "loss": 1.4597, + "mean_token_accuracy": 0.6446760495503744, + "num_tokens": 2449536151.0, + "step": 14601 + }, + { + "entropy": 1.6971477965513866, + "epoch": 1.6041031556397791, + "grad_norm": 0.7448716163635254, + "learning_rate": 3.7883524625059075e-06, + "loss": 1.2988, + "mean_token_accuracy": 0.6720754504203796, + "num_tokens": 2449693777.0, + "step": 14602 + }, + { + "entropy": 1.7345844606558483, + "epoch": 1.6042130125511522, + "grad_norm": 0.7584978342056274, + "learning_rate": 3.7873946684111452e-06, + "loss": 1.2274, + "mean_token_accuracy": 0.6714150657256445, + "num_tokens": 2449794928.0, + "step": 14603 + }, + { + "entropy": 1.7422133386135101, + "epoch": 1.6043228694625251, + "grad_norm": 0.7101638317108154, + "learning_rate": 3.78643710259209e-06, + "loss": 1.3554, + "mean_token_accuracy": 0.6665351639191309, + "num_tokens": 2449911300.0, + "step": 14604 + }, + { + "entropy": 1.7503991921742756, + "epoch": 1.604432726373898, + "grad_norm": 0.716066837310791, + "learning_rate": 3.78547976507905e-06, + "loss": 1.2696, + "mean_token_accuracy": 0.6812546650568644, + "num_tokens": 2450077891.0, + "step": 14605 + }, + { + "entropy": 1.7393498420715332, + "epoch": 1.6045425832852709, + "grad_norm": 0.7485668659210205, + "learning_rate": 3.7845226559023256e-06, + "loss": 1.3663, + "mean_token_accuracy": 0.6589196572701136, + "num_tokens": 2450221382.0, + "step": 14606 + }, + { + "entropy": 1.73830442627271, + "epoch": 1.6046524401966438, + "grad_norm": 0.6657488346099854, + "learning_rate": 3.783565775092206e-06, + "loss": 1.4914, + "mean_token_accuracy": 0.6288889646530151, + "num_tokens": 2450418829.0, + "step": 14607 + }, + { + "entropy": 1.659916838010152, + "epoch": 1.6047622971080169, + "grad_norm": 0.7344122529029846, + "learning_rate": 3.7826091226789772e-06, + "loss": 1.4672, + "mean_token_accuracy": 0.6499627828598022, + "num_tokens": 2450595410.0, + "step": 14608 + }, + { + "entropy": 1.7361893852551777, + "epoch": 1.6048721540193898, + "grad_norm": 0.7069867253303528, + "learning_rate": 3.7816526986929203e-06, + "loss": 1.3449, + "mean_token_accuracy": 0.657584935426712, + "num_tokens": 2450744883.0, + "step": 14609 + }, + { + "entropy": 1.7976744870344799, + "epoch": 1.6049820109307626, + "grad_norm": 0.6963937878608704, + "learning_rate": 3.780696503164303e-06, + "loss": 1.5181, + "mean_token_accuracy": 0.6381178746620814, + "num_tokens": 2450900651.0, + "step": 14610 + }, + { + "entropy": 1.6914178828398387, + "epoch": 1.6050918678421358, + "grad_norm": 0.6201428771018982, + "learning_rate": 3.7797405361233853e-06, + "loss": 1.5151, + "mean_token_accuracy": 0.6491784354050955, + "num_tokens": 2451111106.0, + "step": 14611 + }, + { + "entropy": 1.6507742206255596, + "epoch": 1.6052017247535084, + "grad_norm": 0.737235963344574, + "learning_rate": 3.7787847976004277e-06, + "loss": 1.2467, + "mean_token_accuracy": 0.6868196477492651, + "num_tokens": 2451234221.0, + "step": 14612 + }, + { + "entropy": 1.661937306324641, + "epoch": 1.6053115816648815, + "grad_norm": 0.6396856904029846, + "learning_rate": 3.7778292876256762e-06, + "loss": 1.4216, + "mean_token_accuracy": 0.6528457701206207, + "num_tokens": 2451452229.0, + "step": 14613 + }, + { + "entropy": 1.6698509057362874, + "epoch": 1.6054214385762544, + "grad_norm": 0.7439182996749878, + "learning_rate": 3.776874006229376e-06, + "loss": 1.3751, + "mean_token_accuracy": 0.6656199296315511, + "num_tokens": 2451611210.0, + "step": 14614 + }, + { + "entropy": 1.7334049840768178, + "epoch": 1.6055312954876273, + "grad_norm": 0.7342074513435364, + "learning_rate": 3.7759189534417575e-06, + "loss": 1.3407, + "mean_token_accuracy": 0.6541020025809606, + "num_tokens": 2451748585.0, + "step": 14615 + }, + { + "entropy": 1.6244231363137562, + "epoch": 1.6056411523990004, + "grad_norm": 0.6952174305915833, + "learning_rate": 3.774964129293046e-06, + "loss": 1.3148, + "mean_token_accuracy": 0.6725502957900366, + "num_tokens": 2451912029.0, + "step": 14616 + }, + { + "entropy": 1.6656245787938435, + "epoch": 1.6057510093103733, + "grad_norm": 0.6574463248252869, + "learning_rate": 3.7740095338134684e-06, + "loss": 1.3002, + "mean_token_accuracy": 0.67449023326238, + "num_tokens": 2452048004.0, + "step": 14617 + }, + { + "entropy": 1.643865704536438, + "epoch": 1.6058608662217462, + "grad_norm": 0.814515233039856, + "learning_rate": 3.7730551670332317e-06, + "loss": 1.4194, + "mean_token_accuracy": 0.6575757165749868, + "num_tokens": 2452173933.0, + "step": 14618 + }, + { + "entropy": 1.7123675048351288, + "epoch": 1.605970723133119, + "grad_norm": 0.6425331830978394, + "learning_rate": 3.7721010289825398e-06, + "loss": 1.3976, + "mean_token_accuracy": 0.6674291491508484, + "num_tokens": 2452314391.0, + "step": 14619 + }, + { + "entropy": 1.688428372144699, + "epoch": 1.606080580044492, + "grad_norm": 0.6733593344688416, + "learning_rate": 3.771147119691595e-06, + "loss": 1.3977, + "mean_token_accuracy": 0.6623414903879166, + "num_tokens": 2452466205.0, + "step": 14620 + }, + { + "entropy": 1.7384653389453888, + "epoch": 1.606190436955865, + "grad_norm": 0.7183213829994202, + "learning_rate": 3.7701934391905883e-06, + "loss": 1.5537, + "mean_token_accuracy": 0.6303740590810776, + "num_tokens": 2452659090.0, + "step": 14621 + }, + { + "entropy": 1.698512186606725, + "epoch": 1.606300293867238, + "grad_norm": 0.6138864755630493, + "learning_rate": 3.769239987509701e-06, + "loss": 1.4726, + "mean_token_accuracy": 0.6334889431794485, + "num_tokens": 2452852526.0, + "step": 14622 + }, + { + "entropy": 1.7259888648986816, + "epoch": 1.6064101507786108, + "grad_norm": 0.6284215450286865, + "learning_rate": 3.768286764679109e-06, + "loss": 1.3779, + "mean_token_accuracy": 0.652561808625857, + "num_tokens": 2453015973.0, + "step": 14623 + }, + { + "entropy": 1.7250304917494457, + "epoch": 1.606520007689984, + "grad_norm": 0.6159952282905579, + "learning_rate": 3.767333770728981e-06, + "loss": 1.3785, + "mean_token_accuracy": 0.6558371136585871, + "num_tokens": 2453166510.0, + "step": 14624 + }, + { + "entropy": 1.7731029192606609, + "epoch": 1.6066298646013566, + "grad_norm": 0.7708766460418701, + "learning_rate": 3.766381005689481e-06, + "loss": 1.5243, + "mean_token_accuracy": 0.63959468404452, + "num_tokens": 2453377002.0, + "step": 14625 + }, + { + "entropy": 1.7134467959403992, + "epoch": 1.6067397215127297, + "grad_norm": 0.6083643436431885, + "learning_rate": 3.7654284695907638e-06, + "loss": 1.3713, + "mean_token_accuracy": 0.666517436504364, + "num_tokens": 2453534002.0, + "step": 14626 + }, + { + "entropy": 1.695216139157613, + "epoch": 1.6068495784241026, + "grad_norm": 0.8802637457847595, + "learning_rate": 3.7644761624629745e-06, + "loss": 1.2867, + "mean_token_accuracy": 0.6799248705307642, + "num_tokens": 2453662704.0, + "step": 14627 + }, + { + "entropy": 1.7392724752426147, + "epoch": 1.6069594353354755, + "grad_norm": 0.8451277613639832, + "learning_rate": 3.763524084336252e-06, + "loss": 1.4013, + "mean_token_accuracy": 0.6632727136214575, + "num_tokens": 2453811717.0, + "step": 14628 + }, + { + "entropy": 1.727136602004369, + "epoch": 1.6070692922468486, + "grad_norm": 0.7121945023536682, + "learning_rate": 3.7625722352407348e-06, + "loss": 1.3258, + "mean_token_accuracy": 0.6634857207536697, + "num_tokens": 2453989012.0, + "step": 14629 + }, + { + "entropy": 1.629335989554723, + "epoch": 1.6071791491582215, + "grad_norm": 1.1426151990890503, + "learning_rate": 3.761620615206544e-06, + "loss": 1.4052, + "mean_token_accuracy": 0.660635307431221, + "num_tokens": 2454190517.0, + "step": 14630 + }, + { + "entropy": 1.6815782884756725, + "epoch": 1.6072890060695944, + "grad_norm": 0.7410414814949036, + "learning_rate": 3.760669224263798e-06, + "loss": 1.387, + "mean_token_accuracy": 0.6496629069248835, + "num_tokens": 2454362336.0, + "step": 14631 + }, + { + "entropy": 1.7174339493115742, + "epoch": 1.6073988629809672, + "grad_norm": 0.7126834988594055, + "learning_rate": 3.7597180624426106e-06, + "loss": 1.4129, + "mean_token_accuracy": 0.6453147878249487, + "num_tokens": 2454534467.0, + "step": 14632 + }, + { + "entropy": 1.7280255556106567, + "epoch": 1.6075087198923401, + "grad_norm": 0.9190917015075684, + "learning_rate": 3.7587671297730815e-06, + "loss": 1.4702, + "mean_token_accuracy": 0.6770395090182623, + "num_tokens": 2454693122.0, + "step": 14633 + }, + { + "entropy": 1.656565527121226, + "epoch": 1.6076185768037132, + "grad_norm": 0.7317885160446167, + "learning_rate": 3.7578164262853132e-06, + "loss": 1.5353, + "mean_token_accuracy": 0.6430183400710424, + "num_tokens": 2454923455.0, + "step": 14634 + }, + { + "entropy": 1.6855885187784831, + "epoch": 1.6077284337150861, + "grad_norm": 0.7330460548400879, + "learning_rate": 3.7568659520093908e-06, + "loss": 1.487, + "mean_token_accuracy": 0.6602890988190969, + "num_tokens": 2455134200.0, + "step": 14635 + }, + { + "entropy": 1.6330851515134175, + "epoch": 1.607838290626459, + "grad_norm": 0.7137647867202759, + "learning_rate": 3.7559157069753944e-06, + "loss": 1.4943, + "mean_token_accuracy": 0.6433676034212112, + "num_tokens": 2455322690.0, + "step": 14636 + }, + { + "entropy": 1.7134460806846619, + "epoch": 1.607948147537832, + "grad_norm": 0.740168571472168, + "learning_rate": 3.7549656912134047e-06, + "loss": 1.2794, + "mean_token_accuracy": 0.6720482061306635, + "num_tokens": 2455461450.0, + "step": 14637 + }, + { + "entropy": 1.7948347826798756, + "epoch": 1.6080580044492048, + "grad_norm": 0.7866724729537964, + "learning_rate": 3.754015904753486e-06, + "loss": 1.5648, + "mean_token_accuracy": 0.6387749413649241, + "num_tokens": 2455632231.0, + "step": 14638 + }, + { + "entropy": 1.7541786233584087, + "epoch": 1.6081678613605779, + "grad_norm": 0.679871678352356, + "learning_rate": 3.7530663476256966e-06, + "loss": 1.3942, + "mean_token_accuracy": 0.6533337185780207, + "num_tokens": 2455793295.0, + "step": 14639 + }, + { + "entropy": 1.7643111447493236, + "epoch": 1.6082777182719508, + "grad_norm": 1.0173559188842773, + "learning_rate": 3.752117019860091e-06, + "loss": 1.4631, + "mean_token_accuracy": 0.6418692767620087, + "num_tokens": 2456022775.0, + "step": 14640 + }, + { + "entropy": 1.7089822093645732, + "epoch": 1.6083875751833236, + "grad_norm": 0.7529508471488953, + "learning_rate": 3.7511679214867193e-06, + "loss": 1.4893, + "mean_token_accuracy": 0.6491026779015859, + "num_tokens": 2456210046.0, + "step": 14641 + }, + { + "entropy": 1.7521416048208873, + "epoch": 1.6084974320946968, + "grad_norm": 0.7861169576644897, + "learning_rate": 3.750219052535616e-06, + "loss": 1.4302, + "mean_token_accuracy": 0.6570564558108648, + "num_tokens": 2456370939.0, + "step": 14642 + }, + { + "entropy": 1.7036944031715393, + "epoch": 1.6086072890060696, + "grad_norm": 0.6966022253036499, + "learning_rate": 3.7492704130368103e-06, + "loss": 1.5472, + "mean_token_accuracy": 0.658886194229126, + "num_tokens": 2456574331.0, + "step": 14643 + }, + { + "entropy": 1.6719833314418793, + "epoch": 1.6087171459174425, + "grad_norm": 0.6288134455680847, + "learning_rate": 3.7483220030203305e-06, + "loss": 1.4149, + "mean_token_accuracy": 0.6646634787321091, + "num_tokens": 2456763691.0, + "step": 14644 + }, + { + "entropy": 1.659516602754593, + "epoch": 1.6088270028288156, + "grad_norm": 0.7653041481971741, + "learning_rate": 3.747373822516189e-06, + "loss": 1.2218, + "mean_token_accuracy": 0.6792045831680298, + "num_tokens": 2456919385.0, + "step": 14645 + }, + { + "entropy": 1.7163499097029369, + "epoch": 1.6089368597401883, + "grad_norm": 0.8180403709411621, + "learning_rate": 3.7464258715544023e-06, + "loss": 1.579, + "mean_token_accuracy": 0.6337461198369662, + "num_tokens": 2457093423.0, + "step": 14646 + }, + { + "entropy": 1.6461522082487743, + "epoch": 1.6090467166515614, + "grad_norm": 0.6194190979003906, + "learning_rate": 3.7454781501649674e-06, + "loss": 1.3822, + "mean_token_accuracy": 0.6567869633436203, + "num_tokens": 2457285536.0, + "step": 14647 + }, + { + "entropy": 1.752669632434845, + "epoch": 1.6091565735629343, + "grad_norm": 0.6721103191375732, + "learning_rate": 3.744530658377876e-06, + "loss": 1.4253, + "mean_token_accuracy": 0.6482215970754623, + "num_tokens": 2457409826.0, + "step": 14648 + }, + { + "entropy": 1.7362704177697499, + "epoch": 1.6092664304743072, + "grad_norm": 0.6712827682495117, + "learning_rate": 3.743583396223125e-06, + "loss": 1.57, + "mean_token_accuracy": 0.6405636916557947, + "num_tokens": 2457605817.0, + "step": 14649 + }, + { + "entropy": 1.6012630959351857, + "epoch": 1.6093762873856803, + "grad_norm": 0.6160146594047546, + "learning_rate": 3.7426363637306886e-06, + "loss": 1.2842, + "mean_token_accuracy": 0.6701871405045191, + "num_tokens": 2457774714.0, + "step": 14650 + }, + { + "entropy": 1.6824021935462952, + "epoch": 1.609486144297053, + "grad_norm": 0.5910770297050476, + "learning_rate": 3.741689560930538e-06, + "loss": 1.401, + "mean_token_accuracy": 0.6555624802907308, + "num_tokens": 2457985015.0, + "step": 14651 + }, + { + "entropy": 1.6749296089013417, + "epoch": 1.609596001208426, + "grad_norm": 0.696537435054779, + "learning_rate": 3.740742987852642e-06, + "loss": 1.3365, + "mean_token_accuracy": 0.6835020283857981, + "num_tokens": 2458114521.0, + "step": 14652 + }, + { + "entropy": 1.7262722849845886, + "epoch": 1.609705858119799, + "grad_norm": 0.6201340556144714, + "learning_rate": 3.7397966445269628e-06, + "loss": 1.4564, + "mean_token_accuracy": 0.6472860972086588, + "num_tokens": 2458323341.0, + "step": 14653 + }, + { + "entropy": 1.7126949429512024, + "epoch": 1.6098157150311718, + "grad_norm": 0.6349091529846191, + "learning_rate": 3.738850530983448e-06, + "loss": 1.4529, + "mean_token_accuracy": 0.6366288512945175, + "num_tokens": 2458517592.0, + "step": 14654 + }, + { + "entropy": 1.7143605947494507, + "epoch": 1.609925571942545, + "grad_norm": 0.7637413144111633, + "learning_rate": 3.737904647252039e-06, + "loss": 1.2987, + "mean_token_accuracy": 0.6668682942787806, + "num_tokens": 2458644660.0, + "step": 14655 + }, + { + "entropy": 1.6725689272085826, + "epoch": 1.6100354288539178, + "grad_norm": 0.754520058631897, + "learning_rate": 3.736958993362678e-06, + "loss": 1.2872, + "mean_token_accuracy": 0.6723710298538208, + "num_tokens": 2458794632.0, + "step": 14656 + }, + { + "entropy": 1.7350502808888753, + "epoch": 1.6101452857652907, + "grad_norm": 0.801001250743866, + "learning_rate": 3.73601356934529e-06, + "loss": 1.3272, + "mean_token_accuracy": 0.657276377081871, + "num_tokens": 2458944318.0, + "step": 14657 + }, + { + "entropy": 1.724273145198822, + "epoch": 1.6102551426766638, + "grad_norm": 0.788803219795227, + "learning_rate": 3.735068375229801e-06, + "loss": 1.3117, + "mean_token_accuracy": 0.6685859362284342, + "num_tokens": 2459069936.0, + "step": 14658 + }, + { + "entropy": 1.643745203812917, + "epoch": 1.6103649995880365, + "grad_norm": 0.7852417826652527, + "learning_rate": 3.7341234110461246e-06, + "loss": 1.3608, + "mean_token_accuracy": 0.6609650353590647, + "num_tokens": 2459214227.0, + "step": 14659 + }, + { + "entropy": 1.726538171370824, + "epoch": 1.6104748564994096, + "grad_norm": 0.6105849146842957, + "learning_rate": 3.7331786768241663e-06, + "loss": 1.4536, + "mean_token_accuracy": 0.6502714107433955, + "num_tokens": 2459388462.0, + "step": 14660 + }, + { + "entropy": 1.7064904570579529, + "epoch": 1.6105847134107825, + "grad_norm": 0.6803503632545471, + "learning_rate": 3.7322341725938314e-06, + "loss": 1.396, + "mean_token_accuracy": 0.658082311352094, + "num_tokens": 2459570588.0, + "step": 14661 + }, + { + "entropy": 1.7269649803638458, + "epoch": 1.6106945703221554, + "grad_norm": 0.6450273394584656, + "learning_rate": 3.7312898983850084e-06, + "loss": 1.6308, + "mean_token_accuracy": 0.628805602590243, + "num_tokens": 2459740581.0, + "step": 14662 + }, + { + "entropy": 1.7392071982224782, + "epoch": 1.6108044272335285, + "grad_norm": 0.6870610117912292, + "learning_rate": 3.7303458542275827e-06, + "loss": 1.4163, + "mean_token_accuracy": 0.6702167640129725, + "num_tokens": 2459875754.0, + "step": 14663 + }, + { + "entropy": 1.7333300908406575, + "epoch": 1.6109142841449011, + "grad_norm": 0.8031678199768066, + "learning_rate": 3.7294020401514364e-06, + "loss": 1.3774, + "mean_token_accuracy": 0.6614676515261332, + "num_tokens": 2460032544.0, + "step": 14664 + }, + { + "entropy": 1.7381873826185863, + "epoch": 1.6110241410562742, + "grad_norm": 0.733604907989502, + "learning_rate": 3.72845845618644e-06, + "loss": 1.3342, + "mean_token_accuracy": 0.6601580232381821, + "num_tokens": 2460176158.0, + "step": 14665 + }, + { + "entropy": 1.741199215253194, + "epoch": 1.6111339979676471, + "grad_norm": 0.7065275311470032, + "learning_rate": 3.727515102362457e-06, + "loss": 1.4099, + "mean_token_accuracy": 0.6452137182156245, + "num_tokens": 2460289880.0, + "step": 14666 + }, + { + "entropy": 1.6943379541238148, + "epoch": 1.61124385487902, + "grad_norm": 0.6393603682518005, + "learning_rate": 3.7265719787093425e-06, + "loss": 1.3706, + "mean_token_accuracy": 0.6622431923945745, + "num_tokens": 2460455949.0, + "step": 14667 + }, + { + "entropy": 1.7258077561855316, + "epoch": 1.611353711790393, + "grad_norm": 0.7554365396499634, + "learning_rate": 3.7256290852569486e-06, + "loss": 1.3566, + "mean_token_accuracy": 0.6608146925767263, + "num_tokens": 2460609656.0, + "step": 14668 + }, + { + "entropy": 1.689467837413152, + "epoch": 1.611463568701766, + "grad_norm": 0.6613262295722961, + "learning_rate": 3.724686422035115e-06, + "loss": 1.5024, + "mean_token_accuracy": 0.6482188751300176, + "num_tokens": 2460844330.0, + "step": 14669 + }, + { + "entropy": 1.6908029715220134, + "epoch": 1.6115734256131389, + "grad_norm": 0.6851189136505127, + "learning_rate": 3.7237439890736794e-06, + "loss": 1.3898, + "mean_token_accuracy": 0.6735121210416158, + "num_tokens": 2461005246.0, + "step": 14670 + }, + { + "entropy": 1.6708916127681732, + "epoch": 1.611683282524512, + "grad_norm": 0.5962818264961243, + "learning_rate": 3.7228017864024678e-06, + "loss": 1.3767, + "mean_token_accuracy": 0.6496349523464838, + "num_tokens": 2461164991.0, + "step": 14671 + }, + { + "entropy": 1.7484399875005086, + "epoch": 1.6117931394358846, + "grad_norm": 0.670310378074646, + "learning_rate": 3.7218598140512984e-06, + "loss": 1.3154, + "mean_token_accuracy": 0.6742733071247736, + "num_tokens": 2461315273.0, + "step": 14672 + }, + { + "entropy": 1.7393219470977783, + "epoch": 1.6119029963472578, + "grad_norm": 0.6643054485321045, + "learning_rate": 3.7209180720499895e-06, + "loss": 1.3544, + "mean_token_accuracy": 0.6552670349677404, + "num_tokens": 2461488510.0, + "step": 14673 + }, + { + "entropy": 1.8004189630349476, + "epoch": 1.6120128532586306, + "grad_norm": 0.79213947057724, + "learning_rate": 3.719976560428342e-06, + "loss": 1.3652, + "mean_token_accuracy": 0.6540059546629587, + "num_tokens": 2461587275.0, + "step": 14674 + }, + { + "entropy": 1.7245031495889027, + "epoch": 1.6121227101700035, + "grad_norm": 0.6856813430786133, + "learning_rate": 3.7190352792161544e-06, + "loss": 1.454, + "mean_token_accuracy": 0.6419583807388941, + "num_tokens": 2461884625.0, + "step": 14675 + }, + { + "entropy": 1.7429804404576619, + "epoch": 1.6122325670813766, + "grad_norm": 0.9906445145606995, + "learning_rate": 3.7180942284432187e-06, + "loss": 1.4102, + "mean_token_accuracy": 0.6640532414118449, + "num_tokens": 2462069882.0, + "step": 14676 + }, + { + "entropy": 1.7205411791801453, + "epoch": 1.6123424239927493, + "grad_norm": 0.7368789911270142, + "learning_rate": 3.7171534081393222e-06, + "loss": 1.2647, + "mean_token_accuracy": 0.6686208844184875, + "num_tokens": 2462226030.0, + "step": 14677 + }, + { + "entropy": 1.6727672219276428, + "epoch": 1.6124522809041224, + "grad_norm": 0.6884908676147461, + "learning_rate": 3.716212818334238e-06, + "loss": 1.5422, + "mean_token_accuracy": 0.66104227801164, + "num_tokens": 2462413094.0, + "step": 14678 + }, + { + "entropy": 1.6247599720954895, + "epoch": 1.6125621378154953, + "grad_norm": 0.590446949005127, + "learning_rate": 3.715272459057735e-06, + "loss": 1.4282, + "mean_token_accuracy": 0.644096295038859, + "num_tokens": 2462641811.0, + "step": 14679 + }, + { + "entropy": 1.6443546215693157, + "epoch": 1.6126719947268682, + "grad_norm": 0.7115086317062378, + "learning_rate": 3.714332330339577e-06, + "loss": 1.5669, + "mean_token_accuracy": 0.6451859523852667, + "num_tokens": 2462806338.0, + "step": 14680 + }, + { + "entropy": 1.7336850663026173, + "epoch": 1.6127818516382413, + "grad_norm": 0.7021939158439636, + "learning_rate": 3.7133924322095174e-06, + "loss": 1.4519, + "mean_token_accuracy": 0.6492075125376383, + "num_tokens": 2462930880.0, + "step": 14681 + }, + { + "entropy": 1.6663711071014404, + "epoch": 1.6128917085496142, + "grad_norm": 0.7189558148384094, + "learning_rate": 3.712452764697306e-06, + "loss": 1.2616, + "mean_token_accuracy": 0.667123039563497, + "num_tokens": 2463057469.0, + "step": 14682 + }, + { + "entropy": 1.7106225689252217, + "epoch": 1.613001565460987, + "grad_norm": 0.6529760956764221, + "learning_rate": 3.7115133278326776e-06, + "loss": 1.4855, + "mean_token_accuracy": 0.6439164827267329, + "num_tokens": 2463193771.0, + "step": 14683 + }, + { + "entropy": 1.7312338948249817, + "epoch": 1.6131114223723602, + "grad_norm": 0.6860626339912415, + "learning_rate": 3.7105741216453677e-06, + "loss": 1.3038, + "mean_token_accuracy": 0.6747928162415823, + "num_tokens": 2463375144.0, + "step": 14684 + }, + { + "entropy": 1.6784348785877228, + "epoch": 1.6132212792837328, + "grad_norm": 0.636982798576355, + "learning_rate": 3.7096351461651048e-06, + "loss": 1.3763, + "mean_token_accuracy": 0.647995188832283, + "num_tokens": 2463568102.0, + "step": 14685 + }, + { + "entropy": 1.6646142303943634, + "epoch": 1.613331136195106, + "grad_norm": 0.7830153703689575, + "learning_rate": 3.7086964014216044e-06, + "loss": 1.2812, + "mean_token_accuracy": 0.6732824593782425, + "num_tokens": 2463698353.0, + "step": 14686 + }, + { + "entropy": 1.727742314338684, + "epoch": 1.6134409931064788, + "grad_norm": 0.6397407650947571, + "learning_rate": 3.7077578874445747e-06, + "loss": 1.6028, + "mean_token_accuracy": 0.6402206718921661, + "num_tokens": 2463895915.0, + "step": 14687 + }, + { + "entropy": 1.708655208349228, + "epoch": 1.6135508500178517, + "grad_norm": 0.7308383584022522, + "learning_rate": 3.7068196042637243e-06, + "loss": 1.3993, + "mean_token_accuracy": 0.6531407485405604, + "num_tokens": 2464095797.0, + "step": 14688 + }, + { + "entropy": 1.704372376203537, + "epoch": 1.6136607069292248, + "grad_norm": 0.6979178786277771, + "learning_rate": 3.7058815519087444e-06, + "loss": 1.2332, + "mean_token_accuracy": 0.6791991045077642, + "num_tokens": 2464232587.0, + "step": 14689 + }, + { + "entropy": 1.6719367702802022, + "epoch": 1.6137705638405975, + "grad_norm": 0.8596528172492981, + "learning_rate": 3.7049437304093294e-06, + "loss": 1.3867, + "mean_token_accuracy": 0.6568711996078491, + "num_tokens": 2464385186.0, + "step": 14690 + }, + { + "entropy": 1.6047246555487316, + "epoch": 1.6138804207519706, + "grad_norm": 0.6490098834037781, + "learning_rate": 3.7040061397951576e-06, + "loss": 1.3677, + "mean_token_accuracy": 0.660823663075765, + "num_tokens": 2464553229.0, + "step": 14691 + }, + { + "entropy": 1.7173048158486683, + "epoch": 1.6139902776633435, + "grad_norm": 0.7776662111282349, + "learning_rate": 3.703068780095902e-06, + "loss": 1.2761, + "mean_token_accuracy": 0.6762077808380127, + "num_tokens": 2464701504.0, + "step": 14692 + }, + { + "entropy": 1.7215432325998943, + "epoch": 1.6141001345747163, + "grad_norm": 0.9005657434463501, + "learning_rate": 3.702131651341231e-06, + "loss": 1.3798, + "mean_token_accuracy": 0.6737534006436666, + "num_tokens": 2464866324.0, + "step": 14693 + }, + { + "entropy": 1.7409155865510304, + "epoch": 1.6142099914860895, + "grad_norm": 0.7071303129196167, + "learning_rate": 3.7011947535608105e-06, + "loss": 1.5843, + "mean_token_accuracy": 0.647487630446752, + "num_tokens": 2465076973.0, + "step": 14694 + }, + { + "entropy": 1.7262167433897655, + "epoch": 1.6143198483974623, + "grad_norm": 0.7587045431137085, + "learning_rate": 3.7002580867842815e-06, + "loss": 1.2918, + "mean_token_accuracy": 0.6742985248565674, + "num_tokens": 2465200868.0, + "step": 14695 + }, + { + "entropy": 1.6662676731745403, + "epoch": 1.6144297053088352, + "grad_norm": 0.6712886095046997, + "learning_rate": 3.6993216510412943e-06, + "loss": 1.375, + "mean_token_accuracy": 0.6569582025210062, + "num_tokens": 2465388988.0, + "step": 14696 + }, + { + "entropy": 1.730035165945689, + "epoch": 1.6145395622202083, + "grad_norm": 0.7809839844703674, + "learning_rate": 3.698385446361491e-06, + "loss": 1.2638, + "mean_token_accuracy": 0.6717608024676641, + "num_tokens": 2465527801.0, + "step": 14697 + }, + { + "entropy": 1.7651971677939098, + "epoch": 1.614649419131581, + "grad_norm": 0.6956320405006409, + "learning_rate": 3.6974494727744963e-06, + "loss": 1.2942, + "mean_token_accuracy": 0.6690488557020823, + "num_tokens": 2465636303.0, + "step": 14698 + }, + { + "entropy": 1.6958413124084473, + "epoch": 1.614759276042954, + "grad_norm": 0.6723313331604004, + "learning_rate": 3.6965137303099337e-06, + "loss": 1.4207, + "mean_token_accuracy": 0.6619421541690826, + "num_tokens": 2465851994.0, + "step": 14699 + }, + { + "entropy": 1.6834101875623066, + "epoch": 1.614869132954327, + "grad_norm": 0.7277176380157471, + "learning_rate": 3.695578218997423e-06, + "loss": 1.2127, + "mean_token_accuracy": 0.6854077279567719, + "num_tokens": 2465977857.0, + "step": 14700 + }, + { + "entropy": 1.709593951702118, + "epoch": 1.6149789898656999, + "grad_norm": 0.6318420171737671, + "learning_rate": 3.694642938866567e-06, + "loss": 1.4732, + "mean_token_accuracy": 0.6425252010424932, + "num_tokens": 2466148882.0, + "step": 14701 + }, + { + "entropy": 1.7026881178220112, + "epoch": 1.615088846777073, + "grad_norm": 0.5743200778961182, + "learning_rate": 3.6937078899469735e-06, + "loss": 1.4259, + "mean_token_accuracy": 0.6450713922580084, + "num_tokens": 2466367763.0, + "step": 14702 + }, + { + "entropy": 1.7224018573760986, + "epoch": 1.6151987036884456, + "grad_norm": 0.7370775938034058, + "learning_rate": 3.692773072268233e-06, + "loss": 1.5327, + "mean_token_accuracy": 0.6518655767043432, + "num_tokens": 2466513299.0, + "step": 14703 + }, + { + "entropy": 1.6792699694633484, + "epoch": 1.6153085605998188, + "grad_norm": 0.7573254108428955, + "learning_rate": 3.69183848585993e-06, + "loss": 1.3101, + "mean_token_accuracy": 0.6656599442164103, + "num_tokens": 2466674052.0, + "step": 14704 + }, + { + "entropy": 1.7601742148399353, + "epoch": 1.6154184175111916, + "grad_norm": 0.6558720469474792, + "learning_rate": 3.690904130751647e-06, + "loss": 1.3575, + "mean_token_accuracy": 0.6632993370294571, + "num_tokens": 2466835324.0, + "step": 14705 + }, + { + "entropy": 1.670517235994339, + "epoch": 1.6155282744225645, + "grad_norm": 0.7299153208732605, + "learning_rate": 3.689970006972955e-06, + "loss": 1.3617, + "mean_token_accuracy": 0.6664615025122961, + "num_tokens": 2466978382.0, + "step": 14706 + }, + { + "entropy": 1.7113747795422871, + "epoch": 1.6156381313339376, + "grad_norm": 0.6921692490577698, + "learning_rate": 3.689036114553416e-06, + "loss": 1.4798, + "mean_token_accuracy": 0.6365568687518438, + "num_tokens": 2467205232.0, + "step": 14707 + }, + { + "entropy": 1.6883227229118347, + "epoch": 1.6157479882453105, + "grad_norm": 0.648524820804596, + "learning_rate": 3.6881024535225895e-06, + "loss": 1.5209, + "mean_token_accuracy": 0.6537104596694311, + "num_tokens": 2467377154.0, + "step": 14708 + }, + { + "entropy": 1.6793282429377239, + "epoch": 1.6158578451566834, + "grad_norm": 0.6334054470062256, + "learning_rate": 3.687169023910029e-06, + "loss": 1.3909, + "mean_token_accuracy": 0.6522092173496882, + "num_tokens": 2467532555.0, + "step": 14709 + }, + { + "entropy": 1.6903114418188732, + "epoch": 1.6159677020680565, + "grad_norm": 0.7473300695419312, + "learning_rate": 3.6862358257452715e-06, + "loss": 1.2707, + "mean_token_accuracy": 0.6770381530125936, + "num_tokens": 2467642784.0, + "step": 14710 + }, + { + "entropy": 1.6912944614887238, + "epoch": 1.6160775589794292, + "grad_norm": 0.6412237286567688, + "learning_rate": 3.685302859057853e-06, + "loss": 1.4237, + "mean_token_accuracy": 0.648856391509374, + "num_tokens": 2467791714.0, + "step": 14711 + }, + { + "entropy": 1.7014042536417644, + "epoch": 1.6161874158908023, + "grad_norm": 0.9805091619491577, + "learning_rate": 3.6843701238773067e-06, + "loss": 1.3664, + "mean_token_accuracy": 0.6512833336989085, + "num_tokens": 2467936735.0, + "step": 14712 + }, + { + "entropy": 1.7085695664087932, + "epoch": 1.6162972728021752, + "grad_norm": 0.6498310565948486, + "learning_rate": 3.6834376202331457e-06, + "loss": 1.4279, + "mean_token_accuracy": 0.6503070195515951, + "num_tokens": 2468137097.0, + "step": 14713 + }, + { + "entropy": 1.754157284895579, + "epoch": 1.616407129713548, + "grad_norm": 0.6614096164703369, + "learning_rate": 3.68250534815489e-06, + "loss": 1.4499, + "mean_token_accuracy": 0.6387760390837988, + "num_tokens": 2468335831.0, + "step": 14714 + }, + { + "entropy": 1.748912364244461, + "epoch": 1.6165169866249212, + "grad_norm": 0.6944563388824463, + "learning_rate": 3.6815733076720417e-06, + "loss": 1.4188, + "mean_token_accuracy": 0.6436112423737844, + "num_tokens": 2468487713.0, + "step": 14715 + }, + { + "entropy": 1.6973415712515514, + "epoch": 1.6166268435362938, + "grad_norm": 0.7646819949150085, + "learning_rate": 3.6806414988140994e-06, + "loss": 1.4375, + "mean_token_accuracy": 0.6578048566977183, + "num_tokens": 2468669987.0, + "step": 14716 + }, + { + "entropy": 1.732655018568039, + "epoch": 1.616736700447667, + "grad_norm": 0.8368391990661621, + "learning_rate": 3.6797099216105574e-06, + "loss": 1.343, + "mean_token_accuracy": 0.666421135266622, + "num_tokens": 2468838710.0, + "step": 14717 + }, + { + "entropy": 1.7877886792023976, + "epoch": 1.6168465573590398, + "grad_norm": 0.7718168497085571, + "learning_rate": 3.6787785760908977e-06, + "loss": 1.4756, + "mean_token_accuracy": 0.6524422268072764, + "num_tokens": 2468982036.0, + "step": 14718 + }, + { + "entropy": 1.6961112916469574, + "epoch": 1.6169564142704127, + "grad_norm": 0.6764015555381775, + "learning_rate": 3.6778474622845944e-06, + "loss": 1.3609, + "mean_token_accuracy": 0.6668888131777445, + "num_tokens": 2469168789.0, + "step": 14719 + }, + { + "entropy": 1.6913793583710988, + "epoch": 1.6170662711817858, + "grad_norm": 0.6833027005195618, + "learning_rate": 3.6769165802211204e-06, + "loss": 1.5313, + "mean_token_accuracy": 0.630229189991951, + "num_tokens": 2469398606.0, + "step": 14720 + }, + { + "entropy": 1.6282474398612976, + "epoch": 1.6171761280931587, + "grad_norm": 0.7693440318107605, + "learning_rate": 3.675985929929938e-06, + "loss": 1.4429, + "mean_token_accuracy": 0.6655650039513906, + "num_tokens": 2469592922.0, + "step": 14721 + }, + { + "entropy": 1.711096356312434, + "epoch": 1.6172859850045316, + "grad_norm": 0.8111943006515503, + "learning_rate": 3.6750555114405006e-06, + "loss": 1.5209, + "mean_token_accuracy": 0.6461358418067297, + "num_tokens": 2469796479.0, + "step": 14722 + }, + { + "entropy": 1.6384065548578899, + "epoch": 1.6173958419159047, + "grad_norm": 0.8203321099281311, + "learning_rate": 3.674125324782254e-06, + "loss": 1.4875, + "mean_token_accuracy": 0.6577804535627365, + "num_tokens": 2469953785.0, + "step": 14723 + }, + { + "entropy": 1.7690180937449138, + "epoch": 1.6175056988272773, + "grad_norm": 0.6374251842498779, + "learning_rate": 3.6731953699846414e-06, + "loss": 1.3627, + "mean_token_accuracy": 0.6556740949551264, + "num_tokens": 2470124089.0, + "step": 14724 + }, + { + "entropy": 1.7309903005758922, + "epoch": 1.6176155557386505, + "grad_norm": 0.7207738757133484, + "learning_rate": 3.6722656470770923e-06, + "loss": 1.4916, + "mean_token_accuracy": 0.6519534190495809, + "num_tokens": 2470255712.0, + "step": 14725 + }, + { + "entropy": 1.7002464632193248, + "epoch": 1.6177254126500233, + "grad_norm": 0.7734901905059814, + "learning_rate": 3.6713361560890348e-06, + "loss": 1.5482, + "mean_token_accuracy": 0.6576615820328394, + "num_tokens": 2470420871.0, + "step": 14726 + }, + { + "entropy": 1.6754888991514842, + "epoch": 1.6178352695613962, + "grad_norm": 0.8016461133956909, + "learning_rate": 3.6704068970498864e-06, + "loss": 1.2781, + "mean_token_accuracy": 0.6687599966923395, + "num_tokens": 2470573999.0, + "step": 14727 + }, + { + "entropy": 1.7265417277812958, + "epoch": 1.6179451264727693, + "grad_norm": 0.8027196526527405, + "learning_rate": 3.6694778699890544e-06, + "loss": 1.2972, + "mean_token_accuracy": 0.672625203927358, + "num_tokens": 2470696124.0, + "step": 14728 + }, + { + "entropy": 1.678039421637853, + "epoch": 1.618054983384142, + "grad_norm": 0.9656848311424255, + "learning_rate": 3.6685490749359465e-06, + "loss": 1.4742, + "mean_token_accuracy": 0.6456383168697357, + "num_tokens": 2470903763.0, + "step": 14729 + }, + { + "entropy": 1.6626673638820648, + "epoch": 1.618164840295515, + "grad_norm": 0.6540632247924805, + "learning_rate": 3.6676205119199576e-06, + "loss": 1.3202, + "mean_token_accuracy": 0.6622842649618784, + "num_tokens": 2471090045.0, + "step": 14730 + }, + { + "entropy": 1.6490332384904225, + "epoch": 1.618274697206888, + "grad_norm": 0.8260558843612671, + "learning_rate": 3.6666921809704736e-06, + "loss": 1.161, + "mean_token_accuracy": 0.6894190460443497, + "num_tokens": 2471227403.0, + "step": 14731 + }, + { + "entropy": 1.6390781899293263, + "epoch": 1.6183845541182609, + "grad_norm": 0.5702754259109497, + "learning_rate": 3.665764082116876e-06, + "loss": 1.4722, + "mean_token_accuracy": 0.649658222993215, + "num_tokens": 2471437545.0, + "step": 14732 + }, + { + "entropy": 1.629872699578603, + "epoch": 1.618494411029634, + "grad_norm": 0.6536400318145752, + "learning_rate": 3.6648362153885436e-06, + "loss": 1.3237, + "mean_token_accuracy": 0.659297987818718, + "num_tokens": 2471629535.0, + "step": 14733 + }, + { + "entropy": 1.7587083180745442, + "epoch": 1.6186042679410069, + "grad_norm": 0.6709061861038208, + "learning_rate": 3.6639085808148393e-06, + "loss": 1.3405, + "mean_token_accuracy": 0.6536008963982264, + "num_tokens": 2471737875.0, + "step": 14734 + }, + { + "entropy": 1.6815871397654216, + "epoch": 1.6187141248523798, + "grad_norm": 0.7016004920005798, + "learning_rate": 3.66298117842512e-06, + "loss": 1.3005, + "mean_token_accuracy": 0.6671187877655029, + "num_tokens": 2471871399.0, + "step": 14735 + }, + { + "entropy": 1.6025499800841014, + "epoch": 1.6188239817637529, + "grad_norm": 0.7438717484474182, + "learning_rate": 3.662054008248743e-06, + "loss": 1.3128, + "mean_token_accuracy": 0.6726290682951609, + "num_tokens": 2472023286.0, + "step": 14736 + }, + { + "entropy": 1.6263412833213806, + "epoch": 1.6189338386751255, + "grad_norm": 0.7102988362312317, + "learning_rate": 3.661127070315048e-06, + "loss": 1.4156, + "mean_token_accuracy": 0.6527203271786371, + "num_tokens": 2472222430.0, + "step": 14737 + }, + { + "entropy": 1.703275889158249, + "epoch": 1.6190436955864986, + "grad_norm": 0.714640736579895, + "learning_rate": 3.660200364653377e-06, + "loss": 1.3723, + "mean_token_accuracy": 0.6539589911699295, + "num_tokens": 2472389845.0, + "step": 14738 + }, + { + "entropy": 1.6962252755959828, + "epoch": 1.6191535524978715, + "grad_norm": 0.6212570071220398, + "learning_rate": 3.6592738912930557e-06, + "loss": 1.4415, + "mean_token_accuracy": 0.644294947385788, + "num_tokens": 2472558761.0, + "step": 14739 + }, + { + "entropy": 1.7015782197316487, + "epoch": 1.6192634094092444, + "grad_norm": 0.7756522297859192, + "learning_rate": 3.6583476502634074e-06, + "loss": 1.3007, + "mean_token_accuracy": 0.670628140370051, + "num_tokens": 2472732516.0, + "step": 14740 + }, + { + "entropy": 1.5966882010300953, + "epoch": 1.6193732663206175, + "grad_norm": 0.7755955457687378, + "learning_rate": 3.657421641593748e-06, + "loss": 1.2803, + "mean_token_accuracy": 0.6795324633518854, + "num_tokens": 2472883412.0, + "step": 14741 + }, + { + "entropy": 1.7600494424502056, + "epoch": 1.6194831232319902, + "grad_norm": 0.7740442752838135, + "learning_rate": 3.6564958653133863e-06, + "loss": 1.4893, + "mean_token_accuracy": 0.6595326215028763, + "num_tokens": 2473018952.0, + "step": 14742 + }, + { + "entropy": 1.7393341660499573, + "epoch": 1.6195929801433633, + "grad_norm": 0.6907954812049866, + "learning_rate": 3.6555703214516193e-06, + "loss": 1.5718, + "mean_token_accuracy": 0.6427063147226969, + "num_tokens": 2473202617.0, + "step": 14743 + }, + { + "entropy": 1.6578579048315685, + "epoch": 1.6197028370547362, + "grad_norm": 0.6247424483299255, + "learning_rate": 3.654645010037744e-06, + "loss": 1.5738, + "mean_token_accuracy": 0.6394970516363779, + "num_tokens": 2473402435.0, + "step": 14744 + }, + { + "entropy": 1.6981489062309265, + "epoch": 1.619812693966109, + "grad_norm": 0.5729104280471802, + "learning_rate": 3.653719931101042e-06, + "loss": 1.3324, + "mean_token_accuracy": 0.6592134733994802, + "num_tokens": 2473562428.0, + "step": 14745 + }, + { + "entropy": 1.647686739762624, + "epoch": 1.6199225508774822, + "grad_norm": 0.6854268312454224, + "learning_rate": 3.652795084670795e-06, + "loss": 1.3264, + "mean_token_accuracy": 0.6706267396608988, + "num_tokens": 2473690175.0, + "step": 14746 + }, + { + "entropy": 1.7167788644631703, + "epoch": 1.620032407788855, + "grad_norm": 0.5726441144943237, + "learning_rate": 3.6518704707762747e-06, + "loss": 1.4047, + "mean_token_accuracy": 0.6441677361726761, + "num_tokens": 2473897473.0, + "step": 14747 + }, + { + "entropy": 1.6790929238001506, + "epoch": 1.620142264700228, + "grad_norm": 0.7980174422264099, + "learning_rate": 3.65094608944674e-06, + "loss": 1.286, + "mean_token_accuracy": 0.6767543057600657, + "num_tokens": 2474027480.0, + "step": 14748 + }, + { + "entropy": 1.7248100241025288, + "epoch": 1.620252121611601, + "grad_norm": 0.664738655090332, + "learning_rate": 3.650021940711449e-06, + "loss": 1.447, + "mean_token_accuracy": 0.6460002660751343, + "num_tokens": 2474233223.0, + "step": 14749 + }, + { + "entropy": 1.679869105418523, + "epoch": 1.6203619785229737, + "grad_norm": 0.5752595663070679, + "learning_rate": 3.6490980245996578e-06, + "loss": 1.4, + "mean_token_accuracy": 0.6550338019927343, + "num_tokens": 2474391564.0, + "step": 14750 + }, + { + "entropy": 1.6999848584334056, + "epoch": 1.6204718354343468, + "grad_norm": 0.8822055459022522, + "learning_rate": 3.6481743411405957e-06, + "loss": 1.4801, + "mean_token_accuracy": 0.6662048846483231, + "num_tokens": 2474520827.0, + "step": 14751 + }, + { + "entropy": 1.6986857652664185, + "epoch": 1.6205816923457197, + "grad_norm": 0.5632603168487549, + "learning_rate": 3.6472508903635035e-06, + "loss": 1.4075, + "mean_token_accuracy": 0.6511333485444387, + "num_tokens": 2474718631.0, + "step": 14752 + }, + { + "entropy": 1.690843830506007, + "epoch": 1.6206915492570926, + "grad_norm": 0.6789196729660034, + "learning_rate": 3.6463276722976094e-06, + "loss": 1.421, + "mean_token_accuracy": 0.6547816569606463, + "num_tokens": 2474877824.0, + "step": 14753 + }, + { + "entropy": 1.7533318003018696, + "epoch": 1.6208014061684657, + "grad_norm": 0.6676966547966003, + "learning_rate": 3.6454046869721314e-06, + "loss": 1.311, + "mean_token_accuracy": 0.6615221301714579, + "num_tokens": 2475020061.0, + "step": 14754 + }, + { + "entropy": 1.7552814086278279, + "epoch": 1.6209112630798383, + "grad_norm": 0.6395809054374695, + "learning_rate": 3.6444819344162785e-06, + "loss": 1.3817, + "mean_token_accuracy": 0.6580479294061661, + "num_tokens": 2475208467.0, + "step": 14755 + }, + { + "entropy": 1.692303051551183, + "epoch": 1.6210211199912115, + "grad_norm": 0.6294587254524231, + "learning_rate": 3.6435594146592602e-06, + "loss": 1.427, + "mean_token_accuracy": 0.6477192491292953, + "num_tokens": 2475440566.0, + "step": 14756 + }, + { + "entropy": 1.799072911341985, + "epoch": 1.6211309769025843, + "grad_norm": 0.8360262513160706, + "learning_rate": 3.6426371277302696e-06, + "loss": 1.5254, + "mean_token_accuracy": 0.6458015888929367, + "num_tokens": 2475559755.0, + "step": 14757 + }, + { + "entropy": 1.6935710906982422, + "epoch": 1.6212408338139572, + "grad_norm": 0.6417449116706848, + "learning_rate": 3.6417150736585005e-06, + "loss": 1.2667, + "mean_token_accuracy": 0.6743132919073105, + "num_tokens": 2475671516.0, + "step": 14758 + }, + { + "entropy": 1.7454969485600789, + "epoch": 1.6213506907253303, + "grad_norm": 0.7443606853485107, + "learning_rate": 3.6407932524731327e-06, + "loss": 1.2905, + "mean_token_accuracy": 0.6683808912833532, + "num_tokens": 2475789097.0, + "step": 14759 + }, + { + "entropy": 1.683669090270996, + "epoch": 1.6214605476367032, + "grad_norm": 0.6186245083808899, + "learning_rate": 3.6398716642033415e-06, + "loss": 1.3217, + "mean_token_accuracy": 0.6645027448733648, + "num_tokens": 2475957658.0, + "step": 14760 + }, + { + "entropy": 1.7226401766141255, + "epoch": 1.621570404548076, + "grad_norm": 0.7316539883613586, + "learning_rate": 3.638950308878295e-06, + "loss": 1.3299, + "mean_token_accuracy": 0.6640412161747614, + "num_tokens": 2476076424.0, + "step": 14761 + }, + { + "entropy": 1.7300419211387634, + "epoch": 1.6216802614594492, + "grad_norm": 0.7443107962608337, + "learning_rate": 3.638029186527159e-06, + "loss": 1.4486, + "mean_token_accuracy": 0.6463419745365778, + "num_tokens": 2476257373.0, + "step": 14762 + }, + { + "entropy": 1.653321127096812, + "epoch": 1.6217901183708219, + "grad_norm": 0.6316555142402649, + "learning_rate": 3.6371082971790774e-06, + "loss": 1.5808, + "mean_token_accuracy": 0.6333072433869044, + "num_tokens": 2476519674.0, + "step": 14763 + }, + { + "entropy": 1.7144930958747864, + "epoch": 1.621899975282195, + "grad_norm": 0.6967435479164124, + "learning_rate": 3.636187640863199e-06, + "loss": 1.3244, + "mean_token_accuracy": 0.6650471885999044, + "num_tokens": 2476646778.0, + "step": 14764 + }, + { + "entropy": 1.6741214394569397, + "epoch": 1.6220098321935679, + "grad_norm": 0.7144061923027039, + "learning_rate": 3.635267217608668e-06, + "loss": 1.4367, + "mean_token_accuracy": 0.6536417255798975, + "num_tokens": 2476791495.0, + "step": 14765 + }, + { + "entropy": 1.7125622133413951, + "epoch": 1.6221196891049408, + "grad_norm": 0.8451400399208069, + "learning_rate": 3.634347027444609e-06, + "loss": 1.5601, + "mean_token_accuracy": 0.6516855011383692, + "num_tokens": 2476981517.0, + "step": 14766 + }, + { + "entropy": 1.679990828037262, + "epoch": 1.6222295460163139, + "grad_norm": 0.8648212552070618, + "learning_rate": 3.6334270704001464e-06, + "loss": 1.3945, + "mean_token_accuracy": 0.6758679350217184, + "num_tokens": 2477118327.0, + "step": 14767 + }, + { + "entropy": 1.6886627574761708, + "epoch": 1.6223394029276865, + "grad_norm": 0.5979213714599609, + "learning_rate": 3.6325073465043998e-06, + "loss": 1.451, + "mean_token_accuracy": 0.6546831776698431, + "num_tokens": 2477323299.0, + "step": 14768 + }, + { + "entropy": 1.7278961042563121, + "epoch": 1.6224492598390596, + "grad_norm": 0.7632152438163757, + "learning_rate": 3.6315878557864732e-06, + "loss": 1.3506, + "mean_token_accuracy": 0.6647726694742838, + "num_tokens": 2477455404.0, + "step": 14769 + }, + { + "entropy": 1.661962906519572, + "epoch": 1.6225591167504325, + "grad_norm": 0.5683802962303162, + "learning_rate": 3.6306685982754725e-06, + "loss": 1.4222, + "mean_token_accuracy": 0.6534817218780518, + "num_tokens": 2477637665.0, + "step": 14770 + }, + { + "entropy": 1.7604417105515797, + "epoch": 1.6226689736618054, + "grad_norm": 0.6922222971916199, + "learning_rate": 3.629749574000491e-06, + "loss": 1.3952, + "mean_token_accuracy": 0.6722667813301086, + "num_tokens": 2477792042.0, + "step": 14771 + }, + { + "entropy": 1.7918286224206288, + "epoch": 1.6227788305731785, + "grad_norm": 0.7370676398277283, + "learning_rate": 3.628830782990611e-06, + "loss": 1.4686, + "mean_token_accuracy": 0.6477069209019343, + "num_tokens": 2477953240.0, + "step": 14772 + }, + { + "entropy": 1.6899705628554027, + "epoch": 1.6228886874845514, + "grad_norm": 0.7105181813240051, + "learning_rate": 3.627912225274916e-06, + "loss": 1.4461, + "mean_token_accuracy": 0.656821588675181, + "num_tokens": 2478178028.0, + "step": 14773 + }, + { + "entropy": 1.6704809963703156, + "epoch": 1.6229985443959243, + "grad_norm": 0.6295377016067505, + "learning_rate": 3.6269939008824818e-06, + "loss": 1.3896, + "mean_token_accuracy": 0.6608775307734808, + "num_tokens": 2478369209.0, + "step": 14774 + }, + { + "entropy": 1.7202841540177662, + "epoch": 1.6231084013072974, + "grad_norm": 0.6942985653877258, + "learning_rate": 3.6260758098423634e-06, + "loss": 1.3808, + "mean_token_accuracy": 0.664001539349556, + "num_tokens": 2478504947.0, + "step": 14775 + }, + { + "entropy": 1.678806871175766, + "epoch": 1.62321825821867, + "grad_norm": 0.6336562037467957, + "learning_rate": 3.6251579521836223e-06, + "loss": 1.3756, + "mean_token_accuracy": 0.6675303081671397, + "num_tokens": 2478694950.0, + "step": 14776 + }, + { + "entropy": 1.764718770980835, + "epoch": 1.6233281151300432, + "grad_norm": 0.6643718481063843, + "learning_rate": 3.624240327935312e-06, + "loss": 1.5157, + "mean_token_accuracy": 0.6346383889516195, + "num_tokens": 2478927778.0, + "step": 14777 + }, + { + "entropy": 1.6955519517262776, + "epoch": 1.623437972041416, + "grad_norm": 0.678653359413147, + "learning_rate": 3.6233229371264715e-06, + "loss": 1.3345, + "mean_token_accuracy": 0.6630487193663915, + "num_tokens": 2479106950.0, + "step": 14778 + }, + { + "entropy": 1.7352122167746227, + "epoch": 1.623547828952789, + "grad_norm": 0.7108410000801086, + "learning_rate": 3.6224057797861335e-06, + "loss": 1.4498, + "mean_token_accuracy": 0.6533684730529785, + "num_tokens": 2479280565.0, + "step": 14779 + }, + { + "entropy": 1.765261471271515, + "epoch": 1.623657685864162, + "grad_norm": 0.8185455799102783, + "learning_rate": 3.6214888559433303e-06, + "loss": 1.6564, + "mean_token_accuracy": 0.6357477903366089, + "num_tokens": 2479455073.0, + "step": 14780 + }, + { + "entropy": 1.7805627187093098, + "epoch": 1.6237675427755347, + "grad_norm": 0.7521522641181946, + "learning_rate": 3.6205721656270787e-06, + "loss": 1.3739, + "mean_token_accuracy": 0.6541548172632853, + "num_tokens": 2479612490.0, + "step": 14781 + }, + { + "entropy": 1.7144214709599812, + "epoch": 1.6238773996869078, + "grad_norm": 0.7456989288330078, + "learning_rate": 3.6196557088663933e-06, + "loss": 1.5344, + "mean_token_accuracy": 0.6387195686499277, + "num_tokens": 2479774711.0, + "step": 14782 + }, + { + "entropy": 1.6867867310841878, + "epoch": 1.6239872565982807, + "grad_norm": 0.6226841807365417, + "learning_rate": 3.6187394856902808e-06, + "loss": 1.3739, + "mean_token_accuracy": 0.6644567201534907, + "num_tokens": 2479941427.0, + "step": 14783 + }, + { + "entropy": 1.721395303805669, + "epoch": 1.6240971135096536, + "grad_norm": 0.6708065271377563, + "learning_rate": 3.617823496127734e-06, + "loss": 1.3871, + "mean_token_accuracy": 0.6624792019526163, + "num_tokens": 2480131568.0, + "step": 14784 + }, + { + "entropy": 1.7298158307870228, + "epoch": 1.6242069704210267, + "grad_norm": 0.6362797021865845, + "learning_rate": 3.6169077402077502e-06, + "loss": 1.5095, + "mean_token_accuracy": 0.6370205332835516, + "num_tokens": 2480402026.0, + "step": 14785 + }, + { + "entropy": 1.7130170961221058, + "epoch": 1.6243168273323996, + "grad_norm": 0.6537313461303711, + "learning_rate": 3.6159922179593087e-06, + "loss": 1.5106, + "mean_token_accuracy": 0.6379600862661997, + "num_tokens": 2480597966.0, + "step": 14786 + }, + { + "entropy": 1.7668890555699666, + "epoch": 1.6244266842437725, + "grad_norm": 0.7130351066589355, + "learning_rate": 3.615076929411384e-06, + "loss": 1.4481, + "mean_token_accuracy": 0.6494297136863073, + "num_tokens": 2480795687.0, + "step": 14787 + }, + { + "entropy": 1.7258010109265645, + "epoch": 1.6245365411551456, + "grad_norm": 0.747342050075531, + "learning_rate": 3.6141618745929472e-06, + "loss": 1.5332, + "mean_token_accuracy": 0.631104106704394, + "num_tokens": 2481012853.0, + "step": 14788 + }, + { + "entropy": 1.6782464186350505, + "epoch": 1.6246463980665182, + "grad_norm": 0.6709155440330505, + "learning_rate": 3.613247053532961e-06, + "loss": 1.2226, + "mean_token_accuracy": 0.690729891260465, + "num_tokens": 2481149441.0, + "step": 14789 + }, + { + "entropy": 1.695702721675237, + "epoch": 1.6247562549778913, + "grad_norm": 0.614595353603363, + "learning_rate": 3.6123324662603775e-06, + "loss": 1.4185, + "mean_token_accuracy": 0.6465272555748621, + "num_tokens": 2481355357.0, + "step": 14790 + }, + { + "entropy": 1.6653617123762767, + "epoch": 1.6248661118892642, + "grad_norm": 0.6412925124168396, + "learning_rate": 3.6114181128041404e-06, + "loss": 1.2842, + "mean_token_accuracy": 0.6678632348775864, + "num_tokens": 2481495478.0, + "step": 14791 + }, + { + "entropy": 1.7512604494889576, + "epoch": 1.624975968800637, + "grad_norm": 0.7046493887901306, + "learning_rate": 3.6105039931931917e-06, + "loss": 1.5298, + "mean_token_accuracy": 0.6425130367279053, + "num_tokens": 2481665561.0, + "step": 14792 + }, + { + "entropy": 1.7655756374200184, + "epoch": 1.6250858257120102, + "grad_norm": 0.6847085356712341, + "learning_rate": 3.6095901074564605e-06, + "loss": 1.5536, + "mean_token_accuracy": 0.6508872310320536, + "num_tokens": 2481826917.0, + "step": 14793 + }, + { + "entropy": 1.6053343216578166, + "epoch": 1.625195682623383, + "grad_norm": 0.7845448851585388, + "learning_rate": 3.608676455622874e-06, + "loss": 1.4418, + "mean_token_accuracy": 0.6726017246643702, + "num_tokens": 2481975809.0, + "step": 14794 + }, + { + "entropy": 1.7125616768995922, + "epoch": 1.625305539534756, + "grad_norm": 0.6936936378479004, + "learning_rate": 3.607763037721348e-06, + "loss": 1.5531, + "mean_token_accuracy": 0.6478741864363352, + "num_tokens": 2482194797.0, + "step": 14795 + }, + { + "entropy": 1.736180692911148, + "epoch": 1.6254153964461289, + "grad_norm": 0.6816456317901611, + "learning_rate": 3.6068498537807884e-06, + "loss": 1.339, + "mean_token_accuracy": 0.6603361467520396, + "num_tokens": 2482331620.0, + "step": 14796 + }, + { + "entropy": 1.7087311546007793, + "epoch": 1.6255252533575018, + "grad_norm": 0.8054825067520142, + "learning_rate": 3.6059369038301005e-06, + "loss": 1.4759, + "mean_token_accuracy": 0.6657463312149048, + "num_tokens": 2482466267.0, + "step": 14797 + }, + { + "entropy": 1.6767024497191112, + "epoch": 1.6256351102688749, + "grad_norm": 0.723479688167572, + "learning_rate": 3.605024187898178e-06, + "loss": 1.1712, + "mean_token_accuracy": 0.6891407519578934, + "num_tokens": 2482611295.0, + "step": 14798 + }, + { + "entropy": 1.6842226882775624, + "epoch": 1.6257449671802477, + "grad_norm": 0.5745367407798767, + "learning_rate": 3.604111706013906e-06, + "loss": 1.3578, + "mean_token_accuracy": 0.6609561542669932, + "num_tokens": 2482777747.0, + "step": 14799 + }, + { + "entropy": 1.719879557689031, + "epoch": 1.6258548240916206, + "grad_norm": 0.7957612872123718, + "learning_rate": 3.6031994582061657e-06, + "loss": 1.3992, + "mean_token_accuracy": 0.6573441376288732, + "num_tokens": 2482902723.0, + "step": 14800 + }, + { + "entropy": 1.7037660876909893, + "epoch": 1.6259646810029937, + "grad_norm": 0.5532712340354919, + "learning_rate": 3.6022874445038326e-06, + "loss": 1.4247, + "mean_token_accuracy": 0.6606289645036062, + "num_tokens": 2483116670.0, + "step": 14801 + }, + { + "entropy": 1.7359294990698497, + "epoch": 1.6260745379143664, + "grad_norm": 0.6897640228271484, + "learning_rate": 3.6013756649357675e-06, + "loss": 1.3636, + "mean_token_accuracy": 0.6593460738658905, + "num_tokens": 2483265825.0, + "step": 14802 + }, + { + "entropy": 1.7429955800374348, + "epoch": 1.6261843948257395, + "grad_norm": 0.7224751710891724, + "learning_rate": 3.6004641195308284e-06, + "loss": 1.4376, + "mean_token_accuracy": 0.6494200577338537, + "num_tokens": 2483418684.0, + "step": 14803 + }, + { + "entropy": 1.7316950261592865, + "epoch": 1.6262942517371124, + "grad_norm": 0.8159376382827759, + "learning_rate": 3.5995528083178632e-06, + "loss": 1.5238, + "mean_token_accuracy": 0.6503862986962, + "num_tokens": 2483577085.0, + "step": 14804 + }, + { + "entropy": 1.687993158896764, + "epoch": 1.6264041086484853, + "grad_norm": 0.7989717721939087, + "learning_rate": 3.5986417313257176e-06, + "loss": 1.3912, + "mean_token_accuracy": 0.659797266125679, + "num_tokens": 2483734490.0, + "step": 14805 + }, + { + "entropy": 1.6549718777338664, + "epoch": 1.6265139655598584, + "grad_norm": 0.579478919506073, + "learning_rate": 3.5977308885832297e-06, + "loss": 1.4292, + "mean_token_accuracy": 0.6463843236366907, + "num_tokens": 2483947255.0, + "step": 14806 + }, + { + "entropy": 1.6915274957815807, + "epoch": 1.6266238224712313, + "grad_norm": 0.7029345035552979, + "learning_rate": 3.596820280119221e-06, + "loss": 1.5627, + "mean_token_accuracy": 0.6520664145549139, + "num_tokens": 2484087439.0, + "step": 14807 + }, + { + "entropy": 1.6860439380009968, + "epoch": 1.6267336793826042, + "grad_norm": 0.6037241816520691, + "learning_rate": 3.5959099059625136e-06, + "loss": 1.3567, + "mean_token_accuracy": 0.6552727371454239, + "num_tokens": 2484248258.0, + "step": 14808 + }, + { + "entropy": 1.6769340336322784, + "epoch": 1.626843536293977, + "grad_norm": 0.6906334757804871, + "learning_rate": 3.594999766141922e-06, + "loss": 1.4036, + "mean_token_accuracy": 0.6545880983273188, + "num_tokens": 2484416040.0, + "step": 14809 + }, + { + "entropy": 1.7643942634264629, + "epoch": 1.62695339320535, + "grad_norm": 0.7934665083885193, + "learning_rate": 3.594089860686253e-06, + "loss": 1.4969, + "mean_token_accuracy": 0.6431198517481486, + "num_tokens": 2484557942.0, + "step": 14810 + }, + { + "entropy": 1.679926613966624, + "epoch": 1.627063250116723, + "grad_norm": 0.5863533020019531, + "learning_rate": 3.593180189624299e-06, + "loss": 1.4085, + "mean_token_accuracy": 0.6471660186847051, + "num_tokens": 2484819829.0, + "step": 14811 + }, + { + "entropy": 1.7257270713647206, + "epoch": 1.627173107028096, + "grad_norm": 0.6526218056678772, + "learning_rate": 3.5922707529848576e-06, + "loss": 1.4658, + "mean_token_accuracy": 0.6473792394002279, + "num_tokens": 2485029927.0, + "step": 14812 + }, + { + "entropy": 1.7281257609526317, + "epoch": 1.6272829639394688, + "grad_norm": 0.7226946353912354, + "learning_rate": 3.5913615507967057e-06, + "loss": 1.2854, + "mean_token_accuracy": 0.6790865163008372, + "num_tokens": 2485168825.0, + "step": 14813 + }, + { + "entropy": 1.6933965583642323, + "epoch": 1.627392820850842, + "grad_norm": 0.6387641429901123, + "learning_rate": 3.590452583088626e-06, + "loss": 1.4405, + "mean_token_accuracy": 0.6517203003168106, + "num_tokens": 2485352424.0, + "step": 14814 + }, + { + "entropy": 1.745787501335144, + "epoch": 1.6275026777622146, + "grad_norm": 0.655708909034729, + "learning_rate": 3.5895438498893827e-06, + "loss": 1.4132, + "mean_token_accuracy": 0.643621101975441, + "num_tokens": 2485521153.0, + "step": 14815 + }, + { + "entropy": 1.7322950462500255, + "epoch": 1.6276125346735877, + "grad_norm": 0.7635506391525269, + "learning_rate": 3.588635351227735e-06, + "loss": 1.4825, + "mean_token_accuracy": 0.6420500675837199, + "num_tokens": 2485727288.0, + "step": 14816 + }, + { + "entropy": 1.663044144709905, + "epoch": 1.6277223915849606, + "grad_norm": 0.6754635572433472, + "learning_rate": 3.5877270871324383e-06, + "loss": 1.5849, + "mean_token_accuracy": 0.6396665796637535, + "num_tokens": 2485909269.0, + "step": 14817 + }, + { + "entropy": 1.661100705464681, + "epoch": 1.6278322484963335, + "grad_norm": 0.5776710510253906, + "learning_rate": 3.586819057632245e-06, + "loss": 1.3384, + "mean_token_accuracy": 0.66503178079923, + "num_tokens": 2486085108.0, + "step": 14818 + }, + { + "entropy": 1.7141460180282593, + "epoch": 1.6279421054077066, + "grad_norm": 0.6838655471801758, + "learning_rate": 3.5859112627558823e-06, + "loss": 1.3575, + "mean_token_accuracy": 0.662881389260292, + "num_tokens": 2486228224.0, + "step": 14819 + }, + { + "entropy": 1.698822170495987, + "epoch": 1.6280519623190794, + "grad_norm": 0.8585372567176819, + "learning_rate": 3.585003702532087e-06, + "loss": 1.3737, + "mean_token_accuracy": 0.6647797971963882, + "num_tokens": 2486380485.0, + "step": 14820 + }, + { + "entropy": 1.7368709444999695, + "epoch": 1.6281618192304523, + "grad_norm": 0.7684159874916077, + "learning_rate": 3.5840963769895866e-06, + "loss": 1.2543, + "mean_token_accuracy": 0.6787616461515427, + "num_tokens": 2486507958.0, + "step": 14821 + }, + { + "entropy": 1.6780678729216258, + "epoch": 1.6282716761418252, + "grad_norm": 0.6047248840332031, + "learning_rate": 3.583189286157094e-06, + "loss": 1.3225, + "mean_token_accuracy": 0.6666500320037206, + "num_tokens": 2486689124.0, + "step": 14822 + }, + { + "entropy": 1.723960777123769, + "epoch": 1.628381533053198, + "grad_norm": 0.6120347380638123, + "learning_rate": 3.5822824300633153e-06, + "loss": 1.3838, + "mean_token_accuracy": 0.6574974805116653, + "num_tokens": 2486823706.0, + "step": 14823 + }, + { + "entropy": 1.731412132581075, + "epoch": 1.6284913899645712, + "grad_norm": 0.5602653622627258, + "learning_rate": 3.5813758087369577e-06, + "loss": 1.3932, + "mean_token_accuracy": 0.645255446434021, + "num_tokens": 2487003576.0, + "step": 14824 + }, + { + "entropy": 1.738511284192403, + "epoch": 1.628601246875944, + "grad_norm": 0.7291231751441956, + "learning_rate": 3.5804694222067117e-06, + "loss": 1.3607, + "mean_token_accuracy": 0.6644929597775141, + "num_tokens": 2487179613.0, + "step": 14825 + }, + { + "entropy": 1.7071664134661357, + "epoch": 1.628711103787317, + "grad_norm": 0.596044659614563, + "learning_rate": 3.579563270501266e-06, + "loss": 1.4555, + "mean_token_accuracy": 0.6338590929905573, + "num_tokens": 2487378214.0, + "step": 14826 + }, + { + "entropy": 1.7025366922219594, + "epoch": 1.62882096069869, + "grad_norm": 0.862602710723877, + "learning_rate": 3.5786573536493002e-06, + "loss": 1.4171, + "mean_token_accuracy": 0.6637825717528661, + "num_tokens": 2487541639.0, + "step": 14827 + }, + { + "entropy": 1.7075651188691456, + "epoch": 1.6289308176100628, + "grad_norm": 0.731027364730835, + "learning_rate": 3.5777516716794814e-06, + "loss": 1.4465, + "mean_token_accuracy": 0.6556923538446426, + "num_tokens": 2487713071.0, + "step": 14828 + }, + { + "entropy": 1.6427714824676514, + "epoch": 1.6290406745214359, + "grad_norm": 0.5780958533287048, + "learning_rate": 3.5768462246204793e-06, + "loss": 1.3354, + "mean_token_accuracy": 0.6602772623300552, + "num_tokens": 2487904465.0, + "step": 14829 + }, + { + "entropy": 1.7856932580471039, + "epoch": 1.6291505314328087, + "grad_norm": 0.6738545894622803, + "learning_rate": 3.575941012500952e-06, + "loss": 1.4451, + "mean_token_accuracy": 0.6467948655287424, + "num_tokens": 2488101377.0, + "step": 14830 + }, + { + "entropy": 1.6973777413368225, + "epoch": 1.6292603883441816, + "grad_norm": 0.646508514881134, + "learning_rate": 3.575036035349543e-06, + "loss": 1.4545, + "mean_token_accuracy": 0.6591275582710902, + "num_tokens": 2488282493.0, + "step": 14831 + }, + { + "entropy": 1.6603109538555145, + "epoch": 1.6293702452555547, + "grad_norm": 0.6032806038856506, + "learning_rate": 3.5741312931948973e-06, + "loss": 1.4004, + "mean_token_accuracy": 0.6521612008412679, + "num_tokens": 2488501507.0, + "step": 14832 + }, + { + "entropy": 1.6811530391375225, + "epoch": 1.6294801021669276, + "grad_norm": 0.5795667767524719, + "learning_rate": 3.573226786065652e-06, + "loss": 1.426, + "mean_token_accuracy": 0.6499424229065577, + "num_tokens": 2488732884.0, + "step": 14833 + }, + { + "entropy": 1.758106917142868, + "epoch": 1.6295899590783005, + "grad_norm": 0.6564192771911621, + "learning_rate": 3.5723225139904326e-06, + "loss": 1.5641, + "mean_token_accuracy": 0.6320732136567434, + "num_tokens": 2488941510.0, + "step": 14834 + }, + { + "entropy": 1.6827127536137898, + "epoch": 1.6296998159896734, + "grad_norm": 0.8532228469848633, + "learning_rate": 3.5714184769978564e-06, + "loss": 1.4796, + "mean_token_accuracy": 0.653532346089681, + "num_tokens": 2489149280.0, + "step": 14835 + }, + { + "entropy": 1.6751657327016194, + "epoch": 1.6298096729010463, + "grad_norm": 0.6216030120849609, + "learning_rate": 3.570514675116541e-06, + "loss": 1.5362, + "mean_token_accuracy": 0.6317654103040695, + "num_tokens": 2489351067.0, + "step": 14836 + }, + { + "entropy": 1.7361950079600017, + "epoch": 1.6299195298124194, + "grad_norm": 0.670462965965271, + "learning_rate": 3.569611108375085e-06, + "loss": 1.3815, + "mean_token_accuracy": 0.6552157799402872, + "num_tokens": 2489508353.0, + "step": 14837 + }, + { + "entropy": 1.6183799505233765, + "epoch": 1.6300293867237923, + "grad_norm": 0.7788172960281372, + "learning_rate": 3.568707776802093e-06, + "loss": 1.2651, + "mean_token_accuracy": 0.6860854128996531, + "num_tokens": 2489659360.0, + "step": 14838 + }, + { + "entropy": 1.6797572473684947, + "epoch": 1.6301392436351652, + "grad_norm": 0.6599597930908203, + "learning_rate": 3.567804680426149e-06, + "loss": 1.6206, + "mean_token_accuracy": 0.6344324350357056, + "num_tokens": 2489862270.0, + "step": 14839 + }, + { + "entropy": 1.7235714693864186, + "epoch": 1.6302491005465383, + "grad_norm": 0.6941717267036438, + "learning_rate": 3.5669018192758376e-06, + "loss": 1.4494, + "mean_token_accuracy": 0.644492988785108, + "num_tokens": 2490076035.0, + "step": 14840 + }, + { + "entropy": 1.7475056151549022, + "epoch": 1.630358957457911, + "grad_norm": 0.8079373836517334, + "learning_rate": 3.5659991933797335e-06, + "loss": 1.2693, + "mean_token_accuracy": 0.6746415694554647, + "num_tokens": 2490202356.0, + "step": 14841 + }, + { + "entropy": 1.7727145949999492, + "epoch": 1.630468814369284, + "grad_norm": 0.8447114825248718, + "learning_rate": 3.565096802766409e-06, + "loss": 1.4037, + "mean_token_accuracy": 0.6543415536483129, + "num_tokens": 2490342399.0, + "step": 14842 + }, + { + "entropy": 1.6871120929718018, + "epoch": 1.630578671280657, + "grad_norm": 0.7047051191329956, + "learning_rate": 3.564194647464416e-06, + "loss": 1.335, + "mean_token_accuracy": 0.6622759302457174, + "num_tokens": 2490474991.0, + "step": 14843 + }, + { + "entropy": 1.7004227538903554, + "epoch": 1.6306885281920298, + "grad_norm": 0.7728781700134277, + "learning_rate": 3.563292727502312e-06, + "loss": 1.2688, + "mean_token_accuracy": 0.679591124256452, + "num_tokens": 2490595258.0, + "step": 14844 + }, + { + "entropy": 1.7138066987196605, + "epoch": 1.630798385103403, + "grad_norm": 0.6996949315071106, + "learning_rate": 3.562391042908645e-06, + "loss": 1.3455, + "mean_token_accuracy": 0.6582550307114919, + "num_tokens": 2490731251.0, + "step": 14845 + }, + { + "entropy": 1.667192538579305, + "epoch": 1.6309082420147758, + "grad_norm": 0.5813220143318176, + "learning_rate": 3.5614895937119485e-06, + "loss": 1.4651, + "mean_token_accuracy": 0.6454577694336573, + "num_tokens": 2490944539.0, + "step": 14846 + }, + { + "entropy": 1.624341497818629, + "epoch": 1.6310180989261487, + "grad_norm": 0.7384394407272339, + "learning_rate": 3.5605883799407535e-06, + "loss": 1.1494, + "mean_token_accuracy": 0.6913396020730337, + "num_tokens": 2491095006.0, + "step": 14847 + }, + { + "entropy": 1.6980106929938, + "epoch": 1.6311279558375218, + "grad_norm": 0.5377804040908813, + "learning_rate": 3.559687401623586e-06, + "loss": 1.3711, + "mean_token_accuracy": 0.6495694518089294, + "num_tokens": 2491351384.0, + "step": 14848 + }, + { + "entropy": 1.779253711303075, + "epoch": 1.6312378127488945, + "grad_norm": 0.7473645806312561, + "learning_rate": 3.5587866587889576e-06, + "loss": 1.3689, + "mean_token_accuracy": 0.6602377941211065, + "num_tokens": 2491513439.0, + "step": 14849 + }, + { + "entropy": 1.6981875896453857, + "epoch": 1.6313476696602676, + "grad_norm": 0.8267091512680054, + "learning_rate": 3.5578861514653808e-06, + "loss": 1.2869, + "mean_token_accuracy": 0.6729562679926554, + "num_tokens": 2491659420.0, + "step": 14850 + }, + { + "entropy": 1.738324244817098, + "epoch": 1.6314575265716404, + "grad_norm": 0.6099868416786194, + "learning_rate": 3.5569858796813526e-06, + "loss": 1.5184, + "mean_token_accuracy": 0.6377961039543152, + "num_tokens": 2491859689.0, + "step": 14851 + }, + { + "entropy": 1.6928378343582153, + "epoch": 1.6315673834830133, + "grad_norm": 0.8429316878318787, + "learning_rate": 3.556085843465367e-06, + "loss": 1.4247, + "mean_token_accuracy": 0.6417907128731409, + "num_tokens": 2492026241.0, + "step": 14852 + }, + { + "entropy": 1.7215720117092133, + "epoch": 1.6316772403943864, + "grad_norm": 0.5659279227256775, + "learning_rate": 3.5551860428459083e-06, + "loss": 1.3285, + "mean_token_accuracy": 0.664972111582756, + "num_tokens": 2492199044.0, + "step": 14853 + }, + { + "entropy": 1.67046320438385, + "epoch": 1.631787097305759, + "grad_norm": 0.669076681137085, + "learning_rate": 3.554286477851461e-06, + "loss": 1.3329, + "mean_token_accuracy": 0.6556582550207773, + "num_tokens": 2492357468.0, + "step": 14854 + }, + { + "entropy": 1.6809894442558289, + "epoch": 1.6318969542171322, + "grad_norm": 1.6321697235107422, + "learning_rate": 3.5533871485104887e-06, + "loss": 1.1329, + "mean_token_accuracy": 0.6873187224070231, + "num_tokens": 2492564644.0, + "step": 14855 + }, + { + "entropy": 1.7081284324328105, + "epoch": 1.632006811128505, + "grad_norm": 0.6694812774658203, + "learning_rate": 3.5524880548514574e-06, + "loss": 1.3172, + "mean_token_accuracy": 0.6649055629968643, + "num_tokens": 2492727367.0, + "step": 14856 + }, + { + "entropy": 1.7426902850468953, + "epoch": 1.632116668039878, + "grad_norm": 0.9611921310424805, + "learning_rate": 3.551589196902824e-06, + "loss": 1.5354, + "mean_token_accuracy": 0.6367527097463608, + "num_tokens": 2492905137.0, + "step": 14857 + }, + { + "entropy": 1.7176842490832012, + "epoch": 1.632226524951251, + "grad_norm": 0.7354887127876282, + "learning_rate": 3.5506905746930365e-06, + "loss": 1.3753, + "mean_token_accuracy": 0.6750811090071996, + "num_tokens": 2493042841.0, + "step": 14858 + }, + { + "entropy": 1.7373330096403758, + "epoch": 1.632336381862624, + "grad_norm": 0.6578879356384277, + "learning_rate": 3.5497921882505345e-06, + "loss": 1.4284, + "mean_token_accuracy": 0.6463020741939545, + "num_tokens": 2493188295.0, + "step": 14859 + }, + { + "entropy": 1.7101022799809773, + "epoch": 1.6324462387739969, + "grad_norm": 0.7568862438201904, + "learning_rate": 3.548894037603754e-06, + "loss": 1.3947, + "mean_token_accuracy": 0.6567882696787516, + "num_tokens": 2493300490.0, + "step": 14860 + }, + { + "entropy": 1.6923839151859283, + "epoch": 1.63255609568537, + "grad_norm": 0.7015122175216675, + "learning_rate": 3.5479961227811176e-06, + "loss": 1.2996, + "mean_token_accuracy": 0.6799081216255823, + "num_tokens": 2493421728.0, + "step": 14861 + }, + { + "entropy": 1.6776911318302155, + "epoch": 1.6326659525967426, + "grad_norm": 0.7158594727516174, + "learning_rate": 3.547098443811048e-06, + "loss": 1.4477, + "mean_token_accuracy": 0.6565545201301575, + "num_tokens": 2493557145.0, + "step": 14862 + }, + { + "entropy": 1.6888268689314525, + "epoch": 1.6327758095081157, + "grad_norm": 0.6423219442367554, + "learning_rate": 3.546201000721955e-06, + "loss": 1.3946, + "mean_token_accuracy": 0.6625121484200159, + "num_tokens": 2493720563.0, + "step": 14863 + }, + { + "entropy": 1.7296898762385051, + "epoch": 1.6328856664194886, + "grad_norm": 0.6093067526817322, + "learning_rate": 3.5453037935422386e-06, + "loss": 1.2773, + "mean_token_accuracy": 0.6675838033358256, + "num_tokens": 2493874867.0, + "step": 14864 + }, + { + "entropy": 1.720589945713679, + "epoch": 1.6329955233308615, + "grad_norm": 0.6853680610656738, + "learning_rate": 3.544406822300301e-06, + "loss": 1.4858, + "mean_token_accuracy": 0.6388275722662607, + "num_tokens": 2494059360.0, + "step": 14865 + }, + { + "entropy": 1.7451708912849426, + "epoch": 1.6331053802422346, + "grad_norm": 0.6423693895339966, + "learning_rate": 3.543510087024527e-06, + "loss": 1.3918, + "mean_token_accuracy": 0.6528707345326742, + "num_tokens": 2494217568.0, + "step": 14866 + }, + { + "entropy": 1.7308546602725983, + "epoch": 1.6332152371536073, + "grad_norm": 0.711121678352356, + "learning_rate": 3.5426135877432964e-06, + "loss": 1.3704, + "mean_token_accuracy": 0.6558242936929067, + "num_tokens": 2494360987.0, + "step": 14867 + }, + { + "entropy": 1.7218280136585236, + "epoch": 1.6333250940649804, + "grad_norm": 0.6833885908126831, + "learning_rate": 3.541717324484989e-06, + "loss": 1.3251, + "mean_token_accuracy": 0.6657578895489374, + "num_tokens": 2494512404.0, + "step": 14868 + }, + { + "entropy": 1.6832963228225708, + "epoch": 1.6334349509763533, + "grad_norm": 0.680909276008606, + "learning_rate": 3.5408212972779637e-06, + "loss": 1.5483, + "mean_token_accuracy": 0.6394904057184855, + "num_tokens": 2494693858.0, + "step": 14869 + }, + { + "entropy": 1.7123718361059825, + "epoch": 1.6335448078877262, + "grad_norm": 0.6566091179847717, + "learning_rate": 3.5399255061505865e-06, + "loss": 1.543, + "mean_token_accuracy": 0.6367166439692179, + "num_tokens": 2494928751.0, + "step": 14870 + }, + { + "entropy": 1.7312154173851013, + "epoch": 1.6336546647990993, + "grad_norm": 0.6933454871177673, + "learning_rate": 3.5390299511312052e-06, + "loss": 1.3882, + "mean_token_accuracy": 0.6590891778469086, + "num_tokens": 2495100100.0, + "step": 14871 + }, + { + "entropy": 1.6795762479305267, + "epoch": 1.6337645217104722, + "grad_norm": 0.6371995806694031, + "learning_rate": 3.5381346322481615e-06, + "loss": 1.4628, + "mean_token_accuracy": 0.6527168452739716, + "num_tokens": 2495293299.0, + "step": 14872 + }, + { + "entropy": 1.7581091423829396, + "epoch": 1.633874378621845, + "grad_norm": 0.7635604739189148, + "learning_rate": 3.537239549529794e-06, + "loss": 1.3626, + "mean_token_accuracy": 0.6564153929551443, + "num_tokens": 2495404583.0, + "step": 14873 + }, + { + "entropy": 1.6930834452311199, + "epoch": 1.6339842355332181, + "grad_norm": 0.668928861618042, + "learning_rate": 3.536344703004437e-06, + "loss": 1.4902, + "mean_token_accuracy": 0.6442474573850632, + "num_tokens": 2495642014.0, + "step": 14874 + }, + { + "entropy": 1.7224301397800446, + "epoch": 1.6340940924445908, + "grad_norm": 0.6915125846862793, + "learning_rate": 3.535450092700402e-06, + "loss": 1.5863, + "mean_token_accuracy": 0.6436110337575277, + "num_tokens": 2495828498.0, + "step": 14875 + }, + { + "entropy": 1.675351361433665, + "epoch": 1.634203949355964, + "grad_norm": 0.6921147704124451, + "learning_rate": 3.5345557186460084e-06, + "loss": 1.3715, + "mean_token_accuracy": 0.6487573534250259, + "num_tokens": 2495990677.0, + "step": 14876 + }, + { + "entropy": 1.6654540499051411, + "epoch": 1.6343138062673368, + "grad_norm": 0.6828057169914246, + "learning_rate": 3.533661580869564e-06, + "loss": 1.3833, + "mean_token_accuracy": 0.6538653870423635, + "num_tokens": 2496196017.0, + "step": 14877 + }, + { + "entropy": 1.654507319132487, + "epoch": 1.6344236631787097, + "grad_norm": 0.620732307434082, + "learning_rate": 3.532767679399366e-06, + "loss": 1.3135, + "mean_token_accuracy": 0.671380952000618, + "num_tokens": 2496363867.0, + "step": 14878 + }, + { + "entropy": 1.6607101559638977, + "epoch": 1.6345335200900828, + "grad_norm": 0.7271916270256042, + "learning_rate": 3.5318740142637055e-06, + "loss": 1.2748, + "mean_token_accuracy": 0.6694884747266769, + "num_tokens": 2496485499.0, + "step": 14879 + }, + { + "entropy": 1.680219570795695, + "epoch": 1.6346433770014555, + "grad_norm": 0.6058405041694641, + "learning_rate": 3.530980585490868e-06, + "loss": 1.2907, + "mean_token_accuracy": 0.6739451040824255, + "num_tokens": 2496627375.0, + "step": 14880 + }, + { + "entropy": 1.7118292550245922, + "epoch": 1.6347532339128286, + "grad_norm": 0.7250180244445801, + "learning_rate": 3.5300873931091273e-06, + "loss": 1.3839, + "mean_token_accuracy": 0.6615385562181473, + "num_tokens": 2496789160.0, + "step": 14881 + }, + { + "entropy": 1.6516647239526112, + "epoch": 1.6348630908242014, + "grad_norm": 0.6745825409889221, + "learning_rate": 3.529194437146758e-06, + "loss": 1.3025, + "mean_token_accuracy": 0.6585622032483419, + "num_tokens": 2496933961.0, + "step": 14882 + }, + { + "entropy": 1.7175203661123912, + "epoch": 1.6349729477355743, + "grad_norm": 0.7522205114364624, + "learning_rate": 3.5283017176320165e-06, + "loss": 1.4299, + "mean_token_accuracy": 0.6582658936580023, + "num_tokens": 2497087188.0, + "step": 14883 + }, + { + "entropy": 1.7287144362926483, + "epoch": 1.6350828046469474, + "grad_norm": 0.7147111892700195, + "learning_rate": 3.5274092345931566e-06, + "loss": 1.4834, + "mean_token_accuracy": 0.6414237320423126, + "num_tokens": 2497221997.0, + "step": 14884 + }, + { + "entropy": 1.7016437649726868, + "epoch": 1.6351926615583203, + "grad_norm": 0.6433010101318359, + "learning_rate": 3.526516988058429e-06, + "loss": 1.5277, + "mean_token_accuracy": 0.6294675916433334, + "num_tokens": 2497440321.0, + "step": 14885 + }, + { + "entropy": 1.667111227909724, + "epoch": 1.6353025184696932, + "grad_norm": 0.6358702778816223, + "learning_rate": 3.525624978056075e-06, + "loss": 1.3189, + "mean_token_accuracy": 0.6666462322076162, + "num_tokens": 2497632696.0, + "step": 14886 + }, + { + "entropy": 1.7168804009755452, + "epoch": 1.6354123753810663, + "grad_norm": 0.8079360723495483, + "learning_rate": 3.5247332046143162e-06, + "loss": 1.4001, + "mean_token_accuracy": 0.6580467720826467, + "num_tokens": 2497763143.0, + "step": 14887 + }, + { + "entropy": 1.7009617785612743, + "epoch": 1.635522232292439, + "grad_norm": 0.6826213598251343, + "learning_rate": 3.523841667761384e-06, + "loss": 1.4124, + "mean_token_accuracy": 0.6428949236869812, + "num_tokens": 2497924766.0, + "step": 14888 + }, + { + "entropy": 1.6994600693384807, + "epoch": 1.635632089203812, + "grad_norm": 0.7632419466972351, + "learning_rate": 3.522950367525497e-06, + "loss": 1.3133, + "mean_token_accuracy": 0.6738540679216385, + "num_tokens": 2498058132.0, + "step": 14889 + }, + { + "entropy": 1.7564916412035625, + "epoch": 1.635741946115185, + "grad_norm": 0.6684771776199341, + "learning_rate": 3.522059303934862e-06, + "loss": 1.4357, + "mean_token_accuracy": 0.6621495882670084, + "num_tokens": 2498215496.0, + "step": 14890 + }, + { + "entropy": 1.7205718557039897, + "epoch": 1.6358518030265579, + "grad_norm": 0.7321786880493164, + "learning_rate": 3.5211684770176777e-06, + "loss": 1.4398, + "mean_token_accuracy": 0.6541054844856262, + "num_tokens": 2498439505.0, + "step": 14891 + }, + { + "entropy": 1.67322771747907, + "epoch": 1.635961659937931, + "grad_norm": 0.7148532867431641, + "learning_rate": 3.5202778868021423e-06, + "loss": 1.3382, + "mean_token_accuracy": 0.667180672287941, + "num_tokens": 2498635826.0, + "step": 14892 + }, + { + "entropy": 1.7368038892745972, + "epoch": 1.6360715168493036, + "grad_norm": 0.780015230178833, + "learning_rate": 3.5193875333164398e-06, + "loss": 1.4777, + "mean_token_accuracy": 0.6418419082959493, + "num_tokens": 2498823186.0, + "step": 14893 + }, + { + "entropy": 1.6647245784600575, + "epoch": 1.6361813737606767, + "grad_norm": 0.6493139863014221, + "learning_rate": 3.518497416588753e-06, + "loss": 1.4324, + "mean_token_accuracy": 0.6691676676273346, + "num_tokens": 2499000974.0, + "step": 14894 + }, + { + "entropy": 1.7569353878498077, + "epoch": 1.6362912306720496, + "grad_norm": 0.6052994728088379, + "learning_rate": 3.517607536647253e-06, + "loss": 1.4793, + "mean_token_accuracy": 0.6535971015691757, + "num_tokens": 2499187968.0, + "step": 14895 + }, + { + "entropy": 1.6488630374272664, + "epoch": 1.6364010875834225, + "grad_norm": 0.6986522078514099, + "learning_rate": 3.5167178935200996e-06, + "loss": 1.389, + "mean_token_accuracy": 0.6525800079107285, + "num_tokens": 2499352714.0, + "step": 14896 + }, + { + "entropy": 1.7244251767794292, + "epoch": 1.6365109444947956, + "grad_norm": 0.720429003238678, + "learning_rate": 3.515828487235453e-06, + "loss": 1.5057, + "mean_token_accuracy": 0.661791185537974, + "num_tokens": 2499494824.0, + "step": 14897 + }, + { + "entropy": 1.6545683940251668, + "epoch": 1.6366208014061685, + "grad_norm": 0.7149010300636292, + "learning_rate": 3.5149393178214663e-06, + "loss": 1.3648, + "mean_token_accuracy": 0.6682617863019308, + "num_tokens": 2499651766.0, + "step": 14898 + }, + { + "entropy": 1.7152326206366222, + "epoch": 1.6367306583175414, + "grad_norm": 0.6399795413017273, + "learning_rate": 3.5140503853062734e-06, + "loss": 1.3521, + "mean_token_accuracy": 0.6598746081193289, + "num_tokens": 2499796383.0, + "step": 14899 + }, + { + "entropy": 1.7084354956944783, + "epoch": 1.6368405152289145, + "grad_norm": 0.693622887134552, + "learning_rate": 3.5131616897180132e-06, + "loss": 1.5177, + "mean_token_accuracy": 0.641907716790835, + "num_tokens": 2499967826.0, + "step": 14900 + }, + { + "entropy": 1.763914128144582, + "epoch": 1.6369503721402872, + "grad_norm": 0.7751642465591431, + "learning_rate": 3.5122732310848124e-06, + "loss": 1.3232, + "mean_token_accuracy": 0.658448706070582, + "num_tokens": 2500085030.0, + "step": 14901 + }, + { + "entropy": 1.6992384692033131, + "epoch": 1.6370602290516603, + "grad_norm": 0.6677964925765991, + "learning_rate": 3.5113850094347906e-06, + "loss": 1.4129, + "mean_token_accuracy": 0.6515720884005228, + "num_tokens": 2500323347.0, + "step": 14902 + }, + { + "entropy": 1.65836563706398, + "epoch": 1.6371700859630332, + "grad_norm": 0.6140561103820801, + "learning_rate": 3.5104970247960567e-06, + "loss": 1.3939, + "mean_token_accuracy": 0.6596829444169998, + "num_tokens": 2500533957.0, + "step": 14903 + }, + { + "entropy": 1.764195293188095, + "epoch": 1.637279942874406, + "grad_norm": 0.7463130354881287, + "learning_rate": 3.50960927719672e-06, + "loss": 1.5801, + "mean_token_accuracy": 0.636942724386851, + "num_tokens": 2500723855.0, + "step": 14904 + }, + { + "entropy": 1.6985965470472972, + "epoch": 1.6373897997857791, + "grad_norm": 0.6363186240196228, + "learning_rate": 3.508721766664872e-06, + "loss": 1.3983, + "mean_token_accuracy": 0.6462677617867788, + "num_tokens": 2500898305.0, + "step": 14905 + }, + { + "entropy": 1.7432740926742554, + "epoch": 1.6374996566971518, + "grad_norm": 0.6543776988983154, + "learning_rate": 3.5078344932286055e-06, + "loss": 1.3426, + "mean_token_accuracy": 0.6671454260746638, + "num_tokens": 2501057739.0, + "step": 14906 + }, + { + "entropy": 1.640707751115163, + "epoch": 1.637609513608525, + "grad_norm": 0.6563522219657898, + "learning_rate": 3.506947456916002e-06, + "loss": 1.477, + "mean_token_accuracy": 0.6595013240973154, + "num_tokens": 2501271194.0, + "step": 14907 + }, + { + "entropy": 1.7347827851772308, + "epoch": 1.6377193705198978, + "grad_norm": 0.5993261337280273, + "learning_rate": 3.5060606577551325e-06, + "loss": 1.4884, + "mean_token_accuracy": 0.6420771131912867, + "num_tokens": 2501480540.0, + "step": 14908 + }, + { + "entropy": 1.7362710138161976, + "epoch": 1.6378292274312707, + "grad_norm": 0.6462193727493286, + "learning_rate": 3.5051740957740666e-06, + "loss": 1.5208, + "mean_token_accuracy": 0.6494198342164358, + "num_tokens": 2501713196.0, + "step": 14909 + }, + { + "entropy": 1.7323419352372487, + "epoch": 1.6379390843426438, + "grad_norm": 0.6800756454467773, + "learning_rate": 3.504287771000868e-06, + "loss": 1.3642, + "mean_token_accuracy": 0.6693668713172277, + "num_tokens": 2501875419.0, + "step": 14910 + }, + { + "entropy": 1.7118912140528362, + "epoch": 1.6380489412540167, + "grad_norm": 0.6431507468223572, + "learning_rate": 3.5034016834635787e-06, + "loss": 1.3699, + "mean_token_accuracy": 0.6649849017461141, + "num_tokens": 2502026670.0, + "step": 14911 + }, + { + "entropy": 1.6773951450983684, + "epoch": 1.6381587981653896, + "grad_norm": 0.6200417876243591, + "learning_rate": 3.5025158331902488e-06, + "loss": 1.3204, + "mean_token_accuracy": 0.6617454985777537, + "num_tokens": 2502200495.0, + "step": 14912 + }, + { + "entropy": 1.6460547248522441, + "epoch": 1.6382686550767627, + "grad_norm": 0.7539404630661011, + "learning_rate": 3.501630220208916e-06, + "loss": 1.2843, + "mean_token_accuracy": 0.6683847606182098, + "num_tokens": 2502345107.0, + "step": 14913 + }, + { + "entropy": 1.7317273020744324, + "epoch": 1.6383785119881353, + "grad_norm": 0.7395991086959839, + "learning_rate": 3.500744844547608e-06, + "loss": 1.2899, + "mean_token_accuracy": 0.6693265736103058, + "num_tokens": 2502503992.0, + "step": 14914 + }, + { + "entropy": 1.6876648664474487, + "epoch": 1.6384883688995084, + "grad_norm": 0.778742790222168, + "learning_rate": 3.4998597062343443e-06, + "loss": 1.2905, + "mean_token_accuracy": 0.6624922255674998, + "num_tokens": 2502666223.0, + "step": 14915 + }, + { + "entropy": 1.6516147553920746, + "epoch": 1.6385982258108813, + "grad_norm": 0.722156822681427, + "learning_rate": 3.498974805297144e-06, + "loss": 1.3988, + "mean_token_accuracy": 0.657736748456955, + "num_tokens": 2502847645.0, + "step": 14916 + }, + { + "entropy": 1.7666858335336049, + "epoch": 1.6387080827222542, + "grad_norm": 0.773971438407898, + "learning_rate": 3.4980901417640078e-06, + "loss": 1.4984, + "mean_token_accuracy": 0.6684010674556097, + "num_tokens": 2502969638.0, + "step": 14917 + }, + { + "entropy": 1.7049271663029988, + "epoch": 1.6388179396336273, + "grad_norm": 0.7004063725471497, + "learning_rate": 3.4972057156629407e-06, + "loss": 1.5013, + "mean_token_accuracy": 0.6599620431661606, + "num_tokens": 2503096630.0, + "step": 14918 + }, + { + "entropy": 1.7267379264036815, + "epoch": 1.638927796545, + "grad_norm": 0.7648240327835083, + "learning_rate": 3.4963215270219332e-06, + "loss": 1.4041, + "mean_token_accuracy": 0.6580546349287033, + "num_tokens": 2503215160.0, + "step": 14919 + }, + { + "entropy": 1.7685744762420654, + "epoch": 1.639037653456373, + "grad_norm": 0.6759129762649536, + "learning_rate": 3.495437575868964e-06, + "loss": 1.2746, + "mean_token_accuracy": 0.6785066773494085, + "num_tokens": 2503327731.0, + "step": 14920 + }, + { + "entropy": 1.6827283104260762, + "epoch": 1.639147510367746, + "grad_norm": 0.7047680020332336, + "learning_rate": 3.4945538622320147e-06, + "loss": 1.3335, + "mean_token_accuracy": 0.6698191513617834, + "num_tokens": 2503496301.0, + "step": 14921 + }, + { + "entropy": 1.6511322756608326, + "epoch": 1.6392573672791189, + "grad_norm": 0.6691418290138245, + "learning_rate": 3.4936703861390587e-06, + "loss": 1.2096, + "mean_token_accuracy": 0.6906089385350546, + "num_tokens": 2503618169.0, + "step": 14922 + }, + { + "entropy": 1.6849328478177388, + "epoch": 1.639367224190492, + "grad_norm": 0.5966286659240723, + "learning_rate": 3.4927871476180477e-06, + "loss": 1.3157, + "mean_token_accuracy": 0.6642686674992243, + "num_tokens": 2503762147.0, + "step": 14923 + }, + { + "entropy": 1.695220708847046, + "epoch": 1.6394770811018649, + "grad_norm": 0.5794353485107422, + "learning_rate": 3.4919041466969417e-06, + "loss": 1.4488, + "mean_token_accuracy": 0.6457569946845373, + "num_tokens": 2503993523.0, + "step": 14924 + }, + { + "entropy": 1.7141393721103668, + "epoch": 1.6395869380132377, + "grad_norm": 0.6625608801841736, + "learning_rate": 3.4910213834036848e-06, + "loss": 1.3348, + "mean_token_accuracy": 0.6721012790997823, + "num_tokens": 2504177429.0, + "step": 14925 + }, + { + "entropy": 1.6831740339597066, + "epoch": 1.6396967949246108, + "grad_norm": 0.6153568029403687, + "learning_rate": 3.4901388577662197e-06, + "loss": 1.3432, + "mean_token_accuracy": 0.6737964401642481, + "num_tokens": 2504376626.0, + "step": 14926 + }, + { + "entropy": 1.6954569518566132, + "epoch": 1.6398066518359835, + "grad_norm": 0.6646896600723267, + "learning_rate": 3.489256569812477e-06, + "loss": 1.2977, + "mean_token_accuracy": 0.6855068306128184, + "num_tokens": 2504515974.0, + "step": 14927 + }, + { + "entropy": 1.6713591814041138, + "epoch": 1.6399165087473566, + "grad_norm": 0.7084295153617859, + "learning_rate": 3.4883745195703754e-06, + "loss": 1.3215, + "mean_token_accuracy": 0.672945981224378, + "num_tokens": 2504673537.0, + "step": 14928 + }, + { + "entropy": 1.7667873203754425, + "epoch": 1.6400263656587295, + "grad_norm": 0.7727949619293213, + "learning_rate": 3.487492707067836e-06, + "loss": 1.5748, + "mean_token_accuracy": 0.6411859591801962, + "num_tokens": 2504883359.0, + "step": 14929 + }, + { + "entropy": 1.6953539848327637, + "epoch": 1.6401362225701024, + "grad_norm": 0.6839701533317566, + "learning_rate": 3.486611132332772e-06, + "loss": 1.3055, + "mean_token_accuracy": 0.6642041752735773, + "num_tokens": 2505002882.0, + "step": 14930 + }, + { + "entropy": 1.7143741349379222, + "epoch": 1.6402460794814755, + "grad_norm": 0.6058863997459412, + "learning_rate": 3.485729795393075e-06, + "loss": 1.4816, + "mean_token_accuracy": 0.6417601952950159, + "num_tokens": 2505217245.0, + "step": 14931 + }, + { + "entropy": 1.7433338264624278, + "epoch": 1.6403559363928482, + "grad_norm": 0.6095753312110901, + "learning_rate": 3.484848696276645e-06, + "loss": 1.4746, + "mean_token_accuracy": 0.6288290123144785, + "num_tokens": 2505482140.0, + "step": 14932 + }, + { + "entropy": 1.715920130411784, + "epoch": 1.6404657933042213, + "grad_norm": 0.7610313296318054, + "learning_rate": 3.4839678350113688e-06, + "loss": 1.3884, + "mean_token_accuracy": 0.6634253362814585, + "num_tokens": 2505640431.0, + "step": 14933 + }, + { + "entropy": 1.6781785488128662, + "epoch": 1.6405756502155942, + "grad_norm": 0.8470014333724976, + "learning_rate": 3.4830872116251235e-06, + "loss": 1.5552, + "mean_token_accuracy": 0.6412830402453741, + "num_tokens": 2505826234.0, + "step": 14934 + }, + { + "entropy": 1.7671978374322255, + "epoch": 1.640685507126967, + "grad_norm": 0.8706827163696289, + "learning_rate": 3.4822068261457785e-06, + "loss": 1.504, + "mean_token_accuracy": 0.6506080453594526, + "num_tokens": 2505955616.0, + "step": 14935 + }, + { + "entropy": 1.7093442579110463, + "epoch": 1.6407953640383401, + "grad_norm": 0.6982713341712952, + "learning_rate": 3.4813266786012024e-06, + "loss": 1.2674, + "mean_token_accuracy": 0.6796736617883047, + "num_tokens": 2506098680.0, + "step": 14936 + }, + { + "entropy": 1.676591416200002, + "epoch": 1.640905220949713, + "grad_norm": 0.641269326210022, + "learning_rate": 3.480446769019248e-06, + "loss": 1.3135, + "mean_token_accuracy": 0.674098422129949, + "num_tokens": 2506228004.0, + "step": 14937 + }, + { + "entropy": 1.7529121339321136, + "epoch": 1.641015077861086, + "grad_norm": 0.7853704690933228, + "learning_rate": 3.4795670974277657e-06, + "loss": 1.5918, + "mean_token_accuracy": 0.6261871109406153, + "num_tokens": 2506412039.0, + "step": 14938 + }, + { + "entropy": 1.7120693922042847, + "epoch": 1.641124934772459, + "grad_norm": 0.6365554332733154, + "learning_rate": 3.478687663854596e-06, + "loss": 1.4076, + "mean_token_accuracy": 0.6571402897437414, + "num_tokens": 2506587078.0, + "step": 14939 + }, + { + "entropy": 1.7732374270757039, + "epoch": 1.6412347916838317, + "grad_norm": 0.7040333151817322, + "learning_rate": 3.4778084683275703e-06, + "loss": 1.4869, + "mean_token_accuracy": 0.6406953384478887, + "num_tokens": 2506742349.0, + "step": 14940 + }, + { + "entropy": 1.7779802978038788, + "epoch": 1.6413446485952048, + "grad_norm": 0.8952456712722778, + "learning_rate": 3.4769295108745177e-06, + "loss": 1.5624, + "mean_token_accuracy": 0.6528476029634476, + "num_tokens": 2506869120.0, + "step": 14941 + }, + { + "entropy": 1.7203821142514546, + "epoch": 1.6414545055065777, + "grad_norm": 0.6606357097625732, + "learning_rate": 3.47605079152326e-06, + "loss": 1.2624, + "mean_token_accuracy": 0.670018677910169, + "num_tokens": 2506989406.0, + "step": 14942 + }, + { + "entropy": 1.708219975233078, + "epoch": 1.6415643624179506, + "grad_norm": 0.7178025841712952, + "learning_rate": 3.4751723103016e-06, + "loss": 1.3562, + "mean_token_accuracy": 0.6561354100704193, + "num_tokens": 2507122848.0, + "step": 14943 + }, + { + "entropy": 1.7271239757537842, + "epoch": 1.6416742193293237, + "grad_norm": 0.7366745471954346, + "learning_rate": 3.4742940672373464e-06, + "loss": 1.3486, + "mean_token_accuracy": 0.6560288916031519, + "num_tokens": 2507321375.0, + "step": 14944 + }, + { + "entropy": 1.7078354159990947, + "epoch": 1.6417840762406963, + "grad_norm": 0.6426877975463867, + "learning_rate": 3.473416062358296e-06, + "loss": 1.3707, + "mean_token_accuracy": 0.665172666311264, + "num_tokens": 2507486772.0, + "step": 14945 + }, + { + "entropy": 1.7115601897239685, + "epoch": 1.6418939331520694, + "grad_norm": 0.7070155143737793, + "learning_rate": 3.472538295692235e-06, + "loss": 1.3945, + "mean_token_accuracy": 0.6624555140733719, + "num_tokens": 2507685516.0, + "step": 14946 + }, + { + "entropy": 1.7395752469698589, + "epoch": 1.6420037900634423, + "grad_norm": 0.8616496920585632, + "learning_rate": 3.4716607672669435e-06, + "loss": 1.4793, + "mean_token_accuracy": 0.6566527982552847, + "num_tokens": 2507832761.0, + "step": 14947 + }, + { + "entropy": 1.670799434185028, + "epoch": 1.6421136469748152, + "grad_norm": 0.7337598204612732, + "learning_rate": 3.4707834771101985e-06, + "loss": 1.4037, + "mean_token_accuracy": 0.6581169764200846, + "num_tokens": 2508037225.0, + "step": 14948 + }, + { + "entropy": 1.654303212960561, + "epoch": 1.6422235038861883, + "grad_norm": 0.8170070052146912, + "learning_rate": 3.4699064252497616e-06, + "loss": 1.4118, + "mean_token_accuracy": 0.6585876593987147, + "num_tokens": 2508185283.0, + "step": 14949 + }, + { + "entropy": 1.6956948439280193, + "epoch": 1.6423333607975612, + "grad_norm": 0.6304458975791931, + "learning_rate": 3.469029611713395e-06, + "loss": 1.349, + "mean_token_accuracy": 0.6663823227087656, + "num_tokens": 2508349025.0, + "step": 14950 + }, + { + "entropy": 1.6847296555836995, + "epoch": 1.642443217708934, + "grad_norm": 0.7351884245872498, + "learning_rate": 3.4681530365288484e-06, + "loss": 1.3056, + "mean_token_accuracy": 0.674777110417684, + "num_tokens": 2508469488.0, + "step": 14951 + }, + { + "entropy": 1.7506476143995922, + "epoch": 1.6425530746203072, + "grad_norm": 0.7597024440765381, + "learning_rate": 3.4672766997238618e-06, + "loss": 1.4348, + "mean_token_accuracy": 0.6497123142083486, + "num_tokens": 2508616481.0, + "step": 14952 + }, + { + "entropy": 1.715817630290985, + "epoch": 1.6426629315316799, + "grad_norm": 0.6161386370658875, + "learning_rate": 3.4664006013261733e-06, + "loss": 1.4003, + "mean_token_accuracy": 0.6562450776497523, + "num_tokens": 2508774763.0, + "step": 14953 + }, + { + "entropy": 1.7526540557543437, + "epoch": 1.642772788443053, + "grad_norm": 0.7678797841072083, + "learning_rate": 3.465524741363515e-06, + "loss": 1.4103, + "mean_token_accuracy": 0.6473128894964854, + "num_tokens": 2508964757.0, + "step": 14954 + }, + { + "entropy": 1.6675111552079518, + "epoch": 1.6428826453544259, + "grad_norm": 0.6056217551231384, + "learning_rate": 3.464649119863599e-06, + "loss": 1.3892, + "mean_token_accuracy": 0.6614933907985687, + "num_tokens": 2509142762.0, + "step": 14955 + }, + { + "entropy": 1.7322762807210286, + "epoch": 1.6429925022657987, + "grad_norm": 0.711068868637085, + "learning_rate": 3.4637737368541436e-06, + "loss": 1.4018, + "mean_token_accuracy": 0.6589230199654897, + "num_tokens": 2509286834.0, + "step": 14956 + }, + { + "entropy": 1.6484521726767223, + "epoch": 1.6431023591771718, + "grad_norm": 0.6235215663909912, + "learning_rate": 3.462898592362855e-06, + "loss": 1.494, + "mean_token_accuracy": 0.6627842883268992, + "num_tokens": 2509473391.0, + "step": 14957 + }, + { + "entropy": 1.7106922964255016, + "epoch": 1.6432122160885445, + "grad_norm": 0.6999409794807434, + "learning_rate": 3.4620236864174308e-06, + "loss": 1.3157, + "mean_token_accuracy": 0.6589783877134323, + "num_tokens": 2509587763.0, + "step": 14958 + }, + { + "entropy": 1.67958868543307, + "epoch": 1.6433220729999176, + "grad_norm": 0.6608509421348572, + "learning_rate": 3.4611490190455566e-06, + "loss": 1.2957, + "mean_token_accuracy": 0.6818757752577463, + "num_tokens": 2509728836.0, + "step": 14959 + }, + { + "entropy": 1.6971628268559773, + "epoch": 1.6434319299112905, + "grad_norm": 0.7482900023460388, + "learning_rate": 3.460274590274922e-06, + "loss": 1.3908, + "mean_token_accuracy": 0.6718897273143133, + "num_tokens": 2509872917.0, + "step": 14960 + }, + { + "entropy": 1.650085061788559, + "epoch": 1.6435417868226634, + "grad_norm": 0.7086712718009949, + "learning_rate": 3.4594004001331964e-06, + "loss": 1.3207, + "mean_token_accuracy": 0.6673696339130402, + "num_tokens": 2510005891.0, + "step": 14961 + }, + { + "entropy": 1.7292489111423492, + "epoch": 1.6436516437340365, + "grad_norm": 0.7552675008773804, + "learning_rate": 3.458526448648053e-06, + "loss": 1.3219, + "mean_token_accuracy": 0.6736279179652532, + "num_tokens": 2510168802.0, + "step": 14962 + }, + { + "entropy": 1.6483072837193806, + "epoch": 1.6437615006454094, + "grad_norm": 0.74481600522995, + "learning_rate": 3.457652735847148e-06, + "loss": 1.3123, + "mean_token_accuracy": 0.6694979121287664, + "num_tokens": 2510308210.0, + "step": 14963 + }, + { + "entropy": 1.6588951746622722, + "epoch": 1.6438713575567823, + "grad_norm": 0.6103244423866272, + "learning_rate": 3.456779261758134e-06, + "loss": 1.3824, + "mean_token_accuracy": 0.6662068615357081, + "num_tokens": 2510492255.0, + "step": 14964 + }, + { + "entropy": 1.732719083627065, + "epoch": 1.6439812144681554, + "grad_norm": 0.7661134004592896, + "learning_rate": 3.455906026408658e-06, + "loss": 1.518, + "mean_token_accuracy": 0.6458713908990225, + "num_tokens": 2510699730.0, + "step": 14965 + }, + { + "entropy": 1.680096020301183, + "epoch": 1.644091071379528, + "grad_norm": 0.6338300704956055, + "learning_rate": 3.45503302982636e-06, + "loss": 1.4193, + "mean_token_accuracy": 0.6609679808219274, + "num_tokens": 2510884193.0, + "step": 14966 + }, + { + "entropy": 1.7061445514361064, + "epoch": 1.6442009282909011, + "grad_norm": 0.7027104496955872, + "learning_rate": 3.4541602720388633e-06, + "loss": 1.3129, + "mean_token_accuracy": 0.6661281039317449, + "num_tokens": 2511064612.0, + "step": 14967 + }, + { + "entropy": 1.7287393510341644, + "epoch": 1.644310785202274, + "grad_norm": 0.6110780239105225, + "learning_rate": 3.453287753073793e-06, + "loss": 1.3438, + "mean_token_accuracy": 0.673071970542272, + "num_tokens": 2511225799.0, + "step": 14968 + }, + { + "entropy": 1.7669932544231415, + "epoch": 1.644420642113647, + "grad_norm": 0.7104088068008423, + "learning_rate": 3.452415472958767e-06, + "loss": 1.5157, + "mean_token_accuracy": 0.6437924156586329, + "num_tokens": 2511418847.0, + "step": 14969 + }, + { + "entropy": 1.720796098311742, + "epoch": 1.64453049902502, + "grad_norm": 0.6525430679321289, + "learning_rate": 3.4515434317213904e-06, + "loss": 1.2248, + "mean_token_accuracy": 0.6795699944098791, + "num_tokens": 2511526638.0, + "step": 14970 + }, + { + "entropy": 1.7100069324175518, + "epoch": 1.6446403559363927, + "grad_norm": 0.7813405990600586, + "learning_rate": 3.4506716293892614e-06, + "loss": 1.2652, + "mean_token_accuracy": 0.6727269490559896, + "num_tokens": 2511659416.0, + "step": 14971 + }, + { + "entropy": 1.7985884646574657, + "epoch": 1.6447502128477658, + "grad_norm": 0.8207912445068359, + "learning_rate": 3.4498000659899745e-06, + "loss": 1.4898, + "mean_token_accuracy": 0.6614548414945602, + "num_tokens": 2511868418.0, + "step": 14972 + }, + { + "entropy": 1.6758454938729603, + "epoch": 1.6448600697591387, + "grad_norm": 0.6920216679573059, + "learning_rate": 3.4489287415511107e-06, + "loss": 1.2647, + "mean_token_accuracy": 0.676102747519811, + "num_tokens": 2511999213.0, + "step": 14973 + }, + { + "entropy": 1.7158870200316112, + "epoch": 1.6449699266705116, + "grad_norm": 2.2388501167297363, + "learning_rate": 3.4480576561002533e-06, + "loss": 1.3928, + "mean_token_accuracy": 0.6611330558856329, + "num_tokens": 2512146843.0, + "step": 14974 + }, + { + "entropy": 1.7090627551078796, + "epoch": 1.6450797835818847, + "grad_norm": 0.8154202699661255, + "learning_rate": 3.4471868096649676e-06, + "loss": 1.3153, + "mean_token_accuracy": 0.6673569430907568, + "num_tokens": 2512297770.0, + "step": 14975 + }, + { + "entropy": 1.6612287263075511, + "epoch": 1.6451896404932576, + "grad_norm": 0.7027810215950012, + "learning_rate": 3.4463162022728137e-06, + "loss": 1.3933, + "mean_token_accuracy": 0.6641417344411215, + "num_tokens": 2512435108.0, + "step": 14976 + }, + { + "entropy": 1.6808200577894847, + "epoch": 1.6452994974046304, + "grad_norm": 0.7793719172477722, + "learning_rate": 3.4454458339513487e-06, + "loss": 1.2633, + "mean_token_accuracy": 0.6780805687109629, + "num_tokens": 2512623707.0, + "step": 14977 + }, + { + "entropy": 1.7211743195851643, + "epoch": 1.6454093543160035, + "grad_norm": 1.0265836715698242, + "learning_rate": 3.4445757047281226e-06, + "loss": 1.3302, + "mean_token_accuracy": 0.6751032521327337, + "num_tokens": 2512792244.0, + "step": 14978 + }, + { + "entropy": 1.7436003684997559, + "epoch": 1.6455192112273762, + "grad_norm": 0.688589870929718, + "learning_rate": 3.443705814630666e-06, + "loss": 1.3625, + "mean_token_accuracy": 0.6572001427412033, + "num_tokens": 2512925361.0, + "step": 14979 + }, + { + "entropy": 1.7435889144738514, + "epoch": 1.6456290681387493, + "grad_norm": 3.754509210586548, + "learning_rate": 3.4428361636865167e-06, + "loss": 1.1535, + "mean_token_accuracy": 0.6706924239794413, + "num_tokens": 2513119666.0, + "step": 14980 + }, + { + "entropy": 1.7060795525709789, + "epoch": 1.6457389250501222, + "grad_norm": 0.5898798108100891, + "learning_rate": 3.441966751923199e-06, + "loss": 1.489, + "mean_token_accuracy": 0.6470472464958826, + "num_tokens": 2513302051.0, + "step": 14981 + }, + { + "entropy": 1.7448813021183014, + "epoch": 1.645848781961495, + "grad_norm": 0.7016264796257019, + "learning_rate": 3.441097579368228e-06, + "loss": 1.4278, + "mean_token_accuracy": 0.6593179255723953, + "num_tokens": 2513439223.0, + "step": 14982 + }, + { + "entropy": 1.7329972485701244, + "epoch": 1.6459586388728682, + "grad_norm": 0.72443687915802, + "learning_rate": 3.440228646049112e-06, + "loss": 1.4211, + "mean_token_accuracy": 0.6588300516208013, + "num_tokens": 2513578549.0, + "step": 14983 + }, + { + "entropy": 1.7184851070245106, + "epoch": 1.646068495784241, + "grad_norm": 0.7031991481781006, + "learning_rate": 3.439359951993351e-06, + "loss": 1.4099, + "mean_token_accuracy": 0.6423845837513605, + "num_tokens": 2513802978.0, + "step": 14984 + }, + { + "entropy": 1.7713424662748973, + "epoch": 1.646178352695614, + "grad_norm": 0.7661514282226562, + "learning_rate": 3.438491497228441e-06, + "loss": 1.3603, + "mean_token_accuracy": 0.6532770196596781, + "num_tokens": 2513909884.0, + "step": 14985 + }, + { + "entropy": 1.708994189898173, + "epoch": 1.6462882096069869, + "grad_norm": 0.7131803035736084, + "learning_rate": 3.4376232817818724e-06, + "loss": 1.4916, + "mean_token_accuracy": 0.6488902270793915, + "num_tokens": 2514101465.0, + "step": 14986 + }, + { + "entropy": 1.7118847767512004, + "epoch": 1.6463980665183597, + "grad_norm": 0.5871043801307678, + "learning_rate": 3.4367553056811143e-06, + "loss": 1.3621, + "mean_token_accuracy": 0.6566170553366343, + "num_tokens": 2514282837.0, + "step": 14987 + }, + { + "entropy": 1.6855728328227997, + "epoch": 1.6465079234297328, + "grad_norm": 0.6960379481315613, + "learning_rate": 3.4358875689536424e-06, + "loss": 1.2995, + "mean_token_accuracy": 0.6776407758394877, + "num_tokens": 2514466067.0, + "step": 14988 + }, + { + "entropy": 1.7425021131833394, + "epoch": 1.6466177803411057, + "grad_norm": 0.6158261299133301, + "learning_rate": 3.435020071626923e-06, + "loss": 1.5397, + "mean_token_accuracy": 0.6305239746967951, + "num_tokens": 2514682417.0, + "step": 14989 + }, + { + "entropy": 1.7520319521427155, + "epoch": 1.6467276372524786, + "grad_norm": 0.8240329623222351, + "learning_rate": 3.4341528137284097e-06, + "loss": 1.3623, + "mean_token_accuracy": 0.6614614178737005, + "num_tokens": 2514865345.0, + "step": 14990 + }, + { + "entropy": 1.660654256741206, + "epoch": 1.6468374941638517, + "grad_norm": 0.7162386775016785, + "learning_rate": 3.433285795285548e-06, + "loss": 1.2122, + "mean_token_accuracy": 0.6758624712626139, + "num_tokens": 2514996123.0, + "step": 14991 + }, + { + "entropy": 1.7671376864115398, + "epoch": 1.6469473510752244, + "grad_norm": 0.73747718334198, + "learning_rate": 3.432419016325784e-06, + "loss": 1.5858, + "mean_token_accuracy": 0.6482378343741099, + "num_tokens": 2515213008.0, + "step": 14992 + }, + { + "entropy": 1.7469678024450939, + "epoch": 1.6470572079865975, + "grad_norm": 0.7026829719543457, + "learning_rate": 3.431552476876545e-06, + "loss": 1.5166, + "mean_token_accuracy": 0.6324778149525324, + "num_tokens": 2515397718.0, + "step": 14993 + }, + { + "entropy": 1.6981361210346222, + "epoch": 1.6471670648979704, + "grad_norm": 0.6470394134521484, + "learning_rate": 3.4306861769652634e-06, + "loss": 1.4181, + "mean_token_accuracy": 0.6427850276231766, + "num_tokens": 2515598794.0, + "step": 14994 + }, + { + "entropy": 1.6396251320838928, + "epoch": 1.6472769218093433, + "grad_norm": 0.6603129506111145, + "learning_rate": 3.4298201166193512e-06, + "loss": 1.2424, + "mean_token_accuracy": 0.6694211512804031, + "num_tokens": 2515725829.0, + "step": 14995 + }, + { + "entropy": 1.6787129143873851, + "epoch": 1.6473867787207164, + "grad_norm": 0.7073856592178345, + "learning_rate": 3.4289542958662212e-06, + "loss": 1.3735, + "mean_token_accuracy": 0.6567325393358866, + "num_tokens": 2515892679.0, + "step": 14996 + }, + { + "entropy": 1.6887877583503723, + "epoch": 1.6474966356320893, + "grad_norm": 0.7140524983406067, + "learning_rate": 3.428088714733274e-06, + "loss": 1.3827, + "mean_token_accuracy": 0.6687111059824625, + "num_tokens": 2516054075.0, + "step": 14997 + }, + { + "entropy": 1.6639872093995411, + "epoch": 1.6476064925434621, + "grad_norm": 0.6772821545600891, + "learning_rate": 3.4272233732479134e-06, + "loss": 1.344, + "mean_token_accuracy": 0.6666690111160278, + "num_tokens": 2516209836.0, + "step": 14998 + }, + { + "entropy": 1.6863376994927723, + "epoch": 1.647716349454835, + "grad_norm": 0.6367020606994629, + "learning_rate": 3.4263582714375152e-06, + "loss": 1.3958, + "mean_token_accuracy": 0.6636939545472463, + "num_tokens": 2516398662.0, + "step": 14999 + }, + { + "entropy": 1.622712602217992, + "epoch": 1.647826206366208, + "grad_norm": 0.7956864237785339, + "learning_rate": 3.4254934093294655e-06, + "loss": 1.4934, + "mean_token_accuracy": 0.6606844613949457, + "num_tokens": 2516590676.0, + "step": 15000 + }, + { + "entropy": 1.7188996473948162, + "epoch": 1.647936063277581, + "grad_norm": 0.6622449159622192, + "learning_rate": 3.4246287869511373e-06, + "loss": 1.4132, + "mean_token_accuracy": 0.652355432510376, + "num_tokens": 2516765487.0, + "step": 15001 + }, + { + "entropy": 1.6729025741418202, + "epoch": 1.648045920188954, + "grad_norm": 0.7184236645698547, + "learning_rate": 3.423764404329895e-06, + "loss": 1.4057, + "mean_token_accuracy": 0.6480544259150823, + "num_tokens": 2516931514.0, + "step": 15002 + }, + { + "entropy": 1.721810112396876, + "epoch": 1.6481557771003268, + "grad_norm": 0.6725652813911438, + "learning_rate": 3.422900261493094e-06, + "loss": 1.4415, + "mean_token_accuracy": 0.6622193058331808, + "num_tokens": 2517100166.0, + "step": 15003 + }, + { + "entropy": 1.7754334608713787, + "epoch": 1.6482656340117, + "grad_norm": 0.7001306414604187, + "learning_rate": 3.4220363584680873e-06, + "loss": 1.4849, + "mean_token_accuracy": 0.6442708969116211, + "num_tokens": 2517323102.0, + "step": 15004 + }, + { + "entropy": 1.761822521686554, + "epoch": 1.6483754909230726, + "grad_norm": 0.6547316908836365, + "learning_rate": 3.421172695282213e-06, + "loss": 1.5457, + "mean_token_accuracy": 0.6352403461933136, + "num_tokens": 2517531301.0, + "step": 15005 + }, + { + "entropy": 1.711624006430308, + "epoch": 1.6484853478344457, + "grad_norm": 0.6645762324333191, + "learning_rate": 3.4203092719628096e-06, + "loss": 1.2913, + "mean_token_accuracy": 0.6747534225384394, + "num_tokens": 2517647850.0, + "step": 15006 + }, + { + "entropy": 1.6479672491550446, + "epoch": 1.6485952047458186, + "grad_norm": 0.7587347626686096, + "learning_rate": 3.4194460885372016e-06, + "loss": 1.2875, + "mean_token_accuracy": 0.665214791893959, + "num_tokens": 2517772787.0, + "step": 15007 + }, + { + "entropy": 1.6956773499647777, + "epoch": 1.6487050616571914, + "grad_norm": 0.7115501165390015, + "learning_rate": 3.4185831450327077e-06, + "loss": 1.3814, + "mean_token_accuracy": 0.6578154365221659, + "num_tokens": 2517915878.0, + "step": 15008 + }, + { + "entropy": 1.6962638994057972, + "epoch": 1.6488149185685645, + "grad_norm": 0.6763247847557068, + "learning_rate": 3.4177204414766405e-06, + "loss": 1.3814, + "mean_token_accuracy": 0.6662989805142084, + "num_tokens": 2518072919.0, + "step": 15009 + }, + { + "entropy": 1.6879964172840118, + "epoch": 1.6489247754799374, + "grad_norm": 0.5787865519523621, + "learning_rate": 3.4168579778963097e-06, + "loss": 1.4668, + "mean_token_accuracy": 0.6475926488637924, + "num_tokens": 2518230729.0, + "step": 15010 + }, + { + "entropy": 1.7373622755209606, + "epoch": 1.6490346323913103, + "grad_norm": 0.7441072463989258, + "learning_rate": 3.4159957543190015e-06, + "loss": 1.3008, + "mean_token_accuracy": 0.658590778708458, + "num_tokens": 2518364564.0, + "step": 15011 + }, + { + "entropy": 1.644492268562317, + "epoch": 1.6491444893026832, + "grad_norm": 0.6323592662811279, + "learning_rate": 3.4151337707720113e-06, + "loss": 1.3825, + "mean_token_accuracy": 0.6613515466451645, + "num_tokens": 2518576546.0, + "step": 15012 + }, + { + "entropy": 1.7661062677701314, + "epoch": 1.649254346214056, + "grad_norm": 0.7031536102294922, + "learning_rate": 3.414272027282621e-06, + "loss": 1.3482, + "mean_token_accuracy": 0.6614481111367544, + "num_tokens": 2518741668.0, + "step": 15013 + }, + { + "entropy": 1.725075602531433, + "epoch": 1.6493642031254292, + "grad_norm": 0.6364622712135315, + "learning_rate": 3.4134105238781033e-06, + "loss": 1.5204, + "mean_token_accuracy": 0.6369550327459971, + "num_tokens": 2518948581.0, + "step": 15014 + }, + { + "entropy": 1.716450273990631, + "epoch": 1.649474060036802, + "grad_norm": 0.897272527217865, + "learning_rate": 3.4125492605857215e-06, + "loss": 1.389, + "mean_token_accuracy": 0.6591392507155737, + "num_tokens": 2519111501.0, + "step": 15015 + }, + { + "entropy": 1.7680715421835582, + "epoch": 1.649583916948175, + "grad_norm": 0.6573340892791748, + "learning_rate": 3.411688237432739e-06, + "loss": 1.3537, + "mean_token_accuracy": 0.6606834232807159, + "num_tokens": 2519237910.0, + "step": 15016 + }, + { + "entropy": 1.744692752758662, + "epoch": 1.649693773859548, + "grad_norm": 0.6951401233673096, + "learning_rate": 3.4108274544464015e-06, + "loss": 1.5013, + "mean_token_accuracy": 0.6513955841461817, + "num_tokens": 2519403504.0, + "step": 15017 + }, + { + "entropy": 1.705136001110077, + "epoch": 1.6498036307709207, + "grad_norm": 0.6180141568183899, + "learning_rate": 3.409966911653958e-06, + "loss": 1.3271, + "mean_token_accuracy": 0.6569055368502935, + "num_tokens": 2519558250.0, + "step": 15018 + }, + { + "entropy": 1.733963628609975, + "epoch": 1.6499134876822938, + "grad_norm": 0.8402661085128784, + "learning_rate": 3.4091066090826415e-06, + "loss": 1.2062, + "mean_token_accuracy": 0.6875236531098684, + "num_tokens": 2519675800.0, + "step": 15019 + }, + { + "entropy": 1.7446452577908833, + "epoch": 1.6500233445936667, + "grad_norm": 0.6582059860229492, + "learning_rate": 3.4082465467596783e-06, + "loss": 1.5078, + "mean_token_accuracy": 0.6486608684062958, + "num_tokens": 2519833913.0, + "step": 15020 + }, + { + "entropy": 1.6334237158298492, + "epoch": 1.6501332015050396, + "grad_norm": 0.7269055247306824, + "learning_rate": 3.4073867247122906e-06, + "loss": 1.2758, + "mean_token_accuracy": 0.6774081140756607, + "num_tokens": 2519999588.0, + "step": 15021 + }, + { + "entropy": 1.709044208129247, + "epoch": 1.6502430584164127, + "grad_norm": 0.6881639957427979, + "learning_rate": 3.4065271429676965e-06, + "loss": 1.6085, + "mean_token_accuracy": 0.6218457967042923, + "num_tokens": 2520245755.0, + "step": 15022 + }, + { + "entropy": 1.6926098664601643, + "epoch": 1.6503529153277856, + "grad_norm": 0.6597875356674194, + "learning_rate": 3.405667801553092e-06, + "loss": 1.3715, + "mean_token_accuracy": 0.6609023263057073, + "num_tokens": 2520443165.0, + "step": 15023 + }, + { + "entropy": 1.7041344543298085, + "epoch": 1.6504627722391585, + "grad_norm": 0.6892362236976624, + "learning_rate": 3.4048087004956797e-06, + "loss": 1.2991, + "mean_token_accuracy": 0.6705830295880636, + "num_tokens": 2520581997.0, + "step": 15024 + }, + { + "entropy": 1.6341348787148793, + "epoch": 1.6505726291505314, + "grad_norm": 0.7210821509361267, + "learning_rate": 3.403949839822652e-06, + "loss": 1.3469, + "mean_token_accuracy": 0.6602633595466614, + "num_tokens": 2520821919.0, + "step": 15025 + }, + { + "entropy": 1.6658415794372559, + "epoch": 1.6506824860619043, + "grad_norm": 0.7110158205032349, + "learning_rate": 3.403091219561188e-06, + "loss": 1.327, + "mean_token_accuracy": 0.6694478690624237, + "num_tokens": 2520982309.0, + "step": 15026 + }, + { + "entropy": 1.7470557987689972, + "epoch": 1.6507923429732774, + "grad_norm": 0.6562235355377197, + "learning_rate": 3.4022328397384624e-06, + "loss": 1.1178, + "mean_token_accuracy": 0.6828839977582296, + "num_tokens": 2521145714.0, + "step": 15027 + }, + { + "entropy": 1.7470394472281139, + "epoch": 1.6509021998846503, + "grad_norm": 0.6661121845245361, + "learning_rate": 3.4013747003816454e-06, + "loss": 1.3084, + "mean_token_accuracy": 0.662788599729538, + "num_tokens": 2521295381.0, + "step": 15028 + }, + { + "entropy": 1.7361040512720745, + "epoch": 1.6510120567960231, + "grad_norm": 0.9415419697761536, + "learning_rate": 3.4005168015178935e-06, + "loss": 1.4151, + "mean_token_accuracy": 0.648168628414472, + "num_tokens": 2521442369.0, + "step": 15029 + }, + { + "entropy": 1.7062184512615204, + "epoch": 1.6511219137073962, + "grad_norm": 0.6462988257408142, + "learning_rate": 3.399659143174362e-06, + "loss": 1.3354, + "mean_token_accuracy": 0.6528641134500504, + "num_tokens": 2521654103.0, + "step": 15030 + }, + { + "entropy": 1.6909612814585369, + "epoch": 1.651231770618769, + "grad_norm": 0.7832234501838684, + "learning_rate": 3.3988017253781936e-06, + "loss": 1.2744, + "mean_token_accuracy": 0.6736619373162588, + "num_tokens": 2521818430.0, + "step": 15031 + }, + { + "entropy": 1.7374659776687622, + "epoch": 1.651341627530142, + "grad_norm": 0.6640949249267578, + "learning_rate": 3.3979445481565244e-06, + "loss": 1.4493, + "mean_token_accuracy": 0.6429694543282191, + "num_tokens": 2521991665.0, + "step": 15032 + }, + { + "entropy": 1.781148185332616, + "epoch": 1.651451484441515, + "grad_norm": 0.732386589050293, + "learning_rate": 3.397087611536485e-06, + "loss": 1.5056, + "mean_token_accuracy": 0.6434811403354009, + "num_tokens": 2522181259.0, + "step": 15033 + }, + { + "entropy": 1.7025253772735596, + "epoch": 1.6515613413528878, + "grad_norm": 0.7074692845344543, + "learning_rate": 3.3962309155451993e-06, + "loss": 1.3074, + "mean_token_accuracy": 0.681415448586146, + "num_tokens": 2522343910.0, + "step": 15034 + }, + { + "entropy": 1.6830189228057861, + "epoch": 1.651671198264261, + "grad_norm": 0.7864646911621094, + "learning_rate": 3.395374460209776e-06, + "loss": 1.5032, + "mean_token_accuracy": 0.6472100963195165, + "num_tokens": 2522528624.0, + "step": 15035 + }, + { + "entropy": 1.6839284698168437, + "epoch": 1.6517810551756338, + "grad_norm": 0.7211703062057495, + "learning_rate": 3.3945182455573234e-06, + "loss": 1.3273, + "mean_token_accuracy": 0.6669267763694128, + "num_tokens": 2522664262.0, + "step": 15036 + }, + { + "entropy": 1.6885885000228882, + "epoch": 1.6518909120870067, + "grad_norm": 0.5619406700134277, + "learning_rate": 3.3936622716149432e-06, + "loss": 1.3137, + "mean_token_accuracy": 0.6660237014293671, + "num_tokens": 2522855978.0, + "step": 15037 + }, + { + "entropy": 1.695332556962967, + "epoch": 1.6520007689983798, + "grad_norm": 0.7055963277816772, + "learning_rate": 3.3928065384097252e-06, + "loss": 1.3109, + "mean_token_accuracy": 0.6599967380364736, + "num_tokens": 2523011118.0, + "step": 15038 + }, + { + "entropy": 1.74105371038119, + "epoch": 1.6521106259097524, + "grad_norm": 0.8354093432426453, + "learning_rate": 3.3919510459687495e-06, + "loss": 1.4608, + "mean_token_accuracy": 0.6649217158555984, + "num_tokens": 2523151102.0, + "step": 15039 + }, + { + "entropy": 1.69448517759641, + "epoch": 1.6522204828211255, + "grad_norm": 1.118112325668335, + "learning_rate": 3.3910957943190974e-06, + "loss": 1.142, + "mean_token_accuracy": 0.6787567436695099, + "num_tokens": 2523391240.0, + "step": 15040 + }, + { + "entropy": 1.6811459958553314, + "epoch": 1.6523303397324984, + "grad_norm": 0.5884284377098083, + "learning_rate": 3.390240783487833e-06, + "loss": 1.4136, + "mean_token_accuracy": 0.6765343199173609, + "num_tokens": 2523565283.0, + "step": 15041 + }, + { + "entropy": 1.7217009564240773, + "epoch": 1.6524401966438713, + "grad_norm": 0.7727037072181702, + "learning_rate": 3.3893860135020213e-06, + "loss": 1.2789, + "mean_token_accuracy": 0.6699222077926, + "num_tokens": 2523681428.0, + "step": 15042 + }, + { + "entropy": 1.693113128344218, + "epoch": 1.6525500535552444, + "grad_norm": 0.6383533477783203, + "learning_rate": 3.388531484388711e-06, + "loss": 1.3722, + "mean_token_accuracy": 0.6580367684364319, + "num_tokens": 2523890610.0, + "step": 15043 + }, + { + "entropy": 1.7015477518240611, + "epoch": 1.652659910466617, + "grad_norm": 0.8015193343162537, + "learning_rate": 3.38767719617495e-06, + "loss": 1.401, + "mean_token_accuracy": 0.6783540596564611, + "num_tokens": 2524009740.0, + "step": 15044 + }, + { + "entropy": 1.7582744856675465, + "epoch": 1.6527697673779902, + "grad_norm": 0.9063988327980042, + "learning_rate": 3.3868231488877757e-06, + "loss": 1.3525, + "mean_token_accuracy": 0.6595317522684733, + "num_tokens": 2524141825.0, + "step": 15045 + }, + { + "entropy": 1.732056309779485, + "epoch": 1.652879624289363, + "grad_norm": 0.7740351557731628, + "learning_rate": 3.3859693425542186e-06, + "loss": 1.4082, + "mean_token_accuracy": 0.6550064533948898, + "num_tokens": 2524302789.0, + "step": 15046 + }, + { + "entropy": 1.6483930746714275, + "epoch": 1.652989481200736, + "grad_norm": 0.608165979385376, + "learning_rate": 3.385115777201298e-06, + "loss": 1.3458, + "mean_token_accuracy": 0.6636339922746023, + "num_tokens": 2524444935.0, + "step": 15047 + }, + { + "entropy": 1.7196594377358754, + "epoch": 1.653099338112109, + "grad_norm": 0.572071373462677, + "learning_rate": 3.3842624528560353e-06, + "loss": 1.5982, + "mean_token_accuracy": 0.6327051321665446, + "num_tokens": 2524643745.0, + "step": 15048 + }, + { + "entropy": 1.6865974863370259, + "epoch": 1.653209195023482, + "grad_norm": 0.5919678211212158, + "learning_rate": 3.3834093695454313e-06, + "loss": 1.3454, + "mean_token_accuracy": 0.6547940770785013, + "num_tokens": 2524802624.0, + "step": 15049 + }, + { + "entropy": 1.6919112801551819, + "epoch": 1.6533190519348548, + "grad_norm": 0.6698882579803467, + "learning_rate": 3.38255652729649e-06, + "loss": 1.3982, + "mean_token_accuracy": 0.6629907687505087, + "num_tokens": 2524980761.0, + "step": 15050 + }, + { + "entropy": 1.6763626833756764, + "epoch": 1.653428908846228, + "grad_norm": 0.6936999559402466, + "learning_rate": 3.381703926136204e-06, + "loss": 1.314, + "mean_token_accuracy": 0.6573221186796824, + "num_tokens": 2525126619.0, + "step": 15051 + }, + { + "entropy": 1.766062339146932, + "epoch": 1.6535387657576006, + "grad_norm": 0.8053560256958008, + "learning_rate": 3.380851566091552e-06, + "loss": 1.3774, + "mean_token_accuracy": 0.6592030425866445, + "num_tokens": 2525244165.0, + "step": 15052 + }, + { + "entropy": 1.7038917541503906, + "epoch": 1.6536486226689737, + "grad_norm": 0.6169702410697937, + "learning_rate": 3.379999447189516e-06, + "loss": 1.3704, + "mean_token_accuracy": 0.6640800684690475, + "num_tokens": 2525404922.0, + "step": 15053 + }, + { + "entropy": 1.6612831552823384, + "epoch": 1.6537584795803466, + "grad_norm": 0.63535076379776, + "learning_rate": 3.379147569457067e-06, + "loss": 1.5327, + "mean_token_accuracy": 0.6397574096918106, + "num_tokens": 2525625009.0, + "step": 15054 + }, + { + "entropy": 1.7347392141819, + "epoch": 1.6538683364917195, + "grad_norm": 0.6799290776252747, + "learning_rate": 3.3782959329211597e-06, + "loss": 1.3879, + "mean_token_accuracy": 0.6492925484975179, + "num_tokens": 2525797102.0, + "step": 15055 + }, + { + "entropy": 1.750627835591634, + "epoch": 1.6539781934030926, + "grad_norm": 0.6583788394927979, + "learning_rate": 3.3774445376087517e-06, + "loss": 1.6107, + "mean_token_accuracy": 0.6384320706129074, + "num_tokens": 2526036673.0, + "step": 15056 + }, + { + "entropy": 1.7362577716509502, + "epoch": 1.6540880503144653, + "grad_norm": 0.751219630241394, + "learning_rate": 3.3765933835467918e-06, + "loss": 1.3345, + "mean_token_accuracy": 0.6589648723602295, + "num_tokens": 2526248957.0, + "step": 15057 + }, + { + "entropy": 1.7241934339205425, + "epoch": 1.6541979072258384, + "grad_norm": 0.7931070923805237, + "learning_rate": 3.3757424707622156e-06, + "loss": 1.502, + "mean_token_accuracy": 0.6433264712492625, + "num_tokens": 2526468669.0, + "step": 15058 + }, + { + "entropy": 1.652980665365855, + "epoch": 1.6543077641372113, + "grad_norm": 0.7637212872505188, + "learning_rate": 3.374891799281952e-06, + "loss": 1.3658, + "mean_token_accuracy": 0.6645220816135406, + "num_tokens": 2526655073.0, + "step": 15059 + }, + { + "entropy": 1.7018746038277943, + "epoch": 1.6544176210485841, + "grad_norm": 0.673249363899231, + "learning_rate": 3.3740413691329294e-06, + "loss": 1.3373, + "mean_token_accuracy": 0.6641093840201696, + "num_tokens": 2526829589.0, + "step": 15060 + }, + { + "entropy": 1.6794364154338837, + "epoch": 1.6545274779599572, + "grad_norm": 0.7552919387817383, + "learning_rate": 3.3731911803420598e-06, + "loss": 1.4182, + "mean_token_accuracy": 0.6503365089495977, + "num_tokens": 2526986226.0, + "step": 15061 + }, + { + "entropy": 1.7320887843767803, + "epoch": 1.6546373348713301, + "grad_norm": 0.664562463760376, + "learning_rate": 3.3723412329362543e-06, + "loss": 1.3605, + "mean_token_accuracy": 0.6568738867839178, + "num_tokens": 2527153507.0, + "step": 15062 + }, + { + "entropy": 1.7606121798356373, + "epoch": 1.654747191782703, + "grad_norm": 1.1021904945373535, + "learning_rate": 3.3714915269424108e-06, + "loss": 1.6345, + "mean_token_accuracy": 0.6460135305921236, + "num_tokens": 2527341854.0, + "step": 15063 + }, + { + "entropy": 1.6257544159889221, + "epoch": 1.6548570486940761, + "grad_norm": 0.5650546550750732, + "learning_rate": 3.3706420623874213e-06, + "loss": 1.3168, + "mean_token_accuracy": 0.6696870078643163, + "num_tokens": 2527504304.0, + "step": 15064 + }, + { + "entropy": 1.7246950368086498, + "epoch": 1.6549669056054488, + "grad_norm": 0.7503829598426819, + "learning_rate": 3.3697928392981737e-06, + "loss": 1.3115, + "mean_token_accuracy": 0.6665904074907303, + "num_tokens": 2527623793.0, + "step": 15065 + }, + { + "entropy": 1.7343401908874512, + "epoch": 1.655076762516822, + "grad_norm": 0.6358250379562378, + "learning_rate": 3.3689438577015476e-06, + "loss": 1.3758, + "mean_token_accuracy": 0.6481455117464066, + "num_tokens": 2527756953.0, + "step": 15066 + }, + { + "entropy": 1.7057737906773884, + "epoch": 1.6551866194281948, + "grad_norm": 0.5834030508995056, + "learning_rate": 3.3680951176244064e-06, + "loss": 1.5161, + "mean_token_accuracy": 0.6389025648434957, + "num_tokens": 2528005318.0, + "step": 15067 + }, + { + "entropy": 1.6946783165136974, + "epoch": 1.6552964763395677, + "grad_norm": 0.7368245124816895, + "learning_rate": 3.367246619093615e-06, + "loss": 1.502, + "mean_token_accuracy": 0.6575753738482794, + "num_tokens": 2528171311.0, + "step": 15068 + }, + { + "entropy": 1.732050359249115, + "epoch": 1.6554063332509408, + "grad_norm": 0.6881213188171387, + "learning_rate": 3.366398362136031e-06, + "loss": 1.3458, + "mean_token_accuracy": 0.6613701532284418, + "num_tokens": 2528352242.0, + "step": 15069 + }, + { + "entropy": 1.7644882798194885, + "epoch": 1.6555161901623134, + "grad_norm": 0.6945415735244751, + "learning_rate": 3.3655503467784996e-06, + "loss": 1.3223, + "mean_token_accuracy": 0.6620460500319799, + "num_tokens": 2528508423.0, + "step": 15070 + }, + { + "entropy": 1.6908580263455708, + "epoch": 1.6556260470736865, + "grad_norm": 0.704821765422821, + "learning_rate": 3.3647025730478566e-06, + "loss": 1.459, + "mean_token_accuracy": 0.6593573639790217, + "num_tokens": 2528690165.0, + "step": 15071 + }, + { + "entropy": 1.6922107140223186, + "epoch": 1.6557359039850594, + "grad_norm": 0.6756055355072021, + "learning_rate": 3.363855040970939e-06, + "loss": 1.5345, + "mean_token_accuracy": 0.6271846890449524, + "num_tokens": 2528963549.0, + "step": 15072 + }, + { + "entropy": 1.712009459733963, + "epoch": 1.6558457608964323, + "grad_norm": 0.6728986501693726, + "learning_rate": 3.3630077505745664e-06, + "loss": 1.3403, + "mean_token_accuracy": 0.6565148731072744, + "num_tokens": 2529077053.0, + "step": 15073 + }, + { + "entropy": 1.6842322250207264, + "epoch": 1.6559556178078054, + "grad_norm": 0.6321828365325928, + "learning_rate": 3.362160701885559e-06, + "loss": 1.3954, + "mean_token_accuracy": 0.6590474247932434, + "num_tokens": 2529232793.0, + "step": 15074 + }, + { + "entropy": 1.7570477823416393, + "epoch": 1.6560654747191783, + "grad_norm": 0.6922457814216614, + "learning_rate": 3.3613138949307246e-06, + "loss": 1.485, + "mean_token_accuracy": 0.663332611322403, + "num_tokens": 2529415470.0, + "step": 15075 + }, + { + "entropy": 1.7421770294507344, + "epoch": 1.6561753316305512, + "grad_norm": 0.825492262840271, + "learning_rate": 3.3604673297368605e-06, + "loss": 1.296, + "mean_token_accuracy": 0.6683741807937622, + "num_tokens": 2529558572.0, + "step": 15076 + }, + { + "entropy": 1.6646507183710735, + "epoch": 1.6562851885419243, + "grad_norm": 0.6699401140213013, + "learning_rate": 3.3596210063307623e-06, + "loss": 1.3958, + "mean_token_accuracy": 0.6553126474221548, + "num_tokens": 2529704252.0, + "step": 15077 + }, + { + "entropy": 1.7033556004365284, + "epoch": 1.656395045453297, + "grad_norm": 0.6611201763153076, + "learning_rate": 3.3587749247392213e-06, + "loss": 1.3577, + "mean_token_accuracy": 0.6602742572625478, + "num_tokens": 2529840132.0, + "step": 15078 + }, + { + "entropy": 1.6565563877423604, + "epoch": 1.65650490236467, + "grad_norm": 0.6807793974876404, + "learning_rate": 3.3579290849890076e-06, + "loss": 1.4446, + "mean_token_accuracy": 0.6356561382611593, + "num_tokens": 2530055346.0, + "step": 15079 + }, + { + "entropy": 1.6784409979979198, + "epoch": 1.656614759276043, + "grad_norm": 0.7249003648757935, + "learning_rate": 3.3570834871068934e-06, + "loss": 1.263, + "mean_token_accuracy": 0.6803757299979528, + "num_tokens": 2530190673.0, + "step": 15080 + }, + { + "entropy": 1.6624310910701752, + "epoch": 1.6567246161874158, + "grad_norm": 0.5706498622894287, + "learning_rate": 3.356238131119645e-06, + "loss": 1.4071, + "mean_token_accuracy": 0.6508588592211405, + "num_tokens": 2530411293.0, + "step": 15081 + }, + { + "entropy": 1.7260961433251698, + "epoch": 1.656834473098789, + "grad_norm": 0.7238296866416931, + "learning_rate": 3.3553930170540166e-06, + "loss": 1.4912, + "mean_token_accuracy": 0.6538258691628774, + "num_tokens": 2530586549.0, + "step": 15082 + }, + { + "entropy": 1.7342715958754222, + "epoch": 1.6569443300101616, + "grad_norm": 0.6654472351074219, + "learning_rate": 3.354548144936751e-06, + "loss": 1.4534, + "mean_token_accuracy": 0.6400155772765478, + "num_tokens": 2530753690.0, + "step": 15083 + }, + { + "entropy": 1.7684525350729625, + "epoch": 1.6570541869215347, + "grad_norm": 1.3236089944839478, + "learning_rate": 3.353703514794594e-06, + "loss": 1.35, + "mean_token_accuracy": 0.669903039932251, + "num_tokens": 2530865045.0, + "step": 15084 + }, + { + "entropy": 1.6798753043015797, + "epoch": 1.6571640438329076, + "grad_norm": 0.6657638549804688, + "learning_rate": 3.3528591266542735e-06, + "loss": 1.3229, + "mean_token_accuracy": 0.6585825930039088, + "num_tokens": 2531001639.0, + "step": 15085 + }, + { + "entropy": 1.7244026064872742, + "epoch": 1.6572739007442805, + "grad_norm": 0.6760687828063965, + "learning_rate": 3.3520149805425174e-06, + "loss": 1.3559, + "mean_token_accuracy": 0.6678232202927271, + "num_tokens": 2531132537.0, + "step": 15086 + }, + { + "entropy": 1.6744465331236522, + "epoch": 1.6573837576556536, + "grad_norm": 0.6795402765274048, + "learning_rate": 3.3511710764860405e-06, + "loss": 1.3079, + "mean_token_accuracy": 0.6721722632646561, + "num_tokens": 2531305857.0, + "step": 15087 + }, + { + "entropy": 1.7191696266333263, + "epoch": 1.6574936145670265, + "grad_norm": 0.7159080505371094, + "learning_rate": 3.3503274145115516e-06, + "loss": 1.3132, + "mean_token_accuracy": 0.6738952944676081, + "num_tokens": 2531423758.0, + "step": 15088 + }, + { + "entropy": 1.6929751634597778, + "epoch": 1.6576034714783994, + "grad_norm": 0.6210808157920837, + "learning_rate": 3.3494839946457525e-06, + "loss": 1.3419, + "mean_token_accuracy": 0.6703117787837982, + "num_tokens": 2531575439.0, + "step": 15089 + }, + { + "entropy": 1.656501869360606, + "epoch": 1.6577133283897725, + "grad_norm": 0.7103717923164368, + "learning_rate": 3.3486408169153413e-06, + "loss": 1.3132, + "mean_token_accuracy": 0.6686868766943613, + "num_tokens": 2531724263.0, + "step": 15090 + }, + { + "entropy": 1.6718024512132008, + "epoch": 1.6578231853011451, + "grad_norm": 0.6390276551246643, + "learning_rate": 3.3477978813469957e-06, + "loss": 1.4407, + "mean_token_accuracy": 0.6513689408699671, + "num_tokens": 2531902650.0, + "step": 15091 + }, + { + "entropy": 1.6487050652503967, + "epoch": 1.6579330422125182, + "grad_norm": 0.6771929860115051, + "learning_rate": 3.3469551879674e-06, + "loss": 1.3452, + "mean_token_accuracy": 0.6610651115576426, + "num_tokens": 2532059077.0, + "step": 15092 + }, + { + "entropy": 1.6755038897196453, + "epoch": 1.6580428991238911, + "grad_norm": 0.697956919670105, + "learning_rate": 3.3461127368032266e-06, + "loss": 1.4732, + "mean_token_accuracy": 0.6597596059242884, + "num_tokens": 2532248252.0, + "step": 15093 + }, + { + "entropy": 1.6560774842898052, + "epoch": 1.658152756035264, + "grad_norm": 0.7152982354164124, + "learning_rate": 3.3452705278811352e-06, + "loss": 1.4238, + "mean_token_accuracy": 0.6569562057654063, + "num_tokens": 2532425762.0, + "step": 15094 + }, + { + "entropy": 1.7077242334683735, + "epoch": 1.6582626129466371, + "grad_norm": 0.5992992520332336, + "learning_rate": 3.3444285612277806e-06, + "loss": 1.3899, + "mean_token_accuracy": 0.6513581027587255, + "num_tokens": 2532616814.0, + "step": 15095 + }, + { + "entropy": 1.7806836167971294, + "epoch": 1.6583724698580098, + "grad_norm": 0.6730424761772156, + "learning_rate": 3.343586836869815e-06, + "loss": 1.4785, + "mean_token_accuracy": 0.6473760406176249, + "num_tokens": 2532754041.0, + "step": 15096 + }, + { + "entropy": 1.694075107574463, + "epoch": 1.658482326769383, + "grad_norm": 0.7324855327606201, + "learning_rate": 3.3427453548338724e-06, + "loss": 1.5491, + "mean_token_accuracy": 0.6529227097829183, + "num_tokens": 2532969301.0, + "step": 15097 + }, + { + "entropy": 1.712545742591222, + "epoch": 1.6585921836807558, + "grad_norm": 0.6637945175170898, + "learning_rate": 3.341904115146592e-06, + "loss": 1.3429, + "mean_token_accuracy": 0.6546710977951685, + "num_tokens": 2533146356.0, + "step": 15098 + }, + { + "entropy": 1.7674176394939423, + "epoch": 1.6587020405921287, + "grad_norm": 0.7276931405067444, + "learning_rate": 3.3410631178345956e-06, + "loss": 1.4362, + "mean_token_accuracy": 0.6518150369326273, + "num_tokens": 2533290788.0, + "step": 15099 + }, + { + "entropy": 1.7356145282586415, + "epoch": 1.6588118975035018, + "grad_norm": 0.6265212893486023, + "learning_rate": 3.3402223629244977e-06, + "loss": 1.3753, + "mean_token_accuracy": 0.6475427796443304, + "num_tokens": 2533436985.0, + "step": 15100 + }, + { + "entropy": 1.663461983203888, + "epoch": 1.6589217544148747, + "grad_norm": 0.724477231502533, + "learning_rate": 3.339381850442911e-06, + "loss": 1.2809, + "mean_token_accuracy": 0.667538528641065, + "num_tokens": 2533568360.0, + "step": 15101 + }, + { + "entropy": 1.705003599325816, + "epoch": 1.6590316113262475, + "grad_norm": 0.620273768901825, + "learning_rate": 3.33854158041644e-06, + "loss": 1.4139, + "mean_token_accuracy": 0.6586426943540573, + "num_tokens": 2533751596.0, + "step": 15102 + }, + { + "entropy": 1.6570688684781392, + "epoch": 1.6591414682376207, + "grad_norm": 0.6880425810813904, + "learning_rate": 3.3377015528716722e-06, + "loss": 1.3196, + "mean_token_accuracy": 0.6651638994614283, + "num_tokens": 2533914399.0, + "step": 15103 + }, + { + "entropy": 1.7261102298895519, + "epoch": 1.6592513251489933, + "grad_norm": 0.9097874164581299, + "learning_rate": 3.3368617678352e-06, + "loss": 1.373, + "mean_token_accuracy": 0.6556687106688818, + "num_tokens": 2534127252.0, + "step": 15104 + }, + { + "entropy": 1.741780122121175, + "epoch": 1.6593611820603664, + "grad_norm": 0.701435923576355, + "learning_rate": 3.3360222253335963e-06, + "loss": 1.3993, + "mean_token_accuracy": 0.6510543972253799, + "num_tokens": 2534282275.0, + "step": 15105 + }, + { + "entropy": 1.707371195157369, + "epoch": 1.6594710389717393, + "grad_norm": 0.7605389356613159, + "learning_rate": 3.335182925393439e-06, + "loss": 1.5429, + "mean_token_accuracy": 0.6444597393274307, + "num_tokens": 2534425544.0, + "step": 15106 + }, + { + "entropy": 1.6632187863190968, + "epoch": 1.6595808958831122, + "grad_norm": 0.6267088651657104, + "learning_rate": 3.334343868041288e-06, + "loss": 1.3156, + "mean_token_accuracy": 0.6677224983771642, + "num_tokens": 2534595314.0, + "step": 15107 + }, + { + "entropy": 1.7033604681491852, + "epoch": 1.6596907527944853, + "grad_norm": 0.6936233639717102, + "learning_rate": 3.3335050533036973e-06, + "loss": 1.3935, + "mean_token_accuracy": 0.6590020259221395, + "num_tokens": 2534749411.0, + "step": 15108 + }, + { + "entropy": 1.6776057581106822, + "epoch": 1.659800609705858, + "grad_norm": 0.6327299475669861, + "learning_rate": 3.332666481207217e-06, + "loss": 1.3723, + "mean_token_accuracy": 0.6604341218868891, + "num_tokens": 2534924233.0, + "step": 15109 + }, + { + "entropy": 1.7498537997404735, + "epoch": 1.659910466617231, + "grad_norm": 0.7666849493980408, + "learning_rate": 3.33182815177839e-06, + "loss": 1.2836, + "mean_token_accuracy": 0.6681917756795883, + "num_tokens": 2535064163.0, + "step": 15110 + }, + { + "entropy": 1.6705930332342784, + "epoch": 1.660020323528604, + "grad_norm": 0.9941990971565247, + "learning_rate": 3.3309900650437453e-06, + "loss": 1.332, + "mean_token_accuracy": 0.6723613291978836, + "num_tokens": 2535185568.0, + "step": 15111 + }, + { + "entropy": 1.6964355210463207, + "epoch": 1.6601301804399768, + "grad_norm": 0.5545011758804321, + "learning_rate": 3.330152221029809e-06, + "loss": 1.5134, + "mean_token_accuracy": 0.630453368028005, + "num_tokens": 2535418640.0, + "step": 15112 + }, + { + "entropy": 1.7623733182748158, + "epoch": 1.66024003735135, + "grad_norm": 0.7459884285926819, + "learning_rate": 3.3293146197631e-06, + "loss": 1.5485, + "mean_token_accuracy": 0.6495217035214106, + "num_tokens": 2535565378.0, + "step": 15113 + }, + { + "entropy": 1.7098113199075062, + "epoch": 1.6603498942627228, + "grad_norm": 0.6535688042640686, + "learning_rate": 3.3284772612701264e-06, + "loss": 1.3106, + "mean_token_accuracy": 0.6760827650626501, + "num_tokens": 2535695343.0, + "step": 15114 + }, + { + "entropy": 1.6740643779436748, + "epoch": 1.6604597511740957, + "grad_norm": 0.7787010073661804, + "learning_rate": 3.327640145577389e-06, + "loss": 1.276, + "mean_token_accuracy": 0.6757322053114573, + "num_tokens": 2535832356.0, + "step": 15115 + }, + { + "entropy": 1.7608892818291981, + "epoch": 1.6605696080854688, + "grad_norm": 0.6061299443244934, + "learning_rate": 3.3268032727113854e-06, + "loss": 1.4559, + "mean_token_accuracy": 0.6371789226929346, + "num_tokens": 2536059624.0, + "step": 15116 + }, + { + "entropy": 1.6933262546857197, + "epoch": 1.6606794649968415, + "grad_norm": 0.6764704585075378, + "learning_rate": 3.3259666426985992e-06, + "loss": 1.3633, + "mean_token_accuracy": 0.6675354987382889, + "num_tokens": 2536235498.0, + "step": 15117 + }, + { + "entropy": 1.721170296271642, + "epoch": 1.6607893219082146, + "grad_norm": 0.7042806148529053, + "learning_rate": 3.3251302555655125e-06, + "loss": 1.4992, + "mean_token_accuracy": 0.6421754111846288, + "num_tokens": 2536422548.0, + "step": 15118 + }, + { + "entropy": 1.7569746871789296, + "epoch": 1.6608991788195875, + "grad_norm": 0.8258860111236572, + "learning_rate": 3.3242941113385955e-06, + "loss": 1.3906, + "mean_token_accuracy": 0.6512050032615662, + "num_tokens": 2536588540.0, + "step": 15119 + }, + { + "entropy": 1.6643067598342896, + "epoch": 1.6610090357309604, + "grad_norm": 0.6636916399002075, + "learning_rate": 3.323458210044308e-06, + "loss": 1.3003, + "mean_token_accuracy": 0.6672434459129969, + "num_tokens": 2536722836.0, + "step": 15120 + }, + { + "entropy": 1.6818746825059254, + "epoch": 1.6611188926423335, + "grad_norm": 0.8212363123893738, + "learning_rate": 3.3226225517091092e-06, + "loss": 1.3068, + "mean_token_accuracy": 0.6724486698706945, + "num_tokens": 2536847383.0, + "step": 15121 + }, + { + "entropy": 1.6717171669006348, + "epoch": 1.6612287495537061, + "grad_norm": 0.6446511745452881, + "learning_rate": 3.32178713635945e-06, + "loss": 1.4976, + "mean_token_accuracy": 0.6454497029383978, + "num_tokens": 2537070978.0, + "step": 15122 + }, + { + "entropy": 1.7301104565461476, + "epoch": 1.6613386064650792, + "grad_norm": 0.6526685953140259, + "learning_rate": 3.3209519640217673e-06, + "loss": 1.4171, + "mean_token_accuracy": 0.6540913035472234, + "num_tokens": 2537248801.0, + "step": 15123 + }, + { + "entropy": 1.7408444384733837, + "epoch": 1.6614484633764521, + "grad_norm": 0.726782500743866, + "learning_rate": 3.320117034722493e-06, + "loss": 1.2939, + "mean_token_accuracy": 0.6629084100325903, + "num_tokens": 2537391472.0, + "step": 15124 + }, + { + "entropy": 1.704400509595871, + "epoch": 1.661558320287825, + "grad_norm": 0.928636372089386, + "learning_rate": 3.3192823484880554e-06, + "loss": 1.2594, + "mean_token_accuracy": 0.6687459697326025, + "num_tokens": 2537535837.0, + "step": 15125 + }, + { + "entropy": 1.6730639934539795, + "epoch": 1.6616681771991981, + "grad_norm": 0.5636922121047974, + "learning_rate": 3.3184479053448715e-06, + "loss": 1.349, + "mean_token_accuracy": 0.6587913980086645, + "num_tokens": 2537730701.0, + "step": 15126 + }, + { + "entropy": 1.7522354920705159, + "epoch": 1.661778034110571, + "grad_norm": 0.686957597732544, + "learning_rate": 3.317613705319347e-06, + "loss": 1.5405, + "mean_token_accuracy": 0.6359638373057047, + "num_tokens": 2537914771.0, + "step": 15127 + }, + { + "entropy": 1.6851915816466014, + "epoch": 1.661887891021944, + "grad_norm": 0.7741264700889587, + "learning_rate": 3.3167797484378885e-06, + "loss": 1.3679, + "mean_token_accuracy": 0.6594855835040411, + "num_tokens": 2538075327.0, + "step": 15128 + }, + { + "entropy": 1.7024510304133098, + "epoch": 1.661997747933317, + "grad_norm": 0.734610915184021, + "learning_rate": 3.3159460347268883e-06, + "loss": 1.4623, + "mean_token_accuracy": 0.6586558967828751, + "num_tokens": 2538238125.0, + "step": 15129 + }, + { + "entropy": 1.6769766708215077, + "epoch": 1.6621076048446897, + "grad_norm": 0.63493812084198, + "learning_rate": 3.3151125642127345e-06, + "loss": 1.3767, + "mean_token_accuracy": 0.6689134786526362, + "num_tokens": 2538419817.0, + "step": 15130 + }, + { + "entropy": 1.6704062322775524, + "epoch": 1.6622174617560628, + "grad_norm": 0.6507300734519958, + "learning_rate": 3.3142793369218062e-06, + "loss": 1.2955, + "mean_token_accuracy": 0.6776465276877085, + "num_tokens": 2538580537.0, + "step": 15131 + }, + { + "entropy": 1.6934023002783458, + "epoch": 1.6623273186674357, + "grad_norm": 0.6520810723304749, + "learning_rate": 3.3134463528804708e-06, + "loss": 1.3876, + "mean_token_accuracy": 0.6512720038493475, + "num_tokens": 2538783784.0, + "step": 15132 + }, + { + "entropy": 1.6996967792510986, + "epoch": 1.6624371755788085, + "grad_norm": 0.6800544261932373, + "learning_rate": 3.312613612115094e-06, + "loss": 1.5371, + "mean_token_accuracy": 0.6272955139478048, + "num_tokens": 2538987568.0, + "step": 15133 + }, + { + "entropy": 1.6887696584065754, + "epoch": 1.6625470324901817, + "grad_norm": 0.672918975353241, + "learning_rate": 3.311781114652037e-06, + "loss": 1.4571, + "mean_token_accuracy": 0.6463527331749598, + "num_tokens": 2539126976.0, + "step": 15134 + }, + { + "entropy": 1.6642581224441528, + "epoch": 1.6626568894015543, + "grad_norm": 0.7320646643638611, + "learning_rate": 3.3109488605176398e-06, + "loss": 1.3934, + "mean_token_accuracy": 0.6711856325467428, + "num_tokens": 2539263021.0, + "step": 15135 + }, + { + "entropy": 1.7170037130514781, + "epoch": 1.6627667463129274, + "grad_norm": 0.7974650263786316, + "learning_rate": 3.3101168497382463e-06, + "loss": 1.411, + "mean_token_accuracy": 0.6470285852750143, + "num_tokens": 2539450814.0, + "step": 15136 + }, + { + "entropy": 1.6689094603061676, + "epoch": 1.6628766032243003, + "grad_norm": 0.6423022150993347, + "learning_rate": 3.309285082340191e-06, + "loss": 1.3791, + "mean_token_accuracy": 0.649917870759964, + "num_tokens": 2539611432.0, + "step": 15137 + }, + { + "entropy": 1.733245462179184, + "epoch": 1.6629864601356732, + "grad_norm": 0.6242569088935852, + "learning_rate": 3.308453558349798e-06, + "loss": 1.5296, + "mean_token_accuracy": 0.6238453984260559, + "num_tokens": 2539837249.0, + "step": 15138 + }, + { + "entropy": 1.635475645462672, + "epoch": 1.6630963170470463, + "grad_norm": 0.7449822425842285, + "learning_rate": 3.307622277793382e-06, + "loss": 1.381, + "mean_token_accuracy": 0.6656597952047983, + "num_tokens": 2540008011.0, + "step": 15139 + }, + { + "entropy": 1.7413840492566426, + "epoch": 1.6632061739584192, + "grad_norm": 0.7251917719841003, + "learning_rate": 3.3067912406972553e-06, + "loss": 1.4038, + "mean_token_accuracy": 0.6577907751003901, + "num_tokens": 2540197866.0, + "step": 15140 + }, + { + "entropy": 1.6912154257297516, + "epoch": 1.663316030869792, + "grad_norm": 0.6151164174079895, + "learning_rate": 3.305960447087718e-06, + "loss": 1.5038, + "mean_token_accuracy": 0.6435969273249308, + "num_tokens": 2540388308.0, + "step": 15141 + }, + { + "entropy": 1.6632501184940338, + "epoch": 1.6634258877811652, + "grad_norm": 0.786320686340332, + "learning_rate": 3.3051298969910683e-06, + "loss": 1.2916, + "mean_token_accuracy": 0.6799655159314474, + "num_tokens": 2540553028.0, + "step": 15142 + }, + { + "entropy": 1.6826303203900654, + "epoch": 1.6635357446925378, + "grad_norm": 0.7635297179222107, + "learning_rate": 3.3042995904335884e-06, + "loss": 1.31, + "mean_token_accuracy": 0.6798640837272009, + "num_tokens": 2540699747.0, + "step": 15143 + }, + { + "entropy": 1.7032555242379506, + "epoch": 1.663645601603911, + "grad_norm": 0.7347438931465149, + "learning_rate": 3.3034695274415586e-06, + "loss": 1.3382, + "mean_token_accuracy": 0.6689636707305908, + "num_tokens": 2540856169.0, + "step": 15144 + }, + { + "entropy": 1.7159066100915272, + "epoch": 1.6637554585152838, + "grad_norm": 0.728591799736023, + "learning_rate": 3.3026397080412475e-06, + "loss": 1.2757, + "mean_token_accuracy": 0.6693220684925715, + "num_tokens": 2540986299.0, + "step": 15145 + }, + { + "entropy": 1.7285268604755402, + "epoch": 1.6638653154266567, + "grad_norm": 0.7702023386955261, + "learning_rate": 3.3018101322589276e-06, + "loss": 1.282, + "mean_token_accuracy": 0.6752820163965225, + "num_tokens": 2541134966.0, + "step": 15146 + }, + { + "entropy": 1.7311889429887135, + "epoch": 1.6639751723380298, + "grad_norm": 0.8523202538490295, + "learning_rate": 3.3009808001208433e-06, + "loss": 1.4396, + "mean_token_accuracy": 0.6446023831764857, + "num_tokens": 2541288982.0, + "step": 15147 + }, + { + "entropy": 1.743065595626831, + "epoch": 1.6640850292494025, + "grad_norm": 0.7855637073516846, + "learning_rate": 3.3001517116532467e-06, + "loss": 1.4077, + "mean_token_accuracy": 0.665867954492569, + "num_tokens": 2541427699.0, + "step": 15148 + }, + { + "entropy": 1.701552430788676, + "epoch": 1.6641948861607756, + "grad_norm": 0.6906160712242126, + "learning_rate": 3.299322866882382e-06, + "loss": 1.3014, + "mean_token_accuracy": 0.6663641184568405, + "num_tokens": 2541559947.0, + "step": 15149 + }, + { + "entropy": 1.696532428264618, + "epoch": 1.6643047430721485, + "grad_norm": 0.8275318741798401, + "learning_rate": 3.2984942658344775e-06, + "loss": 1.4308, + "mean_token_accuracy": 0.6603184888760248, + "num_tokens": 2541719607.0, + "step": 15150 + }, + { + "entropy": 1.7219399809837341, + "epoch": 1.6644145999835214, + "grad_norm": 0.7034138441085815, + "learning_rate": 3.297665908535757e-06, + "loss": 1.43, + "mean_token_accuracy": 0.6470849066972733, + "num_tokens": 2541892403.0, + "step": 15151 + }, + { + "entropy": 1.638861060142517, + "epoch": 1.6645244568948945, + "grad_norm": 0.8403314352035522, + "learning_rate": 3.2968377950124424e-06, + "loss": 1.3045, + "mean_token_accuracy": 0.66932080189387, + "num_tokens": 2542029755.0, + "step": 15152 + }, + { + "entropy": 1.675868570804596, + "epoch": 1.6646343138062674, + "grad_norm": 0.9272775650024414, + "learning_rate": 3.2960099252907383e-06, + "loss": 1.4506, + "mean_token_accuracy": 0.6282220433155695, + "num_tokens": 2542277314.0, + "step": 15153 + }, + { + "entropy": 1.6995552678902943, + "epoch": 1.6647441707176402, + "grad_norm": 0.6546932458877563, + "learning_rate": 3.2951822993968507e-06, + "loss": 1.4917, + "mean_token_accuracy": 0.6413043240706126, + "num_tokens": 2542450597.0, + "step": 15154 + }, + { + "entropy": 1.6610127687454224, + "epoch": 1.6648540276290134, + "grad_norm": 0.7705767154693604, + "learning_rate": 3.294354917356971e-06, + "loss": 1.3614, + "mean_token_accuracy": 0.6677778412898382, + "num_tokens": 2542605563.0, + "step": 15155 + }, + { + "entropy": 1.6522420446077983, + "epoch": 1.664963884540386, + "grad_norm": 0.9083729982376099, + "learning_rate": 3.2935277791972845e-06, + "loss": 1.3583, + "mean_token_accuracy": 0.6643216063578924, + "num_tokens": 2542750030.0, + "step": 15156 + }, + { + "entropy": 1.6629354059696198, + "epoch": 1.6650737414517591, + "grad_norm": 0.7083542346954346, + "learning_rate": 3.2927008849439713e-06, + "loss": 1.5038, + "mean_token_accuracy": 0.6516106476386389, + "num_tokens": 2542920378.0, + "step": 15157 + }, + { + "entropy": 1.6975993414719899, + "epoch": 1.665183598363132, + "grad_norm": 0.6380283832550049, + "learning_rate": 3.291874234623206e-06, + "loss": 1.3954, + "mean_token_accuracy": 0.6629827618598938, + "num_tokens": 2543100162.0, + "step": 15158 + }, + { + "entropy": 1.6631451447804768, + "epoch": 1.665293455274505, + "grad_norm": 0.6705272793769836, + "learning_rate": 3.2910478282611434e-06, + "loss": 1.4026, + "mean_token_accuracy": 0.6492062012354533, + "num_tokens": 2543321718.0, + "step": 15159 + }, + { + "entropy": 1.662650595108668, + "epoch": 1.665403312185878, + "grad_norm": 0.6082910299301147, + "learning_rate": 3.2902216658839437e-06, + "loss": 1.3955, + "mean_token_accuracy": 0.6513722836971283, + "num_tokens": 2543502527.0, + "step": 15160 + }, + { + "entropy": 1.7114491661389668, + "epoch": 1.6655131690972507, + "grad_norm": 0.6370794177055359, + "learning_rate": 3.2893957475177562e-06, + "loss": 1.4805, + "mean_token_accuracy": 0.6547950555880865, + "num_tokens": 2543686162.0, + "step": 15161 + }, + { + "entropy": 1.7338766554991405, + "epoch": 1.6656230260086238, + "grad_norm": 0.6354936957359314, + "learning_rate": 3.2885700731887184e-06, + "loss": 1.4639, + "mean_token_accuracy": 0.6365046302477518, + "num_tokens": 2543866432.0, + "step": 15162 + }, + { + "entropy": 1.7078477044900258, + "epoch": 1.6657328829199967, + "grad_norm": 0.8597061038017273, + "learning_rate": 3.287744642922961e-06, + "loss": 1.2784, + "mean_token_accuracy": 0.6691752125819524, + "num_tokens": 2544031768.0, + "step": 15163 + }, + { + "entropy": 1.725355605284373, + "epoch": 1.6658427398313695, + "grad_norm": 0.8394426107406616, + "learning_rate": 3.2869194567466126e-06, + "loss": 1.4591, + "mean_token_accuracy": 0.6521119624376297, + "num_tokens": 2544304098.0, + "step": 15164 + }, + { + "entropy": 1.711538831392924, + "epoch": 1.6659525967427427, + "grad_norm": 0.6709228754043579, + "learning_rate": 3.286094514685786e-06, + "loss": 1.5216, + "mean_token_accuracy": 0.648155947526296, + "num_tokens": 2544482608.0, + "step": 15165 + }, + { + "entropy": 1.7310082018375397, + "epoch": 1.6660624536541155, + "grad_norm": 0.7595032453536987, + "learning_rate": 3.285269816766593e-06, + "loss": 1.3219, + "mean_token_accuracy": 0.6648527532815933, + "num_tokens": 2544614022.0, + "step": 15166 + }, + { + "entropy": 1.6925914386908214, + "epoch": 1.6661723105654884, + "grad_norm": 0.6604565382003784, + "learning_rate": 3.284445363015135e-06, + "loss": 1.4728, + "mean_token_accuracy": 0.6658626943826675, + "num_tokens": 2544783172.0, + "step": 15167 + }, + { + "entropy": 1.6832049489021301, + "epoch": 1.6662821674768615, + "grad_norm": 0.7163446545600891, + "learning_rate": 3.2836211534575017e-06, + "loss": 1.5589, + "mean_token_accuracy": 0.6431887249151865, + "num_tokens": 2544991921.0, + "step": 15168 + }, + { + "entropy": 1.710367888212204, + "epoch": 1.6663920243882342, + "grad_norm": 0.6329286694526672, + "learning_rate": 3.282797188119784e-06, + "loss": 1.3939, + "mean_token_accuracy": 0.6490947405497233, + "num_tokens": 2545169008.0, + "step": 15169 + }, + { + "entropy": 1.6956720153490703, + "epoch": 1.6665018812996073, + "grad_norm": 0.7948725819587708, + "learning_rate": 3.281973467028059e-06, + "loss": 1.3948, + "mean_token_accuracy": 0.6627111285924911, + "num_tokens": 2545343998.0, + "step": 15170 + }, + { + "entropy": 1.6870111227035522, + "epoch": 1.6666117382109802, + "grad_norm": 0.7442490458488464, + "learning_rate": 3.2811499902083926e-06, + "loss": 1.3838, + "mean_token_accuracy": 0.6725998371839523, + "num_tokens": 2545494318.0, + "step": 15171 + }, + { + "entropy": 1.707838664452235, + "epoch": 1.666721595122353, + "grad_norm": 0.7813781499862671, + "learning_rate": 3.2803267576868537e-06, + "loss": 1.4931, + "mean_token_accuracy": 0.6501006484031677, + "num_tokens": 2545634917.0, + "step": 15172 + }, + { + "entropy": 1.7154215077559154, + "epoch": 1.6668314520337262, + "grad_norm": 0.7479304671287537, + "learning_rate": 3.2795037694894916e-06, + "loss": 1.2564, + "mean_token_accuracy": 0.6734669556220373, + "num_tokens": 2545793905.0, + "step": 15173 + }, + { + "entropy": 1.7019230524698894, + "epoch": 1.6669413089450988, + "grad_norm": 0.7185121774673462, + "learning_rate": 3.278681025642359e-06, + "loss": 1.2722, + "mean_token_accuracy": 0.689252441128095, + "num_tokens": 2545972358.0, + "step": 15174 + }, + { + "entropy": 1.6919790307680767, + "epoch": 1.667051165856472, + "grad_norm": 0.7885094285011292, + "learning_rate": 3.2778585261714925e-06, + "loss": 1.6047, + "mean_token_accuracy": 0.6392913907766342, + "num_tokens": 2546235675.0, + "step": 15175 + }, + { + "entropy": 1.660687933365504, + "epoch": 1.6671610227678448, + "grad_norm": 0.7216572761535645, + "learning_rate": 3.2770362711029226e-06, + "loss": 1.4412, + "mean_token_accuracy": 0.6594575295845667, + "num_tokens": 2546375840.0, + "step": 15176 + }, + { + "entropy": 1.6957957843939464, + "epoch": 1.6672708796792177, + "grad_norm": 0.788975715637207, + "learning_rate": 3.2762142604626724e-06, + "loss": 1.4064, + "mean_token_accuracy": 0.6570547719796499, + "num_tokens": 2546509636.0, + "step": 15177 + }, + { + "entropy": 1.67244353890419, + "epoch": 1.6673807365905908, + "grad_norm": 0.731098473072052, + "learning_rate": 3.2753924942767647e-06, + "loss": 1.3241, + "mean_token_accuracy": 0.6744396587212881, + "num_tokens": 2546682886.0, + "step": 15178 + }, + { + "entropy": 1.6986753741900127, + "epoch": 1.6674905935019637, + "grad_norm": 0.8727912902832031, + "learning_rate": 3.2745709725712027e-06, + "loss": 1.2156, + "mean_token_accuracy": 0.6811005771160126, + "num_tokens": 2546823461.0, + "step": 15179 + }, + { + "entropy": 1.697281688451767, + "epoch": 1.6676004504133366, + "grad_norm": 0.6751629710197449, + "learning_rate": 3.273749695371986e-06, + "loss": 1.3449, + "mean_token_accuracy": 0.6630785216887792, + "num_tokens": 2547036887.0, + "step": 15180 + }, + { + "entropy": 1.752416580915451, + "epoch": 1.6677103073247097, + "grad_norm": 0.6552797555923462, + "learning_rate": 3.2729286627051126e-06, + "loss": 1.3898, + "mean_token_accuracy": 0.6616188089052836, + "num_tokens": 2547192022.0, + "step": 15181 + }, + { + "entropy": 1.7320577601591747, + "epoch": 1.6678201642360824, + "grad_norm": 0.7161309719085693, + "learning_rate": 3.2721078745965653e-06, + "loss": 1.5004, + "mean_token_accuracy": 0.6625331242879232, + "num_tokens": 2547364976.0, + "step": 15182 + }, + { + "entropy": 1.7012326021989186, + "epoch": 1.6679300211474555, + "grad_norm": 1.3153455257415771, + "learning_rate": 3.2712873310723186e-06, + "loss": 1.0569, + "mean_token_accuracy": 0.6787421902020773, + "num_tokens": 2547535164.0, + "step": 15183 + }, + { + "entropy": 1.6177492539087932, + "epoch": 1.6680398780588284, + "grad_norm": 1.2736519575119019, + "learning_rate": 3.2704670321583474e-06, + "loss": 1.2213, + "mean_token_accuracy": 0.6746558050314585, + "num_tokens": 2547762852.0, + "step": 15184 + }, + { + "entropy": 1.7297306557496388, + "epoch": 1.6681497349702012, + "grad_norm": 0.8100583553314209, + "learning_rate": 3.2696469778806102e-06, + "loss": 1.369, + "mean_token_accuracy": 0.660000408689181, + "num_tokens": 2547894035.0, + "step": 15185 + }, + { + "entropy": 1.631914883852005, + "epoch": 1.6682595918815744, + "grad_norm": 0.7101684212684631, + "learning_rate": 3.2688271682650652e-06, + "loss": 1.2938, + "mean_token_accuracy": 0.6753945598999659, + "num_tokens": 2548003771.0, + "step": 15186 + }, + { + "entropy": 1.6929580171902974, + "epoch": 1.6683694487929472, + "grad_norm": 0.6653352379798889, + "learning_rate": 3.268007603337655e-06, + "loss": 1.5302, + "mean_token_accuracy": 0.6348066478967667, + "num_tokens": 2548213469.0, + "step": 15187 + }, + { + "entropy": 1.6800095836321514, + "epoch": 1.6684793057043201, + "grad_norm": 0.6623151302337646, + "learning_rate": 3.2671882831243192e-06, + "loss": 1.3365, + "mean_token_accuracy": 0.6838184396425883, + "num_tokens": 2548375660.0, + "step": 15188 + }, + { + "entropy": 1.735003262758255, + "epoch": 1.668589162615693, + "grad_norm": 0.5957344770431519, + "learning_rate": 3.26636920765099e-06, + "loss": 1.4901, + "mean_token_accuracy": 0.633953258395195, + "num_tokens": 2548619091.0, + "step": 15189 + }, + { + "entropy": 1.7986479699611664, + "epoch": 1.668699019527066, + "grad_norm": 0.8299116492271423, + "learning_rate": 3.2655503769435914e-06, + "loss": 1.7051, + "mean_token_accuracy": 0.6299788852532705, + "num_tokens": 2548760586.0, + "step": 15190 + }, + { + "entropy": 1.7730123003323872, + "epoch": 1.668808876438439, + "grad_norm": 0.7321978211402893, + "learning_rate": 3.2647317910280394e-06, + "loss": 1.5606, + "mean_token_accuracy": 0.647643451889356, + "num_tokens": 2548927359.0, + "step": 15191 + }, + { + "entropy": 1.625301976998647, + "epoch": 1.668918733349812, + "grad_norm": 0.6785169243812561, + "learning_rate": 3.2639134499302376e-06, + "loss": 1.382, + "mean_token_accuracy": 0.6625783642133077, + "num_tokens": 2549121709.0, + "step": 15192 + }, + { + "entropy": 1.7081403533617656, + "epoch": 1.6690285902611848, + "grad_norm": 0.808611273765564, + "learning_rate": 3.2630953536760912e-06, + "loss": 1.5018, + "mean_token_accuracy": 0.657961055636406, + "num_tokens": 2549298184.0, + "step": 15193 + }, + { + "entropy": 1.723660518725713, + "epoch": 1.6691384471725579, + "grad_norm": 0.7684034705162048, + "learning_rate": 3.2622775022914916e-06, + "loss": 1.5894, + "mean_token_accuracy": 0.6327784558137258, + "num_tokens": 2549481832.0, + "step": 15194 + }, + { + "entropy": 1.7609472672144573, + "epoch": 1.6692483040839305, + "grad_norm": 0.7250325083732605, + "learning_rate": 3.2614598958023197e-06, + "loss": 1.4629, + "mean_token_accuracy": 0.6485139379898707, + "num_tokens": 2549642631.0, + "step": 15195 + }, + { + "entropy": 1.678977221250534, + "epoch": 1.6693581609953037, + "grad_norm": 0.6951817870140076, + "learning_rate": 3.2606425342344563e-06, + "loss": 1.3847, + "mean_token_accuracy": 0.6604787260293961, + "num_tokens": 2549822114.0, + "step": 15196 + }, + { + "entropy": 1.6499019463857014, + "epoch": 1.6694680179066765, + "grad_norm": 0.7327429056167603, + "learning_rate": 3.259825417613768e-06, + "loss": 1.4187, + "mean_token_accuracy": 0.6621517539024353, + "num_tokens": 2550005790.0, + "step": 15197 + }, + { + "entropy": 1.6978709896405537, + "epoch": 1.6695778748180494, + "grad_norm": 0.637737512588501, + "learning_rate": 3.259008545966119e-06, + "loss": 1.4359, + "mean_token_accuracy": 0.6585992376009623, + "num_tokens": 2550179351.0, + "step": 15198 + }, + { + "entropy": 1.7029849688212078, + "epoch": 1.6696877317294225, + "grad_norm": 0.6902602314949036, + "learning_rate": 3.2581919193173617e-06, + "loss": 1.4087, + "mean_token_accuracy": 0.6876400311787924, + "num_tokens": 2550325763.0, + "step": 15199 + }, + { + "entropy": 1.7197512686252594, + "epoch": 1.6697975886407954, + "grad_norm": 0.7032921314239502, + "learning_rate": 3.25737553769334e-06, + "loss": 1.3647, + "mean_token_accuracy": 0.6540055871009827, + "num_tokens": 2550453496.0, + "step": 15200 + }, + { + "entropy": 1.6929913659890492, + "epoch": 1.6699074455521683, + "grad_norm": 0.7520516514778137, + "learning_rate": 3.2565594011198927e-06, + "loss": 1.3464, + "mean_token_accuracy": 0.6615286866823832, + "num_tokens": 2550611853.0, + "step": 15201 + }, + { + "entropy": 1.782798061768214, + "epoch": 1.6700173024635412, + "grad_norm": 0.7003825306892395, + "learning_rate": 3.255743509622854e-06, + "loss": 1.4342, + "mean_token_accuracy": 0.6429455975691477, + "num_tokens": 2550777852.0, + "step": 15202 + }, + { + "entropy": 1.7199705839157104, + "epoch": 1.670127159374914, + "grad_norm": 0.6969501376152039, + "learning_rate": 3.2549278632280428e-06, + "loss": 1.3403, + "mean_token_accuracy": 0.650229757030805, + "num_tokens": 2550953345.0, + "step": 15203 + }, + { + "entropy": 1.7010493278503418, + "epoch": 1.6702370162862872, + "grad_norm": 0.8231012225151062, + "learning_rate": 3.254112461961273e-06, + "loss": 1.5104, + "mean_token_accuracy": 0.6525203734636307, + "num_tokens": 2551164319.0, + "step": 15204 + }, + { + "entropy": 1.7223340173562367, + "epoch": 1.67034687319766, + "grad_norm": 0.5782705545425415, + "learning_rate": 3.2532973058483557e-06, + "loss": 1.4939, + "mean_token_accuracy": 0.6309877087672552, + "num_tokens": 2551397512.0, + "step": 15205 + }, + { + "entropy": 1.6429968476295471, + "epoch": 1.670456730109033, + "grad_norm": 0.7078797817230225, + "learning_rate": 3.2524823949150875e-06, + "loss": 1.2195, + "mean_token_accuracy": 0.6770479083061218, + "num_tokens": 2551523040.0, + "step": 15206 + }, + { + "entropy": 1.646560360987981, + "epoch": 1.670566587020406, + "grad_norm": 0.6111953854560852, + "learning_rate": 3.2516677291872577e-06, + "loss": 1.4736, + "mean_token_accuracy": 0.6481401324272156, + "num_tokens": 2551756130.0, + "step": 15207 + }, + { + "entropy": 1.685501754283905, + "epoch": 1.6706764439317787, + "grad_norm": 0.6546112298965454, + "learning_rate": 3.250853308690657e-06, + "loss": 1.4007, + "mean_token_accuracy": 0.6448431412378947, + "num_tokens": 2551921706.0, + "step": 15208 + }, + { + "entropy": 1.6584607859452565, + "epoch": 1.6707863008431518, + "grad_norm": 0.6745330095291138, + "learning_rate": 3.250039133451054e-06, + "loss": 1.3448, + "mean_token_accuracy": 0.6620303889115652, + "num_tokens": 2552094248.0, + "step": 15209 + }, + { + "entropy": 1.7307999233404796, + "epoch": 1.6708961577545247, + "grad_norm": 0.7921266555786133, + "learning_rate": 3.249225203494221e-06, + "loss": 1.3666, + "mean_token_accuracy": 0.6684642732143402, + "num_tokens": 2552225219.0, + "step": 15210 + }, + { + "entropy": 1.610274225473404, + "epoch": 1.6710060146658976, + "grad_norm": 0.6567273736000061, + "learning_rate": 3.2484115188459197e-06, + "loss": 1.3422, + "mean_token_accuracy": 0.6564729809761047, + "num_tokens": 2552450742.0, + "step": 15211 + }, + { + "entropy": 1.6868870158990223, + "epoch": 1.6711158715772707, + "grad_norm": 0.665790319442749, + "learning_rate": 3.2475980795318977e-06, + "loss": 1.2995, + "mean_token_accuracy": 0.6727713098128637, + "num_tokens": 2552570748.0, + "step": 15212 + }, + { + "entropy": 1.724908987681071, + "epoch": 1.6712257284886436, + "grad_norm": 0.8795796632766724, + "learning_rate": 3.246784885577903e-06, + "loss": 1.3513, + "mean_token_accuracy": 0.6731296479701996, + "num_tokens": 2552718100.0, + "step": 15213 + }, + { + "entropy": 1.6999436517556508, + "epoch": 1.6713355854000165, + "grad_norm": 0.6800674200057983, + "learning_rate": 3.2459719370096783e-06, + "loss": 1.4395, + "mean_token_accuracy": 0.6620732347170512, + "num_tokens": 2552898022.0, + "step": 15214 + }, + { + "entropy": 1.709017237027486, + "epoch": 1.6714454423113894, + "grad_norm": 0.76900714635849, + "learning_rate": 3.2451592338529424e-06, + "loss": 1.3666, + "mean_token_accuracy": 0.6669487059116364, + "num_tokens": 2553055703.0, + "step": 15215 + }, + { + "entropy": 1.704631100098292, + "epoch": 1.6715552992227622, + "grad_norm": 0.6997295618057251, + "learning_rate": 3.2443467761334236e-06, + "loss": 1.2892, + "mean_token_accuracy": 0.6640374610821406, + "num_tokens": 2553189498.0, + "step": 15216 + }, + { + "entropy": 1.7368600467840831, + "epoch": 1.6716651561341354, + "grad_norm": 0.6645811200141907, + "learning_rate": 3.243534563876835e-06, + "loss": 1.5858, + "mean_token_accuracy": 0.633993665377299, + "num_tokens": 2553385158.0, + "step": 15217 + }, + { + "entropy": 1.721531867980957, + "epoch": 1.6717750130455082, + "grad_norm": 0.6490238904953003, + "learning_rate": 3.242722597108883e-06, + "loss": 1.4305, + "mean_token_accuracy": 0.6537267516056696, + "num_tokens": 2553552812.0, + "step": 15218 + }, + { + "entropy": 1.7109013696511586, + "epoch": 1.6718848699568811, + "grad_norm": 0.717147171497345, + "learning_rate": 3.241910875855263e-06, + "loss": 1.508, + "mean_token_accuracy": 0.6526562124490738, + "num_tokens": 2553756616.0, + "step": 15219 + }, + { + "entropy": 1.7662996451059978, + "epoch": 1.6719947268682542, + "grad_norm": 0.7439054250717163, + "learning_rate": 3.2410994001416706e-06, + "loss": 1.5202, + "mean_token_accuracy": 0.6419854611158371, + "num_tokens": 2553904412.0, + "step": 15220 + }, + { + "entropy": 1.7582280735174816, + "epoch": 1.672104583779627, + "grad_norm": 0.6127282381057739, + "learning_rate": 3.240288169993784e-06, + "loss": 1.3346, + "mean_token_accuracy": 0.6562148282925288, + "num_tokens": 2554077430.0, + "step": 15221 + }, + { + "entropy": 1.7364347378412883, + "epoch": 1.672214440691, + "grad_norm": 0.6855073571205139, + "learning_rate": 3.239477185437281e-06, + "loss": 1.3535, + "mean_token_accuracy": 0.6602631757656733, + "num_tokens": 2554224197.0, + "step": 15222 + }, + { + "entropy": 1.7275860210259755, + "epoch": 1.6723242976023729, + "grad_norm": 0.643002986907959, + "learning_rate": 3.238666446497829e-06, + "loss": 1.4968, + "mean_token_accuracy": 0.6405756970246633, + "num_tokens": 2554382135.0, + "step": 15223 + }, + { + "entropy": 1.766138106584549, + "epoch": 1.6724341545137458, + "grad_norm": 0.7326632738113403, + "learning_rate": 3.2378559532010858e-06, + "loss": 1.3671, + "mean_token_accuracy": 0.666873628894488, + "num_tokens": 2554520361.0, + "step": 15224 + }, + { + "entropy": 1.687092532714208, + "epoch": 1.6725440114251189, + "grad_norm": 0.6920173168182373, + "learning_rate": 3.2370457055727046e-06, + "loss": 1.4491, + "mean_token_accuracy": 0.6508265684048334, + "num_tokens": 2554726838.0, + "step": 15225 + }, + { + "entropy": 1.7336925466855366, + "epoch": 1.6726538683364918, + "grad_norm": 0.6075726747512817, + "learning_rate": 3.2362357036383283e-06, + "loss": 1.4406, + "mean_token_accuracy": 0.64825872083505, + "num_tokens": 2554943464.0, + "step": 15226 + }, + { + "entropy": 1.7321422894795735, + "epoch": 1.6727637252478647, + "grad_norm": 0.658340334892273, + "learning_rate": 3.235425947423592e-06, + "loss": 1.3172, + "mean_token_accuracy": 0.6706226418415705, + "num_tokens": 2555094800.0, + "step": 15227 + }, + { + "entropy": 1.6634202202161152, + "epoch": 1.6728735821592375, + "grad_norm": 0.6695008277893066, + "learning_rate": 3.234616436954128e-06, + "loss": 1.509, + "mean_token_accuracy": 0.6520940413077673, + "num_tokens": 2555259787.0, + "step": 15228 + }, + { + "entropy": 1.6890574097633362, + "epoch": 1.6729834390706104, + "grad_norm": 0.5829101204872131, + "learning_rate": 3.233807172255552e-06, + "loss": 1.3955, + "mean_token_accuracy": 0.6553893884023031, + "num_tokens": 2555477181.0, + "step": 15229 + }, + { + "entropy": 1.6599874198436737, + "epoch": 1.6730932959819835, + "grad_norm": 0.6483899354934692, + "learning_rate": 3.2329981533534814e-06, + "loss": 1.4103, + "mean_token_accuracy": 0.6571273605028788, + "num_tokens": 2555666070.0, + "step": 15230 + }, + { + "entropy": 1.7212112446626027, + "epoch": 1.6732031528933564, + "grad_norm": 0.9130486249923706, + "learning_rate": 3.23218938027352e-06, + "loss": 1.3511, + "mean_token_accuracy": 0.6648656080166498, + "num_tokens": 2555817850.0, + "step": 15231 + }, + { + "entropy": 1.703871637582779, + "epoch": 1.6733130098047293, + "grad_norm": 0.6791821122169495, + "learning_rate": 3.2313808530412628e-06, + "loss": 1.2272, + "mean_token_accuracy": 0.6739940742651621, + "num_tokens": 2555973386.0, + "step": 15232 + }, + { + "entropy": 1.6598846117655437, + "epoch": 1.6734228667161024, + "grad_norm": 0.6614598035812378, + "learning_rate": 3.2305725716823005e-06, + "loss": 1.385, + "mean_token_accuracy": 0.6631904939810435, + "num_tokens": 2556157643.0, + "step": 15233 + }, + { + "entropy": 1.6710762580235798, + "epoch": 1.673532723627475, + "grad_norm": 0.6756139993667603, + "learning_rate": 3.2297645362222175e-06, + "loss": 1.3222, + "mean_token_accuracy": 0.6683625727891922, + "num_tokens": 2556365206.0, + "step": 15234 + }, + { + "entropy": 1.6408987541993458, + "epoch": 1.6736425805388482, + "grad_norm": 0.5655611753463745, + "learning_rate": 3.2289567466865858e-06, + "loss": 1.3738, + "mean_token_accuracy": 0.6523148367802302, + "num_tokens": 2556553810.0, + "step": 15235 + }, + { + "entropy": 1.6711041033267975, + "epoch": 1.673752437450221, + "grad_norm": 0.6098411679267883, + "learning_rate": 3.228149203100968e-06, + "loss": 1.3861, + "mean_token_accuracy": 0.653891901175181, + "num_tokens": 2556744619.0, + "step": 15236 + }, + { + "entropy": 1.6401523053646088, + "epoch": 1.673862294361594, + "grad_norm": 0.7218595743179321, + "learning_rate": 3.2273419054909283e-06, + "loss": 1.3142, + "mean_token_accuracy": 0.6689208696285883, + "num_tokens": 2556943085.0, + "step": 15237 + }, + { + "entropy": 1.7597449918588002, + "epoch": 1.673972151272967, + "grad_norm": 0.8478575944900513, + "learning_rate": 3.226534853882015e-06, + "loss": 1.5462, + "mean_token_accuracy": 0.6484808673461279, + "num_tokens": 2557088817.0, + "step": 15238 + }, + { + "entropy": 1.6971666316191356, + "epoch": 1.67408200818434, + "grad_norm": 0.6962342262268066, + "learning_rate": 3.225728048299769e-06, + "loss": 1.4707, + "mean_token_accuracy": 0.6557254840930303, + "num_tokens": 2557283065.0, + "step": 15239 + }, + { + "entropy": 1.6427591145038605, + "epoch": 1.6741918650957128, + "grad_norm": 0.6922114491462708, + "learning_rate": 3.22492148876973e-06, + "loss": 1.2838, + "mean_token_accuracy": 0.680359830458959, + "num_tokens": 2557411768.0, + "step": 15240 + }, + { + "entropy": 1.712716003259023, + "epoch": 1.674301722007086, + "grad_norm": 0.6969720125198364, + "learning_rate": 3.22411517531742e-06, + "loss": 1.2368, + "mean_token_accuracy": 0.6725062231222788, + "num_tokens": 2557545725.0, + "step": 15241 + }, + { + "entropy": 1.71072651942571, + "epoch": 1.6744115789184586, + "grad_norm": 0.5817194581031799, + "learning_rate": 3.2233091079683613e-06, + "loss": 1.3838, + "mean_token_accuracy": 0.6509876201550165, + "num_tokens": 2557714503.0, + "step": 15242 + }, + { + "entropy": 1.7713170647621155, + "epoch": 1.6745214358298317, + "grad_norm": 0.6657090783119202, + "learning_rate": 3.2225032867480664e-06, + "loss": 1.3812, + "mean_token_accuracy": 0.6571053812901179, + "num_tokens": 2557874292.0, + "step": 15243 + }, + { + "entropy": 1.6696379979451497, + "epoch": 1.6746312927412046, + "grad_norm": 0.6969082355499268, + "learning_rate": 3.2216977116820354e-06, + "loss": 1.2049, + "mean_token_accuracy": 0.6869342774152756, + "num_tokens": 2558040611.0, + "step": 15244 + }, + { + "entropy": 1.718798081080119, + "epoch": 1.6747411496525775, + "grad_norm": 0.7062221765518188, + "learning_rate": 3.2208923827957668e-06, + "loss": 1.3801, + "mean_token_accuracy": 0.6560978144407272, + "num_tokens": 2558170281.0, + "step": 15245 + }, + { + "entropy": 1.709840973218282, + "epoch": 1.6748510065639506, + "grad_norm": 0.836525022983551, + "learning_rate": 3.2200873001147513e-06, + "loss": 1.6282, + "mean_token_accuracy": 0.6493220552802086, + "num_tokens": 2558354120.0, + "step": 15246 + }, + { + "entropy": 1.7364205320676167, + "epoch": 1.6749608634753232, + "grad_norm": 0.6105183362960815, + "learning_rate": 3.219282463664467e-06, + "loss": 1.5489, + "mean_token_accuracy": 0.6435059358676275, + "num_tokens": 2558563661.0, + "step": 15247 + }, + { + "entropy": 1.7640726168950398, + "epoch": 1.6750707203866964, + "grad_norm": 0.736595869064331, + "learning_rate": 3.2184778734703848e-06, + "loss": 1.2694, + "mean_token_accuracy": 0.667763814330101, + "num_tokens": 2558661391.0, + "step": 15248 + }, + { + "entropy": 1.6720350682735443, + "epoch": 1.6751805772980692, + "grad_norm": 0.6239567399024963, + "learning_rate": 3.217673529557973e-06, + "loss": 1.3413, + "mean_token_accuracy": 0.6550045510133108, + "num_tokens": 2558840998.0, + "step": 15249 + }, + { + "entropy": 1.6599902311960857, + "epoch": 1.6752904342094421, + "grad_norm": 0.6722832918167114, + "learning_rate": 3.216869431952688e-06, + "loss": 1.2589, + "mean_token_accuracy": 0.6672548999389013, + "num_tokens": 2558963126.0, + "step": 15250 + }, + { + "entropy": 1.6761441230773926, + "epoch": 1.6754002911208152, + "grad_norm": 0.8985497355461121, + "learning_rate": 3.2160655806799744e-06, + "loss": 1.259, + "mean_token_accuracy": 0.675809289018313, + "num_tokens": 2559119390.0, + "step": 15251 + }, + { + "entropy": 1.717965970436732, + "epoch": 1.6755101480321881, + "grad_norm": 0.6145942211151123, + "learning_rate": 3.2152619757652813e-06, + "loss": 1.4449, + "mean_token_accuracy": 0.6425779561201731, + "num_tokens": 2559316872.0, + "step": 15252 + }, + { + "entropy": 1.6946504712104797, + "epoch": 1.675620004943561, + "grad_norm": 0.6808292865753174, + "learning_rate": 3.2144586172340365e-06, + "loss": 1.3685, + "mean_token_accuracy": 0.6554179340600967, + "num_tokens": 2559459695.0, + "step": 15253 + }, + { + "entropy": 1.6927851835886638, + "epoch": 1.675729861854934, + "grad_norm": 0.7276740074157715, + "learning_rate": 3.2136555051116704e-06, + "loss": 1.418, + "mean_token_accuracy": 0.679698646068573, + "num_tokens": 2559642979.0, + "step": 15254 + }, + { + "entropy": 1.6810623904069264, + "epoch": 1.6758397187663068, + "grad_norm": 0.6984881162643433, + "learning_rate": 3.2128526394235982e-06, + "loss": 1.2343, + "mean_token_accuracy": 0.6779455641905466, + "num_tokens": 2559759977.0, + "step": 15255 + }, + { + "entropy": 1.6839697062969208, + "epoch": 1.6759495756776799, + "grad_norm": 0.7221273183822632, + "learning_rate": 3.2120500201952298e-06, + "loss": 1.339, + "mean_token_accuracy": 0.6567999372879664, + "num_tokens": 2559902932.0, + "step": 15256 + }, + { + "entropy": 1.6946297883987427, + "epoch": 1.6760594325890528, + "grad_norm": 0.778820812702179, + "learning_rate": 3.2112476474519683e-06, + "loss": 1.3138, + "mean_token_accuracy": 0.6709027737379074, + "num_tokens": 2560025897.0, + "step": 15257 + }, + { + "entropy": 1.750508725643158, + "epoch": 1.6761692895004257, + "grad_norm": 0.7278999090194702, + "learning_rate": 3.2104455212192113e-06, + "loss": 1.3121, + "mean_token_accuracy": 0.6603905359903971, + "num_tokens": 2560135481.0, + "step": 15258 + }, + { + "entropy": 1.7683274547259014, + "epoch": 1.6762791464117988, + "grad_norm": 0.8146536350250244, + "learning_rate": 3.209643641522343e-06, + "loss": 1.4074, + "mean_token_accuracy": 0.6663754433393478, + "num_tokens": 2560275025.0, + "step": 15259 + }, + { + "entropy": 1.74715722600619, + "epoch": 1.6763890033231714, + "grad_norm": 0.7501146197319031, + "learning_rate": 3.208842008386742e-06, + "loss": 1.4665, + "mean_token_accuracy": 0.6487719466288885, + "num_tokens": 2560474451.0, + "step": 15260 + }, + { + "entropy": 1.6942894756793976, + "epoch": 1.6764988602345445, + "grad_norm": 0.5945084691047668, + "learning_rate": 3.2080406218377824e-06, + "loss": 1.3198, + "mean_token_accuracy": 0.6598286330699921, + "num_tokens": 2560668942.0, + "step": 15261 + }, + { + "entropy": 1.6762113670508068, + "epoch": 1.6766087171459174, + "grad_norm": 0.7053788304328918, + "learning_rate": 3.2072394819008263e-06, + "loss": 1.2167, + "mean_token_accuracy": 0.6785010149081548, + "num_tokens": 2560840956.0, + "step": 15262 + }, + { + "entropy": 1.7425096035003662, + "epoch": 1.6767185740572903, + "grad_norm": 0.7079997062683105, + "learning_rate": 3.2064385886012254e-06, + "loss": 1.4733, + "mean_token_accuracy": 0.6649157653252283, + "num_tokens": 2560988485.0, + "step": 15263 + }, + { + "entropy": 1.6938765247662861, + "epoch": 1.6768284309686634, + "grad_norm": 0.6834542751312256, + "learning_rate": 3.2056379419643353e-06, + "loss": 1.4556, + "mean_token_accuracy": 0.6481647590796152, + "num_tokens": 2561196564.0, + "step": 15264 + }, + { + "entropy": 1.642237663269043, + "epoch": 1.6769382878800363, + "grad_norm": 0.6120626330375671, + "learning_rate": 3.2048375420154887e-06, + "loss": 1.2315, + "mean_token_accuracy": 0.6777461071809133, + "num_tokens": 2561331437.0, + "step": 15265 + }, + { + "entropy": 1.7010113994280498, + "epoch": 1.6770481447914092, + "grad_norm": 0.7019853591918945, + "learning_rate": 3.204037388780025e-06, + "loss": 1.3454, + "mean_token_accuracy": 0.6715318908294042, + "num_tokens": 2561528048.0, + "step": 15266 + }, + { + "entropy": 1.6710281074047089, + "epoch": 1.6771580017027823, + "grad_norm": 0.819465696811676, + "learning_rate": 3.2032374822832634e-06, + "loss": 1.3401, + "mean_token_accuracy": 0.6738651841878891, + "num_tokens": 2561685491.0, + "step": 15267 + }, + { + "entropy": 1.800257682800293, + "epoch": 1.677267858614155, + "grad_norm": 0.5872465372085571, + "learning_rate": 3.2024378225505204e-06, + "loss": 1.4982, + "mean_token_accuracy": 0.6357658604780833, + "num_tokens": 2561911809.0, + "step": 15268 + }, + { + "entropy": 1.712759256362915, + "epoch": 1.677377715525528, + "grad_norm": 0.7056664824485779, + "learning_rate": 3.201638409607106e-06, + "loss": 1.4008, + "mean_token_accuracy": 0.6417889843384424, + "num_tokens": 2562124310.0, + "step": 15269 + }, + { + "entropy": 1.6932222247123718, + "epoch": 1.677487572436901, + "grad_norm": 0.6303336024284363, + "learning_rate": 3.2008392434783264e-06, + "loss": 1.4301, + "mean_token_accuracy": 0.6475923210382462, + "num_tokens": 2562279988.0, + "step": 15270 + }, + { + "entropy": 1.6386590401331584, + "epoch": 1.6775974293482738, + "grad_norm": 0.657673716545105, + "learning_rate": 3.2000403241894686e-06, + "loss": 1.3441, + "mean_token_accuracy": 0.6659832795461019, + "num_tokens": 2562444547.0, + "step": 15271 + }, + { + "entropy": 1.6764297584692638, + "epoch": 1.677707286259647, + "grad_norm": 0.7012315988540649, + "learning_rate": 3.1992416517658175e-06, + "loss": 1.3601, + "mean_token_accuracy": 0.6649115979671478, + "num_tokens": 2562607824.0, + "step": 15272 + }, + { + "entropy": 1.7825362384319305, + "epoch": 1.6778171431710196, + "grad_norm": 0.7959426641464233, + "learning_rate": 3.198443226232656e-06, + "loss": 1.3882, + "mean_token_accuracy": 0.6545184900363287, + "num_tokens": 2562779514.0, + "step": 15273 + }, + { + "entropy": 1.7837129334608715, + "epoch": 1.6779270000823927, + "grad_norm": 0.8002263307571411, + "learning_rate": 3.1976450476152506e-06, + "loss": 1.4926, + "mean_token_accuracy": 0.6270763973395029, + "num_tokens": 2562968133.0, + "step": 15274 + }, + { + "entropy": 1.6454266607761383, + "epoch": 1.6780368569937656, + "grad_norm": 0.6544117331504822, + "learning_rate": 3.19684711593886e-06, + "loss": 1.2687, + "mean_token_accuracy": 0.6670517573753992, + "num_tokens": 2563129301.0, + "step": 15275 + }, + { + "entropy": 1.7555846671263378, + "epoch": 1.6781467139051385, + "grad_norm": 0.6647533774375916, + "learning_rate": 3.196049431228746e-06, + "loss": 1.5171, + "mean_token_accuracy": 0.6388434370358785, + "num_tokens": 2563328620.0, + "step": 15276 + }, + { + "entropy": 1.6958951950073242, + "epoch": 1.6782565708165116, + "grad_norm": 0.7346828579902649, + "learning_rate": 3.195251993510149e-06, + "loss": 1.286, + "mean_token_accuracy": 0.6770187467336655, + "num_tokens": 2563453949.0, + "step": 15277 + }, + { + "entropy": 1.7482584714889526, + "epoch": 1.6783664277278845, + "grad_norm": 0.7439913153648376, + "learning_rate": 3.194454802808311e-06, + "loss": 1.4923, + "mean_token_accuracy": 0.6445967058340708, + "num_tokens": 2563620845.0, + "step": 15278 + }, + { + "entropy": 1.7300353248914082, + "epoch": 1.6784762846392574, + "grad_norm": 0.6056285500526428, + "learning_rate": 3.193657859148461e-06, + "loss": 1.5211, + "mean_token_accuracy": 0.6319058835506439, + "num_tokens": 2563832970.0, + "step": 15279 + }, + { + "entropy": 1.7123978634675343, + "epoch": 1.6785861415506305, + "grad_norm": 0.6761077642440796, + "learning_rate": 3.19286116255582e-06, + "loss": 1.4335, + "mean_token_accuracy": 0.6501255333423615, + "num_tokens": 2563991103.0, + "step": 15280 + }, + { + "entropy": 1.7233157257239025, + "epoch": 1.6786959984620031, + "grad_norm": 0.6616033911705017, + "learning_rate": 3.192064713055606e-06, + "loss": 1.3993, + "mean_token_accuracy": 0.6481630504131317, + "num_tokens": 2564169608.0, + "step": 15281 + }, + { + "entropy": 1.7163086732228596, + "epoch": 1.6788058553733762, + "grad_norm": 0.7168435454368591, + "learning_rate": 3.191268510673027e-06, + "loss": 1.3552, + "mean_token_accuracy": 0.6583843231201172, + "num_tokens": 2564312562.0, + "step": 15282 + }, + { + "entropy": 1.7421314418315887, + "epoch": 1.6789157122847491, + "grad_norm": 1.4026530981063843, + "learning_rate": 3.1904725554332805e-06, + "loss": 1.2168, + "mean_token_accuracy": 0.674383873740832, + "num_tokens": 2564501041.0, + "step": 15283 + }, + { + "entropy": 1.7259081999460857, + "epoch": 1.679025569196122, + "grad_norm": 0.7258220314979553, + "learning_rate": 3.189676847361559e-06, + "loss": 1.3833, + "mean_token_accuracy": 0.6572358012199402, + "num_tokens": 2564696992.0, + "step": 15284 + }, + { + "entropy": 1.7746508121490479, + "epoch": 1.679135426107495, + "grad_norm": 0.7986940145492554, + "learning_rate": 3.1888813864830435e-06, + "loss": 1.3888, + "mean_token_accuracy": 0.6498429874579111, + "num_tokens": 2564836244.0, + "step": 15285 + }, + { + "entropy": 1.699170559644699, + "epoch": 1.6792452830188678, + "grad_norm": 0.7032765746116638, + "learning_rate": 3.1880861728229152e-06, + "loss": 1.2493, + "mean_token_accuracy": 0.6778079668680826, + "num_tokens": 2564971115.0, + "step": 15286 + }, + { + "entropy": 1.6739700535933177, + "epoch": 1.6793551399302409, + "grad_norm": 0.7897632122039795, + "learning_rate": 3.1872912064063387e-06, + "loss": 1.461, + "mean_token_accuracy": 0.6509335339069366, + "num_tokens": 2565134895.0, + "step": 15287 + }, + { + "entropy": 1.6778443853060405, + "epoch": 1.6794649968416138, + "grad_norm": 0.6658824682235718, + "learning_rate": 3.186496487258474e-06, + "loss": 1.3738, + "mean_token_accuracy": 0.6780295670032501, + "num_tokens": 2565302480.0, + "step": 15288 + }, + { + "entropy": 1.7276048461596172, + "epoch": 1.6795748537529867, + "grad_norm": 0.6688079237937927, + "learning_rate": 3.185702015404474e-06, + "loss": 1.3869, + "mean_token_accuracy": 0.6666077673435211, + "num_tokens": 2565448467.0, + "step": 15289 + }, + { + "entropy": 1.7064704895019531, + "epoch": 1.6796847106643598, + "grad_norm": 0.674453854560852, + "learning_rate": 3.184907790869486e-06, + "loss": 1.2915, + "mean_token_accuracy": 0.6831353803475698, + "num_tokens": 2565628460.0, + "step": 15290 + }, + { + "entropy": 1.7536171277364094, + "epoch": 1.6797945675757326, + "grad_norm": 0.7042247653007507, + "learning_rate": 3.184113813678644e-06, + "loss": 1.5146, + "mean_token_accuracy": 0.662187417348226, + "num_tokens": 2565782360.0, + "step": 15291 + }, + { + "entropy": 1.7017661929130554, + "epoch": 1.6799044244871055, + "grad_norm": 0.6648768186569214, + "learning_rate": 3.183320083857076e-06, + "loss": 1.3611, + "mean_token_accuracy": 0.6698134889205297, + "num_tokens": 2565974485.0, + "step": 15292 + }, + { + "entropy": 1.6317310432593028, + "epoch": 1.6800142813984786, + "grad_norm": 0.6019257307052612, + "learning_rate": 3.1825266014299085e-06, + "loss": 1.3964, + "mean_token_accuracy": 0.6603737771511078, + "num_tokens": 2566150672.0, + "step": 15293 + }, + { + "entropy": 1.6843983232975006, + "epoch": 1.6801241383098513, + "grad_norm": 0.751380205154419, + "learning_rate": 3.1817333664222507e-06, + "loss": 1.42, + "mean_token_accuracy": 0.651827389995257, + "num_tokens": 2566345461.0, + "step": 15294 + }, + { + "entropy": 1.7201267182826996, + "epoch": 1.6802339952212244, + "grad_norm": 0.7158882021903992, + "learning_rate": 3.1809403788592066e-06, + "loss": 1.3936, + "mean_token_accuracy": 0.6722188790639242, + "num_tokens": 2566536727.0, + "step": 15295 + }, + { + "entropy": 1.7150601148605347, + "epoch": 1.6803438521325973, + "grad_norm": 0.6860437989234924, + "learning_rate": 3.180147638765878e-06, + "loss": 1.5585, + "mean_token_accuracy": 0.6306335628032684, + "num_tokens": 2566726509.0, + "step": 15296 + }, + { + "entropy": 1.7363272806008656, + "epoch": 1.6804537090439702, + "grad_norm": 0.7342987656593323, + "learning_rate": 3.179355146167351e-06, + "loss": 1.1886, + "mean_token_accuracy": 0.6802386889855067, + "num_tokens": 2566841914.0, + "step": 15297 + }, + { + "entropy": 1.703192909558614, + "epoch": 1.6805635659553433, + "grad_norm": 0.6351720690727234, + "learning_rate": 3.178562901088712e-06, + "loss": 1.2872, + "mean_token_accuracy": 0.674397294720014, + "num_tokens": 2566981790.0, + "step": 15298 + }, + { + "entropy": 1.6544977327187855, + "epoch": 1.680673422866716, + "grad_norm": 0.7234415411949158, + "learning_rate": 3.1777709035550318e-06, + "loss": 1.3261, + "mean_token_accuracy": 0.6573912451664606, + "num_tokens": 2567177661.0, + "step": 15299 + }, + { + "entropy": 1.6995947659015656, + "epoch": 1.680783279778089, + "grad_norm": 0.7154074907302856, + "learning_rate": 3.1769791535913767e-06, + "loss": 1.4392, + "mean_token_accuracy": 0.6578214665253957, + "num_tokens": 2567339837.0, + "step": 15300 + }, + { + "entropy": 1.6951833069324493, + "epoch": 1.680893136689462, + "grad_norm": 0.6429846286773682, + "learning_rate": 3.176187651222806e-06, + "loss": 1.5831, + "mean_token_accuracy": 0.6406177133321762, + "num_tokens": 2567544062.0, + "step": 15301 + }, + { + "entropy": 1.7316114902496338, + "epoch": 1.6810029936008348, + "grad_norm": 0.7316505312919617, + "learning_rate": 3.175396396474373e-06, + "loss": 1.4443, + "mean_token_accuracy": 0.6586751093467077, + "num_tokens": 2567720570.0, + "step": 15302 + }, + { + "entropy": 1.724622756242752, + "epoch": 1.681112850512208, + "grad_norm": 0.6855919361114502, + "learning_rate": 3.174605389371118e-06, + "loss": 1.3854, + "mean_token_accuracy": 0.6606116443872452, + "num_tokens": 2567892992.0, + "step": 15303 + }, + { + "entropy": 1.7502660353978474, + "epoch": 1.6812227074235808, + "grad_norm": 0.6851439476013184, + "learning_rate": 3.1738146299380746e-06, + "loss": 1.4903, + "mean_token_accuracy": 0.6492257912953695, + "num_tokens": 2568070857.0, + "step": 15304 + }, + { + "entropy": 1.74208668867747, + "epoch": 1.6813325643349537, + "grad_norm": 0.7176704406738281, + "learning_rate": 3.173024118200273e-06, + "loss": 1.5042, + "mean_token_accuracy": 0.6514915178219477, + "num_tokens": 2568241367.0, + "step": 15305 + }, + { + "entropy": 1.765565186738968, + "epoch": 1.6814424212463268, + "grad_norm": 0.6486221551895142, + "learning_rate": 3.1722338541827313e-06, + "loss": 1.4233, + "mean_token_accuracy": 0.6390677789847056, + "num_tokens": 2568434347.0, + "step": 15306 + }, + { + "entropy": 1.74800306558609, + "epoch": 1.6815522781576995, + "grad_norm": 0.6911203861236572, + "learning_rate": 3.1714438379104583e-06, + "loss": 1.5126, + "mean_token_accuracy": 0.6483513911565145, + "num_tokens": 2568568047.0, + "step": 15307 + }, + { + "entropy": 1.6643619934717815, + "epoch": 1.6816621350690726, + "grad_norm": 0.7120999693870544, + "learning_rate": 3.170654069408463e-06, + "loss": 1.2547, + "mean_token_accuracy": 0.6775770286719004, + "num_tokens": 2568718128.0, + "step": 15308 + }, + { + "entropy": 1.7326288719971974, + "epoch": 1.6817719919804455, + "grad_norm": 0.789517343044281, + "learning_rate": 3.169864548701736e-06, + "loss": 1.4496, + "mean_token_accuracy": 0.6548430124918619, + "num_tokens": 2568891251.0, + "step": 15309 + }, + { + "entropy": 1.7174657980600994, + "epoch": 1.6818818488918184, + "grad_norm": 0.6772891283035278, + "learning_rate": 3.1690752758152697e-06, + "loss": 1.4284, + "mean_token_accuracy": 0.6426176180442175, + "num_tokens": 2569037403.0, + "step": 15310 + }, + { + "entropy": 1.6879849930604298, + "epoch": 1.6819917058031915, + "grad_norm": 0.6050668358802795, + "learning_rate": 3.1682862507740425e-06, + "loss": 1.4879, + "mean_token_accuracy": 0.6514971653620402, + "num_tokens": 2569232119.0, + "step": 15311 + }, + { + "entropy": 1.7036270002524059, + "epoch": 1.6821015627145641, + "grad_norm": 0.6641897559165955, + "learning_rate": 3.1674974736030233e-06, + "loss": 1.3175, + "mean_token_accuracy": 0.6694677621126175, + "num_tokens": 2569363697.0, + "step": 15312 + }, + { + "entropy": 1.6906124949455261, + "epoch": 1.6822114196259372, + "grad_norm": 0.7731119990348816, + "learning_rate": 3.166708944327181e-06, + "loss": 1.3484, + "mean_token_accuracy": 0.667941133181254, + "num_tokens": 2569490915.0, + "step": 15313 + }, + { + "entropy": 1.7285043100516002, + "epoch": 1.6823212765373101, + "grad_norm": 0.747154176235199, + "learning_rate": 3.165920662971472e-06, + "loss": 1.3582, + "mean_token_accuracy": 0.6714018086592356, + "num_tokens": 2569599168.0, + "step": 15314 + }, + { + "entropy": 1.7480494777361553, + "epoch": 1.682431133448683, + "grad_norm": 0.7452878952026367, + "learning_rate": 3.1651326295608447e-06, + "loss": 1.2041, + "mean_token_accuracy": 0.6810566087563833, + "num_tokens": 2569700113.0, + "step": 15315 + }, + { + "entropy": 1.7515860497951508, + "epoch": 1.682540990360056, + "grad_norm": 0.7225151658058167, + "learning_rate": 3.164344844120237e-06, + "loss": 1.314, + "mean_token_accuracy": 0.6686030477285385, + "num_tokens": 2569822465.0, + "step": 15316 + }, + { + "entropy": 1.7134381830692291, + "epoch": 1.682650847271429, + "grad_norm": 0.6826877593994141, + "learning_rate": 3.1635573066745855e-06, + "loss": 1.4157, + "mean_token_accuracy": 0.64534163971742, + "num_tokens": 2570027829.0, + "step": 15317 + }, + { + "entropy": 1.705380419890086, + "epoch": 1.6827607041828019, + "grad_norm": 0.6510130167007446, + "learning_rate": 3.1627700172488147e-06, + "loss": 1.2904, + "mean_token_accuracy": 0.6670472820599874, + "num_tokens": 2570147721.0, + "step": 15318 + }, + { + "entropy": 1.683544745047887, + "epoch": 1.682870561094175, + "grad_norm": 0.5858747363090515, + "learning_rate": 3.1619829758678388e-06, + "loss": 1.493, + "mean_token_accuracy": 0.6476639409859976, + "num_tokens": 2570342162.0, + "step": 15319 + }, + { + "entropy": 1.6911889413992565, + "epoch": 1.6829804180055477, + "grad_norm": 0.8070988059043884, + "learning_rate": 3.1611961825565725e-06, + "loss": 1.2663, + "mean_token_accuracy": 0.6720109234253565, + "num_tokens": 2570525734.0, + "step": 15320 + }, + { + "entropy": 1.7007411917050679, + "epoch": 1.6830902749169208, + "grad_norm": 0.6247788071632385, + "learning_rate": 3.160409637339913e-06, + "loss": 1.417, + "mean_token_accuracy": 0.6442168205976486, + "num_tokens": 2570720758.0, + "step": 15321 + }, + { + "entropy": 1.7167495091756184, + "epoch": 1.6832001318282936, + "grad_norm": 0.5952068567276001, + "learning_rate": 3.159623340242757e-06, + "loss": 1.313, + "mean_token_accuracy": 0.6723757932583491, + "num_tokens": 2570871146.0, + "step": 15322 + }, + { + "entropy": 1.747285137573878, + "epoch": 1.6833099887396665, + "grad_norm": 0.7220256328582764, + "learning_rate": 3.158837291289989e-06, + "loss": 1.3158, + "mean_token_accuracy": 0.6664845049381256, + "num_tokens": 2571002515.0, + "step": 15323 + }, + { + "entropy": 1.7323359350363414, + "epoch": 1.6834198456510396, + "grad_norm": 0.841284453868866, + "learning_rate": 3.158051490506486e-06, + "loss": 1.4707, + "mean_token_accuracy": 0.660729338725408, + "num_tokens": 2571162161.0, + "step": 15324 + }, + { + "entropy": 1.725009063879649, + "epoch": 1.6835297025624123, + "grad_norm": 0.7810693383216858, + "learning_rate": 3.15726593791712e-06, + "loss": 1.4043, + "mean_token_accuracy": 0.6503799458344778, + "num_tokens": 2571297383.0, + "step": 15325 + }, + { + "entropy": 1.7619624336560566, + "epoch": 1.6836395594737854, + "grad_norm": 0.7505675554275513, + "learning_rate": 3.1564806335467544e-06, + "loss": 1.4133, + "mean_token_accuracy": 0.6519313355286916, + "num_tokens": 2571464771.0, + "step": 15326 + }, + { + "entropy": 1.7304276923338573, + "epoch": 1.6837494163851583, + "grad_norm": 0.6991888880729675, + "learning_rate": 3.1556955774202436e-06, + "loss": 1.2079, + "mean_token_accuracy": 0.6864756196737289, + "num_tokens": 2571579779.0, + "step": 15327 + }, + { + "entropy": 1.6945532461007435, + "epoch": 1.6838592732965312, + "grad_norm": 0.7731361985206604, + "learning_rate": 3.154910769562429e-06, + "loss": 1.3362, + "mean_token_accuracy": 0.6732407162586848, + "num_tokens": 2571731395.0, + "step": 15328 + }, + { + "entropy": 1.6676548918088276, + "epoch": 1.6839691302079043, + "grad_norm": 0.6803898215293884, + "learning_rate": 3.1541262099981573e-06, + "loss": 1.458, + "mean_token_accuracy": 0.6517745653788248, + "num_tokens": 2571903109.0, + "step": 15329 + }, + { + "entropy": 1.7044414083162944, + "epoch": 1.6840789871192772, + "grad_norm": 0.6813333630561829, + "learning_rate": 3.1533418987522547e-06, + "loss": 1.4173, + "mean_token_accuracy": 0.6529068152109782, + "num_tokens": 2572070768.0, + "step": 15330 + }, + { + "entropy": 1.690029243628184, + "epoch": 1.68418884403065, + "grad_norm": 0.6491711735725403, + "learning_rate": 3.1525578358495433e-06, + "loss": 1.2993, + "mean_token_accuracy": 0.6696517119805018, + "num_tokens": 2572198958.0, + "step": 15331 + }, + { + "entropy": 1.6722242434819539, + "epoch": 1.6842987009420232, + "grad_norm": 0.7356240749359131, + "learning_rate": 3.151774021314842e-06, + "loss": 1.319, + "mean_token_accuracy": 0.6778273731470108, + "num_tokens": 2572348869.0, + "step": 15332 + }, + { + "entropy": 1.7420857747395833, + "epoch": 1.6844085578533958, + "grad_norm": 0.7489916086196899, + "learning_rate": 3.1509904551729554e-06, + "loss": 1.4856, + "mean_token_accuracy": 0.6390324880679449, + "num_tokens": 2572512314.0, + "step": 15333 + }, + { + "entropy": 1.751690109570821, + "epoch": 1.684518414764769, + "grad_norm": 0.6961585879325867, + "learning_rate": 3.150207137448686e-06, + "loss": 1.2745, + "mean_token_accuracy": 0.6709187477827072, + "num_tokens": 2572666138.0, + "step": 15334 + }, + { + "entropy": 1.6826651493708293, + "epoch": 1.6846282716761418, + "grad_norm": 0.6414405703544617, + "learning_rate": 3.149424068166822e-06, + "loss": 1.2945, + "mean_token_accuracy": 0.6786713004112244, + "num_tokens": 2572843000.0, + "step": 15335 + }, + { + "entropy": 1.6688839693864186, + "epoch": 1.6847381285875147, + "grad_norm": 0.8924053907394409, + "learning_rate": 3.1486412473521476e-06, + "loss": 1.387, + "mean_token_accuracy": 0.6693562765916189, + "num_tokens": 2572979120.0, + "step": 15336 + }, + { + "entropy": 1.7230990827083588, + "epoch": 1.6848479854988878, + "grad_norm": 0.601993978023529, + "learning_rate": 3.14785867502944e-06, + "loss": 1.3809, + "mean_token_accuracy": 0.6529526164134344, + "num_tokens": 2573168637.0, + "step": 15337 + }, + { + "entropy": 1.7520112891991932, + "epoch": 1.6849578424102605, + "grad_norm": 0.5566615462303162, + "learning_rate": 3.147076351223469e-06, + "loss": 1.4751, + "mean_token_accuracy": 0.631900375088056, + "num_tokens": 2573388244.0, + "step": 15338 + }, + { + "entropy": 1.731001118818919, + "epoch": 1.6850676993216336, + "grad_norm": 0.7146487236022949, + "learning_rate": 3.1462942759589933e-06, + "loss": 1.2527, + "mean_token_accuracy": 0.6749810228745142, + "num_tokens": 2573519678.0, + "step": 15339 + }, + { + "entropy": 1.694351961215337, + "epoch": 1.6851775562330065, + "grad_norm": 0.6235674023628235, + "learning_rate": 3.145512449260762e-06, + "loss": 1.4673, + "mean_token_accuracy": 0.6534475237131119, + "num_tokens": 2573695861.0, + "step": 15340 + }, + { + "entropy": 1.7240705291430156, + "epoch": 1.6852874131443794, + "grad_norm": 1.222989797592163, + "learning_rate": 3.144730871153525e-06, + "loss": 1.5403, + "mean_token_accuracy": 0.643691211938858, + "num_tokens": 2573907238.0, + "step": 15341 + }, + { + "entropy": 1.7270687023798625, + "epoch": 1.6853972700557525, + "grad_norm": 0.6817310452461243, + "learning_rate": 3.1439495416620157e-06, + "loss": 1.4433, + "mean_token_accuracy": 0.662896732489268, + "num_tokens": 2574080197.0, + "step": 15342 + }, + { + "entropy": 1.6700053215026855, + "epoch": 1.6855071269671253, + "grad_norm": 0.6429228186607361, + "learning_rate": 3.1431684608109614e-06, + "loss": 1.5984, + "mean_token_accuracy": 0.6422629406054815, + "num_tokens": 2574260989.0, + "step": 15343 + }, + { + "entropy": 1.6401211122671764, + "epoch": 1.6856169838784982, + "grad_norm": 0.5946700572967529, + "learning_rate": 3.1423876286250872e-06, + "loss": 1.3121, + "mean_token_accuracy": 0.6619760394096375, + "num_tokens": 2574505321.0, + "step": 15344 + }, + { + "entropy": 1.6671649018923442, + "epoch": 1.6857268407898713, + "grad_norm": 0.8995504975318909, + "learning_rate": 3.1416070451291024e-06, + "loss": 1.3446, + "mean_token_accuracy": 0.6812218924363455, + "num_tokens": 2574649743.0, + "step": 15345 + }, + { + "entropy": 1.702040175596873, + "epoch": 1.685836697701244, + "grad_norm": 0.7074987292289734, + "learning_rate": 3.140826710347715e-06, + "loss": 1.3002, + "mean_token_accuracy": 0.6756115506092707, + "num_tokens": 2574848047.0, + "step": 15346 + }, + { + "entropy": 1.7741004427274067, + "epoch": 1.685946554612617, + "grad_norm": 0.6643980145454407, + "learning_rate": 3.14004662430562e-06, + "loss": 1.356, + "mean_token_accuracy": 0.6614086826642355, + "num_tokens": 2575008827.0, + "step": 15347 + }, + { + "entropy": 1.6915812889734905, + "epoch": 1.68605641152399, + "grad_norm": 0.6701132655143738, + "learning_rate": 3.1392667870275066e-06, + "loss": 1.4227, + "mean_token_accuracy": 0.6473148117462794, + "num_tokens": 2575176906.0, + "step": 15348 + }, + { + "entropy": 1.7173262635866802, + "epoch": 1.6861662684353629, + "grad_norm": 0.6805701851844788, + "learning_rate": 3.1384871985380582e-06, + "loss": 1.4934, + "mean_token_accuracy": 0.6477487633625666, + "num_tokens": 2575349117.0, + "step": 15349 + }, + { + "entropy": 1.7245887120564778, + "epoch": 1.686276125346736, + "grad_norm": 0.6441610455513, + "learning_rate": 3.137707858861947e-06, + "loss": 1.2899, + "mean_token_accuracy": 0.6831858903169632, + "num_tokens": 2575498227.0, + "step": 15350 + }, + { + "entropy": 1.722615083058675, + "epoch": 1.6863859822581087, + "grad_norm": 0.6894484758377075, + "learning_rate": 3.1369287680238403e-06, + "loss": 1.3521, + "mean_token_accuracy": 0.6721992939710617, + "num_tokens": 2575690922.0, + "step": 15351 + }, + { + "entropy": 1.7229991952578227, + "epoch": 1.6864958391694818, + "grad_norm": 0.6383141279220581, + "learning_rate": 3.1361499260483948e-06, + "loss": 1.4355, + "mean_token_accuracy": 0.6438505450884501, + "num_tokens": 2575897646.0, + "step": 15352 + }, + { + "entropy": 1.7160128851731618, + "epoch": 1.6866056960808546, + "grad_norm": 0.7071347236633301, + "learning_rate": 3.13537133296026e-06, + "loss": 1.3538, + "mean_token_accuracy": 0.6632434278726578, + "num_tokens": 2576026434.0, + "step": 15353 + }, + { + "entropy": 1.7085239390532176, + "epoch": 1.6867155529922275, + "grad_norm": 0.7150105237960815, + "learning_rate": 3.1345929887840785e-06, + "loss": 1.3223, + "mean_token_accuracy": 0.6567959388097128, + "num_tokens": 2576136070.0, + "step": 15354 + }, + { + "entropy": 1.7165654997030895, + "epoch": 1.6868254099036006, + "grad_norm": 0.7486876845359802, + "learning_rate": 3.1338148935444856e-06, + "loss": 1.3181, + "mean_token_accuracy": 0.6667283674081167, + "num_tokens": 2576242543.0, + "step": 15355 + }, + { + "entropy": 1.76658500234286, + "epoch": 1.6869352668149735, + "grad_norm": 0.7848101854324341, + "learning_rate": 3.133037047266105e-06, + "loss": 1.4248, + "mean_token_accuracy": 0.6585915784041086, + "num_tokens": 2576381772.0, + "step": 15356 + }, + { + "entropy": 1.7358842889467876, + "epoch": 1.6870451237263464, + "grad_norm": 0.6816839575767517, + "learning_rate": 3.1322594499735566e-06, + "loss": 1.3862, + "mean_token_accuracy": 0.6490218391021093, + "num_tokens": 2576531068.0, + "step": 15357 + }, + { + "entropy": 1.6990918318430583, + "epoch": 1.6871549806377195, + "grad_norm": 1.0314568281173706, + "learning_rate": 3.1314821016914535e-06, + "loss": 1.3518, + "mean_token_accuracy": 0.6715274453163147, + "num_tokens": 2576653983.0, + "step": 15358 + }, + { + "entropy": 1.6669905682404835, + "epoch": 1.6872648375490922, + "grad_norm": 0.590815007686615, + "learning_rate": 3.1307050024443963e-06, + "loss": 1.4015, + "mean_token_accuracy": 0.6566647191842397, + "num_tokens": 2576831940.0, + "step": 15359 + }, + { + "entropy": 1.673220157623291, + "epoch": 1.6873746944604653, + "grad_norm": 0.643791913986206, + "learning_rate": 3.129928152256978e-06, + "loss": 1.4797, + "mean_token_accuracy": 0.6434496690829595, + "num_tokens": 2577049426.0, + "step": 15360 + }, + { + "entropy": 1.6904392540454865, + "epoch": 1.6874845513718382, + "grad_norm": 0.5934916138648987, + "learning_rate": 3.129151551153789e-06, + "loss": 1.5356, + "mean_token_accuracy": 0.632567952076594, + "num_tokens": 2577254922.0, + "step": 15361 + }, + { + "entropy": 1.664735992749532, + "epoch": 1.687594408283211, + "grad_norm": 0.6659498810768127, + "learning_rate": 3.1283751991594064e-06, + "loss": 1.3438, + "mean_token_accuracy": 0.6631951779127121, + "num_tokens": 2577415164.0, + "step": 15362 + }, + { + "entropy": 1.719626933336258, + "epoch": 1.6877042651945842, + "grad_norm": 0.6992260813713074, + "learning_rate": 3.1275990962984e-06, + "loss": 1.4677, + "mean_token_accuracy": 0.6572467486063639, + "num_tokens": 2577570986.0, + "step": 15363 + }, + { + "entropy": 1.7156515419483185, + "epoch": 1.6878141221059568, + "grad_norm": 0.6852288842201233, + "learning_rate": 3.1268232425953364e-06, + "loss": 1.3626, + "mean_token_accuracy": 0.6607407828172048, + "num_tokens": 2577754997.0, + "step": 15364 + }, + { + "entropy": 1.7793689171473186, + "epoch": 1.68792397901733, + "grad_norm": 0.6801753044128418, + "learning_rate": 3.126047638074768e-06, + "loss": 1.4492, + "mean_token_accuracy": 0.639577383796374, + "num_tokens": 2577919506.0, + "step": 15365 + }, + { + "entropy": 1.7608330448468525, + "epoch": 1.6880338359287028, + "grad_norm": 0.7202026844024658, + "learning_rate": 3.1252722827612463e-06, + "loss": 1.4545, + "mean_token_accuracy": 0.6331879695256551, + "num_tokens": 2578100044.0, + "step": 15366 + }, + { + "entropy": 1.724254459142685, + "epoch": 1.6881436928400757, + "grad_norm": 0.6835639476776123, + "learning_rate": 3.124497176679308e-06, + "loss": 1.3549, + "mean_token_accuracy": 0.6532203555107117, + "num_tokens": 2578267048.0, + "step": 15367 + }, + { + "entropy": 1.6863794922828674, + "epoch": 1.6882535497514488, + "grad_norm": 0.7550612092018127, + "learning_rate": 3.1237223198534823e-06, + "loss": 1.1698, + "mean_token_accuracy": 0.6900685677925745, + "num_tokens": 2578366956.0, + "step": 15368 + }, + { + "entropy": 1.7102207442124684, + "epoch": 1.6883634066628217, + "grad_norm": 0.7050641179084778, + "learning_rate": 3.1229477123082968e-06, + "loss": 1.4534, + "mean_token_accuracy": 0.6498723477125168, + "num_tokens": 2578560893.0, + "step": 15369 + }, + { + "entropy": 1.6907469928264618, + "epoch": 1.6884732635741946, + "grad_norm": 0.5717144012451172, + "learning_rate": 3.1221733540682692e-06, + "loss": 1.4725, + "mean_token_accuracy": 0.647341325879097, + "num_tokens": 2578836855.0, + "step": 15370 + }, + { + "entropy": 1.676329771677653, + "epoch": 1.6885831204855677, + "grad_norm": 0.8052626252174377, + "learning_rate": 3.121399245157904e-06, + "loss": 1.558, + "mean_token_accuracy": 0.6526962419350942, + "num_tokens": 2579026404.0, + "step": 15371 + }, + { + "entropy": 1.671715994675954, + "epoch": 1.6886929773969404, + "grad_norm": 0.7153114080429077, + "learning_rate": 3.120625385601701e-06, + "loss": 1.2896, + "mean_token_accuracy": 0.6790550202131271, + "num_tokens": 2579188647.0, + "step": 15372 + }, + { + "entropy": 1.701625217994054, + "epoch": 1.6888028343083135, + "grad_norm": 0.7289735078811646, + "learning_rate": 3.1198517754241565e-06, + "loss": 1.3561, + "mean_token_accuracy": 0.674707810084025, + "num_tokens": 2579331926.0, + "step": 15373 + }, + { + "entropy": 1.7479176918665569, + "epoch": 1.6889126912196863, + "grad_norm": 0.7183084487915039, + "learning_rate": 3.119078414649753e-06, + "loss": 1.2997, + "mean_token_accuracy": 0.6685334344704946, + "num_tokens": 2579527532.0, + "step": 15374 + }, + { + "entropy": 1.7347841362158458, + "epoch": 1.6890225481310592, + "grad_norm": 0.7196807265281677, + "learning_rate": 3.118305303302962e-06, + "loss": 1.3305, + "mean_token_accuracy": 0.6832280606031418, + "num_tokens": 2579691633.0, + "step": 15375 + }, + { + "entropy": 1.736104021469752, + "epoch": 1.6891324050424323, + "grad_norm": 0.7943740487098694, + "learning_rate": 3.117532441408261e-06, + "loss": 1.5753, + "mean_token_accuracy": 0.6446651866038641, + "num_tokens": 2579897232.0, + "step": 15376 + }, + { + "entropy": 1.793796718120575, + "epoch": 1.6892422619538052, + "grad_norm": 0.7013726830482483, + "learning_rate": 3.116759828990103e-06, + "loss": 1.2496, + "mean_token_accuracy": 0.6713108470042547, + "num_tokens": 2580015103.0, + "step": 15377 + }, + { + "entropy": 1.6883254448572795, + "epoch": 1.689352118865178, + "grad_norm": 0.7592623829841614, + "learning_rate": 3.115987466072946e-06, + "loss": 1.456, + "mean_token_accuracy": 0.6466412742932638, + "num_tokens": 2580213602.0, + "step": 15378 + }, + { + "entropy": 1.6982381443182628, + "epoch": 1.689461975776551, + "grad_norm": 0.7187153697013855, + "learning_rate": 3.1152153526812343e-06, + "loss": 1.3754, + "mean_token_accuracy": 0.6655093431472778, + "num_tokens": 2580340316.0, + "step": 15379 + }, + { + "entropy": 1.719240536292394, + "epoch": 1.6895718326879239, + "grad_norm": 0.6955122351646423, + "learning_rate": 3.1144434888394003e-06, + "loss": 1.335, + "mean_token_accuracy": 0.6731832573811213, + "num_tokens": 2580460689.0, + "step": 15380 + }, + { + "entropy": 1.6896177033583324, + "epoch": 1.689681689599297, + "grad_norm": 0.689373791217804, + "learning_rate": 3.113671874571878e-06, + "loss": 1.38, + "mean_token_accuracy": 0.658390611410141, + "num_tokens": 2580613434.0, + "step": 15381 + }, + { + "entropy": 1.6444495916366577, + "epoch": 1.6897915465106699, + "grad_norm": 0.7218437194824219, + "learning_rate": 3.112900509903088e-06, + "loss": 1.1382, + "mean_token_accuracy": 0.699143057068189, + "num_tokens": 2580707620.0, + "step": 15382 + }, + { + "entropy": 1.7752784192562103, + "epoch": 1.6899014034220428, + "grad_norm": 0.7364796996116638, + "learning_rate": 3.1121293948574438e-06, + "loss": 1.4396, + "mean_token_accuracy": 0.6354698687791824, + "num_tokens": 2580891653.0, + "step": 15383 + }, + { + "entropy": 1.6694513857364655, + "epoch": 1.6900112603334159, + "grad_norm": 0.764617383480072, + "learning_rate": 3.111358529459348e-06, + "loss": 1.2351, + "mean_token_accuracy": 0.6757246901591619, + "num_tokens": 2581032836.0, + "step": 15384 + }, + { + "entropy": 1.7019239862759907, + "epoch": 1.6901211172447885, + "grad_norm": 0.7817053198814392, + "learning_rate": 3.1105879137332006e-06, + "loss": 1.4947, + "mean_token_accuracy": 0.6470496108134588, + "num_tokens": 2581191748.0, + "step": 15385 + }, + { + "entropy": 1.6950092216332753, + "epoch": 1.6902309741561616, + "grad_norm": 0.7115320563316345, + "learning_rate": 3.109817547703392e-06, + "loss": 1.3195, + "mean_token_accuracy": 0.665327916542689, + "num_tokens": 2581334830.0, + "step": 15386 + }, + { + "entropy": 1.7190478245417278, + "epoch": 1.6903408310675345, + "grad_norm": 0.6659532785415649, + "learning_rate": 3.1090474313942998e-06, + "loss": 1.3647, + "mean_token_accuracy": 0.6521613150835037, + "num_tokens": 2581483982.0, + "step": 15387 + }, + { + "entropy": 1.7014067073663075, + "epoch": 1.6904506879789074, + "grad_norm": 0.8550192713737488, + "learning_rate": 3.108277564830303e-06, + "loss": 1.5101, + "mean_token_accuracy": 0.6476392249266306, + "num_tokens": 2581639613.0, + "step": 15388 + }, + { + "entropy": 1.6655798256397247, + "epoch": 1.6905605448902805, + "grad_norm": 0.7115856409072876, + "learning_rate": 3.1075079480357634e-06, + "loss": 1.4428, + "mean_token_accuracy": 0.6531449556350708, + "num_tokens": 2581810059.0, + "step": 15389 + }, + { + "entropy": 1.7903032004833221, + "epoch": 1.6906704018016534, + "grad_norm": 0.7611822485923767, + "learning_rate": 3.106738581035042e-06, + "loss": 1.4924, + "mean_token_accuracy": 0.635255828499794, + "num_tokens": 2581989898.0, + "step": 15390 + }, + { + "entropy": 1.6987177928288777, + "epoch": 1.6907802587130263, + "grad_norm": 0.6165050268173218, + "learning_rate": 3.1059694638524886e-06, + "loss": 1.3535, + "mean_token_accuracy": 0.6634356826543808, + "num_tokens": 2582145058.0, + "step": 15391 + }, + { + "entropy": 1.6762576599915822, + "epoch": 1.6908901156243992, + "grad_norm": 0.6371328234672546, + "learning_rate": 3.105200596512442e-06, + "loss": 1.4345, + "mean_token_accuracy": 0.6503476947546005, + "num_tokens": 2582368067.0, + "step": 15392 + }, + { + "entropy": 1.702197919289271, + "epoch": 1.690999972535772, + "grad_norm": 0.75450199842453, + "learning_rate": 3.10443197903924e-06, + "loss": 1.4777, + "mean_token_accuracy": 0.6465541025002798, + "num_tokens": 2582567443.0, + "step": 15393 + }, + { + "entropy": 1.6959167917569478, + "epoch": 1.6911098294471452, + "grad_norm": 0.7660825252532959, + "learning_rate": 3.1036636114572088e-06, + "loss": 1.1762, + "mean_token_accuracy": 0.6866554866234461, + "num_tokens": 2582690966.0, + "step": 15394 + }, + { + "entropy": 1.7242404520511627, + "epoch": 1.691219686358518, + "grad_norm": 0.8516025543212891, + "learning_rate": 3.1028954937906668e-06, + "loss": 1.4467, + "mean_token_accuracy": 0.6590066701173782, + "num_tokens": 2582850808.0, + "step": 15395 + }, + { + "entropy": 1.7131598989168804, + "epoch": 1.691329543269891, + "grad_norm": 0.7381974458694458, + "learning_rate": 3.1021276260639217e-06, + "loss": 1.4181, + "mean_token_accuracy": 0.6618935763835907, + "num_tokens": 2583009712.0, + "step": 15396 + }, + { + "entropy": 1.6620845595995586, + "epoch": 1.691439400181264, + "grad_norm": 0.6693155169487, + "learning_rate": 3.10136000830128e-06, + "loss": 1.5235, + "mean_token_accuracy": 0.6472597966591517, + "num_tokens": 2583188095.0, + "step": 15397 + }, + { + "entropy": 1.672204573949178, + "epoch": 1.6915492570926367, + "grad_norm": 0.936718225479126, + "learning_rate": 3.1005926405270353e-06, + "loss": 1.2397, + "mean_token_accuracy": 0.6774502595265707, + "num_tokens": 2583334819.0, + "step": 15398 + }, + { + "entropy": 1.7408578594525654, + "epoch": 1.6916591140040098, + "grad_norm": 0.6551694869995117, + "learning_rate": 3.099825522765472e-06, + "loss": 1.3283, + "mean_token_accuracy": 0.6612852861483892, + "num_tokens": 2583476321.0, + "step": 15399 + }, + { + "entropy": 1.6576103170712788, + "epoch": 1.6917689709153827, + "grad_norm": 0.7109887003898621, + "learning_rate": 3.099058655040873e-06, + "loss": 1.4108, + "mean_token_accuracy": 0.6661920497814814, + "num_tokens": 2583634776.0, + "step": 15400 + }, + { + "entropy": 1.795667548974355, + "epoch": 1.6918788278267556, + "grad_norm": 0.8126919865608215, + "learning_rate": 3.098292037377505e-06, + "loss": 1.4247, + "mean_token_accuracy": 0.666491856177648, + "num_tokens": 2583784360.0, + "step": 15401 + }, + { + "entropy": 1.6871559222539265, + "epoch": 1.6919886847381287, + "grad_norm": 0.6559981107711792, + "learning_rate": 3.0975256697996358e-06, + "loss": 1.2416, + "mean_token_accuracy": 0.6782428324222565, + "num_tokens": 2583926742.0, + "step": 15402 + }, + { + "entropy": 1.7118937869866688, + "epoch": 1.6920985416495016, + "grad_norm": 0.7892350554466248, + "learning_rate": 3.096759552331518e-06, + "loss": 1.4847, + "mean_token_accuracy": 0.6499680678049723, + "num_tokens": 2584097203.0, + "step": 15403 + }, + { + "entropy": 1.7176280121008556, + "epoch": 1.6922083985608745, + "grad_norm": 0.6054561734199524, + "learning_rate": 3.0959936849973974e-06, + "loss": 1.2682, + "mean_token_accuracy": 0.6729069898525873, + "num_tokens": 2584226875.0, + "step": 15404 + }, + { + "entropy": 1.7033534049987793, + "epoch": 1.6923182554722473, + "grad_norm": 0.6824467778205872, + "learning_rate": 3.095228067821517e-06, + "loss": 1.376, + "mean_token_accuracy": 0.6606364697217941, + "num_tokens": 2584430184.0, + "step": 15405 + }, + { + "entropy": 1.6788690189520519, + "epoch": 1.6924281123836202, + "grad_norm": 0.6166786551475525, + "learning_rate": 3.0944627008281034e-06, + "loss": 1.3412, + "mean_token_accuracy": 0.6616918991009394, + "num_tokens": 2584586860.0, + "step": 15406 + }, + { + "entropy": 1.6675065159797668, + "epoch": 1.6925379692949933, + "grad_norm": 0.6525241732597351, + "learning_rate": 3.0936975840413863e-06, + "loss": 1.5037, + "mean_token_accuracy": 0.6569078887502352, + "num_tokens": 2584788098.0, + "step": 15407 + }, + { + "entropy": 1.719109723965327, + "epoch": 1.6926478262063662, + "grad_norm": 0.629504919052124, + "learning_rate": 3.0929327174855765e-06, + "loss": 1.4084, + "mean_token_accuracy": 0.6696446587642034, + "num_tokens": 2584978784.0, + "step": 15408 + }, + { + "entropy": 1.6656635701656342, + "epoch": 1.692757683117739, + "grad_norm": 0.5897053480148315, + "learning_rate": 3.092168101184883e-06, + "loss": 1.4735, + "mean_token_accuracy": 0.6469605465730032, + "num_tokens": 2585167598.0, + "step": 15409 + }, + { + "entropy": 1.7369298934936523, + "epoch": 1.6928675400291122, + "grad_norm": 0.7727683186531067, + "learning_rate": 3.091403735163507e-06, + "loss": 1.4347, + "mean_token_accuracy": 0.6634906083345413, + "num_tokens": 2585317585.0, + "step": 15410 + }, + { + "entropy": 1.6967070400714874, + "epoch": 1.6929773969404849, + "grad_norm": 0.6468930840492249, + "learning_rate": 3.090639619445638e-06, + "loss": 1.4222, + "mean_token_accuracy": 0.6505736857652664, + "num_tokens": 2585533635.0, + "step": 15411 + }, + { + "entropy": 1.6828734079996746, + "epoch": 1.693087253851858, + "grad_norm": 0.7482141256332397, + "learning_rate": 3.08987575405546e-06, + "loss": 1.2519, + "mean_token_accuracy": 0.6751055518786112, + "num_tokens": 2585666750.0, + "step": 15412 + }, + { + "entropy": 1.6802496711413066, + "epoch": 1.6931971107632309, + "grad_norm": 0.5791299939155579, + "learning_rate": 3.0891121390171498e-06, + "loss": 1.4935, + "mean_token_accuracy": 0.6477037717898687, + "num_tokens": 2585855103.0, + "step": 15413 + }, + { + "entropy": 1.7238198220729828, + "epoch": 1.6933069676746038, + "grad_norm": 0.6337864995002747, + "learning_rate": 3.088348774354878e-06, + "loss": 1.4572, + "mean_token_accuracy": 0.6338127752145132, + "num_tokens": 2586147894.0, + "step": 15414 + }, + { + "entropy": 1.7157885332902272, + "epoch": 1.6934168245859769, + "grad_norm": 0.647091269493103, + "learning_rate": 3.0875856600928017e-06, + "loss": 1.5886, + "mean_token_accuracy": 0.6402155508597692, + "num_tokens": 2586359826.0, + "step": 15415 + }, + { + "entropy": 1.7303306659062703, + "epoch": 1.6935266814973498, + "grad_norm": 0.6226432919502258, + "learning_rate": 3.0868227962550725e-06, + "loss": 1.3488, + "mean_token_accuracy": 0.6693485826253891, + "num_tokens": 2586542824.0, + "step": 15416 + }, + { + "entropy": 1.6407539049784343, + "epoch": 1.6936365384087226, + "grad_norm": 0.6539502739906311, + "learning_rate": 3.0860601828658377e-06, + "loss": 1.4628, + "mean_token_accuracy": 0.653552715977033, + "num_tokens": 2586738889.0, + "step": 15417 + }, + { + "entropy": 1.7392374575138092, + "epoch": 1.6937463953200955, + "grad_norm": 0.5961517691612244, + "learning_rate": 3.08529781994923e-06, + "loss": 1.4295, + "mean_token_accuracy": 0.654540628194809, + "num_tokens": 2586899359.0, + "step": 15418 + }, + { + "entropy": 1.6805303692817688, + "epoch": 1.6938562522314684, + "grad_norm": 0.6699274182319641, + "learning_rate": 3.0845357075293824e-06, + "loss": 1.3482, + "mean_token_accuracy": 0.6532176484664282, + "num_tokens": 2587053914.0, + "step": 15419 + }, + { + "entropy": 1.7027775545914967, + "epoch": 1.6939661091428415, + "grad_norm": 0.755435585975647, + "learning_rate": 3.0837738456304122e-06, + "loss": 1.3533, + "mean_token_accuracy": 0.6679652184247971, + "num_tokens": 2587186667.0, + "step": 15420 + }, + { + "entropy": 1.7010966738065083, + "epoch": 1.6940759660542144, + "grad_norm": 0.7067722082138062, + "learning_rate": 3.0830122342764314e-06, + "loss": 1.3864, + "mean_token_accuracy": 0.6638060361146927, + "num_tokens": 2587384376.0, + "step": 15421 + }, + { + "entropy": 1.6957202355066936, + "epoch": 1.6941858229655873, + "grad_norm": 0.6873775124549866, + "learning_rate": 3.0822508734915473e-06, + "loss": 1.2841, + "mean_token_accuracy": 0.6708781023820242, + "num_tokens": 2587502711.0, + "step": 15422 + }, + { + "entropy": 1.7451180815696716, + "epoch": 1.6942956798769604, + "grad_norm": 0.6628127098083496, + "learning_rate": 3.0814897632998546e-06, + "loss": 1.5383, + "mean_token_accuracy": 0.6356658140818278, + "num_tokens": 2587749796.0, + "step": 15423 + }, + { + "entropy": 1.7371935844421387, + "epoch": 1.694405536788333, + "grad_norm": 0.7622631788253784, + "learning_rate": 3.0807289037254417e-06, + "loss": 1.3687, + "mean_token_accuracy": 0.663616955280304, + "num_tokens": 2587884693.0, + "step": 15424 + }, + { + "entropy": 1.7322389682133992, + "epoch": 1.6945153936997062, + "grad_norm": 0.648070752620697, + "learning_rate": 3.0799682947923906e-06, + "loss": 1.3667, + "mean_token_accuracy": 0.6544249455134074, + "num_tokens": 2588016729.0, + "step": 15425 + }, + { + "entropy": 1.7039423783620198, + "epoch": 1.694625250611079, + "grad_norm": 0.6290963888168335, + "learning_rate": 3.0792079365247755e-06, + "loss": 1.3423, + "mean_token_accuracy": 0.6653772393862406, + "num_tokens": 2588197565.0, + "step": 15426 + }, + { + "entropy": 1.687830110390981, + "epoch": 1.694735107522452, + "grad_norm": 0.6381257176399231, + "learning_rate": 3.07844782894666e-06, + "loss": 1.3763, + "mean_token_accuracy": 0.6565463542938232, + "num_tokens": 2588365992.0, + "step": 15427 + }, + { + "entropy": 1.6616821885108948, + "epoch": 1.694844964433825, + "grad_norm": 0.8576768040657043, + "learning_rate": 3.0776879720820997e-06, + "loss": 1.4612, + "mean_token_accuracy": 0.6518943955500921, + "num_tokens": 2588519705.0, + "step": 15428 + }, + { + "entropy": 1.7395183543364208, + "epoch": 1.694954821345198, + "grad_norm": 0.71395343542099, + "learning_rate": 3.076928365955147e-06, + "loss": 1.4336, + "mean_token_accuracy": 0.6496130575736364, + "num_tokens": 2588695021.0, + "step": 15429 + }, + { + "entropy": 1.7011998693148296, + "epoch": 1.6950646782565708, + "grad_norm": 0.6785951256752014, + "learning_rate": 3.0761690105898393e-06, + "loss": 1.298, + "mean_token_accuracy": 0.6675257285435995, + "num_tokens": 2588869513.0, + "step": 15430 + }, + { + "entropy": 1.737764298915863, + "epoch": 1.695174535167944, + "grad_norm": 0.9960548877716064, + "learning_rate": 3.0754099060102135e-06, + "loss": 1.3802, + "mean_token_accuracy": 0.6851067890723547, + "num_tokens": 2589015214.0, + "step": 15431 + }, + { + "entropy": 1.6571769615014393, + "epoch": 1.6952843920793166, + "grad_norm": 0.6647917628288269, + "learning_rate": 3.074651052240294e-06, + "loss": 1.3572, + "mean_token_accuracy": 0.6534274220466614, + "num_tokens": 2589200929.0, + "step": 15432 + }, + { + "entropy": 1.6438042024771373, + "epoch": 1.6953942489906897, + "grad_norm": 0.7289125919342041, + "learning_rate": 3.073892449304095e-06, + "loss": 1.4241, + "mean_token_accuracy": 0.659342810511589, + "num_tokens": 2589353999.0, + "step": 15433 + }, + { + "entropy": 1.7090658744176228, + "epoch": 1.6955041059020626, + "grad_norm": 0.7296878695487976, + "learning_rate": 3.0731340972256303e-06, + "loss": 1.3166, + "mean_token_accuracy": 0.6685633112986883, + "num_tokens": 2589524135.0, + "step": 15434 + }, + { + "entropy": 1.6926236947377522, + "epoch": 1.6956139628134355, + "grad_norm": 0.5468199849128723, + "learning_rate": 3.0723759960288997e-06, + "loss": 1.4544, + "mean_token_accuracy": 0.6467462033033371, + "num_tokens": 2589746970.0, + "step": 15435 + }, + { + "entropy": 1.677124152580897, + "epoch": 1.6957238197248086, + "grad_norm": 0.6528844833374023, + "learning_rate": 3.0716181457378945e-06, + "loss": 1.412, + "mean_token_accuracy": 0.6589123407999674, + "num_tokens": 2589913367.0, + "step": 15436 + }, + { + "entropy": 1.735766738653183, + "epoch": 1.6958336766361812, + "grad_norm": 0.6354487538337708, + "learning_rate": 3.070860546376602e-06, + "loss": 1.4608, + "mean_token_accuracy": 0.6477210422356924, + "num_tokens": 2590137460.0, + "step": 15437 + }, + { + "entropy": 1.705346167087555, + "epoch": 1.6959435335475543, + "grad_norm": 0.7688722610473633, + "learning_rate": 3.0701031979690033e-06, + "loss": 1.3772, + "mean_token_accuracy": 0.658728207151095, + "num_tokens": 2590326917.0, + "step": 15438 + }, + { + "entropy": 1.728643884261449, + "epoch": 1.6960533904589272, + "grad_norm": 0.7233805060386658, + "learning_rate": 3.0693461005390636e-06, + "loss": 1.4744, + "mean_token_accuracy": 0.649912640452385, + "num_tokens": 2590497105.0, + "step": 15439 + }, + { + "entropy": 1.7629015843073528, + "epoch": 1.6961632473703, + "grad_norm": 0.6526691317558289, + "learning_rate": 3.0685892541107452e-06, + "loss": 1.4067, + "mean_token_accuracy": 0.6612003346284231, + "num_tokens": 2590642085.0, + "step": 15440 + }, + { + "entropy": 1.6741429766019185, + "epoch": 1.6962731042816732, + "grad_norm": 0.697309672832489, + "learning_rate": 3.067832658708004e-06, + "loss": 1.5036, + "mean_token_accuracy": 0.656085841357708, + "num_tokens": 2590814853.0, + "step": 15441 + }, + { + "entropy": 1.6753457883993785, + "epoch": 1.696382961193046, + "grad_norm": 0.6827280521392822, + "learning_rate": 3.0670763143547853e-06, + "loss": 1.4097, + "mean_token_accuracy": 0.6564631958802541, + "num_tokens": 2591007239.0, + "step": 15442 + }, + { + "entropy": 1.6726371546586354, + "epoch": 1.696492818104419, + "grad_norm": 0.7021117806434631, + "learning_rate": 3.066320221075025e-06, + "loss": 1.5803, + "mean_token_accuracy": 0.6457217087348303, + "num_tokens": 2591224463.0, + "step": 15443 + }, + { + "entropy": 1.6059078176816304, + "epoch": 1.696602675015792, + "grad_norm": 0.6383489370346069, + "learning_rate": 3.065564378892657e-06, + "loss": 1.2984, + "mean_token_accuracy": 0.6645799279212952, + "num_tokens": 2591396199.0, + "step": 15444 + }, + { + "entropy": 1.6366430819034576, + "epoch": 1.6967125319271648, + "grad_norm": 0.5697821378707886, + "learning_rate": 3.064808787831598e-06, + "loss": 1.3218, + "mean_token_accuracy": 0.6684574782848358, + "num_tokens": 2591589772.0, + "step": 15445 + }, + { + "entropy": 1.7955954174200695, + "epoch": 1.6968223888385379, + "grad_norm": 0.6045777201652527, + "learning_rate": 3.0640534479157686e-06, + "loss": 1.6147, + "mean_token_accuracy": 0.6359433382749557, + "num_tokens": 2591765239.0, + "step": 15446 + }, + { + "entropy": 1.7104682524998982, + "epoch": 1.6969322457499108, + "grad_norm": 0.6094769239425659, + "learning_rate": 3.0632983591690695e-06, + "loss": 1.4571, + "mean_token_accuracy": 0.645427738626798, + "num_tokens": 2591963924.0, + "step": 15447 + }, + { + "entropy": 1.6970649858315785, + "epoch": 1.6970421026612836, + "grad_norm": 0.7453381419181824, + "learning_rate": 3.062543521615401e-06, + "loss": 1.3063, + "mean_token_accuracy": 0.6602601408958435, + "num_tokens": 2592117840.0, + "step": 15448 + }, + { + "entropy": 1.6575465599695842, + "epoch": 1.6971519595726567, + "grad_norm": 0.5633898973464966, + "learning_rate": 3.061788935278653e-06, + "loss": 1.3509, + "mean_token_accuracy": 0.650563841064771, + "num_tokens": 2592299746.0, + "step": 15449 + }, + { + "entropy": 1.7399055063724518, + "epoch": 1.6972618164840294, + "grad_norm": 0.6714982390403748, + "learning_rate": 3.0610346001827085e-06, + "loss": 1.3131, + "mean_token_accuracy": 0.6603095183769861, + "num_tokens": 2592412511.0, + "step": 15450 + }, + { + "entropy": 1.6033929189046223, + "epoch": 1.6973716733954025, + "grad_norm": 0.5816755890846252, + "learning_rate": 3.060280516351444e-06, + "loss": 1.32, + "mean_token_accuracy": 0.6678592562675476, + "num_tokens": 2592565463.0, + "step": 15451 + }, + { + "entropy": 1.7209522624810536, + "epoch": 1.6974815303067754, + "grad_norm": 0.6687294840812683, + "learning_rate": 3.0595266838087195e-06, + "loss": 1.5754, + "mean_token_accuracy": 0.6195499996344248, + "num_tokens": 2592769048.0, + "step": 15452 + }, + { + "entropy": 1.695073793331782, + "epoch": 1.6975913872181483, + "grad_norm": 0.7344485521316528, + "learning_rate": 3.0587731025784006e-06, + "loss": 1.3381, + "mean_token_accuracy": 0.6686349560817083, + "num_tokens": 2592917112.0, + "step": 15453 + }, + { + "entropy": 1.748264600833257, + "epoch": 1.6977012441295214, + "grad_norm": 0.7268601655960083, + "learning_rate": 3.058019772684333e-06, + "loss": 1.3523, + "mean_token_accuracy": 0.660114531715711, + "num_tokens": 2593065546.0, + "step": 15454 + }, + { + "entropy": 1.7753359874089558, + "epoch": 1.6978111010408943, + "grad_norm": 0.6813443303108215, + "learning_rate": 3.0572666941503602e-06, + "loss": 1.3395, + "mean_token_accuracy": 0.6615369518597921, + "num_tokens": 2593207742.0, + "step": 15455 + }, + { + "entropy": 1.686709036429723, + "epoch": 1.6979209579522672, + "grad_norm": 0.7412280440330505, + "learning_rate": 3.0565138670003192e-06, + "loss": 1.1122, + "mean_token_accuracy": 0.6998019615809122, + "num_tokens": 2593328448.0, + "step": 15456 + }, + { + "entropy": 1.7757901052633922, + "epoch": 1.6980308148636403, + "grad_norm": 0.8149046897888184, + "learning_rate": 3.0557612912580332e-06, + "loss": 1.6069, + "mean_token_accuracy": 0.6343324283758799, + "num_tokens": 2593476516.0, + "step": 15457 + }, + { + "entropy": 1.6797574857870738, + "epoch": 1.698140671775013, + "grad_norm": 0.6735820770263672, + "learning_rate": 3.055008966947323e-06, + "loss": 1.4156, + "mean_token_accuracy": 0.6628180791934332, + "num_tokens": 2593628490.0, + "step": 15458 + }, + { + "entropy": 1.729365775982539, + "epoch": 1.698250528686386, + "grad_norm": 0.7286481857299805, + "learning_rate": 3.0542568940920007e-06, + "loss": 1.3168, + "mean_token_accuracy": 0.6649684309959412, + "num_tokens": 2593751547.0, + "step": 15459 + }, + { + "entropy": 1.7199760377407074, + "epoch": 1.698360385597759, + "grad_norm": 0.8374939560890198, + "learning_rate": 3.053505072715865e-06, + "loss": 1.4923, + "mean_token_accuracy": 0.6528761138518652, + "num_tokens": 2593899249.0, + "step": 15460 + }, + { + "entropy": 1.7205273906389873, + "epoch": 1.6984702425091318, + "grad_norm": 0.7138111591339111, + "learning_rate": 3.0527535028427126e-06, + "loss": 1.4606, + "mean_token_accuracy": 0.644857699672381, + "num_tokens": 2594036578.0, + "step": 15461 + }, + { + "entropy": 1.7191713253657024, + "epoch": 1.698580099420505, + "grad_norm": 0.6800480484962463, + "learning_rate": 3.0520021844963326e-06, + "loss": 1.4163, + "mean_token_accuracy": 0.6557717521985372, + "num_tokens": 2594203729.0, + "step": 15462 + }, + { + "entropy": 1.6896109481652577, + "epoch": 1.6986899563318776, + "grad_norm": 0.7030267715454102, + "learning_rate": 3.051251117700502e-06, + "loss": 1.2521, + "mean_token_accuracy": 0.6746014902989069, + "num_tokens": 2594347487.0, + "step": 15463 + }, + { + "entropy": 1.7087683777014415, + "epoch": 1.6987998132432507, + "grad_norm": 0.6061800122261047, + "learning_rate": 3.05050030247899e-06, + "loss": 1.3607, + "mean_token_accuracy": 0.653743584950765, + "num_tokens": 2594543405.0, + "step": 15464 + }, + { + "entropy": 1.7524566849072774, + "epoch": 1.6989096701546236, + "grad_norm": 0.6522343754768372, + "learning_rate": 3.049749738855563e-06, + "loss": 1.4177, + "mean_token_accuracy": 0.6434753388166428, + "num_tokens": 2594716593.0, + "step": 15465 + }, + { + "entropy": 1.6446398794651031, + "epoch": 1.6990195270659965, + "grad_norm": 0.6932255625724792, + "learning_rate": 3.0489994268539746e-06, + "loss": 1.3285, + "mean_token_accuracy": 0.6653372198343277, + "num_tokens": 2594867263.0, + "step": 15466 + }, + { + "entropy": 1.7672787706057231, + "epoch": 1.6991293839773696, + "grad_norm": 0.8536300659179688, + "learning_rate": 3.048249366497971e-06, + "loss": 1.435, + "mean_token_accuracy": 0.645067016283671, + "num_tokens": 2595026533.0, + "step": 15467 + }, + { + "entropy": 1.7172228395938873, + "epoch": 1.6992392408887425, + "grad_norm": 0.5975283980369568, + "learning_rate": 3.0474995578112907e-06, + "loss": 1.4422, + "mean_token_accuracy": 0.6420785139004389, + "num_tokens": 2595192916.0, + "step": 15468 + }, + { + "entropy": 1.7480799158414204, + "epoch": 1.6993490978001153, + "grad_norm": 0.6868378520011902, + "learning_rate": 3.0467500008176656e-06, + "loss": 1.3149, + "mean_token_accuracy": 0.6674151619275411, + "num_tokens": 2595340344.0, + "step": 15469 + }, + { + "entropy": 1.7092045744260151, + "epoch": 1.6994589547114884, + "grad_norm": 0.6722932457923889, + "learning_rate": 3.0460006955408206e-06, + "loss": 1.5016, + "mean_token_accuracy": 0.638856107989947, + "num_tokens": 2595532316.0, + "step": 15470 + }, + { + "entropy": 1.698676884174347, + "epoch": 1.699568811622861, + "grad_norm": 0.739374041557312, + "learning_rate": 3.0452516420044685e-06, + "loss": 1.4562, + "mean_token_accuracy": 0.6575873990853628, + "num_tokens": 2595705984.0, + "step": 15471 + }, + { + "entropy": 1.7235010464986165, + "epoch": 1.6996786685342342, + "grad_norm": 0.6385025382041931, + "learning_rate": 3.044502840232318e-06, + "loss": 1.4149, + "mean_token_accuracy": 0.6463307837645212, + "num_tokens": 2595905802.0, + "step": 15472 + }, + { + "entropy": 1.6757484376430511, + "epoch": 1.699788525445607, + "grad_norm": 0.6640949845314026, + "learning_rate": 3.043754290248069e-06, + "loss": 1.4525, + "mean_token_accuracy": 0.6534897486368815, + "num_tokens": 2596076575.0, + "step": 15473 + }, + { + "entropy": 1.6671640475591023, + "epoch": 1.69989838235698, + "grad_norm": 0.6480453014373779, + "learning_rate": 3.0430059920754084e-06, + "loss": 1.3501, + "mean_token_accuracy": 0.6624239881833395, + "num_tokens": 2596216668.0, + "step": 15474 + }, + { + "entropy": 1.6163178483645122, + "epoch": 1.700008239268353, + "grad_norm": 0.6514328718185425, + "learning_rate": 3.042257945738025e-06, + "loss": 1.4128, + "mean_token_accuracy": 0.6655259480079015, + "num_tokens": 2596389965.0, + "step": 15475 + }, + { + "entropy": 1.69280410806338, + "epoch": 1.7001180961797258, + "grad_norm": 0.8175613284111023, + "learning_rate": 3.041510151259592e-06, + "loss": 1.2518, + "mean_token_accuracy": 0.6777097036441168, + "num_tokens": 2596514410.0, + "step": 15476 + }, + { + "entropy": 1.8059017360210419, + "epoch": 1.7002279530910989, + "grad_norm": 0.7695567607879639, + "learning_rate": 3.0407626086637753e-06, + "loss": 1.5005, + "mean_token_accuracy": 0.6499375601609548, + "num_tokens": 2596651139.0, + "step": 15477 + }, + { + "entropy": 1.6848878860473633, + "epoch": 1.7003378100024718, + "grad_norm": 0.7237509489059448, + "learning_rate": 3.0400153179742366e-06, + "loss": 1.3545, + "mean_token_accuracy": 0.6637776046991348, + "num_tokens": 2596813442.0, + "step": 15478 + }, + { + "entropy": 1.6869426270325978, + "epoch": 1.7004476669138446, + "grad_norm": 0.7502117156982422, + "learning_rate": 3.039268279214626e-06, + "loss": 1.4079, + "mean_token_accuracy": 0.6469310919443766, + "num_tokens": 2596981179.0, + "step": 15479 + }, + { + "entropy": 1.7178981204827626, + "epoch": 1.7005575238252177, + "grad_norm": 0.6466111540794373, + "learning_rate": 3.038521492408586e-06, + "loss": 1.5021, + "mean_token_accuracy": 0.6554898222287496, + "num_tokens": 2597147861.0, + "step": 15480 + }, + { + "entropy": 1.710584968328476, + "epoch": 1.7006673807365906, + "grad_norm": 0.720398485660553, + "learning_rate": 3.037774957579752e-06, + "loss": 1.5024, + "mean_token_accuracy": 0.635799452662468, + "num_tokens": 2597371591.0, + "step": 15481 + }, + { + "entropy": 1.6549212435881298, + "epoch": 1.7007772376479635, + "grad_norm": 0.6625518798828125, + "learning_rate": 3.0370286747517565e-06, + "loss": 1.3681, + "mean_token_accuracy": 0.6604462365309397, + "num_tokens": 2597518168.0, + "step": 15482 + }, + { + "entropy": 1.6826780637105305, + "epoch": 1.7008870945593366, + "grad_norm": 0.762800395488739, + "learning_rate": 3.036282643948214e-06, + "loss": 1.282, + "mean_token_accuracy": 0.6670280794302622, + "num_tokens": 2597659060.0, + "step": 15483 + }, + { + "entropy": 1.7338972091674805, + "epoch": 1.7009969514707093, + "grad_norm": 0.7537745833396912, + "learning_rate": 3.0355368651927354e-06, + "loss": 1.4775, + "mean_token_accuracy": 0.6528653750816981, + "num_tokens": 2597822951.0, + "step": 15484 + }, + { + "entropy": 1.7104704082012177, + "epoch": 1.7011068083820824, + "grad_norm": 0.6628887057304382, + "learning_rate": 3.034791338508929e-06, + "loss": 1.4674, + "mean_token_accuracy": 0.6388405313094457, + "num_tokens": 2597981401.0, + "step": 15485 + }, + { + "entropy": 1.6909295320510864, + "epoch": 1.7012166652934553, + "grad_norm": 0.8931862711906433, + "learning_rate": 3.034046063920385e-06, + "loss": 1.2176, + "mean_token_accuracy": 0.6916706810394923, + "num_tokens": 2598102875.0, + "step": 15486 + }, + { + "entropy": 1.7621839741865795, + "epoch": 1.7013265222048282, + "grad_norm": 0.719618022441864, + "learning_rate": 3.033301041450695e-06, + "loss": 1.3934, + "mean_token_accuracy": 0.6588169485330582, + "num_tokens": 2598256543.0, + "step": 15487 + }, + { + "entropy": 1.7612866361935933, + "epoch": 1.7014363791162013, + "grad_norm": 0.6616114377975464, + "learning_rate": 3.0325562711234367e-06, + "loss": 1.5589, + "mean_token_accuracy": 0.6347967982292175, + "num_tokens": 2598469802.0, + "step": 15488 + }, + { + "entropy": 1.6207097272078197, + "epoch": 1.701546236027574, + "grad_norm": 0.595119059085846, + "learning_rate": 3.0318117529621813e-06, + "loss": 1.2714, + "mean_token_accuracy": 0.6714903662602106, + "num_tokens": 2598639989.0, + "step": 15489 + }, + { + "entropy": 1.7276874681313832, + "epoch": 1.701656092938947, + "grad_norm": 0.6539283990859985, + "learning_rate": 3.031067486990495e-06, + "loss": 1.5024, + "mean_token_accuracy": 0.6353075504302979, + "num_tokens": 2598835997.0, + "step": 15490 + }, + { + "entropy": 1.7242399354775746, + "epoch": 1.70176594985032, + "grad_norm": 0.7293862104415894, + "learning_rate": 3.0303234732319324e-06, + "loss": 1.4393, + "mean_token_accuracy": 0.6572525550921758, + "num_tokens": 2598956041.0, + "step": 15491 + }, + { + "entropy": 1.696977545817693, + "epoch": 1.7018758067616928, + "grad_norm": 0.6066075563430786, + "learning_rate": 3.029579711710038e-06, + "loss": 1.3658, + "mean_token_accuracy": 0.666529655456543, + "num_tokens": 2599160795.0, + "step": 15492 + }, + { + "entropy": 1.6414269904295604, + "epoch": 1.701985663673066, + "grad_norm": 0.7428905367851257, + "learning_rate": 3.028836202448355e-06, + "loss": 1.0967, + "mean_token_accuracy": 0.6895180543263754, + "num_tokens": 2599345751.0, + "step": 15493 + }, + { + "entropy": 1.7395719190438588, + "epoch": 1.7020955205844388, + "grad_norm": 0.8325342535972595, + "learning_rate": 3.0280929454704154e-06, + "loss": 1.2534, + "mean_token_accuracy": 0.6692363371451696, + "num_tokens": 2599447366.0, + "step": 15494 + }, + { + "entropy": 1.7216653128465016, + "epoch": 1.7022053774958117, + "grad_norm": 0.9464581608772278, + "learning_rate": 3.0273499407997424e-06, + "loss": 1.5236, + "mean_token_accuracy": 0.6325125495592753, + "num_tokens": 2599645835.0, + "step": 15495 + }, + { + "entropy": 1.643006682395935, + "epoch": 1.7023152344071848, + "grad_norm": 0.6483979225158691, + "learning_rate": 3.0266071884598485e-06, + "loss": 1.2448, + "mean_token_accuracy": 0.6851489593585333, + "num_tokens": 2599820998.0, + "step": 15496 + }, + { + "entropy": 1.7206957936286926, + "epoch": 1.7024250913185575, + "grad_norm": 0.8952434062957764, + "learning_rate": 3.025864688474247e-06, + "loss": 1.3475, + "mean_token_accuracy": 0.6655914137760798, + "num_tokens": 2600007097.0, + "step": 15497 + }, + { + "entropy": 1.666352113087972, + "epoch": 1.7025349482299306, + "grad_norm": 0.7107973694801331, + "learning_rate": 3.0251224408664327e-06, + "loss": 1.4151, + "mean_token_accuracy": 0.6601972033580145, + "num_tokens": 2600256180.0, + "step": 15498 + }, + { + "entropy": 1.6548288067181904, + "epoch": 1.7026448051413035, + "grad_norm": 0.6338147521018982, + "learning_rate": 3.024380445659901e-06, + "loss": 1.358, + "mean_token_accuracy": 0.6660072356462479, + "num_tokens": 2600424528.0, + "step": 15499 + }, + { + "entropy": 1.7549506922562916, + "epoch": 1.7027546620526763, + "grad_norm": 0.6739881038665771, + "learning_rate": 3.023638702878135e-06, + "loss": 1.5014, + "mean_token_accuracy": 0.651511957248052, + "num_tokens": 2600600015.0, + "step": 15500 + }, + { + "entropy": 1.6789835790793102, + "epoch": 1.7028645189640494, + "grad_norm": 0.7053590416908264, + "learning_rate": 3.022897212544608e-06, + "loss": 1.4764, + "mean_token_accuracy": 0.6552553325891495, + "num_tokens": 2600761099.0, + "step": 15501 + }, + { + "entropy": 1.746220628420512, + "epoch": 1.702974375875422, + "grad_norm": 0.6821596026420593, + "learning_rate": 3.0221559746827905e-06, + "loss": 1.2714, + "mean_token_accuracy": 0.6693116724491119, + "num_tokens": 2600880919.0, + "step": 15502 + }, + { + "entropy": 1.6446532607078552, + "epoch": 1.7030842327867952, + "grad_norm": 0.7012965083122253, + "learning_rate": 3.021414989316143e-06, + "loss": 1.5149, + "mean_token_accuracy": 0.6544087131818136, + "num_tokens": 2601075802.0, + "step": 15503 + }, + { + "entropy": 1.6924720704555511, + "epoch": 1.703194089698168, + "grad_norm": 0.7682718634605408, + "learning_rate": 3.0206742564681123e-06, + "loss": 1.4444, + "mean_token_accuracy": 0.6571909934282303, + "num_tokens": 2601229320.0, + "step": 15504 + }, + { + "entropy": 1.759785145521164, + "epoch": 1.703303946609541, + "grad_norm": 0.7660031914710999, + "learning_rate": 3.0199337761621465e-06, + "loss": 1.502, + "mean_token_accuracy": 0.6575988878806432, + "num_tokens": 2601411092.0, + "step": 15505 + }, + { + "entropy": 1.6723263065020244, + "epoch": 1.703413803520914, + "grad_norm": 0.6515182256698608, + "learning_rate": 3.019193548421683e-06, + "loss": 1.3288, + "mean_token_accuracy": 0.6691017051537832, + "num_tokens": 2601564465.0, + "step": 15506 + }, + { + "entropy": 1.7310113807519276, + "epoch": 1.703523660432287, + "grad_norm": 0.6577731370925903, + "learning_rate": 3.0184535732701464e-06, + "loss": 1.3704, + "mean_token_accuracy": 0.6577199498812357, + "num_tokens": 2601736489.0, + "step": 15507 + }, + { + "entropy": 1.6878896454970043, + "epoch": 1.7036335173436599, + "grad_norm": 0.8926342129707336, + "learning_rate": 3.0177138507309572e-06, + "loss": 1.2761, + "mean_token_accuracy": 0.6754782150189081, + "num_tokens": 2601847021.0, + "step": 15508 + }, + { + "entropy": 1.669161597887675, + "epoch": 1.703743374255033, + "grad_norm": 0.7536756992340088, + "learning_rate": 3.0169743808275286e-06, + "loss": 1.5964, + "mean_token_accuracy": 0.6591108938058218, + "num_tokens": 2602004441.0, + "step": 15509 + }, + { + "entropy": 1.7129058440526326, + "epoch": 1.7038532311664056, + "grad_norm": 0.6124458909034729, + "learning_rate": 3.016235163583262e-06, + "loss": 1.3916, + "mean_token_accuracy": 0.6562560399373373, + "num_tokens": 2602159772.0, + "step": 15510 + }, + { + "entropy": 1.724676748116811, + "epoch": 1.7039630880777787, + "grad_norm": 0.598620593547821, + "learning_rate": 3.0154961990215575e-06, + "loss": 1.4405, + "mean_token_accuracy": 0.6402031729618708, + "num_tokens": 2602341730.0, + "step": 15511 + }, + { + "entropy": 1.70653834939003, + "epoch": 1.7040729449891516, + "grad_norm": 0.6428411602973938, + "learning_rate": 3.0147574871658e-06, + "loss": 1.381, + "mean_token_accuracy": 0.6533275147279104, + "num_tokens": 2602508258.0, + "step": 15512 + }, + { + "entropy": 1.739314079284668, + "epoch": 1.7041828019005245, + "grad_norm": 0.6420696973800659, + "learning_rate": 3.0140190280393666e-06, + "loss": 1.4793, + "mean_token_accuracy": 0.640799934665362, + "num_tokens": 2602689022.0, + "step": 15513 + }, + { + "entropy": 1.6944958964983623, + "epoch": 1.7042926588118976, + "grad_norm": 0.5888864398002625, + "learning_rate": 3.013280821665636e-06, + "loss": 1.5175, + "mean_token_accuracy": 0.6373367408911387, + "num_tokens": 2602903410.0, + "step": 15514 + }, + { + "entropy": 1.7060741186141968, + "epoch": 1.7044025157232703, + "grad_norm": 0.6964573264122009, + "learning_rate": 3.012542868067968e-06, + "loss": 1.4072, + "mean_token_accuracy": 0.6693576574325562, + "num_tokens": 2603052533.0, + "step": 15515 + }, + { + "entropy": 1.716521163781484, + "epoch": 1.7045123726346434, + "grad_norm": 0.6152582764625549, + "learning_rate": 3.0118051672697164e-06, + "loss": 1.3661, + "mean_token_accuracy": 0.6634813745816549, + "num_tokens": 2603221790.0, + "step": 15516 + }, + { + "entropy": 1.748351812362671, + "epoch": 1.7046222295460163, + "grad_norm": 0.6902201771736145, + "learning_rate": 3.011067719294233e-06, + "loss": 1.3842, + "mean_token_accuracy": 0.6483493248621622, + "num_tokens": 2603365662.0, + "step": 15517 + }, + { + "entropy": 1.676300545533498, + "epoch": 1.7047320864573892, + "grad_norm": 0.7123953104019165, + "learning_rate": 3.010330524164857e-06, + "loss": 1.4256, + "mean_token_accuracy": 0.6634288628896078, + "num_tokens": 2603506810.0, + "step": 15518 + }, + { + "entropy": 1.7149119873841603, + "epoch": 1.7048419433687623, + "grad_norm": 0.6167012453079224, + "learning_rate": 3.0095935819049203e-06, + "loss": 1.3524, + "mean_token_accuracy": 0.6638331562280655, + "num_tokens": 2603657268.0, + "step": 15519 + }, + { + "entropy": 1.7451521356900532, + "epoch": 1.7049518002801352, + "grad_norm": 0.6651485562324524, + "learning_rate": 3.0088568925377444e-06, + "loss": 1.3648, + "mean_token_accuracy": 0.6588761260112127, + "num_tokens": 2603778482.0, + "step": 15520 + }, + { + "entropy": 1.7001748283704121, + "epoch": 1.705061657191508, + "grad_norm": 0.6478435397148132, + "learning_rate": 3.0081204560866482e-06, + "loss": 1.3306, + "mean_token_accuracy": 0.6700827330350876, + "num_tokens": 2603933217.0, + "step": 15521 + }, + { + "entropy": 1.6979700823624928, + "epoch": 1.7051715141028811, + "grad_norm": 0.5511773824691772, + "learning_rate": 3.007384272574939e-06, + "loss": 1.3932, + "mean_token_accuracy": 0.663392369945844, + "num_tokens": 2604155419.0, + "step": 15522 + }, + { + "entropy": 1.7112592458724976, + "epoch": 1.7052813710142538, + "grad_norm": 0.8108544945716858, + "learning_rate": 3.0066483420259145e-06, + "loss": 1.4776, + "mean_token_accuracy": 0.6554440756638845, + "num_tokens": 2604343465.0, + "step": 15523 + }, + { + "entropy": 1.7090557316939037, + "epoch": 1.705391227925627, + "grad_norm": 0.650626540184021, + "learning_rate": 3.005912664462869e-06, + "loss": 1.2697, + "mean_token_accuracy": 0.6703123350938162, + "num_tokens": 2604482670.0, + "step": 15524 + }, + { + "entropy": 1.7024895350138347, + "epoch": 1.7055010848369998, + "grad_norm": 0.6583566665649414, + "learning_rate": 3.0051772399090838e-06, + "loss": 1.4045, + "mean_token_accuracy": 0.6468039005994797, + "num_tokens": 2604665042.0, + "step": 15525 + }, + { + "entropy": 1.709368646144867, + "epoch": 1.7056109417483727, + "grad_norm": 0.6365029215812683, + "learning_rate": 3.0044420683878387e-06, + "loss": 1.5439, + "mean_token_accuracy": 0.6336111923058828, + "num_tokens": 2604893566.0, + "step": 15526 + }, + { + "entropy": 1.731093277533849, + "epoch": 1.7057207986597458, + "grad_norm": 0.6655638813972473, + "learning_rate": 3.003707149922398e-06, + "loss": 1.3796, + "mean_token_accuracy": 0.6612533827622732, + "num_tokens": 2605048388.0, + "step": 15527 + }, + { + "entropy": 1.6967615683873494, + "epoch": 1.7058306555711185, + "grad_norm": 0.8625466823577881, + "learning_rate": 3.002972484536022e-06, + "loss": 1.4847, + "mean_token_accuracy": 0.6741051077842712, + "num_tokens": 2605202061.0, + "step": 15528 + }, + { + "entropy": 1.6434633831183116, + "epoch": 1.7059405124824916, + "grad_norm": 0.6478009223937988, + "learning_rate": 3.002238072251965e-06, + "loss": 1.4601, + "mean_token_accuracy": 0.6438992669185003, + "num_tokens": 2605404372.0, + "step": 15529 + }, + { + "entropy": 1.7079529066880543, + "epoch": 1.7060503693938645, + "grad_norm": 0.7610868811607361, + "learning_rate": 3.001503913093468e-06, + "loss": 1.3129, + "mean_token_accuracy": 0.6602237820625305, + "num_tokens": 2605566223.0, + "step": 15530 + }, + { + "entropy": 1.7487321893374126, + "epoch": 1.7061602263052373, + "grad_norm": 0.6044963002204895, + "learning_rate": 3.0007700070837697e-06, + "loss": 1.3794, + "mean_token_accuracy": 0.6575321207443873, + "num_tokens": 2605735404.0, + "step": 15531 + }, + { + "entropy": 1.6425415972868602, + "epoch": 1.7062700832166104, + "grad_norm": 0.6396393775939941, + "learning_rate": 3.0000363542460953e-06, + "loss": 1.4136, + "mean_token_accuracy": 0.6657944619655609, + "num_tokens": 2605976420.0, + "step": 15532 + }, + { + "entropy": 1.7656619250774384, + "epoch": 1.7063799401279833, + "grad_norm": 0.7868967652320862, + "learning_rate": 2.999302954603664e-06, + "loss": 1.5443, + "mean_token_accuracy": 0.6374485790729523, + "num_tokens": 2606105556.0, + "step": 15533 + }, + { + "entropy": 1.7477157612641652, + "epoch": 1.7064897970393562, + "grad_norm": 0.7586115002632141, + "learning_rate": 2.9985698081796897e-06, + "loss": 1.4364, + "mean_token_accuracy": 0.6489766389131546, + "num_tokens": 2606253316.0, + "step": 15534 + }, + { + "entropy": 1.7553973694642384, + "epoch": 1.7065996539507293, + "grad_norm": 0.7467787861824036, + "learning_rate": 2.9978369149973773e-06, + "loss": 1.334, + "mean_token_accuracy": 0.6667229930559794, + "num_tokens": 2606371755.0, + "step": 15535 + }, + { + "entropy": 1.7056459089120228, + "epoch": 1.706709510862102, + "grad_norm": 0.6380571126937866, + "learning_rate": 2.997104275079918e-06, + "loss": 1.4379, + "mean_token_accuracy": 0.6464936286211014, + "num_tokens": 2606548341.0, + "step": 15536 + }, + { + "entropy": 1.7088016072909038, + "epoch": 1.706819367773475, + "grad_norm": 0.8023338913917542, + "learning_rate": 2.996371888450502e-06, + "loss": 1.421, + "mean_token_accuracy": 0.6561804662148157, + "num_tokens": 2606692621.0, + "step": 15537 + }, + { + "entropy": 1.7052049537499745, + "epoch": 1.706929224684848, + "grad_norm": 0.6686699390411377, + "learning_rate": 2.9956397551323113e-06, + "loss": 1.4714, + "mean_token_accuracy": 0.6560079008340836, + "num_tokens": 2606846788.0, + "step": 15538 + }, + { + "entropy": 1.7651503086090088, + "epoch": 1.7070390815962209, + "grad_norm": 0.6589861512184143, + "learning_rate": 2.9949078751485156e-06, + "loss": 1.5128, + "mean_token_accuracy": 0.640378495057424, + "num_tokens": 2607021409.0, + "step": 15539 + }, + { + "entropy": 1.69097700715065, + "epoch": 1.707148938507594, + "grad_norm": 0.5823447704315186, + "learning_rate": 2.9941762485222766e-06, + "loss": 1.3412, + "mean_token_accuracy": 0.662921796242396, + "num_tokens": 2607189347.0, + "step": 15540 + }, + { + "entropy": 1.7773426473140717, + "epoch": 1.7072587954189666, + "grad_norm": 0.6733897924423218, + "learning_rate": 2.993444875276753e-06, + "loss": 1.4578, + "mean_token_accuracy": 0.642360677321752, + "num_tokens": 2607339073.0, + "step": 15541 + }, + { + "entropy": 1.7012966771920521, + "epoch": 1.7073686523303397, + "grad_norm": 0.6776845455169678, + "learning_rate": 2.99271375543509e-06, + "loss": 1.3251, + "mean_token_accuracy": 0.6664983431498209, + "num_tokens": 2607490579.0, + "step": 15542 + }, + { + "entropy": 1.7195370694001515, + "epoch": 1.7074785092417126, + "grad_norm": 0.6434686779975891, + "learning_rate": 2.99198288902043e-06, + "loss": 1.3383, + "mean_token_accuracy": 0.6653375178575516, + "num_tokens": 2607646549.0, + "step": 15543 + }, + { + "entropy": 1.7077328364054363, + "epoch": 1.7075883661530855, + "grad_norm": 0.6161625385284424, + "learning_rate": 2.991252276055903e-06, + "loss": 1.3273, + "mean_token_accuracy": 0.6651216745376587, + "num_tokens": 2607792694.0, + "step": 15544 + }, + { + "entropy": 1.6930580735206604, + "epoch": 1.7076982230644586, + "grad_norm": 0.8237130045890808, + "learning_rate": 2.9905219165646316e-06, + "loss": 1.2765, + "mean_token_accuracy": 0.6694959203402201, + "num_tokens": 2607974833.0, + "step": 15545 + }, + { + "entropy": 1.7149873475233715, + "epoch": 1.7078080799758315, + "grad_norm": 0.7254765629768372, + "learning_rate": 2.989791810569734e-06, + "loss": 1.453, + "mean_token_accuracy": 0.6517289926608404, + "num_tokens": 2608130283.0, + "step": 15546 + }, + { + "entropy": 1.6859596868356068, + "epoch": 1.7079179368872044, + "grad_norm": 0.731508195400238, + "learning_rate": 2.989061958094316e-06, + "loss": 1.4882, + "mean_token_accuracy": 0.6498477756977081, + "num_tokens": 2608311879.0, + "step": 15547 + }, + { + "entropy": 1.7175701260566711, + "epoch": 1.7080277937985775, + "grad_norm": 0.7343022227287292, + "learning_rate": 2.9883323591614746e-06, + "loss": 1.5166, + "mean_token_accuracy": 0.6450098951657613, + "num_tokens": 2608528935.0, + "step": 15548 + }, + { + "entropy": 1.6760614514350891, + "epoch": 1.7081376507099502, + "grad_norm": 0.7836624383926392, + "learning_rate": 2.9876030137943045e-06, + "loss": 1.2276, + "mean_token_accuracy": 0.6732871532440186, + "num_tokens": 2608638073.0, + "step": 15549 + }, + { + "entropy": 1.733736475308736, + "epoch": 1.7082475076213233, + "grad_norm": 0.6803503036499023, + "learning_rate": 2.986873922015891e-06, + "loss": 1.3603, + "mean_token_accuracy": 0.6762413680553436, + "num_tokens": 2608784234.0, + "step": 15550 + }, + { + "entropy": 1.7393794854482014, + "epoch": 1.7083573645326962, + "grad_norm": 0.824210524559021, + "learning_rate": 2.9861450838493054e-06, + "loss": 1.3731, + "mean_token_accuracy": 0.6584658722082773, + "num_tokens": 2608916469.0, + "step": 15551 + }, + { + "entropy": 1.6854054033756256, + "epoch": 1.708467221444069, + "grad_norm": 0.728629469871521, + "learning_rate": 2.985416499317616e-06, + "loss": 1.2944, + "mean_token_accuracy": 0.6629828413327535, + "num_tokens": 2609072203.0, + "step": 15552 + }, + { + "entropy": 1.6955331861972809, + "epoch": 1.7085770783554421, + "grad_norm": 0.6643010377883911, + "learning_rate": 2.9846881684438853e-06, + "loss": 1.3922, + "mean_token_accuracy": 0.65622046093146, + "num_tokens": 2609271094.0, + "step": 15553 + }, + { + "entropy": 1.6978692213694255, + "epoch": 1.7086869352668148, + "grad_norm": 0.7663952708244324, + "learning_rate": 2.983960091251159e-06, + "loss": 1.3997, + "mean_token_accuracy": 0.6570919106403986, + "num_tokens": 2609403906.0, + "step": 15554 + }, + { + "entropy": 1.7347622215747833, + "epoch": 1.708796792178188, + "grad_norm": 0.6088001728057861, + "learning_rate": 2.9832322677624875e-06, + "loss": 1.5643, + "mean_token_accuracy": 0.6266407817602158, + "num_tokens": 2609617624.0, + "step": 15555 + }, + { + "entropy": 1.7591754694779713, + "epoch": 1.7089066490895608, + "grad_norm": 0.6738680601119995, + "learning_rate": 2.9825046980009005e-06, + "loss": 1.5579, + "mean_token_accuracy": 0.6592029680808386, + "num_tokens": 2609781843.0, + "step": 15556 + }, + { + "entropy": 1.7426554759343464, + "epoch": 1.7090165060009337, + "grad_norm": 0.7468079924583435, + "learning_rate": 2.981777381989426e-06, + "loss": 1.4327, + "mean_token_accuracy": 0.6514635235071182, + "num_tokens": 2609965574.0, + "step": 15557 + }, + { + "entropy": 1.7027061482270558, + "epoch": 1.7091263629123068, + "grad_norm": 0.6666781306266785, + "learning_rate": 2.9810503197510866e-06, + "loss": 1.4226, + "mean_token_accuracy": 0.6546925703684489, + "num_tokens": 2610133359.0, + "step": 15558 + }, + { + "entropy": 1.7047854562600453, + "epoch": 1.7092362198236797, + "grad_norm": 0.7272503972053528, + "learning_rate": 2.9803235113088904e-06, + "loss": 1.3809, + "mean_token_accuracy": 0.6511211693286896, + "num_tokens": 2610271326.0, + "step": 15559 + }, + { + "entropy": 1.664241353670756, + "epoch": 1.7093460767350526, + "grad_norm": 0.7079517245292664, + "learning_rate": 2.9795969566858394e-06, + "loss": 1.4665, + "mean_token_accuracy": 0.6483618170022964, + "num_tokens": 2610493434.0, + "step": 15560 + }, + { + "entropy": 1.7043645282586415, + "epoch": 1.7094559336464257, + "grad_norm": 0.7159388661384583, + "learning_rate": 2.9788706559049305e-06, + "loss": 1.1727, + "mean_token_accuracy": 0.683139776190122, + "num_tokens": 2610632343.0, + "step": 15561 + }, + { + "entropy": 1.7370853920777638, + "epoch": 1.7095657905577983, + "grad_norm": 0.5975568890571594, + "learning_rate": 2.978144608989154e-06, + "loss": 1.285, + "mean_token_accuracy": 0.6826331615447998, + "num_tokens": 2610789458.0, + "step": 15562 + }, + { + "entropy": 1.7373074094454448, + "epoch": 1.7096756474691714, + "grad_norm": 0.6020911931991577, + "learning_rate": 2.9774188159614847e-06, + "loss": 1.4642, + "mean_token_accuracy": 0.6399503002564112, + "num_tokens": 2610971836.0, + "step": 15563 + }, + { + "entropy": 1.7091583808263142, + "epoch": 1.7097855043805443, + "grad_norm": 0.8372467160224915, + "learning_rate": 2.9766932768448937e-06, + "loss": 1.404, + "mean_token_accuracy": 0.6612410992383957, + "num_tokens": 2611119136.0, + "step": 15564 + }, + { + "entropy": 1.7217505673567455, + "epoch": 1.7098953612919172, + "grad_norm": 0.8266700506210327, + "learning_rate": 2.9759679916623463e-06, + "loss": 1.2311, + "mean_token_accuracy": 0.6795663088560104, + "num_tokens": 2611227488.0, + "step": 15565 + }, + { + "entropy": 1.6986558934052784, + "epoch": 1.7100052182032903, + "grad_norm": 0.6963996887207031, + "learning_rate": 2.9752429604367945e-06, + "loss": 1.5835, + "mean_token_accuracy": 0.6428253799676895, + "num_tokens": 2611427414.0, + "step": 15566 + }, + { + "entropy": 1.6627737681070964, + "epoch": 1.710115075114663, + "grad_norm": 0.621704638004303, + "learning_rate": 2.9745181831911894e-06, + "loss": 1.4027, + "mean_token_accuracy": 0.6542961647113165, + "num_tokens": 2611589095.0, + "step": 15567 + }, + { + "entropy": 1.6755984326203663, + "epoch": 1.710224932026036, + "grad_norm": 0.7194933295249939, + "learning_rate": 2.973793659948466e-06, + "loss": 1.5311, + "mean_token_accuracy": 0.6462205847104391, + "num_tokens": 2611798545.0, + "step": 15568 + }, + { + "entropy": 1.7322870294253032, + "epoch": 1.710334788937409, + "grad_norm": 0.6190232038497925, + "learning_rate": 2.9730693907315566e-06, + "loss": 1.6186, + "mean_token_accuracy": 0.6203610102335612, + "num_tokens": 2612029911.0, + "step": 15569 + }, + { + "entropy": 1.7027688721815746, + "epoch": 1.7104446458487819, + "grad_norm": 0.6538522839546204, + "learning_rate": 2.9723453755633848e-06, + "loss": 1.4993, + "mean_token_accuracy": 0.6435906638701757, + "num_tokens": 2612189497.0, + "step": 15570 + }, + { + "entropy": 1.7072203656037648, + "epoch": 1.710554502760155, + "grad_norm": 0.6151244044303894, + "learning_rate": 2.9716216144668654e-06, + "loss": 1.4434, + "mean_token_accuracy": 0.6375455409288406, + "num_tokens": 2612351230.0, + "step": 15571 + }, + { + "entropy": 1.6858009199301403, + "epoch": 1.7106643596715279, + "grad_norm": 0.634488582611084, + "learning_rate": 2.9708981074649e-06, + "loss": 1.3779, + "mean_token_accuracy": 0.6542573670546213, + "num_tokens": 2612501804.0, + "step": 15572 + }, + { + "entropy": 1.6959696511427562, + "epoch": 1.7107742165829007, + "grad_norm": 0.7555203437805176, + "learning_rate": 2.9701748545803938e-06, + "loss": 1.5933, + "mean_token_accuracy": 0.6376071472962698, + "num_tokens": 2612709277.0, + "step": 15573 + }, + { + "entropy": 1.6876679261525471, + "epoch": 1.7108840734942738, + "grad_norm": 0.8136647939682007, + "learning_rate": 2.9694518558362363e-06, + "loss": 1.505, + "mean_token_accuracy": 0.6470917736490568, + "num_tokens": 2612885448.0, + "step": 15574 + }, + { + "entropy": 1.7294893463452656, + "epoch": 1.7109939304056465, + "grad_norm": 0.6215494275093079, + "learning_rate": 2.968729111255309e-06, + "loss": 1.4703, + "mean_token_accuracy": 0.6409442375103632, + "num_tokens": 2613096534.0, + "step": 15575 + }, + { + "entropy": 1.7100212673346202, + "epoch": 1.7111037873170196, + "grad_norm": 0.7254142761230469, + "learning_rate": 2.968006620860485e-06, + "loss": 1.4299, + "mean_token_accuracy": 0.6658960854013761, + "num_tokens": 2613248348.0, + "step": 15576 + }, + { + "entropy": 1.7584032714366913, + "epoch": 1.7112136442283925, + "grad_norm": 0.7717592120170593, + "learning_rate": 2.9672843846746326e-06, + "loss": 1.4585, + "mean_token_accuracy": 0.6663567970196406, + "num_tokens": 2613406761.0, + "step": 15577 + }, + { + "entropy": 1.719919741153717, + "epoch": 1.7113235011397654, + "grad_norm": 0.6627902388572693, + "learning_rate": 2.966562402720609e-06, + "loss": 1.4896, + "mean_token_accuracy": 0.6469202389319738, + "num_tokens": 2613583879.0, + "step": 15578 + }, + { + "entropy": 1.6661270360151927, + "epoch": 1.7114333580511385, + "grad_norm": 0.7028049230575562, + "learning_rate": 2.9658406750212664e-06, + "loss": 1.4709, + "mean_token_accuracy": 0.6556438406308492, + "num_tokens": 2613730637.0, + "step": 15579 + }, + { + "entropy": 1.7086673080921173, + "epoch": 1.7115432149625114, + "grad_norm": 0.648152232170105, + "learning_rate": 2.965119201599447e-06, + "loss": 1.3956, + "mean_token_accuracy": 0.6587710777918497, + "num_tokens": 2613881692.0, + "step": 15580 + }, + { + "entropy": 1.664686808983485, + "epoch": 1.7116530718738843, + "grad_norm": 0.63326096534729, + "learning_rate": 2.964397982477983e-06, + "loss": 1.2859, + "mean_token_accuracy": 0.6708424985408783, + "num_tokens": 2614039965.0, + "step": 15581 + }, + { + "entropy": 1.705962876478831, + "epoch": 1.7117629287852572, + "grad_norm": 0.6361053586006165, + "learning_rate": 2.963677017679702e-06, + "loss": 1.3926, + "mean_token_accuracy": 0.6527653137842814, + "num_tokens": 2614228171.0, + "step": 15582 + }, + { + "entropy": 1.6903795500596364, + "epoch": 1.71187278569663, + "grad_norm": 0.8894221186637878, + "learning_rate": 2.962956307227423e-06, + "loss": 1.2964, + "mean_token_accuracy": 0.661086842417717, + "num_tokens": 2614404823.0, + "step": 15583 + }, + { + "entropy": 1.7170774539311726, + "epoch": 1.7119826426080031, + "grad_norm": 0.5786034464836121, + "learning_rate": 2.962235851143955e-06, + "loss": 1.4699, + "mean_token_accuracy": 0.6363462110360464, + "num_tokens": 2614608986.0, + "step": 15584 + }, + { + "entropy": 1.700467934211095, + "epoch": 1.712092499519376, + "grad_norm": 0.7319923639297485, + "learning_rate": 2.9615156494520973e-06, + "loss": 1.507, + "mean_token_accuracy": 0.6545891861120859, + "num_tokens": 2614767392.0, + "step": 15585 + }, + { + "entropy": 1.6899384955565135, + "epoch": 1.712202356430749, + "grad_norm": 0.800560474395752, + "learning_rate": 2.9607957021746514e-06, + "loss": 1.2687, + "mean_token_accuracy": 0.6748671482006708, + "num_tokens": 2614895120.0, + "step": 15586 + }, + { + "entropy": 1.6918248236179352, + "epoch": 1.712312213342122, + "grad_norm": 0.7248471975326538, + "learning_rate": 2.9600760093343984e-06, + "loss": 1.3761, + "mean_token_accuracy": 0.6653676678737005, + "num_tokens": 2615067944.0, + "step": 15587 + }, + { + "entropy": 1.684990406036377, + "epoch": 1.7124220702534947, + "grad_norm": 0.6657690405845642, + "learning_rate": 2.959356570954116e-06, + "loss": 1.3267, + "mean_token_accuracy": 0.6671634962161382, + "num_tokens": 2615221322.0, + "step": 15588 + }, + { + "entropy": 1.726973295211792, + "epoch": 1.7125319271648678, + "grad_norm": 0.6965436935424805, + "learning_rate": 2.9586373870565743e-06, + "loss": 1.2313, + "mean_token_accuracy": 0.6775472164154053, + "num_tokens": 2615371929.0, + "step": 15589 + }, + { + "entropy": 1.7516135772069295, + "epoch": 1.7126417840762407, + "grad_norm": 0.6991093754768372, + "learning_rate": 2.9579184576645346e-06, + "loss": 1.3075, + "mean_token_accuracy": 0.6707476228475571, + "num_tokens": 2615543141.0, + "step": 15590 + }, + { + "entropy": 1.7394656638304393, + "epoch": 1.7127516409876136, + "grad_norm": 0.6370988488197327, + "learning_rate": 2.9571997828007567e-06, + "loss": 1.5721, + "mean_token_accuracy": 0.6540063172578812, + "num_tokens": 2615728924.0, + "step": 15591 + }, + { + "entropy": 1.6763994693756104, + "epoch": 1.7128614978989867, + "grad_norm": 0.6898596882820129, + "learning_rate": 2.956481362487977e-06, + "loss": 1.3456, + "mean_token_accuracy": 0.6678059051434199, + "num_tokens": 2615892674.0, + "step": 15592 + }, + { + "entropy": 1.7547740538914998, + "epoch": 1.7129713548103596, + "grad_norm": 0.7486645579338074, + "learning_rate": 2.9557631967489377e-06, + "loss": 1.3792, + "mean_token_accuracy": 0.6506419479846954, + "num_tokens": 2616041776.0, + "step": 15593 + }, + { + "entropy": 1.6951400637626648, + "epoch": 1.7130812117217324, + "grad_norm": 0.6499601602554321, + "learning_rate": 2.9550452856063705e-06, + "loss": 1.4844, + "mean_token_accuracy": 0.6406375219424566, + "num_tokens": 2616214747.0, + "step": 15594 + }, + { + "entropy": 1.7358328998088837, + "epoch": 1.7131910686331053, + "grad_norm": 0.7066434621810913, + "learning_rate": 2.954327629082995e-06, + "loss": 1.425, + "mean_token_accuracy": 0.6705079823732376, + "num_tokens": 2616327828.0, + "step": 15595 + }, + { + "entropy": 1.727910081545512, + "epoch": 1.7133009255444782, + "grad_norm": 0.6480644941329956, + "learning_rate": 2.953610227201522e-06, + "loss": 1.5293, + "mean_token_accuracy": 0.6270461082458496, + "num_tokens": 2616497708.0, + "step": 15596 + }, + { + "entropy": 1.6446092625459034, + "epoch": 1.7134107824558513, + "grad_norm": 0.9010108709335327, + "learning_rate": 2.9528930799846624e-06, + "loss": 1.3272, + "mean_token_accuracy": 0.6766605178515116, + "num_tokens": 2616622349.0, + "step": 15597 + }, + { + "entropy": 1.6904981931050618, + "epoch": 1.7135206393672242, + "grad_norm": 0.6076183915138245, + "learning_rate": 2.9521761874551074e-06, + "loss": 1.4624, + "mean_token_accuracy": 0.6577340712149938, + "num_tokens": 2616786261.0, + "step": 15598 + }, + { + "entropy": 1.6667085389296215, + "epoch": 1.713630496278597, + "grad_norm": 0.7304791808128357, + "learning_rate": 2.951459549635553e-06, + "loss": 1.2399, + "mean_token_accuracy": 0.684608002503713, + "num_tokens": 2616919152.0, + "step": 15599 + }, + { + "entropy": 1.7170383930206299, + "epoch": 1.7137403531899702, + "grad_norm": 0.758103609085083, + "learning_rate": 2.9507431665486762e-06, + "loss": 1.3148, + "mean_token_accuracy": 0.6641736576954523, + "num_tokens": 2617016602.0, + "step": 15600 + }, + { + "entropy": 1.7205977539221446, + "epoch": 1.7138502101013429, + "grad_norm": 0.7272992730140686, + "learning_rate": 2.95002703821715e-06, + "loss": 1.4594, + "mean_token_accuracy": 0.6407827585935593, + "num_tokens": 2617200134.0, + "step": 15601 + }, + { + "entropy": 1.7511567175388336, + "epoch": 1.713960067012716, + "grad_norm": 0.6522664427757263, + "learning_rate": 2.949311164663642e-06, + "loss": 1.4678, + "mean_token_accuracy": 0.6563850492238998, + "num_tokens": 2617347308.0, + "step": 15602 + }, + { + "entropy": 1.7190465529759724, + "epoch": 1.7140699239240889, + "grad_norm": 0.7146872878074646, + "learning_rate": 2.948595545910807e-06, + "loss": 1.5173, + "mean_token_accuracy": 0.6501129815975825, + "num_tokens": 2617520468.0, + "step": 15603 + }, + { + "entropy": 1.7591931919256847, + "epoch": 1.7141797808354617, + "grad_norm": 0.7043587565422058, + "learning_rate": 2.947880181981295e-06, + "loss": 1.4204, + "mean_token_accuracy": 0.6521059771378835, + "num_tokens": 2617656762.0, + "step": 15604 + }, + { + "entropy": 1.713607559601466, + "epoch": 1.7142896377468348, + "grad_norm": 0.7444068193435669, + "learning_rate": 2.947165072897745e-06, + "loss": 1.4725, + "mean_token_accuracy": 0.6571368873119354, + "num_tokens": 2617811535.0, + "step": 15605 + }, + { + "entropy": 1.7211280067761738, + "epoch": 1.7143994946582077, + "grad_norm": 0.6797099709510803, + "learning_rate": 2.946450218682796e-06, + "loss": 1.4251, + "mean_token_accuracy": 0.6561200817426046, + "num_tokens": 2617969068.0, + "step": 15606 + }, + { + "entropy": 1.7104970415433247, + "epoch": 1.7145093515695806, + "grad_norm": 0.7477222084999084, + "learning_rate": 2.945735619359066e-06, + "loss": 1.4978, + "mean_token_accuracy": 0.6518939783175787, + "num_tokens": 2618122469.0, + "step": 15607 + }, + { + "entropy": 1.7389554679393768, + "epoch": 1.7146192084809535, + "grad_norm": 0.7334529757499695, + "learning_rate": 2.9450212749491737e-06, + "loss": 1.3035, + "mean_token_accuracy": 0.6685802390178045, + "num_tokens": 2618295553.0, + "step": 15608 + }, + { + "entropy": 1.6965291400750477, + "epoch": 1.7147290653923264, + "grad_norm": 0.725472092628479, + "learning_rate": 2.9443071854757297e-06, + "loss": 1.3944, + "mean_token_accuracy": 0.6754897187153498, + "num_tokens": 2618487620.0, + "step": 15609 + }, + { + "entropy": 1.7117928862571716, + "epoch": 1.7148389223036995, + "grad_norm": 1.0294393301010132, + "learning_rate": 2.9435933509613323e-06, + "loss": 1.2737, + "mean_token_accuracy": 0.6840305080016454, + "num_tokens": 2618628386.0, + "step": 15610 + }, + { + "entropy": 1.7260564068953197, + "epoch": 1.7149487792150724, + "grad_norm": 0.6792541742324829, + "learning_rate": 2.942879771428577e-06, + "loss": 1.3423, + "mean_token_accuracy": 0.6633341958125433, + "num_tokens": 2618816606.0, + "step": 15611 + }, + { + "entropy": 1.6906549831231434, + "epoch": 1.7150586361264453, + "grad_norm": 0.7536963224411011, + "learning_rate": 2.9421664469000454e-06, + "loss": 1.3189, + "mean_token_accuracy": 0.6605163216590881, + "num_tokens": 2618969345.0, + "step": 15612 + }, + { + "entropy": 1.705591360727946, + "epoch": 1.7151684930378184, + "grad_norm": 0.7592849731445312, + "learning_rate": 2.941453377398313e-06, + "loss": 1.3855, + "mean_token_accuracy": 0.6538231472174326, + "num_tokens": 2619169192.0, + "step": 15613 + }, + { + "entropy": 1.7342075407505035, + "epoch": 1.715278349949191, + "grad_norm": 0.7230466604232788, + "learning_rate": 2.9407405629459525e-06, + "loss": 1.2666, + "mean_token_accuracy": 0.6689763913551966, + "num_tokens": 2619273955.0, + "step": 15614 + }, + { + "entropy": 1.6978387037913005, + "epoch": 1.7153882068605641, + "grad_norm": 0.7770639657974243, + "learning_rate": 2.940028003565521e-06, + "loss": 1.3855, + "mean_token_accuracy": 0.6582835217316946, + "num_tokens": 2619478319.0, + "step": 15615 + }, + { + "entropy": 1.6785812576611836, + "epoch": 1.715498063771937, + "grad_norm": 0.6288565993309021, + "learning_rate": 2.939315699279569e-06, + "loss": 1.3499, + "mean_token_accuracy": 0.6569116910298666, + "num_tokens": 2619593713.0, + "step": 15616 + }, + { + "entropy": 1.7079376081625621, + "epoch": 1.71560792068331, + "grad_norm": 0.6645893454551697, + "learning_rate": 2.938603650110644e-06, + "loss": 1.3913, + "mean_token_accuracy": 0.6528747181097666, + "num_tokens": 2619768328.0, + "step": 15617 + }, + { + "entropy": 1.6401109794775646, + "epoch": 1.715717777594683, + "grad_norm": 0.5823392868041992, + "learning_rate": 2.9378918560812825e-06, + "loss": 1.386, + "mean_token_accuracy": 0.6663381606340408, + "num_tokens": 2619922330.0, + "step": 15618 + }, + { + "entropy": 1.6434422830740611, + "epoch": 1.715827634506056, + "grad_norm": 0.8188596963882446, + "learning_rate": 2.93718031721401e-06, + "loss": 1.2486, + "mean_token_accuracy": 0.6767540127038956, + "num_tokens": 2620087889.0, + "step": 15619 + }, + { + "entropy": 1.706367423137029, + "epoch": 1.7159374914174288, + "grad_norm": 0.766272246837616, + "learning_rate": 2.9364690335313463e-06, + "loss": 1.4292, + "mean_token_accuracy": 0.6471919765075048, + "num_tokens": 2620250444.0, + "step": 15620 + }, + { + "entropy": 1.6769792238871257, + "epoch": 1.7160473483288017, + "grad_norm": 0.6159754991531372, + "learning_rate": 2.935758005055806e-06, + "loss": 1.4807, + "mean_token_accuracy": 0.638146718343099, + "num_tokens": 2620434298.0, + "step": 15621 + }, + { + "entropy": 1.6449984113375347, + "epoch": 1.7161572052401746, + "grad_norm": 0.7580591440200806, + "learning_rate": 2.9350472318098886e-06, + "loss": 1.2516, + "mean_token_accuracy": 0.672856385509173, + "num_tokens": 2620564546.0, + "step": 15622 + }, + { + "entropy": 1.6625539064407349, + "epoch": 1.7162670621515477, + "grad_norm": 0.713958203792572, + "learning_rate": 2.9343367138160943e-06, + "loss": 1.3285, + "mean_token_accuracy": 0.6687121589978536, + "num_tokens": 2620712844.0, + "step": 15623 + }, + { + "entropy": 1.6688397228717804, + "epoch": 1.7163769190629206, + "grad_norm": 0.6362452507019043, + "learning_rate": 2.9336264510969083e-06, + "loss": 1.4256, + "mean_token_accuracy": 0.6454948534568151, + "num_tokens": 2620910175.0, + "step": 15624 + }, + { + "entropy": 1.734671155611674, + "epoch": 1.7164867759742934, + "grad_norm": 0.6450325846672058, + "learning_rate": 2.9329164436748086e-06, + "loss": 1.4168, + "mean_token_accuracy": 0.6560704112052917, + "num_tokens": 2621043013.0, + "step": 15625 + }, + { + "entropy": 1.7354978024959564, + "epoch": 1.7165966328856666, + "grad_norm": 0.7361391186714172, + "learning_rate": 2.9322066915722706e-06, + "loss": 1.4561, + "mean_token_accuracy": 0.6466700434684753, + "num_tokens": 2621186220.0, + "step": 15626 + }, + { + "entropy": 1.6472897231578827, + "epoch": 1.7167064897970392, + "grad_norm": 0.7093019485473633, + "learning_rate": 2.931497194811755e-06, + "loss": 1.2352, + "mean_token_accuracy": 0.6806353082259496, + "num_tokens": 2621304502.0, + "step": 15627 + }, + { + "entropy": 1.6753909885883331, + "epoch": 1.7168163467084123, + "grad_norm": 0.7245997786521912, + "learning_rate": 2.930787953415716e-06, + "loss": 1.4154, + "mean_token_accuracy": 0.6562784959872564, + "num_tokens": 2621490780.0, + "step": 15628 + }, + { + "entropy": 1.6912651062011719, + "epoch": 1.7169262036197852, + "grad_norm": 0.752405047416687, + "learning_rate": 2.9300789674066014e-06, + "loss": 1.4143, + "mean_token_accuracy": 0.6599976718425751, + "num_tokens": 2621649870.0, + "step": 15629 + }, + { + "entropy": 1.6908113261063893, + "epoch": 1.717036060531158, + "grad_norm": 0.6509510278701782, + "learning_rate": 2.929370236806854e-06, + "loss": 1.3438, + "mean_token_accuracy": 0.663862998286883, + "num_tokens": 2621790608.0, + "step": 15630 + }, + { + "entropy": 1.6996460954348247, + "epoch": 1.7171459174425312, + "grad_norm": 0.6494753956794739, + "learning_rate": 2.9286617616389005e-06, + "loss": 1.3424, + "mean_token_accuracy": 0.6629728774229685, + "num_tokens": 2621924824.0, + "step": 15631 + }, + { + "entropy": 1.6819026172161102, + "epoch": 1.717255774353904, + "grad_norm": 0.8028758764266968, + "learning_rate": 2.9279535419251646e-06, + "loss": 1.5812, + "mean_token_accuracy": 0.636813203493754, + "num_tokens": 2622102501.0, + "step": 15632 + }, + { + "entropy": 1.7385485967000325, + "epoch": 1.717365631265277, + "grad_norm": 0.7110795378684998, + "learning_rate": 2.9272455776880632e-06, + "loss": 1.3385, + "mean_token_accuracy": 0.6732540826002756, + "num_tokens": 2622244211.0, + "step": 15633 + }, + { + "entropy": 1.6775075495243073, + "epoch": 1.71747548817665, + "grad_norm": 0.6379189491271973, + "learning_rate": 2.9265378689499995e-06, + "loss": 1.5725, + "mean_token_accuracy": 0.6385338008403778, + "num_tokens": 2622442665.0, + "step": 15634 + }, + { + "entropy": 1.654201736052831, + "epoch": 1.7175853450880227, + "grad_norm": 0.5896367430686951, + "learning_rate": 2.9258304157333763e-06, + "loss": 1.3321, + "mean_token_accuracy": 0.6641835123300552, + "num_tokens": 2622625441.0, + "step": 15635 + }, + { + "entropy": 1.7210518419742584, + "epoch": 1.7176952019993958, + "grad_norm": 0.657882809638977, + "learning_rate": 2.9251232180605822e-06, + "loss": 1.4463, + "mean_token_accuracy": 0.6423710286617279, + "num_tokens": 2622849920.0, + "step": 15636 + }, + { + "entropy": 1.6777076125144958, + "epoch": 1.7178050589107687, + "grad_norm": 0.6649149060249329, + "learning_rate": 2.9244162759539977e-06, + "loss": 1.4115, + "mean_token_accuracy": 0.6619361639022827, + "num_tokens": 2623045328.0, + "step": 15637 + }, + { + "entropy": 1.7354417145252228, + "epoch": 1.7179149158221416, + "grad_norm": 0.6765681505203247, + "learning_rate": 2.923709589436001e-06, + "loss": 1.3755, + "mean_token_accuracy": 0.664703369140625, + "num_tokens": 2623167633.0, + "step": 15638 + }, + { + "entropy": 1.7414036691188812, + "epoch": 1.7180247727335147, + "grad_norm": 0.7814067006111145, + "learning_rate": 2.9230031585289564e-06, + "loss": 1.4758, + "mean_token_accuracy": 0.6637212236722311, + "num_tokens": 2623339230.0, + "step": 15639 + }, + { + "entropy": 1.7026597261428833, + "epoch": 1.7181346296448874, + "grad_norm": 0.7521904110908508, + "learning_rate": 2.9222969832552205e-06, + "loss": 1.6311, + "mean_token_accuracy": 0.6423285851875941, + "num_tokens": 2623520275.0, + "step": 15640 + }, + { + "entropy": 1.7413414518038433, + "epoch": 1.7182444865562605, + "grad_norm": 0.6938111186027527, + "learning_rate": 2.9215910636371454e-06, + "loss": 1.4661, + "mean_token_accuracy": 0.6471091061830521, + "num_tokens": 2623733621.0, + "step": 15641 + }, + { + "entropy": 1.7483003437519073, + "epoch": 1.7183543434676334, + "grad_norm": 0.617083728313446, + "learning_rate": 2.920885399697074e-06, + "loss": 1.3296, + "mean_token_accuracy": 0.6537606020768484, + "num_tokens": 2623881830.0, + "step": 15642 + }, + { + "entropy": 1.65190593401591, + "epoch": 1.7184642003790063, + "grad_norm": 0.6645247936248779, + "learning_rate": 2.9201799914573397e-06, + "loss": 1.3003, + "mean_token_accuracy": 0.6721631934245428, + "num_tokens": 2624073705.0, + "step": 15643 + }, + { + "entropy": 1.7330858608086903, + "epoch": 1.7185740572903794, + "grad_norm": 0.6733470559120178, + "learning_rate": 2.919474838940266e-06, + "loss": 1.446, + "mean_token_accuracy": 0.6582320332527161, + "num_tokens": 2624229129.0, + "step": 15644 + }, + { + "entropy": 1.857384592294693, + "epoch": 1.7186839142017523, + "grad_norm": 0.659773588180542, + "learning_rate": 2.918769942168175e-06, + "loss": 1.4821, + "mean_token_accuracy": 0.6398710956176122, + "num_tokens": 2624382591.0, + "step": 15645 + }, + { + "entropy": 1.7804729243119557, + "epoch": 1.7187937711131251, + "grad_norm": 0.7186923027038574, + "learning_rate": 2.9180653011633718e-06, + "loss": 1.4907, + "mean_token_accuracy": 0.6509936352570852, + "num_tokens": 2624575510.0, + "step": 15646 + }, + { + "entropy": 1.740348070859909, + "epoch": 1.7189036280244983, + "grad_norm": 0.7147418856620789, + "learning_rate": 2.9173609159481623e-06, + "loss": 1.3815, + "mean_token_accuracy": 0.6621488879124323, + "num_tokens": 2624712724.0, + "step": 15647 + }, + { + "entropy": 1.6902793844540913, + "epoch": 1.719013484935871, + "grad_norm": 0.7417254447937012, + "learning_rate": 2.9166567865448354e-06, + "loss": 1.4179, + "mean_token_accuracy": 0.6671228508154551, + "num_tokens": 2624891490.0, + "step": 15648 + }, + { + "entropy": 1.7188851237297058, + "epoch": 1.719123341847244, + "grad_norm": 0.7302298545837402, + "learning_rate": 2.9159529129756786e-06, + "loss": 1.3433, + "mean_token_accuracy": 0.6725572695334753, + "num_tokens": 2625061436.0, + "step": 15649 + }, + { + "entropy": 1.7102177143096924, + "epoch": 1.719233198758617, + "grad_norm": 0.7004075050354004, + "learning_rate": 2.9152492952629705e-06, + "loss": 1.348, + "mean_token_accuracy": 0.6611120849847794, + "num_tokens": 2625204652.0, + "step": 15650 + }, + { + "entropy": 1.7715183695157368, + "epoch": 1.7193430556699898, + "grad_norm": 0.6727532148361206, + "learning_rate": 2.9145459334289793e-06, + "loss": 1.3649, + "mean_token_accuracy": 0.655080164472262, + "num_tokens": 2625372329.0, + "step": 15651 + }, + { + "entropy": 1.705493172009786, + "epoch": 1.719452912581363, + "grad_norm": 0.632786750793457, + "learning_rate": 2.913842827495964e-06, + "loss": 1.5373, + "mean_token_accuracy": 0.6413880536953608, + "num_tokens": 2625606863.0, + "step": 15652 + }, + { + "entropy": 1.6163564026355743, + "epoch": 1.7195627694927356, + "grad_norm": 0.6396132111549377, + "learning_rate": 2.9131399774861823e-06, + "loss": 1.3683, + "mean_token_accuracy": 0.6635381281375885, + "num_tokens": 2625790091.0, + "step": 15653 + }, + { + "entropy": 1.7325818141301472, + "epoch": 1.7196726264041087, + "grad_norm": 0.6826179623603821, + "learning_rate": 2.9124373834218733e-06, + "loss": 1.4846, + "mean_token_accuracy": 0.637732004125913, + "num_tokens": 2625965636.0, + "step": 15654 + }, + { + "entropy": 1.727001855770747, + "epoch": 1.7197824833154816, + "grad_norm": 0.7186253070831299, + "learning_rate": 2.9117350453252797e-06, + "loss": 1.3532, + "mean_token_accuracy": 0.6762463947137197, + "num_tokens": 2626105857.0, + "step": 15655 + }, + { + "entropy": 1.673914760351181, + "epoch": 1.7198923402268544, + "grad_norm": 0.7053311467170715, + "learning_rate": 2.9110329632186264e-06, + "loss": 1.4876, + "mean_token_accuracy": 0.6486860315004984, + "num_tokens": 2626261625.0, + "step": 15656 + }, + { + "entropy": 1.701512336730957, + "epoch": 1.7200021971382276, + "grad_norm": 0.5304204821586609, + "learning_rate": 2.9103311371241328e-06, + "loss": 1.4538, + "mean_token_accuracy": 0.6435932020346323, + "num_tokens": 2626484562.0, + "step": 15657 + }, + { + "entropy": 1.7574267089366913, + "epoch": 1.7201120540496004, + "grad_norm": 0.6676803231239319, + "learning_rate": 2.909629567064014e-06, + "loss": 1.3626, + "mean_token_accuracy": 0.6545713643232981, + "num_tokens": 2626646846.0, + "step": 15658 + }, + { + "entropy": 1.7169890503088634, + "epoch": 1.7202219109609733, + "grad_norm": 0.6293471455574036, + "learning_rate": 2.908928253060478e-06, + "loss": 1.4239, + "mean_token_accuracy": 0.6564174294471741, + "num_tokens": 2626829982.0, + "step": 15659 + }, + { + "entropy": 1.6367035309473674, + "epoch": 1.7203317678723464, + "grad_norm": 0.6502057909965515, + "learning_rate": 2.908227195135712e-06, + "loss": 1.4779, + "mean_token_accuracy": 0.6452774107456207, + "num_tokens": 2627025170.0, + "step": 15660 + }, + { + "entropy": 1.6846541166305542, + "epoch": 1.720441624783719, + "grad_norm": 0.842652440071106, + "learning_rate": 2.907526393311909e-06, + "loss": 1.4808, + "mean_token_accuracy": 0.6536863893270493, + "num_tokens": 2627194478.0, + "step": 15661 + }, + { + "entropy": 1.697983334461848, + "epoch": 1.7205514816950922, + "grad_norm": 0.6564600467681885, + "learning_rate": 2.906825847611252e-06, + "loss": 1.4753, + "mean_token_accuracy": 0.6554812788963318, + "num_tokens": 2627373566.0, + "step": 15662 + }, + { + "entropy": 1.7039678891499836, + "epoch": 1.720661338606465, + "grad_norm": 0.7069868445396423, + "learning_rate": 2.90612555805591e-06, + "loss": 1.3481, + "mean_token_accuracy": 0.6676451563835144, + "num_tokens": 2627564580.0, + "step": 15663 + }, + { + "entropy": 1.6884814302126567, + "epoch": 1.720771195517838, + "grad_norm": 0.6857156157493591, + "learning_rate": 2.905425524668044e-06, + "loss": 1.3325, + "mean_token_accuracy": 0.6730131804943085, + "num_tokens": 2627688092.0, + "step": 15664 + }, + { + "entropy": 1.7210382620493572, + "epoch": 1.720881052429211, + "grad_norm": 0.6751901507377625, + "learning_rate": 2.9047257474698155e-06, + "loss": 1.4386, + "mean_token_accuracy": 0.6408843944470087, + "num_tokens": 2627853159.0, + "step": 15665 + }, + { + "entropy": 1.6964699625968933, + "epoch": 1.7209909093405837, + "grad_norm": 0.7327737212181091, + "learning_rate": 2.9040262264833662e-06, + "loss": 1.3029, + "mean_token_accuracy": 0.6690873155991236, + "num_tokens": 2627964728.0, + "step": 15666 + }, + { + "entropy": 1.7300258974234264, + "epoch": 1.7211007662519568, + "grad_norm": 0.7446189522743225, + "learning_rate": 2.9033269617308417e-06, + "loss": 1.4907, + "mean_token_accuracy": 0.628525917728742, + "num_tokens": 2628197249.0, + "step": 15667 + }, + { + "entropy": 1.73529119292895, + "epoch": 1.7212106231633297, + "grad_norm": 0.618928074836731, + "learning_rate": 2.9026279532343702e-06, + "loss": 1.3959, + "mean_token_accuracy": 0.6569162358840307, + "num_tokens": 2628372683.0, + "step": 15668 + }, + { + "entropy": 1.7280128796895344, + "epoch": 1.7213204800747026, + "grad_norm": 0.6142230033874512, + "learning_rate": 2.9019292010160738e-06, + "loss": 1.2844, + "mean_token_accuracy": 0.6628156552712122, + "num_tokens": 2628508232.0, + "step": 15669 + }, + { + "entropy": 1.7332999805609386, + "epoch": 1.7214303369860757, + "grad_norm": 0.6877867579460144, + "learning_rate": 2.901230705098068e-06, + "loss": 1.3278, + "mean_token_accuracy": 0.6685926765203476, + "num_tokens": 2628666542.0, + "step": 15670 + }, + { + "entropy": 1.650905857483546, + "epoch": 1.7215401938974486, + "grad_norm": 0.7264050841331482, + "learning_rate": 2.9005324655024645e-06, + "loss": 1.1828, + "mean_token_accuracy": 0.6877222855885824, + "num_tokens": 2628793638.0, + "step": 15671 + }, + { + "entropy": 1.7235294878482819, + "epoch": 1.7216500508088215, + "grad_norm": 0.8157387375831604, + "learning_rate": 2.8998344822513563e-06, + "loss": 1.3123, + "mean_token_accuracy": 0.6598003009955088, + "num_tokens": 2628914826.0, + "step": 15672 + }, + { + "entropy": 1.7341767648855846, + "epoch": 1.7217599077201946, + "grad_norm": 0.645763635635376, + "learning_rate": 2.8991367553668364e-06, + "loss": 1.4893, + "mean_token_accuracy": 0.6368632217248281, + "num_tokens": 2629125796.0, + "step": 15673 + }, + { + "entropy": 1.7225966950257618, + "epoch": 1.7218697646315673, + "grad_norm": 0.6662589311599731, + "learning_rate": 2.89843928487099e-06, + "loss": 1.365, + "mean_token_accuracy": 0.6565856287876765, + "num_tokens": 2629318301.0, + "step": 15674 + }, + { + "entropy": 1.7104746202627819, + "epoch": 1.7219796215429404, + "grad_norm": 0.6441190242767334, + "learning_rate": 2.8977420707858896e-06, + "loss": 1.5115, + "mean_token_accuracy": 0.6391011476516724, + "num_tokens": 2629516989.0, + "step": 15675 + }, + { + "entropy": 1.580856482187907, + "epoch": 1.7220894784543133, + "grad_norm": 0.5921868681907654, + "learning_rate": 2.8970451131335987e-06, + "loss": 1.2911, + "mean_token_accuracy": 0.6746866156657537, + "num_tokens": 2629683168.0, + "step": 15676 + }, + { + "entropy": 1.6643680731455486, + "epoch": 1.7221993353656861, + "grad_norm": 0.5791040062904358, + "learning_rate": 2.8963484119361807e-06, + "loss": 1.36, + "mean_token_accuracy": 0.6568904668092728, + "num_tokens": 2629835992.0, + "step": 15677 + }, + { + "entropy": 1.7079329987366993, + "epoch": 1.7223091922770593, + "grad_norm": 0.6563646793365479, + "learning_rate": 2.895651967215683e-06, + "loss": 1.4994, + "mean_token_accuracy": 0.6590022444725037, + "num_tokens": 2630030474.0, + "step": 15678 + }, + { + "entropy": 1.6830817659695942, + "epoch": 1.722419049188432, + "grad_norm": 0.653887152671814, + "learning_rate": 2.8949557789941496e-06, + "loss": 1.4428, + "mean_token_accuracy": 0.6400194962819418, + "num_tokens": 2630253676.0, + "step": 15679 + }, + { + "entropy": 1.6895558039347331, + "epoch": 1.722528906099805, + "grad_norm": 0.6651843190193176, + "learning_rate": 2.894259847293614e-06, + "loss": 1.4028, + "mean_token_accuracy": 0.6565206199884415, + "num_tokens": 2630450982.0, + "step": 15680 + }, + { + "entropy": 1.671963373819987, + "epoch": 1.722638763011178, + "grad_norm": 0.667715847492218, + "learning_rate": 2.8935641721360997e-06, + "loss": 1.408, + "mean_token_accuracy": 0.6582218011220297, + "num_tokens": 2630606203.0, + "step": 15681 + }, + { + "entropy": 1.7464380860328674, + "epoch": 1.7227486199225508, + "grad_norm": 0.7251789569854736, + "learning_rate": 2.892868753543628e-06, + "loss": 1.4277, + "mean_token_accuracy": 0.6359966893990835, + "num_tokens": 2630771994.0, + "step": 15682 + }, + { + "entropy": 1.7634708881378174, + "epoch": 1.722858476833924, + "grad_norm": 0.84815913438797, + "learning_rate": 2.8921735915382077e-06, + "loss": 1.5548, + "mean_token_accuracy": 0.6365585227807363, + "num_tokens": 2630956170.0, + "step": 15683 + }, + { + "entropy": 1.7360106408596039, + "epoch": 1.7229683337452968, + "grad_norm": 0.76357102394104, + "learning_rate": 2.891478686141838e-06, + "loss": 1.3809, + "mean_token_accuracy": 0.6442870199680328, + "num_tokens": 2631129034.0, + "step": 15684 + }, + { + "entropy": 1.7115418414274852, + "epoch": 1.7230781906566697, + "grad_norm": 1.291322112083435, + "learning_rate": 2.890784037376514e-06, + "loss": 1.1999, + "mean_token_accuracy": 0.6683139950037003, + "num_tokens": 2631354553.0, + "step": 15685 + }, + { + "entropy": 1.7102199296156566, + "epoch": 1.7231880475680428, + "grad_norm": 0.6701323390007019, + "learning_rate": 2.8900896452642236e-06, + "loss": 1.2201, + "mean_token_accuracy": 0.6836532801389694, + "num_tokens": 2631473586.0, + "step": 15686 + }, + { + "entropy": 1.658743570248286, + "epoch": 1.7232979044794154, + "grad_norm": 0.7017245292663574, + "learning_rate": 2.8893955098269404e-06, + "loss": 1.1978, + "mean_token_accuracy": 0.6843846688667933, + "num_tokens": 2631581852.0, + "step": 15687 + }, + { + "entropy": 1.636157254378001, + "epoch": 1.7234077613907886, + "grad_norm": 0.6195902824401855, + "learning_rate": 2.888701631086633e-06, + "loss": 1.4816, + "mean_token_accuracy": 0.6684853285551071, + "num_tokens": 2631732814.0, + "step": 15688 + }, + { + "entropy": 1.7463614245255787, + "epoch": 1.7235176183021614, + "grad_norm": 0.6704382300376892, + "learning_rate": 2.888008009065266e-06, + "loss": 1.3736, + "mean_token_accuracy": 0.6524703949689865, + "num_tokens": 2631917876.0, + "step": 15689 + }, + { + "entropy": 1.7010905543963115, + "epoch": 1.7236274752135343, + "grad_norm": 0.5545864701271057, + "learning_rate": 2.8873146437847876e-06, + "loss": 1.4765, + "mean_token_accuracy": 0.638947606086731, + "num_tokens": 2632151477.0, + "step": 15690 + }, + { + "entropy": 1.7439953585465748, + "epoch": 1.7237373321249074, + "grad_norm": 0.8762944340705872, + "learning_rate": 2.8866215352671477e-06, + "loss": 1.3559, + "mean_token_accuracy": 0.6531753689050674, + "num_tokens": 2632332013.0, + "step": 15691 + }, + { + "entropy": 1.716113030910492, + "epoch": 1.72384718903628, + "grad_norm": 0.6564568281173706, + "learning_rate": 2.8859286835342793e-06, + "loss": 1.4302, + "mean_token_accuracy": 0.6436122357845306, + "num_tokens": 2632523258.0, + "step": 15692 + }, + { + "entropy": 1.7107328176498413, + "epoch": 1.7239570459476532, + "grad_norm": 0.6151549220085144, + "learning_rate": 2.885236088608111e-06, + "loss": 1.3062, + "mean_token_accuracy": 0.673832893371582, + "num_tokens": 2632670998.0, + "step": 15693 + }, + { + "entropy": 1.6899991532166798, + "epoch": 1.724066902859026, + "grad_norm": 0.6508071422576904, + "learning_rate": 2.8845437505105662e-06, + "loss": 1.3572, + "mean_token_accuracy": 0.6597887774308523, + "num_tokens": 2632825597.0, + "step": 15694 + }, + { + "entropy": 1.7111450632413228, + "epoch": 1.724176759770399, + "grad_norm": 0.6507290005683899, + "learning_rate": 2.883851669263554e-06, + "loss": 1.4109, + "mean_token_accuracy": 0.6530921260515848, + "num_tokens": 2633013026.0, + "step": 15695 + }, + { + "entropy": 1.7176082531611125, + "epoch": 1.724286616681772, + "grad_norm": 0.7525252103805542, + "learning_rate": 2.883159844888977e-06, + "loss": 1.3411, + "mean_token_accuracy": 0.6653605997562408, + "num_tokens": 2633157545.0, + "step": 15696 + }, + { + "entropy": 1.6218600273132324, + "epoch": 1.724396473593145, + "grad_norm": 0.6842386722564697, + "learning_rate": 2.8824682774087336e-06, + "loss": 1.2215, + "mean_token_accuracy": 0.6854247947533926, + "num_tokens": 2633332852.0, + "step": 15697 + }, + { + "entropy": 1.7413849532604218, + "epoch": 1.7245063305045178, + "grad_norm": 0.7053453326225281, + "learning_rate": 2.881776966844714e-06, + "loss": 1.3955, + "mean_token_accuracy": 0.6568620105584463, + "num_tokens": 2633497069.0, + "step": 15698 + }, + { + "entropy": 1.7630057732264202, + "epoch": 1.724616187415891, + "grad_norm": 0.7357109785079956, + "learning_rate": 2.881085913218794e-06, + "loss": 1.3177, + "mean_token_accuracy": 0.663982942700386, + "num_tokens": 2633664211.0, + "step": 15699 + }, + { + "entropy": 1.7018859187761943, + "epoch": 1.7247260443272636, + "grad_norm": 0.7314789295196533, + "learning_rate": 2.880395116552845e-06, + "loss": 1.3952, + "mean_token_accuracy": 0.6586244255304337, + "num_tokens": 2633847816.0, + "step": 15700 + }, + { + "entropy": 1.6985140244166057, + "epoch": 1.7248359012386367, + "grad_norm": 0.6651721596717834, + "learning_rate": 2.879704576868734e-06, + "loss": 1.3008, + "mean_token_accuracy": 0.672162319223086, + "num_tokens": 2634007895.0, + "step": 15701 + }, + { + "entropy": 1.6979452073574066, + "epoch": 1.7249457581500096, + "grad_norm": 0.6808459758758545, + "learning_rate": 2.8790142941883114e-06, + "loss": 1.4499, + "mean_token_accuracy": 0.6408154418071111, + "num_tokens": 2634259125.0, + "step": 15702 + }, + { + "entropy": 1.647172898054123, + "epoch": 1.7250556150613825, + "grad_norm": 0.6427389979362488, + "learning_rate": 2.87832426853343e-06, + "loss": 1.396, + "mean_token_accuracy": 0.6532814701398214, + "num_tokens": 2634408832.0, + "step": 15703 + }, + { + "entropy": 1.701138327519099, + "epoch": 1.7251654719727556, + "grad_norm": 0.7189123034477234, + "learning_rate": 2.8776344999259253e-06, + "loss": 1.5086, + "mean_token_accuracy": 0.637222687403361, + "num_tokens": 2634574740.0, + "step": 15704 + }, + { + "entropy": 1.7070972124735515, + "epoch": 1.7252753288841283, + "grad_norm": 0.6271539926528931, + "learning_rate": 2.876944988387626e-06, + "loss": 1.3537, + "mean_token_accuracy": 0.6601554602384567, + "num_tokens": 2634731896.0, + "step": 15705 + }, + { + "entropy": 1.653431475162506, + "epoch": 1.7253851857955014, + "grad_norm": 0.8477639555931091, + "learning_rate": 2.87625573394036e-06, + "loss": 1.3336, + "mean_token_accuracy": 0.6693581690390905, + "num_tokens": 2634853832.0, + "step": 15706 + }, + { + "entropy": 1.729806274175644, + "epoch": 1.7254950427068743, + "grad_norm": 0.6469348073005676, + "learning_rate": 2.8755667366059403e-06, + "loss": 1.5134, + "mean_token_accuracy": 0.650507057706515, + "num_tokens": 2635036350.0, + "step": 15707 + }, + { + "entropy": 1.692490776379903, + "epoch": 1.7256048996182471, + "grad_norm": 0.6189622282981873, + "learning_rate": 2.8748779964061697e-06, + "loss": 1.3822, + "mean_token_accuracy": 0.6697244842847189, + "num_tokens": 2635210409.0, + "step": 15708 + }, + { + "entropy": 1.7030345400174458, + "epoch": 1.7257147565296203, + "grad_norm": 0.6761047840118408, + "learning_rate": 2.8741895133628506e-06, + "loss": 1.2378, + "mean_token_accuracy": 0.6791710207859675, + "num_tokens": 2635326490.0, + "step": 15709 + }, + { + "entropy": 1.7439703047275543, + "epoch": 1.7258246134409931, + "grad_norm": 0.80422043800354, + "learning_rate": 2.873501287497771e-06, + "loss": 1.399, + "mean_token_accuracy": 0.6671061217784882, + "num_tokens": 2635455970.0, + "step": 15710 + }, + { + "entropy": 1.7496139506498973, + "epoch": 1.725934470352366, + "grad_norm": 0.6345416903495789, + "learning_rate": 2.8728133188327144e-06, + "loss": 1.4652, + "mean_token_accuracy": 0.6486029028892517, + "num_tokens": 2635665581.0, + "step": 15711 + }, + { + "entropy": 1.6556781927744548, + "epoch": 1.7260443272637391, + "grad_norm": 0.6797258257865906, + "learning_rate": 2.8721256073894554e-06, + "loss": 1.3187, + "mean_token_accuracy": 0.6713179250558218, + "num_tokens": 2635846577.0, + "step": 15712 + }, + { + "entropy": 1.7008158266544342, + "epoch": 1.7261541841751118, + "grad_norm": 0.8460964560508728, + "learning_rate": 2.8714381531897552e-06, + "loss": 1.4194, + "mean_token_accuracy": 0.662114754319191, + "num_tokens": 2636002590.0, + "step": 15713 + }, + { + "entropy": 1.6474326451619465, + "epoch": 1.726264041086485, + "grad_norm": 0.6482619047164917, + "learning_rate": 2.8707509562553754e-06, + "loss": 1.4091, + "mean_token_accuracy": 0.6451590359210968, + "num_tokens": 2636207089.0, + "step": 15714 + }, + { + "entropy": 1.7873999178409576, + "epoch": 1.7263738979978578, + "grad_norm": 0.7166942358016968, + "learning_rate": 2.8700640166080678e-06, + "loss": 1.4096, + "mean_token_accuracy": 0.6470801830291748, + "num_tokens": 2636356920.0, + "step": 15715 + }, + { + "entropy": 1.7664975921312969, + "epoch": 1.7264837549092307, + "grad_norm": 0.8807752728462219, + "learning_rate": 2.869377334269568e-06, + "loss": 1.3901, + "mean_token_accuracy": 0.6581128338972727, + "num_tokens": 2636495774.0, + "step": 15716 + }, + { + "entropy": 1.716484268506368, + "epoch": 1.7265936118206038, + "grad_norm": 0.6690646409988403, + "learning_rate": 2.868690909261611e-06, + "loss": 1.2976, + "mean_token_accuracy": 0.672653466463089, + "num_tokens": 2636615668.0, + "step": 15717 + }, + { + "entropy": 1.7186988592147827, + "epoch": 1.7267034687319764, + "grad_norm": 0.6803736090660095, + "learning_rate": 2.8680047416059255e-06, + "loss": 1.464, + "mean_token_accuracy": 0.6469851434230804, + "num_tokens": 2636766089.0, + "step": 15718 + }, + { + "entropy": 1.7901420791943867, + "epoch": 1.7268133256433496, + "grad_norm": 0.9188567399978638, + "learning_rate": 2.867318831324225e-06, + "loss": 1.7136, + "mean_token_accuracy": 0.6263786852359772, + "num_tokens": 2636910508.0, + "step": 15719 + }, + { + "entropy": 1.710059146086375, + "epoch": 1.7269231825547224, + "grad_norm": 0.7188782095909119, + "learning_rate": 2.8666331784382164e-06, + "loss": 1.4827, + "mean_token_accuracy": 0.6406730314095815, + "num_tokens": 2637071721.0, + "step": 15720 + }, + { + "entropy": 1.6877124905586243, + "epoch": 1.7270330394660953, + "grad_norm": 0.6510874629020691, + "learning_rate": 2.865947782969605e-06, + "loss": 1.3875, + "mean_token_accuracy": 0.6546644171079, + "num_tokens": 2637242150.0, + "step": 15721 + }, + { + "entropy": 1.7223760485649109, + "epoch": 1.7271428963774684, + "grad_norm": 0.713353157043457, + "learning_rate": 2.8652626449400794e-06, + "loss": 1.4072, + "mean_token_accuracy": 0.6563191761573156, + "num_tokens": 2637397453.0, + "step": 15722 + }, + { + "entropy": 1.687508871157964, + "epoch": 1.7272527532888413, + "grad_norm": 0.6655722856521606, + "learning_rate": 2.864577764371327e-06, + "loss": 1.5558, + "mean_token_accuracy": 0.6428915162881216, + "num_tokens": 2637576207.0, + "step": 15723 + }, + { + "entropy": 1.7700623273849487, + "epoch": 1.7273626102002142, + "grad_norm": 0.8303236365318298, + "learning_rate": 2.8638931412850226e-06, + "loss": 1.4077, + "mean_token_accuracy": 0.6632307171821594, + "num_tokens": 2637726614.0, + "step": 15724 + }, + { + "entropy": 1.7799305220444996, + "epoch": 1.7274724671115873, + "grad_norm": 0.8366572856903076, + "learning_rate": 2.8632087757028317e-06, + "loss": 1.5173, + "mean_token_accuracy": 0.6468792210022608, + "num_tokens": 2637888327.0, + "step": 15725 + }, + { + "entropy": 1.6914372245470684, + "epoch": 1.72758232402296, + "grad_norm": 0.5621321201324463, + "learning_rate": 2.862524667646417e-06, + "loss": 1.5679, + "mean_token_accuracy": 0.6358216305573782, + "num_tokens": 2638134217.0, + "step": 15726 + }, + { + "entropy": 1.700711299975713, + "epoch": 1.727692180934333, + "grad_norm": 0.7504645586013794, + "learning_rate": 2.861840817137433e-06, + "loss": 1.4857, + "mean_token_accuracy": 0.6607631792624792, + "num_tokens": 2638295434.0, + "step": 15727 + }, + { + "entropy": 1.7090523938337963, + "epoch": 1.727802037845706, + "grad_norm": 0.7692071795463562, + "learning_rate": 2.8611572241975167e-06, + "loss": 1.3441, + "mean_token_accuracy": 0.6551339974006017, + "num_tokens": 2638424528.0, + "step": 15728 + }, + { + "entropy": 1.7331831753253937, + "epoch": 1.7279118947570788, + "grad_norm": 0.5608404874801636, + "learning_rate": 2.8604738888483074e-06, + "loss": 1.4806, + "mean_token_accuracy": 0.6301528811454773, + "num_tokens": 2638711868.0, + "step": 15729 + }, + { + "entropy": 1.6656751334667206, + "epoch": 1.728021751668452, + "grad_norm": 0.6592541933059692, + "learning_rate": 2.8597908111114326e-06, + "loss": 1.402, + "mean_token_accuracy": 0.6527828822533289, + "num_tokens": 2638931324.0, + "step": 15730 + }, + { + "entropy": 1.6092142363389332, + "epoch": 1.7281316085798246, + "grad_norm": 0.6773507595062256, + "learning_rate": 2.8591079910085107e-06, + "loss": 1.2708, + "mean_token_accuracy": 0.678041805823644, + "num_tokens": 2639120616.0, + "step": 15731 + }, + { + "entropy": 1.6422138214111328, + "epoch": 1.7282414654911977, + "grad_norm": 0.594689130783081, + "learning_rate": 2.8584254285611512e-06, + "loss": 1.3481, + "mean_token_accuracy": 0.6701185554265976, + "num_tokens": 2639262161.0, + "step": 15732 + }, + { + "entropy": 1.7203916311264038, + "epoch": 1.7283513224025706, + "grad_norm": 0.7151592969894409, + "learning_rate": 2.8577431237909602e-06, + "loss": 1.349, + "mean_token_accuracy": 0.6614575286706289, + "num_tokens": 2639424805.0, + "step": 15733 + }, + { + "entropy": 1.806416392326355, + "epoch": 1.7284611793139435, + "grad_norm": 0.6949290037155151, + "learning_rate": 2.8570610767195274e-06, + "loss": 1.4947, + "mean_token_accuracy": 0.6405768990516663, + "num_tokens": 2639580707.0, + "step": 15734 + }, + { + "entropy": 1.6962719062964122, + "epoch": 1.7285710362253166, + "grad_norm": 0.6608180999755859, + "learning_rate": 2.8563792873684456e-06, + "loss": 1.2491, + "mean_token_accuracy": 0.6842715740203857, + "num_tokens": 2639699425.0, + "step": 15735 + }, + { + "entropy": 1.6750577787558238, + "epoch": 1.7286808931366895, + "grad_norm": 0.7081702947616577, + "learning_rate": 2.8556977557592884e-06, + "loss": 1.3426, + "mean_token_accuracy": 0.6623915582895279, + "num_tokens": 2639841659.0, + "step": 15736 + }, + { + "entropy": 1.6969606379667919, + "epoch": 1.7287907500480624, + "grad_norm": 0.620355486869812, + "learning_rate": 2.855016481913626e-06, + "loss": 1.5321, + "mean_token_accuracy": 0.6244812359412512, + "num_tokens": 2640063434.0, + "step": 15737 + }, + { + "entropy": 1.646816263596217, + "epoch": 1.7289006069594355, + "grad_norm": 0.5661360025405884, + "learning_rate": 2.854335465853022e-06, + "loss": 1.5014, + "mean_token_accuracy": 0.6421874364217123, + "num_tokens": 2640282850.0, + "step": 15738 + }, + { + "entropy": 1.7638940612475078, + "epoch": 1.7290104638708081, + "grad_norm": 0.7863647937774658, + "learning_rate": 2.8536547075990327e-06, + "loss": 1.4823, + "mean_token_accuracy": 0.6424362609783808, + "num_tokens": 2640444765.0, + "step": 15739 + }, + { + "entropy": 1.7247726917266846, + "epoch": 1.7291203207821813, + "grad_norm": 0.7812128067016602, + "learning_rate": 2.8529742071731985e-06, + "loss": 1.4218, + "mean_token_accuracy": 0.6599978854258856, + "num_tokens": 2640610371.0, + "step": 15740 + }, + { + "entropy": 1.7008132835229237, + "epoch": 1.7292301776935541, + "grad_norm": 0.6622722744941711, + "learning_rate": 2.8522939645970595e-06, + "loss": 1.3794, + "mean_token_accuracy": 0.6508124470710754, + "num_tokens": 2640802543.0, + "step": 15741 + }, + { + "entropy": 1.6437017023563385, + "epoch": 1.729340034604927, + "grad_norm": 0.7969115972518921, + "learning_rate": 2.851613979892146e-06, + "loss": 1.3394, + "mean_token_accuracy": 0.6736712157726288, + "num_tokens": 2640968961.0, + "step": 15742 + }, + { + "entropy": 1.6500834325949352, + "epoch": 1.7294498915163001, + "grad_norm": 0.7005840539932251, + "learning_rate": 2.8509342530799787e-06, + "loss": 1.3966, + "mean_token_accuracy": 0.6679046203692754, + "num_tokens": 2641142162.0, + "step": 15743 + }, + { + "entropy": 1.7105072836081188, + "epoch": 1.7295597484276728, + "grad_norm": 0.6108032464981079, + "learning_rate": 2.8502547841820684e-06, + "loss": 1.5941, + "mean_token_accuracy": 0.6181052277485529, + "num_tokens": 2641405364.0, + "step": 15744 + }, + { + "entropy": 1.777452568213145, + "epoch": 1.729669605339046, + "grad_norm": 0.6558803915977478, + "learning_rate": 2.8495755732199232e-06, + "loss": 1.6254, + "mean_token_accuracy": 0.6251836170752844, + "num_tokens": 2641627804.0, + "step": 15745 + }, + { + "entropy": 1.735464612642924, + "epoch": 1.7297794622504188, + "grad_norm": 0.7284940481185913, + "learning_rate": 2.848896620215037e-06, + "loss": 1.4115, + "mean_token_accuracy": 0.6465435773134232, + "num_tokens": 2641763821.0, + "step": 15746 + }, + { + "entropy": 1.6562157074610393, + "epoch": 1.7298893191617917, + "grad_norm": 0.6121866703033447, + "learning_rate": 2.848217925188902e-06, + "loss": 1.3641, + "mean_token_accuracy": 0.6681751608848572, + "num_tokens": 2641955876.0, + "step": 15747 + }, + { + "entropy": 1.6489443282286327, + "epoch": 1.7299991760731648, + "grad_norm": 0.7306373119354248, + "learning_rate": 2.8475394881629966e-06, + "loss": 1.2843, + "mean_token_accuracy": 0.6912119189898173, + "num_tokens": 2642077818.0, + "step": 15748 + }, + { + "entropy": 1.6806483666102092, + "epoch": 1.7301090329845377, + "grad_norm": 0.8181135058403015, + "learning_rate": 2.8468613091587902e-06, + "loss": 1.4591, + "mean_token_accuracy": 0.6678927342096964, + "num_tokens": 2642228543.0, + "step": 15749 + }, + { + "entropy": 1.7011501491069794, + "epoch": 1.7302188898959106, + "grad_norm": 0.5812907218933105, + "learning_rate": 2.84618338819775e-06, + "loss": 1.4, + "mean_token_accuracy": 0.6601615299781164, + "num_tokens": 2642381141.0, + "step": 15750 + }, + { + "entropy": 1.6654735505580902, + "epoch": 1.7303287468072837, + "grad_norm": 0.6332482099533081, + "learning_rate": 2.8455057253013354e-06, + "loss": 1.4628, + "mean_token_accuracy": 0.6520533412694931, + "num_tokens": 2642558612.0, + "step": 15751 + }, + { + "entropy": 1.6841372152169545, + "epoch": 1.7304386037186563, + "grad_norm": 0.6896584033966064, + "learning_rate": 2.8448283204909844e-06, + "loss": 1.4898, + "mean_token_accuracy": 0.6429760406414667, + "num_tokens": 2642767553.0, + "step": 15752 + }, + { + "entropy": 1.6826592286427815, + "epoch": 1.7305484606300294, + "grad_norm": 0.6982526779174805, + "learning_rate": 2.8441511737881443e-06, + "loss": 1.2862, + "mean_token_accuracy": 0.6770686457554499, + "num_tokens": 2642910286.0, + "step": 15753 + }, + { + "entropy": 1.6773277123769124, + "epoch": 1.7306583175414023, + "grad_norm": 0.5871797800064087, + "learning_rate": 2.843474285214246e-06, + "loss": 1.5036, + "mean_token_accuracy": 0.6401631236076355, + "num_tokens": 2643130615.0, + "step": 15754 + }, + { + "entropy": 1.6971320907274883, + "epoch": 1.7307681744527752, + "grad_norm": 0.6325240135192871, + "learning_rate": 2.8427976547907106e-06, + "loss": 1.4979, + "mean_token_accuracy": 0.6486349354187647, + "num_tokens": 2643295952.0, + "step": 15755 + }, + { + "entropy": 1.6904420753320057, + "epoch": 1.7308780313641483, + "grad_norm": 0.8593859672546387, + "learning_rate": 2.8421212825389516e-06, + "loss": 1.3975, + "mean_token_accuracy": 0.6446862071752548, + "num_tokens": 2643482901.0, + "step": 15756 + }, + { + "entropy": 1.7852684557437897, + "epoch": 1.730987888275521, + "grad_norm": 0.6767821907997131, + "learning_rate": 2.841445168480381e-06, + "loss": 1.5704, + "mean_token_accuracy": 0.6423915525277456, + "num_tokens": 2643676176.0, + "step": 15757 + }, + { + "entropy": 1.7145917117595673, + "epoch": 1.731097745186894, + "grad_norm": 0.6300607323646545, + "learning_rate": 2.8407693126363916e-06, + "loss": 1.3585, + "mean_token_accuracy": 0.6577344536781311, + "num_tokens": 2643885388.0, + "step": 15758 + }, + { + "entropy": 1.727539946635564, + "epoch": 1.731207602098267, + "grad_norm": 0.7048355937004089, + "learning_rate": 2.8400937150283793e-06, + "loss": 1.5526, + "mean_token_accuracy": 0.6489974558353424, + "num_tokens": 2644056977.0, + "step": 15759 + }, + { + "entropy": 1.7069771488507588, + "epoch": 1.7313174590096398, + "grad_norm": 0.5949591398239136, + "learning_rate": 2.8394183756777235e-06, + "loss": 1.4094, + "mean_token_accuracy": 0.6460785369078318, + "num_tokens": 2644273336.0, + "step": 15760 + }, + { + "entropy": 1.7462304333845775, + "epoch": 1.731427315921013, + "grad_norm": 0.7763286232948303, + "learning_rate": 2.838743294605797e-06, + "loss": 1.4839, + "mean_token_accuracy": 0.6531115273634592, + "num_tokens": 2644460738.0, + "step": 15761 + }, + { + "entropy": 1.6935663322607677, + "epoch": 1.7315371728323858, + "grad_norm": 0.68621426820755, + "learning_rate": 2.8380684718339696e-06, + "loss": 1.4743, + "mean_token_accuracy": 0.6545071303844452, + "num_tokens": 2644623609.0, + "step": 15762 + }, + { + "entropy": 1.6895051697889965, + "epoch": 1.7316470297437587, + "grad_norm": 0.7107670903205872, + "learning_rate": 2.8373939073835977e-06, + "loss": 1.4164, + "mean_token_accuracy": 0.665955513715744, + "num_tokens": 2644765811.0, + "step": 15763 + }, + { + "entropy": 1.7151564260323842, + "epoch": 1.7317568866551318, + "grad_norm": 0.6578272581100464, + "learning_rate": 2.8367196012760283e-06, + "loss": 1.4266, + "mean_token_accuracy": 0.6549387921889623, + "num_tokens": 2644908445.0, + "step": 15764 + }, + { + "entropy": 1.702756404876709, + "epoch": 1.7318667435665045, + "grad_norm": 0.7014343738555908, + "learning_rate": 2.836045553532605e-06, + "loss": 1.2898, + "mean_token_accuracy": 0.66953477760156, + "num_tokens": 2645016657.0, + "step": 15765 + }, + { + "entropy": 1.6990113755067189, + "epoch": 1.7319766004778776, + "grad_norm": 0.6619819402694702, + "learning_rate": 2.8353717641746625e-06, + "loss": 1.3552, + "mean_token_accuracy": 0.6663278589646021, + "num_tokens": 2645151759.0, + "step": 15766 + }, + { + "entropy": 1.6938722531000774, + "epoch": 1.7320864573892505, + "grad_norm": 0.729397714138031, + "learning_rate": 2.834698233223525e-06, + "loss": 1.3684, + "mean_token_accuracy": 0.6664966940879822, + "num_tokens": 2645321883.0, + "step": 15767 + }, + { + "entropy": 1.6677038371562958, + "epoch": 1.7321963143006234, + "grad_norm": 0.6559589505195618, + "learning_rate": 2.8340249607005087e-06, + "loss": 1.4055, + "mean_token_accuracy": 0.6565075367689133, + "num_tokens": 2645474895.0, + "step": 15768 + }, + { + "entropy": 1.688380589087804, + "epoch": 1.7323061712119965, + "grad_norm": 0.6308188438415527, + "learning_rate": 2.8333519466269223e-06, + "loss": 1.3668, + "mean_token_accuracy": 0.6556487778822581, + "num_tokens": 2645681696.0, + "step": 15769 + }, + { + "entropy": 1.6896365185578663, + "epoch": 1.7324160281233691, + "grad_norm": 0.7085091471672058, + "learning_rate": 2.832679191024066e-06, + "loss": 1.3655, + "mean_token_accuracy": 0.6658649444580078, + "num_tokens": 2645863103.0, + "step": 15770 + }, + { + "entropy": 1.6898165345191956, + "epoch": 1.7325258850347423, + "grad_norm": 0.6663379669189453, + "learning_rate": 2.8320066939132364e-06, + "loss": 1.3565, + "mean_token_accuracy": 0.6607397049665451, + "num_tokens": 2646008101.0, + "step": 15771 + }, + { + "entropy": 1.759082555770874, + "epoch": 1.7326357419461151, + "grad_norm": 0.7728214859962463, + "learning_rate": 2.83133445531571e-06, + "loss": 1.4906, + "mean_token_accuracy": 0.6619271288315455, + "num_tokens": 2646196373.0, + "step": 15772 + }, + { + "entropy": 1.705134669939677, + "epoch": 1.732745598857488, + "grad_norm": 0.7739623785018921, + "learning_rate": 2.8306624752527684e-06, + "loss": 1.324, + "mean_token_accuracy": 0.6735307027896246, + "num_tokens": 2646379370.0, + "step": 15773 + }, + { + "entropy": 1.6545397241910298, + "epoch": 1.7328554557688611, + "grad_norm": 0.7858942151069641, + "learning_rate": 2.82999075374568e-06, + "loss": 1.4899, + "mean_token_accuracy": 0.6507531503836314, + "num_tokens": 2646557248.0, + "step": 15774 + }, + { + "entropy": 1.7393496334552765, + "epoch": 1.732965312680234, + "grad_norm": 0.8852123022079468, + "learning_rate": 2.8293192908157025e-06, + "loss": 1.5377, + "mean_token_accuracy": 0.6492738674084345, + "num_tokens": 2646718540.0, + "step": 15775 + }, + { + "entropy": 1.738167365392049, + "epoch": 1.733075169591607, + "grad_norm": 0.7457848191261292, + "learning_rate": 2.828648086484086e-06, + "loss": 1.4497, + "mean_token_accuracy": 0.6467219044764837, + "num_tokens": 2646866109.0, + "step": 15776 + }, + { + "entropy": 1.6820921699206035, + "epoch": 1.73318502650298, + "grad_norm": 0.6062507033348083, + "learning_rate": 2.827977140772077e-06, + "loss": 1.3582, + "mean_token_accuracy": 0.6673463334639868, + "num_tokens": 2647037691.0, + "step": 15777 + }, + { + "entropy": 1.700913409392039, + "epoch": 1.7332948834143527, + "grad_norm": 0.5943799614906311, + "learning_rate": 2.827306453700907e-06, + "loss": 1.4493, + "mean_token_accuracy": 0.6339322676261266, + "num_tokens": 2647261416.0, + "step": 15778 + }, + { + "entropy": 1.7037220895290375, + "epoch": 1.7334047403257258, + "grad_norm": 0.7893034219741821, + "learning_rate": 2.826636025291808e-06, + "loss": 1.4788, + "mean_token_accuracy": 0.631934697429339, + "num_tokens": 2647428426.0, + "step": 15779 + }, + { + "entropy": 1.662857900063197, + "epoch": 1.7335145972370987, + "grad_norm": 0.705226719379425, + "learning_rate": 2.8259658555659947e-06, + "loss": 1.3723, + "mean_token_accuracy": 0.6748548299074173, + "num_tokens": 2647576867.0, + "step": 15780 + }, + { + "entropy": 1.6877172191937764, + "epoch": 1.7336244541484715, + "grad_norm": 0.642670750617981, + "learning_rate": 2.825295944544677e-06, + "loss": 1.4399, + "mean_token_accuracy": 0.6476767361164093, + "num_tokens": 2647788737.0, + "step": 15781 + }, + { + "entropy": 1.6754780113697052, + "epoch": 1.7337343110598447, + "grad_norm": 0.8167265057563782, + "learning_rate": 2.8246262922490596e-06, + "loss": 1.4949, + "mean_token_accuracy": 0.6553296744823456, + "num_tokens": 2647943883.0, + "step": 15782 + }, + { + "entropy": 1.7225460310777028, + "epoch": 1.7338441679712175, + "grad_norm": 0.6751701831817627, + "learning_rate": 2.8239568987003384e-06, + "loss": 1.4517, + "mean_token_accuracy": 0.6534865995248159, + "num_tokens": 2648137335.0, + "step": 15783 + }, + { + "entropy": 1.690890371799469, + "epoch": 1.7339540248825904, + "grad_norm": 0.7012602686882019, + "learning_rate": 2.8232877639196956e-06, + "loss": 1.3007, + "mean_token_accuracy": 0.6690143694480261, + "num_tokens": 2648251721.0, + "step": 15784 + }, + { + "entropy": 1.7479467689990997, + "epoch": 1.7340638817939633, + "grad_norm": 0.7268240451812744, + "learning_rate": 2.822618887928309e-06, + "loss": 1.4206, + "mean_token_accuracy": 0.6618320842583975, + "num_tokens": 2648409793.0, + "step": 15785 + }, + { + "entropy": 1.722074290116628, + "epoch": 1.7341737387053362, + "grad_norm": 1.3608239889144897, + "learning_rate": 2.8219502707473525e-06, + "loss": 1.5261, + "mean_token_accuracy": 0.6445515751838684, + "num_tokens": 2648531384.0, + "step": 15786 + }, + { + "entropy": 1.6843404074509938, + "epoch": 1.7342835956167093, + "grad_norm": 0.6605405211448669, + "learning_rate": 2.821281912397984e-06, + "loss": 1.3001, + "mean_token_accuracy": 0.6718050042788187, + "num_tokens": 2648640336.0, + "step": 15787 + }, + { + "entropy": 1.662507524092992, + "epoch": 1.7343934525280822, + "grad_norm": 0.6708582043647766, + "learning_rate": 2.820613812901356e-06, + "loss": 1.3266, + "mean_token_accuracy": 0.6668059825897217, + "num_tokens": 2648772568.0, + "step": 15788 + }, + { + "entropy": 1.6774542232354481, + "epoch": 1.734503309439455, + "grad_norm": 0.6292397975921631, + "learning_rate": 2.819945972278618e-06, + "loss": 1.5471, + "mean_token_accuracy": 0.6366194983323415, + "num_tokens": 2649041455.0, + "step": 15789 + }, + { + "entropy": 1.7329609791437786, + "epoch": 1.7346131663508282, + "grad_norm": 0.7120325565338135, + "learning_rate": 2.819278390550901e-06, + "loss": 1.4066, + "mean_token_accuracy": 0.6510707437992096, + "num_tokens": 2649272586.0, + "step": 15790 + }, + { + "entropy": 1.6792218287785847, + "epoch": 1.7347230232622008, + "grad_norm": 0.6149081587791443, + "learning_rate": 2.8186110677393387e-06, + "loss": 1.3502, + "mean_token_accuracy": 0.6708264698584875, + "num_tokens": 2649437345.0, + "step": 15791 + }, + { + "entropy": 1.7267203132311504, + "epoch": 1.734832880173574, + "grad_norm": 0.6599135994911194, + "learning_rate": 2.8179440038650496e-06, + "loss": 1.3767, + "mean_token_accuracy": 0.6644027580817541, + "num_tokens": 2649620738.0, + "step": 15792 + }, + { + "entropy": 1.7191211581230164, + "epoch": 1.7349427370849468, + "grad_norm": 0.7179271578788757, + "learning_rate": 2.817277198949144e-06, + "loss": 1.4368, + "mean_token_accuracy": 0.6469183464845022, + "num_tokens": 2649837132.0, + "step": 15793 + }, + { + "entropy": 1.746620883544286, + "epoch": 1.7350525939963197, + "grad_norm": 0.643526017665863, + "learning_rate": 2.8166106530127274e-06, + "loss": 1.613, + "mean_token_accuracy": 0.6203589936097463, + "num_tokens": 2650073284.0, + "step": 15794 + }, + { + "entropy": 1.714139034350713, + "epoch": 1.7351624509076928, + "grad_norm": 0.6725690960884094, + "learning_rate": 2.8159443660769002e-06, + "loss": 1.381, + "mean_token_accuracy": 0.6693058560291926, + "num_tokens": 2650216304.0, + "step": 15795 + }, + { + "entropy": 1.7154027422269185, + "epoch": 1.7352723078190657, + "grad_norm": 0.6718941330909729, + "learning_rate": 2.815278338162742e-06, + "loss": 1.3734, + "mean_token_accuracy": 0.6560245205958685, + "num_tokens": 2650389116.0, + "step": 15796 + }, + { + "entropy": 1.6776060263315837, + "epoch": 1.7353821647304386, + "grad_norm": 0.6263152360916138, + "learning_rate": 2.8146125692913373e-06, + "loss": 1.3378, + "mean_token_accuracy": 0.6764421413342158, + "num_tokens": 2650553193.0, + "step": 15797 + }, + { + "entropy": 1.7232110400994618, + "epoch": 1.7354920216418115, + "grad_norm": 0.5665870904922485, + "learning_rate": 2.8139470594837566e-06, + "loss": 1.3868, + "mean_token_accuracy": 0.6588666985432307, + "num_tokens": 2650715674.0, + "step": 15798 + }, + { + "entropy": 1.6804384191830952, + "epoch": 1.7356018785531844, + "grad_norm": 0.5942453742027283, + "learning_rate": 2.8132818087610637e-06, + "loss": 1.3806, + "mean_token_accuracy": 0.6652588794628779, + "num_tokens": 2650863032.0, + "step": 15799 + }, + { + "entropy": 1.7259988685448964, + "epoch": 1.7357117354645575, + "grad_norm": 0.6818102598190308, + "learning_rate": 2.81261681714431e-06, + "loss": 1.3696, + "mean_token_accuracy": 0.6753989507754644, + "num_tokens": 2650997983.0, + "step": 15800 + }, + { + "entropy": 1.7083225051561992, + "epoch": 1.7358215923759304, + "grad_norm": 0.6384536623954773, + "learning_rate": 2.811952084654548e-06, + "loss": 1.4066, + "mean_token_accuracy": 0.6457608987887701, + "num_tokens": 2651168003.0, + "step": 15801 + }, + { + "entropy": 1.6841741700967152, + "epoch": 1.7359314492873033, + "grad_norm": 0.7538333535194397, + "learning_rate": 2.8112876113128094e-06, + "loss": 1.2787, + "mean_token_accuracy": 0.665538469950358, + "num_tokens": 2651282390.0, + "step": 15802 + }, + { + "entropy": 1.6514563858509064, + "epoch": 1.7360413061986764, + "grad_norm": 0.604898989200592, + "learning_rate": 2.8106233971401305e-06, + "loss": 1.384, + "mean_token_accuracy": 0.6566628019014994, + "num_tokens": 2651468047.0, + "step": 15803 + }, + { + "entropy": 1.6742986639340718, + "epoch": 1.736151163110049, + "grad_norm": 0.7198217511177063, + "learning_rate": 2.8099594421575306e-06, + "loss": 1.3387, + "mean_token_accuracy": 0.6742167373498281, + "num_tokens": 2651638952.0, + "step": 15804 + }, + { + "entropy": 1.6387183368206024, + "epoch": 1.7362610200214221, + "grad_norm": 0.5498782396316528, + "learning_rate": 2.8092957463860225e-06, + "loss": 1.4036, + "mean_token_accuracy": 0.6512420624494553, + "num_tokens": 2651855946.0, + "step": 15805 + }, + { + "entropy": 1.713132123152415, + "epoch": 1.736370876932795, + "grad_norm": 0.6297646164894104, + "learning_rate": 2.8086323098466127e-06, + "loss": 1.2696, + "mean_token_accuracy": 0.6640227288007736, + "num_tokens": 2652083899.0, + "step": 15806 + }, + { + "entropy": 1.7311813334623973, + "epoch": 1.736480733844168, + "grad_norm": 0.7326252460479736, + "learning_rate": 2.8079691325603037e-06, + "loss": 1.4061, + "mean_token_accuracy": 0.6535738656918207, + "num_tokens": 2652252305.0, + "step": 15807 + }, + { + "entropy": 1.7351558605829875, + "epoch": 1.736590590755541, + "grad_norm": 0.809312105178833, + "learning_rate": 2.8073062145480766e-06, + "loss": 1.6552, + "mean_token_accuracy": 0.6350140472253164, + "num_tokens": 2652441076.0, + "step": 15808 + }, + { + "entropy": 1.7146221995353699, + "epoch": 1.736700447666914, + "grad_norm": 0.6093737483024597, + "learning_rate": 2.806643555830915e-06, + "loss": 1.3532, + "mean_token_accuracy": 0.6540501813093821, + "num_tokens": 2652599894.0, + "step": 15809 + }, + { + "entropy": 1.699708640575409, + "epoch": 1.7368103045782868, + "grad_norm": 0.6803867816925049, + "learning_rate": 2.8059811564297957e-06, + "loss": 1.3324, + "mean_token_accuracy": 0.6549798647562662, + "num_tokens": 2652768541.0, + "step": 15810 + }, + { + "entropy": 1.7326335211594899, + "epoch": 1.7369201614896597, + "grad_norm": 0.7687568664550781, + "learning_rate": 2.80531901636568e-06, + "loss": 1.2987, + "mean_token_accuracy": 0.6661215225855509, + "num_tokens": 2652899364.0, + "step": 15811 + }, + { + "entropy": 1.7323335111141205, + "epoch": 1.7370300184010325, + "grad_norm": 0.7186509370803833, + "learning_rate": 2.804657135659522e-06, + "loss": 1.5174, + "mean_token_accuracy": 0.6523576378822327, + "num_tokens": 2653039239.0, + "step": 15812 + }, + { + "entropy": 1.6271824638048809, + "epoch": 1.7371398753124057, + "grad_norm": 0.7200955152511597, + "learning_rate": 2.803995514332277e-06, + "loss": 1.2861, + "mean_token_accuracy": 0.6787689824899038, + "num_tokens": 2653192220.0, + "step": 15813 + }, + { + "entropy": 1.6728685200214386, + "epoch": 1.7372497322237785, + "grad_norm": 0.6453947424888611, + "learning_rate": 2.8033341524048764e-06, + "loss": 1.443, + "mean_token_accuracy": 0.6549021850029627, + "num_tokens": 2653384321.0, + "step": 15814 + }, + { + "entropy": 1.7225570380687714, + "epoch": 1.7373595891351514, + "grad_norm": 0.8567750453948975, + "learning_rate": 2.802673049898259e-06, + "loss": 1.4889, + "mean_token_accuracy": 0.6441005816062292, + "num_tokens": 2653559173.0, + "step": 15815 + }, + { + "entropy": 1.7080976366996765, + "epoch": 1.7374694460465245, + "grad_norm": 0.6325947046279907, + "learning_rate": 2.8020122068333466e-06, + "loss": 1.4394, + "mean_token_accuracy": 0.6394100387891134, + "num_tokens": 2653758046.0, + "step": 15816 + }, + { + "entropy": 1.7126984298229218, + "epoch": 1.7375793029578972, + "grad_norm": 0.7144211530685425, + "learning_rate": 2.801351623231051e-06, + "loss": 1.4652, + "mean_token_accuracy": 0.6530701269706091, + "num_tokens": 2653930419.0, + "step": 15817 + }, + { + "entropy": 1.7186160882314045, + "epoch": 1.7376891598692703, + "grad_norm": 0.589459240436554, + "learning_rate": 2.8006912991122827e-06, + "loss": 1.356, + "mean_token_accuracy": 0.6640836447477341, + "num_tokens": 2654147898.0, + "step": 15818 + }, + { + "entropy": 1.675024002790451, + "epoch": 1.7377990167806432, + "grad_norm": 0.6387170553207397, + "learning_rate": 2.8000312344979434e-06, + "loss": 1.5746, + "mean_token_accuracy": 0.646162673830986, + "num_tokens": 2654364535.0, + "step": 15819 + }, + { + "entropy": 1.6666264633337657, + "epoch": 1.737908873692016, + "grad_norm": 0.7120461463928223, + "learning_rate": 2.7993714294089173e-06, + "loss": 1.2804, + "mean_token_accuracy": 0.6800036976734797, + "num_tokens": 2654482190.0, + "step": 15820 + }, + { + "entropy": 1.7276875178019206, + "epoch": 1.7380187306033892, + "grad_norm": 0.818804144859314, + "learning_rate": 2.7987118838660903e-06, + "loss": 1.2782, + "mean_token_accuracy": 0.6826535513003668, + "num_tokens": 2654642615.0, + "step": 15821 + }, + { + "entropy": 1.7416711151599884, + "epoch": 1.738128587514762, + "grad_norm": 0.6067622900009155, + "learning_rate": 2.7980525978903378e-06, + "loss": 1.6374, + "mean_token_accuracy": 0.6284281214078268, + "num_tokens": 2654834393.0, + "step": 15822 + }, + { + "entropy": 1.6803725957870483, + "epoch": 1.738238444426135, + "grad_norm": 0.5896869897842407, + "learning_rate": 2.797393571502524e-06, + "loss": 1.455, + "mean_token_accuracy": 0.6401728590329488, + "num_tokens": 2655091358.0, + "step": 15823 + }, + { + "entropy": 1.6684520145257313, + "epoch": 1.738348301337508, + "grad_norm": 0.6544545292854309, + "learning_rate": 2.796734804723507e-06, + "loss": 1.2353, + "mean_token_accuracy": 0.6802895118792852, + "num_tokens": 2655219666.0, + "step": 15824 + }, + { + "entropy": 1.7058659692605336, + "epoch": 1.7384581582488807, + "grad_norm": 0.6241233348846436, + "learning_rate": 2.796076297574138e-06, + "loss": 1.367, + "mean_token_accuracy": 0.6578785528739294, + "num_tokens": 2655395368.0, + "step": 15825 + }, + { + "entropy": 1.77889946103096, + "epoch": 1.7385680151602538, + "grad_norm": 0.6710366010665894, + "learning_rate": 2.795418050075257e-06, + "loss": 1.2793, + "mean_token_accuracy": 0.6623863478501638, + "num_tokens": 2655516501.0, + "step": 15826 + }, + { + "entropy": 1.703927884499232, + "epoch": 1.7386778720716267, + "grad_norm": 0.6267694234848022, + "learning_rate": 2.7947600622476988e-06, + "loss": 1.1627, + "mean_token_accuracy": 0.6800014326969782, + "num_tokens": 2655677633.0, + "step": 15827 + }, + { + "entropy": 1.7275777161121368, + "epoch": 1.7387877289829996, + "grad_norm": 0.6682912111282349, + "learning_rate": 2.794102334112285e-06, + "loss": 1.3755, + "mean_token_accuracy": 0.6576682031154633, + "num_tokens": 2655842114.0, + "step": 15828 + }, + { + "entropy": 1.6405729452768962, + "epoch": 1.7388975858943727, + "grad_norm": 0.7250331044197083, + "learning_rate": 2.7934448656898357e-06, + "loss": 1.3655, + "mean_token_accuracy": 0.6576348741849264, + "num_tokens": 2656065244.0, + "step": 15829 + }, + { + "entropy": 1.694657524426778, + "epoch": 1.7390074428057454, + "grad_norm": 0.7095337510108948, + "learning_rate": 2.7927876570011594e-06, + "loss": 1.3874, + "mean_token_accuracy": 0.670227994521459, + "num_tokens": 2656181627.0, + "step": 15830 + }, + { + "entropy": 1.61565363407135, + "epoch": 1.7391172997171185, + "grad_norm": 3.940749406814575, + "learning_rate": 2.7921307080670553e-06, + "loss": 1.269, + "mean_token_accuracy": 0.6815206309159597, + "num_tokens": 2656368268.0, + "step": 15831 + }, + { + "entropy": 1.7019450465838115, + "epoch": 1.7392271566284914, + "grad_norm": 0.9675191640853882, + "learning_rate": 2.791474018908314e-06, + "loss": 1.5986, + "mean_token_accuracy": 0.6489445567131042, + "num_tokens": 2656518851.0, + "step": 15832 + }, + { + "entropy": 1.6337886949380238, + "epoch": 1.7393370135398643, + "grad_norm": 0.5778870582580566, + "learning_rate": 2.7908175895457224e-06, + "loss": 1.3799, + "mean_token_accuracy": 0.6472407778104147, + "num_tokens": 2656710252.0, + "step": 15833 + }, + { + "entropy": 1.7616515358289082, + "epoch": 1.7394468704512374, + "grad_norm": 0.6655634641647339, + "learning_rate": 2.7901614200000536e-06, + "loss": 1.531, + "mean_token_accuracy": 0.6265707910060883, + "num_tokens": 2656985584.0, + "step": 15834 + }, + { + "entropy": 1.6581469575564067, + "epoch": 1.7395567273626102, + "grad_norm": 0.6637037992477417, + "learning_rate": 2.789505510292078e-06, + "loss": 1.2826, + "mean_token_accuracy": 0.6793718685706457, + "num_tokens": 2657128235.0, + "step": 15835 + }, + { + "entropy": 1.6418430705865223, + "epoch": 1.7396665842739831, + "grad_norm": 0.6092338562011719, + "learning_rate": 2.788849860442554e-06, + "loss": 1.2874, + "mean_token_accuracy": 0.6675700594981512, + "num_tokens": 2657267098.0, + "step": 15836 + }, + { + "entropy": 1.710461030403773, + "epoch": 1.7397764411853562, + "grad_norm": 0.619476854801178, + "learning_rate": 2.7881944704722297e-06, + "loss": 1.4077, + "mean_token_accuracy": 0.6550353765487671, + "num_tokens": 2657447186.0, + "step": 15837 + }, + { + "entropy": 1.75293172399203, + "epoch": 1.739886298096729, + "grad_norm": 0.7521857619285583, + "learning_rate": 2.7875393404018498e-06, + "loss": 1.5018, + "mean_token_accuracy": 0.63949865847826, + "num_tokens": 2657625193.0, + "step": 15838 + }, + { + "entropy": 1.7228674193223317, + "epoch": 1.739996155008102, + "grad_norm": 0.6770578622817993, + "learning_rate": 2.786884470252153e-06, + "loss": 1.5646, + "mean_token_accuracy": 0.6267157097657522, + "num_tokens": 2657814471.0, + "step": 15839 + }, + { + "entropy": 1.6907674670219421, + "epoch": 1.740106011919475, + "grad_norm": 0.5758486986160278, + "learning_rate": 2.7862298600438577e-06, + "loss": 1.3123, + "mean_token_accuracy": 0.6664116332928339, + "num_tokens": 2657971780.0, + "step": 15840 + }, + { + "entropy": 1.6239832937717438, + "epoch": 1.7402158688308478, + "grad_norm": 0.6411721110343933, + "learning_rate": 2.7855755097976874e-06, + "loss": 1.4493, + "mean_token_accuracy": 0.6541901677846909, + "num_tokens": 2658181782.0, + "step": 15841 + }, + { + "entropy": 1.7198786338170369, + "epoch": 1.7403257257422209, + "grad_norm": 0.695188045501709, + "learning_rate": 2.784921419534351e-06, + "loss": 1.654, + "mean_token_accuracy": 0.6311604132254919, + "num_tokens": 2658402574.0, + "step": 15842 + }, + { + "entropy": 1.6882243553797405, + "epoch": 1.7404355826535935, + "grad_norm": 0.6721879839897156, + "learning_rate": 2.7842675892745503e-06, + "loss": 1.2438, + "mean_token_accuracy": 0.671802838643392, + "num_tokens": 2658526785.0, + "step": 15843 + }, + { + "entropy": 1.742331971724828, + "epoch": 1.7405454395649667, + "grad_norm": 0.6647438406944275, + "learning_rate": 2.7836140190389767e-06, + "loss": 1.2945, + "mean_token_accuracy": 0.6804736703634262, + "num_tokens": 2658650417.0, + "step": 15844 + }, + { + "entropy": 1.6729466617107391, + "epoch": 1.7406552964763395, + "grad_norm": 0.6848008036613464, + "learning_rate": 2.7829607088483192e-06, + "loss": 1.3458, + "mean_token_accuracy": 0.67551389336586, + "num_tokens": 2658819700.0, + "step": 15845 + }, + { + "entropy": 1.6987472077210743, + "epoch": 1.7407651533877124, + "grad_norm": 0.7082852125167847, + "learning_rate": 2.78230765872325e-06, + "loss": 1.4251, + "mean_token_accuracy": 0.6490759005149206, + "num_tokens": 2659000912.0, + "step": 15846 + }, + { + "entropy": 1.7281360030174255, + "epoch": 1.7408750102990855, + "grad_norm": 0.5942803025245667, + "learning_rate": 2.781654868684443e-06, + "loss": 1.4627, + "mean_token_accuracy": 0.6435799946387609, + "num_tokens": 2659232723.0, + "step": 15847 + }, + { + "entropy": 1.6801528135935466, + "epoch": 1.7409848672104584, + "grad_norm": 0.6154281497001648, + "learning_rate": 2.7810023387525553e-06, + "loss": 1.3461, + "mean_token_accuracy": 0.6573351869980494, + "num_tokens": 2659406577.0, + "step": 15848 + }, + { + "entropy": 1.7473509311676025, + "epoch": 1.7410947241218313, + "grad_norm": 0.6029602289199829, + "learning_rate": 2.780350068948239e-06, + "loss": 1.5064, + "mean_token_accuracy": 0.6355864902337393, + "num_tokens": 2659628818.0, + "step": 15849 + }, + { + "entropy": 1.7131327490011852, + "epoch": 1.7412045810332044, + "grad_norm": 0.7908769845962524, + "learning_rate": 2.7796980592921392e-06, + "loss": 1.3336, + "mean_token_accuracy": 0.6735485146443049, + "num_tokens": 2659779586.0, + "step": 15850 + }, + { + "entropy": 1.7140113910039265, + "epoch": 1.741314437944577, + "grad_norm": 0.7202388644218445, + "learning_rate": 2.779046309804895e-06, + "loss": 1.5897, + "mean_token_accuracy": 0.6321464478969574, + "num_tokens": 2659934329.0, + "step": 15851 + }, + { + "entropy": 1.6853972772757213, + "epoch": 1.7414242948559502, + "grad_norm": 0.7633290886878967, + "learning_rate": 2.7783948205071265e-06, + "loss": 1.39, + "mean_token_accuracy": 0.649382695555687, + "num_tokens": 2660091123.0, + "step": 15852 + }, + { + "entropy": 1.6331301033496857, + "epoch": 1.741534151767323, + "grad_norm": 0.7545872926712036, + "learning_rate": 2.7777435914194574e-06, + "loss": 1.3173, + "mean_token_accuracy": 0.6768523355325063, + "num_tokens": 2660240337.0, + "step": 15853 + }, + { + "entropy": 1.7424573004245758, + "epoch": 1.741644008678696, + "grad_norm": 0.7436056137084961, + "learning_rate": 2.7770926225625016e-06, + "loss": 1.4274, + "mean_token_accuracy": 0.6455424477656683, + "num_tokens": 2660404045.0, + "step": 15854 + }, + { + "entropy": 1.7621293663978577, + "epoch": 1.741753865590069, + "grad_norm": 0.8202974200248718, + "learning_rate": 2.7764419139568572e-06, + "loss": 1.4083, + "mean_token_accuracy": 0.6570224414269129, + "num_tokens": 2660549219.0, + "step": 15855 + }, + { + "entropy": 1.7344311475753784, + "epoch": 1.7418637225014417, + "grad_norm": 0.9669505953788757, + "learning_rate": 2.77579146562312e-06, + "loss": 1.5783, + "mean_token_accuracy": 0.643063947558403, + "num_tokens": 2660719518.0, + "step": 15856 + }, + { + "entropy": 1.7664933999379475, + "epoch": 1.7419735794128148, + "grad_norm": 0.7337760925292969, + "learning_rate": 2.7751412775818774e-06, + "loss": 1.3591, + "mean_token_accuracy": 0.6609189411004385, + "num_tokens": 2660852111.0, + "step": 15857 + }, + { + "entropy": 1.7087633113066356, + "epoch": 1.7420834363241877, + "grad_norm": 0.6853848099708557, + "learning_rate": 2.7744913498537073e-06, + "loss": 1.3429, + "mean_token_accuracy": 0.6679713129997253, + "num_tokens": 2661027069.0, + "step": 15858 + }, + { + "entropy": 1.7308754622936249, + "epoch": 1.7421932932355606, + "grad_norm": 0.6821447610855103, + "learning_rate": 2.77384168245918e-06, + "loss": 1.2652, + "mean_token_accuracy": 0.6845847517251968, + "num_tokens": 2661181830.0, + "step": 15859 + }, + { + "entropy": 1.6414716045061748, + "epoch": 1.7423031501469337, + "grad_norm": 1.1366279125213623, + "learning_rate": 2.7731922754188574e-06, + "loss": 1.192, + "mean_token_accuracy": 0.6885305742422739, + "num_tokens": 2661407962.0, + "step": 15860 + }, + { + "entropy": 1.7026232481002808, + "epoch": 1.7424130070583066, + "grad_norm": 0.7026152014732361, + "learning_rate": 2.77254312875329e-06, + "loss": 1.3201, + "mean_token_accuracy": 0.6643240998188654, + "num_tokens": 2661539074.0, + "step": 15861 + }, + { + "entropy": 1.6785250306129456, + "epoch": 1.7425228639696795, + "grad_norm": 0.7026225924491882, + "learning_rate": 2.7718942424830254e-06, + "loss": 1.3103, + "mean_token_accuracy": 0.6704440861940384, + "num_tokens": 2661712726.0, + "step": 15862 + }, + { + "entropy": 1.731045385201772, + "epoch": 1.7426327208810526, + "grad_norm": 0.7580122947692871, + "learning_rate": 2.771245616628603e-06, + "loss": 1.2901, + "mean_token_accuracy": 0.6796207278966904, + "num_tokens": 2661827026.0, + "step": 15863 + }, + { + "entropy": 1.724742700656255, + "epoch": 1.7427425777924253, + "grad_norm": 0.8208819031715393, + "learning_rate": 2.7705972512105454e-06, + "loss": 1.35, + "mean_token_accuracy": 0.6589020987351736, + "num_tokens": 2661955123.0, + "step": 15864 + }, + { + "entropy": 1.7283775707085927, + "epoch": 1.7428524347037984, + "grad_norm": 0.8843021392822266, + "learning_rate": 2.769949146249378e-06, + "loss": 1.3974, + "mean_token_accuracy": 0.6639308879772822, + "num_tokens": 2662121725.0, + "step": 15865 + }, + { + "entropy": 1.7786122262477875, + "epoch": 1.7429622916151712, + "grad_norm": 0.762104332447052, + "learning_rate": 2.769301301765612e-06, + "loss": 1.3663, + "mean_token_accuracy": 0.6581309884786606, + "num_tokens": 2662248768.0, + "step": 15866 + }, + { + "entropy": 1.7138410607973735, + "epoch": 1.7430721485265441, + "grad_norm": 0.6937832236289978, + "learning_rate": 2.7686537177797523e-06, + "loss": 1.3812, + "mean_token_accuracy": 0.655176599820455, + "num_tokens": 2662403894.0, + "step": 15867 + }, + { + "entropy": 1.761600911617279, + "epoch": 1.7431820054379172, + "grad_norm": 0.6983460783958435, + "learning_rate": 2.76800639431229e-06, + "loss": 1.5221, + "mean_token_accuracy": 0.6477015241980553, + "num_tokens": 2662548926.0, + "step": 15868 + }, + { + "entropy": 1.7244667708873749, + "epoch": 1.74329186234929, + "grad_norm": 0.9077073335647583, + "learning_rate": 2.767359331383718e-06, + "loss": 1.7027, + "mean_token_accuracy": 0.6437298407157263, + "num_tokens": 2662710873.0, + "step": 15869 + }, + { + "entropy": 1.724602371454239, + "epoch": 1.743401719260663, + "grad_norm": 0.7138944268226624, + "learning_rate": 2.766712529014512e-06, + "loss": 1.5175, + "mean_token_accuracy": 0.6394424885511398, + "num_tokens": 2662849006.0, + "step": 15870 + }, + { + "entropy": 1.7281849185625713, + "epoch": 1.743511576172036, + "grad_norm": 0.6491277813911438, + "learning_rate": 2.7660659872251465e-06, + "loss": 1.4241, + "mean_token_accuracy": 0.6524779796600342, + "num_tokens": 2663030042.0, + "step": 15871 + }, + { + "entropy": 1.7124824225902557, + "epoch": 1.7436214330834088, + "grad_norm": 1.1152174472808838, + "learning_rate": 2.7654197060360814e-06, + "loss": 1.3749, + "mean_token_accuracy": 0.6579590986172358, + "num_tokens": 2663167002.0, + "step": 15872 + }, + { + "entropy": 1.7078356345494587, + "epoch": 1.7437312899947819, + "grad_norm": 0.7276211380958557, + "learning_rate": 2.7647736854677713e-06, + "loss": 1.3144, + "mean_token_accuracy": 0.6709758639335632, + "num_tokens": 2663323208.0, + "step": 15873 + }, + { + "entropy": 1.692564715941747, + "epoch": 1.7438411469061548, + "grad_norm": 0.8237895369529724, + "learning_rate": 2.7641279255406627e-06, + "loss": 1.2983, + "mean_token_accuracy": 0.6772258182366689, + "num_tokens": 2663464049.0, + "step": 15874 + }, + { + "entropy": 1.6918930908044179, + "epoch": 1.7439510038175277, + "grad_norm": 0.6199659705162048, + "learning_rate": 2.763482426275198e-06, + "loss": 1.5252, + "mean_token_accuracy": 0.6460767934719721, + "num_tokens": 2663626836.0, + "step": 15875 + }, + { + "entropy": 1.6668222049872081, + "epoch": 1.7440608607289008, + "grad_norm": 0.6627802848815918, + "learning_rate": 2.762837187691799e-06, + "loss": 1.442, + "mean_token_accuracy": 0.6594364990790685, + "num_tokens": 2663781920.0, + "step": 15876 + }, + { + "entropy": 1.7106184164683025, + "epoch": 1.7441707176402734, + "grad_norm": 0.6994909048080444, + "learning_rate": 2.762192209810891e-06, + "loss": 1.4027, + "mean_token_accuracy": 0.6450665394465128, + "num_tokens": 2663934972.0, + "step": 15877 + }, + { + "entropy": 1.664261003335317, + "epoch": 1.7442805745516465, + "grad_norm": 0.7002906203269958, + "learning_rate": 2.7615474926528897e-06, + "loss": 1.4519, + "mean_token_accuracy": 0.6614581495523453, + "num_tokens": 2664127135.0, + "step": 15878 + }, + { + "entropy": 1.7660688559214275, + "epoch": 1.7443904314630194, + "grad_norm": 0.6302309036254883, + "learning_rate": 2.7609030362381985e-06, + "loss": 1.3609, + "mean_token_accuracy": 0.6554263929526011, + "num_tokens": 2664317047.0, + "step": 15879 + }, + { + "entropy": 1.723916381597519, + "epoch": 1.7445002883743923, + "grad_norm": 0.6405919194221497, + "learning_rate": 2.76025884058721e-06, + "loss": 1.4031, + "mean_token_accuracy": 0.656665583451589, + "num_tokens": 2664491829.0, + "step": 15880 + }, + { + "entropy": 1.698687841494878, + "epoch": 1.7446101452857654, + "grad_norm": 0.6089337468147278, + "learning_rate": 2.7596149057203198e-06, + "loss": 1.2856, + "mean_token_accuracy": 0.6691889415184656, + "num_tokens": 2664635435.0, + "step": 15881 + }, + { + "entropy": 1.7116204798221588, + "epoch": 1.744720002197138, + "grad_norm": 0.698288083076477, + "learning_rate": 2.758971231657902e-06, + "loss": 1.3977, + "mean_token_accuracy": 0.6574417501688004, + "num_tokens": 2664771325.0, + "step": 15882 + }, + { + "entropy": 1.7519585887591045, + "epoch": 1.7448298591085112, + "grad_norm": 0.6853853464126587, + "learning_rate": 2.758327818420333e-06, + "loss": 1.5416, + "mean_token_accuracy": 0.643589456876119, + "num_tokens": 2665007886.0, + "step": 15883 + }, + { + "entropy": 1.688419868548711, + "epoch": 1.744939716019884, + "grad_norm": 0.6974164247512817, + "learning_rate": 2.757684666027975e-06, + "loss": 1.5013, + "mean_token_accuracy": 0.6468348503112793, + "num_tokens": 2665173280.0, + "step": 15884 + }, + { + "entropy": 1.7338370283444722, + "epoch": 1.745049572931257, + "grad_norm": 0.8303990960121155, + "learning_rate": 2.757041774501182e-06, + "loss": 1.3642, + "mean_token_accuracy": 0.6649026970068613, + "num_tokens": 2665324438.0, + "step": 15885 + }, + { + "entropy": 1.6935912072658539, + "epoch": 1.74515942984263, + "grad_norm": 0.7695938944816589, + "learning_rate": 2.7563991438603017e-06, + "loss": 1.4399, + "mean_token_accuracy": 0.644097218910853, + "num_tokens": 2665475264.0, + "step": 15886 + }, + { + "entropy": 1.6517487665017445, + "epoch": 1.745269286754003, + "grad_norm": 0.6604319214820862, + "learning_rate": 2.755756774125678e-06, + "loss": 1.2338, + "mean_token_accuracy": 0.6787222623825073, + "num_tokens": 2665614784.0, + "step": 15887 + }, + { + "entropy": 1.7014476756254833, + "epoch": 1.7453791436653758, + "grad_norm": 0.7577602863311768, + "learning_rate": 2.755114665317634e-06, + "loss": 1.2983, + "mean_token_accuracy": 0.667173316081365, + "num_tokens": 2665727625.0, + "step": 15888 + }, + { + "entropy": 1.6906098127365112, + "epoch": 1.745489000576749, + "grad_norm": 0.639340341091156, + "learning_rate": 2.754472817456496e-06, + "loss": 1.4082, + "mean_token_accuracy": 0.6597124834855398, + "num_tokens": 2665930344.0, + "step": 15889 + }, + { + "entropy": 1.6897248029708862, + "epoch": 1.7455988574881216, + "grad_norm": 0.665076732635498, + "learning_rate": 2.7538312305625775e-06, + "loss": 1.3761, + "mean_token_accuracy": 0.6529867599407831, + "num_tokens": 2666079301.0, + "step": 15890 + }, + { + "entropy": 1.7365977764129639, + "epoch": 1.7457087143994947, + "grad_norm": 0.7009277939796448, + "learning_rate": 2.7531899046561862e-06, + "loss": 1.3898, + "mean_token_accuracy": 0.6456332057714462, + "num_tokens": 2666266419.0, + "step": 15891 + }, + { + "entropy": 1.7120999991893768, + "epoch": 1.7458185713108676, + "grad_norm": 0.6818946003913879, + "learning_rate": 2.7525488397576173e-06, + "loss": 1.379, + "mean_token_accuracy": 0.6504452576239904, + "num_tokens": 2666457767.0, + "step": 15892 + }, + { + "entropy": 1.7892510890960693, + "epoch": 1.7459284282222405, + "grad_norm": 0.8706643581390381, + "learning_rate": 2.751908035887161e-06, + "loss": 1.6292, + "mean_token_accuracy": 0.6330756644407908, + "num_tokens": 2666652930.0, + "step": 15893 + }, + { + "entropy": 1.697370360294978, + "epoch": 1.7460382851336136, + "grad_norm": 0.7226040363311768, + "learning_rate": 2.7512674930650974e-06, + "loss": 1.4393, + "mean_token_accuracy": 0.6513659656047821, + "num_tokens": 2666817818.0, + "step": 15894 + }, + { + "entropy": 1.7232015530268352, + "epoch": 1.7461481420449863, + "grad_norm": 0.6826181411743164, + "learning_rate": 2.7506272113117044e-06, + "loss": 1.5085, + "mean_token_accuracy": 0.638033077120781, + "num_tokens": 2667047183.0, + "step": 15895 + }, + { + "entropy": 1.7857304712136586, + "epoch": 1.7462579989563594, + "grad_norm": 0.7245029807090759, + "learning_rate": 2.74998719064724e-06, + "loss": 1.3768, + "mean_token_accuracy": 0.6544994562864304, + "num_tokens": 2667225814.0, + "step": 15896 + }, + { + "entropy": 1.6997434000174205, + "epoch": 1.7463678558677322, + "grad_norm": 0.5723074078559875, + "learning_rate": 2.749347431091963e-06, + "loss": 1.4982, + "mean_token_accuracy": 0.6429259975751241, + "num_tokens": 2667426464.0, + "step": 15897 + }, + { + "entropy": 1.696702629327774, + "epoch": 1.7464777127791051, + "grad_norm": 0.7994809150695801, + "learning_rate": 2.748707932666124e-06, + "loss": 1.4103, + "mean_token_accuracy": 0.6626110722621282, + "num_tokens": 2667592380.0, + "step": 15898 + }, + { + "entropy": 1.7208527425924938, + "epoch": 1.7465875696904782, + "grad_norm": 0.7930123209953308, + "learning_rate": 2.748068695389961e-06, + "loss": 1.2325, + "mean_token_accuracy": 0.6823674192031225, + "num_tokens": 2667677926.0, + "step": 15899 + }, + { + "entropy": 1.7191575070222218, + "epoch": 1.7466974266018511, + "grad_norm": 0.896288275718689, + "learning_rate": 2.7474297192837036e-06, + "loss": 1.5166, + "mean_token_accuracy": 0.6732039203246435, + "num_tokens": 2667810576.0, + "step": 15900 + }, + { + "entropy": 1.6295614341894786, + "epoch": 1.746807283513224, + "grad_norm": 0.7545291185379028, + "learning_rate": 2.7467910043675777e-06, + "loss": 1.3663, + "mean_token_accuracy": 0.6583205610513687, + "num_tokens": 2667990884.0, + "step": 15901 + }, + { + "entropy": 1.7029616435368855, + "epoch": 1.7469171404245971, + "grad_norm": 0.6451340913772583, + "learning_rate": 2.746152550661797e-06, + "loss": 1.4962, + "mean_token_accuracy": 0.637171596288681, + "num_tokens": 2668221796.0, + "step": 15902 + }, + { + "entropy": 1.7344611088434856, + "epoch": 1.7470269973359698, + "grad_norm": 0.7526887655258179, + "learning_rate": 2.74551435818657e-06, + "loss": 1.2977, + "mean_token_accuracy": 0.6728590279817581, + "num_tokens": 2668370216.0, + "step": 15903 + }, + { + "entropy": 1.7479794124762218, + "epoch": 1.7471368542473429, + "grad_norm": 0.6179333925247192, + "learning_rate": 2.7448764269620935e-06, + "loss": 1.3405, + "mean_token_accuracy": 0.6603627453247706, + "num_tokens": 2668497399.0, + "step": 15904 + }, + { + "entropy": 1.749539703130722, + "epoch": 1.7472467111587158, + "grad_norm": 0.7413578629493713, + "learning_rate": 2.744238757008557e-06, + "loss": 1.4768, + "mean_token_accuracy": 0.6490618834892908, + "num_tokens": 2668630277.0, + "step": 15905 + }, + { + "entropy": 1.6794603963692982, + "epoch": 1.7473565680700887, + "grad_norm": 0.6921920776367188, + "learning_rate": 2.7436013483461444e-06, + "loss": 1.2721, + "mean_token_accuracy": 0.6702764679988226, + "num_tokens": 2668781076.0, + "step": 15906 + }, + { + "entropy": 1.697002778450648, + "epoch": 1.7474664249814618, + "grad_norm": 0.6494120955467224, + "learning_rate": 2.742964200995031e-06, + "loss": 1.4141, + "mean_token_accuracy": 0.6404003153244654, + "num_tokens": 2668988627.0, + "step": 15907 + }, + { + "entropy": 1.6904459396998088, + "epoch": 1.7475762818928344, + "grad_norm": 1.0672904253005981, + "learning_rate": 2.7423273149753772e-06, + "loss": 1.3819, + "mean_token_accuracy": 0.6539155195156733, + "num_tokens": 2669160412.0, + "step": 15908 + }, + { + "entropy": 1.7080066402753193, + "epoch": 1.7476861388042075, + "grad_norm": 0.6226124167442322, + "learning_rate": 2.7416906903073428e-06, + "loss": 1.4143, + "mean_token_accuracy": 0.6591801842053732, + "num_tokens": 2669354488.0, + "step": 15909 + }, + { + "entropy": 1.7091161111990611, + "epoch": 1.7477959957155804, + "grad_norm": 0.7031316757202148, + "learning_rate": 2.7410543270110783e-06, + "loss": 1.4967, + "mean_token_accuracy": 0.6567925910154978, + "num_tokens": 2669527028.0, + "step": 15910 + }, + { + "entropy": 1.66484734416008, + "epoch": 1.7479058526269533, + "grad_norm": 0.5733075737953186, + "learning_rate": 2.7404182251067223e-06, + "loss": 1.3354, + "mean_token_accuracy": 0.6688386301199595, + "num_tokens": 2669743539.0, + "step": 15911 + }, + { + "entropy": 1.682132512331009, + "epoch": 1.7480157095383264, + "grad_norm": 0.7437959313392639, + "learning_rate": 2.739782384614407e-06, + "loss": 1.4501, + "mean_token_accuracy": 0.6545865833759308, + "num_tokens": 2669897155.0, + "step": 15912 + }, + { + "entropy": 1.715892086426417, + "epoch": 1.7481255664496993, + "grad_norm": 0.6199830174446106, + "learning_rate": 2.7391468055542573e-06, + "loss": 1.3871, + "mean_token_accuracy": 0.6614372233549753, + "num_tokens": 2670031369.0, + "step": 15913 + }, + { + "entropy": 1.7419675091902416, + "epoch": 1.7482354233610722, + "grad_norm": 0.6958132386207581, + "learning_rate": 2.7385114879463886e-06, + "loss": 1.466, + "mean_token_accuracy": 0.6350632160902023, + "num_tokens": 2670226720.0, + "step": 15914 + }, + { + "entropy": 1.7340802152951558, + "epoch": 1.7483452802724453, + "grad_norm": 0.6458380818367004, + "learning_rate": 2.73787643181091e-06, + "loss": 1.3632, + "mean_token_accuracy": 0.651315172513326, + "num_tokens": 2670367360.0, + "step": 15915 + }, + { + "entropy": 1.7228031158447266, + "epoch": 1.748455137183818, + "grad_norm": 0.8244587779045105, + "learning_rate": 2.7372416371679196e-06, + "loss": 1.3318, + "mean_token_accuracy": 0.6637958685557047, + "num_tokens": 2670503435.0, + "step": 15916 + }, + { + "entropy": 1.730597714583079, + "epoch": 1.748564994095191, + "grad_norm": 0.6726572513580322, + "learning_rate": 2.7366071040375055e-06, + "loss": 1.4834, + "mean_token_accuracy": 0.6449510852495829, + "num_tokens": 2670754908.0, + "step": 15917 + }, + { + "entropy": 1.697900931040446, + "epoch": 1.748674851006564, + "grad_norm": 0.6586857438087463, + "learning_rate": 2.7359728324397527e-06, + "loss": 1.3441, + "mean_token_accuracy": 0.6652401685714722, + "num_tokens": 2670943327.0, + "step": 15918 + }, + { + "entropy": 1.742193837960561, + "epoch": 1.7487847079179368, + "grad_norm": 0.7539701461791992, + "learning_rate": 2.73533882239474e-06, + "loss": 1.4356, + "mean_token_accuracy": 0.6551166623830795, + "num_tokens": 2671090212.0, + "step": 15919 + }, + { + "entropy": 1.7344173789024353, + "epoch": 1.74889456482931, + "grad_norm": 0.6886321902275085, + "learning_rate": 2.7347050739225255e-06, + "loss": 1.4913, + "mean_token_accuracy": 0.6443561265865961, + "num_tokens": 2671261264.0, + "step": 15920 + }, + { + "entropy": 1.654103030761083, + "epoch": 1.7490044217406826, + "grad_norm": 0.6048797965049744, + "learning_rate": 2.734071587043172e-06, + "loss": 1.3701, + "mean_token_accuracy": 0.6593893716732661, + "num_tokens": 2671423561.0, + "step": 15921 + }, + { + "entropy": 1.673624058564504, + "epoch": 1.7491142786520557, + "grad_norm": 0.6645485758781433, + "learning_rate": 2.733438361776729e-06, + "loss": 1.4122, + "mean_token_accuracy": 0.6572486211856207, + "num_tokens": 2671579500.0, + "step": 15922 + }, + { + "entropy": 1.7188350359598796, + "epoch": 1.7492241355634286, + "grad_norm": 0.7232167720794678, + "learning_rate": 2.7328053981432373e-06, + "loss": 1.3182, + "mean_token_accuracy": 0.6738363355398178, + "num_tokens": 2671713076.0, + "step": 15923 + }, + { + "entropy": 1.6765947341918945, + "epoch": 1.7493339924748015, + "grad_norm": 0.7521688342094421, + "learning_rate": 2.7321726961627272e-06, + "loss": 1.3101, + "mean_token_accuracy": 0.6763416528701782, + "num_tokens": 2671916077.0, + "step": 15924 + }, + { + "entropy": 1.6944889426231384, + "epoch": 1.7494438493861746, + "grad_norm": 0.6953690648078918, + "learning_rate": 2.731540255855228e-06, + "loss": 1.3638, + "mean_token_accuracy": 0.6754336108764013, + "num_tokens": 2672113899.0, + "step": 15925 + }, + { + "entropy": 1.6916892528533936, + "epoch": 1.7495537062975475, + "grad_norm": 0.6988839507102966, + "learning_rate": 2.7309080772407513e-06, + "loss": 1.3953, + "mean_token_accuracy": 0.6575096398591995, + "num_tokens": 2672290280.0, + "step": 15926 + }, + { + "entropy": 1.7188960711161296, + "epoch": 1.7496635632089204, + "grad_norm": 0.7189807295799255, + "learning_rate": 2.7302761603393102e-06, + "loss": 1.475, + "mean_token_accuracy": 0.645105391740799, + "num_tokens": 2672469646.0, + "step": 15927 + }, + { + "entropy": 1.6774901350339253, + "epoch": 1.7497734201202935, + "grad_norm": 0.761957585811615, + "learning_rate": 2.7296445051709012e-06, + "loss": 1.3203, + "mean_token_accuracy": 0.6618892600138983, + "num_tokens": 2672626896.0, + "step": 15928 + }, + { + "entropy": 1.6909812192122142, + "epoch": 1.7498832770316661, + "grad_norm": 0.5992552638053894, + "learning_rate": 2.7290131117555164e-06, + "loss": 1.3377, + "mean_token_accuracy": 0.6602785636981329, + "num_tokens": 2672830089.0, + "step": 15929 + }, + { + "entropy": 1.7229216794172924, + "epoch": 1.7499931339430392, + "grad_norm": 0.6808377504348755, + "learning_rate": 2.7283819801131393e-06, + "loss": 1.3918, + "mean_token_accuracy": 0.6523537784814835, + "num_tokens": 2673004119.0, + "step": 15930 + }, + { + "entropy": 1.8220161596934001, + "epoch": 1.7501029908544121, + "grad_norm": 0.7409783601760864, + "learning_rate": 2.727751110263749e-06, + "loss": 1.3421, + "mean_token_accuracy": 0.6639614452918371, + "num_tokens": 2673106484.0, + "step": 15931 + }, + { + "entropy": 1.6994484464327495, + "epoch": 1.750212847765785, + "grad_norm": 0.6716399192810059, + "learning_rate": 2.7271205022273044e-06, + "loss": 1.4798, + "mean_token_accuracy": 0.6541763444741567, + "num_tokens": 2673266017.0, + "step": 15932 + }, + { + "entropy": 1.7303711573282878, + "epoch": 1.7503227046771581, + "grad_norm": 0.7343233823776245, + "learning_rate": 2.7264901560237685e-06, + "loss": 1.4671, + "mean_token_accuracy": 0.6574054459730784, + "num_tokens": 2673426360.0, + "step": 15933 + }, + { + "entropy": 1.6783941288789113, + "epoch": 1.7504325615885308, + "grad_norm": 0.6652559638023376, + "learning_rate": 2.725860071673093e-06, + "loss": 1.2442, + "mean_token_accuracy": 0.6868862261374792, + "num_tokens": 2673564234.0, + "step": 15934 + }, + { + "entropy": 1.7397794624169667, + "epoch": 1.7505424184999039, + "grad_norm": 0.7660438418388367, + "learning_rate": 2.7252302491952166e-06, + "loss": 1.5224, + "mean_token_accuracy": 0.6459332555532455, + "num_tokens": 2673745150.0, + "step": 15935 + }, + { + "entropy": 1.6459727088610332, + "epoch": 1.7506522754112768, + "grad_norm": 1.2467116117477417, + "learning_rate": 2.724600688610073e-06, + "loss": 1.157, + "mean_token_accuracy": 0.6878319978713989, + "num_tokens": 2673974532.0, + "step": 15936 + }, + { + "entropy": 1.6995637615521748, + "epoch": 1.7507621323226497, + "grad_norm": 0.6141746640205383, + "learning_rate": 2.723971389937591e-06, + "loss": 1.4254, + "mean_token_accuracy": 0.6578077226877213, + "num_tokens": 2674144964.0, + "step": 15937 + }, + { + "entropy": 1.785159985224406, + "epoch": 1.7508719892340228, + "grad_norm": 0.7061741352081299, + "learning_rate": 2.7233423531976827e-06, + "loss": 1.3695, + "mean_token_accuracy": 0.6479012419780096, + "num_tokens": 2674288702.0, + "step": 15938 + }, + { + "entropy": 1.7027852634588878, + "epoch": 1.7509818461453956, + "grad_norm": 0.7895951271057129, + "learning_rate": 2.7227135784102622e-06, + "loss": 1.4436, + "mean_token_accuracy": 0.6551746229330698, + "num_tokens": 2674455082.0, + "step": 15939 + }, + { + "entropy": 1.7255665163199108, + "epoch": 1.7510917030567685, + "grad_norm": 0.6441416144371033, + "learning_rate": 2.722085065595226e-06, + "loss": 1.6238, + "mean_token_accuracy": 0.6162713964780172, + "num_tokens": 2674717678.0, + "step": 15940 + }, + { + "entropy": 1.6721422374248505, + "epoch": 1.7512015599681416, + "grad_norm": 0.6975085735321045, + "learning_rate": 2.7214568147724656e-06, + "loss": 1.3447, + "mean_token_accuracy": 0.6642735848824183, + "num_tokens": 2674888510.0, + "step": 15941 + }, + { + "entropy": 1.70290403564771, + "epoch": 1.7513114168795143, + "grad_norm": 0.7391501665115356, + "learning_rate": 2.7208288259618674e-06, + "loss": 1.2404, + "mean_token_accuracy": 0.66946313281854, + "num_tokens": 2675027189.0, + "step": 15942 + }, + { + "entropy": 1.662659337123235, + "epoch": 1.7514212737908874, + "grad_norm": 0.7047905921936035, + "learning_rate": 2.720201099183309e-06, + "loss": 1.4786, + "mean_token_accuracy": 0.6650376369555792, + "num_tokens": 2675168738.0, + "step": 15943 + }, + { + "entropy": 1.767273743947347, + "epoch": 1.7515311307022603, + "grad_norm": 0.7692185640335083, + "learning_rate": 2.719573634456652e-06, + "loss": 1.2778, + "mean_token_accuracy": 0.6670770943164825, + "num_tokens": 2675281899.0, + "step": 15944 + }, + { + "entropy": 1.7268462379773457, + "epoch": 1.7516409876136332, + "grad_norm": 0.6072038412094116, + "learning_rate": 2.7189464318017572e-06, + "loss": 1.5135, + "mean_token_accuracy": 0.6426151494185129, + "num_tokens": 2675495851.0, + "step": 15945 + }, + { + "entropy": 1.7973586320877075, + "epoch": 1.7517508445250063, + "grad_norm": 0.7870144248008728, + "learning_rate": 2.718319491238479e-06, + "loss": 1.4825, + "mean_token_accuracy": 0.6487486610809962, + "num_tokens": 2675698102.0, + "step": 15946 + }, + { + "entropy": 1.6900157729784648, + "epoch": 1.751860701436379, + "grad_norm": 0.6972094178199768, + "learning_rate": 2.7176928127866565e-06, + "loss": 1.4772, + "mean_token_accuracy": 0.6373498241106669, + "num_tokens": 2675859370.0, + "step": 15947 + }, + { + "entropy": 1.7243396242459614, + "epoch": 1.751970558347752, + "grad_norm": 0.7015838623046875, + "learning_rate": 2.7170663964661246e-06, + "loss": 1.5138, + "mean_token_accuracy": 0.6468863636255264, + "num_tokens": 2676047567.0, + "step": 15948 + }, + { + "entropy": 1.714859535296758, + "epoch": 1.752080415259125, + "grad_norm": 0.6738941669464111, + "learning_rate": 2.716440242296707e-06, + "loss": 1.3198, + "mean_token_accuracy": 0.6629375318686167, + "num_tokens": 2676207126.0, + "step": 15949 + }, + { + "entropy": 1.64927805463473, + "epoch": 1.7521902721704978, + "grad_norm": 0.6364519000053406, + "learning_rate": 2.715814350298223e-06, + "loss": 1.2985, + "mean_token_accuracy": 0.6785600632429123, + "num_tokens": 2676349632.0, + "step": 15950 + }, + { + "entropy": 1.7328944404919941, + "epoch": 1.752300129081871, + "grad_norm": 0.7407512068748474, + "learning_rate": 2.715188720490486e-06, + "loss": 1.3626, + "mean_token_accuracy": 0.6627685775359472, + "num_tokens": 2676480646.0, + "step": 15951 + }, + { + "entropy": 1.6649740636348724, + "epoch": 1.7524099859932438, + "grad_norm": 0.6273338794708252, + "learning_rate": 2.7145633528932884e-06, + "loss": 1.2986, + "mean_token_accuracy": 0.6614208469788233, + "num_tokens": 2676625775.0, + "step": 15952 + }, + { + "entropy": 1.7441307703653972, + "epoch": 1.7525198429046167, + "grad_norm": 0.6636318564414978, + "learning_rate": 2.713938247526428e-06, + "loss": 1.3612, + "mean_token_accuracy": 0.659666990240415, + "num_tokens": 2676765131.0, + "step": 15953 + }, + { + "entropy": 1.691186914841334, + "epoch": 1.7526296998159898, + "grad_norm": 0.6902927160263062, + "learning_rate": 2.7133134044096894e-06, + "loss": 1.2746, + "mean_token_accuracy": 0.6825152337551117, + "num_tokens": 2676943845.0, + "step": 15954 + }, + { + "entropy": 1.6996070841948192, + "epoch": 1.7527395567273625, + "grad_norm": 0.6229896545410156, + "learning_rate": 2.7126888235628484e-06, + "loss": 1.4192, + "mean_token_accuracy": 0.6589094599088033, + "num_tokens": 2677124904.0, + "step": 15955 + }, + { + "entropy": 1.7306743164857228, + "epoch": 1.7528494136387356, + "grad_norm": 0.8987564444541931, + "learning_rate": 2.7120645050056693e-06, + "loss": 1.646, + "mean_token_accuracy": 0.6419266114632288, + "num_tokens": 2677305092.0, + "step": 15956 + }, + { + "entropy": 1.703240692615509, + "epoch": 1.7529592705501085, + "grad_norm": 0.6768248081207275, + "learning_rate": 2.711440448757916e-06, + "loss": 1.3706, + "mean_token_accuracy": 0.6598798781633377, + "num_tokens": 2677451996.0, + "step": 15957 + }, + { + "entropy": 1.68047430117925, + "epoch": 1.7530691274614814, + "grad_norm": 0.7604206204414368, + "learning_rate": 2.7108166548393355e-06, + "loss": 1.3357, + "mean_token_accuracy": 0.6692562450965246, + "num_tokens": 2677595564.0, + "step": 15958 + }, + { + "entropy": 1.6513379216194153, + "epoch": 1.7531789843728545, + "grad_norm": 0.7215495705604553, + "learning_rate": 2.710193123269674e-06, + "loss": 1.4028, + "mean_token_accuracy": 0.668435071905454, + "num_tokens": 2677747003.0, + "step": 15959 + }, + { + "entropy": 1.7083741823832195, + "epoch": 1.7532888412842271, + "grad_norm": 0.792725145816803, + "learning_rate": 2.7095698540686656e-06, + "loss": 1.4943, + "mean_token_accuracy": 0.6499527543783188, + "num_tokens": 2677977238.0, + "step": 15960 + }, + { + "entropy": 1.6812816560268402, + "epoch": 1.7533986981956002, + "grad_norm": 0.7482911944389343, + "learning_rate": 2.7089468472560337e-06, + "loss": 1.4457, + "mean_token_accuracy": 0.6625443349281946, + "num_tokens": 2678174371.0, + "step": 15961 + }, + { + "entropy": 1.7282609542210896, + "epoch": 1.7535085551069731, + "grad_norm": 0.6054615378379822, + "learning_rate": 2.708324102851498e-06, + "loss": 1.4013, + "mean_token_accuracy": 0.6478755126396815, + "num_tokens": 2678327959.0, + "step": 15962 + }, + { + "entropy": 1.6690288086732228, + "epoch": 1.753618412018346, + "grad_norm": 0.6034241318702698, + "learning_rate": 2.707701620874771e-06, + "loss": 1.447, + "mean_token_accuracy": 0.6598279525836309, + "num_tokens": 2678507673.0, + "step": 15963 + }, + { + "entropy": 1.7199995517730713, + "epoch": 1.7537282689297191, + "grad_norm": 0.5845739841461182, + "learning_rate": 2.707079401345548e-06, + "loss": 1.4097, + "mean_token_accuracy": 0.6425377229849497, + "num_tokens": 2678686394.0, + "step": 15964 + }, + { + "entropy": 1.7174680133660634, + "epoch": 1.753838125841092, + "grad_norm": 0.7504869103431702, + "learning_rate": 2.7064574442835244e-06, + "loss": 1.2004, + "mean_token_accuracy": 0.6825359563032786, + "num_tokens": 2678832333.0, + "step": 15965 + }, + { + "entropy": 1.7301206588745117, + "epoch": 1.7539479827524649, + "grad_norm": 0.7699616551399231, + "learning_rate": 2.705835749708389e-06, + "loss": 1.3751, + "mean_token_accuracy": 0.6503161787986755, + "num_tokens": 2678975215.0, + "step": 15966 + }, + { + "entropy": 1.6766011317571003, + "epoch": 1.754057839663838, + "grad_norm": 0.6368605494499207, + "learning_rate": 2.705214317639813e-06, + "loss": 1.3711, + "mean_token_accuracy": 0.6545276641845703, + "num_tokens": 2679134834.0, + "step": 15967 + }, + { + "entropy": 1.6891018450260162, + "epoch": 1.7541676965752107, + "grad_norm": 0.6660163402557373, + "learning_rate": 2.7045931480974647e-06, + "loss": 1.481, + "mean_token_accuracy": 0.6436668932437897, + "num_tokens": 2679370650.0, + "step": 15968 + }, + { + "entropy": 1.6860856016476948, + "epoch": 1.7542775534865838, + "grad_norm": 0.6828413009643555, + "learning_rate": 2.7039722411010077e-06, + "loss": 1.3477, + "mean_token_accuracy": 0.6638344178597132, + "num_tokens": 2679515052.0, + "step": 15969 + }, + { + "entropy": 1.707839588324229, + "epoch": 1.7543874103979566, + "grad_norm": 0.5549116134643555, + "learning_rate": 2.703351596670089e-06, + "loss": 1.3906, + "mean_token_accuracy": 0.6582557906707128, + "num_tokens": 2679697943.0, + "step": 15970 + }, + { + "entropy": 1.6483833988507588, + "epoch": 1.7544972673093295, + "grad_norm": 0.6756875514984131, + "learning_rate": 2.7027312148243552e-06, + "loss": 1.2794, + "mean_token_accuracy": 0.6740471869707108, + "num_tokens": 2679854587.0, + "step": 15971 + }, + { + "entropy": 1.7412736018498738, + "epoch": 1.7546071242207026, + "grad_norm": 0.6762500405311584, + "learning_rate": 2.7021110955834397e-06, + "loss": 1.3442, + "mean_token_accuracy": 0.6655664046605428, + "num_tokens": 2679989455.0, + "step": 15972 + }, + { + "entropy": 1.7418133318424225, + "epoch": 1.7547169811320755, + "grad_norm": 0.702103316783905, + "learning_rate": 2.701491238966968e-06, + "loss": 1.4733, + "mean_token_accuracy": 0.6563413143157959, + "num_tokens": 2680144527.0, + "step": 15973 + }, + { + "entropy": 1.7465166052182515, + "epoch": 1.7548268380434484, + "grad_norm": 0.659724771976471, + "learning_rate": 2.700871644994558e-06, + "loss": 1.6231, + "mean_token_accuracy": 0.6223872403303782, + "num_tokens": 2680367260.0, + "step": 15974 + }, + { + "entropy": 1.7185083429018657, + "epoch": 1.7549366949548213, + "grad_norm": 0.6778896450996399, + "learning_rate": 2.7002523136858243e-06, + "loss": 1.297, + "mean_token_accuracy": 0.659514586130778, + "num_tokens": 2680520101.0, + "step": 15975 + }, + { + "entropy": 1.6928850710391998, + "epoch": 1.7550465518661942, + "grad_norm": 0.6215304136276245, + "learning_rate": 2.699633245060362e-06, + "loss": 1.3566, + "mean_token_accuracy": 0.6550732006629308, + "num_tokens": 2680714804.0, + "step": 15976 + }, + { + "entropy": 1.636595219373703, + "epoch": 1.7551564087775673, + "grad_norm": 0.640494704246521, + "learning_rate": 2.6990144391377672e-06, + "loss": 1.4246, + "mean_token_accuracy": 0.6581357816855112, + "num_tokens": 2680900096.0, + "step": 15977 + }, + { + "entropy": 1.6510383188724518, + "epoch": 1.7552662656889402, + "grad_norm": 0.5748199820518494, + "learning_rate": 2.698395895937627e-06, + "loss": 1.3701, + "mean_token_accuracy": 0.6559861749410629, + "num_tokens": 2681112878.0, + "step": 15978 + }, + { + "entropy": 1.7111006379127502, + "epoch": 1.755376122600313, + "grad_norm": 0.6022339463233948, + "learning_rate": 2.6977776154795143e-06, + "loss": 1.43, + "mean_token_accuracy": 0.6573653519153595, + "num_tokens": 2681330192.0, + "step": 15979 + }, + { + "entropy": 1.7234566509723663, + "epoch": 1.7554859795116862, + "grad_norm": 0.7855963706970215, + "learning_rate": 2.6971595977829986e-06, + "loss": 1.4615, + "mean_token_accuracy": 0.6453143805265427, + "num_tokens": 2681475961.0, + "step": 15980 + }, + { + "entropy": 1.7315457065900166, + "epoch": 1.7555958364230588, + "grad_norm": 0.6155387759208679, + "learning_rate": 2.6965418428676416e-06, + "loss": 1.5345, + "mean_token_accuracy": 0.6340660750865936, + "num_tokens": 2681718898.0, + "step": 15981 + }, + { + "entropy": 1.6927696069081624, + "epoch": 1.755705693334432, + "grad_norm": 0.6210948824882507, + "learning_rate": 2.695924350752992e-06, + "loss": 1.4177, + "mean_token_accuracy": 0.6520951439936956, + "num_tokens": 2681871307.0, + "step": 15982 + }, + { + "entropy": 1.703049937884013, + "epoch": 1.7558155502458048, + "grad_norm": 0.6101760268211365, + "learning_rate": 2.695307121458597e-06, + "loss": 1.5486, + "mean_token_accuracy": 0.6360116451978683, + "num_tokens": 2682062747.0, + "step": 15983 + }, + { + "entropy": 1.7253990471363068, + "epoch": 1.7559254071571777, + "grad_norm": 0.8363544940948486, + "learning_rate": 2.694690155003989e-06, + "loss": 1.5341, + "mean_token_accuracy": 0.6490476578474045, + "num_tokens": 2682217967.0, + "step": 15984 + }, + { + "entropy": 1.7305609087149303, + "epoch": 1.7560352640685508, + "grad_norm": 0.6613196730613708, + "learning_rate": 2.694073451408693e-06, + "loss": 1.4157, + "mean_token_accuracy": 0.6571665753920873, + "num_tokens": 2682358141.0, + "step": 15985 + }, + { + "entropy": 1.683866063753764, + "epoch": 1.7561451209799237, + "grad_norm": 0.6239186525344849, + "learning_rate": 2.69345701069223e-06, + "loss": 1.412, + "mean_token_accuracy": 0.6451130757729212, + "num_tokens": 2682550962.0, + "step": 15986 + }, + { + "entropy": 1.7141178448994954, + "epoch": 1.7562549778912966, + "grad_norm": 0.6861118674278259, + "learning_rate": 2.6928408328741128e-06, + "loss": 1.5234, + "mean_token_accuracy": 0.6307132889827093, + "num_tokens": 2682768682.0, + "step": 15987 + }, + { + "entropy": 1.6778662502765656, + "epoch": 1.7563648348026695, + "grad_norm": 0.7148086428642273, + "learning_rate": 2.692224917973837e-06, + "loss": 1.279, + "mean_token_accuracy": 0.6807506283124288, + "num_tokens": 2682890200.0, + "step": 15988 + }, + { + "entropy": 1.680614044268926, + "epoch": 1.7564746917140424, + "grad_norm": 0.7254914045333862, + "learning_rate": 2.6916092660108985e-06, + "loss": 1.524, + "mean_token_accuracy": 0.6463294724623362, + "num_tokens": 2683094770.0, + "step": 15989 + }, + { + "entropy": 1.6979198157787323, + "epoch": 1.7565845486254155, + "grad_norm": 0.6991070508956909, + "learning_rate": 2.690993877004785e-06, + "loss": 1.5171, + "mean_token_accuracy": 0.6333752622207006, + "num_tokens": 2683283703.0, + "step": 15990 + }, + { + "entropy": 1.728099246819814, + "epoch": 1.7566944055367884, + "grad_norm": 0.6582128405570984, + "learning_rate": 2.69037875097497e-06, + "loss": 1.2978, + "mean_token_accuracy": 0.6661944588025411, + "num_tokens": 2683408690.0, + "step": 15991 + }, + { + "entropy": 1.6826795637607574, + "epoch": 1.7568042624481612, + "grad_norm": 0.7262701392173767, + "learning_rate": 2.6897638879409228e-06, + "loss": 1.4638, + "mean_token_accuracy": 0.6540475736061732, + "num_tokens": 2683550680.0, + "step": 15992 + }, + { + "entropy": 1.6950383583704631, + "epoch": 1.7569141193595343, + "grad_norm": 0.7318074703216553, + "learning_rate": 2.689149287922105e-06, + "loss": 1.5337, + "mean_token_accuracy": 0.6470025032758713, + "num_tokens": 2683732547.0, + "step": 15993 + }, + { + "entropy": 1.6813922425111134, + "epoch": 1.757023976270907, + "grad_norm": 0.6637877225875854, + "learning_rate": 2.6885349509379667e-06, + "loss": 1.4379, + "mean_token_accuracy": 0.6517617652813593, + "num_tokens": 2683904804.0, + "step": 15994 + }, + { + "entropy": 1.6451501250267029, + "epoch": 1.7571338331822801, + "grad_norm": 0.5915562510490417, + "learning_rate": 2.687920877007952e-06, + "loss": 1.3513, + "mean_token_accuracy": 0.6604169209798177, + "num_tokens": 2684077734.0, + "step": 15995 + }, + { + "entropy": 1.7052603960037231, + "epoch": 1.757243690093653, + "grad_norm": 0.6788076162338257, + "learning_rate": 2.6873070661514966e-06, + "loss": 1.3391, + "mean_token_accuracy": 0.6597078988949457, + "num_tokens": 2684211351.0, + "step": 15996 + }, + { + "entropy": 1.8529831767082214, + "epoch": 1.7573535470050259, + "grad_norm": 0.8095588088035583, + "learning_rate": 2.6866935183880246e-06, + "loss": 1.372, + "mean_token_accuracy": 0.6530237297217051, + "num_tokens": 2684339090.0, + "step": 15997 + }, + { + "entropy": 1.6758296092351277, + "epoch": 1.757463403916399, + "grad_norm": 0.8423399329185486, + "learning_rate": 2.6860802337369574e-06, + "loss": 1.2942, + "mean_token_accuracy": 0.6831070631742477, + "num_tokens": 2684480747.0, + "step": 15998 + }, + { + "entropy": 1.7313259641329448, + "epoch": 1.7575732608277719, + "grad_norm": 0.6407238841056824, + "learning_rate": 2.685467212217708e-06, + "loss": 1.3715, + "mean_token_accuracy": 0.6576495319604874, + "num_tokens": 2684654860.0, + "step": 15999 + }, + { + "entropy": 1.68729371825854, + "epoch": 1.7576831177391448, + "grad_norm": 0.6778021454811096, + "learning_rate": 2.6848544538496708e-06, + "loss": 1.3707, + "mean_token_accuracy": 0.6608982980251312, + "num_tokens": 2684843261.0, + "step": 16000 + }, + { + "entropy": 1.71370596686999, + "epoch": 1.7577929746505176, + "grad_norm": 0.6387611031532288, + "learning_rate": 2.6842419586522438e-06, + "loss": 1.4439, + "mean_token_accuracy": 0.6582885235548019, + "num_tokens": 2684991799.0, + "step": 16001 + }, + { + "entropy": 1.7273413042227428, + "epoch": 1.7579028315618905, + "grad_norm": 0.6538326144218445, + "learning_rate": 2.6836297266448132e-06, + "loss": 1.4684, + "mean_token_accuracy": 0.6439621796210607, + "num_tokens": 2685157892.0, + "step": 16002 + }, + { + "entropy": 1.6686415771643321, + "epoch": 1.7580126884732636, + "grad_norm": 0.7002345323562622, + "learning_rate": 2.6830177578467538e-06, + "loss": 1.2251, + "mean_token_accuracy": 0.6772776246070862, + "num_tokens": 2685281540.0, + "step": 16003 + }, + { + "entropy": 1.7180415491263072, + "epoch": 1.7581225453846365, + "grad_norm": 0.6955657601356506, + "learning_rate": 2.6824060522774324e-06, + "loss": 1.3035, + "mean_token_accuracy": 0.6689807226260504, + "num_tokens": 2685410847.0, + "step": 16004 + }, + { + "entropy": 1.7041480839252472, + "epoch": 1.7582324022960094, + "grad_norm": 0.7212827205657959, + "learning_rate": 2.6817946099562144e-06, + "loss": 1.4209, + "mean_token_accuracy": 0.6546263992786407, + "num_tokens": 2685550734.0, + "step": 16005 + }, + { + "entropy": 1.7181178629398346, + "epoch": 1.7583422592073825, + "grad_norm": 0.927869439125061, + "learning_rate": 2.6811834309024464e-06, + "loss": 1.5715, + "mean_token_accuracy": 0.6469027449687322, + "num_tokens": 2685780924.0, + "step": 16006 + }, + { + "entropy": 1.6185656785964966, + "epoch": 1.7584521161187552, + "grad_norm": 0.8028410077095032, + "learning_rate": 2.6805725151354767e-06, + "loss": 1.3453, + "mean_token_accuracy": 0.6656514505545298, + "num_tokens": 2685995927.0, + "step": 16007 + }, + { + "entropy": 1.7140028874079387, + "epoch": 1.7585619730301283, + "grad_norm": 0.5886359214782715, + "learning_rate": 2.6799618626746373e-06, + "loss": 1.3963, + "mean_token_accuracy": 0.6488246818383535, + "num_tokens": 2686183571.0, + "step": 16008 + }, + { + "entropy": 1.7340856492519379, + "epoch": 1.7586718299415012, + "grad_norm": 0.7170657515525818, + "learning_rate": 2.679351473539254e-06, + "loss": 1.3313, + "mean_token_accuracy": 0.6784360110759735, + "num_tokens": 2686339779.0, + "step": 16009 + }, + { + "entropy": 1.7010501722494762, + "epoch": 1.758781686852874, + "grad_norm": 0.708954930305481, + "learning_rate": 2.678741347748649e-06, + "loss": 1.3835, + "mean_token_accuracy": 0.6587957243124644, + "num_tokens": 2686483449.0, + "step": 16010 + }, + { + "entropy": 1.7321734031041462, + "epoch": 1.7588915437642472, + "grad_norm": 0.850824773311615, + "learning_rate": 2.6781314853221317e-06, + "loss": 1.2767, + "mean_token_accuracy": 0.6718885898590088, + "num_tokens": 2686619532.0, + "step": 16011 + }, + { + "entropy": 1.7261533737182617, + "epoch": 1.75900140067562, + "grad_norm": 0.6615371704101562, + "learning_rate": 2.677521886279e-06, + "loss": 1.3841, + "mean_token_accuracy": 0.657954066991806, + "num_tokens": 2686772792.0, + "step": 16012 + }, + { + "entropy": 1.6760503153006236, + "epoch": 1.759111257586993, + "grad_norm": 0.6881850957870483, + "learning_rate": 2.676912550638553e-06, + "loss": 1.317, + "mean_token_accuracy": 0.6734907428423563, + "num_tokens": 2686959332.0, + "step": 16013 + }, + { + "entropy": 1.7255980670452118, + "epoch": 1.7592211144983658, + "grad_norm": 0.8105875849723816, + "learning_rate": 2.6763034784200714e-06, + "loss": 1.3294, + "mean_token_accuracy": 0.6656341602404913, + "num_tokens": 2687089986.0, + "step": 16014 + }, + { + "entropy": 1.7051123281319935, + "epoch": 1.7593309714097387, + "grad_norm": 0.7158269882202148, + "learning_rate": 2.675694669642835e-06, + "loss": 1.3558, + "mean_token_accuracy": 0.6500856876373291, + "num_tokens": 2687263913.0, + "step": 16015 + }, + { + "entropy": 1.632367382446925, + "epoch": 1.7594408283211118, + "grad_norm": 0.6158329844474792, + "learning_rate": 2.6750861243261116e-06, + "loss": 1.4107, + "mean_token_accuracy": 0.6531671682993571, + "num_tokens": 2687432094.0, + "step": 16016 + }, + { + "entropy": 1.64411657055219, + "epoch": 1.7595506852324847, + "grad_norm": 0.7069551348686218, + "learning_rate": 2.6744778424891593e-06, + "loss": 1.2452, + "mean_token_accuracy": 0.681958943605423, + "num_tokens": 2687545345.0, + "step": 16017 + }, + { + "entropy": 1.7302074233690898, + "epoch": 1.7596605421438576, + "grad_norm": 0.922947108745575, + "learning_rate": 2.673869824151233e-06, + "loss": 1.411, + "mean_token_accuracy": 0.6597683926423391, + "num_tokens": 2687702086.0, + "step": 16018 + }, + { + "entropy": 1.6862739821275075, + "epoch": 1.7597703990552307, + "grad_norm": 0.611757218837738, + "learning_rate": 2.6732620693315747e-06, + "loss": 1.3634, + "mean_token_accuracy": 0.6643515825271606, + "num_tokens": 2687873391.0, + "step": 16019 + }, + { + "entropy": 1.6990788380304973, + "epoch": 1.7598802559666034, + "grad_norm": 0.6882119178771973, + "learning_rate": 2.672654578049421e-06, + "loss": 1.6048, + "mean_token_accuracy": 0.6338710337877274, + "num_tokens": 2688115401.0, + "step": 16020 + }, + { + "entropy": 1.762443095445633, + "epoch": 1.7599901128779765, + "grad_norm": 0.6838318109512329, + "learning_rate": 2.6720473503239965e-06, + "loss": 1.4519, + "mean_token_accuracy": 0.6481083780527115, + "num_tokens": 2688307828.0, + "step": 16021 + }, + { + "entropy": 1.6223317682743073, + "epoch": 1.7600999697893494, + "grad_norm": 0.7780981063842773, + "learning_rate": 2.67144038617452e-06, + "loss": 1.2702, + "mean_token_accuracy": 0.6707681715488434, + "num_tokens": 2688478830.0, + "step": 16022 + }, + { + "entropy": 1.7924179633458455, + "epoch": 1.7602098267007222, + "grad_norm": 0.693706214427948, + "learning_rate": 2.670833685620204e-06, + "loss": 1.4936, + "mean_token_accuracy": 0.6311075091362, + "num_tokens": 2688628557.0, + "step": 16023 + }, + { + "entropy": 1.7252692580223083, + "epoch": 1.7603196836120953, + "grad_norm": 0.8025743365287781, + "learning_rate": 2.6702272486802467e-06, + "loss": 1.6267, + "mean_token_accuracy": 0.6387341618537903, + "num_tokens": 2688872676.0, + "step": 16024 + }, + { + "entropy": 1.6703723271687825, + "epoch": 1.7604295405234682, + "grad_norm": 0.7961321473121643, + "learning_rate": 2.669621075373845e-06, + "loss": 1.344, + "mean_token_accuracy": 0.6630989263455073, + "num_tokens": 2689041194.0, + "step": 16025 + }, + { + "entropy": 1.72029647231102, + "epoch": 1.7605393974348411, + "grad_norm": 0.6698374152183533, + "learning_rate": 2.6690151657201813e-06, + "loss": 1.3747, + "mean_token_accuracy": 0.6576948761940002, + "num_tokens": 2689249040.0, + "step": 16026 + }, + { + "entropy": 1.7233352561791737, + "epoch": 1.7606492543462142, + "grad_norm": 0.7270263433456421, + "learning_rate": 2.668409519738434e-06, + "loss": 1.5737, + "mean_token_accuracy": 0.6292213350534439, + "num_tokens": 2689469853.0, + "step": 16027 + }, + { + "entropy": 1.716443419456482, + "epoch": 1.7607591112575869, + "grad_norm": 0.7445971965789795, + "learning_rate": 2.667804137447772e-06, + "loss": 1.3767, + "mean_token_accuracy": 0.6535246272881826, + "num_tokens": 2689621005.0, + "step": 16028 + }, + { + "entropy": 1.7172259191672008, + "epoch": 1.76086896816896, + "grad_norm": 0.6567142009735107, + "learning_rate": 2.6671990188673534e-06, + "loss": 1.4442, + "mean_token_accuracy": 0.6565740207831064, + "num_tokens": 2689811122.0, + "step": 16029 + }, + { + "entropy": 1.7025707860787709, + "epoch": 1.7609788250803329, + "grad_norm": 0.6175838112831116, + "learning_rate": 2.666594164016331e-06, + "loss": 1.4574, + "mean_token_accuracy": 0.6463165481885275, + "num_tokens": 2689987343.0, + "step": 16030 + }, + { + "entropy": 1.7212122082710266, + "epoch": 1.7610886819917058, + "grad_norm": 0.7046975493431091, + "learning_rate": 2.6659895729138506e-06, + "loss": 1.4979, + "mean_token_accuracy": 0.6492815067370733, + "num_tokens": 2690161731.0, + "step": 16031 + }, + { + "entropy": 1.7542727986971538, + "epoch": 1.7611985389030789, + "grad_norm": 0.8798859119415283, + "learning_rate": 2.665385245579042e-06, + "loss": 1.427, + "mean_token_accuracy": 0.666147435704867, + "num_tokens": 2690346545.0, + "step": 16032 + }, + { + "entropy": 1.7387540936470032, + "epoch": 1.7613083958144515, + "grad_norm": 0.7441887855529785, + "learning_rate": 2.6647811820310345e-06, + "loss": 1.3141, + "mean_token_accuracy": 0.6612844069798788, + "num_tokens": 2690447761.0, + "step": 16033 + }, + { + "entropy": 1.6950391431649525, + "epoch": 1.7614182527258246, + "grad_norm": 0.5990413427352905, + "learning_rate": 2.664177382288948e-06, + "loss": 1.4014, + "mean_token_accuracy": 0.6514792641003927, + "num_tokens": 2690622453.0, + "step": 16034 + }, + { + "entropy": 1.6765986780325572, + "epoch": 1.7615281096371975, + "grad_norm": 0.7618458271026611, + "learning_rate": 2.6635738463718907e-06, + "loss": 1.2839, + "mean_token_accuracy": 0.6652130633592606, + "num_tokens": 2690754415.0, + "step": 16035 + }, + { + "entropy": 1.6814166605472565, + "epoch": 1.7616379665485704, + "grad_norm": 0.745574951171875, + "learning_rate": 2.662970574298964e-06, + "loss": 1.2076, + "mean_token_accuracy": 0.6819567829370499, + "num_tokens": 2690921404.0, + "step": 16036 + }, + { + "entropy": 1.702602465947469, + "epoch": 1.7617478234599435, + "grad_norm": 0.6456969380378723, + "learning_rate": 2.6623675660892646e-06, + "loss": 1.3397, + "mean_token_accuracy": 0.6715992788473765, + "num_tokens": 2691076141.0, + "step": 16037 + }, + { + "entropy": 1.697231650352478, + "epoch": 1.7618576803713164, + "grad_norm": 0.6837537884712219, + "learning_rate": 2.661764821761871e-06, + "loss": 1.3611, + "mean_token_accuracy": 0.662015880147616, + "num_tokens": 2691223815.0, + "step": 16038 + }, + { + "entropy": 1.759251356124878, + "epoch": 1.7619675372826893, + "grad_norm": 0.714501142501831, + "learning_rate": 2.6611623413358656e-06, + "loss": 1.4554, + "mean_token_accuracy": 0.6524814665317535, + "num_tokens": 2691356249.0, + "step": 16039 + }, + { + "entropy": 1.7324928243954976, + "epoch": 1.7620773941940624, + "grad_norm": 0.6178767681121826, + "learning_rate": 2.6605601248303152e-06, + "loss": 1.3647, + "mean_token_accuracy": 0.6701581329107285, + "num_tokens": 2691553264.0, + "step": 16040 + }, + { + "entropy": 1.6730826298395793, + "epoch": 1.762187251105435, + "grad_norm": 0.7592617869377136, + "learning_rate": 2.6599581722642762e-06, + "loss": 1.3098, + "mean_token_accuracy": 0.6677819540103277, + "num_tokens": 2691739385.0, + "step": 16041 + }, + { + "entropy": 1.6866790254910786, + "epoch": 1.7622971080168082, + "grad_norm": 0.6936041712760925, + "learning_rate": 2.6593564836568047e-06, + "loss": 1.3359, + "mean_token_accuracy": 0.6542757352193197, + "num_tokens": 2691857897.0, + "step": 16042 + }, + { + "entropy": 1.6914058128992717, + "epoch": 1.762406964928181, + "grad_norm": 0.5658572316169739, + "learning_rate": 2.658755059026944e-06, + "loss": 1.384, + "mean_token_accuracy": 0.644287516673406, + "num_tokens": 2692062336.0, + "step": 16043 + }, + { + "entropy": 1.7410341103871663, + "epoch": 1.762516821839554, + "grad_norm": 0.6581618785858154, + "learning_rate": 2.6581538983937243e-06, + "loss": 1.3614, + "mean_token_accuracy": 0.6670055588086446, + "num_tokens": 2692198745.0, + "step": 16044 + }, + { + "entropy": 1.6533363958199818, + "epoch": 1.762626678750927, + "grad_norm": 0.6615976095199585, + "learning_rate": 2.657553001776175e-06, + "loss": 1.394, + "mean_token_accuracy": 0.6650797625382742, + "num_tokens": 2692378904.0, + "step": 16045 + }, + { + "entropy": 1.6993359824021657, + "epoch": 1.7627365356622997, + "grad_norm": 0.6522985100746155, + "learning_rate": 2.6569523691933154e-06, + "loss": 1.4219, + "mean_token_accuracy": 0.6460706889629364, + "num_tokens": 2692560029.0, + "step": 16046 + }, + { + "entropy": 1.7435101469357808, + "epoch": 1.7628463925736728, + "grad_norm": 0.678107500076294, + "learning_rate": 2.656352000664153e-06, + "loss": 1.4076, + "mean_token_accuracy": 0.6516735653082529, + "num_tokens": 2692749415.0, + "step": 16047 + }, + { + "entropy": 1.7272218664487202, + "epoch": 1.7629562494850457, + "grad_norm": 0.6166483759880066, + "learning_rate": 2.6557518962076896e-06, + "loss": 1.3556, + "mean_token_accuracy": 0.6620925267537435, + "num_tokens": 2692890906.0, + "step": 16048 + }, + { + "entropy": 1.7328097025553386, + "epoch": 1.7630661063964186, + "grad_norm": 1.0833711624145508, + "learning_rate": 2.65515205584292e-06, + "loss": 1.4331, + "mean_token_accuracy": 0.6583366692066193, + "num_tokens": 2693060492.0, + "step": 16049 + }, + { + "entropy": 1.6878051559130351, + "epoch": 1.7631759633077917, + "grad_norm": 0.694952666759491, + "learning_rate": 2.654552479588826e-06, + "loss": 1.3545, + "mean_token_accuracy": 0.6597214490175247, + "num_tokens": 2693240071.0, + "step": 16050 + }, + { + "entropy": 1.7286501228809357, + "epoch": 1.7632858202191646, + "grad_norm": 0.7728515267372131, + "learning_rate": 2.653953167464387e-06, + "loss": 1.3724, + "mean_token_accuracy": 0.6499157945315043, + "num_tokens": 2693361966.0, + "step": 16051 + }, + { + "entropy": 1.713899165391922, + "epoch": 1.7633956771305375, + "grad_norm": 0.7397400736808777, + "learning_rate": 2.653354119488568e-06, + "loss": 1.5974, + "mean_token_accuracy": 0.6266492505868276, + "num_tokens": 2693532391.0, + "step": 16052 + }, + { + "entropy": 1.7083103656768799, + "epoch": 1.7635055340419106, + "grad_norm": 0.7127790451049805, + "learning_rate": 2.65275533568033e-06, + "loss": 1.3755, + "mean_token_accuracy": 0.665579229593277, + "num_tokens": 2693713801.0, + "step": 16053 + }, + { + "entropy": 1.6606411933898926, + "epoch": 1.7636153909532832, + "grad_norm": 0.6641468405723572, + "learning_rate": 2.6521568160586247e-06, + "loss": 1.3632, + "mean_token_accuracy": 0.6683401316404343, + "num_tokens": 2693884118.0, + "step": 16054 + }, + { + "entropy": 1.714243769645691, + "epoch": 1.7637252478646563, + "grad_norm": 0.7927217483520508, + "learning_rate": 2.651558560642397e-06, + "loss": 1.5081, + "mean_token_accuracy": 0.6555715998013815, + "num_tokens": 2694061884.0, + "step": 16055 + }, + { + "entropy": 1.6633921265602112, + "epoch": 1.7638351047760292, + "grad_norm": 0.7348926663398743, + "learning_rate": 2.650960569450576e-06, + "loss": 1.4212, + "mean_token_accuracy": 0.6668934176365534, + "num_tokens": 2694230992.0, + "step": 16056 + }, + { + "entropy": 1.5766185522079468, + "epoch": 1.7639449616874021, + "grad_norm": 0.6212765574455261, + "learning_rate": 2.65036284250209e-06, + "loss": 1.5426, + "mean_token_accuracy": 0.6506764938433965, + "num_tokens": 2694462977.0, + "step": 16057 + }, + { + "entropy": 1.6853082577387493, + "epoch": 1.7640548185987752, + "grad_norm": 0.6269177794456482, + "learning_rate": 2.64976537981586e-06, + "loss": 1.4271, + "mean_token_accuracy": 0.6474867115418116, + "num_tokens": 2694663179.0, + "step": 16058 + }, + { + "entropy": 1.714323361714681, + "epoch": 1.7641646755101479, + "grad_norm": 0.6898490190505981, + "learning_rate": 2.6491681814107933e-06, + "loss": 1.2853, + "mean_token_accuracy": 0.6686572035153707, + "num_tokens": 2694814522.0, + "step": 16059 + }, + { + "entropy": 1.7176215052604675, + "epoch": 1.764274532421521, + "grad_norm": 0.6587092280387878, + "learning_rate": 2.6485712473057886e-06, + "loss": 1.3003, + "mean_token_accuracy": 0.6659404089053472, + "num_tokens": 2694951281.0, + "step": 16060 + }, + { + "entropy": 1.6764805714289348, + "epoch": 1.7643843893328939, + "grad_norm": 0.6627944707870483, + "learning_rate": 2.647974577519742e-06, + "loss": 1.3495, + "mean_token_accuracy": 0.6571897814671198, + "num_tokens": 2695107412.0, + "step": 16061 + }, + { + "entropy": 1.6623700261116028, + "epoch": 1.7644942462442668, + "grad_norm": 0.7114543318748474, + "learning_rate": 2.647378172071535e-06, + "loss": 1.3969, + "mean_token_accuracy": 0.651480957865715, + "num_tokens": 2695289663.0, + "step": 16062 + }, + { + "entropy": 1.7313786645730336, + "epoch": 1.7646041031556399, + "grad_norm": 0.6407042741775513, + "learning_rate": 2.6467820309800472e-06, + "loss": 1.4069, + "mean_token_accuracy": 0.6487823029359182, + "num_tokens": 2695500927.0, + "step": 16063 + }, + { + "entropy": 1.728273371855418, + "epoch": 1.7647139600670128, + "grad_norm": 0.677616536617279, + "learning_rate": 2.646186154264143e-06, + "loss": 1.2814, + "mean_token_accuracy": 0.6799236685037613, + "num_tokens": 2695643975.0, + "step": 16064 + }, + { + "entropy": 1.7222307622432709, + "epoch": 1.7648238169783856, + "grad_norm": 0.6702946424484253, + "learning_rate": 2.645590541942683e-06, + "loss": 1.4948, + "mean_token_accuracy": 0.6496520837148031, + "num_tokens": 2695802990.0, + "step": 16065 + }, + { + "entropy": 1.7068528135617573, + "epoch": 1.7649336738897587, + "grad_norm": 0.8094030022621155, + "learning_rate": 2.6449951940345164e-06, + "loss": 1.6127, + "mean_token_accuracy": 0.6310140788555145, + "num_tokens": 2696010181.0, + "step": 16066 + }, + { + "entropy": 1.6725980242093403, + "epoch": 1.7650435308011314, + "grad_norm": 0.8047062158584595, + "learning_rate": 2.6444001105584897e-06, + "loss": 1.3926, + "mean_token_accuracy": 0.6618801603714625, + "num_tokens": 2696173600.0, + "step": 16067 + }, + { + "entropy": 1.6676512161890666, + "epoch": 1.7651533877125045, + "grad_norm": 0.6256596446037292, + "learning_rate": 2.643805291533433e-06, + "loss": 1.311, + "mean_token_accuracy": 0.6845296223958334, + "num_tokens": 2696321443.0, + "step": 16068 + }, + { + "entropy": 1.7186016937096913, + "epoch": 1.7652632446238774, + "grad_norm": 0.6340106725692749, + "learning_rate": 2.643210736978173e-06, + "loss": 1.3346, + "mean_token_accuracy": 0.6681850502888361, + "num_tokens": 2696498399.0, + "step": 16069 + }, + { + "entropy": 1.684872140487035, + "epoch": 1.7653731015352503, + "grad_norm": 0.7013046145439148, + "learning_rate": 2.6426164469115274e-06, + "loss": 1.4155, + "mean_token_accuracy": 0.6384324083725611, + "num_tokens": 2696669061.0, + "step": 16070 + }, + { + "entropy": 1.6660610735416412, + "epoch": 1.7654829584466234, + "grad_norm": 0.7147235870361328, + "learning_rate": 2.6420224213523066e-06, + "loss": 1.2383, + "mean_token_accuracy": 0.6764589746793112, + "num_tokens": 2696830878.0, + "step": 16071 + }, + { + "entropy": 1.6860030889511108, + "epoch": 1.765592815357996, + "grad_norm": 0.6619150042533875, + "learning_rate": 2.6414286603193094e-06, + "loss": 1.5488, + "mean_token_accuracy": 0.640265941619873, + "num_tokens": 2697002297.0, + "step": 16072 + }, + { + "entropy": 1.7264870901902516, + "epoch": 1.7657026722693692, + "grad_norm": 0.7539317011833191, + "learning_rate": 2.6408351638313272e-06, + "loss": 1.3083, + "mean_token_accuracy": 0.6663891822099686, + "num_tokens": 2697134172.0, + "step": 16073 + }, + { + "entropy": 1.7615606784820557, + "epoch": 1.765812529180742, + "grad_norm": 0.6778137683868408, + "learning_rate": 2.6402419319071463e-06, + "loss": 1.5129, + "mean_token_accuracy": 0.6431242475907007, + "num_tokens": 2697306424.0, + "step": 16074 + }, + { + "entropy": 1.6955093443393707, + "epoch": 1.765922386092115, + "grad_norm": 0.7733752131462097, + "learning_rate": 2.639648964565542e-06, + "loss": 1.3358, + "mean_token_accuracy": 0.6611419717470804, + "num_tokens": 2697441605.0, + "step": 16075 + }, + { + "entropy": 1.6361872255802155, + "epoch": 1.766032243003488, + "grad_norm": 0.6613511443138123, + "learning_rate": 2.6390562618252806e-06, + "loss": 1.1672, + "mean_token_accuracy": 0.6900085906187693, + "num_tokens": 2697553650.0, + "step": 16076 + }, + { + "entropy": 1.71124001344045, + "epoch": 1.766142099914861, + "grad_norm": 0.7892715334892273, + "learning_rate": 2.6384638237051198e-06, + "loss": 1.3934, + "mean_token_accuracy": 0.6527740309635798, + "num_tokens": 2697688469.0, + "step": 16077 + }, + { + "entropy": 1.6695888042449951, + "epoch": 1.7662519568262338, + "grad_norm": 0.6769923567771912, + "learning_rate": 2.637871650223812e-06, + "loss": 1.3749, + "mean_token_accuracy": 0.6554907162984213, + "num_tokens": 2697888006.0, + "step": 16078 + }, + { + "entropy": 1.6960602402687073, + "epoch": 1.766361813737607, + "grad_norm": 0.6325769424438477, + "learning_rate": 2.6372797414000996e-06, + "loss": 1.483, + "mean_token_accuracy": 0.6564978261788686, + "num_tokens": 2698060704.0, + "step": 16079 + }, + { + "entropy": 1.715209275484085, + "epoch": 1.7664716706489796, + "grad_norm": 0.6917846202850342, + "learning_rate": 2.636688097252713e-06, + "loss": 1.2256, + "mean_token_accuracy": 0.6857384641965231, + "num_tokens": 2698175996.0, + "step": 16080 + }, + { + "entropy": 1.6636294424533844, + "epoch": 1.7665815275603527, + "grad_norm": 0.6673606634140015, + "learning_rate": 2.636096717800381e-06, + "loss": 1.486, + "mean_token_accuracy": 0.6630779206752777, + "num_tokens": 2698337011.0, + "step": 16081 + }, + { + "entropy": 1.6239332656065624, + "epoch": 1.7666913844717256, + "grad_norm": 0.8185363411903381, + "learning_rate": 2.6355056030618166e-06, + "loss": 1.2305, + "mean_token_accuracy": 0.6870084901650747, + "num_tokens": 2698479373.0, + "step": 16082 + }, + { + "entropy": 1.7032920519510906, + "epoch": 1.7668012413830985, + "grad_norm": 0.6755743622779846, + "learning_rate": 2.6349147530557327e-06, + "loss": 1.4925, + "mean_token_accuracy": 0.6550940821568171, + "num_tokens": 2698614324.0, + "step": 16083 + }, + { + "entropy": 1.6856786410013835, + "epoch": 1.7669110982944716, + "grad_norm": 0.7225176095962524, + "learning_rate": 2.6343241678008286e-06, + "loss": 1.2452, + "mean_token_accuracy": 0.6743053098519644, + "num_tokens": 2698770765.0, + "step": 16084 + }, + { + "entropy": 1.716284801562627, + "epoch": 1.7670209552058442, + "grad_norm": 0.6630404591560364, + "learning_rate": 2.6337338473157925e-06, + "loss": 1.4324, + "mean_token_accuracy": 0.648807222644488, + "num_tokens": 2698934174.0, + "step": 16085 + }, + { + "entropy": 1.705119530359904, + "epoch": 1.7671308121172173, + "grad_norm": 0.681877613067627, + "learning_rate": 2.633143791619311e-06, + "loss": 1.2931, + "mean_token_accuracy": 0.6711487770080566, + "num_tokens": 2699088905.0, + "step": 16086 + }, + { + "entropy": 1.6811328033606212, + "epoch": 1.7672406690285902, + "grad_norm": 0.6792789697647095, + "learning_rate": 2.6325540007300585e-06, + "loss": 1.4651, + "mean_token_accuracy": 0.6474873671929041, + "num_tokens": 2699230587.0, + "step": 16087 + }, + { + "entropy": 1.6984275380770366, + "epoch": 1.7673505259399631, + "grad_norm": 0.7023369669914246, + "learning_rate": 2.631964474666702e-06, + "loss": 1.412, + "mean_token_accuracy": 0.6622547606627146, + "num_tokens": 2699415668.0, + "step": 16088 + }, + { + "entropy": 1.7122309307257335, + "epoch": 1.7674603828513362, + "grad_norm": 0.614011287689209, + "learning_rate": 2.631375213447898e-06, + "loss": 1.328, + "mean_token_accuracy": 0.6657908707857132, + "num_tokens": 2699572735.0, + "step": 16089 + }, + { + "entropy": 1.6805100739002228, + "epoch": 1.767570239762709, + "grad_norm": 0.7636834979057312, + "learning_rate": 2.6307862170922992e-06, + "loss": 1.3451, + "mean_token_accuracy": 0.6637326081593832, + "num_tokens": 2699790891.0, + "step": 16090 + }, + { + "entropy": 1.7203999757766724, + "epoch": 1.767680096674082, + "grad_norm": 0.8896521329879761, + "learning_rate": 2.630197485618544e-06, + "loss": 1.6256, + "mean_token_accuracy": 0.6291612386703491, + "num_tokens": 2699997296.0, + "step": 16091 + }, + { + "entropy": 1.715426633755366, + "epoch": 1.767789953585455, + "grad_norm": 0.6982161402702332, + "learning_rate": 2.629609019045267e-06, + "loss": 1.6772, + "mean_token_accuracy": 0.6193340122699738, + "num_tokens": 2700178860.0, + "step": 16092 + }, + { + "entropy": 1.696618139743805, + "epoch": 1.7678998104968278, + "grad_norm": 0.6929605603218079, + "learning_rate": 2.6290208173910935e-06, + "loss": 1.3606, + "mean_token_accuracy": 0.6675488402446111, + "num_tokens": 2700334161.0, + "step": 16093 + }, + { + "entropy": 1.7300164600213368, + "epoch": 1.7680096674082009, + "grad_norm": 0.6754699945449829, + "learning_rate": 2.628432880674637e-06, + "loss": 1.4199, + "mean_token_accuracy": 0.6655618896087011, + "num_tokens": 2700482064.0, + "step": 16094 + }, + { + "entropy": 1.7005921800931294, + "epoch": 1.7681195243195738, + "grad_norm": 0.7460622191429138, + "learning_rate": 2.6278452089145107e-06, + "loss": 1.3045, + "mean_token_accuracy": 0.6671140988667806, + "num_tokens": 2700620075.0, + "step": 16095 + }, + { + "entropy": 1.6499681274096172, + "epoch": 1.7682293812309466, + "grad_norm": 0.6865026354789734, + "learning_rate": 2.627257802129309e-06, + "loss": 1.3658, + "mean_token_accuracy": 0.6599505941073099, + "num_tokens": 2700819470.0, + "step": 16096 + }, + { + "entropy": 1.6812036136786144, + "epoch": 1.7683392381423197, + "grad_norm": 0.6698898673057556, + "learning_rate": 2.6266706603376244e-06, + "loss": 1.4634, + "mean_token_accuracy": 0.6481799880663554, + "num_tokens": 2701000116.0, + "step": 16097 + }, + { + "entropy": 1.742706725994746, + "epoch": 1.7684490950536924, + "grad_norm": 0.7475796937942505, + "learning_rate": 2.62608378355804e-06, + "loss": 1.4385, + "mean_token_accuracy": 0.6343745936950048, + "num_tokens": 2701199955.0, + "step": 16098 + }, + { + "entropy": 1.7251374125480652, + "epoch": 1.7685589519650655, + "grad_norm": 0.6364467144012451, + "learning_rate": 2.6254971718091326e-06, + "loss": 1.2861, + "mean_token_accuracy": 0.6751857052246729, + "num_tokens": 2701376853.0, + "step": 16099 + }, + { + "entropy": 1.7397385934988658, + "epoch": 1.7686688088764384, + "grad_norm": 0.6278882026672363, + "learning_rate": 2.624910825109466e-06, + "loss": 1.4487, + "mean_token_accuracy": 0.6475135733683904, + "num_tokens": 2701541159.0, + "step": 16100 + }, + { + "entropy": 1.5949292679627736, + "epoch": 1.7687786657878113, + "grad_norm": 0.5723445415496826, + "learning_rate": 2.6243247434775967e-06, + "loss": 1.3228, + "mean_token_accuracy": 0.6649216016133627, + "num_tokens": 2701731184.0, + "step": 16101 + }, + { + "entropy": 1.6799314518769581, + "epoch": 1.7688885226991844, + "grad_norm": 0.6339821219444275, + "learning_rate": 2.623738926932075e-06, + "loss": 1.255, + "mean_token_accuracy": 0.6732438405354818, + "num_tokens": 2701853054.0, + "step": 16102 + }, + { + "entropy": 1.7200807829697926, + "epoch": 1.7689983796105573, + "grad_norm": 0.8305505514144897, + "learning_rate": 2.6231533754914435e-06, + "loss": 1.3544, + "mean_token_accuracy": 0.6595542430877686, + "num_tokens": 2701983290.0, + "step": 16103 + }, + { + "entropy": 1.6731150647004445, + "epoch": 1.7691082365219302, + "grad_norm": 0.7634748220443726, + "learning_rate": 2.6225680891742307e-06, + "loss": 1.5671, + "mean_token_accuracy": 0.642717699209849, + "num_tokens": 2702184992.0, + "step": 16104 + }, + { + "entropy": 1.7635932167371113, + "epoch": 1.7692180934333033, + "grad_norm": 0.6697694659233093, + "learning_rate": 2.6219830679989645e-06, + "loss": 1.4884, + "mean_token_accuracy": 0.6513105779886246, + "num_tokens": 2702325974.0, + "step": 16105 + }, + { + "entropy": 1.7982784907023113, + "epoch": 1.769327950344676, + "grad_norm": 0.8669276237487793, + "learning_rate": 2.6213983119841573e-06, + "loss": 1.5692, + "mean_token_accuracy": 0.6412549217542013, + "num_tokens": 2702471856.0, + "step": 16106 + }, + { + "entropy": 1.7105421125888824, + "epoch": 1.769437807256049, + "grad_norm": 0.6940631866455078, + "learning_rate": 2.6208138211483193e-06, + "loss": 1.4021, + "mean_token_accuracy": 0.6522999107837677, + "num_tokens": 2702631817.0, + "step": 16107 + }, + { + "entropy": 1.667997380097707, + "epoch": 1.769547664167422, + "grad_norm": 0.6804345846176147, + "learning_rate": 2.6202295955099484e-06, + "loss": 1.3276, + "mean_token_accuracy": 0.6740523924430212, + "num_tokens": 2702779392.0, + "step": 16108 + }, + { + "entropy": 1.764988124370575, + "epoch": 1.7696575210787948, + "grad_norm": 0.5990316271781921, + "learning_rate": 2.6196456350875336e-06, + "loss": 1.5235, + "mean_token_accuracy": 0.6251028428475062, + "num_tokens": 2703003157.0, + "step": 16109 + }, + { + "entropy": 1.712211827437083, + "epoch": 1.769767377990168, + "grad_norm": 0.8300401568412781, + "learning_rate": 2.619061939899558e-06, + "loss": 1.2377, + "mean_token_accuracy": 0.6737738301356634, + "num_tokens": 2703156071.0, + "step": 16110 + }, + { + "entropy": 1.732935518026352, + "epoch": 1.7698772349015406, + "grad_norm": 0.7982600331306458, + "learning_rate": 2.618478509964498e-06, + "loss": 1.3812, + "mean_token_accuracy": 0.6614757974942526, + "num_tokens": 2703317575.0, + "step": 16111 + }, + { + "entropy": 1.6523981094360352, + "epoch": 1.7699870918129137, + "grad_norm": 0.6973866820335388, + "learning_rate": 2.6178953453008143e-06, + "loss": 1.4013, + "mean_token_accuracy": 0.6528991907835007, + "num_tokens": 2703469385.0, + "step": 16112 + }, + { + "entropy": 1.6980741024017334, + "epoch": 1.7700969487242866, + "grad_norm": 0.9420611262321472, + "learning_rate": 2.6173124459269654e-06, + "loss": 1.5222, + "mean_token_accuracy": 0.6804485072692236, + "num_tokens": 2703634520.0, + "step": 16113 + }, + { + "entropy": 1.67299422621727, + "epoch": 1.7702068056356595, + "grad_norm": 0.6318880319595337, + "learning_rate": 2.616729811861402e-06, + "loss": 1.3287, + "mean_token_accuracy": 0.6642769277095795, + "num_tokens": 2703787272.0, + "step": 16114 + }, + { + "entropy": 1.7164417207241058, + "epoch": 1.7703166625470326, + "grad_norm": 0.788582444190979, + "learning_rate": 2.6161474431225624e-06, + "loss": 1.3569, + "mean_token_accuracy": 0.669378658135732, + "num_tokens": 2703907662.0, + "step": 16115 + }, + { + "entropy": 1.7314981818199158, + "epoch": 1.7704265194584055, + "grad_norm": 0.6980270743370056, + "learning_rate": 2.6155653397288762e-06, + "loss": 1.3667, + "mean_token_accuracy": 0.6507422377665838, + "num_tokens": 2704058988.0, + "step": 16116 + }, + { + "entropy": 1.7171373665332794, + "epoch": 1.7705363763697783, + "grad_norm": 0.7046215534210205, + "learning_rate": 2.61498350169877e-06, + "loss": 1.3032, + "mean_token_accuracy": 0.6671566814184189, + "num_tokens": 2704223794.0, + "step": 16117 + }, + { + "entropy": 1.6645724177360535, + "epoch": 1.7706462332811514, + "grad_norm": 0.8890359997749329, + "learning_rate": 2.6144019290506577e-06, + "loss": 1.2717, + "mean_token_accuracy": 0.6672799090544382, + "num_tokens": 2704401039.0, + "step": 16118 + }, + { + "entropy": 1.7233166893323262, + "epoch": 1.7707560901925241, + "grad_norm": 0.5646775364875793, + "learning_rate": 2.613820621802947e-06, + "loss": 1.4646, + "mean_token_accuracy": 0.6415334989627203, + "num_tokens": 2704591999.0, + "step": 16119 + }, + { + "entropy": 1.6967370808124542, + "epoch": 1.7708659471038972, + "grad_norm": 0.9199461340904236, + "learning_rate": 2.613239579974034e-06, + "loss": 1.3411, + "mean_token_accuracy": 0.6616208553314209, + "num_tokens": 2704736795.0, + "step": 16120 + }, + { + "entropy": 1.6985019147396088, + "epoch": 1.77097580401527, + "grad_norm": 0.7342497110366821, + "learning_rate": 2.6126588035823074e-06, + "loss": 1.4852, + "mean_token_accuracy": 0.663700466354688, + "num_tokens": 2704877585.0, + "step": 16121 + }, + { + "entropy": 1.686661461989085, + "epoch": 1.771085660926643, + "grad_norm": 0.6621032357215881, + "learning_rate": 2.6120782926461514e-06, + "loss": 1.2625, + "mean_token_accuracy": 0.6748026907444, + "num_tokens": 2705010312.0, + "step": 16122 + }, + { + "entropy": 1.6965848008791606, + "epoch": 1.771195517838016, + "grad_norm": 0.7610173225402832, + "learning_rate": 2.6114980471839384e-06, + "loss": 1.3852, + "mean_token_accuracy": 0.6536916842063268, + "num_tokens": 2705237228.0, + "step": 16123 + }, + { + "entropy": 1.687073806921641, + "epoch": 1.7713053747493888, + "grad_norm": 0.7324576377868652, + "learning_rate": 2.6109180672140315e-06, + "loss": 1.3022, + "mean_token_accuracy": 0.6851969212293625, + "num_tokens": 2705407389.0, + "step": 16124 + }, + { + "entropy": 1.705436368783315, + "epoch": 1.7714152316607619, + "grad_norm": 0.6689654588699341, + "learning_rate": 2.6103383527547864e-06, + "loss": 1.2624, + "mean_token_accuracy": 0.675425186753273, + "num_tokens": 2705543178.0, + "step": 16125 + }, + { + "entropy": 1.6826893587907155, + "epoch": 1.7715250885721348, + "grad_norm": 0.6657409071922302, + "learning_rate": 2.6097589038245545e-06, + "loss": 1.3492, + "mean_token_accuracy": 0.655790776014328, + "num_tokens": 2705697833.0, + "step": 16126 + }, + { + "entropy": 1.7663909792900085, + "epoch": 1.7716349454835076, + "grad_norm": 0.7559472322463989, + "learning_rate": 2.609179720441672e-06, + "loss": 1.533, + "mean_token_accuracy": 0.6423116599520048, + "num_tokens": 2705889425.0, + "step": 16127 + }, + { + "entropy": 1.6723959843317668, + "epoch": 1.7717448023948807, + "grad_norm": 0.6863106489181519, + "learning_rate": 2.6086008026244704e-06, + "loss": 1.32, + "mean_token_accuracy": 0.675317257642746, + "num_tokens": 2706068340.0, + "step": 16128 + }, + { + "entropy": 1.735195557276408, + "epoch": 1.7718546593062536, + "grad_norm": 0.8114670515060425, + "learning_rate": 2.6080221503912707e-06, + "loss": 1.5294, + "mean_token_accuracy": 0.6455264935890833, + "num_tokens": 2706229929.0, + "step": 16129 + }, + { + "entropy": 1.7443922758102417, + "epoch": 1.7719645162176265, + "grad_norm": 0.7101374864578247, + "learning_rate": 2.6074437637603885e-06, + "loss": 1.2177, + "mean_token_accuracy": 0.6852958450714747, + "num_tokens": 2706361616.0, + "step": 16130 + }, + { + "entropy": 1.6811170478661854, + "epoch": 1.7720743731289996, + "grad_norm": 0.7076679468154907, + "learning_rate": 2.6068656427501303e-06, + "loss": 1.4677, + "mean_token_accuracy": 0.6564472218354543, + "num_tokens": 2706535078.0, + "step": 16131 + }, + { + "entropy": 1.746174544095993, + "epoch": 1.7721842300403723, + "grad_norm": 0.7166526913642883, + "learning_rate": 2.6062877873787933e-06, + "loss": 1.5574, + "mean_token_accuracy": 0.6369020914038023, + "num_tokens": 2706692845.0, + "step": 16132 + }, + { + "entropy": 1.7436832785606384, + "epoch": 1.7722940869517454, + "grad_norm": 0.920242428779602, + "learning_rate": 2.6057101976646633e-06, + "loss": 1.586, + "mean_token_accuracy": 0.6347026800115904, + "num_tokens": 2706888652.0, + "step": 16133 + }, + { + "entropy": 1.7304763694604237, + "epoch": 1.7724039438631183, + "grad_norm": 0.6323895454406738, + "learning_rate": 2.605132873626025e-06, + "loss": 1.5587, + "mean_token_accuracy": 0.6354440351327261, + "num_tokens": 2707071567.0, + "step": 16134 + }, + { + "entropy": 1.6805862685044606, + "epoch": 1.7725138007744912, + "grad_norm": 0.6497008204460144, + "learning_rate": 2.604555815281148e-06, + "loss": 1.2406, + "mean_token_accuracy": 0.6805828412373861, + "num_tokens": 2707219245.0, + "step": 16135 + }, + { + "entropy": 1.7129256625970204, + "epoch": 1.7726236576858643, + "grad_norm": 0.6977981925010681, + "learning_rate": 2.6039790226482956e-06, + "loss": 1.514, + "mean_token_accuracy": 0.6338231811920801, + "num_tokens": 2707400096.0, + "step": 16136 + }, + { + "entropy": 1.641619215408961, + "epoch": 1.772733514597237, + "grad_norm": 0.6210656762123108, + "learning_rate": 2.603402495745724e-06, + "loss": 1.3067, + "mean_token_accuracy": 0.6655979255835215, + "num_tokens": 2707577271.0, + "step": 16137 + }, + { + "entropy": 1.7035197516282399, + "epoch": 1.77284337150861, + "grad_norm": 0.7586587071418762, + "learning_rate": 2.6028262345916796e-06, + "loss": 1.2978, + "mean_token_accuracy": 0.6796625256538391, + "num_tokens": 2707744658.0, + "step": 16138 + }, + { + "entropy": 1.752359499533971, + "epoch": 1.772953228419983, + "grad_norm": 0.6705688834190369, + "learning_rate": 2.6022502392044023e-06, + "loss": 1.5081, + "mean_token_accuracy": 0.6395560602347056, + "num_tokens": 2707934539.0, + "step": 16139 + }, + { + "entropy": 1.7236380577087402, + "epoch": 1.7730630853313558, + "grad_norm": 0.709601104259491, + "learning_rate": 2.60167450960212e-06, + "loss": 1.5704, + "mean_token_accuracy": 0.6389553348223368, + "num_tokens": 2708079185.0, + "step": 16140 + }, + { + "entropy": 1.7358200351397197, + "epoch": 1.773172942242729, + "grad_norm": 0.7101724743843079, + "learning_rate": 2.6010990458030548e-06, + "loss": 1.3491, + "mean_token_accuracy": 0.6665903975566229, + "num_tokens": 2708191566.0, + "step": 16141 + }, + { + "entropy": 1.7284641365210216, + "epoch": 1.7732827991541018, + "grad_norm": 0.6471381187438965, + "learning_rate": 2.600523847825419e-06, + "loss": 1.5326, + "mean_token_accuracy": 0.6393000731865565, + "num_tokens": 2708369199.0, + "step": 16142 + }, + { + "entropy": 1.719407816727956, + "epoch": 1.7733926560654747, + "grad_norm": 0.7297753095626831, + "learning_rate": 2.5999489156874214e-06, + "loss": 1.292, + "mean_token_accuracy": 0.6675407042105993, + "num_tokens": 2708487034.0, + "step": 16143 + }, + { + "entropy": 1.705468972524007, + "epoch": 1.7735025129768478, + "grad_norm": 0.6920966506004333, + "learning_rate": 2.5993742494072544e-06, + "loss": 1.4197, + "mean_token_accuracy": 0.6390438576539358, + "num_tokens": 2708660319.0, + "step": 16144 + }, + { + "entropy": 1.7181805968284607, + "epoch": 1.7736123698882205, + "grad_norm": 0.6733984351158142, + "learning_rate": 2.5987998490031054e-06, + "loss": 1.4356, + "mean_token_accuracy": 0.6629656205574671, + "num_tokens": 2708828951.0, + "step": 16145 + }, + { + "entropy": 1.7383651733398438, + "epoch": 1.7737222267995936, + "grad_norm": 0.7001859545707703, + "learning_rate": 2.5982257144931573e-06, + "loss": 1.4767, + "mean_token_accuracy": 0.651274119814237, + "num_tokens": 2708963681.0, + "step": 16146 + }, + { + "entropy": 1.68324081103007, + "epoch": 1.7738320837109665, + "grad_norm": 0.772042453289032, + "learning_rate": 2.597651845895579e-06, + "loss": 1.4884, + "mean_token_accuracy": 0.6505793780088425, + "num_tokens": 2709118225.0, + "step": 16147 + }, + { + "entropy": 1.7421075602372487, + "epoch": 1.7739419406223393, + "grad_norm": 0.7277549505233765, + "learning_rate": 2.597078243228533e-06, + "loss": 1.3914, + "mean_token_accuracy": 0.6713375896215439, + "num_tokens": 2709278861.0, + "step": 16148 + }, + { + "entropy": 1.6302008628845215, + "epoch": 1.7740517975337124, + "grad_norm": 0.6995809674263, + "learning_rate": 2.5965049065101746e-06, + "loss": 1.247, + "mean_token_accuracy": 0.686556855837504, + "num_tokens": 2709442407.0, + "step": 16149 + }, + { + "entropy": 1.6407863795757294, + "epoch": 1.7741616544450851, + "grad_norm": 0.7002416849136353, + "learning_rate": 2.595931835758649e-06, + "loss": 1.3002, + "mean_token_accuracy": 0.6775663743416468, + "num_tokens": 2709603952.0, + "step": 16150 + }, + { + "entropy": 1.6509563426176708, + "epoch": 1.7742715113564582, + "grad_norm": 0.6484111547470093, + "learning_rate": 2.595359030992094e-06, + "loss": 1.2547, + "mean_token_accuracy": 0.6776574452718099, + "num_tokens": 2709787365.0, + "step": 16151 + }, + { + "entropy": 1.7624330123265584, + "epoch": 1.774381368267831, + "grad_norm": 0.753284215927124, + "learning_rate": 2.5947864922286386e-06, + "loss": 1.4298, + "mean_token_accuracy": 0.6394060303767523, + "num_tokens": 2709952090.0, + "step": 16152 + }, + { + "entropy": 1.689292460680008, + "epoch": 1.774491225179204, + "grad_norm": 0.7348275780677795, + "learning_rate": 2.5942142194864024e-06, + "loss": 1.3677, + "mean_token_accuracy": 0.6548637946446737, + "num_tokens": 2710159115.0, + "step": 16153 + }, + { + "entropy": 1.7220345834891002, + "epoch": 1.774601082090577, + "grad_norm": 0.6995728611946106, + "learning_rate": 2.5936422127834985e-06, + "loss": 1.2697, + "mean_token_accuracy": 0.6734829644362131, + "num_tokens": 2710276327.0, + "step": 16154 + }, + { + "entropy": 1.7671296894550323, + "epoch": 1.77471093900195, + "grad_norm": 0.7245908379554749, + "learning_rate": 2.593070472138031e-06, + "loss": 1.4621, + "mean_token_accuracy": 0.6349116514126459, + "num_tokens": 2710506591.0, + "step": 16155 + }, + { + "entropy": 1.709269384543101, + "epoch": 1.7748207959133229, + "grad_norm": 0.6322346329689026, + "learning_rate": 2.5924989975680963e-06, + "loss": 1.3325, + "mean_token_accuracy": 0.6667979657649994, + "num_tokens": 2710662997.0, + "step": 16156 + }, + { + "entropy": 1.723963479200999, + "epoch": 1.774930652824696, + "grad_norm": 0.7471461892127991, + "learning_rate": 2.5919277890917777e-06, + "loss": 1.2944, + "mean_token_accuracy": 0.6734066704909006, + "num_tokens": 2710795173.0, + "step": 16157 + }, + { + "entropy": 1.7161237994829814, + "epoch": 1.7750405097360686, + "grad_norm": 0.6630175709724426, + "learning_rate": 2.5913568467271564e-06, + "loss": 1.5, + "mean_token_accuracy": 0.633497933546702, + "num_tokens": 2710956314.0, + "step": 16158 + }, + { + "entropy": 1.7181127270062764, + "epoch": 1.7751503666474417, + "grad_norm": 0.6697092652320862, + "learning_rate": 2.590786170492304e-06, + "loss": 1.4132, + "mean_token_accuracy": 0.6475900014241537, + "num_tokens": 2711129596.0, + "step": 16159 + }, + { + "entropy": 1.6878857612609863, + "epoch": 1.7752602235588146, + "grad_norm": 0.6678206324577332, + "learning_rate": 2.590215760405277e-06, + "loss": 1.3574, + "mean_token_accuracy": 0.6573008944590887, + "num_tokens": 2711318223.0, + "step": 16160 + }, + { + "entropy": 1.7164893845717113, + "epoch": 1.7753700804701875, + "grad_norm": 0.6961454749107361, + "learning_rate": 2.589645616484133e-06, + "loss": 1.4638, + "mean_token_accuracy": 0.6609781930843989, + "num_tokens": 2711458038.0, + "step": 16161 + }, + { + "entropy": 1.6448584695657094, + "epoch": 1.7754799373815606, + "grad_norm": 0.6715121269226074, + "learning_rate": 2.589075738746914e-06, + "loss": 1.4383, + "mean_token_accuracy": 0.6469363421201706, + "num_tokens": 2711618842.0, + "step": 16162 + }, + { + "entropy": 1.6738781730333965, + "epoch": 1.7755897942929333, + "grad_norm": 0.7207627892494202, + "learning_rate": 2.5885061272116597e-06, + "loss": 1.3785, + "mean_token_accuracy": 0.6592583358287811, + "num_tokens": 2711743448.0, + "step": 16163 + }, + { + "entropy": 1.764163116614024, + "epoch": 1.7756996512043064, + "grad_norm": 0.6622251868247986, + "learning_rate": 2.5879367818963965e-06, + "loss": 1.354, + "mean_token_accuracy": 0.6568918774525324, + "num_tokens": 2711879685.0, + "step": 16164 + }, + { + "entropy": 1.6938027838865917, + "epoch": 1.7758095081156793, + "grad_norm": 0.6962149143218994, + "learning_rate": 2.5873677028191418e-06, + "loss": 1.2467, + "mean_token_accuracy": 0.6714517027139664, + "num_tokens": 2712009648.0, + "step": 16165 + }, + { + "entropy": 1.706722229719162, + "epoch": 1.7759193650270522, + "grad_norm": 0.5590953230857849, + "learning_rate": 2.5867988899979086e-06, + "loss": 1.4431, + "mean_token_accuracy": 0.6479671547810236, + "num_tokens": 2712190182.0, + "step": 16166 + }, + { + "entropy": 1.7148079474767048, + "epoch": 1.7760292219384253, + "grad_norm": 0.6328277587890625, + "learning_rate": 2.5862303434507e-06, + "loss": 1.4287, + "mean_token_accuracy": 0.6558680633703867, + "num_tokens": 2712438765.0, + "step": 16167 + }, + { + "entropy": 1.7054628531138103, + "epoch": 1.7761390788497982, + "grad_norm": 0.6650689244270325, + "learning_rate": 2.5856620631955102e-06, + "loss": 1.3792, + "mean_token_accuracy": 0.6593814243872961, + "num_tokens": 2712569597.0, + "step": 16168 + }, + { + "entropy": 1.7111007869243622, + "epoch": 1.776248935761171, + "grad_norm": 0.6413836479187012, + "learning_rate": 2.5850940492503236e-06, + "loss": 1.3747, + "mean_token_accuracy": 0.6556829412778219, + "num_tokens": 2712700190.0, + "step": 16169 + }, + { + "entropy": 1.7287939886252086, + "epoch": 1.7763587926725442, + "grad_norm": 0.6228131055831909, + "learning_rate": 2.584526301633119e-06, + "loss": 1.4028, + "mean_token_accuracy": 0.6681485623121262, + "num_tokens": 2712890812.0, + "step": 16170 + }, + { + "entropy": 1.6465917030970256, + "epoch": 1.7764686495839168, + "grad_norm": 0.7483593225479126, + "learning_rate": 2.583958820361866e-06, + "loss": 1.3934, + "mean_token_accuracy": 0.6619542588790258, + "num_tokens": 2713032571.0, + "step": 16171 + }, + { + "entropy": 1.6852433780829112, + "epoch": 1.77657850649529, + "grad_norm": 0.6569193601608276, + "learning_rate": 2.5833916054545217e-06, + "loss": 1.2729, + "mean_token_accuracy": 0.6696591476599375, + "num_tokens": 2713172975.0, + "step": 16172 + }, + { + "entropy": 1.7001554270585377, + "epoch": 1.7766883634066628, + "grad_norm": 0.7120020389556885, + "learning_rate": 2.582824656929042e-06, + "loss": 1.43, + "mean_token_accuracy": 0.6575095355510712, + "num_tokens": 2713345608.0, + "step": 16173 + }, + { + "entropy": 1.667518824338913, + "epoch": 1.7767982203180357, + "grad_norm": 0.7396501898765564, + "learning_rate": 2.5822579748033676e-06, + "loss": 1.3359, + "mean_token_accuracy": 0.6745062321424484, + "num_tokens": 2713516606.0, + "step": 16174 + }, + { + "entropy": 1.6942040920257568, + "epoch": 1.7769080772294088, + "grad_norm": 0.8534673452377319, + "learning_rate": 2.5816915590954367e-06, + "loss": 1.2135, + "mean_token_accuracy": 0.6788101047277451, + "num_tokens": 2713681222.0, + "step": 16175 + }, + { + "entropy": 1.7359768450260162, + "epoch": 1.7770179341407817, + "grad_norm": 0.6801086664199829, + "learning_rate": 2.581125409823175e-06, + "loss": 1.4861, + "mean_token_accuracy": 0.6353782365719477, + "num_tokens": 2713917752.0, + "step": 16176 + }, + { + "entropy": 1.6656650304794312, + "epoch": 1.7771277910521546, + "grad_norm": 0.6138239502906799, + "learning_rate": 2.580559527004499e-06, + "loss": 1.2827, + "mean_token_accuracy": 0.6767630279064178, + "num_tokens": 2714055978.0, + "step": 16177 + }, + { + "entropy": 1.7242048780123393, + "epoch": 1.7772376479635275, + "grad_norm": 0.7279729247093201, + "learning_rate": 2.579993910657319e-06, + "loss": 1.33, + "mean_token_accuracy": 0.6593465854724249, + "num_tokens": 2714173544.0, + "step": 16178 + }, + { + "entropy": 1.7006490429242451, + "epoch": 1.7773475048749003, + "grad_norm": 0.7351089119911194, + "learning_rate": 2.5794285607995407e-06, + "loss": 1.4855, + "mean_token_accuracy": 0.6623196552197138, + "num_tokens": 2714346691.0, + "step": 16179 + }, + { + "entropy": 1.7114817400773366, + "epoch": 1.7774573617862734, + "grad_norm": 0.7498958110809326, + "learning_rate": 2.5788634774490524e-06, + "loss": 1.588, + "mean_token_accuracy": 0.6416305353244146, + "num_tokens": 2714550240.0, + "step": 16180 + }, + { + "entropy": 1.7464225788911183, + "epoch": 1.7775672186976463, + "grad_norm": 0.7129120826721191, + "learning_rate": 2.57829866062374e-06, + "loss": 1.5597, + "mean_token_accuracy": 0.626517136891683, + "num_tokens": 2714761712.0, + "step": 16181 + }, + { + "entropy": 1.7081879675388336, + "epoch": 1.7776770756090192, + "grad_norm": 0.6990813612937927, + "learning_rate": 2.5777341103414807e-06, + "loss": 1.3879, + "mean_token_accuracy": 0.6567564556996027, + "num_tokens": 2714898735.0, + "step": 16182 + }, + { + "entropy": 1.7253175874551137, + "epoch": 1.7777869325203923, + "grad_norm": 0.7992512583732605, + "learning_rate": 2.577169826620142e-06, + "loss": 1.3492, + "mean_token_accuracy": 0.6692212472359339, + "num_tokens": 2715058698.0, + "step": 16183 + }, + { + "entropy": 1.7420212825139363, + "epoch": 1.777896789431765, + "grad_norm": 0.6389954090118408, + "learning_rate": 2.576605809477582e-06, + "loss": 1.4296, + "mean_token_accuracy": 0.6562297642230988, + "num_tokens": 2715237761.0, + "step": 16184 + }, + { + "entropy": 1.7322679460048676, + "epoch": 1.778006646343138, + "grad_norm": 0.6425938010215759, + "learning_rate": 2.576042058931653e-06, + "loss": 1.2624, + "mean_token_accuracy": 0.6835384468237559, + "num_tokens": 2715439975.0, + "step": 16185 + }, + { + "entropy": 1.7014261583487194, + "epoch": 1.778116503254511, + "grad_norm": 0.6009911894798279, + "learning_rate": 2.5754785750001966e-06, + "loss": 1.3455, + "mean_token_accuracy": 0.6725515276193619, + "num_tokens": 2715575981.0, + "step": 16186 + }, + { + "entropy": 1.7201250692208607, + "epoch": 1.7782263601658839, + "grad_norm": 0.7367297410964966, + "learning_rate": 2.574915357701048e-06, + "loss": 1.3928, + "mean_token_accuracy": 0.660605326294899, + "num_tokens": 2715726384.0, + "step": 16187 + }, + { + "entropy": 1.6956477065881093, + "epoch": 1.778336217077257, + "grad_norm": 0.6419438719749451, + "learning_rate": 2.574352407052031e-06, + "loss": 1.3171, + "mean_token_accuracy": 0.6647944003343582, + "num_tokens": 2715836741.0, + "step": 16188 + }, + { + "entropy": 1.6989044447739918, + "epoch": 1.7784460739886299, + "grad_norm": 0.9841082692146301, + "learning_rate": 2.5737897230709622e-06, + "loss": 1.4961, + "mean_token_accuracy": 0.6657343481977781, + "num_tokens": 2716004267.0, + "step": 16189 + }, + { + "entropy": 1.7684976359208424, + "epoch": 1.7785559309000027, + "grad_norm": 0.8549887537956238, + "learning_rate": 2.5732273057756552e-06, + "loss": 1.446, + "mean_token_accuracy": 0.6611962815125784, + "num_tokens": 2716181603.0, + "step": 16190 + }, + { + "entropy": 1.7020417054494221, + "epoch": 1.7786657878113756, + "grad_norm": 0.5607102513313293, + "learning_rate": 2.572665155183905e-06, + "loss": 1.4124, + "mean_token_accuracy": 0.6579537143309911, + "num_tokens": 2716361068.0, + "step": 16191 + }, + { + "entropy": 1.6742089788119, + "epoch": 1.7787756447227485, + "grad_norm": 0.7444910407066345, + "learning_rate": 2.5721032713135043e-06, + "loss": 1.3866, + "mean_token_accuracy": 0.6673271010319392, + "num_tokens": 2716526072.0, + "step": 16192 + }, + { + "entropy": 1.759224534034729, + "epoch": 1.7788855016341216, + "grad_norm": 0.6558489799499512, + "learning_rate": 2.5715416541822387e-06, + "loss": 1.384, + "mean_token_accuracy": 0.6560174822807312, + "num_tokens": 2716699065.0, + "step": 16193 + }, + { + "entropy": 1.7571994364261627, + "epoch": 1.7789953585454945, + "grad_norm": 0.6524195075035095, + "learning_rate": 2.570980303807881e-06, + "loss": 1.3668, + "mean_token_accuracy": 0.6584974030653635, + "num_tokens": 2716857465.0, + "step": 16194 + }, + { + "entropy": 1.8164484004179637, + "epoch": 1.7791052154568674, + "grad_norm": 0.7619872689247131, + "learning_rate": 2.570419220208199e-06, + "loss": 1.3642, + "mean_token_accuracy": 0.654331718881925, + "num_tokens": 2716995467.0, + "step": 16195 + }, + { + "entropy": 1.7526369988918304, + "epoch": 1.7792150723682405, + "grad_norm": 0.6816757321357727, + "learning_rate": 2.5698584034009504e-06, + "loss": 1.3161, + "mean_token_accuracy": 0.6580136120319366, + "num_tokens": 2717136339.0, + "step": 16196 + }, + { + "entropy": 1.7177092730998993, + "epoch": 1.7793249292796132, + "grad_norm": 0.6499624848365784, + "learning_rate": 2.5692978534038834e-06, + "loss": 1.3675, + "mean_token_accuracy": 0.6472986241181692, + "num_tokens": 2717281863.0, + "step": 16197 + }, + { + "entropy": 1.7083185315132141, + "epoch": 1.7794347861909863, + "grad_norm": 0.6570821404457092, + "learning_rate": 2.56873757023474e-06, + "loss": 1.3565, + "mean_token_accuracy": 0.6527627358833948, + "num_tokens": 2717444596.0, + "step": 16198 + }, + { + "entropy": 1.7401759326457977, + "epoch": 1.7795446431023592, + "grad_norm": 0.7995166182518005, + "learning_rate": 2.5681775539112554e-06, + "loss": 1.4527, + "mean_token_accuracy": 0.6470424781243006, + "num_tokens": 2717671539.0, + "step": 16199 + }, + { + "entropy": 1.721009184916814, + "epoch": 1.779654500013732, + "grad_norm": 0.7379947900772095, + "learning_rate": 2.5676178044511513e-06, + "loss": 1.5224, + "mean_token_accuracy": 0.6547667557994524, + "num_tokens": 2717840082.0, + "step": 16200 + }, + { + "entropy": 1.7124258081118267, + "epoch": 1.7797643569251052, + "grad_norm": 0.5799604058265686, + "learning_rate": 2.5670583218721422e-06, + "loss": 1.4312, + "mean_token_accuracy": 0.6489716867605845, + "num_tokens": 2718026773.0, + "step": 16201 + }, + { + "entropy": 1.690716157356898, + "epoch": 1.779874213836478, + "grad_norm": 0.6450859904289246, + "learning_rate": 2.566499106191939e-06, + "loss": 1.5138, + "mean_token_accuracy": 0.6407775630553564, + "num_tokens": 2718200554.0, + "step": 16202 + }, + { + "entropy": 1.6798825959364574, + "epoch": 1.779984070747851, + "grad_norm": 0.68744957447052, + "learning_rate": 2.5659401574282393e-06, + "loss": 1.4299, + "mean_token_accuracy": 0.6480642408132553, + "num_tokens": 2718385213.0, + "step": 16203 + }, + { + "entropy": 1.7331880331039429, + "epoch": 1.7800939276592238, + "grad_norm": 0.6031718850135803, + "learning_rate": 2.5653814755987314e-06, + "loss": 1.5247, + "mean_token_accuracy": 0.631999467809995, + "num_tokens": 2718620759.0, + "step": 16204 + }, + { + "entropy": 1.6975124776363373, + "epoch": 1.7802037845705967, + "grad_norm": 0.6780478954315186, + "learning_rate": 2.5648230607211e-06, + "loss": 1.2644, + "mean_token_accuracy": 0.6704892267783483, + "num_tokens": 2718766277.0, + "step": 16205 + }, + { + "entropy": 1.678945968548457, + "epoch": 1.7803136414819698, + "grad_norm": 0.6999272704124451, + "learning_rate": 2.564264912813017e-06, + "loss": 1.359, + "mean_token_accuracy": 0.6699830194314321, + "num_tokens": 2718934488.0, + "step": 16206 + }, + { + "entropy": 1.7268775800863903, + "epoch": 1.7804234983933427, + "grad_norm": 0.7014032602310181, + "learning_rate": 2.5637070318921488e-06, + "loss": 1.3642, + "mean_token_accuracy": 0.6547218362490336, + "num_tokens": 2719076843.0, + "step": 16207 + }, + { + "entropy": 1.718455046415329, + "epoch": 1.7805333553047156, + "grad_norm": 0.6352714896202087, + "learning_rate": 2.563149417976152e-06, + "loss": 1.4188, + "mean_token_accuracy": 0.6555627485116323, + "num_tokens": 2719232421.0, + "step": 16208 + }, + { + "entropy": 1.7530864675839741, + "epoch": 1.7806432122160887, + "grad_norm": 0.6508417725563049, + "learning_rate": 2.562592071082674e-06, + "loss": 1.5481, + "mean_token_accuracy": 0.6468537002801895, + "num_tokens": 2719404790.0, + "step": 16209 + }, + { + "entropy": 1.6875406205654144, + "epoch": 1.7807530691274613, + "grad_norm": 0.7828112244606018, + "learning_rate": 2.5620349912293543e-06, + "loss": 1.4161, + "mean_token_accuracy": 0.6764611254135767, + "num_tokens": 2719529889.0, + "step": 16210 + }, + { + "entropy": 1.6766289969285328, + "epoch": 1.7808629260388344, + "grad_norm": 0.8107297420501709, + "learning_rate": 2.5614781784338255e-06, + "loss": 1.3208, + "mean_token_accuracy": 0.6553743382294973, + "num_tokens": 2719700245.0, + "step": 16211 + }, + { + "entropy": 1.672994703054428, + "epoch": 1.7809727829502073, + "grad_norm": 0.7123458981513977, + "learning_rate": 2.560921632713711e-06, + "loss": 1.3213, + "mean_token_accuracy": 0.6707338194052378, + "num_tokens": 2719870654.0, + "step": 16212 + }, + { + "entropy": 1.6604451934496562, + "epoch": 1.7810826398615802, + "grad_norm": 0.7824660539627075, + "learning_rate": 2.5603653540866226e-06, + "loss": 1.5431, + "mean_token_accuracy": 0.6393003712097803, + "num_tokens": 2720059793.0, + "step": 16213 + }, + { + "entropy": 1.69284787774086, + "epoch": 1.7811924967729533, + "grad_norm": 0.6443684101104736, + "learning_rate": 2.559809342570168e-06, + "loss": 1.2632, + "mean_token_accuracy": 0.6757178753614426, + "num_tokens": 2720222052.0, + "step": 16214 + }, + { + "entropy": 1.727107326189677, + "epoch": 1.7813023536843262, + "grad_norm": 0.6985810995101929, + "learning_rate": 2.5592535981819455e-06, + "loss": 1.4631, + "mean_token_accuracy": 0.6416812141736349, + "num_tokens": 2720394912.0, + "step": 16215 + }, + { + "entropy": 1.7403022348880768, + "epoch": 1.781412210595699, + "grad_norm": 0.6760064959526062, + "learning_rate": 2.5586981209395414e-06, + "loss": 1.4851, + "mean_token_accuracy": 0.6429832726716995, + "num_tokens": 2720559240.0, + "step": 16216 + }, + { + "entropy": 1.732146809498469, + "epoch": 1.7815220675070722, + "grad_norm": 0.6863375902175903, + "learning_rate": 2.5581429108605394e-06, + "loss": 1.4375, + "mean_token_accuracy": 0.6447567095359167, + "num_tokens": 2720745800.0, + "step": 16217 + }, + { + "entropy": 1.6957339843114216, + "epoch": 1.7816319244184449, + "grad_norm": 0.6518421769142151, + "learning_rate": 2.557587967962509e-06, + "loss": 1.361, + "mean_token_accuracy": 0.6534071415662766, + "num_tokens": 2720895109.0, + "step": 16218 + }, + { + "entropy": 1.7050747672716777, + "epoch": 1.781741781329818, + "grad_norm": 0.6569792032241821, + "learning_rate": 2.5570332922630163e-06, + "loss": 1.2822, + "mean_token_accuracy": 0.6715095390876135, + "num_tokens": 2721024272.0, + "step": 16219 + }, + { + "entropy": 1.679296483596166, + "epoch": 1.7818516382411909, + "grad_norm": 0.7886082530021667, + "learning_rate": 2.5564788837796156e-06, + "loss": 1.4813, + "mean_token_accuracy": 0.6585791359345118, + "num_tokens": 2721227239.0, + "step": 16220 + }, + { + "entropy": 1.73756409684817, + "epoch": 1.7819614951525637, + "grad_norm": 0.7555798888206482, + "learning_rate": 2.5559247425298523e-06, + "loss": 1.3367, + "mean_token_accuracy": 0.6579089959462484, + "num_tokens": 2721379504.0, + "step": 16221 + }, + { + "entropy": 1.7351977328459423, + "epoch": 1.7820713520639369, + "grad_norm": 0.6866292953491211, + "learning_rate": 2.5553708685312658e-06, + "loss": 1.3398, + "mean_token_accuracy": 0.6643187751372656, + "num_tokens": 2721521980.0, + "step": 16222 + }, + { + "entropy": 1.670342117547989, + "epoch": 1.7821812089753095, + "grad_norm": 0.5937049984931946, + "learning_rate": 2.554817261801387e-06, + "loss": 1.3746, + "mean_token_accuracy": 0.6563832610845566, + "num_tokens": 2721676655.0, + "step": 16223 + }, + { + "entropy": 1.674795150756836, + "epoch": 1.7822910658866826, + "grad_norm": 0.6507994532585144, + "learning_rate": 2.554263922357737e-06, + "loss": 1.4518, + "mean_token_accuracy": 0.6518680403629938, + "num_tokens": 2721878324.0, + "step": 16224 + }, + { + "entropy": 1.7025028467178345, + "epoch": 1.7824009227980555, + "grad_norm": 0.7818902134895325, + "learning_rate": 2.553710850217826e-06, + "loss": 1.5391, + "mean_token_accuracy": 0.636594370007515, + "num_tokens": 2722086360.0, + "step": 16225 + }, + { + "entropy": 1.7275918225447338, + "epoch": 1.7825107797094284, + "grad_norm": 0.6679732203483582, + "learning_rate": 2.5531580453991627e-06, + "loss": 1.3366, + "mean_token_accuracy": 0.6550202568372091, + "num_tokens": 2722217001.0, + "step": 16226 + }, + { + "entropy": 1.694819023211797, + "epoch": 1.7826206366208015, + "grad_norm": 0.7464036345481873, + "learning_rate": 2.5526055079192413e-06, + "loss": 1.5109, + "mean_token_accuracy": 0.6589773992697397, + "num_tokens": 2722371256.0, + "step": 16227 + }, + { + "entropy": 1.681807627280553, + "epoch": 1.7827304935321744, + "grad_norm": 0.6635357737541199, + "learning_rate": 2.5520532377955467e-06, + "loss": 1.3098, + "mean_token_accuracy": 0.6588182846705118, + "num_tokens": 2722509673.0, + "step": 16228 + }, + { + "entropy": 1.680689126253128, + "epoch": 1.7828403504435473, + "grad_norm": 0.6713885068893433, + "learning_rate": 2.551501235045562e-06, + "loss": 1.3095, + "mean_token_accuracy": 0.6868884414434433, + "num_tokens": 2722686390.0, + "step": 16229 + }, + { + "entropy": 1.712468832731247, + "epoch": 1.7829502073549204, + "grad_norm": 0.7904059886932373, + "learning_rate": 2.5509494996867558e-06, + "loss": 1.5056, + "mean_token_accuracy": 0.6609023263057073, + "num_tokens": 2722817375.0, + "step": 16230 + }, + { + "entropy": 1.7211446662743886, + "epoch": 1.783060064266293, + "grad_norm": 0.6894172430038452, + "learning_rate": 2.5503980317365908e-06, + "loss": 1.3432, + "mean_token_accuracy": 0.6689777423938116, + "num_tokens": 2722981904.0, + "step": 16231 + }, + { + "entropy": 1.6683667202790577, + "epoch": 1.7831699211776662, + "grad_norm": 0.8465138673782349, + "learning_rate": 2.549846831212521e-06, + "loss": 1.3657, + "mean_token_accuracy": 0.6582571069399515, + "num_tokens": 2723174066.0, + "step": 16232 + }, + { + "entropy": 1.726008802652359, + "epoch": 1.783279778089039, + "grad_norm": 0.7869644165039062, + "learning_rate": 2.5492958981319902e-06, + "loss": 1.2813, + "mean_token_accuracy": 0.6665952205657959, + "num_tokens": 2723281291.0, + "step": 16233 + }, + { + "entropy": 1.7815176844596863, + "epoch": 1.783389635000412, + "grad_norm": 0.656838059425354, + "learning_rate": 2.5487452325124363e-06, + "loss": 1.4156, + "mean_token_accuracy": 0.6533069312572479, + "num_tokens": 2723471325.0, + "step": 16234 + }, + { + "entropy": 1.7248845597108204, + "epoch": 1.783499491911785, + "grad_norm": 0.9060506820678711, + "learning_rate": 2.5481948343712885e-06, + "loss": 1.4979, + "mean_token_accuracy": 0.6565845509370168, + "num_tokens": 2723632795.0, + "step": 16235 + }, + { + "entropy": 1.730120857556661, + "epoch": 1.7836093488231577, + "grad_norm": 0.7274516820907593, + "learning_rate": 2.5476447037259666e-06, + "loss": 1.3954, + "mean_token_accuracy": 0.6534441063801447, + "num_tokens": 2723764226.0, + "step": 16236 + }, + { + "entropy": 1.6996258199214935, + "epoch": 1.7837192057345308, + "grad_norm": 0.7300492525100708, + "learning_rate": 2.547094840593879e-06, + "loss": 1.3445, + "mean_token_accuracy": 0.6685766031344732, + "num_tokens": 2723901277.0, + "step": 16237 + }, + { + "entropy": 1.7311415870984395, + "epoch": 1.7838290626459037, + "grad_norm": 0.8246431946754456, + "learning_rate": 2.546545244992432e-06, + "loss": 1.2342, + "mean_token_accuracy": 0.6768847008546194, + "num_tokens": 2724036004.0, + "step": 16238 + }, + { + "entropy": 1.6984553039073944, + "epoch": 1.7839389195572766, + "grad_norm": 0.6190625429153442, + "learning_rate": 2.5459959169390185e-06, + "loss": 1.5376, + "mean_token_accuracy": 0.6259044905503591, + "num_tokens": 2724340416.0, + "step": 16239 + }, + { + "entropy": 1.7265506088733673, + "epoch": 1.7840487764686497, + "grad_norm": 0.6775219440460205, + "learning_rate": 2.5454468564510242e-06, + "loss": 1.4671, + "mean_token_accuracy": 0.6378841251134872, + "num_tokens": 2724522692.0, + "step": 16240 + }, + { + "entropy": 1.7654529015223186, + "epoch": 1.7841586333800226, + "grad_norm": 0.655631959438324, + "learning_rate": 2.5448980635458287e-06, + "loss": 1.416, + "mean_token_accuracy": 0.6465630332628886, + "num_tokens": 2724738322.0, + "step": 16241 + }, + { + "entropy": 1.6371107796827953, + "epoch": 1.7842684902913954, + "grad_norm": 0.6871931552886963, + "learning_rate": 2.5443495382407973e-06, + "loss": 1.4574, + "mean_token_accuracy": 0.647613137960434, + "num_tokens": 2724894550.0, + "step": 16242 + }, + { + "entropy": 1.6568027933438618, + "epoch": 1.7843783472027686, + "grad_norm": 0.6367573142051697, + "learning_rate": 2.543801280553295e-06, + "loss": 1.4055, + "mean_token_accuracy": 0.653554563721021, + "num_tokens": 2725067815.0, + "step": 16243 + }, + { + "entropy": 1.753188967704773, + "epoch": 1.7844882041141412, + "grad_norm": 0.705480694770813, + "learning_rate": 2.5432532905006715e-06, + "loss": 1.5104, + "mean_token_accuracy": 0.631978377699852, + "num_tokens": 2725269398.0, + "step": 16244 + }, + { + "entropy": 1.6898446877797444, + "epoch": 1.7845980610255143, + "grad_norm": 0.7416958212852478, + "learning_rate": 2.542705568100268e-06, + "loss": 1.3553, + "mean_token_accuracy": 0.6741011242071787, + "num_tokens": 2725414400.0, + "step": 16245 + }, + { + "entropy": 1.7214942475159962, + "epoch": 1.7847079179368872, + "grad_norm": 0.7140223979949951, + "learning_rate": 2.542158113369424e-06, + "loss": 1.3623, + "mean_token_accuracy": 0.6528001030286154, + "num_tokens": 2725550421.0, + "step": 16246 + }, + { + "entropy": 1.6600320835908253, + "epoch": 1.78481777484826, + "grad_norm": 0.8466951251029968, + "learning_rate": 2.5416109263254656e-06, + "loss": 1.3405, + "mean_token_accuracy": 0.658960203329722, + "num_tokens": 2725749641.0, + "step": 16247 + }, + { + "entropy": 1.7030630608399708, + "epoch": 1.7849276317596332, + "grad_norm": 0.8019019365310669, + "learning_rate": 2.541064006985709e-06, + "loss": 1.5304, + "mean_token_accuracy": 0.6433060467243195, + "num_tokens": 2725922079.0, + "step": 16248 + }, + { + "entropy": 1.7040532032648723, + "epoch": 1.7850374886710059, + "grad_norm": 0.7823516726493835, + "learning_rate": 2.5405173553674662e-06, + "loss": 1.2843, + "mean_token_accuracy": 0.6735737522443136, + "num_tokens": 2726058883.0, + "step": 16249 + }, + { + "entropy": 1.7228721876939137, + "epoch": 1.785147345582379, + "grad_norm": 0.7137507200241089, + "learning_rate": 2.539970971488034e-06, + "loss": 1.3681, + "mean_token_accuracy": 0.6637583325306574, + "num_tokens": 2726214542.0, + "step": 16250 + }, + { + "entropy": 1.6770942211151123, + "epoch": 1.7852572024937519, + "grad_norm": 0.6972078680992126, + "learning_rate": 2.539424855364711e-06, + "loss": 1.326, + "mean_token_accuracy": 0.6649446338415146, + "num_tokens": 2726375099.0, + "step": 16251 + }, + { + "entropy": 1.7118816177050273, + "epoch": 1.7853670594051247, + "grad_norm": 0.7081136107444763, + "learning_rate": 2.5388790070147796e-06, + "loss": 1.3891, + "mean_token_accuracy": 0.6423606922229131, + "num_tokens": 2726521694.0, + "step": 16252 + }, + { + "entropy": 1.6510530412197113, + "epoch": 1.7854769163164979, + "grad_norm": 0.7234501242637634, + "learning_rate": 2.538333426455512e-06, + "loss": 1.4314, + "mean_token_accuracy": 0.6571709563334783, + "num_tokens": 2726696075.0, + "step": 16253 + }, + { + "entropy": 1.7041932344436646, + "epoch": 1.7855867732278707, + "grad_norm": 0.7055428624153137, + "learning_rate": 2.53778811370418e-06, + "loss": 1.454, + "mean_token_accuracy": 0.6542116304238638, + "num_tokens": 2726861735.0, + "step": 16254 + }, + { + "entropy": 1.6798604428768158, + "epoch": 1.7856966301392436, + "grad_norm": 0.6403173208236694, + "learning_rate": 2.5372430687780413e-06, + "loss": 1.4092, + "mean_token_accuracy": 0.6626434773206711, + "num_tokens": 2727050120.0, + "step": 16255 + }, + { + "entropy": 1.658277968565623, + "epoch": 1.7858064870506167, + "grad_norm": 0.7245867848396301, + "learning_rate": 2.536698291694346e-06, + "loss": 1.456, + "mean_token_accuracy": 0.6435498197873434, + "num_tokens": 2727250402.0, + "step": 16256 + }, + { + "entropy": 1.7269805371761322, + "epoch": 1.7859163439619894, + "grad_norm": 0.7680160999298096, + "learning_rate": 2.536153782470335e-06, + "loss": 1.5174, + "mean_token_accuracy": 0.6507440209388733, + "num_tokens": 2727435782.0, + "step": 16257 + }, + { + "entropy": 1.6973803043365479, + "epoch": 1.7860262008733625, + "grad_norm": 0.6898791790008545, + "learning_rate": 2.5356095411232455e-06, + "loss": 1.3865, + "mean_token_accuracy": 0.6631582975387573, + "num_tokens": 2727603708.0, + "step": 16258 + }, + { + "entropy": 1.764186054468155, + "epoch": 1.7861360577847354, + "grad_norm": 0.8053025007247925, + "learning_rate": 2.5350655676702985e-06, + "loss": 1.4573, + "mean_token_accuracy": 0.6421335885922114, + "num_tokens": 2727784144.0, + "step": 16259 + }, + { + "entropy": 1.7275482614835103, + "epoch": 1.7862459146961083, + "grad_norm": 0.6696358323097229, + "learning_rate": 2.534521862128711e-06, + "loss": 1.2937, + "mean_token_accuracy": 0.6718499114116033, + "num_tokens": 2727904805.0, + "step": 16260 + }, + { + "entropy": 1.7152071297168732, + "epoch": 1.7863557716074814, + "grad_norm": 0.8419023156166077, + "learning_rate": 2.5339784245156934e-06, + "loss": 1.3275, + "mean_token_accuracy": 0.664416715502739, + "num_tokens": 2728057786.0, + "step": 16261 + }, + { + "entropy": 1.7445741693178813, + "epoch": 1.786465628518854, + "grad_norm": 0.6461774110794067, + "learning_rate": 2.533435254848442e-06, + "loss": 1.3029, + "mean_token_accuracy": 0.6617392847935358, + "num_tokens": 2728206231.0, + "step": 16262 + }, + { + "entropy": 1.7072357336680095, + "epoch": 1.7865754854302272, + "grad_norm": 0.7268346548080444, + "learning_rate": 2.5328923531441506e-06, + "loss": 1.4484, + "mean_token_accuracy": 0.6492378860712051, + "num_tokens": 2728381345.0, + "step": 16263 + }, + { + "entropy": 1.6935893793900807, + "epoch": 1.7866853423416, + "grad_norm": 0.6541410088539124, + "learning_rate": 2.5323497194200025e-06, + "loss": 1.3363, + "mean_token_accuracy": 0.6590806543827057, + "num_tokens": 2728559317.0, + "step": 16264 + }, + { + "entropy": 1.727291206518809, + "epoch": 1.786795199252973, + "grad_norm": 0.7337368726730347, + "learning_rate": 2.5318073536931677e-06, + "loss": 1.5537, + "mean_token_accuracy": 0.6399403661489487, + "num_tokens": 2728774789.0, + "step": 16265 + }, + { + "entropy": 1.6569550434748332, + "epoch": 1.786905056164346, + "grad_norm": 0.8272339701652527, + "learning_rate": 2.5312652559808143e-06, + "loss": 1.4112, + "mean_token_accuracy": 0.646061177055041, + "num_tokens": 2728980119.0, + "step": 16266 + }, + { + "entropy": 1.7381823460261028, + "epoch": 1.787014913075719, + "grad_norm": 0.7224423885345459, + "learning_rate": 2.5307234263001006e-06, + "loss": 1.2268, + "mean_token_accuracy": 0.6785031110048294, + "num_tokens": 2729121787.0, + "step": 16267 + }, + { + "entropy": 1.6357511182626088, + "epoch": 1.7871247699870918, + "grad_norm": 0.6711469888687134, + "learning_rate": 2.530181864668174e-06, + "loss": 1.4506, + "mean_token_accuracy": 0.6360716919104258, + "num_tokens": 2729314285.0, + "step": 16268 + }, + { + "entropy": 1.66348002354304, + "epoch": 1.787234626898465, + "grad_norm": 0.5813800692558289, + "learning_rate": 2.5296405711021744e-06, + "loss": 1.4608, + "mean_token_accuracy": 0.6357814073562622, + "num_tokens": 2729556544.0, + "step": 16269 + }, + { + "entropy": 1.6779274741808574, + "epoch": 1.7873444838098376, + "grad_norm": 0.7128428816795349, + "learning_rate": 2.529099545619234e-06, + "loss": 1.4014, + "mean_token_accuracy": 0.6523097256819407, + "num_tokens": 2729746524.0, + "step": 16270 + }, + { + "entropy": 1.6374227901299794, + "epoch": 1.7874543407212107, + "grad_norm": 0.6598563194274902, + "learning_rate": 2.5285587882364766e-06, + "loss": 1.3394, + "mean_token_accuracy": 0.662226935227712, + "num_tokens": 2729917624.0, + "step": 16271 + }, + { + "entropy": 1.6421111126740773, + "epoch": 1.7875641976325836, + "grad_norm": 0.6324965953826904, + "learning_rate": 2.5280182989710143e-06, + "loss": 1.4136, + "mean_token_accuracy": 0.656077653169632, + "num_tokens": 2730146475.0, + "step": 16272 + }, + { + "entropy": 1.6922315955162048, + "epoch": 1.7876740545439564, + "grad_norm": 0.5751784443855286, + "learning_rate": 2.5274780778399576e-06, + "loss": 1.4533, + "mean_token_accuracy": 0.6490372568368912, + "num_tokens": 2730347616.0, + "step": 16273 + }, + { + "entropy": 1.699767659107844, + "epoch": 1.7877839114553296, + "grad_norm": 0.6968252062797546, + "learning_rate": 2.526938124860401e-06, + "loss": 1.4884, + "mean_token_accuracy": 0.6427743136882782, + "num_tokens": 2730587146.0, + "step": 16274 + }, + { + "entropy": 1.72932164867719, + "epoch": 1.7878937683667022, + "grad_norm": 0.700259804725647, + "learning_rate": 2.5263984400494353e-06, + "loss": 1.1976, + "mean_token_accuracy": 0.6866904695828756, + "num_tokens": 2730734334.0, + "step": 16275 + }, + { + "entropy": 1.7005844314893086, + "epoch": 1.7880036252780753, + "grad_norm": 0.6235203742980957, + "learning_rate": 2.52585902342414e-06, + "loss": 1.3332, + "mean_token_accuracy": 0.6622271637121836, + "num_tokens": 2730889099.0, + "step": 16276 + }, + { + "entropy": 1.7240139146645863, + "epoch": 1.7881134821894482, + "grad_norm": 0.7013264894485474, + "learning_rate": 2.525319875001587e-06, + "loss": 1.3928, + "mean_token_accuracy": 0.6499841312567393, + "num_tokens": 2731097821.0, + "step": 16277 + }, + { + "entropy": 1.730222374200821, + "epoch": 1.788223339100821, + "grad_norm": 0.6136840581893921, + "learning_rate": 2.5247809947988413e-06, + "loss": 1.4699, + "mean_token_accuracy": 0.6457250515619913, + "num_tokens": 2731308470.0, + "step": 16278 + }, + { + "entropy": 1.7151026626427968, + "epoch": 1.7883331960121942, + "grad_norm": 0.6567912101745605, + "learning_rate": 2.524242382832959e-06, + "loss": 1.3497, + "mean_token_accuracy": 0.6674692332744598, + "num_tokens": 2731473920.0, + "step": 16279 + }, + { + "entropy": 1.7287200788656871, + "epoch": 1.788443052923567, + "grad_norm": 0.8798257112503052, + "learning_rate": 2.5237040391209877e-06, + "loss": 1.4002, + "mean_token_accuracy": 0.6503605445226034, + "num_tokens": 2731618031.0, + "step": 16280 + }, + { + "entropy": 1.7416976193586986, + "epoch": 1.78855290983494, + "grad_norm": 0.7769317030906677, + "learning_rate": 2.523165963679961e-06, + "loss": 1.6973, + "mean_token_accuracy": 0.6211136281490326, + "num_tokens": 2731779387.0, + "step": 16281 + }, + { + "entropy": 1.7118895947933197, + "epoch": 1.788662766746313, + "grad_norm": 0.6092858910560608, + "learning_rate": 2.522628156526914e-06, + "loss": 1.4178, + "mean_token_accuracy": 0.6442497919003168, + "num_tokens": 2731984238.0, + "step": 16282 + }, + { + "entropy": 1.712389588356018, + "epoch": 1.7887726236576857, + "grad_norm": 0.8901104927062988, + "learning_rate": 2.5220906176788657e-06, + "loss": 1.3029, + "mean_token_accuracy": 0.6697202920913696, + "num_tokens": 2732213159.0, + "step": 16283 + }, + { + "entropy": 1.6833104491233826, + "epoch": 1.7888824805690589, + "grad_norm": 0.6661498546600342, + "learning_rate": 2.5215533471528276e-06, + "loss": 1.2588, + "mean_token_accuracy": 0.6872056176265081, + "num_tokens": 2732337606.0, + "step": 16284 + }, + { + "entropy": 1.722004105647405, + "epoch": 1.7889923374804317, + "grad_norm": 0.6921575665473938, + "learning_rate": 2.521016344965807e-06, + "loss": 1.5717, + "mean_token_accuracy": 0.6401575257380804, + "num_tokens": 2732528822.0, + "step": 16285 + }, + { + "entropy": 1.6872912446657817, + "epoch": 1.7891021943918046, + "grad_norm": 0.6085355281829834, + "learning_rate": 2.520479611134797e-06, + "loss": 1.3896, + "mean_token_accuracy": 0.6487388958533605, + "num_tokens": 2732708924.0, + "step": 16286 + }, + { + "entropy": 1.7655263344446819, + "epoch": 1.7892120513031777, + "grad_norm": 0.7121375799179077, + "learning_rate": 2.5199431456767877e-06, + "loss": 1.3726, + "mean_token_accuracy": 0.6477866073449453, + "num_tokens": 2732847700.0, + "step": 16287 + }, + { + "entropy": 1.7201635142167409, + "epoch": 1.7893219082145504, + "grad_norm": 0.6995194554328918, + "learning_rate": 2.5194069486087564e-06, + "loss": 1.3661, + "mean_token_accuracy": 0.6587748775879542, + "num_tokens": 2732996658.0, + "step": 16288 + }, + { + "entropy": 1.7433823545773823, + "epoch": 1.7894317651259235, + "grad_norm": 0.6454122066497803, + "learning_rate": 2.5188710199476725e-06, + "loss": 1.497, + "mean_token_accuracy": 0.6462214092413584, + "num_tokens": 2733190258.0, + "step": 16289 + }, + { + "entropy": 1.7409153878688812, + "epoch": 1.7895416220372964, + "grad_norm": 0.7849988341331482, + "learning_rate": 2.5183353597104994e-06, + "loss": 1.433, + "mean_token_accuracy": 0.6468554139137268, + "num_tokens": 2733359931.0, + "step": 16290 + }, + { + "entropy": 1.6835778454939525, + "epoch": 1.7896514789486693, + "grad_norm": 0.6935192346572876, + "learning_rate": 2.517799967914191e-06, + "loss": 1.5234, + "mean_token_accuracy": 0.6396876275539398, + "num_tokens": 2733583935.0, + "step": 16291 + }, + { + "entropy": 1.6984424690405528, + "epoch": 1.7897613358600424, + "grad_norm": 0.6901172995567322, + "learning_rate": 2.5172648445756927e-06, + "loss": 1.2957, + "mean_token_accuracy": 0.6744356652100881, + "num_tokens": 2733713104.0, + "step": 16292 + }, + { + "entropy": 1.6622243821620941, + "epoch": 1.7898711927714153, + "grad_norm": 0.6113606095314026, + "learning_rate": 2.516729989711937e-06, + "loss": 1.2825, + "mean_token_accuracy": 0.6713751504818598, + "num_tokens": 2733886622.0, + "step": 16293 + }, + { + "entropy": 1.68569149573644, + "epoch": 1.7899810496827882, + "grad_norm": 0.6700574159622192, + "learning_rate": 2.516195403339856e-06, + "loss": 1.4043, + "mean_token_accuracy": 0.6612590849399567, + "num_tokens": 2734036231.0, + "step": 16294 + }, + { + "entropy": 1.7632472316424053, + "epoch": 1.7900909065941613, + "grad_norm": 0.8103067278862, + "learning_rate": 2.515661085476368e-06, + "loss": 1.4073, + "mean_token_accuracy": 0.6506477644046148, + "num_tokens": 2734170169.0, + "step": 16295 + }, + { + "entropy": 1.6857722500960033, + "epoch": 1.790200763505534, + "grad_norm": 0.6791203618049622, + "learning_rate": 2.5151270361383816e-06, + "loss": 1.4516, + "mean_token_accuracy": 0.6414875040451685, + "num_tokens": 2734366953.0, + "step": 16296 + }, + { + "entropy": 1.6843474805355072, + "epoch": 1.790310620416907, + "grad_norm": 0.733788013458252, + "learning_rate": 2.5145932553428038e-06, + "loss": 1.4301, + "mean_token_accuracy": 0.653965026140213, + "num_tokens": 2734526424.0, + "step": 16297 + }, + { + "entropy": 1.6595512131849925, + "epoch": 1.79042047732828, + "grad_norm": 0.7882452607154846, + "learning_rate": 2.5140597431065233e-06, + "loss": 1.4809, + "mean_token_accuracy": 0.651551162203153, + "num_tokens": 2734683617.0, + "step": 16298 + }, + { + "entropy": 1.6853315234184265, + "epoch": 1.7905303342396528, + "grad_norm": 0.6644918322563171, + "learning_rate": 2.5135264994464294e-06, + "loss": 1.3111, + "mean_token_accuracy": 0.6654441605011622, + "num_tokens": 2734825857.0, + "step": 16299 + }, + { + "entropy": 1.6577715078989665, + "epoch": 1.790640191151026, + "grad_norm": 0.5730201601982117, + "learning_rate": 2.512993524379398e-06, + "loss": 1.3078, + "mean_token_accuracy": 0.6807037740945816, + "num_tokens": 2734961414.0, + "step": 16300 + }, + { + "entropy": 1.7456890443960826, + "epoch": 1.7907500480623986, + "grad_norm": 0.8542404770851135, + "learning_rate": 2.5124608179222958e-06, + "loss": 1.44, + "mean_token_accuracy": 0.6580932984749476, + "num_tokens": 2735127869.0, + "step": 16301 + }, + { + "entropy": 1.6978826026121776, + "epoch": 1.7908599049737717, + "grad_norm": 0.6640202403068542, + "learning_rate": 2.5119283800919853e-06, + "loss": 1.4269, + "mean_token_accuracy": 0.655546839038531, + "num_tokens": 2735315439.0, + "step": 16302 + }, + { + "entropy": 1.7440852721532185, + "epoch": 1.7909697618851446, + "grad_norm": 0.7849541902542114, + "learning_rate": 2.5113962109053162e-06, + "loss": 1.2534, + "mean_token_accuracy": 0.6781354149182638, + "num_tokens": 2735430005.0, + "step": 16303 + }, + { + "entropy": 1.7063166797161102, + "epoch": 1.7910796187965174, + "grad_norm": 1.1036492586135864, + "learning_rate": 2.5108643103791335e-06, + "loss": 1.3421, + "mean_token_accuracy": 0.6793977270523707, + "num_tokens": 2735552067.0, + "step": 16304 + }, + { + "entropy": 1.6659158567587535, + "epoch": 1.7911894757078906, + "grad_norm": 0.6122180819511414, + "learning_rate": 2.5103326785302677e-06, + "loss": 1.3725, + "mean_token_accuracy": 0.6636357257763544, + "num_tokens": 2735731985.0, + "step": 16305 + }, + { + "entropy": 1.6837575336297352, + "epoch": 1.7912993326192634, + "grad_norm": 0.5877187848091125, + "learning_rate": 2.5098013153755485e-06, + "loss": 1.486, + "mean_token_accuracy": 0.6384171495834986, + "num_tokens": 2735901634.0, + "step": 16306 + }, + { + "entropy": 1.6770992676417034, + "epoch": 1.7914091895306363, + "grad_norm": 0.7175406217575073, + "learning_rate": 2.509270220931792e-06, + "loss": 1.2945, + "mean_token_accuracy": 0.6702460249265035, + "num_tokens": 2736073169.0, + "step": 16307 + }, + { + "entropy": 1.7309307356675465, + "epoch": 1.7915190464420094, + "grad_norm": 0.7163530588150024, + "learning_rate": 2.5087393952158063e-06, + "loss": 1.3491, + "mean_token_accuracy": 0.6687060197194418, + "num_tokens": 2736199549.0, + "step": 16308 + }, + { + "entropy": 1.7686155239741008, + "epoch": 1.791628903353382, + "grad_norm": 0.7394537925720215, + "learning_rate": 2.5082088382443936e-06, + "loss": 1.4027, + "mean_token_accuracy": 0.6433225274085999, + "num_tokens": 2736375818.0, + "step": 16309 + }, + { + "entropy": 1.6461839079856873, + "epoch": 1.7917387602647552, + "grad_norm": 0.6997315287590027, + "learning_rate": 2.5076785500343426e-06, + "loss": 1.3915, + "mean_token_accuracy": 0.6605212738116583, + "num_tokens": 2736580721.0, + "step": 16310 + }, + { + "entropy": 1.7693572044372559, + "epoch": 1.791848617176128, + "grad_norm": 0.7545243501663208, + "learning_rate": 2.5071485306024405e-06, + "loss": 1.5297, + "mean_token_accuracy": 0.6341488460699717, + "num_tokens": 2736747341.0, + "step": 16311 + }, + { + "entropy": 1.6597660581270854, + "epoch": 1.791958474087501, + "grad_norm": 0.6617560386657715, + "learning_rate": 2.5066187799654608e-06, + "loss": 1.2636, + "mean_token_accuracy": 0.6663326869408289, + "num_tokens": 2736877606.0, + "step": 16312 + }, + { + "entropy": 1.5883537034193675, + "epoch": 1.792068330998874, + "grad_norm": 0.6924055218696594, + "learning_rate": 2.506089298140168e-06, + "loss": 1.4102, + "mean_token_accuracy": 0.6675305167833964, + "num_tokens": 2737039683.0, + "step": 16313 + }, + { + "entropy": 1.7212806940078735, + "epoch": 1.7921781879102467, + "grad_norm": 0.7379174828529358, + "learning_rate": 2.5055600851433228e-06, + "loss": 1.4748, + "mean_token_accuracy": 0.6568605154752731, + "num_tokens": 2737179397.0, + "step": 16314 + }, + { + "entropy": 1.7149664461612701, + "epoch": 1.7922880448216199, + "grad_norm": 0.6748194694519043, + "learning_rate": 2.5050311409916715e-06, + "loss": 1.3384, + "mean_token_accuracy": 0.6551110148429871, + "num_tokens": 2737319398.0, + "step": 16315 + }, + { + "entropy": 1.7069288392861683, + "epoch": 1.7923979017329927, + "grad_norm": 0.6862152814865112, + "learning_rate": 2.5045024657019585e-06, + "loss": 1.3491, + "mean_token_accuracy": 0.6714658091465632, + "num_tokens": 2737454204.0, + "step": 16316 + }, + { + "entropy": 1.7472139199574788, + "epoch": 1.7925077586443656, + "grad_norm": 0.7499648928642273, + "learning_rate": 2.503974059290914e-06, + "loss": 1.5385, + "mean_token_accuracy": 0.6405575921138128, + "num_tokens": 2737595817.0, + "step": 16317 + }, + { + "entropy": 1.6611140767733257, + "epoch": 1.7926176155557387, + "grad_norm": 0.9254728555679321, + "learning_rate": 2.503445921775261e-06, + "loss": 1.4631, + "mean_token_accuracy": 0.6541319787502289, + "num_tokens": 2737734944.0, + "step": 16318 + }, + { + "entropy": 1.7104494671026866, + "epoch": 1.7927274724671116, + "grad_norm": 0.6272209882736206, + "learning_rate": 2.5029180531717172e-06, + "loss": 1.36, + "mean_token_accuracy": 0.6594759970903397, + "num_tokens": 2737888914.0, + "step": 16319 + }, + { + "entropy": 1.6646449665228527, + "epoch": 1.7928373293784845, + "grad_norm": 0.7363027930259705, + "learning_rate": 2.5023904534969885e-06, + "loss": 1.4083, + "mean_token_accuracy": 0.6451329837242762, + "num_tokens": 2738084677.0, + "step": 16320 + }, + { + "entropy": 1.693971465031306, + "epoch": 1.7929471862898576, + "grad_norm": 0.8376074433326721, + "learning_rate": 2.50186312276777e-06, + "loss": 1.4147, + "mean_token_accuracy": 0.6637802918752035, + "num_tokens": 2738224943.0, + "step": 16321 + }, + { + "entropy": 1.6223762234052022, + "epoch": 1.7930570432012303, + "grad_norm": 0.7268716096878052, + "learning_rate": 2.5013360610007555e-06, + "loss": 1.3308, + "mean_token_accuracy": 0.6813297122716904, + "num_tokens": 2738430126.0, + "step": 16322 + }, + { + "entropy": 1.7354827622572582, + "epoch": 1.7931669001126034, + "grad_norm": 0.7839182615280151, + "learning_rate": 2.500809268212626e-06, + "loss": 1.3441, + "mean_token_accuracy": 0.6678336064020792, + "num_tokens": 2738574057.0, + "step": 16323 + }, + { + "entropy": 1.7117115159829457, + "epoch": 1.7932767570239763, + "grad_norm": 0.7276841998100281, + "learning_rate": 2.5002827444200543e-06, + "loss": 1.4605, + "mean_token_accuracy": 0.6557506322860718, + "num_tokens": 2738725295.0, + "step": 16324 + }, + { + "entropy": 1.6508471469084423, + "epoch": 1.7933866139353491, + "grad_norm": 0.7069340944290161, + "learning_rate": 2.4997564896397015e-06, + "loss": 1.3199, + "mean_token_accuracy": 0.6781076391537985, + "num_tokens": 2738874111.0, + "step": 16325 + }, + { + "entropy": 1.6659562587738037, + "epoch": 1.7934964708467223, + "grad_norm": 0.7840026617050171, + "learning_rate": 2.4992305038882266e-06, + "loss": 1.3742, + "mean_token_accuracy": 0.6697394450505575, + "num_tokens": 2739010468.0, + "step": 16326 + }, + { + "entropy": 1.7727423111597698, + "epoch": 1.793606327758095, + "grad_norm": 1.009763240814209, + "learning_rate": 2.4987047871822756e-06, + "loss": 1.4287, + "mean_token_accuracy": 0.6506382723649343, + "num_tokens": 2739200522.0, + "step": 16327 + }, + { + "entropy": 1.6970455447832744, + "epoch": 1.793716184669468, + "grad_norm": 0.6676150560379028, + "learning_rate": 2.498179339538487e-06, + "loss": 1.4482, + "mean_token_accuracy": 0.6354714632034302, + "num_tokens": 2739401660.0, + "step": 16328 + }, + { + "entropy": 1.6591151058673859, + "epoch": 1.793826041580841, + "grad_norm": 0.8759251832962036, + "learning_rate": 2.497654160973493e-06, + "loss": 1.4139, + "mean_token_accuracy": 0.6550566603740057, + "num_tokens": 2739573508.0, + "step": 16329 + }, + { + "entropy": 1.7104254464308422, + "epoch": 1.7939358984922138, + "grad_norm": 0.6900741457939148, + "learning_rate": 2.4971292515039106e-06, + "loss": 1.4752, + "mean_token_accuracy": 0.6559490313132604, + "num_tokens": 2739757627.0, + "step": 16330 + }, + { + "entropy": 1.6923480729262035, + "epoch": 1.794045755403587, + "grad_norm": 0.7489110231399536, + "learning_rate": 2.496604611146358e-06, + "loss": 1.3643, + "mean_token_accuracy": 0.6735963573058447, + "num_tokens": 2739932452.0, + "step": 16331 + }, + { + "entropy": 1.6965441604455311, + "epoch": 1.7941556123149598, + "grad_norm": 0.7722020149230957, + "learning_rate": 2.4960802399174376e-06, + "loss": 1.1919, + "mean_token_accuracy": 0.6849365482727686, + "num_tokens": 2740045892.0, + "step": 16332 + }, + { + "entropy": 1.7439928154150646, + "epoch": 1.7942654692263327, + "grad_norm": 0.6960392594337463, + "learning_rate": 2.4955561378337446e-06, + "loss": 1.4115, + "mean_token_accuracy": 0.6489661236604055, + "num_tokens": 2740166797.0, + "step": 16333 + }, + { + "entropy": 1.6369553208351135, + "epoch": 1.7943753261377058, + "grad_norm": 0.5748756527900696, + "learning_rate": 2.4950323049118684e-06, + "loss": 1.3669, + "mean_token_accuracy": 0.6603866517543793, + "num_tokens": 2740395498.0, + "step": 16334 + }, + { + "entropy": 1.6582307914892833, + "epoch": 1.7944851830490784, + "grad_norm": 0.8305114507675171, + "learning_rate": 2.494508741168388e-06, + "loss": 1.414, + "mean_token_accuracy": 0.6532878627379736, + "num_tokens": 2740586934.0, + "step": 16335 + }, + { + "entropy": 1.7147388954957326, + "epoch": 1.7945950399604516, + "grad_norm": 0.7109110355377197, + "learning_rate": 2.493985446619872e-06, + "loss": 1.5826, + "mean_token_accuracy": 0.6403073569138845, + "num_tokens": 2740834396.0, + "step": 16336 + }, + { + "entropy": 1.6089663604895275, + "epoch": 1.7947048968718244, + "grad_norm": 0.6663435101509094, + "learning_rate": 2.493462421282884e-06, + "loss": 1.3889, + "mean_token_accuracy": 0.6556515793005625, + "num_tokens": 2741006779.0, + "step": 16337 + }, + { + "entropy": 1.676442285378774, + "epoch": 1.7948147537831973, + "grad_norm": 0.6819809675216675, + "learning_rate": 2.4929396651739773e-06, + "loss": 1.3259, + "mean_token_accuracy": 0.6694385011990865, + "num_tokens": 2741114771.0, + "step": 16338 + }, + { + "entropy": 1.6962731381257374, + "epoch": 1.7949246106945704, + "grad_norm": 0.7554741501808167, + "learning_rate": 2.492417178309697e-06, + "loss": 1.3638, + "mean_token_accuracy": 0.659926618138949, + "num_tokens": 2741261871.0, + "step": 16339 + }, + { + "entropy": 1.6720819075902302, + "epoch": 1.795034467605943, + "grad_norm": 0.6029784083366394, + "learning_rate": 2.491894960706579e-06, + "loss": 1.3433, + "mean_token_accuracy": 0.665259430805842, + "num_tokens": 2741445707.0, + "step": 16340 + }, + { + "entropy": 1.7332893908023834, + "epoch": 1.7951443245173162, + "grad_norm": 0.6843107342720032, + "learning_rate": 2.4913730123811525e-06, + "loss": 1.5745, + "mean_token_accuracy": 0.6281691541274389, + "num_tokens": 2741632351.0, + "step": 16341 + }, + { + "entropy": 1.6098364094893138, + "epoch": 1.795254181428689, + "grad_norm": 0.6576522588729858, + "learning_rate": 2.4908513333499353e-06, + "loss": 1.1807, + "mean_token_accuracy": 0.6885288804769516, + "num_tokens": 2741787954.0, + "step": 16342 + }, + { + "entropy": 1.7071658372879028, + "epoch": 1.795364038340062, + "grad_norm": 0.6134920716285706, + "learning_rate": 2.4903299236294394e-06, + "loss": 1.4462, + "mean_token_accuracy": 0.6478994737068812, + "num_tokens": 2741949788.0, + "step": 16343 + }, + { + "entropy": 1.7075544893741608, + "epoch": 1.795473895251435, + "grad_norm": 0.713398814201355, + "learning_rate": 2.489808783236168e-06, + "loss": 1.3601, + "mean_token_accuracy": 0.6593527148167292, + "num_tokens": 2742099407.0, + "step": 16344 + }, + { + "entropy": 1.6858652830123901, + "epoch": 1.795583752162808, + "grad_norm": 0.8232229351997375, + "learning_rate": 2.4892879121866113e-06, + "loss": 1.2947, + "mean_token_accuracy": 0.667354146639506, + "num_tokens": 2742230304.0, + "step": 16345 + }, + { + "entropy": 1.6845860878626506, + "epoch": 1.7956936090741809, + "grad_norm": 0.7478837966918945, + "learning_rate": 2.4887673104972583e-06, + "loss": 1.2776, + "mean_token_accuracy": 0.6781817525625229, + "num_tokens": 2742369127.0, + "step": 16346 + }, + { + "entropy": 1.7888353765010834, + "epoch": 1.795803465985554, + "grad_norm": 0.6277621984481812, + "learning_rate": 2.4882469781845847e-06, + "loss": 1.4691, + "mean_token_accuracy": 0.6437779317299525, + "num_tokens": 2742578400.0, + "step": 16347 + }, + { + "entropy": 1.755648523569107, + "epoch": 1.7959133228969266, + "grad_norm": 0.6520666480064392, + "learning_rate": 2.4877269152650597e-06, + "loss": 1.3963, + "mean_token_accuracy": 0.6472931802272797, + "num_tokens": 2742753222.0, + "step": 16348 + }, + { + "entropy": 1.733013888200124, + "epoch": 1.7960231798082997, + "grad_norm": 0.7249704599380493, + "learning_rate": 2.4872071217551404e-06, + "loss": 1.4501, + "mean_token_accuracy": 0.6556122601032257, + "num_tokens": 2742928454.0, + "step": 16349 + }, + { + "entropy": 1.7438992460568745, + "epoch": 1.7961330367196726, + "grad_norm": 0.6250995397567749, + "learning_rate": 2.4866875976712813e-06, + "loss": 1.4395, + "mean_token_accuracy": 0.6565362215042114, + "num_tokens": 2743122316.0, + "step": 16350 + }, + { + "entropy": 1.620346486568451, + "epoch": 1.7962428936310455, + "grad_norm": 0.673563539981842, + "learning_rate": 2.4861683430299236e-06, + "loss": 1.4165, + "mean_token_accuracy": 0.6502549201250076, + "num_tokens": 2743314494.0, + "step": 16351 + }, + { + "entropy": 1.7250482241312664, + "epoch": 1.7963527505424186, + "grad_norm": 0.7625929117202759, + "learning_rate": 2.4856493578475003e-06, + "loss": 1.4833, + "mean_token_accuracy": 0.6498374988635381, + "num_tokens": 2743438742.0, + "step": 16352 + }, + { + "entropy": 1.7725351254145305, + "epoch": 1.7964626074537913, + "grad_norm": 0.7012255191802979, + "learning_rate": 2.485130642140439e-06, + "loss": 1.3361, + "mean_token_accuracy": 0.6614055832227071, + "num_tokens": 2743573991.0, + "step": 16353 + }, + { + "entropy": 1.677201271057129, + "epoch": 1.7965724643651644, + "grad_norm": 0.7226030230522156, + "learning_rate": 2.484612195925154e-06, + "loss": 1.3256, + "mean_token_accuracy": 0.665013333161672, + "num_tokens": 2743742342.0, + "step": 16354 + }, + { + "entropy": 1.7017574906349182, + "epoch": 1.7966823212765373, + "grad_norm": 0.6619887948036194, + "learning_rate": 2.4840940192180585e-06, + "loss": 1.4644, + "mean_token_accuracy": 0.6368465920289358, + "num_tokens": 2743926810.0, + "step": 16355 + }, + { + "entropy": 1.762619137763977, + "epoch": 1.7967921781879101, + "grad_norm": 1.3564014434814453, + "learning_rate": 2.4835761120355495e-06, + "loss": 1.2873, + "mean_token_accuracy": 0.669828325510025, + "num_tokens": 2744051036.0, + "step": 16356 + }, + { + "entropy": 1.7027521828810375, + "epoch": 1.7969020350992833, + "grad_norm": 0.597287654876709, + "learning_rate": 2.4830584743940176e-06, + "loss": 1.4155, + "mean_token_accuracy": 0.6454381992419561, + "num_tokens": 2744217006.0, + "step": 16357 + }, + { + "entropy": 1.7219856878121693, + "epoch": 1.7970118920106561, + "grad_norm": 0.6755548119544983, + "learning_rate": 2.4825411063098465e-06, + "loss": 1.5516, + "mean_token_accuracy": 0.6386887629826864, + "num_tokens": 2744493689.0, + "step": 16358 + }, + { + "entropy": 1.7195583780606587, + "epoch": 1.797121748922029, + "grad_norm": 0.7201851010322571, + "learning_rate": 2.482024007799414e-06, + "loss": 1.3217, + "mean_token_accuracy": 0.661146675546964, + "num_tokens": 2744632436.0, + "step": 16359 + }, + { + "entropy": 1.6850773394107819, + "epoch": 1.7972316058334021, + "grad_norm": 0.5701948404312134, + "learning_rate": 2.4815071788790824e-06, + "loss": 1.3307, + "mean_token_accuracy": 0.6748195836941401, + "num_tokens": 2744877316.0, + "step": 16360 + }, + { + "entropy": 1.7464614311854045, + "epoch": 1.7973414627447748, + "grad_norm": 0.8613492250442505, + "learning_rate": 2.480990619565209e-06, + "loss": 1.4267, + "mean_token_accuracy": 0.6546533902486166, + "num_tokens": 2745013143.0, + "step": 16361 + }, + { + "entropy": 1.738052507241567, + "epoch": 1.797451319656148, + "grad_norm": 0.6792759895324707, + "learning_rate": 2.480474329874146e-06, + "loss": 1.4118, + "mean_token_accuracy": 0.657256638010343, + "num_tokens": 2745174814.0, + "step": 16362 + }, + { + "entropy": 1.6887112458546956, + "epoch": 1.7975611765675208, + "grad_norm": 0.6691803932189941, + "learning_rate": 2.4799583098222295e-06, + "loss": 1.4631, + "mean_token_accuracy": 0.6501191159089407, + "num_tokens": 2745325641.0, + "step": 16363 + }, + { + "entropy": 1.7012372314929962, + "epoch": 1.7976710334788937, + "grad_norm": 0.6485432386398315, + "learning_rate": 2.479442559425793e-06, + "loss": 1.2735, + "mean_token_accuracy": 0.6670918663342794, + "num_tokens": 2745439047.0, + "step": 16364 + }, + { + "entropy": 1.7309893469015758, + "epoch": 1.7977808903902668, + "grad_norm": 0.8326260447502136, + "learning_rate": 2.4789270787011615e-06, + "loss": 1.3052, + "mean_token_accuracy": 0.6672724187374115, + "num_tokens": 2745546360.0, + "step": 16365 + }, + { + "entropy": 1.7189862628777821, + "epoch": 1.7978907473016397, + "grad_norm": 0.7580441236495972, + "learning_rate": 2.4784118676646467e-06, + "loss": 1.3881, + "mean_token_accuracy": 0.6678448468446732, + "num_tokens": 2745732348.0, + "step": 16366 + }, + { + "entropy": 1.6766027708848317, + "epoch": 1.7980006042130126, + "grad_norm": 0.6480311155319214, + "learning_rate": 2.477896926332558e-06, + "loss": 1.4681, + "mean_token_accuracy": 0.6425887246926626, + "num_tokens": 2745921764.0, + "step": 16367 + }, + { + "entropy": 1.6788609822591145, + "epoch": 1.7981104611243854, + "grad_norm": 0.6684360504150391, + "learning_rate": 2.477382254721191e-06, + "loss": 1.4321, + "mean_token_accuracy": 0.6493734816710154, + "num_tokens": 2746126329.0, + "step": 16368 + }, + { + "entropy": 1.766765018304189, + "epoch": 1.7982203180357583, + "grad_norm": 0.784034788608551, + "learning_rate": 2.4768678528468345e-06, + "loss": 1.4098, + "mean_token_accuracy": 0.6438094178835551, + "num_tokens": 2746279905.0, + "step": 16369 + }, + { + "entropy": 1.696258048216502, + "epoch": 1.7983301749471314, + "grad_norm": 0.6454617977142334, + "learning_rate": 2.476353720725771e-06, + "loss": 1.3864, + "mean_token_accuracy": 0.6533452222744623, + "num_tokens": 2746468173.0, + "step": 16370 + }, + { + "entropy": 1.684233695268631, + "epoch": 1.7984400318585043, + "grad_norm": 0.8708049654960632, + "learning_rate": 2.475839858374269e-06, + "loss": 1.3214, + "mean_token_accuracy": 0.6717989295721054, + "num_tokens": 2746606416.0, + "step": 16371 + }, + { + "entropy": 1.6811311642328899, + "epoch": 1.7985498887698772, + "grad_norm": 0.5873830914497375, + "learning_rate": 2.475326265808597e-06, + "loss": 1.3903, + "mean_token_accuracy": 0.6625532309214274, + "num_tokens": 2746825476.0, + "step": 16372 + }, + { + "entropy": 1.6943085193634033, + "epoch": 1.7986597456812503, + "grad_norm": 0.6107808351516724, + "learning_rate": 2.474812943045007e-06, + "loss": 1.3958, + "mean_token_accuracy": 0.6486289997895559, + "num_tokens": 2747042577.0, + "step": 16373 + }, + { + "entropy": 1.6802096863587697, + "epoch": 1.798769602592623, + "grad_norm": 0.6949267387390137, + "learning_rate": 2.474299890099744e-06, + "loss": 1.3366, + "mean_token_accuracy": 0.6724252700805664, + "num_tokens": 2747184615.0, + "step": 16374 + }, + { + "entropy": 1.7157519956429799, + "epoch": 1.798879459503996, + "grad_norm": 0.7292264103889465, + "learning_rate": 2.47378710698905e-06, + "loss": 1.2698, + "mean_token_accuracy": 0.671657994389534, + "num_tokens": 2747306579.0, + "step": 16375 + }, + { + "entropy": 1.7131900389989216, + "epoch": 1.798989316415369, + "grad_norm": 0.750167727470398, + "learning_rate": 2.4732745937291515e-06, + "loss": 1.3823, + "mean_token_accuracy": 0.6537191818157831, + "num_tokens": 2747456528.0, + "step": 16376 + }, + { + "entropy": 1.6936882932980855, + "epoch": 1.7990991733267419, + "grad_norm": 0.709400475025177, + "learning_rate": 2.4727623503362686e-06, + "loss": 1.3456, + "mean_token_accuracy": 0.6595764954884847, + "num_tokens": 2747582462.0, + "step": 16377 + }, + { + "entropy": 1.7834466397762299, + "epoch": 1.799209030238115, + "grad_norm": 0.7460691928863525, + "learning_rate": 2.4722503768266144e-06, + "loss": 1.4517, + "mean_token_accuracy": 0.6345730274915695, + "num_tokens": 2747760658.0, + "step": 16378 + }, + { + "entropy": 1.6861862341562908, + "epoch": 1.7993188871494878, + "grad_norm": 0.6694313287734985, + "learning_rate": 2.4717386732163953e-06, + "loss": 1.3049, + "mean_token_accuracy": 0.66120112935702, + "num_tokens": 2747884938.0, + "step": 16379 + }, + { + "entropy": 1.715090274810791, + "epoch": 1.7994287440608607, + "grad_norm": 0.6291208267211914, + "learning_rate": 2.471227239521804e-06, + "loss": 1.3891, + "mean_token_accuracy": 0.6506547033786774, + "num_tokens": 2748086682.0, + "step": 16380 + }, + { + "entropy": 1.6372637848059337, + "epoch": 1.7995386009722336, + "grad_norm": 0.6571980714797974, + "learning_rate": 2.4707160757590253e-06, + "loss": 1.2591, + "mean_token_accuracy": 0.6797957370678583, + "num_tokens": 2748239953.0, + "step": 16381 + }, + { + "entropy": 1.7329098383585613, + "epoch": 1.7996484578836065, + "grad_norm": 0.67576664686203, + "learning_rate": 2.470205181944242e-06, + "loss": 1.6271, + "mean_token_accuracy": 0.6063709209362665, + "num_tokens": 2748496615.0, + "step": 16382 + }, + { + "entropy": 1.7593752145767212, + "epoch": 1.7997583147949796, + "grad_norm": 0.6144885420799255, + "learning_rate": 2.469694558093618e-06, + "loss": 1.4584, + "mean_token_accuracy": 0.6472984254360199, + "num_tokens": 2748757893.0, + "step": 16383 + }, + { + "entropy": 1.760807067155838, + "epoch": 1.7998681717063525, + "grad_norm": 109.00904083251953, + "learning_rate": 2.469184204223321e-06, + "loss": 1.6694, + "mean_token_accuracy": 0.6406611104806265, + "num_tokens": 2748996769.0, + "step": 16384 + }, + { + "entropy": 1.6700663566589355, + "epoch": 1.7999780286177254, + "grad_norm": 0.6122145652770996, + "learning_rate": 2.4686741203494976e-06, + "loss": 1.339, + "mean_token_accuracy": 0.6703683187564214, + "num_tokens": 2749191720.0, + "step": 16385 + }, + { + "entropy": 1.6805997391541798, + "epoch": 1.8000878855290985, + "grad_norm": 0.6632294058799744, + "learning_rate": 2.468164306488295e-06, + "loss": 1.3269, + "mean_token_accuracy": 0.6549940158923467, + "num_tokens": 2749342736.0, + "step": 16386 + }, + { + "entropy": 1.7235978146394093, + "epoch": 1.8001977424404711, + "grad_norm": 0.6988422870635986, + "learning_rate": 2.467654762655847e-06, + "loss": 1.3662, + "mean_token_accuracy": 0.6608254263798395, + "num_tokens": 2749463576.0, + "step": 16387 + }, + { + "entropy": 1.73094642162323, + "epoch": 1.8003075993518443, + "grad_norm": 0.7575457096099854, + "learning_rate": 2.467145488868281e-06, + "loss": 1.4601, + "mean_token_accuracy": 0.6553111871083578, + "num_tokens": 2749630135.0, + "step": 16388 + }, + { + "entropy": 1.700294444958369, + "epoch": 1.8004174562632171, + "grad_norm": 0.6070172190666199, + "learning_rate": 2.4666364851417153e-06, + "loss": 1.5017, + "mean_token_accuracy": 0.6433312793572744, + "num_tokens": 2749851486.0, + "step": 16389 + }, + { + "entropy": 1.7130950689315796, + "epoch": 1.80052731317459, + "grad_norm": 0.7260795831680298, + "learning_rate": 2.4661277514922587e-06, + "loss": 1.3681, + "mean_token_accuracy": 0.650189533829689, + "num_tokens": 2750035261.0, + "step": 16390 + }, + { + "entropy": 1.700755516688029, + "epoch": 1.8006371700859631, + "grad_norm": 0.7316020131111145, + "learning_rate": 2.4656192879360145e-06, + "loss": 1.4561, + "mean_token_accuracy": 0.6599542399247488, + "num_tokens": 2750188972.0, + "step": 16391 + }, + { + "entropy": 1.6893901228904724, + "epoch": 1.800747026997336, + "grad_norm": 0.7152737975120544, + "learning_rate": 2.465111094489074e-06, + "loss": 1.2815, + "mean_token_accuracy": 0.6717756688594818, + "num_tokens": 2750332850.0, + "step": 16392 + }, + { + "entropy": 1.7034862637519836, + "epoch": 1.800856883908709, + "grad_norm": 0.6364946365356445, + "learning_rate": 2.464603171167521e-06, + "loss": 1.4426, + "mean_token_accuracy": 0.6520007997751236, + "num_tokens": 2750560589.0, + "step": 16393 + }, + { + "entropy": 1.6827989121278126, + "epoch": 1.8009667408200818, + "grad_norm": 0.6871801614761353, + "learning_rate": 2.4640955179874333e-06, + "loss": 1.2716, + "mean_token_accuracy": 0.6799880017836889, + "num_tokens": 2750726349.0, + "step": 16394 + }, + { + "entropy": 1.729516049226125, + "epoch": 1.8010765977314547, + "grad_norm": 0.7461774349212646, + "learning_rate": 2.4635881349648734e-06, + "loss": 1.4294, + "mean_token_accuracy": 0.6584520041942596, + "num_tokens": 2750898613.0, + "step": 16395 + }, + { + "entropy": 1.6981934209664662, + "epoch": 1.8011864546428278, + "grad_norm": 0.6527087688446045, + "learning_rate": 2.4630810221159043e-06, + "loss": 1.3578, + "mean_token_accuracy": 0.6532334089279175, + "num_tokens": 2751050794.0, + "step": 16396 + }, + { + "entropy": 1.7621082564194996, + "epoch": 1.8012963115542007, + "grad_norm": 0.6884635090827942, + "learning_rate": 2.462574179456574e-06, + "loss": 1.4785, + "mean_token_accuracy": 0.6381318867206573, + "num_tokens": 2751240455.0, + "step": 16397 + }, + { + "entropy": 1.6867110133171082, + "epoch": 1.8014061684655736, + "grad_norm": 0.5808276534080505, + "learning_rate": 2.4620676070029223e-06, + "loss": 1.4725, + "mean_token_accuracy": 0.651040847102801, + "num_tokens": 2751419542.0, + "step": 16398 + }, + { + "entropy": 1.6382981638113658, + "epoch": 1.8015160253769467, + "grad_norm": 0.5899358987808228, + "learning_rate": 2.4615613047709847e-06, + "loss": 1.3374, + "mean_token_accuracy": 0.660874272386233, + "num_tokens": 2751603980.0, + "step": 16399 + }, + { + "entropy": 1.7465067307154338, + "epoch": 1.8016258822883193, + "grad_norm": 0.700994610786438, + "learning_rate": 2.4610552727767843e-06, + "loss": 1.5425, + "mean_token_accuracy": 0.6477147589127222, + "num_tokens": 2751812703.0, + "step": 16400 + }, + { + "entropy": 1.6795567174752553, + "epoch": 1.8017357391996924, + "grad_norm": 0.6919041872024536, + "learning_rate": 2.4605495110363366e-06, + "loss": 1.4238, + "mean_token_accuracy": 0.6519719262917837, + "num_tokens": 2751984688.0, + "step": 16401 + }, + { + "entropy": 1.8034850259621937, + "epoch": 1.8018455961110653, + "grad_norm": 0.8304495215415955, + "learning_rate": 2.4600440195656476e-06, + "loss": 1.3008, + "mean_token_accuracy": 0.6683735996484756, + "num_tokens": 2752123752.0, + "step": 16402 + }, + { + "entropy": 1.7134062051773071, + "epoch": 1.8019554530224382, + "grad_norm": 0.668536901473999, + "learning_rate": 2.459538798380719e-06, + "loss": 1.3065, + "mean_token_accuracy": 0.6793159395456314, + "num_tokens": 2752275099.0, + "step": 16403 + }, + { + "entropy": 1.6386187970638275, + "epoch": 1.8020653099338113, + "grad_norm": 0.69599449634552, + "learning_rate": 2.4590338474975397e-06, + "loss": 1.3571, + "mean_token_accuracy": 0.6669880499442419, + "num_tokens": 2752404359.0, + "step": 16404 + }, + { + "entropy": 1.7862418989340465, + "epoch": 1.8021751668451842, + "grad_norm": 0.7669715881347656, + "learning_rate": 2.4585291669320877e-06, + "loss": 1.3221, + "mean_token_accuracy": 0.6571997304757436, + "num_tokens": 2752532453.0, + "step": 16405 + }, + { + "entropy": 1.6611520648002625, + "epoch": 1.802285023756557, + "grad_norm": 0.8591197729110718, + "learning_rate": 2.458024756700341e-06, + "loss": 1.2213, + "mean_token_accuracy": 0.6755464772383372, + "num_tokens": 2752670377.0, + "step": 16406 + }, + { + "entropy": 1.7248308161894481, + "epoch": 1.80239488066793, + "grad_norm": 0.7357346415519714, + "learning_rate": 2.4575206168182605e-06, + "loss": 1.4571, + "mean_token_accuracy": 0.6435425728559494, + "num_tokens": 2752869097.0, + "step": 16407 + }, + { + "entropy": 1.7167830963929493, + "epoch": 1.8025047375793029, + "grad_norm": 0.690274178981781, + "learning_rate": 2.457016747301804e-06, + "loss": 1.3364, + "mean_token_accuracy": 0.662845383087794, + "num_tokens": 2753007646.0, + "step": 16408 + }, + { + "entropy": 1.8022632400194805, + "epoch": 1.802614594490676, + "grad_norm": 0.8202261328697205, + "learning_rate": 2.4565131481669175e-06, + "loss": 1.3481, + "mean_token_accuracy": 0.6701598316431046, + "num_tokens": 2753123586.0, + "step": 16409 + }, + { + "entropy": 1.6932842234770458, + "epoch": 1.8027244514020488, + "grad_norm": 0.733900249004364, + "learning_rate": 2.4560098194295397e-06, + "loss": 1.4256, + "mean_token_accuracy": 0.6689083476861318, + "num_tokens": 2753287054.0, + "step": 16410 + }, + { + "entropy": 1.750009814898173, + "epoch": 1.8028343083134217, + "grad_norm": 0.7286418676376343, + "learning_rate": 2.455506761105601e-06, + "loss": 1.5902, + "mean_token_accuracy": 0.6424010594685873, + "num_tokens": 2753491319.0, + "step": 16411 + }, + { + "entropy": 1.6446313957373302, + "epoch": 1.8029441652247948, + "grad_norm": 0.7561642527580261, + "learning_rate": 2.455003973211025e-06, + "loss": 1.4449, + "mean_token_accuracy": 0.6605862602591515, + "num_tokens": 2753684199.0, + "step": 16412 + }, + { + "entropy": 1.7904584010442097, + "epoch": 1.8030540221361675, + "grad_norm": 0.7615606188774109, + "learning_rate": 2.4545014557617205e-06, + "loss": 1.4292, + "mean_token_accuracy": 0.6448209335406622, + "num_tokens": 2753848636.0, + "step": 16413 + }, + { + "entropy": 1.7027158041795094, + "epoch": 1.8031638790475406, + "grad_norm": 0.7243815660476685, + "learning_rate": 2.4539992087735937e-06, + "loss": 1.2719, + "mean_token_accuracy": 0.6721286574999491, + "num_tokens": 2753989493.0, + "step": 16414 + }, + { + "entropy": 1.6345816453297932, + "epoch": 1.8032737359589135, + "grad_norm": 0.6394364833831787, + "learning_rate": 2.4534972322625434e-06, + "loss": 1.3012, + "mean_token_accuracy": 0.670843780040741, + "num_tokens": 2754145938.0, + "step": 16415 + }, + { + "entropy": 1.710991491874059, + "epoch": 1.8033835928702864, + "grad_norm": 0.8464189171791077, + "learning_rate": 2.4529955262444534e-06, + "loss": 1.427, + "mean_token_accuracy": 0.6431051045656204, + "num_tokens": 2754279254.0, + "step": 16416 + }, + { + "entropy": 1.6918781101703644, + "epoch": 1.8034934497816595, + "grad_norm": 0.7020459771156311, + "learning_rate": 2.4524940907352028e-06, + "loss": 1.3814, + "mean_token_accuracy": 0.6550219456354777, + "num_tokens": 2754413658.0, + "step": 16417 + }, + { + "entropy": 1.653142919143041, + "epoch": 1.8036033066930324, + "grad_norm": 0.6922260522842407, + "learning_rate": 2.4519929257506644e-06, + "loss": 1.2536, + "mean_token_accuracy": 0.681772361199061, + "num_tokens": 2754595362.0, + "step": 16418 + }, + { + "entropy": 1.7180135349432628, + "epoch": 1.8037131636044053, + "grad_norm": 0.6518612504005432, + "learning_rate": 2.4514920313066972e-06, + "loss": 1.4934, + "mean_token_accuracy": 0.6491179863611857, + "num_tokens": 2754770348.0, + "step": 16419 + }, + { + "entropy": 1.646272877852122, + "epoch": 1.8038230205157784, + "grad_norm": 0.9592717885971069, + "learning_rate": 2.4509914074191544e-06, + "loss": 1.2443, + "mean_token_accuracy": 0.6764777451753616, + "num_tokens": 2754908323.0, + "step": 16420 + }, + { + "entropy": 1.7484492460886638, + "epoch": 1.803932877427151, + "grad_norm": 0.6856718063354492, + "learning_rate": 2.450491054103883e-06, + "loss": 1.338, + "mean_token_accuracy": 0.6600681195656458, + "num_tokens": 2755054083.0, + "step": 16421 + }, + { + "entropy": 1.6522502601146698, + "epoch": 1.8040427343385241, + "grad_norm": 0.5855985283851624, + "learning_rate": 2.4499909713767156e-06, + "loss": 1.3849, + "mean_token_accuracy": 0.6528994540373484, + "num_tokens": 2755261170.0, + "step": 16422 + }, + { + "entropy": 1.742342193921407, + "epoch": 1.804152591249897, + "grad_norm": 0.6841393113136292, + "learning_rate": 2.4494911592534825e-06, + "loss": 1.3531, + "mean_token_accuracy": 0.6608762443065643, + "num_tokens": 2755430970.0, + "step": 16423 + }, + { + "entropy": 1.7070422967274983, + "epoch": 1.80426244816127, + "grad_norm": 0.7020707726478577, + "learning_rate": 2.4489916177500013e-06, + "loss": 1.402, + "mean_token_accuracy": 0.6581531713406245, + "num_tokens": 2755580117.0, + "step": 16424 + }, + { + "entropy": 1.6086894969145458, + "epoch": 1.804372305072643, + "grad_norm": 0.7450229525566101, + "learning_rate": 2.4484923468820805e-06, + "loss": 1.4186, + "mean_token_accuracy": 0.6774997810522715, + "num_tokens": 2755722550.0, + "step": 16425 + }, + { + "entropy": 1.629365513722102, + "epoch": 1.8044821619840157, + "grad_norm": 0.5438582897186279, + "learning_rate": 2.447993346665523e-06, + "loss": 1.3446, + "mean_token_accuracy": 0.6634021550416946, + "num_tokens": 2755920382.0, + "step": 16426 + }, + { + "entropy": 1.6984173556168873, + "epoch": 1.8045920188953888, + "grad_norm": 0.6832945346832275, + "learning_rate": 2.447494617116126e-06, + "loss": 1.2979, + "mean_token_accuracy": 0.66312904159228, + "num_tokens": 2756073656.0, + "step": 16427 + }, + { + "entropy": 1.6762990454832714, + "epoch": 1.8047018758067617, + "grad_norm": 0.5510643124580383, + "learning_rate": 2.4469961582496683e-06, + "loss": 1.3274, + "mean_token_accuracy": 0.6577611863613129, + "num_tokens": 2756260540.0, + "step": 16428 + }, + { + "entropy": 1.7335613071918488, + "epoch": 1.8048117327181346, + "grad_norm": 0.6928039193153381, + "learning_rate": 2.446497970081928e-06, + "loss": 1.4443, + "mean_token_accuracy": 0.6604134688774744, + "num_tokens": 2756393501.0, + "step": 16429 + }, + { + "entropy": 1.7227633396784465, + "epoch": 1.8049215896295077, + "grad_norm": 0.6674528121948242, + "learning_rate": 2.4460000526286727e-06, + "loss": 1.3792, + "mean_token_accuracy": 0.6571878095467886, + "num_tokens": 2756561165.0, + "step": 16430 + }, + { + "entropy": 1.6204917430877686, + "epoch": 1.8050314465408805, + "grad_norm": 0.6645229458808899, + "learning_rate": 2.4455024059056627e-06, + "loss": 1.4276, + "mean_token_accuracy": 0.6697969138622284, + "num_tokens": 2756764755.0, + "step": 16431 + }, + { + "entropy": 1.7642103830973308, + "epoch": 1.8051413034522534, + "grad_norm": 0.6755779385566711, + "learning_rate": 2.4450050299286452e-06, + "loss": 1.3912, + "mean_token_accuracy": 0.6555102616548538, + "num_tokens": 2756901089.0, + "step": 16432 + }, + { + "entropy": 1.7875695725282033, + "epoch": 1.8052511603636265, + "grad_norm": 0.6372548341751099, + "learning_rate": 2.444507924713364e-06, + "loss": 1.3773, + "mean_token_accuracy": 0.6585378497838974, + "num_tokens": 2757036878.0, + "step": 16433 + }, + { + "entropy": 1.726793756087621, + "epoch": 1.8053610172749992, + "grad_norm": 0.6793131828308105, + "learning_rate": 2.4440110902755513e-06, + "loss": 1.3715, + "mean_token_accuracy": 0.6532783309618632, + "num_tokens": 2757170150.0, + "step": 16434 + }, + { + "entropy": 1.7632285555203755, + "epoch": 1.8054708741863723, + "grad_norm": 0.6790938973426819, + "learning_rate": 2.443514526630933e-06, + "loss": 1.3588, + "mean_token_accuracy": 0.6496838182210922, + "num_tokens": 2757309818.0, + "step": 16435 + }, + { + "entropy": 1.6782483259836833, + "epoch": 1.8055807310977452, + "grad_norm": 0.6445368528366089, + "learning_rate": 2.4430182337952247e-06, + "loss": 1.4392, + "mean_token_accuracy": 0.6515261183182398, + "num_tokens": 2757486240.0, + "step": 16436 + }, + { + "entropy": 1.6433900197347004, + "epoch": 1.805690588009118, + "grad_norm": 0.6568174958229065, + "learning_rate": 2.4425222117841315e-06, + "loss": 1.2252, + "mean_token_accuracy": 0.6834449718395869, + "num_tokens": 2757602672.0, + "step": 16437 + }, + { + "entropy": 1.7270792822043102, + "epoch": 1.8058004449204912, + "grad_norm": 0.7385875582695007, + "learning_rate": 2.4420264606133555e-06, + "loss": 1.3364, + "mean_token_accuracy": 0.6688449184099833, + "num_tokens": 2757751042.0, + "step": 16438 + }, + { + "entropy": 1.6770341396331787, + "epoch": 1.8059103018318639, + "grad_norm": 0.7027744650840759, + "learning_rate": 2.4415309802985854e-06, + "loss": 1.2349, + "mean_token_accuracy": 0.6747290591398875, + "num_tokens": 2757876536.0, + "step": 16439 + }, + { + "entropy": 1.7311459481716156, + "epoch": 1.806020158743237, + "grad_norm": 0.8075997233390808, + "learning_rate": 2.4410357708555032e-06, + "loss": 1.2985, + "mean_token_accuracy": 0.6764100193977356, + "num_tokens": 2758041919.0, + "step": 16440 + }, + { + "entropy": 1.6318459411462147, + "epoch": 1.8061300156546098, + "grad_norm": 0.5902323126792908, + "learning_rate": 2.440540832299783e-06, + "loss": 1.3022, + "mean_token_accuracy": 0.6714819123347601, + "num_tokens": 2758217442.0, + "step": 16441 + }, + { + "entropy": 1.7181476652622223, + "epoch": 1.8062398725659827, + "grad_norm": 0.8379008769989014, + "learning_rate": 2.440046164647087e-06, + "loss": 1.4068, + "mean_token_accuracy": 0.6589639882246653, + "num_tokens": 2758376926.0, + "step": 16442 + }, + { + "entropy": 1.672847221295039, + "epoch": 1.8063497294773558, + "grad_norm": 0.7189886569976807, + "learning_rate": 2.4395517679130744e-06, + "loss": 1.3829, + "mean_token_accuracy": 0.663548931479454, + "num_tokens": 2758551062.0, + "step": 16443 + }, + { + "entropy": 1.7527056137720745, + "epoch": 1.8064595863887287, + "grad_norm": 0.8830350041389465, + "learning_rate": 2.4390576421133897e-06, + "loss": 1.4128, + "mean_token_accuracy": 0.6445074528455734, + "num_tokens": 2758717383.0, + "step": 16444 + }, + { + "entropy": 1.6904211342334747, + "epoch": 1.8065694433001016, + "grad_norm": 0.7606146931648254, + "learning_rate": 2.438563787263673e-06, + "loss": 1.4546, + "mean_token_accuracy": 0.6546589334805807, + "num_tokens": 2758926548.0, + "step": 16445 + }, + { + "entropy": 1.684679885705312, + "epoch": 1.8066793002114747, + "grad_norm": 0.7838829159736633, + "learning_rate": 2.4380702033795538e-06, + "loss": 1.487, + "mean_token_accuracy": 0.6486780146757761, + "num_tokens": 2759100149.0, + "step": 16446 + }, + { + "entropy": 1.677791029214859, + "epoch": 1.8067891571228474, + "grad_norm": 0.6367784738540649, + "learning_rate": 2.4375768904766563e-06, + "loss": 1.4016, + "mean_token_accuracy": 0.6575369586547216, + "num_tokens": 2759261585.0, + "step": 16447 + }, + { + "entropy": 1.7279250423113506, + "epoch": 1.8068990140342205, + "grad_norm": 0.6288533210754395, + "learning_rate": 2.4370838485705912e-06, + "loss": 1.2892, + "mean_token_accuracy": 0.6716119796037674, + "num_tokens": 2759451394.0, + "step": 16448 + }, + { + "entropy": 1.6070989569028218, + "epoch": 1.8070088709455934, + "grad_norm": 0.5549145936965942, + "learning_rate": 2.4365910776769634e-06, + "loss": 1.3516, + "mean_token_accuracy": 0.664691095550855, + "num_tokens": 2759635049.0, + "step": 16449 + }, + { + "entropy": 1.6843027174472809, + "epoch": 1.8071187278569663, + "grad_norm": 0.7729708552360535, + "learning_rate": 2.4360985778113696e-06, + "loss": 1.2723, + "mean_token_accuracy": 0.6750624477863312, + "num_tokens": 2759847889.0, + "step": 16450 + }, + { + "entropy": 1.6489692231019337, + "epoch": 1.8072285847683394, + "grad_norm": 0.6890325546264648, + "learning_rate": 2.4356063489893965e-06, + "loss": 1.237, + "mean_token_accuracy": 0.6862647583087286, + "num_tokens": 2759961504.0, + "step": 16451 + }, + { + "entropy": 1.7302868167559307, + "epoch": 1.807338441679712, + "grad_norm": 0.6730805039405823, + "learning_rate": 2.4351143912266232e-06, + "loss": 1.4087, + "mean_token_accuracy": 0.6450496266285578, + "num_tokens": 2760109714.0, + "step": 16452 + }, + { + "entropy": 1.669048676888148, + "epoch": 1.8074482985910851, + "grad_norm": 0.6048988699913025, + "learning_rate": 2.4346227045386208e-06, + "loss": 1.4418, + "mean_token_accuracy": 0.6532367666562399, + "num_tokens": 2760296549.0, + "step": 16453 + }, + { + "entropy": 1.7388107577959697, + "epoch": 1.807558155502458, + "grad_norm": 0.6559601426124573, + "learning_rate": 2.4341312889409495e-06, + "loss": 1.4325, + "mean_token_accuracy": 0.6547484199206034, + "num_tokens": 2760490979.0, + "step": 16454 + }, + { + "entropy": 1.6647444764773052, + "epoch": 1.807668012413831, + "grad_norm": 0.6734881401062012, + "learning_rate": 2.433640144449164e-06, + "loss": 1.2971, + "mean_token_accuracy": 0.6663618286450704, + "num_tokens": 2760650612.0, + "step": 16455 + }, + { + "entropy": 1.7663162350654602, + "epoch": 1.807777869325204, + "grad_norm": 0.7578223943710327, + "learning_rate": 2.433149271078807e-06, + "loss": 1.3936, + "mean_token_accuracy": 0.6566072255373001, + "num_tokens": 2760761496.0, + "step": 16456 + }, + { + "entropy": 1.717143217722575, + "epoch": 1.807887726236577, + "grad_norm": 0.7225522994995117, + "learning_rate": 2.4326586688454147e-06, + "loss": 1.3449, + "mean_token_accuracy": 0.6655629724264145, + "num_tokens": 2760894645.0, + "step": 16457 + }, + { + "entropy": 1.6999610662460327, + "epoch": 1.8079975831479498, + "grad_norm": 0.6742311716079712, + "learning_rate": 2.4321683377645146e-06, + "loss": 1.4921, + "mean_token_accuracy": 0.6431157986323038, + "num_tokens": 2761080211.0, + "step": 16458 + }, + { + "entropy": 1.771820992231369, + "epoch": 1.808107440059323, + "grad_norm": 0.6953256726264954, + "learning_rate": 2.4316782778516275e-06, + "loss": 1.3855, + "mean_token_accuracy": 0.6551636606454849, + "num_tokens": 2761202587.0, + "step": 16459 + }, + { + "entropy": 1.745868742465973, + "epoch": 1.8082172969706956, + "grad_norm": 0.6078836917877197, + "learning_rate": 2.4311884891222613e-06, + "loss": 1.4532, + "mean_token_accuracy": 0.6424828767776489, + "num_tokens": 2761398670.0, + "step": 16460 + }, + { + "entropy": 1.6702168186505635, + "epoch": 1.8083271538820687, + "grad_norm": 0.6870954632759094, + "learning_rate": 2.4306989715919173e-06, + "loss": 1.4224, + "mean_token_accuracy": 0.6447147478659948, + "num_tokens": 2761597590.0, + "step": 16461 + }, + { + "entropy": 1.6488149464130402, + "epoch": 1.8084370107934415, + "grad_norm": 0.610784649848938, + "learning_rate": 2.4302097252760913e-06, + "loss": 1.4936, + "mean_token_accuracy": 0.6545774986346563, + "num_tokens": 2761794099.0, + "step": 16462 + }, + { + "entropy": 1.7186005214850109, + "epoch": 1.8085468677048144, + "grad_norm": 0.6306957602500916, + "learning_rate": 2.429720750190264e-06, + "loss": 1.3377, + "mean_token_accuracy": 0.6656250059604645, + "num_tokens": 2761973157.0, + "step": 16463 + }, + { + "entropy": 1.6818233629067738, + "epoch": 1.8086567246161875, + "grad_norm": 0.7410263419151306, + "learning_rate": 2.4292320463499144e-06, + "loss": 1.3116, + "mean_token_accuracy": 0.6546371678511301, + "num_tokens": 2762136640.0, + "step": 16464 + }, + { + "entropy": 1.6417667865753174, + "epoch": 1.8087665815275602, + "grad_norm": 0.6089370846748352, + "learning_rate": 2.428743613770508e-06, + "loss": 1.2356, + "mean_token_accuracy": 0.6799864719311396, + "num_tokens": 2762270207.0, + "step": 16465 + }, + { + "entropy": 1.746724675099055, + "epoch": 1.8088764384389333, + "grad_norm": 0.9241576194763184, + "learning_rate": 2.4282554524675036e-06, + "loss": 1.4229, + "mean_token_accuracy": 0.6547742585341135, + "num_tokens": 2762409921.0, + "step": 16466 + }, + { + "entropy": 1.6926732659339905, + "epoch": 1.8089862953503062, + "grad_norm": 0.7210370302200317, + "learning_rate": 2.4277675624563523e-06, + "loss": 1.3849, + "mean_token_accuracy": 0.6654962301254272, + "num_tokens": 2762570334.0, + "step": 16467 + }, + { + "entropy": 1.7325705389181774, + "epoch": 1.809096152261679, + "grad_norm": 0.6760542988777161, + "learning_rate": 2.4272799437524954e-06, + "loss": 1.4512, + "mean_token_accuracy": 0.6488116631905237, + "num_tokens": 2762709776.0, + "step": 16468 + }, + { + "entropy": 1.7742801705996196, + "epoch": 1.8092060091730522, + "grad_norm": 0.6283159852027893, + "learning_rate": 2.4267925963713634e-06, + "loss": 1.3957, + "mean_token_accuracy": 0.6601527482271194, + "num_tokens": 2762847503.0, + "step": 16469 + }, + { + "entropy": 1.6537544429302216, + "epoch": 1.809315866084425, + "grad_norm": 0.6634315252304077, + "learning_rate": 2.426305520328383e-06, + "loss": 1.3299, + "mean_token_accuracy": 0.6634285499652227, + "num_tokens": 2763000038.0, + "step": 16470 + }, + { + "entropy": 1.6798964043458302, + "epoch": 1.809425722995798, + "grad_norm": 0.7522450685501099, + "learning_rate": 2.4258187156389707e-06, + "loss": 1.4379, + "mean_token_accuracy": 0.675532266497612, + "num_tokens": 2763165723.0, + "step": 16471 + }, + { + "entropy": 1.6432731648286183, + "epoch": 1.809535579907171, + "grad_norm": 0.7260177135467529, + "learning_rate": 2.4253321823185318e-06, + "loss": 1.4119, + "mean_token_accuracy": 0.6538095225890478, + "num_tokens": 2763340508.0, + "step": 16472 + }, + { + "entropy": 1.6526922384897869, + "epoch": 1.8096454368185437, + "grad_norm": 0.6410662531852722, + "learning_rate": 2.4248459203824652e-06, + "loss": 1.4164, + "mean_token_accuracy": 0.653022438287735, + "num_tokens": 2763531382.0, + "step": 16473 + }, + { + "entropy": 1.6942103902498882, + "epoch": 1.8097552937299168, + "grad_norm": 0.8561227321624756, + "learning_rate": 2.4243599298461616e-06, + "loss": 1.4381, + "mean_token_accuracy": 0.6655525416135788, + "num_tokens": 2763661803.0, + "step": 16474 + }, + { + "entropy": 1.6899367968241374, + "epoch": 1.8098651506412897, + "grad_norm": 0.7844464778900146, + "learning_rate": 2.423874210725001e-06, + "loss": 1.3592, + "mean_token_accuracy": 0.6753224035104116, + "num_tokens": 2763804753.0, + "step": 16475 + }, + { + "entropy": 1.6260625620683034, + "epoch": 1.8099750075526626, + "grad_norm": 0.5812033414840698, + "learning_rate": 2.423388763034358e-06, + "loss": 1.4316, + "mean_token_accuracy": 0.6537586599588394, + "num_tokens": 2764008124.0, + "step": 16476 + }, + { + "entropy": 1.687359909216563, + "epoch": 1.8100848644640357, + "grad_norm": 0.7226285934448242, + "learning_rate": 2.422903586789597e-06, + "loss": 1.4195, + "mean_token_accuracy": 0.6663658916950226, + "num_tokens": 2764173536.0, + "step": 16477 + }, + { + "entropy": 1.7126532693703969, + "epoch": 1.8101947213754084, + "grad_norm": 0.6269643306732178, + "learning_rate": 2.4224186820060708e-06, + "loss": 1.4023, + "mean_token_accuracy": 0.6467209408680598, + "num_tokens": 2764346766.0, + "step": 16478 + }, + { + "entropy": 1.7550960679848988, + "epoch": 1.8103045782867815, + "grad_norm": 0.7029903531074524, + "learning_rate": 2.42193404869913e-06, + "loss": 1.2635, + "mean_token_accuracy": 0.6703293671210607, + "num_tokens": 2764462288.0, + "step": 16479 + }, + { + "entropy": 1.7869562208652496, + "epoch": 1.8104144351981544, + "grad_norm": 0.6959302425384521, + "learning_rate": 2.421449686884109e-06, + "loss": 1.549, + "mean_token_accuracy": 0.6411256889502207, + "num_tokens": 2764638641.0, + "step": 16480 + }, + { + "entropy": 1.7564504742622375, + "epoch": 1.8105242921095273, + "grad_norm": 0.6463617086410522, + "learning_rate": 2.4209655965763406e-06, + "loss": 1.4976, + "mean_token_accuracy": 0.6407757749160131, + "num_tokens": 2764811182.0, + "step": 16481 + }, + { + "entropy": 1.73487122853597, + "epoch": 1.8106341490209004, + "grad_norm": 0.7762302160263062, + "learning_rate": 2.4204817777911455e-06, + "loss": 1.3926, + "mean_token_accuracy": 0.6518355309963226, + "num_tokens": 2764952016.0, + "step": 16482 + }, + { + "entropy": 1.6598585546016693, + "epoch": 1.8107440059322732, + "grad_norm": 0.6841420531272888, + "learning_rate": 2.4199982305438365e-06, + "loss": 1.223, + "mean_token_accuracy": 0.6864841481049856, + "num_tokens": 2765057348.0, + "step": 16483 + }, + { + "entropy": 1.6540294587612152, + "epoch": 1.8108538628436461, + "grad_norm": 0.9833670854568481, + "learning_rate": 2.4195149548497173e-06, + "loss": 1.3703, + "mean_token_accuracy": 0.67726102968057, + "num_tokens": 2765199987.0, + "step": 16484 + }, + { + "entropy": 1.7150746981302898, + "epoch": 1.8109637197550192, + "grad_norm": 0.5984099507331848, + "learning_rate": 2.419031950724082e-06, + "loss": 1.4509, + "mean_token_accuracy": 0.6461076935132345, + "num_tokens": 2765417441.0, + "step": 16485 + }, + { + "entropy": 1.708159824212392, + "epoch": 1.811073576666392, + "grad_norm": 0.6068819165229797, + "learning_rate": 2.41854921818222e-06, + "loss": 1.4214, + "mean_token_accuracy": 0.6494432340065638, + "num_tokens": 2765635138.0, + "step": 16486 + }, + { + "entropy": 1.6690807143847148, + "epoch": 1.811183433577765, + "grad_norm": 0.7443904876708984, + "learning_rate": 2.4180667572394073e-06, + "loss": 1.2496, + "mean_token_accuracy": 0.6697708616654078, + "num_tokens": 2765765146.0, + "step": 16487 + }, + { + "entropy": 1.738635003566742, + "epoch": 1.811293290489138, + "grad_norm": 0.7414568662643433, + "learning_rate": 2.4175845679109157e-06, + "loss": 1.4483, + "mean_token_accuracy": 0.6477248768011729, + "num_tokens": 2765914159.0, + "step": 16488 + }, + { + "entropy": 1.73112353682518, + "epoch": 1.8114031474005108, + "grad_norm": 1.007875680923462, + "learning_rate": 2.417102650212005e-06, + "loss": 1.3697, + "mean_token_accuracy": 0.6697366237640381, + "num_tokens": 2766056322.0, + "step": 16489 + }, + { + "entropy": 1.6718713839848836, + "epoch": 1.811513004311884, + "grad_norm": 0.5636931657791138, + "learning_rate": 2.4166210041579266e-06, + "loss": 1.477, + "mean_token_accuracy": 0.6347486774126688, + "num_tokens": 2766306740.0, + "step": 16490 + }, + { + "entropy": 1.6812805632750194, + "epoch": 1.8116228612232566, + "grad_norm": 0.6897765398025513, + "learning_rate": 2.4161396297639277e-06, + "loss": 1.3959, + "mean_token_accuracy": 0.6611317793528239, + "num_tokens": 2766469990.0, + "step": 16491 + }, + { + "entropy": 1.7409476439158122, + "epoch": 1.8117327181346297, + "grad_norm": 0.9774511456489563, + "learning_rate": 2.4156585270452413e-06, + "loss": 1.7468, + "mean_token_accuracy": 0.643994982043902, + "num_tokens": 2766603370.0, + "step": 16492 + }, + { + "entropy": 1.6942278146743774, + "epoch": 1.8118425750460025, + "grad_norm": 0.6507070064544678, + "learning_rate": 2.415177696017093e-06, + "loss": 1.3617, + "mean_token_accuracy": 0.6736532896757126, + "num_tokens": 2766748463.0, + "step": 16493 + }, + { + "entropy": 1.6879957815011342, + "epoch": 1.8119524319573754, + "grad_norm": 0.6665419936180115, + "learning_rate": 2.4146971366947035e-06, + "loss": 1.3464, + "mean_token_accuracy": 0.6612015018860499, + "num_tokens": 2766923463.0, + "step": 16494 + }, + { + "entropy": 1.6867989003658295, + "epoch": 1.8120622888687485, + "grad_norm": 0.6739834547042847, + "learning_rate": 2.4142168490932784e-06, + "loss": 1.2979, + "mean_token_accuracy": 0.6711998730897903, + "num_tokens": 2767057148.0, + "step": 16495 + }, + { + "entropy": 1.7199506064256032, + "epoch": 1.8121721457801214, + "grad_norm": 0.6423784494400024, + "learning_rate": 2.413736833228024e-06, + "loss": 1.3756, + "mean_token_accuracy": 0.6621057589848837, + "num_tokens": 2767237139.0, + "step": 16496 + }, + { + "entropy": 1.6715861360232036, + "epoch": 1.8122820026914943, + "grad_norm": 0.6793438792228699, + "learning_rate": 2.4132570891141296e-06, + "loss": 1.2994, + "mean_token_accuracy": 0.6636922707160314, + "num_tokens": 2767411416.0, + "step": 16497 + }, + { + "entropy": 1.70285361011823, + "epoch": 1.8123918596028674, + "grad_norm": 0.6627052426338196, + "learning_rate": 2.412777616766778e-06, + "loss": 1.5275, + "mean_token_accuracy": 0.6365848928689957, + "num_tokens": 2767616868.0, + "step": 16498 + }, + { + "entropy": 1.748480220635732, + "epoch": 1.81250171651424, + "grad_norm": 0.6843937635421753, + "learning_rate": 2.4122984162011453e-06, + "loss": 1.4195, + "mean_token_accuracy": 0.6697787046432495, + "num_tokens": 2767808831.0, + "step": 16499 + }, + { + "entropy": 1.7178409099578857, + "epoch": 1.8126115734256132, + "grad_norm": 0.7172280550003052, + "learning_rate": 2.4118194874323993e-06, + "loss": 1.3364, + "mean_token_accuracy": 0.6643195003271103, + "num_tokens": 2767935675.0, + "step": 16500 + }, + { + "entropy": 1.7438208361466725, + "epoch": 1.812721430336986, + "grad_norm": 0.6629429459571838, + "learning_rate": 2.4113408304756943e-06, + "loss": 1.3889, + "mean_token_accuracy": 0.6561461488405863, + "num_tokens": 2768083615.0, + "step": 16501 + }, + { + "entropy": 1.7097637752691905, + "epoch": 1.812831287248359, + "grad_norm": 0.627873420715332, + "learning_rate": 2.4108624453461825e-06, + "loss": 1.3931, + "mean_token_accuracy": 0.6541771193345388, + "num_tokens": 2768263773.0, + "step": 16502 + }, + { + "entropy": 1.7309077978134155, + "epoch": 1.812941144159732, + "grad_norm": 0.6251326203346252, + "learning_rate": 2.4103843320590053e-06, + "loss": 1.548, + "mean_token_accuracy": 0.6347083449363708, + "num_tokens": 2768483831.0, + "step": 16503 + }, + { + "entropy": 1.645240803559621, + "epoch": 1.8130510010711047, + "grad_norm": 0.6707781553268433, + "learning_rate": 2.409906490629294e-06, + "loss": 1.4934, + "mean_token_accuracy": 0.6460568408171335, + "num_tokens": 2768679029.0, + "step": 16504 + }, + { + "entropy": 1.6946294804414113, + "epoch": 1.8131608579824778, + "grad_norm": 0.6006249785423279, + "learning_rate": 2.4094289210721684e-06, + "loss": 1.3718, + "mean_token_accuracy": 0.6605852544307709, + "num_tokens": 2768851311.0, + "step": 16505 + }, + { + "entropy": 1.711153248945872, + "epoch": 1.8132707148938507, + "grad_norm": 0.7911529541015625, + "learning_rate": 2.40895162340275e-06, + "loss": 1.5019, + "mean_token_accuracy": 0.653807650009791, + "num_tokens": 2768982496.0, + "step": 16506 + }, + { + "entropy": 1.6501058836778004, + "epoch": 1.8133805718052236, + "grad_norm": 0.6616920232772827, + "learning_rate": 2.4084745976361382e-06, + "loss": 1.3616, + "mean_token_accuracy": 0.6733062863349915, + "num_tokens": 2769152918.0, + "step": 16507 + }, + { + "entropy": 1.6238398055235546, + "epoch": 1.8134904287165967, + "grad_norm": 0.7626936435699463, + "learning_rate": 2.4079978437874357e-06, + "loss": 1.2714, + "mean_token_accuracy": 0.6772501319646835, + "num_tokens": 2769344820.0, + "step": 16508 + }, + { + "entropy": 1.7226660251617432, + "epoch": 1.8136002856279696, + "grad_norm": 0.6929018497467041, + "learning_rate": 2.4075213618717304e-06, + "loss": 1.4003, + "mean_token_accuracy": 0.6568613747755686, + "num_tokens": 2769517942.0, + "step": 16509 + }, + { + "entropy": 1.8020154933134716, + "epoch": 1.8137101425393425, + "grad_norm": 0.7082515358924866, + "learning_rate": 2.4070451519041014e-06, + "loss": 1.3086, + "mean_token_accuracy": 0.669136126836141, + "num_tokens": 2769667335.0, + "step": 16510 + }, + { + "entropy": 1.7351914743582408, + "epoch": 1.8138199994507156, + "grad_norm": 0.8061874508857727, + "learning_rate": 2.406569213899621e-06, + "loss": 1.4222, + "mean_token_accuracy": 0.6561322311560313, + "num_tokens": 2769819275.0, + "step": 16511 + }, + { + "entropy": 1.673819233973821, + "epoch": 1.8139298563620883, + "grad_norm": 0.7893275618553162, + "learning_rate": 2.4060935478733538e-06, + "loss": 1.2336, + "mean_token_accuracy": 0.6748340229193369, + "num_tokens": 2769964391.0, + "step": 16512 + }, + { + "entropy": 1.708072344462077, + "epoch": 1.8140397132734614, + "grad_norm": 0.7377780079841614, + "learning_rate": 2.4056181538403515e-06, + "loss": 1.4061, + "mean_token_accuracy": 0.6597805072863897, + "num_tokens": 2770118160.0, + "step": 16513 + }, + { + "entropy": 1.714382102092107, + "epoch": 1.8141495701848342, + "grad_norm": 0.5593966245651245, + "learning_rate": 2.4051430318156622e-06, + "loss": 1.415, + "mean_token_accuracy": 0.6464897443850836, + "num_tokens": 2770391818.0, + "step": 16514 + }, + { + "entropy": 1.7466392715771992, + "epoch": 1.8142594270962071, + "grad_norm": 0.7103216052055359, + "learning_rate": 2.4046681818143245e-06, + "loss": 1.3028, + "mean_token_accuracy": 0.6765001912911733, + "num_tokens": 2770508407.0, + "step": 16515 + }, + { + "entropy": 1.765973150730133, + "epoch": 1.8143692840075802, + "grad_norm": 0.7408754229545593, + "learning_rate": 2.4041936038513647e-06, + "loss": 1.3235, + "mean_token_accuracy": 0.6654133200645447, + "num_tokens": 2770656487.0, + "step": 16516 + }, + { + "entropy": 1.6894567012786865, + "epoch": 1.814479140918953, + "grad_norm": 0.6805311441421509, + "learning_rate": 2.4037192979418036e-06, + "loss": 1.4779, + "mean_token_accuracy": 0.6684766709804535, + "num_tokens": 2770806625.0, + "step": 16517 + }, + { + "entropy": 1.7023487786451976, + "epoch": 1.814588997830326, + "grad_norm": 0.6457291841506958, + "learning_rate": 2.4032452641006546e-06, + "loss": 1.4227, + "mean_token_accuracy": 0.6392653485139211, + "num_tokens": 2770997593.0, + "step": 16518 + }, + { + "entropy": 1.7675903141498566, + "epoch": 1.814698854741699, + "grad_norm": 0.7356979250907898, + "learning_rate": 2.4027715023429173e-06, + "loss": 1.3966, + "mean_token_accuracy": 0.6494138091802597, + "num_tokens": 2771136387.0, + "step": 16519 + }, + { + "entropy": 1.6030404170354207, + "epoch": 1.8148087116530718, + "grad_norm": 0.5834929347038269, + "learning_rate": 2.4022980126835897e-06, + "loss": 1.4752, + "mean_token_accuracy": 0.6513334512710571, + "num_tokens": 2771362647.0, + "step": 16520 + }, + { + "entropy": 1.722896158695221, + "epoch": 1.814918568564445, + "grad_norm": 0.7146098613739014, + "learning_rate": 2.4018247951376546e-06, + "loss": 1.595, + "mean_token_accuracy": 0.6407341261704763, + "num_tokens": 2771551945.0, + "step": 16521 + }, + { + "entropy": 1.7077131768067677, + "epoch": 1.8150284254758178, + "grad_norm": 0.6865191459655762, + "learning_rate": 2.401351849720091e-06, + "loss": 1.4025, + "mean_token_accuracy": 0.656955232222875, + "num_tokens": 2771751606.0, + "step": 16522 + }, + { + "entropy": 1.6773952742417653, + "epoch": 1.8151382823871907, + "grad_norm": 0.5679813027381897, + "learning_rate": 2.4008791764458667e-06, + "loss": 1.4841, + "mean_token_accuracy": 0.6524718155463537, + "num_tokens": 2771984622.0, + "step": 16523 + }, + { + "entropy": 1.7627310752868652, + "epoch": 1.8152481392985638, + "grad_norm": 0.621216893196106, + "learning_rate": 2.4004067753299414e-06, + "loss": 1.5703, + "mean_token_accuracy": 0.6433713287115097, + "num_tokens": 2772250906.0, + "step": 16524 + }, + { + "entropy": 1.6635705729325612, + "epoch": 1.8153579962099364, + "grad_norm": 0.661649763584137, + "learning_rate": 2.399934646387266e-06, + "loss": 1.3721, + "mean_token_accuracy": 0.6734176874160767, + "num_tokens": 2772389665.0, + "step": 16525 + }, + { + "entropy": 1.7503486176331837, + "epoch": 1.8154678531213095, + "grad_norm": 0.772406816482544, + "learning_rate": 2.3994627896327832e-06, + "loss": 1.4636, + "mean_token_accuracy": 0.6583824306726456, + "num_tokens": 2772543581.0, + "step": 16526 + }, + { + "entropy": 1.6406415303548176, + "epoch": 1.8155777100326824, + "grad_norm": 0.716643214225769, + "learning_rate": 2.39899120508143e-06, + "loss": 1.3905, + "mean_token_accuracy": 0.6612937748432159, + "num_tokens": 2772717311.0, + "step": 16527 + }, + { + "entropy": 1.6522994637489319, + "epoch": 1.8156875669440553, + "grad_norm": 0.7278351187705994, + "learning_rate": 2.398519892748128e-06, + "loss": 1.3473, + "mean_token_accuracy": 0.6766088207562765, + "num_tokens": 2772938768.0, + "step": 16528 + }, + { + "entropy": 1.779366006453832, + "epoch": 1.8157974238554284, + "grad_norm": 0.7547992467880249, + "learning_rate": 2.398048852647795e-06, + "loss": 1.2914, + "mean_token_accuracy": 0.6645817359288534, + "num_tokens": 2773034087.0, + "step": 16529 + }, + { + "entropy": 1.6814574499924977, + "epoch": 1.815907280766801, + "grad_norm": 0.7637436389923096, + "learning_rate": 2.3975780847953413e-06, + "loss": 1.4152, + "mean_token_accuracy": 0.6605032732089361, + "num_tokens": 2773188999.0, + "step": 16530 + }, + { + "entropy": 1.690110484759013, + "epoch": 1.8160171376781742, + "grad_norm": 0.5965714454650879, + "learning_rate": 2.3971075892056628e-06, + "loss": 1.3626, + "mean_token_accuracy": 0.6655519704023997, + "num_tokens": 2773368162.0, + "step": 16531 + }, + { + "entropy": 1.6539724171161652, + "epoch": 1.816126994589547, + "grad_norm": 0.733353853225708, + "learning_rate": 2.3966373658936536e-06, + "loss": 1.4261, + "mean_token_accuracy": 0.6458848516146342, + "num_tokens": 2773558099.0, + "step": 16532 + }, + { + "entropy": 1.716547002394994, + "epoch": 1.81623685150092, + "grad_norm": 0.5876471400260925, + "learning_rate": 2.3961674148741954e-06, + "loss": 1.4986, + "mean_token_accuracy": 0.6417978306611379, + "num_tokens": 2773771033.0, + "step": 16533 + }, + { + "entropy": 1.6878297924995422, + "epoch": 1.816346708412293, + "grad_norm": 0.6864316463470459, + "learning_rate": 2.3956977361621607e-06, + "loss": 1.4623, + "mean_token_accuracy": 0.6522022038698196, + "num_tokens": 2773946616.0, + "step": 16534 + }, + { + "entropy": 1.714217483997345, + "epoch": 1.816456565323666, + "grad_norm": 0.7374356389045715, + "learning_rate": 2.3952283297724162e-06, + "loss": 1.4191, + "mean_token_accuracy": 0.6531722098588943, + "num_tokens": 2774101537.0, + "step": 16535 + }, + { + "entropy": 1.6750989357630413, + "epoch": 1.8165664222350388, + "grad_norm": 0.8244749307632446, + "learning_rate": 2.394759195719818e-06, + "loss": 1.3881, + "mean_token_accuracy": 0.6675261706113815, + "num_tokens": 2774258044.0, + "step": 16536 + }, + { + "entropy": 1.6771197319030762, + "epoch": 1.816676279146412, + "grad_norm": 0.755206823348999, + "learning_rate": 2.394290334019213e-06, + "loss": 1.3127, + "mean_token_accuracy": 0.6679815848668417, + "num_tokens": 2774407830.0, + "step": 16537 + }, + { + "entropy": 1.6540471911430359, + "epoch": 1.8167861360577846, + "grad_norm": 0.6771402359008789, + "learning_rate": 2.3938217446854393e-06, + "loss": 1.1949, + "mean_token_accuracy": 0.6918987234433492, + "num_tokens": 2774503263.0, + "step": 16538 + }, + { + "entropy": 1.7108920514583588, + "epoch": 1.8168959929691577, + "grad_norm": 0.7171925902366638, + "learning_rate": 2.3933534277333327e-06, + "loss": 1.3017, + "mean_token_accuracy": 0.6671940038601557, + "num_tokens": 2774658086.0, + "step": 16539 + }, + { + "entropy": 1.7624292373657227, + "epoch": 1.8170058498805306, + "grad_norm": 0.753374457359314, + "learning_rate": 2.392885383177711e-06, + "loss": 1.4478, + "mean_token_accuracy": 0.6500358184178671, + "num_tokens": 2774808233.0, + "step": 16540 + }, + { + "entropy": 1.7303147614002228, + "epoch": 1.8171157067919035, + "grad_norm": 0.6383606791496277, + "learning_rate": 2.3924176110333864e-06, + "loss": 1.353, + "mean_token_accuracy": 0.6555335720380148, + "num_tokens": 2775015569.0, + "step": 16541 + }, + { + "entropy": 1.7392794887224834, + "epoch": 1.8172255637032766, + "grad_norm": 0.7534666061401367, + "learning_rate": 2.391950111315167e-06, + "loss": 1.3783, + "mean_token_accuracy": 0.6698889136314392, + "num_tokens": 2775181058.0, + "step": 16542 + }, + { + "entropy": 1.630462755759557, + "epoch": 1.8173354206146493, + "grad_norm": 0.7775093913078308, + "learning_rate": 2.3914828840378476e-06, + "loss": 1.196, + "mean_token_accuracy": 0.6869658430417379, + "num_tokens": 2775292306.0, + "step": 16543 + }, + { + "entropy": 1.7384036084016163, + "epoch": 1.8174452775260224, + "grad_norm": 0.788651704788208, + "learning_rate": 2.3910159292162167e-06, + "loss": 1.3534, + "mean_token_accuracy": 0.6725411961476008, + "num_tokens": 2775447501.0, + "step": 16544 + }, + { + "entropy": 1.7322270274162292, + "epoch": 1.8175551344373952, + "grad_norm": 0.6624115109443665, + "learning_rate": 2.3905492468650527e-06, + "loss": 1.4113, + "mean_token_accuracy": 0.6496214121580124, + "num_tokens": 2775618693.0, + "step": 16545 + }, + { + "entropy": 1.750143031279246, + "epoch": 1.8176649913487681, + "grad_norm": 0.6496718525886536, + "learning_rate": 2.3900828369991234e-06, + "loss": 1.466, + "mean_token_accuracy": 0.6551011850436529, + "num_tokens": 2775786820.0, + "step": 16546 + }, + { + "entropy": 1.6444389820098877, + "epoch": 1.8177748482601412, + "grad_norm": 0.7461166977882385, + "learning_rate": 2.389616699633194e-06, + "loss": 1.4606, + "mean_token_accuracy": 0.6540708293517431, + "num_tokens": 2775955052.0, + "step": 16547 + }, + { + "entropy": 1.7287197808424632, + "epoch": 1.8178847051715141, + "grad_norm": 0.6682546138763428, + "learning_rate": 2.3891508347820165e-06, + "loss": 1.0821, + "mean_token_accuracy": 0.6904325783252716, + "num_tokens": 2776186575.0, + "step": 16548 + }, + { + "entropy": 1.7020801107088726, + "epoch": 1.817994562082887, + "grad_norm": 0.8539118766784668, + "learning_rate": 2.3886852424603333e-06, + "loss": 1.4569, + "mean_token_accuracy": 0.6489265362421671, + "num_tokens": 2776385187.0, + "step": 16549 + }, + { + "entropy": 1.7080492277940114, + "epoch": 1.8181044189942601, + "grad_norm": 0.7399299740791321, + "learning_rate": 2.388219922682883e-06, + "loss": 1.4506, + "mean_token_accuracy": 0.6532412966092428, + "num_tokens": 2776571303.0, + "step": 16550 + }, + { + "entropy": 1.7559833427270253, + "epoch": 1.8182142759056328, + "grad_norm": 0.727372407913208, + "learning_rate": 2.387754875464391e-06, + "loss": 1.4189, + "mean_token_accuracy": 0.6559430956840515, + "num_tokens": 2776713216.0, + "step": 16551 + }, + { + "entropy": 1.6203450560569763, + "epoch": 1.818324132817006, + "grad_norm": 0.6398701071739197, + "learning_rate": 2.3872901008195773e-06, + "loss": 1.2424, + "mean_token_accuracy": 0.6794936507940292, + "num_tokens": 2776872230.0, + "step": 16552 + }, + { + "entropy": 1.7338751256465912, + "epoch": 1.8184339897283788, + "grad_norm": 0.7848241925239563, + "learning_rate": 2.3868255987631505e-06, + "loss": 1.2348, + "mean_token_accuracy": 0.6750635951757431, + "num_tokens": 2776992799.0, + "step": 16553 + }, + { + "entropy": 1.7074306507905324, + "epoch": 1.8185438466397517, + "grad_norm": 0.7216284871101379, + "learning_rate": 2.386361369309812e-06, + "loss": 1.3222, + "mean_token_accuracy": 0.6633963038523992, + "num_tokens": 2777138226.0, + "step": 16554 + }, + { + "entropy": 1.6739212572574615, + "epoch": 1.8186537035511248, + "grad_norm": 0.7185259461402893, + "learning_rate": 2.385897412474255e-06, + "loss": 1.5609, + "mean_token_accuracy": 0.6289549271265665, + "num_tokens": 2777323423.0, + "step": 16555 + }, + { + "entropy": 1.7207949956258137, + "epoch": 1.8187635604624974, + "grad_norm": 0.7477259039878845, + "learning_rate": 2.385433728271164e-06, + "loss": 1.3418, + "mean_token_accuracy": 0.6546726375818253, + "num_tokens": 2777504447.0, + "step": 16556 + }, + { + "entropy": 1.6997572779655457, + "epoch": 1.8188734173738705, + "grad_norm": 0.7746348977088928, + "learning_rate": 2.3849703167152125e-06, + "loss": 1.3969, + "mean_token_accuracy": 0.6501687119404475, + "num_tokens": 2777669377.0, + "step": 16557 + }, + { + "entropy": 1.715737024943034, + "epoch": 1.8189832742852434, + "grad_norm": 0.8140842318534851, + "learning_rate": 2.3845071778210687e-06, + "loss": 1.5268, + "mean_token_accuracy": 0.6537976066271464, + "num_tokens": 2777818039.0, + "step": 16558 + }, + { + "entropy": 1.714678963025411, + "epoch": 1.8190931311966163, + "grad_norm": 0.6977283954620361, + "learning_rate": 2.3840443116033906e-06, + "loss": 1.5821, + "mean_token_accuracy": 0.6431048860152563, + "num_tokens": 2778031195.0, + "step": 16559 + }, + { + "entropy": 1.7471038500467937, + "epoch": 1.8192029881079894, + "grad_norm": 0.6290971040725708, + "learning_rate": 2.383581718076828e-06, + "loss": 1.4625, + "mean_token_accuracy": 0.6498029927412668, + "num_tokens": 2778184112.0, + "step": 16560 + }, + { + "entropy": 1.7384718358516693, + "epoch": 1.8193128450193623, + "grad_norm": 0.8048774600028992, + "learning_rate": 2.3831193972560204e-06, + "loss": 1.5574, + "mean_token_accuracy": 0.65819351375103, + "num_tokens": 2778325520.0, + "step": 16561 + }, + { + "entropy": 1.709036111831665, + "epoch": 1.8194227019307352, + "grad_norm": 0.6308175325393677, + "learning_rate": 2.382657349155602e-06, + "loss": 1.4355, + "mean_token_accuracy": 0.6599417279163996, + "num_tokens": 2778495891.0, + "step": 16562 + }, + { + "entropy": 1.725346823533376, + "epoch": 1.8195325588421083, + "grad_norm": 0.6439611315727234, + "learning_rate": 2.3821955737901942e-06, + "loss": 1.3866, + "mean_token_accuracy": 0.6503596703211466, + "num_tokens": 2778659002.0, + "step": 16563 + }, + { + "entropy": 1.6709474126497905, + "epoch": 1.819642415753481, + "grad_norm": 0.6519814729690552, + "learning_rate": 2.381734071174416e-06, + "loss": 1.4571, + "mean_token_accuracy": 0.642798125743866, + "num_tokens": 2778843826.0, + "step": 16564 + }, + { + "entropy": 1.6215501725673676, + "epoch": 1.819752272664854, + "grad_norm": 0.655518114566803, + "learning_rate": 2.381272841322869e-06, + "loss": 1.2602, + "mean_token_accuracy": 0.675087700287501, + "num_tokens": 2778996584.0, + "step": 16565 + }, + { + "entropy": 1.7112641334533691, + "epoch": 1.819862129576227, + "grad_norm": 0.653723418712616, + "learning_rate": 2.380811884250152e-06, + "loss": 1.3754, + "mean_token_accuracy": 0.6574215839306513, + "num_tokens": 2779187702.0, + "step": 16566 + }, + { + "entropy": 1.6549660762151082, + "epoch": 1.8199719864875998, + "grad_norm": 0.7489318251609802, + "learning_rate": 2.3803511999708554e-06, + "loss": 1.3494, + "mean_token_accuracy": 0.6677233328421911, + "num_tokens": 2779388320.0, + "step": 16567 + }, + { + "entropy": 1.6793318192164104, + "epoch": 1.820081843398973, + "grad_norm": 0.6962845921516418, + "learning_rate": 2.3798907884995617e-06, + "loss": 1.3043, + "mean_token_accuracy": 0.6692550530036291, + "num_tokens": 2779514019.0, + "step": 16568 + }, + { + "entropy": 1.6745788753032684, + "epoch": 1.8201917003103458, + "grad_norm": 0.6880436539649963, + "learning_rate": 2.379430649850837e-06, + "loss": 1.3641, + "mean_token_accuracy": 0.6582282483577728, + "num_tokens": 2779675915.0, + "step": 16569 + }, + { + "entropy": 1.7116785844167073, + "epoch": 1.8203015572217187, + "grad_norm": 0.8645946383476257, + "learning_rate": 2.3789707840392484e-06, + "loss": 1.3819, + "mean_token_accuracy": 0.6639973024527231, + "num_tokens": 2779823266.0, + "step": 16570 + }, + { + "entropy": 1.6840636630853016, + "epoch": 1.8204114141330916, + "grad_norm": 0.6108362078666687, + "learning_rate": 2.378511191079351e-06, + "loss": 1.4026, + "mean_token_accuracy": 0.6641562829415003, + "num_tokens": 2779985266.0, + "step": 16571 + }, + { + "entropy": 1.7049194872379303, + "epoch": 1.8205212710444645, + "grad_norm": 0.7385087013244629, + "learning_rate": 2.378051870985689e-06, + "loss": 1.4367, + "mean_token_accuracy": 0.650545577208201, + "num_tokens": 2780144229.0, + "step": 16572 + }, + { + "entropy": 1.614651362101237, + "epoch": 1.8206311279558376, + "grad_norm": 0.6099897027015686, + "learning_rate": 2.3775928237727996e-06, + "loss": 1.3483, + "mean_token_accuracy": 0.6629302948713303, + "num_tokens": 2780302810.0, + "step": 16573 + }, + { + "entropy": 1.6483580370744069, + "epoch": 1.8207409848672105, + "grad_norm": 0.7105104327201843, + "learning_rate": 2.377134049455213e-06, + "loss": 1.3006, + "mean_token_accuracy": 0.664797286192576, + "num_tokens": 2780474250.0, + "step": 16574 + }, + { + "entropy": 1.7492181360721588, + "epoch": 1.8208508417785834, + "grad_norm": 0.6788060665130615, + "learning_rate": 2.3766755480474464e-06, + "loss": 1.4855, + "mean_token_accuracy": 0.6429063032070795, + "num_tokens": 2780669549.0, + "step": 16575 + }, + { + "entropy": 1.6822342773278554, + "epoch": 1.8209606986899565, + "grad_norm": 0.636565089225769, + "learning_rate": 2.3762173195640147e-06, + "loss": 1.4087, + "mean_token_accuracy": 0.6564022650321325, + "num_tokens": 2780853162.0, + "step": 16576 + }, + { + "entropy": 1.772486279408137, + "epoch": 1.8210705556013291, + "grad_norm": 0.6843612194061279, + "learning_rate": 2.375759364019419e-06, + "loss": 1.4175, + "mean_token_accuracy": 0.6469480246305466, + "num_tokens": 2781056685.0, + "step": 16577 + }, + { + "entropy": 1.7506540218989055, + "epoch": 1.8211804125127022, + "grad_norm": 0.727794885635376, + "learning_rate": 2.3753016814281514e-06, + "loss": 1.435, + "mean_token_accuracy": 0.6630039562781652, + "num_tokens": 2781207737.0, + "step": 16578 + }, + { + "entropy": 1.7064573367436726, + "epoch": 1.8212902694240751, + "grad_norm": 0.7168003916740417, + "learning_rate": 2.374844271804701e-06, + "loss": 1.2831, + "mean_token_accuracy": 0.6836849649747213, + "num_tokens": 2781321392.0, + "step": 16579 + }, + { + "entropy": 1.7291381855805714, + "epoch": 1.821400126335448, + "grad_norm": 0.6939396858215332, + "learning_rate": 2.3743871351635427e-06, + "loss": 1.4408, + "mean_token_accuracy": 0.6493770778179169, + "num_tokens": 2781476898.0, + "step": 16580 + }, + { + "entropy": 1.7322145501772563, + "epoch": 1.8215099832468211, + "grad_norm": 0.6548502445220947, + "learning_rate": 2.373930271519143e-06, + "loss": 1.4634, + "mean_token_accuracy": 0.6514165798823038, + "num_tokens": 2781644750.0, + "step": 16581 + }, + { + "entropy": 1.7112524112065632, + "epoch": 1.821619840158194, + "grad_norm": 0.5996736884117126, + "learning_rate": 2.373473680885964e-06, + "loss": 1.364, + "mean_token_accuracy": 0.6540916860103607, + "num_tokens": 2781826533.0, + "step": 16582 + }, + { + "entropy": 1.7321696877479553, + "epoch": 1.8217296970695669, + "grad_norm": 0.7605938911437988, + "learning_rate": 2.373017363278457e-06, + "loss": 1.2532, + "mean_token_accuracy": 0.676396057009697, + "num_tokens": 2781947706.0, + "step": 16583 + }, + { + "entropy": 1.683882822593053, + "epoch": 1.8218395539809398, + "grad_norm": 0.8438287377357483, + "learning_rate": 2.3725613187110626e-06, + "loss": 1.3207, + "mean_token_accuracy": 0.676286518573761, + "num_tokens": 2782070912.0, + "step": 16584 + }, + { + "entropy": 1.6728091140588124, + "epoch": 1.8219494108923127, + "grad_norm": 0.7430797219276428, + "learning_rate": 2.3721055471982138e-06, + "loss": 1.2725, + "mean_token_accuracy": 0.6717578570048014, + "num_tokens": 2782206255.0, + "step": 16585 + }, + { + "entropy": 1.7225382626056671, + "epoch": 1.8220592678036858, + "grad_norm": 0.6596887707710266, + "learning_rate": 2.3716500487543376e-06, + "loss": 1.4749, + "mean_token_accuracy": 0.6395098119974136, + "num_tokens": 2782423591.0, + "step": 16586 + }, + { + "entropy": 1.7209839125474293, + "epoch": 1.8221691247150587, + "grad_norm": 0.7370496988296509, + "learning_rate": 2.3711948233938485e-06, + "loss": 1.451, + "mean_token_accuracy": 0.6665770759185156, + "num_tokens": 2782558304.0, + "step": 16587 + }, + { + "entropy": 1.6876719395319622, + "epoch": 1.8222789816264315, + "grad_norm": 0.7385010719299316, + "learning_rate": 2.3707398711311553e-06, + "loss": 1.2099, + "mean_token_accuracy": 0.678598885734876, + "num_tokens": 2782664534.0, + "step": 16588 + }, + { + "entropy": 1.7227604786554973, + "epoch": 1.8223888385378046, + "grad_norm": 0.7744255065917969, + "learning_rate": 2.3702851919806576e-06, + "loss": 1.3421, + "mean_token_accuracy": 0.6641747852166494, + "num_tokens": 2782797638.0, + "step": 16589 + }, + { + "entropy": 1.675682693719864, + "epoch": 1.8224986954491773, + "grad_norm": 0.6251360177993774, + "learning_rate": 2.369830785956744e-06, + "loss": 1.2506, + "mean_token_accuracy": 0.6763549745082855, + "num_tokens": 2782911180.0, + "step": 16590 + }, + { + "entropy": 1.769344409306844, + "epoch": 1.8226085523605504, + "grad_norm": 0.7733559012413025, + "learning_rate": 2.3693766530737978e-06, + "loss": 1.5038, + "mean_token_accuracy": 0.648768370350202, + "num_tokens": 2783094728.0, + "step": 16591 + }, + { + "entropy": 1.7493232587973278, + "epoch": 1.8227184092719233, + "grad_norm": 0.711170494556427, + "learning_rate": 2.3689227933461916e-06, + "loss": 1.3211, + "mean_token_accuracy": 0.6690777093172073, + "num_tokens": 2783258611.0, + "step": 16592 + }, + { + "entropy": 1.681398739417394, + "epoch": 1.8228282661832962, + "grad_norm": 0.7355263829231262, + "learning_rate": 2.368469206788289e-06, + "loss": 1.2677, + "mean_token_accuracy": 0.6758220344781876, + "num_tokens": 2783430258.0, + "step": 16593 + }, + { + "entropy": 1.6628807882467906, + "epoch": 1.8229381230946693, + "grad_norm": 0.7200619578361511, + "learning_rate": 2.3680158934144456e-06, + "loss": 1.5192, + "mean_token_accuracy": 0.6433456887801489, + "num_tokens": 2783632093.0, + "step": 16594 + }, + { + "entropy": 1.701430231332779, + "epoch": 1.8230479800060422, + "grad_norm": 0.677447497844696, + "learning_rate": 2.3675628532390113e-06, + "loss": 1.4706, + "mean_token_accuracy": 0.6618293623129526, + "num_tokens": 2783774737.0, + "step": 16595 + }, + { + "entropy": 1.6659322182337444, + "epoch": 1.823157836917415, + "grad_norm": 0.7491664886474609, + "learning_rate": 2.3671100862763226e-06, + "loss": 1.3961, + "mean_token_accuracy": 0.6457837373018265, + "num_tokens": 2783955761.0, + "step": 16596 + }, + { + "entropy": 1.6904160976409912, + "epoch": 1.823267693828788, + "grad_norm": 0.6810483932495117, + "learning_rate": 2.3666575925407086e-06, + "loss": 1.4507, + "mean_token_accuracy": 0.6556557367245356, + "num_tokens": 2784112844.0, + "step": 16597 + }, + { + "entropy": 1.7386436462402344, + "epoch": 1.8233775507401608, + "grad_norm": 0.8389174342155457, + "learning_rate": 2.3662053720464927e-06, + "loss": 1.4558, + "mean_token_accuracy": 0.6538802261153857, + "num_tokens": 2784240847.0, + "step": 16598 + }, + { + "entropy": 1.6878819664319356, + "epoch": 1.823487407651534, + "grad_norm": 0.7181552648544312, + "learning_rate": 2.3657534248079855e-06, + "loss": 1.3477, + "mean_token_accuracy": 0.6808636685212454, + "num_tokens": 2784351960.0, + "step": 16599 + }, + { + "entropy": 1.6631807684898376, + "epoch": 1.8235972645629068, + "grad_norm": 0.6783377528190613, + "learning_rate": 2.3653017508394916e-06, + "loss": 1.3382, + "mean_token_accuracy": 0.6660791685183843, + "num_tokens": 2784499447.0, + "step": 16600 + }, + { + "entropy": 1.6464990079402924, + "epoch": 1.8237071214742797, + "grad_norm": 0.7220808267593384, + "learning_rate": 2.3648503501553083e-06, + "loss": 1.4082, + "mean_token_accuracy": 0.6685073425372442, + "num_tokens": 2784662470.0, + "step": 16601 + }, + { + "entropy": 1.718644032875697, + "epoch": 1.8238169783856528, + "grad_norm": 0.6363226175308228, + "learning_rate": 2.3643992227697176e-06, + "loss": 1.4001, + "mean_token_accuracy": 0.6684810817241669, + "num_tokens": 2784824331.0, + "step": 16602 + }, + { + "entropy": 1.6482765078544617, + "epoch": 1.8239268352970255, + "grad_norm": 0.5895575284957886, + "learning_rate": 2.363948368697002e-06, + "loss": 1.4568, + "mean_token_accuracy": 0.6472050746281942, + "num_tokens": 2785067484.0, + "step": 16603 + }, + { + "entropy": 1.6042577922344208, + "epoch": 1.8240366922083986, + "grad_norm": 0.662174642086029, + "learning_rate": 2.363497787951428e-06, + "loss": 1.3404, + "mean_token_accuracy": 0.6540089795986811, + "num_tokens": 2785336972.0, + "step": 16604 + }, + { + "entropy": 1.659020572900772, + "epoch": 1.8241465491197715, + "grad_norm": 0.6183151006698608, + "learning_rate": 2.363047480547258e-06, + "loss": 1.3836, + "mean_token_accuracy": 0.666933129231135, + "num_tokens": 2785508777.0, + "step": 16605 + }, + { + "entropy": 1.6878956854343414, + "epoch": 1.8242564060311444, + "grad_norm": 0.7308575510978699, + "learning_rate": 2.362597446498742e-06, + "loss": 1.1711, + "mean_token_accuracy": 0.690845270951589, + "num_tokens": 2785628333.0, + "step": 16606 + }, + { + "entropy": 1.7506561974684398, + "epoch": 1.8243662629425175, + "grad_norm": 0.7305101752281189, + "learning_rate": 2.362147685820126e-06, + "loss": 1.413, + "mean_token_accuracy": 0.6451665014028549, + "num_tokens": 2785756364.0, + "step": 16607 + }, + { + "entropy": 1.7476352254549663, + "epoch": 1.8244761198538904, + "grad_norm": 0.6824808716773987, + "learning_rate": 2.361698198525644e-06, + "loss": 1.4284, + "mean_token_accuracy": 0.6470771382252375, + "num_tokens": 2785922197.0, + "step": 16608 + }, + { + "entropy": 1.734663297732671, + "epoch": 1.8245859767652632, + "grad_norm": 0.7013296484947205, + "learning_rate": 2.36124898462952e-06, + "loss": 1.3782, + "mean_token_accuracy": 0.6541523436705271, + "num_tokens": 2786105100.0, + "step": 16609 + }, + { + "entropy": 1.7278130650520325, + "epoch": 1.8246958336766363, + "grad_norm": 0.8316255211830139, + "learning_rate": 2.3608000441459748e-06, + "loss": 1.382, + "mean_token_accuracy": 0.6508718381325403, + "num_tokens": 2786257010.0, + "step": 16610 + }, + { + "entropy": 1.699917882680893, + "epoch": 1.824805690588009, + "grad_norm": 0.8237127661705017, + "learning_rate": 2.3603513770892125e-06, + "loss": 1.3808, + "mean_token_accuracy": 0.6678240597248077, + "num_tokens": 2786411966.0, + "step": 16611 + }, + { + "entropy": 1.7327560484409332, + "epoch": 1.8249155474993821, + "grad_norm": 0.8676771521568298, + "learning_rate": 2.3599029834734393e-06, + "loss": 1.4715, + "mean_token_accuracy": 0.6531338741381963, + "num_tokens": 2786547040.0, + "step": 16612 + }, + { + "entropy": 1.7098338504632313, + "epoch": 1.825025404410755, + "grad_norm": 0.714078962802887, + "learning_rate": 2.3594548633128413e-06, + "loss": 1.3284, + "mean_token_accuracy": 0.6696013609568278, + "num_tokens": 2786756645.0, + "step": 16613 + }, + { + "entropy": 1.6878060698509216, + "epoch": 1.8251352613221279, + "grad_norm": 0.6905450820922852, + "learning_rate": 2.359007016621603e-06, + "loss": 1.4931, + "mean_token_accuracy": 0.6502345601717631, + "num_tokens": 2786951298.0, + "step": 16614 + }, + { + "entropy": 1.702516903479894, + "epoch": 1.825245118233501, + "grad_norm": 0.7101051807403564, + "learning_rate": 2.3585594434139002e-06, + "loss": 1.4063, + "mean_token_accuracy": 0.6508783797423044, + "num_tokens": 2787110816.0, + "step": 16615 + }, + { + "entropy": 1.7455834746360779, + "epoch": 1.8253549751448737, + "grad_norm": 0.6841264963150024, + "learning_rate": 2.3581121437038975e-06, + "loss": 1.2673, + "mean_token_accuracy": 0.6685113509496053, + "num_tokens": 2787231743.0, + "step": 16616 + }, + { + "entropy": 1.7308682600657146, + "epoch": 1.8254648320562468, + "grad_norm": 0.6765478253364563, + "learning_rate": 2.3576651175057493e-06, + "loss": 1.3843, + "mean_token_accuracy": 0.6586334705352783, + "num_tokens": 2787401301.0, + "step": 16617 + }, + { + "entropy": 1.6286826531092327, + "epoch": 1.8255746889676197, + "grad_norm": 0.7048670053482056, + "learning_rate": 2.3572183648336072e-06, + "loss": 1.2665, + "mean_token_accuracy": 0.6805399060249329, + "num_tokens": 2787572843.0, + "step": 16618 + }, + { + "entropy": 1.6946585575739543, + "epoch": 1.8256845458789925, + "grad_norm": 0.6286748647689819, + "learning_rate": 2.3567718857016084e-06, + "loss": 1.372, + "mean_token_accuracy": 0.6510303070147833, + "num_tokens": 2787763549.0, + "step": 16619 + }, + { + "entropy": 1.7653050124645233, + "epoch": 1.8257944027903656, + "grad_norm": 0.7640430331230164, + "learning_rate": 2.3563256801238855e-06, + "loss": 1.471, + "mean_token_accuracy": 0.6584224353233973, + "num_tokens": 2787893159.0, + "step": 16620 + }, + { + "entropy": 1.626900275548299, + "epoch": 1.8259042597017385, + "grad_norm": 0.7232459187507629, + "learning_rate": 2.35587974811456e-06, + "loss": 1.4189, + "mean_token_accuracy": 0.6637706806262335, + "num_tokens": 2788076267.0, + "step": 16621 + }, + { + "entropy": 1.6736698547999065, + "epoch": 1.8260141166131114, + "grad_norm": 0.7517544627189636, + "learning_rate": 2.3554340896877453e-06, + "loss": 1.3742, + "mean_token_accuracy": 0.6552935838699341, + "num_tokens": 2788232255.0, + "step": 16622 + }, + { + "entropy": 1.698614478111267, + "epoch": 1.8261239735244845, + "grad_norm": 0.7546419501304626, + "learning_rate": 2.3549887048575446e-06, + "loss": 1.4902, + "mean_token_accuracy": 0.6500131438175837, + "num_tokens": 2788431091.0, + "step": 16623 + }, + { + "entropy": 1.6913301448027294, + "epoch": 1.8262338304358572, + "grad_norm": 0.7140382528305054, + "learning_rate": 2.354543593638059e-06, + "loss": 1.3081, + "mean_token_accuracy": 0.668897733092308, + "num_tokens": 2788587712.0, + "step": 16624 + }, + { + "entropy": 1.728107343117396, + "epoch": 1.8263436873472303, + "grad_norm": 0.6673551797866821, + "learning_rate": 2.3540987560433704e-06, + "loss": 1.4086, + "mean_token_accuracy": 0.6558303982019424, + "num_tokens": 2788760375.0, + "step": 16625 + }, + { + "entropy": 1.6470238665739696, + "epoch": 1.8264535442586032, + "grad_norm": 0.6046478152275085, + "learning_rate": 2.353654192087561e-06, + "loss": 1.3309, + "mean_token_accuracy": 0.6752079874277115, + "num_tokens": 2788954047.0, + "step": 16626 + }, + { + "entropy": 1.6849171618620555, + "epoch": 1.826563401169976, + "grad_norm": 0.762234628200531, + "learning_rate": 2.3532099017847002e-06, + "loss": 1.4178, + "mean_token_accuracy": 0.6630821377038956, + "num_tokens": 2789138869.0, + "step": 16627 + }, + { + "entropy": 1.7250126202901204, + "epoch": 1.8266732580813492, + "grad_norm": 0.6825308799743652, + "learning_rate": 2.3527658851488503e-06, + "loss": 1.3463, + "mean_token_accuracy": 0.6651128977537155, + "num_tokens": 2789289194.0, + "step": 16628 + }, + { + "entropy": 1.6536945700645447, + "epoch": 1.8267831149927218, + "grad_norm": 0.6264491081237793, + "learning_rate": 2.3523221421940624e-06, + "loss": 1.3165, + "mean_token_accuracy": 0.6712521612644196, + "num_tokens": 2789433752.0, + "step": 16629 + }, + { + "entropy": 1.6571077704429626, + "epoch": 1.826892971904095, + "grad_norm": 0.6818052530288696, + "learning_rate": 2.351878672934383e-06, + "loss": 1.5239, + "mean_token_accuracy": 0.6425358305374781, + "num_tokens": 2789598739.0, + "step": 16630 + }, + { + "entropy": 1.7497955461343129, + "epoch": 1.8270028288154678, + "grad_norm": 0.7354583144187927, + "learning_rate": 2.351435477383846e-06, + "loss": 1.5477, + "mean_token_accuracy": 0.6368038604656855, + "num_tokens": 2789772847.0, + "step": 16631 + }, + { + "entropy": 1.736647496620814, + "epoch": 1.8271126857268407, + "grad_norm": 0.8048020601272583, + "learning_rate": 2.35099255555648e-06, + "loss": 1.5508, + "mean_token_accuracy": 0.6268570274114609, + "num_tokens": 2789991318.0, + "step": 16632 + }, + { + "entropy": 1.6801136036713917, + "epoch": 1.8272225426382138, + "grad_norm": 0.6898782253265381, + "learning_rate": 2.350549907466302e-06, + "loss": 1.3481, + "mean_token_accuracy": 0.66416896879673, + "num_tokens": 2790159923.0, + "step": 16633 + }, + { + "entropy": 1.7110735873381298, + "epoch": 1.8273323995495867, + "grad_norm": 0.8061655163764954, + "learning_rate": 2.3501075331273208e-06, + "loss": 1.4615, + "mean_token_accuracy": 0.6463738034168879, + "num_tokens": 2790314777.0, + "step": 16634 + }, + { + "entropy": 1.7187353670597076, + "epoch": 1.8274422564609596, + "grad_norm": 0.625403642654419, + "learning_rate": 2.349665432553538e-06, + "loss": 1.2924, + "mean_token_accuracy": 0.6705781618754069, + "num_tokens": 2790451689.0, + "step": 16635 + }, + { + "entropy": 1.7233947416146596, + "epoch": 1.8275521133723327, + "grad_norm": 0.5538727045059204, + "learning_rate": 2.3492236057589494e-06, + "loss": 1.5311, + "mean_token_accuracy": 0.623880739013354, + "num_tokens": 2790690085.0, + "step": 16636 + }, + { + "entropy": 1.734166105588277, + "epoch": 1.8276619702837054, + "grad_norm": 0.7274359464645386, + "learning_rate": 2.348782052757533e-06, + "loss": 1.5441, + "mean_token_accuracy": 0.6415307223796844, + "num_tokens": 2790855735.0, + "step": 16637 + }, + { + "entropy": 1.7256428599357605, + "epoch": 1.8277718271950785, + "grad_norm": 0.6457618474960327, + "learning_rate": 2.3483407735632668e-06, + "loss": 1.4386, + "mean_token_accuracy": 0.6566118150949478, + "num_tokens": 2791039866.0, + "step": 16638 + }, + { + "entropy": 1.7583041091759999, + "epoch": 1.8278816841064514, + "grad_norm": 0.7741835713386536, + "learning_rate": 2.347899768190117e-06, + "loss": 1.4442, + "mean_token_accuracy": 0.660509412487348, + "num_tokens": 2791221988.0, + "step": 16639 + }, + { + "entropy": 1.7275878588358562, + "epoch": 1.8279915410178242, + "grad_norm": 0.7615863084793091, + "learning_rate": 2.34745903665204e-06, + "loss": 1.3726, + "mean_token_accuracy": 0.6682560543219248, + "num_tokens": 2791351713.0, + "step": 16640 + }, + { + "entropy": 1.6993821263313293, + "epoch": 1.8281013979291973, + "grad_norm": 0.6623696684837341, + "learning_rate": 2.3470185789629854e-06, + "loss": 1.4305, + "mean_token_accuracy": 0.6371948470671972, + "num_tokens": 2791551404.0, + "step": 16641 + }, + { + "entropy": 1.7266852855682373, + "epoch": 1.82821125484057, + "grad_norm": 0.6855489015579224, + "learning_rate": 2.3465783951368955e-06, + "loss": 1.2734, + "mean_token_accuracy": 0.6709648966789246, + "num_tokens": 2791694899.0, + "step": 16642 + }, + { + "entropy": 1.7204951246579487, + "epoch": 1.8283211117519431, + "grad_norm": 0.6789599657058716, + "learning_rate": 2.3461384851876983e-06, + "loss": 1.6562, + "mean_token_accuracy": 0.6272246465086937, + "num_tokens": 2791866300.0, + "step": 16643 + }, + { + "entropy": 1.7294853528340657, + "epoch": 1.828430968663316, + "grad_norm": 0.6738252639770508, + "learning_rate": 2.3456988491293193e-06, + "loss": 1.5462, + "mean_token_accuracy": 0.6291048725446066, + "num_tokens": 2792090459.0, + "step": 16644 + }, + { + "entropy": 1.7207268675168355, + "epoch": 1.8285408255746889, + "grad_norm": 0.7021991610527039, + "learning_rate": 2.345259486975672e-06, + "loss": 1.2478, + "mean_token_accuracy": 0.6744669079780579, + "num_tokens": 2792232045.0, + "step": 16645 + }, + { + "entropy": 1.7015381852785747, + "epoch": 1.828650682486062, + "grad_norm": 2.8002796173095703, + "learning_rate": 2.3448203987406613e-06, + "loss": 1.0468, + "mean_token_accuracy": 0.6926949769258499, + "num_tokens": 2792370336.0, + "step": 16646 + }, + { + "entropy": 1.718252569437027, + "epoch": 1.8287605393974349, + "grad_norm": 0.7113930583000183, + "learning_rate": 2.3443815844381846e-06, + "loss": 1.3618, + "mean_token_accuracy": 0.6545801758766174, + "num_tokens": 2792522904.0, + "step": 16647 + }, + { + "entropy": 1.6835759778817494, + "epoch": 1.8288703963088078, + "grad_norm": 0.7733089327812195, + "learning_rate": 2.3439430440821325e-06, + "loss": 1.5894, + "mean_token_accuracy": 0.6576187337438265, + "num_tokens": 2792680483.0, + "step": 16648 + }, + { + "entropy": 1.7124978800614674, + "epoch": 1.8289802532201809, + "grad_norm": 0.7034731507301331, + "learning_rate": 2.343504777686381e-06, + "loss": 1.3342, + "mean_token_accuracy": 0.6707392732302347, + "num_tokens": 2792825740.0, + "step": 16649 + }, + { + "entropy": 1.7099045515060425, + "epoch": 1.8290901101315535, + "grad_norm": 0.7132044434547424, + "learning_rate": 2.3430667852648026e-06, + "loss": 1.3485, + "mean_token_accuracy": 0.6722139616807302, + "num_tokens": 2792955789.0, + "step": 16650 + }, + { + "entropy": 1.679776022831599, + "epoch": 1.8291999670429266, + "grad_norm": 0.6770622730255127, + "learning_rate": 2.3426290668312595e-06, + "loss": 1.3703, + "mean_token_accuracy": 0.6551995724439621, + "num_tokens": 2793117672.0, + "step": 16651 + }, + { + "entropy": 1.6446966528892517, + "epoch": 1.8293098239542995, + "grad_norm": 0.6462422609329224, + "learning_rate": 2.3421916223996065e-06, + "loss": 1.3236, + "mean_token_accuracy": 0.6686488538980484, + "num_tokens": 2793272672.0, + "step": 16652 + }, + { + "entropy": 1.6652612388134003, + "epoch": 1.8294196808656724, + "grad_norm": 0.721100926399231, + "learning_rate": 2.341754451983686e-06, + "loss": 1.4143, + "mean_token_accuracy": 0.6578306208054224, + "num_tokens": 2793437356.0, + "step": 16653 + }, + { + "entropy": 1.728610356648763, + "epoch": 1.8295295377770455, + "grad_norm": 0.755320131778717, + "learning_rate": 2.341317555597336e-06, + "loss": 1.3919, + "mean_token_accuracy": 0.6722359557946523, + "num_tokens": 2793560530.0, + "step": 16654 + }, + { + "entropy": 1.7180915176868439, + "epoch": 1.8296393946884182, + "grad_norm": 0.8217064142227173, + "learning_rate": 2.340880933254383e-06, + "loss": 1.4459, + "mean_token_accuracy": 0.6533014078934988, + "num_tokens": 2793736230.0, + "step": 16655 + }, + { + "entropy": 1.7117689450581868, + "epoch": 1.8297492515997913, + "grad_norm": 0.6694772839546204, + "learning_rate": 2.340444584968648e-06, + "loss": 1.4054, + "mean_token_accuracy": 0.6524570882320404, + "num_tokens": 2793904553.0, + "step": 16656 + }, + { + "entropy": 1.6685727834701538, + "epoch": 1.8298591085111642, + "grad_norm": 0.6724652051925659, + "learning_rate": 2.34000851075394e-06, + "loss": 1.4027, + "mean_token_accuracy": 0.6603127866983414, + "num_tokens": 2794065707.0, + "step": 16657 + }, + { + "entropy": 1.6848424673080444, + "epoch": 1.829968965422537, + "grad_norm": 0.6383946537971497, + "learning_rate": 2.339572710624059e-06, + "loss": 1.3201, + "mean_token_accuracy": 0.6705543498198191, + "num_tokens": 2794199367.0, + "step": 16658 + }, + { + "entropy": 1.7302991648515065, + "epoch": 1.8300788223339102, + "grad_norm": 0.823840320110321, + "learning_rate": 2.3391371845928e-06, + "loss": 1.4138, + "mean_token_accuracy": 0.6613588233788809, + "num_tokens": 2794363556.0, + "step": 16659 + }, + { + "entropy": 1.740784724553426, + "epoch": 1.830188679245283, + "grad_norm": 0.7151713967323303, + "learning_rate": 2.3387019326739455e-06, + "loss": 1.3664, + "mean_token_accuracy": 0.6722310036420822, + "num_tokens": 2794476140.0, + "step": 16660 + }, + { + "entropy": 1.7195179959138234, + "epoch": 1.830298536156656, + "grad_norm": 0.7539914846420288, + "learning_rate": 2.338266954881273e-06, + "loss": 1.5308, + "mean_token_accuracy": 0.6465074469645818, + "num_tokens": 2794668990.0, + "step": 16661 + }, + { + "entropy": 1.7197861671447754, + "epoch": 1.830408393068029, + "grad_norm": 0.8375680446624756, + "learning_rate": 2.337832251228547e-06, + "loss": 1.5809, + "mean_token_accuracy": 0.6555479913949966, + "num_tokens": 2794844610.0, + "step": 16662 + }, + { + "entropy": 1.7217328945795696, + "epoch": 1.8305182499794017, + "grad_norm": 0.7076695561408997, + "learning_rate": 2.3373978217295286e-06, + "loss": 1.3478, + "mean_token_accuracy": 0.659163624048233, + "num_tokens": 2795012428.0, + "step": 16663 + }, + { + "entropy": 1.7476484874884288, + "epoch": 1.8306281068907748, + "grad_norm": 0.6806117296218872, + "learning_rate": 2.336963666397965e-06, + "loss": 1.6084, + "mean_token_accuracy": 0.6421725749969482, + "num_tokens": 2795174407.0, + "step": 16664 + }, + { + "entropy": 1.6913608014583588, + "epoch": 1.8307379638021477, + "grad_norm": 0.7132964134216309, + "learning_rate": 2.336529785247597e-06, + "loss": 1.4221, + "mean_token_accuracy": 0.6525693833827972, + "num_tokens": 2795305317.0, + "step": 16665 + }, + { + "entropy": 1.703669399023056, + "epoch": 1.8308478207135206, + "grad_norm": 0.7673993706703186, + "learning_rate": 2.336096178292159e-06, + "loss": 1.3788, + "mean_token_accuracy": 0.6568796038627625, + "num_tokens": 2795427951.0, + "step": 16666 + }, + { + "entropy": 1.6177269021670024, + "epoch": 1.8309576776248937, + "grad_norm": 0.6640709638595581, + "learning_rate": 2.3356628455453704e-06, + "loss": 1.2852, + "mean_token_accuracy": 0.6807574729124705, + "num_tokens": 2795558559.0, + "step": 16667 + }, + { + "entropy": 1.719151347875595, + "epoch": 1.8310675345362664, + "grad_norm": 0.7354775667190552, + "learning_rate": 2.3352297870209508e-06, + "loss": 1.3344, + "mean_token_accuracy": 0.6731430192788442, + "num_tokens": 2795681017.0, + "step": 16668 + }, + { + "entropy": 1.6955258548259735, + "epoch": 1.8311773914476395, + "grad_norm": 0.8023842573165894, + "learning_rate": 2.3347970027326043e-06, + "loss": 1.497, + "mean_token_accuracy": 0.6416831761598587, + "num_tokens": 2795871098.0, + "step": 16669 + }, + { + "entropy": 1.7174125413099925, + "epoch": 1.8312872483590124, + "grad_norm": 0.6527412533760071, + "learning_rate": 2.3343644926940253e-06, + "loss": 1.2843, + "mean_token_accuracy": 0.6673098454872767, + "num_tokens": 2796022135.0, + "step": 16670 + }, + { + "entropy": 1.6860974431037903, + "epoch": 1.8313971052703852, + "grad_norm": 0.6887062788009644, + "learning_rate": 2.3339322569189074e-06, + "loss": 1.4243, + "mean_token_accuracy": 0.6650121112664541, + "num_tokens": 2796181833.0, + "step": 16671 + }, + { + "entropy": 1.6747096180915833, + "epoch": 1.8315069621817583, + "grad_norm": 0.7698276042938232, + "learning_rate": 2.3335002954209285e-06, + "loss": 1.4173, + "mean_token_accuracy": 0.6679667383432388, + "num_tokens": 2796341629.0, + "step": 16672 + }, + { + "entropy": 1.705536663532257, + "epoch": 1.8316168190931312, + "grad_norm": 0.7042364478111267, + "learning_rate": 2.33306860821376e-06, + "loss": 1.5278, + "mean_token_accuracy": 0.6453391214211782, + "num_tokens": 2796497957.0, + "step": 16673 + }, + { + "entropy": 1.678510695695877, + "epoch": 1.8317266760045041, + "grad_norm": 0.6615474820137024, + "learning_rate": 2.3326371953110642e-06, + "loss": 1.3508, + "mean_token_accuracy": 0.6713146766026815, + "num_tokens": 2796665935.0, + "step": 16674 + }, + { + "entropy": 1.6524465282758076, + "epoch": 1.8318365329158772, + "grad_norm": 0.6595404744148254, + "learning_rate": 2.332206056726495e-06, + "loss": 1.3634, + "mean_token_accuracy": 0.6558839529752731, + "num_tokens": 2796839734.0, + "step": 16675 + }, + { + "entropy": 1.683104048172633, + "epoch": 1.8319463898272499, + "grad_norm": 0.59060138463974, + "learning_rate": 2.3317751924736994e-06, + "loss": 1.4722, + "mean_token_accuracy": 0.6600749840339025, + "num_tokens": 2797058425.0, + "step": 16676 + }, + { + "entropy": 1.690855731566747, + "epoch": 1.832056246738623, + "grad_norm": 0.7545903921127319, + "learning_rate": 2.331344602566313e-06, + "loss": 1.1887, + "mean_token_accuracy": 0.6840762843688329, + "num_tokens": 2797168514.0, + "step": 16677 + }, + { + "entropy": 1.7053393423557281, + "epoch": 1.8321661036499959, + "grad_norm": 0.6647628545761108, + "learning_rate": 2.3309142870179624e-06, + "loss": 1.2683, + "mean_token_accuracy": 0.6724948883056641, + "num_tokens": 2797306928.0, + "step": 16678 + }, + { + "entropy": 1.7144875427087147, + "epoch": 1.8322759605613688, + "grad_norm": 0.7609655857086182, + "learning_rate": 2.3304842458422687e-06, + "loss": 1.4754, + "mean_token_accuracy": 0.6519733120997747, + "num_tokens": 2797455985.0, + "step": 16679 + }, + { + "entropy": 1.695515791575114, + "epoch": 1.8323858174727419, + "grad_norm": 0.7155903577804565, + "learning_rate": 2.330054479052844e-06, + "loss": 1.3468, + "mean_token_accuracy": 0.6665776371955872, + "num_tokens": 2797601314.0, + "step": 16680 + }, + { + "entropy": 1.730222334464391, + "epoch": 1.8324956743841145, + "grad_norm": 0.6431559920310974, + "learning_rate": 2.329624986663286e-06, + "loss": 1.4597, + "mean_token_accuracy": 0.652314489086469, + "num_tokens": 2797755866.0, + "step": 16681 + }, + { + "entropy": 1.7097695469856262, + "epoch": 1.8326055312954876, + "grad_norm": 0.7711726427078247, + "learning_rate": 2.3291957686871906e-06, + "loss": 1.3307, + "mean_token_accuracy": 0.6789915611346563, + "num_tokens": 2797949501.0, + "step": 16682 + }, + { + "entropy": 1.7180574436982472, + "epoch": 1.8327153882068605, + "grad_norm": 0.629719614982605, + "learning_rate": 2.3287668251381425e-06, + "loss": 1.341, + "mean_token_accuracy": 0.661995048324267, + "num_tokens": 2798137574.0, + "step": 16683 + }, + { + "entropy": 1.663592944542567, + "epoch": 1.8328252451182334, + "grad_norm": 0.5827559232711792, + "learning_rate": 2.3283381560297174e-06, + "loss": 1.3726, + "mean_token_accuracy": 0.6665849586327871, + "num_tokens": 2798318692.0, + "step": 16684 + }, + { + "entropy": 1.7471620738506317, + "epoch": 1.8329351020296065, + "grad_norm": 0.6353728175163269, + "learning_rate": 2.327909761375481e-06, + "loss": 1.5038, + "mean_token_accuracy": 0.6181689401467642, + "num_tokens": 2798535717.0, + "step": 16685 + }, + { + "entropy": 1.7170325716336567, + "epoch": 1.8330449589409794, + "grad_norm": 0.5999660491943359, + "learning_rate": 2.327481641188994e-06, + "loss": 1.4572, + "mean_token_accuracy": 0.6410307437181473, + "num_tokens": 2798743059.0, + "step": 16686 + }, + { + "entropy": 1.6927921573321025, + "epoch": 1.8331548158523523, + "grad_norm": 0.767248272895813, + "learning_rate": 2.327053795483804e-06, + "loss": 1.3298, + "mean_token_accuracy": 0.6670361459255219, + "num_tokens": 2798877340.0, + "step": 16687 + }, + { + "entropy": 1.7161982754866283, + "epoch": 1.8332646727637254, + "grad_norm": 0.6494265198707581, + "learning_rate": 2.3266262242734533e-06, + "loss": 1.4735, + "mean_token_accuracy": 0.6449993848800659, + "num_tokens": 2799071422.0, + "step": 16688 + }, + { + "entropy": 1.6752700805664062, + "epoch": 1.833374529675098, + "grad_norm": 0.6607106924057007, + "learning_rate": 2.326198927571476e-06, + "loss": 1.4465, + "mean_token_accuracy": 0.6546447724103928, + "num_tokens": 2799256633.0, + "step": 16689 + }, + { + "entropy": 1.6198724607626598, + "epoch": 1.8334843865864712, + "grad_norm": 0.6523711085319519, + "learning_rate": 2.3257719053913918e-06, + "loss": 1.3877, + "mean_token_accuracy": 0.6629767715930939, + "num_tokens": 2799420133.0, + "step": 16690 + }, + { + "entropy": 1.65846848487854, + "epoch": 1.833594243497844, + "grad_norm": 0.7563357353210449, + "learning_rate": 2.325345157746719e-06, + "loss": 1.3276, + "mean_token_accuracy": 0.6739385028680166, + "num_tokens": 2799565448.0, + "step": 16691 + }, + { + "entropy": 1.715136726697286, + "epoch": 1.833704100409217, + "grad_norm": 0.9137521386146545, + "learning_rate": 2.324918684650965e-06, + "loss": 1.2287, + "mean_token_accuracy": 0.6744515299797058, + "num_tokens": 2799666308.0, + "step": 16692 + }, + { + "entropy": 1.7323183019955952, + "epoch": 1.83381395732059, + "grad_norm": 0.6691234111785889, + "learning_rate": 2.324492486117623e-06, + "loss": 1.4001, + "mean_token_accuracy": 0.6475772460301717, + "num_tokens": 2799802850.0, + "step": 16693 + }, + { + "entropy": 1.7045779128869374, + "epoch": 1.8339238142319627, + "grad_norm": 0.6672487854957581, + "learning_rate": 2.3240665621601845e-06, + "loss": 1.394, + "mean_token_accuracy": 0.6490417867898941, + "num_tokens": 2799931349.0, + "step": 16694 + }, + { + "entropy": 1.7110650738080342, + "epoch": 1.8340336711433358, + "grad_norm": 0.9822511672973633, + "learning_rate": 2.323640912792131e-06, + "loss": 1.4708, + "mean_token_accuracy": 0.6601487100124359, + "num_tokens": 2800082985.0, + "step": 16695 + }, + { + "entropy": 1.7707766592502594, + "epoch": 1.8341435280547087, + "grad_norm": 0.6212161779403687, + "learning_rate": 2.3232155380269334e-06, + "loss": 1.4198, + "mean_token_accuracy": 0.6484910945097605, + "num_tokens": 2800227500.0, + "step": 16696 + }, + { + "entropy": 1.7343395352363586, + "epoch": 1.8342533849660816, + "grad_norm": 0.6109079718589783, + "learning_rate": 2.3227904378780525e-06, + "loss": 1.4134, + "mean_token_accuracy": 0.6515641411145529, + "num_tokens": 2800407929.0, + "step": 16697 + }, + { + "entropy": 1.7123263776302338, + "epoch": 1.8343632418774547, + "grad_norm": 0.7430224418640137, + "learning_rate": 2.3223656123589465e-06, + "loss": 1.4596, + "mean_token_accuracy": 0.6462646871805191, + "num_tokens": 2800592020.0, + "step": 16698 + }, + { + "entropy": 1.760039468606313, + "epoch": 1.8344730987888276, + "grad_norm": 0.7846954464912415, + "learning_rate": 2.3219410614830565e-06, + "loss": 1.3439, + "mean_token_accuracy": 0.6587338050206503, + "num_tokens": 2800717471.0, + "step": 16699 + }, + { + "entropy": 1.71830815076828, + "epoch": 1.8345829557002005, + "grad_norm": 0.833976686000824, + "learning_rate": 2.321516785263822e-06, + "loss": 1.4485, + "mean_token_accuracy": 0.6538856824239095, + "num_tokens": 2800878599.0, + "step": 16700 + }, + { + "entropy": 1.734289248784383, + "epoch": 1.8346928126115736, + "grad_norm": 0.6943913698196411, + "learning_rate": 2.321092783714671e-06, + "loss": 1.3736, + "mean_token_accuracy": 0.6654922415812811, + "num_tokens": 2801021962.0, + "step": 16701 + }, + { + "entropy": 1.7423664331436157, + "epoch": 1.8348026695229462, + "grad_norm": 0.665240466594696, + "learning_rate": 2.3206690568490227e-06, + "loss": 1.3895, + "mean_token_accuracy": 0.6504726807276408, + "num_tokens": 2801170792.0, + "step": 16702 + }, + { + "entropy": 1.6692781150341034, + "epoch": 1.8349125264343193, + "grad_norm": 0.6680687069892883, + "learning_rate": 2.320245604680287e-06, + "loss": 1.4679, + "mean_token_accuracy": 0.6471812377373377, + "num_tokens": 2801333096.0, + "step": 16703 + }, + { + "entropy": 1.73446982105573, + "epoch": 1.8350223833456922, + "grad_norm": 0.6099308133125305, + "learning_rate": 2.3198224272218688e-06, + "loss": 1.4017, + "mean_token_accuracy": 0.6566864202419916, + "num_tokens": 2801528350.0, + "step": 16704 + }, + { + "entropy": 1.7558285593986511, + "epoch": 1.8351322402570651, + "grad_norm": 0.6862417459487915, + "learning_rate": 2.3193995244871563e-06, + "loss": 1.4117, + "mean_token_accuracy": 0.6532981991767883, + "num_tokens": 2801649689.0, + "step": 16705 + }, + { + "entropy": 1.668075293302536, + "epoch": 1.8352420971684382, + "grad_norm": 0.6663626432418823, + "learning_rate": 2.318976896489539e-06, + "loss": 1.2259, + "mean_token_accuracy": 0.6728375951449076, + "num_tokens": 2801778768.0, + "step": 16706 + }, + { + "entropy": 1.7122756640116374, + "epoch": 1.8353519540798109, + "grad_norm": 0.6838109493255615, + "learning_rate": 2.3185545432423913e-06, + "loss": 1.3666, + "mean_token_accuracy": 0.6625057260195414, + "num_tokens": 2801920311.0, + "step": 16707 + }, + { + "entropy": 1.6512116491794586, + "epoch": 1.835461810991184, + "grad_norm": 0.6728096008300781, + "learning_rate": 2.31813246475908e-06, + "loss": 1.4699, + "mean_token_accuracy": 0.6534949143727621, + "num_tokens": 2802092232.0, + "step": 16708 + }, + { + "entropy": 1.7649835646152496, + "epoch": 1.8355716679025569, + "grad_norm": 0.6703423261642456, + "learning_rate": 2.3177106610529636e-06, + "loss": 1.3734, + "mean_token_accuracy": 0.670386994878451, + "num_tokens": 2802257287.0, + "step": 16709 + }, + { + "entropy": 1.709249993165334, + "epoch": 1.8356815248139298, + "grad_norm": 0.6634789109230042, + "learning_rate": 2.317289132137394e-06, + "loss": 1.4009, + "mean_token_accuracy": 0.6566235572099686, + "num_tokens": 2802415941.0, + "step": 16710 + }, + { + "entropy": 1.6666079958279927, + "epoch": 1.8357913817253029, + "grad_norm": 0.7016635537147522, + "learning_rate": 2.3168678780257087e-06, + "loss": 1.2665, + "mean_token_accuracy": 0.6808893928925196, + "num_tokens": 2802520782.0, + "step": 16711 + }, + { + "entropy": 1.7787012954552968, + "epoch": 1.8359012386366758, + "grad_norm": 0.803626298904419, + "learning_rate": 2.316446898731243e-06, + "loss": 1.3076, + "mean_token_accuracy": 0.669058566292127, + "num_tokens": 2802665165.0, + "step": 16712 + }, + { + "entropy": 1.7142982184886932, + "epoch": 1.8360110955480486, + "grad_norm": 0.6213630437850952, + "learning_rate": 2.3160261942673214e-06, + "loss": 1.4598, + "mean_token_accuracy": 0.6640812555948893, + "num_tokens": 2802817063.0, + "step": 16713 + }, + { + "entropy": 1.6978369255860646, + "epoch": 1.8361209524594218, + "grad_norm": 0.7595458030700684, + "learning_rate": 2.315605764647256e-06, + "loss": 1.2793, + "mean_token_accuracy": 0.6742851883172989, + "num_tokens": 2802976665.0, + "step": 16714 + }, + { + "entropy": 1.7584485709667206, + "epoch": 1.8362308093707944, + "grad_norm": 0.6625379323959351, + "learning_rate": 2.3151856098843546e-06, + "loss": 1.3989, + "mean_token_accuracy": 0.6467997978130976, + "num_tokens": 2803145950.0, + "step": 16715 + }, + { + "entropy": 1.6833869119485219, + "epoch": 1.8363406662821675, + "grad_norm": 0.7335963249206543, + "learning_rate": 2.314765729991918e-06, + "loss": 1.3019, + "mean_token_accuracy": 0.67206671833992, + "num_tokens": 2803299408.0, + "step": 16716 + }, + { + "entropy": 1.693650444348653, + "epoch": 1.8364505231935404, + "grad_norm": 1.7734737396240234, + "learning_rate": 2.31434612498323e-06, + "loss": 1.2178, + "mean_token_accuracy": 0.6751609444618225, + "num_tokens": 2803465084.0, + "step": 16717 + }, + { + "entropy": 1.657290409008662, + "epoch": 1.8365603801049133, + "grad_norm": 0.6704737544059753, + "learning_rate": 2.3139267948715727e-06, + "loss": 1.2829, + "mean_token_accuracy": 0.6748186101516088, + "num_tokens": 2803606663.0, + "step": 16718 + }, + { + "entropy": 1.6640637814998627, + "epoch": 1.8366702370162864, + "grad_norm": 0.6008581519126892, + "learning_rate": 2.3135077396702205e-06, + "loss": 1.4499, + "mean_token_accuracy": 0.6364815980195999, + "num_tokens": 2803903340.0, + "step": 16719 + }, + { + "entropy": 1.7623928785324097, + "epoch": 1.836780093927659, + "grad_norm": 0.6992172598838806, + "learning_rate": 2.313088959392434e-06, + "loss": 1.4895, + "mean_token_accuracy": 0.6527946243683497, + "num_tokens": 2804062195.0, + "step": 16720 + }, + { + "entropy": 1.6591077148914337, + "epoch": 1.8368899508390322, + "grad_norm": 0.6158271431922913, + "learning_rate": 2.312670454051466e-06, + "loss": 1.5485, + "mean_token_accuracy": 0.636052280664444, + "num_tokens": 2804285283.0, + "step": 16721 + }, + { + "entropy": 1.7128118971983592, + "epoch": 1.836999807750405, + "grad_norm": 0.5718826055526733, + "learning_rate": 2.3122522236605645e-06, + "loss": 1.4816, + "mean_token_accuracy": 0.639900396267573, + "num_tokens": 2804515546.0, + "step": 16722 + }, + { + "entropy": 1.6636869013309479, + "epoch": 1.837109664661778, + "grad_norm": 0.6339669227600098, + "learning_rate": 2.311834268232964e-06, + "loss": 1.3845, + "mean_token_accuracy": 0.661793996890386, + "num_tokens": 2804720983.0, + "step": 16723 + }, + { + "entropy": 1.6897972722848256, + "epoch": 1.837219521573151, + "grad_norm": 0.6934084296226501, + "learning_rate": 2.311416587781895e-06, + "loss": 1.215, + "mean_token_accuracy": 0.6849165956179301, + "num_tokens": 2804878895.0, + "step": 16724 + }, + { + "entropy": 1.757619212071101, + "epoch": 1.837329378484524, + "grad_norm": 0.6026404500007629, + "learning_rate": 2.3109991823205763e-06, + "loss": 1.3753, + "mean_token_accuracy": 0.6520447432994843, + "num_tokens": 2805043646.0, + "step": 16725 + }, + { + "entropy": 1.7655748923619587, + "epoch": 1.8374392353958968, + "grad_norm": 0.6365966200828552, + "learning_rate": 2.310582051862217e-06, + "loss": 1.3717, + "mean_token_accuracy": 0.6470306913057963, + "num_tokens": 2805192125.0, + "step": 16726 + }, + { + "entropy": 1.7034188906351726, + "epoch": 1.83754909230727, + "grad_norm": 0.8369081020355225, + "learning_rate": 2.310165196420021e-06, + "loss": 1.2939, + "mean_token_accuracy": 0.668337215979894, + "num_tokens": 2805332008.0, + "step": 16727 + }, + { + "entropy": 1.7169418434302013, + "epoch": 1.8376589492186426, + "grad_norm": 0.707994818687439, + "learning_rate": 2.309748616007181e-06, + "loss": 1.6019, + "mean_token_accuracy": 0.6377008507649103, + "num_tokens": 2805520557.0, + "step": 16728 + }, + { + "entropy": 1.739993025859197, + "epoch": 1.8377688061300157, + "grad_norm": 0.7567148208618164, + "learning_rate": 2.3093323106368804e-06, + "loss": 1.219, + "mean_token_accuracy": 0.6780005594094595, + "num_tokens": 2805656356.0, + "step": 16729 + }, + { + "entropy": 1.7764423092206318, + "epoch": 1.8378786630413886, + "grad_norm": 0.7887062430381775, + "learning_rate": 2.308916280322296e-06, + "loss": 1.5156, + "mean_token_accuracy": 0.6396622359752655, + "num_tokens": 2805861144.0, + "step": 16730 + }, + { + "entropy": 1.6603560149669647, + "epoch": 1.8379885199527615, + "grad_norm": 0.6049760580062866, + "learning_rate": 2.3085005250765965e-06, + "loss": 1.3529, + "mean_token_accuracy": 0.6593878070513407, + "num_tokens": 2806012980.0, + "step": 16731 + }, + { + "entropy": 1.745975524187088, + "epoch": 1.8380983768641346, + "grad_norm": 0.694965124130249, + "learning_rate": 2.3080850449129375e-06, + "loss": 1.5094, + "mean_token_accuracy": 0.6494153340657552, + "num_tokens": 2806192613.0, + "step": 16732 + }, + { + "entropy": 1.638623684644699, + "epoch": 1.8382082337755072, + "grad_norm": 0.5887953639030457, + "learning_rate": 2.3076698398444714e-06, + "loss": 1.318, + "mean_token_accuracy": 0.6599133412043253, + "num_tokens": 2806347525.0, + "step": 16733 + }, + { + "entropy": 1.7663246889909108, + "epoch": 1.8383180906868803, + "grad_norm": 0.7165161967277527, + "learning_rate": 2.307254909884337e-06, + "loss": 1.4706, + "mean_token_accuracy": 0.6425551424423853, + "num_tokens": 2806567225.0, + "step": 16734 + }, + { + "entropy": 1.6672922571500142, + "epoch": 1.8384279475982532, + "grad_norm": 0.7588421106338501, + "learning_rate": 2.3068402550456666e-06, + "loss": 1.3066, + "mean_token_accuracy": 0.6717980951070786, + "num_tokens": 2806721250.0, + "step": 16735 + }, + { + "entropy": 1.6799332797527313, + "epoch": 1.8385378045096261, + "grad_norm": 0.6996718645095825, + "learning_rate": 2.3064258753415876e-06, + "loss": 1.427, + "mean_token_accuracy": 0.6578944275776545, + "num_tokens": 2806893990.0, + "step": 16736 + }, + { + "entropy": 1.7593200008074443, + "epoch": 1.8386476614209992, + "grad_norm": 0.7356519103050232, + "learning_rate": 2.30601177078521e-06, + "loss": 1.5172, + "mean_token_accuracy": 0.6432800640662512, + "num_tokens": 2807056845.0, + "step": 16737 + }, + { + "entropy": 1.770335892836253, + "epoch": 1.838757518332372, + "grad_norm": 0.7394158840179443, + "learning_rate": 2.305597941389643e-06, + "loss": 1.4034, + "mean_token_accuracy": 0.650958850979805, + "num_tokens": 2807199870.0, + "step": 16738 + }, + { + "entropy": 1.683393657207489, + "epoch": 1.838867375243745, + "grad_norm": 0.7186655402183533, + "learning_rate": 2.305184387167984e-06, + "loss": 1.4326, + "mean_token_accuracy": 0.6676509827375412, + "num_tokens": 2807428387.0, + "step": 16739 + }, + { + "entropy": 1.7485286990801494, + "epoch": 1.838977232155118, + "grad_norm": 0.6906415224075317, + "learning_rate": 2.3047711081333206e-06, + "loss": 1.2982, + "mean_token_accuracy": 0.6647703647613525, + "num_tokens": 2807578751.0, + "step": 16740 + }, + { + "entropy": 1.7130445539951324, + "epoch": 1.8390870890664908, + "grad_norm": 0.6014775037765503, + "learning_rate": 2.304358104298733e-06, + "loss": 1.5085, + "mean_token_accuracy": 0.6496585061152776, + "num_tokens": 2807789299.0, + "step": 16741 + }, + { + "entropy": 1.7314409911632538, + "epoch": 1.8391969459778639, + "grad_norm": 0.8532242178916931, + "learning_rate": 2.3039453756772944e-06, + "loss": 1.5082, + "mean_token_accuracy": 0.6510532250006994, + "num_tokens": 2807931287.0, + "step": 16742 + }, + { + "entropy": 1.6471184194087982, + "epoch": 1.8393068028892368, + "grad_norm": 0.7204332947731018, + "learning_rate": 2.3035329222820648e-06, + "loss": 1.3199, + "mean_token_accuracy": 0.6603179921706518, + "num_tokens": 2808087757.0, + "step": 16743 + }, + { + "entropy": 1.6706886788209279, + "epoch": 1.8394166598006096, + "grad_norm": 0.6390844583511353, + "learning_rate": 2.3031207441261006e-06, + "loss": 1.3195, + "mean_token_accuracy": 0.6594639817873637, + "num_tokens": 2808250689.0, + "step": 16744 + }, + { + "entropy": 1.7221011817455292, + "epoch": 1.8395265167119828, + "grad_norm": 0.7180662751197815, + "learning_rate": 2.302708841222445e-06, + "loss": 1.2913, + "mean_token_accuracy": 0.6685677369435629, + "num_tokens": 2808358557.0, + "step": 16745 + }, + { + "entropy": 1.6975955367088318, + "epoch": 1.8396363736233554, + "grad_norm": 0.8012198209762573, + "learning_rate": 2.3022972135841354e-06, + "loss": 1.5236, + "mean_token_accuracy": 0.636689285437266, + "num_tokens": 2808536243.0, + "step": 16746 + }, + { + "entropy": 1.70916286110878, + "epoch": 1.8397462305347285, + "grad_norm": 0.6637392044067383, + "learning_rate": 2.3018858612241997e-06, + "loss": 1.3551, + "mean_token_accuracy": 0.6624687761068344, + "num_tokens": 2808674739.0, + "step": 16747 + }, + { + "entropy": 1.7597693900267284, + "epoch": 1.8398560874461014, + "grad_norm": 0.9263545274734497, + "learning_rate": 2.3014747841556583e-06, + "loss": 1.4849, + "mean_token_accuracy": 0.6398697346448898, + "num_tokens": 2808878557.0, + "step": 16748 + }, + { + "entropy": 1.7256910403569539, + "epoch": 1.8399659443574743, + "grad_norm": 0.695277988910675, + "learning_rate": 2.301063982391519e-06, + "loss": 1.3442, + "mean_token_accuracy": 0.673882856965065, + "num_tokens": 2809033754.0, + "step": 16749 + }, + { + "entropy": 1.6844683488210042, + "epoch": 1.8400758012688474, + "grad_norm": 0.775905430316925, + "learning_rate": 2.300653455944785e-06, + "loss": 1.2839, + "mean_token_accuracy": 0.6783890922864279, + "num_tokens": 2809173907.0, + "step": 16750 + }, + { + "entropy": 1.660104662179947, + "epoch": 1.8401856581802203, + "grad_norm": 0.9453991055488586, + "learning_rate": 2.3002432048284495e-06, + "loss": 1.569, + "mean_token_accuracy": 0.6625748674074808, + "num_tokens": 2809371151.0, + "step": 16751 + }, + { + "entropy": 1.6786755224068959, + "epoch": 1.8402955150915932, + "grad_norm": 0.9610604643821716, + "learning_rate": 2.299833229055497e-06, + "loss": 1.0901, + "mean_token_accuracy": 0.6992639452219009, + "num_tokens": 2809557765.0, + "step": 16752 + }, + { + "entropy": 1.6895591119925182, + "epoch": 1.8404053720029663, + "grad_norm": 0.6605542898178101, + "learning_rate": 2.2994235286389006e-06, + "loss": 1.4095, + "mean_token_accuracy": 0.6520673781633377, + "num_tokens": 2809742936.0, + "step": 16753 + }, + { + "entropy": 1.63118776679039, + "epoch": 1.840515228914339, + "grad_norm": 0.5481594800949097, + "learning_rate": 2.2990141035916304e-06, + "loss": 1.4048, + "mean_token_accuracy": 0.6418586075305939, + "num_tokens": 2809960815.0, + "step": 16754 + }, + { + "entropy": 1.64922496676445, + "epoch": 1.840625085825712, + "grad_norm": 0.6355406641960144, + "learning_rate": 2.298604953926642e-06, + "loss": 1.4373, + "mean_token_accuracy": 0.6513316084941229, + "num_tokens": 2810165370.0, + "step": 16755 + }, + { + "entropy": 1.6678697963555653, + "epoch": 1.840734942737085, + "grad_norm": 0.7785813212394714, + "learning_rate": 2.2981960796568873e-06, + "loss": 1.5217, + "mean_token_accuracy": 0.65113993982474, + "num_tokens": 2810349269.0, + "step": 16756 + }, + { + "entropy": 1.7382381856441498, + "epoch": 1.8408447996484578, + "grad_norm": 0.8284785747528076, + "learning_rate": 2.297787480795305e-06, + "loss": 1.442, + "mean_token_accuracy": 0.6547079781691233, + "num_tokens": 2810533928.0, + "step": 16757 + }, + { + "entropy": 1.6844376226266224, + "epoch": 1.840954656559831, + "grad_norm": 0.7436577677726746, + "learning_rate": 2.2973791573548267e-06, + "loss": 1.3838, + "mean_token_accuracy": 0.6586803744236628, + "num_tokens": 2810689879.0, + "step": 16758 + }, + { + "entropy": 1.7238931755224864, + "epoch": 1.8410645134712038, + "grad_norm": 0.6770063042640686, + "learning_rate": 2.2969711093483765e-06, + "loss": 1.4644, + "mean_token_accuracy": 0.6473502864440283, + "num_tokens": 2810846252.0, + "step": 16759 + }, + { + "entropy": 1.776161293188731, + "epoch": 1.8411743703825767, + "grad_norm": 0.601456344127655, + "learning_rate": 2.2965633367888716e-06, + "loss": 1.3195, + "mean_token_accuracy": 0.6596719473600388, + "num_tokens": 2810987101.0, + "step": 16760 + }, + { + "entropy": 1.6843051811059315, + "epoch": 1.8412842272939496, + "grad_norm": 0.746356189250946, + "learning_rate": 2.296155839689213e-06, + "loss": 1.3483, + "mean_token_accuracy": 0.6751666714747747, + "num_tokens": 2811153094.0, + "step": 16761 + }, + { + "entropy": 1.7052331566810608, + "epoch": 1.8413940842053225, + "grad_norm": 0.6584969162940979, + "learning_rate": 2.295748618062299e-06, + "loss": 1.4244, + "mean_token_accuracy": 0.6420090397198995, + "num_tokens": 2811356100.0, + "step": 16762 + }, + { + "entropy": 1.6847927769025166, + "epoch": 1.8415039411166956, + "grad_norm": 0.6887544393539429, + "learning_rate": 2.2953416719210216e-06, + "loss": 1.3471, + "mean_token_accuracy": 0.6726632316907247, + "num_tokens": 2811473596.0, + "step": 16763 + }, + { + "entropy": 1.6632155577341716, + "epoch": 1.8416137980280685, + "grad_norm": 0.7095504999160767, + "learning_rate": 2.2949350012782563e-06, + "loss": 1.4566, + "mean_token_accuracy": 0.6513134290774664, + "num_tokens": 2811660773.0, + "step": 16764 + }, + { + "entropy": 1.7216549217700958, + "epoch": 1.8417236549394413, + "grad_norm": 0.7090489268302917, + "learning_rate": 2.2945286061468764e-06, + "loss": 1.4007, + "mean_token_accuracy": 0.6596039036909739, + "num_tokens": 2811832001.0, + "step": 16765 + }, + { + "entropy": 1.7071708242098491, + "epoch": 1.8418335118508145, + "grad_norm": 0.6916561126708984, + "learning_rate": 2.2941224865397428e-06, + "loss": 1.5654, + "mean_token_accuracy": 0.630169411500295, + "num_tokens": 2812111813.0, + "step": 16766 + }, + { + "entropy": 1.7071344057718914, + "epoch": 1.8419433687621871, + "grad_norm": 0.6458945274353027, + "learning_rate": 2.293716642469709e-06, + "loss": 1.3842, + "mean_token_accuracy": 0.6630050440629324, + "num_tokens": 2812271734.0, + "step": 16767 + }, + { + "entropy": 1.727453351020813, + "epoch": 1.8420532256735602, + "grad_norm": 0.7190876603126526, + "learning_rate": 2.2933110739496217e-06, + "loss": 1.4531, + "mean_token_accuracy": 0.6521121064821879, + "num_tokens": 2812433572.0, + "step": 16768 + }, + { + "entropy": 1.6626160542170207, + "epoch": 1.842163082584933, + "grad_norm": 0.8066210746765137, + "learning_rate": 2.2929057809923155e-06, + "loss": 1.4449, + "mean_token_accuracy": 0.6541995108127594, + "num_tokens": 2812644959.0, + "step": 16769 + }, + { + "entropy": 1.7399661739667256, + "epoch": 1.842272939496306, + "grad_norm": 0.6394560933113098, + "learning_rate": 2.2925007636106167e-06, + "loss": 1.3539, + "mean_token_accuracy": 0.6547619154055914, + "num_tokens": 2812803221.0, + "step": 16770 + }, + { + "entropy": 1.7124665677547455, + "epoch": 1.842382796407679, + "grad_norm": 0.8350101709365845, + "learning_rate": 2.292096021817345e-06, + "loss": 1.3946, + "mean_token_accuracy": 0.6505677302678426, + "num_tokens": 2812976208.0, + "step": 16771 + }, + { + "entropy": 1.7322071393330891, + "epoch": 1.842492653319052, + "grad_norm": 0.7939152121543884, + "learning_rate": 2.2916915556253123e-06, + "loss": 1.4203, + "mean_token_accuracy": 0.661156415939331, + "num_tokens": 2813157911.0, + "step": 16772 + }, + { + "entropy": 1.7652178903420765, + "epoch": 1.8426025102304249, + "grad_norm": 0.7062113285064697, + "learning_rate": 2.291287365047316e-06, + "loss": 1.5109, + "mean_token_accuracy": 0.6431985199451447, + "num_tokens": 2813344031.0, + "step": 16773 + }, + { + "entropy": 1.6703122456868489, + "epoch": 1.8427123671417978, + "grad_norm": 0.713137686252594, + "learning_rate": 2.2908834500961504e-06, + "loss": 1.2947, + "mean_token_accuracy": 0.6722335070371628, + "num_tokens": 2813509598.0, + "step": 16774 + }, + { + "entropy": 1.73709570368131, + "epoch": 1.8428222240531706, + "grad_norm": 0.9680473804473877, + "learning_rate": 2.290479810784599e-06, + "loss": 1.387, + "mean_token_accuracy": 0.6628308445215225, + "num_tokens": 2813646975.0, + "step": 16775 + }, + { + "entropy": 1.7047974566618602, + "epoch": 1.8429320809645438, + "grad_norm": 0.6554457545280457, + "learning_rate": 2.2900764471254385e-06, + "loss": 1.4557, + "mean_token_accuracy": 0.6604510943094889, + "num_tokens": 2813795279.0, + "step": 16776 + }, + { + "entropy": 1.6860364377498627, + "epoch": 1.8430419378759166, + "grad_norm": 0.6723480820655823, + "learning_rate": 2.2896733591314315e-06, + "loss": 1.244, + "mean_token_accuracy": 0.6721046268939972, + "num_tokens": 2813934809.0, + "step": 16777 + }, + { + "entropy": 1.6676070193449657, + "epoch": 1.8431517947872895, + "grad_norm": 0.7274590730667114, + "learning_rate": 2.28927054681534e-06, + "loss": 1.2252, + "mean_token_accuracy": 0.6727160960435867, + "num_tokens": 2814065654.0, + "step": 16778 + }, + { + "entropy": 1.6777145564556122, + "epoch": 1.8432616516986626, + "grad_norm": 0.7008301019668579, + "learning_rate": 2.2888680101899086e-06, + "loss": 1.2634, + "mean_token_accuracy": 0.6827175964911779, + "num_tokens": 2814237054.0, + "step": 16779 + }, + { + "entropy": 1.6733269294102986, + "epoch": 1.8433715086100353, + "grad_norm": 0.7075291872024536, + "learning_rate": 2.28846574926788e-06, + "loss": 1.2871, + "mean_token_accuracy": 0.6724706093470255, + "num_tokens": 2814372528.0, + "step": 16780 + }, + { + "entropy": 1.7397844890753429, + "epoch": 1.8434813655214084, + "grad_norm": 0.6934898495674133, + "learning_rate": 2.288063764061986e-06, + "loss": 1.491, + "mean_token_accuracy": 0.6440728902816772, + "num_tokens": 2814564424.0, + "step": 16781 + }, + { + "entropy": 1.6866810023784637, + "epoch": 1.8435912224327813, + "grad_norm": 0.6557011008262634, + "learning_rate": 2.2876620545849465e-06, + "loss": 1.3145, + "mean_token_accuracy": 0.66270412504673, + "num_tokens": 2814761649.0, + "step": 16782 + }, + { + "entropy": 1.6241275866826375, + "epoch": 1.8437010793441542, + "grad_norm": 0.6553396582603455, + "learning_rate": 2.2872606208494775e-06, + "loss": 1.4424, + "mean_token_accuracy": 0.6536834836006165, + "num_tokens": 2814932983.0, + "step": 16783 + }, + { + "entropy": 1.6947355270385742, + "epoch": 1.8438109362555273, + "grad_norm": 0.8572350740432739, + "learning_rate": 2.286859462868286e-06, + "loss": 1.3464, + "mean_token_accuracy": 0.6532238374153773, + "num_tokens": 2815067834.0, + "step": 16784 + }, + { + "entropy": 1.7147201299667358, + "epoch": 1.8439207931669002, + "grad_norm": 0.6992621421813965, + "learning_rate": 2.2864585806540637e-06, + "loss": 1.3477, + "mean_token_accuracy": 0.6593698014815649, + "num_tokens": 2815250511.0, + "step": 16785 + }, + { + "entropy": 1.7203228970368702, + "epoch": 1.844030650078273, + "grad_norm": 0.7004925012588501, + "learning_rate": 2.2860579742195016e-06, + "loss": 1.3027, + "mean_token_accuracy": 0.6743641148010889, + "num_tokens": 2815388655.0, + "step": 16786 + }, + { + "entropy": 1.6600177884101868, + "epoch": 1.844140506989646, + "grad_norm": 0.6689477562904358, + "learning_rate": 2.285657643577278e-06, + "loss": 1.1647, + "mean_token_accuracy": 0.6987222582101822, + "num_tokens": 2815523979.0, + "step": 16787 + }, + { + "entropy": 1.6286835670471191, + "epoch": 1.8442503639010188, + "grad_norm": 0.632027804851532, + "learning_rate": 2.285257588740064e-06, + "loss": 1.3428, + "mean_token_accuracy": 0.6710825363794962, + "num_tokens": 2815756939.0, + "step": 16788 + }, + { + "entropy": 1.6791391670703888, + "epoch": 1.844360220812392, + "grad_norm": 0.6494190692901611, + "learning_rate": 2.2848578097205193e-06, + "loss": 1.4686, + "mean_token_accuracy": 0.6351282844940821, + "num_tokens": 2815934346.0, + "step": 16789 + }, + { + "entropy": 1.6839772363503773, + "epoch": 1.8444700777237648, + "grad_norm": 0.8661864995956421, + "learning_rate": 2.284458306531298e-06, + "loss": 1.4426, + "mean_token_accuracy": 0.6467615962028503, + "num_tokens": 2816134609.0, + "step": 16790 + }, + { + "entropy": 1.7479057808717091, + "epoch": 1.8445799346351377, + "grad_norm": 0.9440947771072388, + "learning_rate": 2.2840590791850434e-06, + "loss": 1.4306, + "mean_token_accuracy": 0.6492450833320618, + "num_tokens": 2816298540.0, + "step": 16791 + }, + { + "entropy": 1.6491265694300334, + "epoch": 1.8446897915465108, + "grad_norm": 0.6750578284263611, + "learning_rate": 2.2836601276943944e-06, + "loss": 1.4913, + "mean_token_accuracy": 0.6493054578701655, + "num_tokens": 2816496291.0, + "step": 16792 + }, + { + "entropy": 1.7014791468779247, + "epoch": 1.8447996484578835, + "grad_norm": 0.7774354815483093, + "learning_rate": 2.2832614520719713e-06, + "loss": 1.2901, + "mean_token_accuracy": 0.6726734042167664, + "num_tokens": 2816634154.0, + "step": 16793 + }, + { + "entropy": 1.6888912518819172, + "epoch": 1.8449095053692566, + "grad_norm": 0.6581716537475586, + "learning_rate": 2.2828630523303962e-06, + "loss": 1.2948, + "mean_token_accuracy": 0.668033296863238, + "num_tokens": 2816767661.0, + "step": 16794 + }, + { + "entropy": 1.7331350843111675, + "epoch": 1.8450193622806295, + "grad_norm": 0.8059678673744202, + "learning_rate": 2.2824649284822777e-06, + "loss": 1.2899, + "mean_token_accuracy": 0.6695135881503423, + "num_tokens": 2816904457.0, + "step": 16795 + }, + { + "entropy": 1.7052109042803447, + "epoch": 1.8451292191920023, + "grad_norm": 0.7135511040687561, + "learning_rate": 2.2820670805402166e-06, + "loss": 1.3201, + "mean_token_accuracy": 0.6706758240858713, + "num_tokens": 2817068724.0, + "step": 16796 + }, + { + "entropy": 1.7488359014193218, + "epoch": 1.8452390761033755, + "grad_norm": 0.7513749599456787, + "learning_rate": 2.281669508516803e-06, + "loss": 1.4146, + "mean_token_accuracy": 0.6454348017772039, + "num_tokens": 2817186893.0, + "step": 16797 + }, + { + "entropy": 1.7428977489471436, + "epoch": 1.8453489330147483, + "grad_norm": 0.6585659980773926, + "learning_rate": 2.281272212424622e-06, + "loss": 1.5118, + "mean_token_accuracy": 0.6480231831471125, + "num_tokens": 2817380337.0, + "step": 16798 + }, + { + "entropy": 1.6482553680737813, + "epoch": 1.8454587899261212, + "grad_norm": 0.6863150000572205, + "learning_rate": 2.280875192276245e-06, + "loss": 1.2707, + "mean_token_accuracy": 0.6809622297684351, + "num_tokens": 2817523945.0, + "step": 16799 + }, + { + "entropy": 1.7246152857939403, + "epoch": 1.845568646837494, + "grad_norm": 0.6100006103515625, + "learning_rate": 2.2804784480842414e-06, + "loss": 1.4405, + "mean_token_accuracy": 0.648542195558548, + "num_tokens": 2817701592.0, + "step": 16800 + }, + { + "entropy": 1.7031661570072174, + "epoch": 1.845678503748867, + "grad_norm": 0.6806704998016357, + "learning_rate": 2.2800819798611644e-06, + "loss": 1.3778, + "mean_token_accuracy": 0.6427052021026611, + "num_tokens": 2817867695.0, + "step": 16801 + }, + { + "entropy": 1.7161981364091237, + "epoch": 1.84578836066024, + "grad_norm": 0.6332004070281982, + "learning_rate": 2.2796857876195637e-06, + "loss": 1.4339, + "mean_token_accuracy": 0.6533434242010117, + "num_tokens": 2818056193.0, + "step": 16802 + }, + { + "entropy": 1.7151016394297283, + "epoch": 1.845898217571613, + "grad_norm": 0.7988026142120361, + "learning_rate": 2.279289871371977e-06, + "loss": 1.3272, + "mean_token_accuracy": 0.6582034826278687, + "num_tokens": 2818196429.0, + "step": 16803 + }, + { + "entropy": 1.7060537834962208, + "epoch": 1.8460080744829859, + "grad_norm": 0.7432763576507568, + "learning_rate": 2.2788942311309397e-06, + "loss": 1.3024, + "mean_token_accuracy": 0.6767023553450903, + "num_tokens": 2818331053.0, + "step": 16804 + }, + { + "entropy": 1.683958222468694, + "epoch": 1.846117931394359, + "grad_norm": 0.6856158375740051, + "learning_rate": 2.2784988669089674e-06, + "loss": 1.5868, + "mean_token_accuracy": 0.6441004474957784, + "num_tokens": 2818554982.0, + "step": 16805 + }, + { + "entropy": 1.689920614163081, + "epoch": 1.8462277883057316, + "grad_norm": 0.6839845180511475, + "learning_rate": 2.278103778718577e-06, + "loss": 1.5445, + "mean_token_accuracy": 0.6441525717576345, + "num_tokens": 2818721341.0, + "step": 16806 + }, + { + "entropy": 1.6903660396734874, + "epoch": 1.8463376452171048, + "grad_norm": 0.6059070825576782, + "learning_rate": 2.2777089665722706e-06, + "loss": 1.3686, + "mean_token_accuracy": 0.6590339243412018, + "num_tokens": 2818914745.0, + "step": 16807 + }, + { + "entropy": 1.6847312947114308, + "epoch": 1.8464475021284776, + "grad_norm": 0.6773668527603149, + "learning_rate": 2.2773144304825473e-06, + "loss": 1.3906, + "mean_token_accuracy": 0.6678819706042608, + "num_tokens": 2819045859.0, + "step": 16808 + }, + { + "entropy": 1.7114491661389668, + "epoch": 1.8465573590398505, + "grad_norm": 0.6937119960784912, + "learning_rate": 2.2769201704618895e-06, + "loss": 1.3054, + "mean_token_accuracy": 0.6675901015599569, + "num_tokens": 2819189812.0, + "step": 16809 + }, + { + "entropy": 1.7245989938577015, + "epoch": 1.8466672159512236, + "grad_norm": 0.8096246719360352, + "learning_rate": 2.2765261865227795e-06, + "loss": 1.3121, + "mean_token_accuracy": 0.661870464682579, + "num_tokens": 2819297022.0, + "step": 16810 + }, + { + "entropy": 1.7037302354971569, + "epoch": 1.8467770728625965, + "grad_norm": 0.7414513230323792, + "learning_rate": 2.2761324786776827e-06, + "loss": 1.2294, + "mean_token_accuracy": 0.6829250454902649, + "num_tokens": 2819415838.0, + "step": 16811 + }, + { + "entropy": 1.7018751204013824, + "epoch": 1.8468869297739694, + "grad_norm": 0.6822280287742615, + "learning_rate": 2.275739046939063e-06, + "loss": 1.4365, + "mean_token_accuracy": 0.6489651799201965, + "num_tokens": 2819582184.0, + "step": 16812 + }, + { + "entropy": 1.6578473349412282, + "epoch": 1.8469967866853425, + "grad_norm": 0.7063673734664917, + "learning_rate": 2.275345891319372e-06, + "loss": 1.2741, + "mean_token_accuracy": 0.6733155796925226, + "num_tokens": 2819737015.0, + "step": 16813 + }, + { + "entropy": 1.7193239827950795, + "epoch": 1.8471066435967152, + "grad_norm": 0.6380773782730103, + "learning_rate": 2.2749530118310504e-06, + "loss": 1.4591, + "mean_token_accuracy": 0.6509887427091599, + "num_tokens": 2819935004.0, + "step": 16814 + }, + { + "entropy": 1.7366726497809093, + "epoch": 1.8472165005080883, + "grad_norm": 0.6270143985748291, + "learning_rate": 2.274560408486535e-06, + "loss": 1.5331, + "mean_token_accuracy": 0.6440207809209824, + "num_tokens": 2820141194.0, + "step": 16815 + }, + { + "entropy": 1.6627205908298492, + "epoch": 1.8473263574194612, + "grad_norm": 0.6391332149505615, + "learning_rate": 2.2741680812982525e-06, + "loss": 1.3179, + "mean_token_accuracy": 0.6662083069483439, + "num_tokens": 2820333411.0, + "step": 16816 + }, + { + "entropy": 1.6978925466537476, + "epoch": 1.847436214330834, + "grad_norm": 0.6449623703956604, + "learning_rate": 2.2737760302786165e-06, + "loss": 1.383, + "mean_token_accuracy": 0.6534488449494044, + "num_tokens": 2820497197.0, + "step": 16817 + }, + { + "entropy": 1.6958427727222443, + "epoch": 1.8475460712422072, + "grad_norm": 0.7582001686096191, + "learning_rate": 2.273384255440037e-06, + "loss": 1.2255, + "mean_token_accuracy": 0.6779115696748098, + "num_tokens": 2820615677.0, + "step": 16818 + }, + { + "entropy": 1.668468713760376, + "epoch": 1.8476559281535798, + "grad_norm": 0.7352595925331116, + "learning_rate": 2.2729927567949147e-06, + "loss": 1.2167, + "mean_token_accuracy": 0.6819255699714025, + "num_tokens": 2820735125.0, + "step": 16819 + }, + { + "entropy": 1.7504223088423412, + "epoch": 1.847765785064953, + "grad_norm": 0.653083086013794, + "learning_rate": 2.272601534355638e-06, + "loss": 1.478, + "mean_token_accuracy": 0.640269880493482, + "num_tokens": 2820924373.0, + "step": 16820 + }, + { + "entropy": 1.744905153910319, + "epoch": 1.8478756419763258, + "grad_norm": 0.808557391166687, + "learning_rate": 2.27221058813459e-06, + "loss": 1.3103, + "mean_token_accuracy": 0.6747282495101293, + "num_tokens": 2821111140.0, + "step": 16821 + }, + { + "entropy": 1.6883414487044017, + "epoch": 1.8479854988876987, + "grad_norm": 0.8405027985572815, + "learning_rate": 2.271819918144145e-06, + "loss": 1.3422, + "mean_token_accuracy": 0.6721090773741404, + "num_tokens": 2821233106.0, + "step": 16822 + }, + { + "entropy": 1.6892358760039012, + "epoch": 1.8480953557990718, + "grad_norm": 0.5664523243904114, + "learning_rate": 2.2714295243966663e-06, + "loss": 1.4374, + "mean_token_accuracy": 0.6468595862388611, + "num_tokens": 2821409664.0, + "step": 16823 + }, + { + "entropy": 1.7041309575239818, + "epoch": 1.8482052127104447, + "grad_norm": 0.7229970097541809, + "learning_rate": 2.2710394069045096e-06, + "loss": 1.4368, + "mean_token_accuracy": 0.6511443008979162, + "num_tokens": 2821617059.0, + "step": 16824 + }, + { + "entropy": 1.7157021065553029, + "epoch": 1.8483150696218176, + "grad_norm": 0.6895220279693604, + "learning_rate": 2.270649565680023e-06, + "loss": 1.5049, + "mean_token_accuracy": 0.6378757754961649, + "num_tokens": 2821809999.0, + "step": 16825 + }, + { + "entropy": 1.6815617382526398, + "epoch": 1.8484249265331907, + "grad_norm": 0.6916029453277588, + "learning_rate": 2.270260000735543e-06, + "loss": 1.4192, + "mean_token_accuracy": 0.6669404208660126, + "num_tokens": 2821991768.0, + "step": 16826 + }, + { + "entropy": 1.7499388257662456, + "epoch": 1.8485347834445633, + "grad_norm": 0.6223210096359253, + "learning_rate": 2.2698707120834e-06, + "loss": 1.374, + "mean_token_accuracy": 0.6528652707735697, + "num_tokens": 2822148133.0, + "step": 16827 + }, + { + "entropy": 1.6805897454420726, + "epoch": 1.8486446403559365, + "grad_norm": 2.376575231552124, + "learning_rate": 2.269481699735918e-06, + "loss": 1.1966, + "mean_token_accuracy": 0.6810894310474396, + "num_tokens": 2822351860.0, + "step": 16828 + }, + { + "entropy": 1.6760378777980804, + "epoch": 1.8487544972673093, + "grad_norm": 0.6722053289413452, + "learning_rate": 2.269092963705404e-06, + "loss": 1.3251, + "mean_token_accuracy": 0.6669818659623464, + "num_tokens": 2822522055.0, + "step": 16829 + }, + { + "entropy": 1.6548854509989421, + "epoch": 1.8488643541786822, + "grad_norm": 0.6542387008666992, + "learning_rate": 2.2687045040041625e-06, + "loss": 1.2904, + "mean_token_accuracy": 0.6669500768184662, + "num_tokens": 2822682578.0, + "step": 16830 + }, + { + "entropy": 1.6649717092514038, + "epoch": 1.8489742110900553, + "grad_norm": 0.69137042760849, + "learning_rate": 2.2683163206444903e-06, + "loss": 1.3382, + "mean_token_accuracy": 0.6678586552540461, + "num_tokens": 2822822417.0, + "step": 16831 + }, + { + "entropy": 1.6161488095919292, + "epoch": 1.849084068001428, + "grad_norm": 0.7891423106193542, + "learning_rate": 2.2679284136386717e-06, + "loss": 1.4021, + "mean_token_accuracy": 0.6629961331685384, + "num_tokens": 2823043372.0, + "step": 16832 + }, + { + "entropy": 1.6671480735143025, + "epoch": 1.849193924912801, + "grad_norm": 0.5742250680923462, + "learning_rate": 2.267540782998984e-06, + "loss": 1.4551, + "mean_token_accuracy": 0.6415112614631653, + "num_tokens": 2823234593.0, + "step": 16833 + }, + { + "entropy": 1.7150403559207916, + "epoch": 1.849303781824174, + "grad_norm": 0.6328002214431763, + "learning_rate": 2.2671534287376955e-06, + "loss": 1.3687, + "mean_token_accuracy": 0.6589196075995764, + "num_tokens": 2823412259.0, + "step": 16834 + }, + { + "entropy": 1.6628845036029816, + "epoch": 1.8494136387355469, + "grad_norm": 0.6902245879173279, + "learning_rate": 2.2667663508670654e-06, + "loss": 1.3141, + "mean_token_accuracy": 0.6782469848791758, + "num_tokens": 2823563936.0, + "step": 16835 + }, + { + "entropy": 1.778301070133845, + "epoch": 1.84952349564692, + "grad_norm": 0.6464490294456482, + "learning_rate": 2.266379549399346e-06, + "loss": 1.4777, + "mean_token_accuracy": 0.6514505942662557, + "num_tokens": 2823785146.0, + "step": 16836 + }, + { + "entropy": 1.6853400766849518, + "epoch": 1.8496333525582929, + "grad_norm": 0.7404756546020508, + "learning_rate": 2.265993024346779e-06, + "loss": 1.3938, + "mean_token_accuracy": 0.6646452844142914, + "num_tokens": 2823948844.0, + "step": 16837 + }, + { + "entropy": 1.7575759092966716, + "epoch": 1.8497432094696658, + "grad_norm": 0.9101560115814209, + "learning_rate": 2.2656067757215955e-06, + "loss": 1.6004, + "mean_token_accuracy": 0.6494082659482956, + "num_tokens": 2824102594.0, + "step": 16838 + }, + { + "entropy": 1.705398013194402, + "epoch": 1.8498530663810389, + "grad_norm": 0.691576361656189, + "learning_rate": 2.2652208035360216e-06, + "loss": 1.5896, + "mean_token_accuracy": 0.6335019121567408, + "num_tokens": 2824300458.0, + "step": 16839 + }, + { + "entropy": 1.7217795650164287, + "epoch": 1.8499629232924115, + "grad_norm": 0.6099857091903687, + "learning_rate": 2.2648351078022756e-06, + "loss": 1.3593, + "mean_token_accuracy": 0.6655921290318171, + "num_tokens": 2824478345.0, + "step": 16840 + }, + { + "entropy": 1.6977481245994568, + "epoch": 1.8500727802037846, + "grad_norm": 0.7489005327224731, + "learning_rate": 2.2644496885325602e-06, + "loss": 1.3109, + "mean_token_accuracy": 0.6732942511638006, + "num_tokens": 2824614208.0, + "step": 16841 + }, + { + "entropy": 1.6951703131198883, + "epoch": 1.8501826371151575, + "grad_norm": 0.6727724671363831, + "learning_rate": 2.2640645457390757e-06, + "loss": 1.3008, + "mean_token_accuracy": 0.6589942077795664, + "num_tokens": 2824761514.0, + "step": 16842 + }, + { + "entropy": 1.6436572670936584, + "epoch": 1.8502924940265304, + "grad_norm": 0.6955944895744324, + "learning_rate": 2.2636796794340134e-06, + "loss": 1.2896, + "mean_token_accuracy": 0.673475960890452, + "num_tokens": 2824926126.0, + "step": 16843 + }, + { + "entropy": 1.7151092290878296, + "epoch": 1.8504023509379035, + "grad_norm": 0.6162389516830444, + "learning_rate": 2.2632950896295524e-06, + "loss": 1.3972, + "mean_token_accuracy": 0.6557242920001348, + "num_tokens": 2825105477.0, + "step": 16844 + }, + { + "entropy": 1.7381452520688374, + "epoch": 1.8505122078492762, + "grad_norm": 0.5753760933876038, + "learning_rate": 2.262910776337863e-06, + "loss": 1.4604, + "mean_token_accuracy": 0.6351420283317566, + "num_tokens": 2825303396.0, + "step": 16845 + }, + { + "entropy": 1.693215678135554, + "epoch": 1.8506220647606493, + "grad_norm": 0.8320888876914978, + "learning_rate": 2.2625267395711124e-06, + "loss": 1.3931, + "mean_token_accuracy": 0.6597619901100794, + "num_tokens": 2825511715.0, + "step": 16846 + }, + { + "entropy": 1.772448907295863, + "epoch": 1.8507319216720222, + "grad_norm": 0.6900238990783691, + "learning_rate": 2.2621429793414513e-06, + "loss": 1.2598, + "mean_token_accuracy": 0.6730435639619827, + "num_tokens": 2825660163.0, + "step": 16847 + }, + { + "entropy": 1.7407717903455098, + "epoch": 1.850841778583395, + "grad_norm": 0.6229955554008484, + "learning_rate": 2.26175949566103e-06, + "loss": 1.3992, + "mean_token_accuracy": 0.6651196181774139, + "num_tokens": 2825812055.0, + "step": 16848 + }, + { + "entropy": 1.6535049378871918, + "epoch": 1.8509516354947682, + "grad_norm": 0.8824671506881714, + "learning_rate": 2.261376288541982e-06, + "loss": 1.471, + "mean_token_accuracy": 0.6661679844061533, + "num_tokens": 2825974645.0, + "step": 16849 + }, + { + "entropy": 1.7141542633374531, + "epoch": 1.851061492406141, + "grad_norm": 0.7158608436584473, + "learning_rate": 2.2609933579964364e-06, + "loss": 1.3092, + "mean_token_accuracy": 0.6757365266482035, + "num_tokens": 2826109915.0, + "step": 16850 + }, + { + "entropy": 1.684940109650294, + "epoch": 1.851171349317514, + "grad_norm": 0.7266584634780884, + "learning_rate": 2.260610704036514e-06, + "loss": 1.4043, + "mean_token_accuracy": 0.6563446720441183, + "num_tokens": 2826276894.0, + "step": 16851 + }, + { + "entropy": 1.6518448789914448, + "epoch": 1.851281206228887, + "grad_norm": 0.8699021935462952, + "learning_rate": 2.2602283266743242e-06, + "loss": 1.235, + "mean_token_accuracy": 0.6798707942167918, + "num_tokens": 2826413789.0, + "step": 16852 + }, + { + "entropy": 1.7219206094741821, + "epoch": 1.8513910631402597, + "grad_norm": 0.6562429070472717, + "learning_rate": 2.25984622592197e-06, + "loss": 1.3031, + "mean_token_accuracy": 0.675565222899119, + "num_tokens": 2826543479.0, + "step": 16853 + }, + { + "entropy": 1.6323307752609253, + "epoch": 1.8515009200516328, + "grad_norm": 0.6604292988777161, + "learning_rate": 2.259464401791544e-06, + "loss": 1.3841, + "mean_token_accuracy": 0.6637335220972697, + "num_tokens": 2826681051.0, + "step": 16854 + }, + { + "entropy": 1.7547399997711182, + "epoch": 1.8516107769630057, + "grad_norm": 0.6812036633491516, + "learning_rate": 2.25908285429513e-06, + "loss": 1.5095, + "mean_token_accuracy": 0.6496999114751816, + "num_tokens": 2826877453.0, + "step": 16855 + }, + { + "entropy": 1.756416380405426, + "epoch": 1.8517206338743786, + "grad_norm": 0.5992780327796936, + "learning_rate": 2.2587015834448066e-06, + "loss": 1.3727, + "mean_token_accuracy": 0.6700999438762665, + "num_tokens": 2827036356.0, + "step": 16856 + }, + { + "entropy": 1.7250816226005554, + "epoch": 1.8518304907857517, + "grad_norm": 0.6736693978309631, + "learning_rate": 2.2583205892526395e-06, + "loss": 1.3946, + "mean_token_accuracy": 0.6623944640159607, + "num_tokens": 2827218750.0, + "step": 16857 + }, + { + "entropy": 1.746427297592163, + "epoch": 1.8519403476971243, + "grad_norm": 0.5987364053726196, + "learning_rate": 2.2579398717306853e-06, + "loss": 1.4286, + "mean_token_accuracy": 0.6408430685599645, + "num_tokens": 2827429105.0, + "step": 16858 + }, + { + "entropy": 1.7429214417934418, + "epoch": 1.8520502046084975, + "grad_norm": 0.6848793625831604, + "learning_rate": 2.257559430890994e-06, + "loss": 1.4197, + "mean_token_accuracy": 0.6512637386719385, + "num_tokens": 2827607457.0, + "step": 16859 + }, + { + "entropy": 1.6617528994878132, + "epoch": 1.8521600615198703, + "grad_norm": 0.6407644748687744, + "learning_rate": 2.25717926674561e-06, + "loss": 1.2838, + "mean_token_accuracy": 0.6738065630197525, + "num_tokens": 2827740068.0, + "step": 16860 + }, + { + "entropy": 1.6476930975914001, + "epoch": 1.8522699184312432, + "grad_norm": 0.587350070476532, + "learning_rate": 2.2567993793065612e-06, + "loss": 1.5079, + "mean_token_accuracy": 0.6454335004091263, + "num_tokens": 2827970706.0, + "step": 16861 + }, + { + "entropy": 1.6667810281117756, + "epoch": 1.8523797753426163, + "grad_norm": 0.6816261410713196, + "learning_rate": 2.2564197685858718e-06, + "loss": 1.3844, + "mean_token_accuracy": 0.6544611503680547, + "num_tokens": 2828133482.0, + "step": 16862 + }, + { + "entropy": 1.7026315033435822, + "epoch": 1.8524896322539892, + "grad_norm": 0.7092069387435913, + "learning_rate": 2.2560404345955573e-06, + "loss": 1.421, + "mean_token_accuracy": 0.6365664452314377, + "num_tokens": 2828374569.0, + "step": 16863 + }, + { + "entropy": 1.7290400266647339, + "epoch": 1.852599489165362, + "grad_norm": 0.6336418986320496, + "learning_rate": 2.2556613773476234e-06, + "loss": 1.3831, + "mean_token_accuracy": 0.6517727623383204, + "num_tokens": 2828523922.0, + "step": 16864 + }, + { + "entropy": 1.7043097118536632, + "epoch": 1.8527093460767352, + "grad_norm": 0.7059993147850037, + "learning_rate": 2.255282596854065e-06, + "loss": 1.3474, + "mean_token_accuracy": 0.6678043107191721, + "num_tokens": 2828669505.0, + "step": 16865 + }, + { + "entropy": 1.7145764529705048, + "epoch": 1.8528192029881079, + "grad_norm": 0.7578869462013245, + "learning_rate": 2.254904093126874e-06, + "loss": 1.4085, + "mean_token_accuracy": 0.6547698179880778, + "num_tokens": 2828791241.0, + "step": 16866 + }, + { + "entropy": 1.762321561574936, + "epoch": 1.852929059899481, + "grad_norm": 0.7735615372657776, + "learning_rate": 2.2545258661780266e-06, + "loss": 1.5863, + "mean_token_accuracy": 0.6380794048309326, + "num_tokens": 2828975921.0, + "step": 16867 + }, + { + "entropy": 1.69214462240537, + "epoch": 1.8530389168108539, + "grad_norm": 0.7045040130615234, + "learning_rate": 2.2541479160194944e-06, + "loss": 1.4513, + "mean_token_accuracy": 0.6603608429431915, + "num_tokens": 2829140958.0, + "step": 16868 + }, + { + "entropy": 1.662009169658025, + "epoch": 1.8531487737222267, + "grad_norm": 0.8396393060684204, + "learning_rate": 2.2537702426632405e-06, + "loss": 1.229, + "mean_token_accuracy": 0.6779507348934809, + "num_tokens": 2829246179.0, + "step": 16869 + }, + { + "entropy": 1.7280895511309307, + "epoch": 1.8532586306335999, + "grad_norm": 0.7660303115844727, + "learning_rate": 2.2533928461212163e-06, + "loss": 1.3047, + "mean_token_accuracy": 0.6635698924462, + "num_tokens": 2829424054.0, + "step": 16870 + }, + { + "entropy": 1.6450142761071522, + "epoch": 1.8533684875449725, + "grad_norm": 0.6026352047920227, + "learning_rate": 2.2530157264053683e-06, + "loss": 1.4684, + "mean_token_accuracy": 0.6385354151328405, + "num_tokens": 2829608694.0, + "step": 16871 + }, + { + "entropy": 1.7429430484771729, + "epoch": 1.8534783444563456, + "grad_norm": 0.7299525141716003, + "learning_rate": 2.252638883527631e-06, + "loss": 1.4044, + "mean_token_accuracy": 0.6550277421871821, + "num_tokens": 2829770875.0, + "step": 16872 + }, + { + "entropy": 1.7349806527296703, + "epoch": 1.8535882013677185, + "grad_norm": 0.7500406503677368, + "learning_rate": 2.252262317499931e-06, + "loss": 1.3102, + "mean_token_accuracy": 0.6578699747721354, + "num_tokens": 2829947693.0, + "step": 16873 + }, + { + "entropy": 1.7390219668547313, + "epoch": 1.8536980582790914, + "grad_norm": 0.7751812934875488, + "learning_rate": 2.2518860283341864e-06, + "loss": 1.5918, + "mean_token_accuracy": 0.6272272417942683, + "num_tokens": 2830153181.0, + "step": 16874 + }, + { + "entropy": 1.7201407651106517, + "epoch": 1.8538079151904645, + "grad_norm": 0.7094139456748962, + "learning_rate": 2.251510016042308e-06, + "loss": 1.5539, + "mean_token_accuracy": 0.6450007905562719, + "num_tokens": 2830334049.0, + "step": 16875 + }, + { + "entropy": 1.736316164334615, + "epoch": 1.8539177721018374, + "grad_norm": 0.6322393417358398, + "learning_rate": 2.251134280636195e-06, + "loss": 1.2949, + "mean_token_accuracy": 0.675666610399882, + "num_tokens": 2830485303.0, + "step": 16876 + }, + { + "entropy": 1.729094882806142, + "epoch": 1.8540276290132103, + "grad_norm": 0.7388508319854736, + "learning_rate": 2.25075882212774e-06, + "loss": 1.383, + "mean_token_accuracy": 0.6609533528486887, + "num_tokens": 2830635382.0, + "step": 16877 + }, + { + "entropy": 1.682877242565155, + "epoch": 1.8541374859245834, + "grad_norm": 0.6518582105636597, + "learning_rate": 2.2503836405288256e-06, + "loss": 1.4053, + "mean_token_accuracy": 0.6476947963237762, + "num_tokens": 2830843164.0, + "step": 16878 + }, + { + "entropy": 1.6826065182685852, + "epoch": 1.854247342835956, + "grad_norm": 0.6691297888755798, + "learning_rate": 2.250008735851325e-06, + "loss": 1.3046, + "mean_token_accuracy": 0.6775392790635427, + "num_tokens": 2830999778.0, + "step": 16879 + }, + { + "entropy": 1.661271055539449, + "epoch": 1.8543571997473292, + "grad_norm": 0.7119371294975281, + "learning_rate": 2.2496341081071066e-06, + "loss": 1.3536, + "mean_token_accuracy": 0.657180925210317, + "num_tokens": 2831151781.0, + "step": 16880 + }, + { + "entropy": 1.6969236334164937, + "epoch": 1.854467056658702, + "grad_norm": 0.6304360628128052, + "learning_rate": 2.249259757308026e-06, + "loss": 1.4231, + "mean_token_accuracy": 0.6411556551853815, + "num_tokens": 2831342014.0, + "step": 16881 + }, + { + "entropy": 1.7519350747267406, + "epoch": 1.854576913570075, + "grad_norm": 0.7681441307067871, + "learning_rate": 2.248885683465929e-06, + "loss": 1.2912, + "mean_token_accuracy": 0.6645476470390955, + "num_tokens": 2831444174.0, + "step": 16882 + }, + { + "entropy": 1.7026408016681671, + "epoch": 1.854686770481448, + "grad_norm": 0.6976202726364136, + "learning_rate": 2.248511886592658e-06, + "loss": 1.3042, + "mean_token_accuracy": 0.6687952727079391, + "num_tokens": 2831566636.0, + "step": 16883 + }, + { + "entropy": 1.6760593354701996, + "epoch": 1.8547966273928207, + "grad_norm": 0.6563632488250732, + "learning_rate": 2.248138366700043e-06, + "loss": 1.4318, + "mean_token_accuracy": 0.6755139579375585, + "num_tokens": 2831726930.0, + "step": 16884 + }, + { + "entropy": 1.724344511826833, + "epoch": 1.8549064843041938, + "grad_norm": 0.7031118869781494, + "learning_rate": 2.247765123799904e-06, + "loss": 1.3251, + "mean_token_accuracy": 0.6661837746699651, + "num_tokens": 2831896095.0, + "step": 16885 + }, + { + "entropy": 1.727848341067632, + "epoch": 1.8550163412155667, + "grad_norm": 0.5700660943984985, + "learning_rate": 2.247392157904055e-06, + "loss": 1.4622, + "mean_token_accuracy": 0.642038439710935, + "num_tokens": 2832131149.0, + "step": 16886 + }, + { + "entropy": 1.6446122825145721, + "epoch": 1.8551261981269396, + "grad_norm": 0.6341161727905273, + "learning_rate": 2.2470194690243006e-06, + "loss": 1.3984, + "mean_token_accuracy": 0.6593250582615534, + "num_tokens": 2832307378.0, + "step": 16887 + }, + { + "entropy": 1.6699632306893666, + "epoch": 1.8552360550383127, + "grad_norm": 0.5614050626754761, + "learning_rate": 2.2466470571724357e-06, + "loss": 1.3048, + "mean_token_accuracy": 0.6609620600938797, + "num_tokens": 2832519981.0, + "step": 16888 + }, + { + "entropy": 1.6194656590620677, + "epoch": 1.8553459119496856, + "grad_norm": 0.5817786455154419, + "learning_rate": 2.2462749223602464e-06, + "loss": 1.4775, + "mean_token_accuracy": 0.6533914605776469, + "num_tokens": 2832712716.0, + "step": 16889 + }, + { + "entropy": 1.7114621301492055, + "epoch": 1.8554557688610585, + "grad_norm": 0.7644500136375427, + "learning_rate": 2.2459030645995118e-06, + "loss": 1.3141, + "mean_token_accuracy": 0.661896139383316, + "num_tokens": 2832835841.0, + "step": 16890 + }, + { + "entropy": 1.661112666130066, + "epoch": 1.8555656257724316, + "grad_norm": 0.8860819935798645, + "learning_rate": 2.245531483902e-06, + "loss": 1.1732, + "mean_token_accuracy": 0.688821072379748, + "num_tokens": 2832959778.0, + "step": 16891 + }, + { + "entropy": 1.6903029382228851, + "epoch": 1.8556754826838042, + "grad_norm": 0.6567397713661194, + "learning_rate": 2.245160180279473e-06, + "loss": 1.4008, + "mean_token_accuracy": 0.6515764991442362, + "num_tokens": 2833141986.0, + "step": 16892 + }, + { + "entropy": 1.7396831810474396, + "epoch": 1.8557853395951773, + "grad_norm": 0.6399162411689758, + "learning_rate": 2.244789153743681e-06, + "loss": 1.5221, + "mean_token_accuracy": 0.6352566480636597, + "num_tokens": 2833383901.0, + "step": 16893 + }, + { + "entropy": 1.6990015904108684, + "epoch": 1.8558951965065502, + "grad_norm": 0.6782881617546082, + "learning_rate": 2.2444184043063666e-06, + "loss": 1.3644, + "mean_token_accuracy": 0.6583975255489349, + "num_tokens": 2833543551.0, + "step": 16894 + }, + { + "entropy": 1.798978457848231, + "epoch": 1.856005053417923, + "grad_norm": 0.7053307294845581, + "learning_rate": 2.2440479319792636e-06, + "loss": 1.4234, + "mean_token_accuracy": 0.6484651267528534, + "num_tokens": 2833660767.0, + "step": 16895 + }, + { + "entropy": 1.6739828785260518, + "epoch": 1.8561149103292962, + "grad_norm": 0.741791844367981, + "learning_rate": 2.2436777367741004e-06, + "loss": 1.3366, + "mean_token_accuracy": 0.6574702759583791, + "num_tokens": 2833816561.0, + "step": 16896 + }, + { + "entropy": 1.7440234621365864, + "epoch": 1.8562247672406689, + "grad_norm": 0.7840536832809448, + "learning_rate": 2.2433078187025897e-06, + "loss": 1.3036, + "mean_token_accuracy": 0.6595564881960551, + "num_tokens": 2833970261.0, + "step": 16897 + }, + { + "entropy": 1.7112967669963837, + "epoch": 1.856334624152042, + "grad_norm": 0.6875016093254089, + "learning_rate": 2.24293817777644e-06, + "loss": 1.5245, + "mean_token_accuracy": 0.655097077290217, + "num_tokens": 2834143895.0, + "step": 16898 + }, + { + "entropy": 1.7350221673647563, + "epoch": 1.8564444810634149, + "grad_norm": 0.6510404944419861, + "learning_rate": 2.2425688140073515e-06, + "loss": 1.3634, + "mean_token_accuracy": 0.6634075343608856, + "num_tokens": 2834282750.0, + "step": 16899 + }, + { + "entropy": 1.7747245331605275, + "epoch": 1.8565543379747877, + "grad_norm": 0.7304637432098389, + "learning_rate": 2.2421997274070153e-06, + "loss": 1.3861, + "mean_token_accuracy": 0.6545447160800298, + "num_tokens": 2834400058.0, + "step": 16900 + }, + { + "entropy": 1.6580698291460674, + "epoch": 1.8566641948861609, + "grad_norm": 0.6355282664299011, + "learning_rate": 2.2418309179871094e-06, + "loss": 1.3236, + "mean_token_accuracy": 0.6701503843069077, + "num_tokens": 2834567340.0, + "step": 16901 + }, + { + "entropy": 1.7195066312948863, + "epoch": 1.8567740517975337, + "grad_norm": 0.6729101538658142, + "learning_rate": 2.2414623857593086e-06, + "loss": 1.329, + "mean_token_accuracy": 0.6680616289377213, + "num_tokens": 2834701276.0, + "step": 16902 + }, + { + "entropy": 1.7093308369318645, + "epoch": 1.8568839087089066, + "grad_norm": 0.735205352306366, + "learning_rate": 2.241094130735277e-06, + "loss": 1.5407, + "mean_token_accuracy": 0.6620696832736334, + "num_tokens": 2834860563.0, + "step": 16903 + }, + { + "entropy": 1.6724826991558075, + "epoch": 1.8569937656202797, + "grad_norm": 0.6093858480453491, + "learning_rate": 2.2407261529266697e-06, + "loss": 1.4459, + "mean_token_accuracy": 0.6603935311237971, + "num_tokens": 2835024875.0, + "step": 16904 + }, + { + "entropy": 1.7351752022902172, + "epoch": 1.8571036225316524, + "grad_norm": 0.716964840888977, + "learning_rate": 2.240358452345133e-06, + "loss": 1.4218, + "mean_token_accuracy": 0.6472053527832031, + "num_tokens": 2835160577.0, + "step": 16905 + }, + { + "entropy": 1.6534366210301716, + "epoch": 1.8572134794430255, + "grad_norm": 0.6837955713272095, + "learning_rate": 2.2399910290023024e-06, + "loss": 1.3871, + "mean_token_accuracy": 0.6537323395411173, + "num_tokens": 2835300716.0, + "step": 16906 + }, + { + "entropy": 1.6657463312149048, + "epoch": 1.8573233363543984, + "grad_norm": 0.6619056463241577, + "learning_rate": 2.2396238829098092e-06, + "loss": 1.3247, + "mean_token_accuracy": 0.6633240481217703, + "num_tokens": 2835477286.0, + "step": 16907 + }, + { + "entropy": 1.7382064660390217, + "epoch": 1.8574331932657713, + "grad_norm": 0.9045251607894897, + "learning_rate": 2.2392570140792743e-06, + "loss": 1.4923, + "mean_token_accuracy": 0.6527342349290848, + "num_tokens": 2835621855.0, + "step": 16908 + }, + { + "entropy": 1.746301809946696, + "epoch": 1.8575430501771444, + "grad_norm": 0.8234473466873169, + "learning_rate": 2.2388904225223047e-06, + "loss": 1.4924, + "mean_token_accuracy": 0.6496127992868423, + "num_tokens": 2835796980.0, + "step": 16909 + }, + { + "entropy": 1.6744904418786366, + "epoch": 1.857652907088517, + "grad_norm": 0.6349613070487976, + "learning_rate": 2.2385241082505062e-06, + "loss": 1.3036, + "mean_token_accuracy": 0.6642439812421799, + "num_tokens": 2835991748.0, + "step": 16910 + }, + { + "entropy": 1.7444684902826946, + "epoch": 1.8577627639998902, + "grad_norm": 0.6656630635261536, + "learning_rate": 2.2381580712754717e-06, + "loss": 1.4149, + "mean_token_accuracy": 0.6564318190018336, + "num_tokens": 2836177838.0, + "step": 16911 + }, + { + "entropy": 1.7117125988006592, + "epoch": 1.857872620911263, + "grad_norm": 0.6614863276481628, + "learning_rate": 2.237792311608787e-06, + "loss": 1.5543, + "mean_token_accuracy": 0.650190144777298, + "num_tokens": 2836380864.0, + "step": 16912 + }, + { + "entropy": 1.70883509516716, + "epoch": 1.857982477822636, + "grad_norm": 0.6626452207565308, + "learning_rate": 2.237426829262027e-06, + "loss": 1.3597, + "mean_token_accuracy": 0.6688442379236221, + "num_tokens": 2836523172.0, + "step": 16913 + }, + { + "entropy": 1.7223861614863079, + "epoch": 1.858092334734009, + "grad_norm": 1.0268768072128296, + "learning_rate": 2.237061624246758e-06, + "loss": 1.3974, + "mean_token_accuracy": 0.6556852708260218, + "num_tokens": 2836659552.0, + "step": 16914 + }, + { + "entropy": 1.6183397471904755, + "epoch": 1.858202191645382, + "grad_norm": 0.6055456399917603, + "learning_rate": 2.2366966965745403e-06, + "loss": 1.3816, + "mean_token_accuracy": 0.6664853493372599, + "num_tokens": 2836851444.0, + "step": 16915 + }, + { + "entropy": 1.723154256741206, + "epoch": 1.8583120485567548, + "grad_norm": 0.6002776622772217, + "learning_rate": 2.236332046256924e-06, + "loss": 1.3431, + "mean_token_accuracy": 0.6692459831635157, + "num_tokens": 2837003681.0, + "step": 16916 + }, + { + "entropy": 1.7682878176371257, + "epoch": 1.858421905468128, + "grad_norm": 0.6165063977241516, + "learning_rate": 2.2359676733054496e-06, + "loss": 1.4432, + "mean_token_accuracy": 0.6513163695732752, + "num_tokens": 2837206049.0, + "step": 16917 + }, + { + "entropy": 1.745583325624466, + "epoch": 1.8585317623795006, + "grad_norm": 0.5849365592002869, + "learning_rate": 2.235603577731648e-06, + "loss": 1.5011, + "mean_token_accuracy": 0.6388779282569885, + "num_tokens": 2837413970.0, + "step": 16918 + }, + { + "entropy": 1.7166366577148438, + "epoch": 1.8586416192908737, + "grad_norm": 0.6849484443664551, + "learning_rate": 2.2352397595470453e-06, + "loss": 1.4588, + "mean_token_accuracy": 0.6550758282343546, + "num_tokens": 2837575965.0, + "step": 16919 + }, + { + "entropy": 1.6985012590885162, + "epoch": 1.8587514762022466, + "grad_norm": 0.6950430274009705, + "learning_rate": 2.2348762187631537e-06, + "loss": 1.3575, + "mean_token_accuracy": 0.6614127407471339, + "num_tokens": 2837715810.0, + "step": 16920 + }, + { + "entropy": 1.6506640315055847, + "epoch": 1.8588613331136195, + "grad_norm": 0.7237517237663269, + "learning_rate": 2.2345129553914805e-06, + "loss": 1.3152, + "mean_token_accuracy": 0.66397192577521, + "num_tokens": 2837839931.0, + "step": 16921 + }, + { + "entropy": 1.7041799624760945, + "epoch": 1.8589711900249926, + "grad_norm": 0.7405112981796265, + "learning_rate": 2.234149969443522e-06, + "loss": 1.5072, + "mean_token_accuracy": 0.6494031300147375, + "num_tokens": 2838000703.0, + "step": 16922 + }, + { + "entropy": 1.6308053533236186, + "epoch": 1.8590810469363652, + "grad_norm": 0.6319633722305298, + "learning_rate": 2.2337872609307677e-06, + "loss": 1.3862, + "mean_token_accuracy": 0.6686498373746872, + "num_tokens": 2838140080.0, + "step": 16923 + }, + { + "entropy": 1.7466503183046977, + "epoch": 1.8591909038477383, + "grad_norm": 0.6635008454322815, + "learning_rate": 2.233424829864696e-06, + "loss": 1.3945, + "mean_token_accuracy": 0.649022842446963, + "num_tokens": 2838274430.0, + "step": 16924 + }, + { + "entropy": 1.7209342022736867, + "epoch": 1.8593007607591112, + "grad_norm": 0.7904077172279358, + "learning_rate": 2.2330626762567784e-06, + "loss": 1.553, + "mean_token_accuracy": 0.6467588543891907, + "num_tokens": 2838492658.0, + "step": 16925 + }, + { + "entropy": 1.6345500747362773, + "epoch": 1.859410617670484, + "grad_norm": 0.7646093964576721, + "learning_rate": 2.2327008001184764e-06, + "loss": 1.5033, + "mean_token_accuracy": 0.643667072057724, + "num_tokens": 2838691595.0, + "step": 16926 + }, + { + "entropy": 1.692623883485794, + "epoch": 1.8595204745818572, + "grad_norm": 0.7697778940200806, + "learning_rate": 2.2323392014612425e-06, + "loss": 1.3278, + "mean_token_accuracy": 0.6717701901992162, + "num_tokens": 2838804890.0, + "step": 16927 + }, + { + "entropy": 1.676644762357076, + "epoch": 1.85963033149323, + "grad_norm": 0.6009611487388611, + "learning_rate": 2.2319778802965244e-06, + "loss": 1.463, + "mean_token_accuracy": 0.6408818513154984, + "num_tokens": 2839016654.0, + "step": 16928 + }, + { + "entropy": 1.719579428434372, + "epoch": 1.859740188404603, + "grad_norm": 0.6772399544715881, + "learning_rate": 2.2316168366357533e-06, + "loss": 1.4172, + "mean_token_accuracy": 0.6630838066339493, + "num_tokens": 2839199453.0, + "step": 16929 + }, + { + "entropy": 1.680406113465627, + "epoch": 1.859850045315976, + "grad_norm": 0.7438538074493408, + "learning_rate": 2.2312560704903586e-06, + "loss": 1.4936, + "mean_token_accuracy": 0.6705125272274017, + "num_tokens": 2839374638.0, + "step": 16930 + }, + { + "entropy": 1.6635006268819172, + "epoch": 1.8599599022273487, + "grad_norm": 0.6747929453849792, + "learning_rate": 2.230895581871759e-06, + "loss": 1.302, + "mean_token_accuracy": 0.6731831183036169, + "num_tokens": 2839525698.0, + "step": 16931 + }, + { + "entropy": 1.7203009327252705, + "epoch": 1.8600697591387219, + "grad_norm": 0.6208789348602295, + "learning_rate": 2.2305353707913624e-06, + "loss": 1.5573, + "mean_token_accuracy": 0.6403620640436808, + "num_tokens": 2839743378.0, + "step": 16932 + }, + { + "entropy": 1.7072515587011974, + "epoch": 1.8601796160500947, + "grad_norm": 0.6848557591438293, + "learning_rate": 2.230175437260569e-06, + "loss": 1.289, + "mean_token_accuracy": 0.6748117307821909, + "num_tokens": 2839860106.0, + "step": 16933 + }, + { + "entropy": 1.6722529927889507, + "epoch": 1.8602894729614676, + "grad_norm": 0.7470364570617676, + "learning_rate": 2.229815781290772e-06, + "loss": 1.3631, + "mean_token_accuracy": 0.6619627922773361, + "num_tokens": 2840008850.0, + "step": 16934 + }, + { + "entropy": 1.7064866026242573, + "epoch": 1.8603993298728407, + "grad_norm": 0.692008376121521, + "learning_rate": 2.229456402893352e-06, + "loss": 1.3543, + "mean_token_accuracy": 0.6580140540997187, + "num_tokens": 2840145150.0, + "step": 16935 + }, + { + "entropy": 1.6983463366826375, + "epoch": 1.8605091867842134, + "grad_norm": 0.6857158541679382, + "learning_rate": 2.2290973020796873e-06, + "loss": 1.3891, + "mean_token_accuracy": 0.6641733994086584, + "num_tokens": 2840296513.0, + "step": 16936 + }, + { + "entropy": 1.7579893469810486, + "epoch": 1.8606190436955865, + "grad_norm": 0.5688817501068115, + "learning_rate": 2.228738478861139e-06, + "loss": 1.4527, + "mean_token_accuracy": 0.631959984699885, + "num_tokens": 2840489377.0, + "step": 16937 + }, + { + "entropy": 1.7483516136805217, + "epoch": 1.8607289006069594, + "grad_norm": 0.6688814163208008, + "learning_rate": 2.228379933249066e-06, + "loss": 1.4025, + "mean_token_accuracy": 0.651861771941185, + "num_tokens": 2840638757.0, + "step": 16938 + }, + { + "entropy": 1.745933045943578, + "epoch": 1.8608387575183323, + "grad_norm": 0.6227981448173523, + "learning_rate": 2.2280216652548144e-06, + "loss": 1.3729, + "mean_token_accuracy": 0.6608076989650726, + "num_tokens": 2840812693.0, + "step": 16939 + }, + { + "entropy": 1.6749466558297474, + "epoch": 1.8609486144297054, + "grad_norm": 0.6650595664978027, + "learning_rate": 2.2276636748897264e-06, + "loss": 1.4298, + "mean_token_accuracy": 0.6458970904350281, + "num_tokens": 2840956606.0, + "step": 16940 + }, + { + "entropy": 1.7333874702453613, + "epoch": 1.8610584713410783, + "grad_norm": 0.6113640069961548, + "learning_rate": 2.227305962165129e-06, + "loss": 1.4095, + "mean_token_accuracy": 0.6591263363758723, + "num_tokens": 2841175897.0, + "step": 16941 + }, + { + "entropy": 1.7647278308868408, + "epoch": 1.8611683282524512, + "grad_norm": 0.9023246765136719, + "learning_rate": 2.2269485270923446e-06, + "loss": 1.3244, + "mean_token_accuracy": 0.6695207307736079, + "num_tokens": 2841295330.0, + "step": 16942 + }, + { + "entropy": 1.7428401311238606, + "epoch": 1.8612781851638243, + "grad_norm": 0.6869433522224426, + "learning_rate": 2.2265913696826865e-06, + "loss": 1.3092, + "mean_token_accuracy": 0.6689073791106542, + "num_tokens": 2841438192.0, + "step": 16943 + }, + { + "entropy": 1.7334860563278198, + "epoch": 1.861388042075197, + "grad_norm": 0.6899425983428955, + "learning_rate": 2.2262344899474585e-06, + "loss": 1.3344, + "mean_token_accuracy": 0.6681728015343348, + "num_tokens": 2841567347.0, + "step": 16944 + }, + { + "entropy": 1.7800839046637218, + "epoch": 1.86149789898657, + "grad_norm": 0.744773805141449, + "learning_rate": 2.225877887897954e-06, + "loss": 1.4977, + "mean_token_accuracy": 0.6642592052618662, + "num_tokens": 2841725960.0, + "step": 16945 + }, + { + "entropy": 1.6366903285185497, + "epoch": 1.861607755897943, + "grad_norm": 0.6069456934928894, + "learning_rate": 2.2255215635454618e-06, + "loss": 1.401, + "mean_token_accuracy": 0.6623266190290451, + "num_tokens": 2841915773.0, + "step": 16946 + }, + { + "entropy": 1.6906556288401287, + "epoch": 1.8617176128093158, + "grad_norm": 0.5900170803070068, + "learning_rate": 2.225165516901257e-06, + "loss": 1.5347, + "mean_token_accuracy": 0.636528434852759, + "num_tokens": 2842126533.0, + "step": 16947 + }, + { + "entropy": 1.7210322221120198, + "epoch": 1.861827469720689, + "grad_norm": 0.6937727928161621, + "learning_rate": 2.2248097479766114e-06, + "loss": 1.4849, + "mean_token_accuracy": 0.651384433110555, + "num_tokens": 2842268373.0, + "step": 16948 + }, + { + "entropy": 1.6970987915992737, + "epoch": 1.8619373266320616, + "grad_norm": 0.8207261562347412, + "learning_rate": 2.224454256782783e-06, + "loss": 1.4079, + "mean_token_accuracy": 0.6684905638297399, + "num_tokens": 2842422883.0, + "step": 16949 + }, + { + "entropy": 1.7192309498786926, + "epoch": 1.8620471835434347, + "grad_norm": 0.7105966806411743, + "learning_rate": 2.2240990433310218e-06, + "loss": 1.3975, + "mean_token_accuracy": 0.6547664652268091, + "num_tokens": 2842586220.0, + "step": 16950 + }, + { + "entropy": 1.714976857105891, + "epoch": 1.8621570404548076, + "grad_norm": 0.6246412396430969, + "learning_rate": 2.2237441076325714e-06, + "loss": 1.4338, + "mean_token_accuracy": 0.652628536025683, + "num_tokens": 2842777503.0, + "step": 16951 + }, + { + "entropy": 1.6915887892246246, + "epoch": 1.8622668973661805, + "grad_norm": 0.6842101216316223, + "learning_rate": 2.223389449698666e-06, + "loss": 1.3436, + "mean_token_accuracy": 0.6701055020093918, + "num_tokens": 2842904821.0, + "step": 16952 + }, + { + "entropy": 1.6738732159137726, + "epoch": 1.8623767542775536, + "grad_norm": 0.7973695397377014, + "learning_rate": 2.2230350695405288e-06, + "loss": 1.3581, + "mean_token_accuracy": 0.6813636471827825, + "num_tokens": 2843019612.0, + "step": 16953 + }, + { + "entropy": 1.682248741388321, + "epoch": 1.8624866111889264, + "grad_norm": 0.6853655576705933, + "learning_rate": 2.222680967169377e-06, + "loss": 1.3193, + "mean_token_accuracy": 0.6603673497835795, + "num_tokens": 2843174426.0, + "step": 16954 + }, + { + "entropy": 1.7681555946667988, + "epoch": 1.8625964681002993, + "grad_norm": 0.7105289101600647, + "learning_rate": 2.2223271425964182e-06, + "loss": 1.3293, + "mean_token_accuracy": 0.67288438975811, + "num_tokens": 2843305957.0, + "step": 16955 + }, + { + "entropy": 1.7183941106001537, + "epoch": 1.8627063250116724, + "grad_norm": 0.7996242642402649, + "learning_rate": 2.22197359583285e-06, + "loss": 1.53, + "mean_token_accuracy": 0.6441441575686137, + "num_tokens": 2843524423.0, + "step": 16956 + }, + { + "entropy": 1.7308327456315358, + "epoch": 1.862816181923045, + "grad_norm": 0.7495532035827637, + "learning_rate": 2.2216203268898605e-06, + "loss": 1.3608, + "mean_token_accuracy": 0.6570025732119879, + "num_tokens": 2843657802.0, + "step": 16957 + }, + { + "entropy": 1.6802193820476532, + "epoch": 1.8629260388344182, + "grad_norm": 0.7395232915878296, + "learning_rate": 2.2212673357786333e-06, + "loss": 1.3952, + "mean_token_accuracy": 0.6501044581333796, + "num_tokens": 2843822977.0, + "step": 16958 + }, + { + "entropy": 1.706279416879018, + "epoch": 1.863035895745791, + "grad_norm": 0.7118804454803467, + "learning_rate": 2.220914622510339e-06, + "loss": 1.4068, + "mean_token_accuracy": 0.6517497350772222, + "num_tokens": 2843988396.0, + "step": 16959 + }, + { + "entropy": 1.6896512309710185, + "epoch": 1.863145752657164, + "grad_norm": 0.7015063166618347, + "learning_rate": 2.2205621870961405e-06, + "loss": 1.5505, + "mean_token_accuracy": 0.6469273467858633, + "num_tokens": 2844136156.0, + "step": 16960 + }, + { + "entropy": 1.6428396503130596, + "epoch": 1.863255609568537, + "grad_norm": 0.7962595820426941, + "learning_rate": 2.2202100295471937e-06, + "loss": 1.1938, + "mean_token_accuracy": 0.6869229475657145, + "num_tokens": 2844254113.0, + "step": 16961 + }, + { + "entropy": 1.707498123248418, + "epoch": 1.86336546647991, + "grad_norm": 0.6344748139381409, + "learning_rate": 2.219858149874642e-06, + "loss": 1.4643, + "mean_token_accuracy": 0.6413133492072424, + "num_tokens": 2844456777.0, + "step": 16962 + }, + { + "entropy": 1.6804148157437642, + "epoch": 1.8634753233912829, + "grad_norm": 0.6727264523506165, + "learning_rate": 2.219506548089623e-06, + "loss": 1.1781, + "mean_token_accuracy": 0.6860218544801077, + "num_tokens": 2844572149.0, + "step": 16963 + }, + { + "entropy": 1.7345021267731984, + "epoch": 1.8635851803026557, + "grad_norm": 0.7716752290725708, + "learning_rate": 2.219155224203268e-06, + "loss": 1.4514, + "mean_token_accuracy": 0.6558243483304977, + "num_tokens": 2844746134.0, + "step": 16964 + }, + { + "entropy": 1.717079867919286, + "epoch": 1.8636950372140286, + "grad_norm": 0.7007601857185364, + "learning_rate": 2.2188041782266905e-06, + "loss": 1.5099, + "mean_token_accuracy": 0.6400510122378668, + "num_tokens": 2844934759.0, + "step": 16965 + }, + { + "entropy": 1.6786939601103466, + "epoch": 1.8638048941254017, + "grad_norm": 0.6247925758361816, + "learning_rate": 2.2184534101710043e-06, + "loss": 1.3304, + "mean_token_accuracy": 0.6726222485303879, + "num_tokens": 2845083838.0, + "step": 16966 + }, + { + "entropy": 1.7275762955347698, + "epoch": 1.8639147510367746, + "grad_norm": 0.8030937910079956, + "learning_rate": 2.2181029200473123e-06, + "loss": 1.2345, + "mean_token_accuracy": 0.6929545601209005, + "num_tokens": 2845188998.0, + "step": 16967 + }, + { + "entropy": 1.7728230853875477, + "epoch": 1.8640246079481475, + "grad_norm": 0.7184717059135437, + "learning_rate": 2.217752707866704e-06, + "loss": 1.4425, + "mean_token_accuracy": 0.6444051365057627, + "num_tokens": 2845371280.0, + "step": 16968 + }, + { + "entropy": 1.75874129931132, + "epoch": 1.8641344648595206, + "grad_norm": 0.7662733793258667, + "learning_rate": 2.217402773640265e-06, + "loss": 1.4214, + "mean_token_accuracy": 0.6515029867490133, + "num_tokens": 2845525478.0, + "step": 16969 + }, + { + "entropy": 1.7064630488554637, + "epoch": 1.8642443217708933, + "grad_norm": 0.7449208498001099, + "learning_rate": 2.2170531173790722e-06, + "loss": 1.3713, + "mean_token_accuracy": 0.6680237799882889, + "num_tokens": 2845670438.0, + "step": 16970 + }, + { + "entropy": 1.6645330289999645, + "epoch": 1.8643541786822664, + "grad_norm": 0.713966429233551, + "learning_rate": 2.2167037390941892e-06, + "loss": 1.4005, + "mean_token_accuracy": 0.6721183756987253, + "num_tokens": 2845825623.0, + "step": 16971 + }, + { + "entropy": 1.7196373244126637, + "epoch": 1.8644640355936393, + "grad_norm": 0.7147789597511292, + "learning_rate": 2.2163546387966756e-06, + "loss": 1.5454, + "mean_token_accuracy": 0.6515597999095917, + "num_tokens": 2845991287.0, + "step": 16972 + }, + { + "entropy": 1.704675664504369, + "epoch": 1.8645738925050122, + "grad_norm": 0.6308305263519287, + "learning_rate": 2.21600581649758e-06, + "loss": 1.3732, + "mean_token_accuracy": 0.6472256034612656, + "num_tokens": 2846154390.0, + "step": 16973 + }, + { + "entropy": 1.6952514847119649, + "epoch": 1.8646837494163853, + "grad_norm": 0.8253968358039856, + "learning_rate": 2.2156572722079413e-06, + "loss": 1.2201, + "mean_token_accuracy": 0.6835501392682394, + "num_tokens": 2846297166.0, + "step": 16974 + }, + { + "entropy": 1.7401870787143707, + "epoch": 1.8647936063277581, + "grad_norm": 0.9213194847106934, + "learning_rate": 2.2153090059387926e-06, + "loss": 1.4246, + "mean_token_accuracy": 0.6445530652999878, + "num_tokens": 2846470925.0, + "step": 16975 + }, + { + "entropy": 1.7035265266895294, + "epoch": 1.864903463239131, + "grad_norm": 0.6574537754058838, + "learning_rate": 2.2149610177011547e-06, + "loss": 1.4512, + "mean_token_accuracy": 0.6518820325533549, + "num_tokens": 2846627363.0, + "step": 16976 + }, + { + "entropy": 1.6686862508455913, + "epoch": 1.865013320150504, + "grad_norm": 0.7016027569770813, + "learning_rate": 2.2146133075060412e-06, + "loss": 1.5151, + "mean_token_accuracy": 0.651968797047933, + "num_tokens": 2846810432.0, + "step": 16977 + }, + { + "entropy": 1.673084298769633, + "epoch": 1.8651231770618768, + "grad_norm": 0.5893659591674805, + "learning_rate": 2.2142658753644593e-06, + "loss": 1.449, + "mean_token_accuracy": 0.6452168524265289, + "num_tokens": 2847041747.0, + "step": 16978 + }, + { + "entropy": 1.6141287585099537, + "epoch": 1.86523303397325, + "grad_norm": 0.7347483038902283, + "learning_rate": 2.213918721287402e-06, + "loss": 1.2958, + "mean_token_accuracy": 0.6759419937928518, + "num_tokens": 2847193557.0, + "step": 16979 + }, + { + "entropy": 1.71195982893308, + "epoch": 1.8653428908846228, + "grad_norm": 0.7597905993461609, + "learning_rate": 2.2135718452858598e-06, + "loss": 1.4343, + "mean_token_accuracy": 0.6534530371427536, + "num_tokens": 2847395961.0, + "step": 16980 + }, + { + "entropy": 1.726847916841507, + "epoch": 1.8654527477959957, + "grad_norm": 0.751272976398468, + "learning_rate": 2.213225247370808e-06, + "loss": 1.4013, + "mean_token_accuracy": 0.6617669512828191, + "num_tokens": 2847577102.0, + "step": 16981 + }, + { + "entropy": 1.7482527395089467, + "epoch": 1.8655626047073688, + "grad_norm": 0.7296915054321289, + "learning_rate": 2.2128789275532175e-06, + "loss": 1.5129, + "mean_token_accuracy": 0.6335967232783636, + "num_tokens": 2847772021.0, + "step": 16982 + }, + { + "entropy": 1.6838180720806122, + "epoch": 1.8656724616187415, + "grad_norm": 0.5979762077331543, + "learning_rate": 2.2125328858440503e-06, + "loss": 1.5231, + "mean_token_accuracy": 0.6346350063880285, + "num_tokens": 2847965351.0, + "step": 16983 + }, + { + "entropy": 1.7451152900854747, + "epoch": 1.8657823185301146, + "grad_norm": 0.6949437856674194, + "learning_rate": 2.212187122254258e-06, + "loss": 1.3898, + "mean_token_accuracy": 0.6568313439687093, + "num_tokens": 2848141395.0, + "step": 16984 + }, + { + "entropy": 1.7283788720766704, + "epoch": 1.8658921754414874, + "grad_norm": 0.6480981111526489, + "learning_rate": 2.211841636794783e-06, + "loss": 1.4786, + "mean_token_accuracy": 0.6345923642317454, + "num_tokens": 2848375079.0, + "step": 16985 + }, + { + "entropy": 1.7344995041688283, + "epoch": 1.8660020323528603, + "grad_norm": 0.7044827342033386, + "learning_rate": 2.211496429476559e-06, + "loss": 1.4727, + "mean_token_accuracy": 0.648894136150678, + "num_tokens": 2848592441.0, + "step": 16986 + }, + { + "entropy": 1.7405148645242055, + "epoch": 1.8661118892642334, + "grad_norm": 0.6550964117050171, + "learning_rate": 2.2111515003105137e-06, + "loss": 1.3739, + "mean_token_accuracy": 0.656710093220075, + "num_tokens": 2848736248.0, + "step": 16987 + }, + { + "entropy": 1.6667698224385579, + "epoch": 1.8662217461756063, + "grad_norm": 0.7342043519020081, + "learning_rate": 2.2108068493075634e-06, + "loss": 1.2817, + "mean_token_accuracy": 0.676526720325152, + "num_tokens": 2848898107.0, + "step": 16988 + }, + { + "entropy": 1.6982735991477966, + "epoch": 1.8663316030869792, + "grad_norm": 0.7887325286865234, + "learning_rate": 2.2104624764786152e-06, + "loss": 1.2851, + "mean_token_accuracy": 0.673782487710317, + "num_tokens": 2849023372.0, + "step": 16989 + }, + { + "entropy": 1.806358168522517, + "epoch": 1.866441459998352, + "grad_norm": 0.7065950036048889, + "learning_rate": 2.210118381834569e-06, + "loss": 1.5251, + "mean_token_accuracy": 0.640480175614357, + "num_tokens": 2849217998.0, + "step": 16990 + }, + { + "entropy": 1.6777258316675823, + "epoch": 1.866551316909725, + "grad_norm": 0.6541003584861755, + "learning_rate": 2.2097745653863156e-06, + "loss": 1.3798, + "mean_token_accuracy": 0.6534243921438853, + "num_tokens": 2849428641.0, + "step": 16991 + }, + { + "entropy": 1.741273860136668, + "epoch": 1.866661173821098, + "grad_norm": 0.7950140833854675, + "learning_rate": 2.2094310271447355e-06, + "loss": 1.3057, + "mean_token_accuracy": 0.6594074964523315, + "num_tokens": 2849527531.0, + "step": 16992 + }, + { + "entropy": 1.6880672574043274, + "epoch": 1.866771030732471, + "grad_norm": 0.6249304413795471, + "learning_rate": 2.209087767120704e-06, + "loss": 1.4085, + "mean_token_accuracy": 0.6540153622627258, + "num_tokens": 2849684171.0, + "step": 16993 + }, + { + "entropy": 1.7445914248625438, + "epoch": 1.8668808876438439, + "grad_norm": 0.6921653747558594, + "learning_rate": 2.208744785325081e-06, + "loss": 1.389, + "mean_token_accuracy": 0.6449531565109888, + "num_tokens": 2849836896.0, + "step": 16994 + }, + { + "entropy": 1.7295080125331879, + "epoch": 1.866990744555217, + "grad_norm": 0.5937165021896362, + "learning_rate": 2.2084020817687253e-06, + "loss": 1.4598, + "mean_token_accuracy": 0.6409016450246176, + "num_tokens": 2850047785.0, + "step": 16995 + }, + { + "entropy": 1.7254140277703602, + "epoch": 1.8671006014665896, + "grad_norm": 1.068790078163147, + "learning_rate": 2.208059656462482e-06, + "loss": 1.3398, + "mean_token_accuracy": 0.6658286303281784, + "num_tokens": 2850181291.0, + "step": 16996 + }, + { + "entropy": 1.7456210553646088, + "epoch": 1.8672104583779627, + "grad_norm": 0.6479652523994446, + "learning_rate": 2.2077175094171903e-06, + "loss": 1.3958, + "mean_token_accuracy": 0.6518608878056208, + "num_tokens": 2850309540.0, + "step": 16997 + }, + { + "entropy": 1.6959585348765056, + "epoch": 1.8673203152893356, + "grad_norm": 0.5444127321243286, + "learning_rate": 2.207375640643675e-06, + "loss": 1.3446, + "mean_token_accuracy": 0.6572243670622507, + "num_tokens": 2850495775.0, + "step": 16998 + }, + { + "entropy": 1.7165914873282115, + "epoch": 1.8674301722007085, + "grad_norm": 0.7069491744041443, + "learning_rate": 2.2070340501527597e-06, + "loss": 1.3456, + "mean_token_accuracy": 0.6599544485410055, + "num_tokens": 2850651848.0, + "step": 16999 + }, + { + "entropy": 1.6840533812840779, + "epoch": 1.8675400291120816, + "grad_norm": 0.6812978982925415, + "learning_rate": 2.206692737955256e-06, + "loss": 1.2676, + "mean_token_accuracy": 0.6691629191239675, + "num_tokens": 2850765863.0, + "step": 17000 + }, + { + "entropy": 1.6662333011627197, + "epoch": 1.8676498860234545, + "grad_norm": 0.7810607552528381, + "learning_rate": 2.206351704061963e-06, + "loss": 1.5147, + "mean_token_accuracy": 0.6441531578699747, + "num_tokens": 2850957710.0, + "step": 17001 + }, + { + "entropy": 1.778091549873352, + "epoch": 1.8677597429348274, + "grad_norm": 0.6844190955162048, + "learning_rate": 2.2060109484836766e-06, + "loss": 1.4703, + "mean_token_accuracy": 0.6522639393806458, + "num_tokens": 2851124151.0, + "step": 17002 + }, + { + "entropy": 1.772169252236684, + "epoch": 1.8678695998462005, + "grad_norm": 0.7324439883232117, + "learning_rate": 2.20567047123118e-06, + "loss": 1.401, + "mean_token_accuracy": 0.6545427242914835, + "num_tokens": 2851257624.0, + "step": 17003 + }, + { + "entropy": 1.6680610577265422, + "epoch": 1.8679794567575732, + "grad_norm": 0.6523362994194031, + "learning_rate": 2.205330272315251e-06, + "loss": 1.3719, + "mean_token_accuracy": 0.6796788175900778, + "num_tokens": 2851407097.0, + "step": 17004 + }, + { + "entropy": 1.7212667365868886, + "epoch": 1.8680893136689463, + "grad_norm": 0.7622836232185364, + "learning_rate": 2.204990351746657e-06, + "loss": 1.4981, + "mean_token_accuracy": 0.6425471156835556, + "num_tokens": 2851594805.0, + "step": 17005 + }, + { + "entropy": 1.7824140787124634, + "epoch": 1.8681991705803191, + "grad_norm": 0.8553072214126587, + "learning_rate": 2.204650709536153e-06, + "loss": 1.6837, + "mean_token_accuracy": 0.6378919730583826, + "num_tokens": 2851761200.0, + "step": 17006 + }, + { + "entropy": 1.7299172381560008, + "epoch": 1.868309027491692, + "grad_norm": 0.8043472766876221, + "learning_rate": 2.204311345694492e-06, + "loss": 1.4065, + "mean_token_accuracy": 0.6654603232940038, + "num_tokens": 2851896719.0, + "step": 17007 + }, + { + "entropy": 1.7903384566307068, + "epoch": 1.8684188844030651, + "grad_norm": 0.8522409200668335, + "learning_rate": 2.203972260232415e-06, + "loss": 1.3412, + "mean_token_accuracy": 0.6667458862066269, + "num_tokens": 2852084088.0, + "step": 17008 + }, + { + "entropy": 1.649288256963094, + "epoch": 1.8685287413144378, + "grad_norm": 0.6457241177558899, + "learning_rate": 2.20363345316065e-06, + "loss": 1.2725, + "mean_token_accuracy": 0.6749263107776642, + "num_tokens": 2852201978.0, + "step": 17009 + }, + { + "entropy": 1.6585692763328552, + "epoch": 1.868638598225811, + "grad_norm": 0.6605546474456787, + "learning_rate": 2.203294924489922e-06, + "loss": 1.2864, + "mean_token_accuracy": 0.6589010854562124, + "num_tokens": 2852379096.0, + "step": 17010 + }, + { + "entropy": 1.688430259625117, + "epoch": 1.8687484551371838, + "grad_norm": 0.6104924082756042, + "learning_rate": 2.202956674230948e-06, + "loss": 1.3572, + "mean_token_accuracy": 0.67152139544487, + "num_tokens": 2852565308.0, + "step": 17011 + }, + { + "entropy": 1.6815722684065502, + "epoch": 1.8688583120485567, + "grad_norm": 0.6199679970741272, + "learning_rate": 2.202618702394431e-06, + "loss": 1.3527, + "mean_token_accuracy": 0.6769666820764542, + "num_tokens": 2852721982.0, + "step": 17012 + }, + { + "entropy": 1.699970543384552, + "epoch": 1.8689681689599298, + "grad_norm": 0.6916362643241882, + "learning_rate": 2.202281008991066e-06, + "loss": 1.3273, + "mean_token_accuracy": 0.6627818942070007, + "num_tokens": 2852855715.0, + "step": 17013 + }, + { + "entropy": 1.7449293434619904, + "epoch": 1.8690780258713027, + "grad_norm": 0.6241941452026367, + "learning_rate": 2.2019435940315435e-06, + "loss": 1.5613, + "mean_token_accuracy": 0.6198464930057526, + "num_tokens": 2853091879.0, + "step": 17014 + }, + { + "entropy": 1.6857063074906666, + "epoch": 1.8691878827826756, + "grad_norm": 0.7108656167984009, + "learning_rate": 2.2016064575265426e-06, + "loss": 1.3398, + "mean_token_accuracy": 0.6637291759252548, + "num_tokens": 2853218575.0, + "step": 17015 + }, + { + "entropy": 1.6702306667963664, + "epoch": 1.8692977396940487, + "grad_norm": 0.688471794128418, + "learning_rate": 2.201269599486732e-06, + "loss": 1.3163, + "mean_token_accuracy": 0.6772701740264893, + "num_tokens": 2853381585.0, + "step": 17016 + }, + { + "entropy": 1.7522780398527782, + "epoch": 1.8694075966054213, + "grad_norm": 0.6958498358726501, + "learning_rate": 2.2009330199227746e-06, + "loss": 1.3844, + "mean_token_accuracy": 0.6645175168911616, + "num_tokens": 2853526534.0, + "step": 17017 + }, + { + "entropy": 1.6620089908440907, + "epoch": 1.8695174535167944, + "grad_norm": 0.7110161185264587, + "learning_rate": 2.2005967188453206e-06, + "loss": 1.2126, + "mean_token_accuracy": 0.6797206650177637, + "num_tokens": 2853685590.0, + "step": 17018 + }, + { + "entropy": 1.6503388981024425, + "epoch": 1.8696273104281673, + "grad_norm": 0.6182279586791992, + "learning_rate": 2.200260696265016e-06, + "loss": 1.6693, + "mean_token_accuracy": 0.6140792071819305, + "num_tokens": 2853949541.0, + "step": 17019 + }, + { + "entropy": 1.735267659028371, + "epoch": 1.8697371673395402, + "grad_norm": 0.7907410860061646, + "learning_rate": 2.199924952192496e-06, + "loss": 1.4237, + "mean_token_accuracy": 0.6509945740302404, + "num_tokens": 2854077562.0, + "step": 17020 + }, + { + "entropy": 1.7652918795744579, + "epoch": 1.8698470242509133, + "grad_norm": 0.9519900679588318, + "learning_rate": 2.1995894866383844e-06, + "loss": 1.3656, + "mean_token_accuracy": 0.6748112390438715, + "num_tokens": 2854228561.0, + "step": 17021 + }, + { + "entropy": 1.775589833656947, + "epoch": 1.869956881162286, + "grad_norm": 0.7908014059066772, + "learning_rate": 2.1992542996133008e-06, + "loss": 1.4917, + "mean_token_accuracy": 0.6449446976184845, + "num_tokens": 2854352461.0, + "step": 17022 + }, + { + "entropy": 1.6987537741661072, + "epoch": 1.870066738073659, + "grad_norm": 0.7079589366912842, + "learning_rate": 2.198919391127854e-06, + "loss": 1.452, + "mean_token_accuracy": 0.6492635756731033, + "num_tokens": 2854509134.0, + "step": 17023 + }, + { + "entropy": 1.7013746201992035, + "epoch": 1.870176594985032, + "grad_norm": 0.5822315216064453, + "learning_rate": 2.1985847611926412e-06, + "loss": 1.3831, + "mean_token_accuracy": 0.6505730946858724, + "num_tokens": 2854693579.0, + "step": 17024 + }, + { + "entropy": 1.6798737148443859, + "epoch": 1.8702864518964049, + "grad_norm": 0.6504672169685364, + "learning_rate": 2.1982504098182543e-06, + "loss": 1.4304, + "mean_token_accuracy": 0.6504451334476471, + "num_tokens": 2854874531.0, + "step": 17025 + }, + { + "entropy": 1.6749244034290314, + "epoch": 1.870396308807778, + "grad_norm": 0.8020433187484741, + "learning_rate": 2.197916337015277e-06, + "loss": 1.2575, + "mean_token_accuracy": 0.6731888701518377, + "num_tokens": 2855033432.0, + "step": 17026 + }, + { + "entropy": 1.7233870228131611, + "epoch": 1.8705061657191508, + "grad_norm": 0.6521610617637634, + "learning_rate": 2.1975825427942797e-06, + "loss": 1.2734, + "mean_token_accuracy": 0.6759884258111318, + "num_tokens": 2855157619.0, + "step": 17027 + }, + { + "entropy": 1.7447825769583385, + "epoch": 1.8706160226305237, + "grad_norm": 0.672861635684967, + "learning_rate": 2.1972490271658304e-06, + "loss": 1.3972, + "mean_token_accuracy": 0.6574916392564774, + "num_tokens": 2855387405.0, + "step": 17028 + }, + { + "entropy": 1.7002271513144176, + "epoch": 1.8707258795418968, + "grad_norm": 0.6032365560531616, + "learning_rate": 2.1969157901404825e-06, + "loss": 1.4316, + "mean_token_accuracy": 0.649682859579722, + "num_tokens": 2855554119.0, + "step": 17029 + }, + { + "entropy": 1.6851763526598613, + "epoch": 1.8708357364532695, + "grad_norm": 0.759894609451294, + "learning_rate": 2.1965828317287816e-06, + "loss": 1.3552, + "mean_token_accuracy": 0.6525135089953741, + "num_tokens": 2855676220.0, + "step": 17030 + }, + { + "entropy": 1.7493245204289753, + "epoch": 1.8709455933646426, + "grad_norm": 0.7808289527893066, + "learning_rate": 2.1962501519412676e-06, + "loss": 1.2661, + "mean_token_accuracy": 0.674684152007103, + "num_tokens": 2855798387.0, + "step": 17031 + }, + { + "entropy": 1.7108966807524364, + "epoch": 1.8710554502760155, + "grad_norm": 0.6273822784423828, + "learning_rate": 2.1959177507884706e-06, + "loss": 1.3955, + "mean_token_accuracy": 0.6569380015134811, + "num_tokens": 2855962363.0, + "step": 17032 + }, + { + "entropy": 1.658450762430827, + "epoch": 1.8711653071873884, + "grad_norm": 0.6467920541763306, + "learning_rate": 2.195585628280909e-06, + "loss": 1.5203, + "mean_token_accuracy": 0.6477319151163101, + "num_tokens": 2856134799.0, + "step": 17033 + }, + { + "entropy": 1.7665147085984547, + "epoch": 1.8712751640987615, + "grad_norm": 0.7447314262390137, + "learning_rate": 2.1952537844290942e-06, + "loss": 1.4415, + "mean_token_accuracy": 0.6656771103541056, + "num_tokens": 2856282873.0, + "step": 17034 + }, + { + "entropy": 1.715973476568858, + "epoch": 1.8713850210101342, + "grad_norm": 0.687869131565094, + "learning_rate": 2.1949222192435293e-06, + "loss": 1.4573, + "mean_token_accuracy": 0.6531703372796377, + "num_tokens": 2856413198.0, + "step": 17035 + }, + { + "entropy": 1.7442041039466858, + "epoch": 1.8714948779215073, + "grad_norm": 0.6900414824485779, + "learning_rate": 2.1945909327347094e-06, + "loss": 1.3789, + "mean_token_accuracy": 0.6618852317333221, + "num_tokens": 2856543909.0, + "step": 17036 + }, + { + "entropy": 1.6572751700878143, + "epoch": 1.8716047348328801, + "grad_norm": 0.6331953406333923, + "learning_rate": 2.194259924913119e-06, + "loss": 1.2094, + "mean_token_accuracy": 0.6854116519292196, + "num_tokens": 2856704996.0, + "step": 17037 + }, + { + "entropy": 1.769335389137268, + "epoch": 1.871714591744253, + "grad_norm": 0.7674529552459717, + "learning_rate": 2.1939291957892327e-06, + "loss": 1.2783, + "mean_token_accuracy": 0.6658161183198293, + "num_tokens": 2856829001.0, + "step": 17038 + }, + { + "entropy": 1.7292616963386536, + "epoch": 1.8718244486556261, + "grad_norm": 0.6778686046600342, + "learning_rate": 2.19359874537352e-06, + "loss": 1.5333, + "mean_token_accuracy": 0.6352483679850897, + "num_tokens": 2857081755.0, + "step": 17039 + }, + { + "entropy": 1.6831102867921193, + "epoch": 1.871934305566999, + "grad_norm": 4.391300678253174, + "learning_rate": 2.1932685736764393e-06, + "loss": 1.4329, + "mean_token_accuracy": 0.6666145275036494, + "num_tokens": 2857303953.0, + "step": 17040 + }, + { + "entropy": 1.7301143109798431, + "epoch": 1.872044162478372, + "grad_norm": 0.6847091913223267, + "learning_rate": 2.1929386807084392e-06, + "loss": 1.3037, + "mean_token_accuracy": 0.6623615125815073, + "num_tokens": 2857451437.0, + "step": 17041 + }, + { + "entropy": 1.7140269080797832, + "epoch": 1.872154019389745, + "grad_norm": 0.6751673221588135, + "learning_rate": 2.192609066479961e-06, + "loss": 1.2567, + "mean_token_accuracy": 0.6729971269766489, + "num_tokens": 2857569788.0, + "step": 17042 + }, + { + "entropy": 1.6233059763908386, + "epoch": 1.8722638763011177, + "grad_norm": 0.697934627532959, + "learning_rate": 2.192279731001438e-06, + "loss": 1.2847, + "mean_token_accuracy": 0.6785244792699814, + "num_tokens": 2857745514.0, + "step": 17043 + }, + { + "entropy": 1.7246392567952473, + "epoch": 1.8723737332124908, + "grad_norm": 0.714394211769104, + "learning_rate": 2.191950674283292e-06, + "loss": 1.4599, + "mean_token_accuracy": 0.6533997456232706, + "num_tokens": 2857908966.0, + "step": 17044 + }, + { + "entropy": 1.6922082702318828, + "epoch": 1.8724835901238637, + "grad_norm": 0.671735405921936, + "learning_rate": 2.191621896335938e-06, + "loss": 1.3781, + "mean_token_accuracy": 0.6661517322063446, + "num_tokens": 2858048527.0, + "step": 17045 + }, + { + "entropy": 1.710803061723709, + "epoch": 1.8725934470352366, + "grad_norm": 0.5997881293296814, + "learning_rate": 2.1912933971697833e-06, + "loss": 1.3553, + "mean_token_accuracy": 0.6474985132614771, + "num_tokens": 2858183882.0, + "step": 17046 + }, + { + "entropy": 1.7078399062156677, + "epoch": 1.8727033039466097, + "grad_norm": 0.6537618041038513, + "learning_rate": 2.190965176795221e-06, + "loss": 1.4829, + "mean_token_accuracy": 0.6355342864990234, + "num_tokens": 2858436938.0, + "step": 17047 + }, + { + "entropy": 1.717695951461792, + "epoch": 1.8728131608579823, + "grad_norm": 0.7487478256225586, + "learning_rate": 2.1906372352226434e-06, + "loss": 1.5507, + "mean_token_accuracy": 0.6489723970492681, + "num_tokens": 2858632964.0, + "step": 17048 + }, + { + "entropy": 1.7423529624938965, + "epoch": 1.8729230177693554, + "grad_norm": 0.59836745262146, + "learning_rate": 2.1903095724624266e-06, + "loss": 1.416, + "mean_token_accuracy": 0.639144832889239, + "num_tokens": 2858829021.0, + "step": 17049 + }, + { + "entropy": 1.7561921576658885, + "epoch": 1.8730328746807283, + "grad_norm": 0.7711092829704285, + "learning_rate": 2.1899821885249423e-06, + "loss": 1.4651, + "mean_token_accuracy": 0.6511860340833664, + "num_tokens": 2858947061.0, + "step": 17050 + }, + { + "entropy": 1.6973484357198079, + "epoch": 1.8731427315921012, + "grad_norm": 0.6623829007148743, + "learning_rate": 2.189655083420551e-06, + "loss": 1.3509, + "mean_token_accuracy": 0.666906327009201, + "num_tokens": 2859138268.0, + "step": 17051 + }, + { + "entropy": 1.7268371681372325, + "epoch": 1.8732525885034743, + "grad_norm": 0.7016375064849854, + "learning_rate": 2.1893282571596075e-06, + "loss": 1.5942, + "mean_token_accuracy": 0.6331303964058558, + "num_tokens": 2859322249.0, + "step": 17052 + }, + { + "entropy": 1.6590932706991832, + "epoch": 1.8733624454148472, + "grad_norm": 0.6531033515930176, + "learning_rate": 2.189001709752454e-06, + "loss": 1.4715, + "mean_token_accuracy": 0.6680138657490412, + "num_tokens": 2859528608.0, + "step": 17053 + }, + { + "entropy": 1.6124554375807445, + "epoch": 1.87347230232622, + "grad_norm": 0.7286319136619568, + "learning_rate": 2.1886754412094264e-06, + "loss": 1.2273, + "mean_token_accuracy": 0.6924339234828949, + "num_tokens": 2859687765.0, + "step": 17054 + }, + { + "entropy": 1.676022340854009, + "epoch": 1.8735821592375932, + "grad_norm": 0.6995905637741089, + "learning_rate": 2.1883494515408502e-06, + "loss": 1.3037, + "mean_token_accuracy": 0.677752767999967, + "num_tokens": 2859809649.0, + "step": 17055 + }, + { + "entropy": 1.6995362242062886, + "epoch": 1.8736920161489659, + "grad_norm": 0.8846271634101868, + "learning_rate": 2.1880237407570444e-06, + "loss": 1.4299, + "mean_token_accuracy": 0.6495392918586731, + "num_tokens": 2859990844.0, + "step": 17056 + }, + { + "entropy": 1.7364083031813304, + "epoch": 1.873801873060339, + "grad_norm": 0.6322318315505981, + "learning_rate": 2.1876983088683143e-06, + "loss": 1.4336, + "mean_token_accuracy": 0.6391441822052002, + "num_tokens": 2860194791.0, + "step": 17057 + }, + { + "entropy": 1.7000387012958527, + "epoch": 1.8739117299717118, + "grad_norm": 0.7319401502609253, + "learning_rate": 2.187373155884964e-06, + "loss": 1.4675, + "mean_token_accuracy": 0.6507869611183802, + "num_tokens": 2860341823.0, + "step": 17058 + }, + { + "entropy": 1.7169471581776936, + "epoch": 1.8740215868830847, + "grad_norm": 0.7342702746391296, + "learning_rate": 2.1870482818172806e-06, + "loss": 1.3525, + "mean_token_accuracy": 0.6542551517486572, + "num_tokens": 2860451598.0, + "step": 17059 + }, + { + "entropy": 1.690687209367752, + "epoch": 1.8741314437944578, + "grad_norm": 0.8333070874214172, + "learning_rate": 2.1867236866755485e-06, + "loss": 1.4478, + "mean_token_accuracy": 0.6612561593453089, + "num_tokens": 2860613234.0, + "step": 17060 + }, + { + "entropy": 1.6780410210291545, + "epoch": 1.8742413007058305, + "grad_norm": 0.6751420497894287, + "learning_rate": 2.186399370470041e-06, + "loss": 1.3559, + "mean_token_accuracy": 0.6728497544924418, + "num_tokens": 2860791756.0, + "step": 17061 + }, + { + "entropy": 1.6945658723513286, + "epoch": 1.8743511576172036, + "grad_norm": 0.6549538373947144, + "learning_rate": 2.186075333211021e-06, + "loss": 1.3931, + "mean_token_accuracy": 0.6677144765853882, + "num_tokens": 2861011997.0, + "step": 17062 + }, + { + "entropy": 1.7178350687026978, + "epoch": 1.8744610145285765, + "grad_norm": 0.8077467679977417, + "learning_rate": 2.1857515749087446e-06, + "loss": 1.4099, + "mean_token_accuracy": 0.6525556395451227, + "num_tokens": 2861207983.0, + "step": 17063 + }, + { + "entropy": 1.7497240900993347, + "epoch": 1.8745708714399494, + "grad_norm": 0.9003075957298279, + "learning_rate": 2.1854280955734598e-06, + "loss": 1.4174, + "mean_token_accuracy": 0.6599795470635096, + "num_tokens": 2861327750.0, + "step": 17064 + }, + { + "entropy": 1.6928727825482686, + "epoch": 1.8746807283513225, + "grad_norm": 0.6773639917373657, + "learning_rate": 2.185104895215404e-06, + "loss": 1.3828, + "mean_token_accuracy": 0.6638530343770981, + "num_tokens": 2861505270.0, + "step": 17065 + }, + { + "entropy": 1.7255871494611104, + "epoch": 1.8747905852626954, + "grad_norm": 0.6331843733787537, + "learning_rate": 2.1847819738448052e-06, + "loss": 1.4456, + "mean_token_accuracy": 0.6406483799219131, + "num_tokens": 2861670381.0, + "step": 17066 + }, + { + "entropy": 1.7596171100934346, + "epoch": 1.8749004421740683, + "grad_norm": 0.8675169348716736, + "learning_rate": 2.1844593314718867e-06, + "loss": 1.4466, + "mean_token_accuracy": 0.6425968805948893, + "num_tokens": 2861824862.0, + "step": 17067 + }, + { + "entropy": 1.6844032406806946, + "epoch": 1.8750102990854414, + "grad_norm": 0.7364755868911743, + "learning_rate": 2.184136968106857e-06, + "loss": 1.3846, + "mean_token_accuracy": 0.6768645147482554, + "num_tokens": 2861966016.0, + "step": 17068 + }, + { + "entropy": 1.6982781986395519, + "epoch": 1.875120155996814, + "grad_norm": 0.7753962278366089, + "learning_rate": 2.1838148837599186e-06, + "loss": 1.2406, + "mean_token_accuracy": 0.6812696407238642, + "num_tokens": 2862084902.0, + "step": 17069 + }, + { + "entropy": 1.715488960345586, + "epoch": 1.8752300129081871, + "grad_norm": 0.6685140132904053, + "learning_rate": 2.183493078441268e-06, + "loss": 1.3345, + "mean_token_accuracy": 0.6754670590162277, + "num_tokens": 2862300683.0, + "step": 17070 + }, + { + "entropy": 1.7158759633700054, + "epoch": 1.87533986981956, + "grad_norm": 0.6193472743034363, + "learning_rate": 2.183171552161088e-06, + "loss": 1.3894, + "mean_token_accuracy": 0.6529115388790766, + "num_tokens": 2862469396.0, + "step": 17071 + }, + { + "entropy": 1.7098310788472493, + "epoch": 1.875449726730933, + "grad_norm": 0.6856310963630676, + "learning_rate": 2.1828503049295556e-06, + "loss": 1.3109, + "mean_token_accuracy": 0.6672416975100836, + "num_tokens": 2862651736.0, + "step": 17072 + }, + { + "entropy": 1.686122328042984, + "epoch": 1.875559583642306, + "grad_norm": 0.7014265656471252, + "learning_rate": 2.1825293367568375e-06, + "loss": 1.446, + "mean_token_accuracy": 0.6568909734487534, + "num_tokens": 2862825931.0, + "step": 17073 + }, + { + "entropy": 1.6640195647875469, + "epoch": 1.8756694405536787, + "grad_norm": 0.7757869958877563, + "learning_rate": 2.1822086476530922e-06, + "loss": 1.2991, + "mean_token_accuracy": 0.6700432101885477, + "num_tokens": 2862961042.0, + "step": 17074 + }, + { + "entropy": 1.695358783006668, + "epoch": 1.8757792974650518, + "grad_norm": 0.5820761919021606, + "learning_rate": 2.181888237628471e-06, + "loss": 1.4312, + "mean_token_accuracy": 0.6432149757941564, + "num_tokens": 2863141499.0, + "step": 17075 + }, + { + "entropy": 1.7364993294080098, + "epoch": 1.8758891543764247, + "grad_norm": 0.6635037660598755, + "learning_rate": 2.1815681066931136e-06, + "loss": 1.4227, + "mean_token_accuracy": 0.644807959596316, + "num_tokens": 2863308785.0, + "step": 17076 + }, + { + "entropy": 1.6751290559768677, + "epoch": 1.8759990112877976, + "grad_norm": 0.6418736577033997, + "learning_rate": 2.1812482548571515e-06, + "loss": 1.3018, + "mean_token_accuracy": 0.6599528888861338, + "num_tokens": 2863449125.0, + "step": 17077 + }, + { + "entropy": 1.7395286758740742, + "epoch": 1.8761088681991707, + "grad_norm": 0.7806156873703003, + "learning_rate": 2.180928682130708e-06, + "loss": 1.4441, + "mean_token_accuracy": 0.6352565785249075, + "num_tokens": 2863620259.0, + "step": 17078 + }, + { + "entropy": 1.6934346357981365, + "epoch": 1.8762187251105436, + "grad_norm": 0.7733772993087769, + "learning_rate": 2.1806093885238976e-06, + "loss": 1.4599, + "mean_token_accuracy": 0.6639293332894644, + "num_tokens": 2863779694.0, + "step": 17079 + }, + { + "entropy": 1.73578542470932, + "epoch": 1.8763285820219164, + "grad_norm": 0.5854712724685669, + "learning_rate": 2.1802903740468267e-06, + "loss": 1.5494, + "mean_token_accuracy": 0.6304313540458679, + "num_tokens": 2863971818.0, + "step": 17080 + }, + { + "entropy": 1.7329691052436829, + "epoch": 1.8764384389332895, + "grad_norm": 0.7195613980293274, + "learning_rate": 2.1799716387095905e-06, + "loss": 1.3164, + "mean_token_accuracy": 0.6736815422773361, + "num_tokens": 2864109592.0, + "step": 17081 + }, + { + "entropy": 1.7070931295553844, + "epoch": 1.8765482958446622, + "grad_norm": 0.7272635102272034, + "learning_rate": 2.179653182522278e-06, + "loss": 1.2566, + "mean_token_accuracy": 0.6699541062116623, + "num_tokens": 2864217249.0, + "step": 17082 + }, + { + "entropy": 1.694780518611272, + "epoch": 1.8766581527560353, + "grad_norm": 0.6670692563056946, + "learning_rate": 2.1793350054949674e-06, + "loss": 1.3224, + "mean_token_accuracy": 0.6682775169610977, + "num_tokens": 2864379968.0, + "step": 17083 + }, + { + "entropy": 1.7478112777074177, + "epoch": 1.8767680096674082, + "grad_norm": 0.770989716053009, + "learning_rate": 2.179017107637729e-06, + "loss": 1.3859, + "mean_token_accuracy": 0.6664139181375504, + "num_tokens": 2864557477.0, + "step": 17084 + }, + { + "entropy": 1.7456609308719635, + "epoch": 1.876877866578781, + "grad_norm": 0.9178656339645386, + "learning_rate": 2.1786994889606262e-06, + "loss": 1.5924, + "mean_token_accuracy": 0.6394909123579661, + "num_tokens": 2864782031.0, + "step": 17085 + }, + { + "entropy": 1.6717379689216614, + "epoch": 1.8769877234901542, + "grad_norm": 0.668597400188446, + "learning_rate": 2.1783821494737067e-06, + "loss": 1.3992, + "mean_token_accuracy": 0.6508362789948782, + "num_tokens": 2864958961.0, + "step": 17086 + }, + { + "entropy": 1.6922581891218822, + "epoch": 1.8770975804015269, + "grad_norm": 0.6903640031814575, + "learning_rate": 2.1780650891870188e-06, + "loss": 1.3327, + "mean_token_accuracy": 0.6786336451768875, + "num_tokens": 2865077172.0, + "step": 17087 + }, + { + "entropy": 1.7544843256473541, + "epoch": 1.8772074373129, + "grad_norm": 0.7095819115638733, + "learning_rate": 2.177748308110596e-06, + "loss": 1.3514, + "mean_token_accuracy": 0.6596596439679464, + "num_tokens": 2865201021.0, + "step": 17088 + }, + { + "entropy": 1.6850469013055165, + "epoch": 1.8773172942242728, + "grad_norm": 0.7165399789810181, + "learning_rate": 2.1774318062544623e-06, + "loss": 1.4429, + "mean_token_accuracy": 0.6499310483535131, + "num_tokens": 2865443298.0, + "step": 17089 + }, + { + "entropy": 1.7009844084580739, + "epoch": 1.8774271511356457, + "grad_norm": 0.6865831613540649, + "learning_rate": 2.177115583628637e-06, + "loss": 1.4149, + "mean_token_accuracy": 0.6542165130376816, + "num_tokens": 2865685108.0, + "step": 17090 + }, + { + "entropy": 1.6992128491401672, + "epoch": 1.8775370080470188, + "grad_norm": 0.7311968803405762, + "learning_rate": 2.176799640243128e-06, + "loss": 1.2469, + "mean_token_accuracy": 0.6794113417466482, + "num_tokens": 2865784923.0, + "step": 17091 + }, + { + "entropy": 1.7033413747946422, + "epoch": 1.8776468649583917, + "grad_norm": 0.7062245607376099, + "learning_rate": 2.1764839761079354e-06, + "loss": 1.3618, + "mean_token_accuracy": 0.6627474625905355, + "num_tokens": 2865941631.0, + "step": 17092 + }, + { + "entropy": 1.6912708282470703, + "epoch": 1.8777567218697646, + "grad_norm": 0.6303133368492126, + "learning_rate": 2.176168591233048e-06, + "loss": 1.3618, + "mean_token_accuracy": 0.6606016606092453, + "num_tokens": 2866114212.0, + "step": 17093 + }, + { + "entropy": 1.6680894295374553, + "epoch": 1.8778665787811377, + "grad_norm": 0.6817684173583984, + "learning_rate": 2.175853485628448e-06, + "loss": 1.4784, + "mean_token_accuracy": 0.6489969938993454, + "num_tokens": 2866343322.0, + "step": 17094 + }, + { + "entropy": 1.7241562108198802, + "epoch": 1.8779764356925104, + "grad_norm": 0.653832197189331, + "learning_rate": 2.175538659304109e-06, + "loss": 1.3772, + "mean_token_accuracy": 0.6535724550485611, + "num_tokens": 2866532714.0, + "step": 17095 + }, + { + "entropy": 1.7202540437380474, + "epoch": 1.8780862926038835, + "grad_norm": 0.8709003329277039, + "learning_rate": 2.1752241122699966e-06, + "loss": 1.2235, + "mean_token_accuracy": 0.6837896257638931, + "num_tokens": 2866663231.0, + "step": 17096 + }, + { + "entropy": 1.7354286313056946, + "epoch": 1.8781961495152564, + "grad_norm": 0.6793311238288879, + "learning_rate": 2.1749098445360633e-06, + "loss": 1.4171, + "mean_token_accuracy": 0.6585271706183752, + "num_tokens": 2866817700.0, + "step": 17097 + }, + { + "entropy": 1.7306594252586365, + "epoch": 1.8783060064266293, + "grad_norm": 0.7136476039886475, + "learning_rate": 2.174595856112257e-06, + "loss": 1.3441, + "mean_token_accuracy": 0.6554663379987081, + "num_tokens": 2866969575.0, + "step": 17098 + }, + { + "entropy": 1.7610450983047485, + "epoch": 1.8784158633380024, + "grad_norm": 0.6379169821739197, + "learning_rate": 2.174282147008515e-06, + "loss": 1.3371, + "mean_token_accuracy": 0.6573585122823715, + "num_tokens": 2867123276.0, + "step": 17099 + }, + { + "entropy": 1.691591699918111, + "epoch": 1.878525720249375, + "grad_norm": 0.7431465983390808, + "learning_rate": 2.173968717234767e-06, + "loss": 1.4903, + "mean_token_accuracy": 0.6451913366715113, + "num_tokens": 2867292293.0, + "step": 17100 + }, + { + "entropy": 1.6427749395370483, + "epoch": 1.8786355771607481, + "grad_norm": 0.6136252880096436, + "learning_rate": 2.1736555668009302e-06, + "loss": 1.4948, + "mean_token_accuracy": 0.6392818937699, + "num_tokens": 2867500607.0, + "step": 17101 + }, + { + "entropy": 1.7291006743907928, + "epoch": 1.878745434072121, + "grad_norm": 0.7383739352226257, + "learning_rate": 2.1733426957169185e-06, + "loss": 1.4956, + "mean_token_accuracy": 0.6548979928096136, + "num_tokens": 2867695033.0, + "step": 17102 + }, + { + "entropy": 1.7051356335481007, + "epoch": 1.878855290983494, + "grad_norm": 0.7386910915374756, + "learning_rate": 2.1730301039926322e-06, + "loss": 1.4685, + "mean_token_accuracy": 0.6496059795220693, + "num_tokens": 2867916001.0, + "step": 17103 + }, + { + "entropy": 1.644729922215144, + "epoch": 1.878965147894867, + "grad_norm": 0.6021768450737, + "learning_rate": 2.1727177916379664e-06, + "loss": 1.3486, + "mean_token_accuracy": 0.6531336605548859, + "num_tokens": 2868091178.0, + "step": 17104 + }, + { + "entropy": 1.747380663951238, + "epoch": 1.87907500480624, + "grad_norm": 0.7101630568504333, + "learning_rate": 2.1724057586628055e-06, + "loss": 1.4363, + "mean_token_accuracy": 0.6600662469863892, + "num_tokens": 2868228095.0, + "step": 17105 + }, + { + "entropy": 1.7580998639265697, + "epoch": 1.8791848617176128, + "grad_norm": 0.6921120882034302, + "learning_rate": 2.1720940050770238e-06, + "loss": 1.3382, + "mean_token_accuracy": 0.6595444331566492, + "num_tokens": 2868384732.0, + "step": 17106 + }, + { + "entropy": 1.692195286353429, + "epoch": 1.879294718628986, + "grad_norm": 0.7340524792671204, + "learning_rate": 2.171782530890488e-06, + "loss": 1.5391, + "mean_token_accuracy": 0.6522092272837957, + "num_tokens": 2868557649.0, + "step": 17107 + }, + { + "entropy": 1.6761192977428436, + "epoch": 1.8794045755403586, + "grad_norm": 0.6261641383171082, + "learning_rate": 2.171471336113058e-06, + "loss": 1.2654, + "mean_token_accuracy": 0.6697875261306763, + "num_tokens": 2868694048.0, + "step": 17108 + }, + { + "entropy": 1.7369263966878254, + "epoch": 1.8795144324517317, + "grad_norm": 0.7136772274971008, + "learning_rate": 2.1711604207545828e-06, + "loss": 1.427, + "mean_token_accuracy": 0.6500804722309113, + "num_tokens": 2868867959.0, + "step": 17109 + }, + { + "entropy": 1.6655204892158508, + "epoch": 1.8796242893631046, + "grad_norm": 0.7923451066017151, + "learning_rate": 2.1708497848248998e-06, + "loss": 1.4741, + "mean_token_accuracy": 0.647340714931488, + "num_tokens": 2869051956.0, + "step": 17110 + }, + { + "entropy": 1.6829159657160442, + "epoch": 1.8797341462744774, + "grad_norm": 0.5876792669296265, + "learning_rate": 2.170539428333844e-06, + "loss": 1.4499, + "mean_token_accuracy": 0.6519188384215037, + "num_tokens": 2869234691.0, + "step": 17111 + }, + { + "entropy": 1.717431555191676, + "epoch": 1.8798440031858505, + "grad_norm": 0.8365014791488647, + "learning_rate": 2.170229351291237e-06, + "loss": 1.2309, + "mean_token_accuracy": 0.6746444006760915, + "num_tokens": 2869356432.0, + "step": 17112 + }, + { + "entropy": 1.6984197696050007, + "epoch": 1.8799538600972232, + "grad_norm": 0.6819642782211304, + "learning_rate": 2.1699195537068908e-06, + "loss": 1.3904, + "mean_token_accuracy": 0.6632606933514277, + "num_tokens": 2869499052.0, + "step": 17113 + }, + { + "entropy": 1.7775751153628032, + "epoch": 1.8800637170085963, + "grad_norm": 0.7044260501861572, + "learning_rate": 2.1696100355906137e-06, + "loss": 1.4486, + "mean_token_accuracy": 0.6558221131563187, + "num_tokens": 2869637826.0, + "step": 17114 + }, + { + "entropy": 1.6655368208885193, + "epoch": 1.8801735739199692, + "grad_norm": 0.6480795741081238, + "learning_rate": 2.1693007969521985e-06, + "loss": 1.2762, + "mean_token_accuracy": 0.664667988816897, + "num_tokens": 2869770682.0, + "step": 17115 + }, + { + "entropy": 1.7761302689711254, + "epoch": 1.880283430831342, + "grad_norm": 0.6664544343948364, + "learning_rate": 2.1689918378014345e-06, + "loss": 1.5244, + "mean_token_accuracy": 0.6382568577925364, + "num_tokens": 2869953883.0, + "step": 17116 + }, + { + "entropy": 1.7439662218093872, + "epoch": 1.8803932877427152, + "grad_norm": 0.8166419267654419, + "learning_rate": 2.1686831581480992e-06, + "loss": 1.2875, + "mean_token_accuracy": 0.6673971563577652, + "num_tokens": 2870115126.0, + "step": 17117 + }, + { + "entropy": 1.697892556587855, + "epoch": 1.880503144654088, + "grad_norm": 0.6083175539970398, + "learning_rate": 2.1683747580019617e-06, + "loss": 1.4151, + "mean_token_accuracy": 0.6482170025507609, + "num_tokens": 2870306843.0, + "step": 17118 + }, + { + "entropy": 1.6897941430409749, + "epoch": 1.880613001565461, + "grad_norm": 0.6954461932182312, + "learning_rate": 2.1680666373727835e-06, + "loss": 1.3079, + "mean_token_accuracy": 0.6700096229712168, + "num_tokens": 2870488476.0, + "step": 17119 + }, + { + "entropy": 1.6832915445168812, + "epoch": 1.880722858476834, + "grad_norm": 0.6204468607902527, + "learning_rate": 2.1677587962703186e-06, + "loss": 1.389, + "mean_token_accuracy": 0.6591685314973196, + "num_tokens": 2870669134.0, + "step": 17120 + }, + { + "entropy": 1.6706489821275075, + "epoch": 1.8808327153882067, + "grad_norm": 0.845382809638977, + "learning_rate": 2.1674512347043057e-06, + "loss": 1.2121, + "mean_token_accuracy": 0.6803888330856959, + "num_tokens": 2870794482.0, + "step": 17121 + }, + { + "entropy": 1.7348777552445729, + "epoch": 1.8809425722995798, + "grad_norm": 0.5870766043663025, + "learning_rate": 2.1671439526844816e-06, + "loss": 1.4127, + "mean_token_accuracy": 0.6562628646691641, + "num_tokens": 2870984182.0, + "step": 17122 + }, + { + "entropy": 1.6719197829564412, + "epoch": 1.8810524292109527, + "grad_norm": 0.685198962688446, + "learning_rate": 2.166836950220572e-06, + "loss": 1.3735, + "mean_token_accuracy": 0.6528567423423132, + "num_tokens": 2871163157.0, + "step": 17123 + }, + { + "entropy": 1.707009196281433, + "epoch": 1.8811622861223256, + "grad_norm": 0.9159536957740784, + "learning_rate": 2.166530227322293e-06, + "loss": 1.5308, + "mean_token_accuracy": 0.660614863038063, + "num_tokens": 2871303925.0, + "step": 17124 + }, + { + "entropy": 1.7264705697695415, + "epoch": 1.8812721430336987, + "grad_norm": 0.6962438821792603, + "learning_rate": 2.166223783999351e-06, + "loss": 1.3317, + "mean_token_accuracy": 0.6664454191923141, + "num_tokens": 2871441073.0, + "step": 17125 + }, + { + "entropy": 1.7702072660128276, + "epoch": 1.8813819999450714, + "grad_norm": 0.6648865938186646, + "learning_rate": 2.165917620261446e-06, + "loss": 1.5234, + "mean_token_accuracy": 0.645287091533343, + "num_tokens": 2871637167.0, + "step": 17126 + }, + { + "entropy": 1.6745448410511017, + "epoch": 1.8814918568564445, + "grad_norm": 0.7408782243728638, + "learning_rate": 2.1656117361182664e-06, + "loss": 1.2249, + "mean_token_accuracy": 0.6850744038820267, + "num_tokens": 2871766634.0, + "step": 17127 + }, + { + "entropy": 1.6802305380503337, + "epoch": 1.8816017137678174, + "grad_norm": 0.6817216277122498, + "learning_rate": 2.165306131579495e-06, + "loss": 1.3904, + "mean_token_accuracy": 0.6435802976290385, + "num_tokens": 2871954489.0, + "step": 17128 + }, + { + "entropy": 1.7907779812812805, + "epoch": 1.8817115706791903, + "grad_norm": 0.7694052457809448, + "learning_rate": 2.165000806654805e-06, + "loss": 1.5186, + "mean_token_accuracy": 0.645979126294454, + "num_tokens": 2872141193.0, + "step": 17129 + }, + { + "entropy": 1.7915573219458263, + "epoch": 1.8818214275905634, + "grad_norm": 0.7176956534385681, + "learning_rate": 2.1646957613538573e-06, + "loss": 1.4003, + "mean_token_accuracy": 0.6597653726736704, + "num_tokens": 2872323839.0, + "step": 17130 + }, + { + "entropy": 1.6581577956676483, + "epoch": 1.8819312845019363, + "grad_norm": 0.8513138890266418, + "learning_rate": 2.1643909956863064e-06, + "loss": 1.4395, + "mean_token_accuracy": 0.6642699937025706, + "num_tokens": 2872474722.0, + "step": 17131 + }, + { + "entropy": 1.731380472580592, + "epoch": 1.8820411414133091, + "grad_norm": 0.6215800642967224, + "learning_rate": 2.1640865096618006e-06, + "loss": 1.432, + "mean_token_accuracy": 0.6468487431605657, + "num_tokens": 2872681874.0, + "step": 17132 + }, + { + "entropy": 1.674561321735382, + "epoch": 1.8821509983246822, + "grad_norm": 0.6435210108757019, + "learning_rate": 2.1637823032899747e-06, + "loss": 1.3136, + "mean_token_accuracy": 0.6661032090584437, + "num_tokens": 2872841550.0, + "step": 17133 + }, + { + "entropy": 1.7193843921025593, + "epoch": 1.882260855236055, + "grad_norm": 0.7598627209663391, + "learning_rate": 2.163478376580456e-06, + "loss": 1.4469, + "mean_token_accuracy": 0.6524012287457784, + "num_tokens": 2873024920.0, + "step": 17134 + }, + { + "entropy": 1.693623701731364, + "epoch": 1.882370712147428, + "grad_norm": 2.42760968208313, + "learning_rate": 2.1631747295428672e-06, + "loss": 1.3129, + "mean_token_accuracy": 0.6606552849213282, + "num_tokens": 2873190325.0, + "step": 17135 + }, + { + "entropy": 1.7247453530629475, + "epoch": 1.882480569058801, + "grad_norm": 0.5800105333328247, + "learning_rate": 2.1628713621868154e-06, + "loss": 1.538, + "mean_token_accuracy": 0.6374923388163248, + "num_tokens": 2873403679.0, + "step": 17136 + }, + { + "entropy": 1.7465098798274994, + "epoch": 1.8825904259701738, + "grad_norm": 0.9322325587272644, + "learning_rate": 2.1625682745219016e-06, + "loss": 1.3702, + "mean_token_accuracy": 0.6589129120111465, + "num_tokens": 2873575685.0, + "step": 17137 + }, + { + "entropy": 1.7134642004966736, + "epoch": 1.882700282881547, + "grad_norm": 0.6871570348739624, + "learning_rate": 2.1622654665577216e-06, + "loss": 1.4218, + "mean_token_accuracy": 0.6442284633715948, + "num_tokens": 2873756855.0, + "step": 17138 + }, + { + "entropy": 1.7436382969220479, + "epoch": 1.8828101397929196, + "grad_norm": 0.773567795753479, + "learning_rate": 2.1619629383038555e-06, + "loss": 1.4462, + "mean_token_accuracy": 0.6507983406384786, + "num_tokens": 2873937261.0, + "step": 17139 + }, + { + "entropy": 1.691699226697286, + "epoch": 1.8829199967042927, + "grad_norm": 0.6026458740234375, + "learning_rate": 2.1616606897698805e-06, + "loss": 1.5601, + "mean_token_accuracy": 0.6354630514979362, + "num_tokens": 2874124211.0, + "step": 17140 + }, + { + "entropy": 1.7005370358626049, + "epoch": 1.8830298536156655, + "grad_norm": 0.6932726502418518, + "learning_rate": 2.161358720965363e-06, + "loss": 1.5492, + "mean_token_accuracy": 0.6447855283816656, + "num_tokens": 2874316120.0, + "step": 17141 + }, + { + "entropy": 1.67661514878273, + "epoch": 1.8831397105270384, + "grad_norm": 0.7656899690628052, + "learning_rate": 2.1610570318998573e-06, + "loss": 1.4386, + "mean_token_accuracy": 0.6698861916859945, + "num_tokens": 2874435775.0, + "step": 17142 + }, + { + "entropy": 1.6867518723011017, + "epoch": 1.8832495674384115, + "grad_norm": 0.6553547382354736, + "learning_rate": 2.1607556225829144e-06, + "loss": 1.3886, + "mean_token_accuracy": 0.6542109300692877, + "num_tokens": 2874597623.0, + "step": 17143 + }, + { + "entropy": 1.752112736304601, + "epoch": 1.8833594243497844, + "grad_norm": 0.6014690399169922, + "learning_rate": 2.160454493024073e-06, + "loss": 1.4618, + "mean_token_accuracy": 0.6372010310490926, + "num_tokens": 2874826976.0, + "step": 17144 + }, + { + "entropy": 1.7432648241519928, + "epoch": 1.8834692812611573, + "grad_norm": 0.649844229221344, + "learning_rate": 2.1601536432328648e-06, + "loss": 1.3441, + "mean_token_accuracy": 0.6546978851159414, + "num_tokens": 2875003958.0, + "step": 17145 + }, + { + "entropy": 1.7034888068834941, + "epoch": 1.8835791381725304, + "grad_norm": 0.7077612280845642, + "learning_rate": 2.1598530732188087e-06, + "loss": 1.4713, + "mean_token_accuracy": 0.666733592748642, + "num_tokens": 2875139891.0, + "step": 17146 + }, + { + "entropy": 1.7785635590553284, + "epoch": 1.883688995083903, + "grad_norm": 0.686241626739502, + "learning_rate": 2.159552782991421e-06, + "loss": 1.5261, + "mean_token_accuracy": 0.6443983117739359, + "num_tokens": 2875341135.0, + "step": 17147 + }, + { + "entropy": 1.6909163693586986, + "epoch": 1.8837988519952762, + "grad_norm": 0.7089021801948547, + "learning_rate": 2.159252772560204e-06, + "loss": 1.4343, + "mean_token_accuracy": 0.6512851615746816, + "num_tokens": 2875523311.0, + "step": 17148 + }, + { + "entropy": 1.6890226205190022, + "epoch": 1.883908708906649, + "grad_norm": 0.7156793475151062, + "learning_rate": 2.1589530419346515e-06, + "loss": 1.3621, + "mean_token_accuracy": 0.6604212572177252, + "num_tokens": 2875662146.0, + "step": 17149 + }, + { + "entropy": 1.6136377950509389, + "epoch": 1.884018565818022, + "grad_norm": 0.6490492224693298, + "learning_rate": 2.158653591124252e-06, + "loss": 1.3841, + "mean_token_accuracy": 0.6620889157056808, + "num_tokens": 2875843407.0, + "step": 17150 + }, + { + "entropy": 1.791116327047348, + "epoch": 1.884128422729395, + "grad_norm": 0.7462132573127747, + "learning_rate": 2.1583544201384825e-06, + "loss": 1.4494, + "mean_token_accuracy": 0.6622537871201833, + "num_tokens": 2875972410.0, + "step": 17151 + }, + { + "entropy": 1.6781320869922638, + "epoch": 1.884238279640768, + "grad_norm": 0.7043426632881165, + "learning_rate": 2.1580555289868118e-06, + "loss": 1.3202, + "mean_token_accuracy": 0.6793678253889084, + "num_tokens": 2876130756.0, + "step": 17152 + }, + { + "entropy": 1.6705817480882008, + "epoch": 1.8843481365521408, + "grad_norm": 0.6849955916404724, + "learning_rate": 2.1577569176786993e-06, + "loss": 1.2153, + "mean_token_accuracy": 0.6768456598122915, + "num_tokens": 2876253913.0, + "step": 17153 + }, + { + "entropy": 1.7637967069943745, + "epoch": 1.8844579934635137, + "grad_norm": 1.0273441076278687, + "learning_rate": 2.157458586223596e-06, + "loss": 1.3025, + "mean_token_accuracy": 0.6631099134683609, + "num_tokens": 2876388583.0, + "step": 17154 + }, + { + "entropy": 1.6225744386514027, + "epoch": 1.8845678503748866, + "grad_norm": 0.6709175705909729, + "learning_rate": 2.157160534630943e-06, + "loss": 1.2164, + "mean_token_accuracy": 0.6821977148453394, + "num_tokens": 2876505830.0, + "step": 17155 + }, + { + "entropy": 1.7032880385716755, + "epoch": 1.8846777072862597, + "grad_norm": 0.7127795219421387, + "learning_rate": 2.1568627629101753e-06, + "loss": 1.4102, + "mean_token_accuracy": 0.6570267875989279, + "num_tokens": 2876686265.0, + "step": 17156 + }, + { + "entropy": 1.6788496275742848, + "epoch": 1.8847875641976326, + "grad_norm": 0.7283757328987122, + "learning_rate": 2.156565271070716e-06, + "loss": 1.3016, + "mean_token_accuracy": 0.6735827922821045, + "num_tokens": 2876818542.0, + "step": 17157 + }, + { + "entropy": 1.6719705959161122, + "epoch": 1.8848974211090055, + "grad_norm": 0.8454893231391907, + "learning_rate": 2.1562680591219815e-06, + "loss": 1.1964, + "mean_token_accuracy": 0.6822755336761475, + "num_tokens": 2876933270.0, + "step": 17158 + }, + { + "entropy": 1.7461024026076, + "epoch": 1.8850072780203786, + "grad_norm": 0.7794149518013, + "learning_rate": 2.1559711270733765e-06, + "loss": 1.4554, + "mean_token_accuracy": 0.6527780294418335, + "num_tokens": 2877087587.0, + "step": 17159 + }, + { + "entropy": 1.7108294367790222, + "epoch": 1.8851171349317513, + "grad_norm": 0.6386652588844299, + "learning_rate": 2.155674474934301e-06, + "loss": 1.3632, + "mean_token_accuracy": 0.6620263059933981, + "num_tokens": 2877241197.0, + "step": 17160 + }, + { + "entropy": 1.716377208630244, + "epoch": 1.8852269918431244, + "grad_norm": 0.6413389444351196, + "learning_rate": 2.1553781027141433e-06, + "loss": 1.4775, + "mean_token_accuracy": 0.6538679301738739, + "num_tokens": 2877466362.0, + "step": 17161 + }, + { + "entropy": 1.6621573368708293, + "epoch": 1.8853368487544973, + "grad_norm": 0.7208569645881653, + "learning_rate": 2.155082010422283e-06, + "loss": 1.2998, + "mean_token_accuracy": 0.6709053417046865, + "num_tokens": 2877575652.0, + "step": 17162 + }, + { + "entropy": 1.6736730337142944, + "epoch": 1.8854467056658701, + "grad_norm": 0.8033668398857117, + "learning_rate": 2.154786198068091e-06, + "loss": 1.3498, + "mean_token_accuracy": 0.6585894276698431, + "num_tokens": 2877692948.0, + "step": 17163 + }, + { + "entropy": 1.7026234964529674, + "epoch": 1.8855565625772432, + "grad_norm": 0.7073157429695129, + "learning_rate": 2.1544906656609303e-06, + "loss": 1.4006, + "mean_token_accuracy": 0.6467277258634567, + "num_tokens": 2877850196.0, + "step": 17164 + }, + { + "entropy": 1.719588041305542, + "epoch": 1.8856664194886161, + "grad_norm": 0.6670511364936829, + "learning_rate": 2.1541954132101546e-06, + "loss": 1.4866, + "mean_token_accuracy": 0.6505639304717382, + "num_tokens": 2878003561.0, + "step": 17165 + }, + { + "entropy": 1.7602489292621613, + "epoch": 1.885776276399989, + "grad_norm": 0.9589049220085144, + "learning_rate": 2.153900440725107e-06, + "loss": 1.3435, + "mean_token_accuracy": 0.6540907273689905, + "num_tokens": 2878142884.0, + "step": 17166 + }, + { + "entropy": 1.7023918430010478, + "epoch": 1.885886133311362, + "grad_norm": 0.6268293261528015, + "learning_rate": 2.1536057482151253e-06, + "loss": 1.3064, + "mean_token_accuracy": 0.6649014155069987, + "num_tokens": 2878286722.0, + "step": 17167 + }, + { + "entropy": 1.7479477028052013, + "epoch": 1.8859959902227348, + "grad_norm": 0.7205145955085754, + "learning_rate": 2.1533113356895356e-06, + "loss": 1.3166, + "mean_token_accuracy": 0.6679264704386393, + "num_tokens": 2878488440.0, + "step": 17168 + }, + { + "entropy": 1.6543095012505848, + "epoch": 1.886105847134108, + "grad_norm": 0.6612537503242493, + "learning_rate": 2.153017203157655e-06, + "loss": 1.4688, + "mean_token_accuracy": 0.6611438890298208, + "num_tokens": 2878638347.0, + "step": 17169 + }, + { + "entropy": 1.6952688296635945, + "epoch": 1.8862157040454808, + "grad_norm": 0.6854779720306396, + "learning_rate": 2.152723350628793e-06, + "loss": 1.3837, + "mean_token_accuracy": 0.6586264471213022, + "num_tokens": 2878823408.0, + "step": 17170 + }, + { + "entropy": 1.6490706702073414, + "epoch": 1.8863255609568537, + "grad_norm": 0.7439612746238708, + "learning_rate": 2.1524297781122507e-06, + "loss": 1.2378, + "mean_token_accuracy": 0.6832160651683807, + "num_tokens": 2878940613.0, + "step": 17171 + }, + { + "entropy": 1.6276902059714, + "epoch": 1.8864354178682268, + "grad_norm": 0.6118603348731995, + "learning_rate": 2.15213648561732e-06, + "loss": 1.4464, + "mean_token_accuracy": 0.6511234442392985, + "num_tokens": 2879148471.0, + "step": 17172 + }, + { + "entropy": 1.7158278822898865, + "epoch": 1.8865452747795994, + "grad_norm": 0.603689432144165, + "learning_rate": 2.1518434731532815e-06, + "loss": 1.4145, + "mean_token_accuracy": 0.6546343117952347, + "num_tokens": 2879312410.0, + "step": 17173 + }, + { + "entropy": 1.75503213206927, + "epoch": 1.8866551316909725, + "grad_norm": 0.6921542882919312, + "learning_rate": 2.1515507407294096e-06, + "loss": 1.3781, + "mean_token_accuracy": 0.6559339066346487, + "num_tokens": 2879489120.0, + "step": 17174 + }, + { + "entropy": 1.6965388059616089, + "epoch": 1.8867649886023454, + "grad_norm": 0.5904287695884705, + "learning_rate": 2.1512582883549703e-06, + "loss": 1.507, + "mean_token_accuracy": 0.6430360277493795, + "num_tokens": 2879694192.0, + "step": 17175 + }, + { + "entropy": 1.7962828079859416, + "epoch": 1.8868748455137183, + "grad_norm": 0.8660070300102234, + "learning_rate": 2.150966116039219e-06, + "loss": 1.4214, + "mean_token_accuracy": 0.6482437898715337, + "num_tokens": 2879808765.0, + "step": 17176 + }, + { + "entropy": 1.7078647017478943, + "epoch": 1.8869847024250914, + "grad_norm": 0.6933693289756775, + "learning_rate": 2.1506742237914026e-06, + "loss": 1.4655, + "mean_token_accuracy": 0.6365055541197459, + "num_tokens": 2879968191.0, + "step": 17177 + }, + { + "entropy": 1.6872906982898712, + "epoch": 1.8870945593364643, + "grad_norm": 0.6161040663719177, + "learning_rate": 2.1503826116207586e-06, + "loss": 1.4166, + "mean_token_accuracy": 0.6558385094006857, + "num_tokens": 2880131622.0, + "step": 17178 + }, + { + "entropy": 1.75909224152565, + "epoch": 1.8872044162478372, + "grad_norm": 0.658726692199707, + "learning_rate": 2.1500912795365193e-06, + "loss": 1.3921, + "mean_token_accuracy": 0.6491363197565079, + "num_tokens": 2880300886.0, + "step": 17179 + }, + { + "entropy": 1.6820505162080128, + "epoch": 1.88731427315921, + "grad_norm": 0.8096556067466736, + "learning_rate": 2.149800227547902e-06, + "loss": 1.276, + "mean_token_accuracy": 0.6641524781783422, + "num_tokens": 2880485214.0, + "step": 17180 + }, + { + "entropy": 1.7462229331334431, + "epoch": 1.887424130070583, + "grad_norm": 0.7542430758476257, + "learning_rate": 2.1495094556641183e-06, + "loss": 1.5453, + "mean_token_accuracy": 0.6436825742324194, + "num_tokens": 2880688455.0, + "step": 17181 + }, + { + "entropy": 1.6861853897571564, + "epoch": 1.887533986981956, + "grad_norm": 0.7085747122764587, + "learning_rate": 2.149218963894373e-06, + "loss": 1.4594, + "mean_token_accuracy": 0.657812312245369, + "num_tokens": 2880835604.0, + "step": 17182 + }, + { + "entropy": 1.734100381533305, + "epoch": 1.887643843893329, + "grad_norm": 0.6035891175270081, + "learning_rate": 2.148928752247859e-06, + "loss": 1.5958, + "mean_token_accuracy": 0.6254423459370931, + "num_tokens": 2881031539.0, + "step": 17183 + }, + { + "entropy": 1.7015228271484375, + "epoch": 1.8877537008047018, + "grad_norm": 0.6321704983711243, + "learning_rate": 2.148638820733762e-06, + "loss": 1.4446, + "mean_token_accuracy": 0.6493401179711024, + "num_tokens": 2881176601.0, + "step": 17184 + }, + { + "entropy": 1.6736248135566711, + "epoch": 1.887863557716075, + "grad_norm": 0.7702553868293762, + "learning_rate": 2.148349169361259e-06, + "loss": 1.3464, + "mean_token_accuracy": 0.675323560833931, + "num_tokens": 2881349503.0, + "step": 17185 + }, + { + "entropy": 1.7201215823491414, + "epoch": 1.8879734146274476, + "grad_norm": 0.7781912684440613, + "learning_rate": 2.148059798139514e-06, + "loss": 1.4305, + "mean_token_accuracy": 0.6516983310381571, + "num_tokens": 2881490232.0, + "step": 17186 + }, + { + "entropy": 1.7094205915927887, + "epoch": 1.8880832715388207, + "grad_norm": 0.6526473760604858, + "learning_rate": 2.1477707070776883e-06, + "loss": 1.5196, + "mean_token_accuracy": 0.6446429987748464, + "num_tokens": 2881698286.0, + "step": 17187 + }, + { + "entropy": 1.688776175181071, + "epoch": 1.8881931284501936, + "grad_norm": 0.7511041760444641, + "learning_rate": 2.1474818961849316e-06, + "loss": 1.3161, + "mean_token_accuracy": 0.6700662126143774, + "num_tokens": 2881880985.0, + "step": 17188 + }, + { + "entropy": 1.6932948231697083, + "epoch": 1.8883029853615665, + "grad_norm": 0.6438283324241638, + "learning_rate": 2.1471933654703836e-06, + "loss": 1.4265, + "mean_token_accuracy": 0.6442149132490158, + "num_tokens": 2882058048.0, + "step": 17189 + }, + { + "entropy": 1.705348789691925, + "epoch": 1.8884128422729396, + "grad_norm": 0.7522128820419312, + "learning_rate": 2.1469051149431757e-06, + "loss": 1.271, + "mean_token_accuracy": 0.6708547174930573, + "num_tokens": 2882169248.0, + "step": 17190 + }, + { + "entropy": 1.8012695014476776, + "epoch": 1.8885226991843125, + "grad_norm": 0.7312252521514893, + "learning_rate": 2.146617144612432e-06, + "loss": 1.5895, + "mean_token_accuracy": 0.6462727536757787, + "num_tokens": 2882375194.0, + "step": 17191 + }, + { + "entropy": 1.737836887439092, + "epoch": 1.8886325560956854, + "grad_norm": 1.2015639543533325, + "learning_rate": 2.1463294544872667e-06, + "loss": 1.4586, + "mean_token_accuracy": 0.6477744380633036, + "num_tokens": 2882532455.0, + "step": 17192 + }, + { + "entropy": 1.737512121597926, + "epoch": 1.8887424130070583, + "grad_norm": 0.6532018184661865, + "learning_rate": 2.1460420445767836e-06, + "loss": 1.3933, + "mean_token_accuracy": 0.6575685640176138, + "num_tokens": 2882708598.0, + "step": 17193 + }, + { + "entropy": 1.6850264469782512, + "epoch": 1.8888522699184311, + "grad_norm": 0.6517339944839478, + "learning_rate": 2.145754914890081e-06, + "loss": 1.3152, + "mean_token_accuracy": 0.6752770642439524, + "num_tokens": 2882844056.0, + "step": 17194 + }, + { + "entropy": 1.6757459739844005, + "epoch": 1.8889621268298042, + "grad_norm": 0.6307591199874878, + "learning_rate": 2.1454680654362445e-06, + "loss": 1.3338, + "mean_token_accuracy": 0.6622594942649206, + "num_tokens": 2883031026.0, + "step": 17195 + }, + { + "entropy": 1.7201253175735474, + "epoch": 1.8890719837411771, + "grad_norm": 0.740987241268158, + "learning_rate": 2.1451814962243545e-06, + "loss": 1.3998, + "mean_token_accuracy": 0.6528403460979462, + "num_tokens": 2883216632.0, + "step": 17196 + }, + { + "entropy": 1.700130045413971, + "epoch": 1.88918184065255, + "grad_norm": 0.6037099361419678, + "learning_rate": 2.1448952072634807e-06, + "loss": 1.3888, + "mean_token_accuracy": 0.6569363375504812, + "num_tokens": 2883388061.0, + "step": 17197 + }, + { + "entropy": 1.7035789688428242, + "epoch": 1.8892916975639231, + "grad_norm": 0.629531741142273, + "learning_rate": 2.1446091985626818e-06, + "loss": 1.3131, + "mean_token_accuracy": 0.6592882921298345, + "num_tokens": 2883555748.0, + "step": 17198 + }, + { + "entropy": 1.7170901894569397, + "epoch": 1.8894015544752958, + "grad_norm": 0.8567357659339905, + "learning_rate": 2.144323470131012e-06, + "loss": 1.347, + "mean_token_accuracy": 0.6601819346348444, + "num_tokens": 2883707533.0, + "step": 17199 + }, + { + "entropy": 1.6625105639298756, + "epoch": 1.889511411386669, + "grad_norm": 0.7391311526298523, + "learning_rate": 2.144038021977515e-06, + "loss": 1.3399, + "mean_token_accuracy": 0.6689294477303823, + "num_tokens": 2883851556.0, + "step": 17200 + }, + { + "entropy": 1.6819651424884796, + "epoch": 1.8896212682980418, + "grad_norm": 0.7686917185783386, + "learning_rate": 2.143752854111223e-06, + "loss": 1.3399, + "mean_token_accuracy": 0.6758074214061102, + "num_tokens": 2883995505.0, + "step": 17201 + }, + { + "entropy": 1.8181818425655365, + "epoch": 1.8897311252094147, + "grad_norm": 0.7404806613922119, + "learning_rate": 2.1434679665411625e-06, + "loss": 1.487, + "mean_token_accuracy": 0.6463923106590906, + "num_tokens": 2884131386.0, + "step": 17202 + }, + { + "entropy": 1.736276884873708, + "epoch": 1.8898409821207878, + "grad_norm": 0.7244618535041809, + "learning_rate": 2.1431833592763512e-06, + "loss": 1.3495, + "mean_token_accuracy": 0.6554465840260187, + "num_tokens": 2884280001.0, + "step": 17203 + }, + { + "entropy": 1.7070696453253429, + "epoch": 1.8899508390321607, + "grad_norm": 0.6054216623306274, + "learning_rate": 2.1428990323257944e-06, + "loss": 1.4691, + "mean_token_accuracy": 0.6478527784347534, + "num_tokens": 2884483799.0, + "step": 17204 + }, + { + "entropy": 1.673392613728841, + "epoch": 1.8900606959435335, + "grad_norm": 0.6630545258522034, + "learning_rate": 2.1426149856984922e-06, + "loss": 1.2966, + "mean_token_accuracy": 0.681118776400884, + "num_tokens": 2884654832.0, + "step": 17205 + }, + { + "entropy": 1.7095024585723877, + "epoch": 1.8901705528549066, + "grad_norm": 0.633782684803009, + "learning_rate": 2.1423312194034347e-06, + "loss": 1.4286, + "mean_token_accuracy": 0.6514197190602621, + "num_tokens": 2884826182.0, + "step": 17206 + }, + { + "entropy": 1.6941520472367604, + "epoch": 1.8902804097662793, + "grad_norm": 0.772243320941925, + "learning_rate": 2.1420477334496024e-06, + "loss": 1.2764, + "mean_token_accuracy": 0.6747996111710867, + "num_tokens": 2884956841.0, + "step": 17207 + }, + { + "entropy": 1.7718258996804555, + "epoch": 1.8903902666776524, + "grad_norm": 0.8815136551856995, + "learning_rate": 2.141764527845968e-06, + "loss": 1.3896, + "mean_token_accuracy": 0.6521994322538376, + "num_tokens": 2885070369.0, + "step": 17208 + }, + { + "entropy": 1.7424478928248088, + "epoch": 1.8905001235890253, + "grad_norm": 0.6464412212371826, + "learning_rate": 2.141481602601495e-06, + "loss": 1.4169, + "mean_token_accuracy": 0.6564925710360209, + "num_tokens": 2885247084.0, + "step": 17209 + }, + { + "entropy": 1.7533444662888844, + "epoch": 1.8906099805003982, + "grad_norm": 0.6359097361564636, + "learning_rate": 2.1411989577251376e-06, + "loss": 1.4394, + "mean_token_accuracy": 0.6459400206804276, + "num_tokens": 2885433750.0, + "step": 17210 + }, + { + "entropy": 1.804566999276479, + "epoch": 1.8907198374117713, + "grad_norm": 0.7828404903411865, + "learning_rate": 2.1409165932258406e-06, + "loss": 1.523, + "mean_token_accuracy": 0.6601679027080536, + "num_tokens": 2885640946.0, + "step": 17211 + }, + { + "entropy": 1.722624033689499, + "epoch": 1.890829694323144, + "grad_norm": 0.775169849395752, + "learning_rate": 2.1406345091125415e-06, + "loss": 1.549, + "mean_token_accuracy": 0.6515462944904963, + "num_tokens": 2885818456.0, + "step": 17212 + }, + { + "entropy": 1.6952950755755107, + "epoch": 1.890939551234517, + "grad_norm": 0.7390128970146179, + "learning_rate": 2.140352705394169e-06, + "loss": 1.4906, + "mean_token_accuracy": 0.6618280857801437, + "num_tokens": 2885977593.0, + "step": 17213 + }, + { + "entropy": 1.7304560939470928, + "epoch": 1.89104940814589, + "grad_norm": 0.7561904191970825, + "learning_rate": 2.140071182079641e-06, + "loss": 1.2999, + "mean_token_accuracy": 0.6680457144975662, + "num_tokens": 2886109952.0, + "step": 17214 + }, + { + "entropy": 1.7267817457516987, + "epoch": 1.8911592650572628, + "grad_norm": 0.7434021234512329, + "learning_rate": 2.1397899391778666e-06, + "loss": 1.5385, + "mean_token_accuracy": 0.6487158189217249, + "num_tokens": 2886248004.0, + "step": 17215 + }, + { + "entropy": 1.64817480246226, + "epoch": 1.891269121968636, + "grad_norm": 0.5317151546478271, + "learning_rate": 2.139508976697749e-06, + "loss": 1.4407, + "mean_token_accuracy": 0.6526560485363007, + "num_tokens": 2886450875.0, + "step": 17216 + }, + { + "entropy": 1.653537929058075, + "epoch": 1.8913789788800088, + "grad_norm": 0.665301501750946, + "learning_rate": 2.1392282946481794e-06, + "loss": 1.2987, + "mean_token_accuracy": 0.6666727811098099, + "num_tokens": 2886590812.0, + "step": 17217 + }, + { + "entropy": 1.6849933167298634, + "epoch": 1.8914888357913817, + "grad_norm": 0.6797506213188171, + "learning_rate": 2.1389478930380415e-06, + "loss": 1.333, + "mean_token_accuracy": 0.6623549064000448, + "num_tokens": 2886736662.0, + "step": 17218 + }, + { + "entropy": 1.6301970183849335, + "epoch": 1.8915986927027548, + "grad_norm": 0.6078571081161499, + "learning_rate": 2.13866777187621e-06, + "loss": 1.3713, + "mean_token_accuracy": 0.6544771194458008, + "num_tokens": 2886961307.0, + "step": 17219 + }, + { + "entropy": 1.6913349032402039, + "epoch": 1.8917085496141275, + "grad_norm": 0.6399415135383606, + "learning_rate": 2.13838793117155e-06, + "loss": 1.4364, + "mean_token_accuracy": 0.6472468177477518, + "num_tokens": 2887153437.0, + "step": 17220 + }, + { + "entropy": 1.679469347000122, + "epoch": 1.8918184065255006, + "grad_norm": 0.6966269612312317, + "learning_rate": 2.1381083709329195e-06, + "loss": 1.3719, + "mean_token_accuracy": 0.6473198433717092, + "num_tokens": 2887320267.0, + "step": 17221 + }, + { + "entropy": 1.7523421943187714, + "epoch": 1.8919282634368735, + "grad_norm": 0.8340956568717957, + "learning_rate": 2.1378290911691655e-06, + "loss": 1.464, + "mean_token_accuracy": 0.6585601170857748, + "num_tokens": 2887483418.0, + "step": 17222 + }, + { + "entropy": 1.7297047674655914, + "epoch": 1.8920381203482464, + "grad_norm": 0.6574650406837463, + "learning_rate": 2.1375500918891275e-06, + "loss": 1.3611, + "mean_token_accuracy": 0.6614359567562739, + "num_tokens": 2887629911.0, + "step": 17223 + }, + { + "entropy": 1.6866742571194966, + "epoch": 1.8921479772596195, + "grad_norm": 0.6516870856285095, + "learning_rate": 2.1372713731016356e-06, + "loss": 1.3913, + "mean_token_accuracy": 0.6656065583229065, + "num_tokens": 2887773364.0, + "step": 17224 + }, + { + "entropy": 1.7239994208017986, + "epoch": 1.8922578341709921, + "grad_norm": 0.8059686422348022, + "learning_rate": 2.136992934815511e-06, + "loss": 1.3294, + "mean_token_accuracy": 0.6618863890568415, + "num_tokens": 2887901335.0, + "step": 17225 + }, + { + "entropy": 1.7334805130958557, + "epoch": 1.8923676910823652, + "grad_norm": 0.7674765586853027, + "learning_rate": 2.1367147770395665e-06, + "loss": 1.3486, + "mean_token_accuracy": 0.6562761962413788, + "num_tokens": 2888048170.0, + "step": 17226 + }, + { + "entropy": 1.8443331122398376, + "epoch": 1.8924775479937381, + "grad_norm": 0.6038434505462646, + "learning_rate": 2.136436899782605e-06, + "loss": 1.4085, + "mean_token_accuracy": 0.6459904710451762, + "num_tokens": 2888205505.0, + "step": 17227 + }, + { + "entropy": 1.7091451783974965, + "epoch": 1.892587404905111, + "grad_norm": 0.6854003071784973, + "learning_rate": 2.1361593030534218e-06, + "loss": 1.3721, + "mean_token_accuracy": 0.6586553553740183, + "num_tokens": 2888360293.0, + "step": 17228 + }, + { + "entropy": 1.7349936366081238, + "epoch": 1.8926972618164841, + "grad_norm": 0.672595739364624, + "learning_rate": 2.135881986860803e-06, + "loss": 1.3288, + "mean_token_accuracy": 0.6604596922794977, + "num_tokens": 2888475380.0, + "step": 17229 + }, + { + "entropy": 1.7175809343655903, + "epoch": 1.892807118727857, + "grad_norm": 0.7268809080123901, + "learning_rate": 2.1356049512135245e-06, + "loss": 1.2871, + "mean_token_accuracy": 0.6609225074450175, + "num_tokens": 2888622020.0, + "step": 17230 + }, + { + "entropy": 1.721697747707367, + "epoch": 1.89291697563923, + "grad_norm": 1.0464180707931519, + "learning_rate": 2.135328196120354e-06, + "loss": 1.3146, + "mean_token_accuracy": 0.6679658045371374, + "num_tokens": 2888767213.0, + "step": 17231 + }, + { + "entropy": 1.651025613149007, + "epoch": 1.893026832550603, + "grad_norm": 0.5859130620956421, + "learning_rate": 2.135051721590053e-06, + "loss": 1.3748, + "mean_token_accuracy": 0.6555741727352142, + "num_tokens": 2888927693.0, + "step": 17232 + }, + { + "entropy": 1.6868204673131306, + "epoch": 1.8931366894619757, + "grad_norm": 0.7139847874641418, + "learning_rate": 2.1347755276313705e-06, + "loss": 1.3789, + "mean_token_accuracy": 0.6621604611476263, + "num_tokens": 2889100377.0, + "step": 17233 + }, + { + "entropy": 1.7080059349536896, + "epoch": 1.8932465463733488, + "grad_norm": 0.8161289691925049, + "learning_rate": 2.1344996142530466e-06, + "loss": 1.3759, + "mean_token_accuracy": 0.6496799687544504, + "num_tokens": 2889249897.0, + "step": 17234 + }, + { + "entropy": 1.7380808293819427, + "epoch": 1.8933564032847217, + "grad_norm": 0.721257746219635, + "learning_rate": 2.134223981463816e-06, + "loss": 1.588, + "mean_token_accuracy": 0.6347461392482122, + "num_tokens": 2889409243.0, + "step": 17235 + }, + { + "entropy": 1.7111575802167256, + "epoch": 1.8934662601960945, + "grad_norm": 0.6843127608299255, + "learning_rate": 2.133948629272401e-06, + "loss": 1.3835, + "mean_token_accuracy": 0.6749522536993027, + "num_tokens": 2889559550.0, + "step": 17236 + }, + { + "entropy": 1.7137587368488312, + "epoch": 1.8935761171074676, + "grad_norm": 0.7074581384658813, + "learning_rate": 2.133673557687516e-06, + "loss": 1.5246, + "mean_token_accuracy": 0.6346543331940969, + "num_tokens": 2889754498.0, + "step": 17237 + }, + { + "entropy": 1.6512508889039357, + "epoch": 1.8936859740188403, + "grad_norm": 0.7215355634689331, + "learning_rate": 2.1333987667178695e-06, + "loss": 1.2638, + "mean_token_accuracy": 0.6745673269033432, + "num_tokens": 2889878312.0, + "step": 17238 + }, + { + "entropy": 1.7148396968841553, + "epoch": 1.8937958309302134, + "grad_norm": 0.7948116660118103, + "learning_rate": 2.133124256372155e-06, + "loss": 1.3975, + "mean_token_accuracy": 0.6557581474383672, + "num_tokens": 2890036666.0, + "step": 17239 + }, + { + "entropy": 1.7264957229296367, + "epoch": 1.8939056878415863, + "grad_norm": 0.7121086120605469, + "learning_rate": 2.1328500266590625e-06, + "loss": 1.3031, + "mean_token_accuracy": 0.6640142252047857, + "num_tokens": 2890161291.0, + "step": 17240 + }, + { + "entropy": 1.6922083993752797, + "epoch": 1.8940155447529592, + "grad_norm": 0.7959898114204407, + "learning_rate": 2.132576077587272e-06, + "loss": 1.2614, + "mean_token_accuracy": 0.6731408536434174, + "num_tokens": 2890333804.0, + "step": 17241 + }, + { + "entropy": 1.6807706554730732, + "epoch": 1.8941254016643323, + "grad_norm": 0.6071732044219971, + "learning_rate": 2.132302409165452e-06, + "loss": 1.3122, + "mean_token_accuracy": 0.6773058325052261, + "num_tokens": 2890502137.0, + "step": 17242 + }, + { + "entropy": 1.720189521710078, + "epoch": 1.8942352585757052, + "grad_norm": 0.6647835969924927, + "learning_rate": 2.1320290214022642e-06, + "loss": 1.4414, + "mean_token_accuracy": 0.6582275728384653, + "num_tokens": 2890709961.0, + "step": 17243 + }, + { + "entropy": 1.7124016781648, + "epoch": 1.894345115487078, + "grad_norm": 0.6113753318786621, + "learning_rate": 2.1317559143063625e-06, + "loss": 1.4965, + "mean_token_accuracy": 0.6508686641852061, + "num_tokens": 2890904159.0, + "step": 17244 + }, + { + "entropy": 1.7102111279964447, + "epoch": 1.8944549723984512, + "grad_norm": 0.856601893901825, + "learning_rate": 2.1314830878863908e-06, + "loss": 1.1759, + "mean_token_accuracy": 0.6895684152841568, + "num_tokens": 2891022698.0, + "step": 17245 + }, + { + "entropy": 1.7018269002437592, + "epoch": 1.8945648293098238, + "grad_norm": 0.6518383622169495, + "learning_rate": 2.1312105421509827e-06, + "loss": 1.3364, + "mean_token_accuracy": 0.6601632038752238, + "num_tokens": 2891193896.0, + "step": 17246 + }, + { + "entropy": 1.6979057788848877, + "epoch": 1.894674686221197, + "grad_norm": 0.678108274936676, + "learning_rate": 2.130938277108764e-06, + "loss": 1.3862, + "mean_token_accuracy": 0.6634653856356939, + "num_tokens": 2891354151.0, + "step": 17247 + }, + { + "entropy": 1.7425933976968129, + "epoch": 1.8947845431325698, + "grad_norm": 0.8439452052116394, + "learning_rate": 2.1306662927683532e-06, + "loss": 1.5762, + "mean_token_accuracy": 0.6411895950635275, + "num_tokens": 2891509855.0, + "step": 17248 + }, + { + "entropy": 1.6670502225557964, + "epoch": 1.8948944000439427, + "grad_norm": 0.8232260942459106, + "learning_rate": 2.1303945891383575e-06, + "loss": 1.3695, + "mean_token_accuracy": 0.6671279718478521, + "num_tokens": 2891678701.0, + "step": 17249 + }, + { + "entropy": 1.700257698694865, + "epoch": 1.8950042569553158, + "grad_norm": 0.8447266817092896, + "learning_rate": 2.130123166227376e-06, + "loss": 1.3507, + "mean_token_accuracy": 0.6675139367580414, + "num_tokens": 2891855717.0, + "step": 17250 + }, + { + "entropy": 1.717061976591746, + "epoch": 1.8951141138666885, + "grad_norm": 0.6110275983810425, + "learning_rate": 2.129852024043999e-06, + "loss": 1.4106, + "mean_token_accuracy": 0.6613439122835795, + "num_tokens": 2891988808.0, + "step": 17251 + }, + { + "entropy": 1.6792820394039154, + "epoch": 1.8952239707780616, + "grad_norm": 0.709892988204956, + "learning_rate": 2.129581162596809e-06, + "loss": 1.3578, + "mean_token_accuracy": 0.6553207437197367, + "num_tokens": 2892155247.0, + "step": 17252 + }, + { + "entropy": 1.6732084353764851, + "epoch": 1.8953338276894345, + "grad_norm": 0.7051602602005005, + "learning_rate": 2.1293105818943777e-06, + "loss": 1.3955, + "mean_token_accuracy": 0.6673270811637243, + "num_tokens": 2892306122.0, + "step": 17253 + }, + { + "entropy": 1.6753906508286793, + "epoch": 1.8954436846008074, + "grad_norm": 0.6376731991767883, + "learning_rate": 2.1290402819452695e-06, + "loss": 1.394, + "mean_token_accuracy": 0.6547005921602249, + "num_tokens": 2892455330.0, + "step": 17254 + }, + { + "entropy": 1.6851638952891033, + "epoch": 1.8955535415121805, + "grad_norm": 0.7481420636177063, + "learning_rate": 2.1287702627580388e-06, + "loss": 1.4232, + "mean_token_accuracy": 0.6521519323190054, + "num_tokens": 2892625583.0, + "step": 17255 + }, + { + "entropy": 1.7001373370488484, + "epoch": 1.8956633984235534, + "grad_norm": 0.7096436619758606, + "learning_rate": 2.128500524341232e-06, + "loss": 1.4475, + "mean_token_accuracy": 0.648720865448316, + "num_tokens": 2892835053.0, + "step": 17256 + }, + { + "entropy": 1.747383952140808, + "epoch": 1.8957732553349262, + "grad_norm": 0.8015025854110718, + "learning_rate": 2.128231066703387e-06, + "loss": 1.3288, + "mean_token_accuracy": 0.6623203357060751, + "num_tokens": 2892954963.0, + "step": 17257 + }, + { + "entropy": 1.6256678303082783, + "epoch": 1.8958831122462994, + "grad_norm": 0.6793299913406372, + "learning_rate": 2.1279618898530294e-06, + "loss": 1.3693, + "mean_token_accuracy": 0.6596795121828715, + "num_tokens": 2893138868.0, + "step": 17258 + }, + { + "entropy": 1.696552187204361, + "epoch": 1.895992969157672, + "grad_norm": 0.626272976398468, + "learning_rate": 2.1276929937986816e-06, + "loss": 1.4428, + "mean_token_accuracy": 0.6529415895541509, + "num_tokens": 2893319526.0, + "step": 17259 + }, + { + "entropy": 1.7114895482858021, + "epoch": 1.8961028260690451, + "grad_norm": 0.7434846758842468, + "learning_rate": 2.1274243785488514e-06, + "loss": 1.4121, + "mean_token_accuracy": 0.6516354481379191, + "num_tokens": 2893468438.0, + "step": 17260 + }, + { + "entropy": 1.6837340195973713, + "epoch": 1.896212682980418, + "grad_norm": 0.769420325756073, + "learning_rate": 2.1271560441120416e-06, + "loss": 1.2678, + "mean_token_accuracy": 0.6741555829842886, + "num_tokens": 2893605178.0, + "step": 17261 + }, + { + "entropy": 1.686502754688263, + "epoch": 1.896322539891791, + "grad_norm": 0.6285480260848999, + "learning_rate": 2.1268879904967456e-06, + "loss": 1.385, + "mean_token_accuracy": 0.6608720819155375, + "num_tokens": 2893824967.0, + "step": 17262 + }, + { + "entropy": 1.7563516199588776, + "epoch": 1.896432396803164, + "grad_norm": 0.856654942035675, + "learning_rate": 2.1266202177114455e-06, + "loss": 1.36, + "mean_token_accuracy": 0.6687353501717249, + "num_tokens": 2893958076.0, + "step": 17263 + }, + { + "entropy": 1.7702421148618062, + "epoch": 1.8965422537145367, + "grad_norm": 0.6715371608734131, + "learning_rate": 2.1263527257646175e-06, + "loss": 1.4385, + "mean_token_accuracy": 0.6432004123926163, + "num_tokens": 2894165718.0, + "step": 17264 + }, + { + "entropy": 1.7267632186412811, + "epoch": 1.8966521106259098, + "grad_norm": 0.6621989011764526, + "learning_rate": 2.1260855146647278e-06, + "loss": 1.4865, + "mean_token_accuracy": 0.6259207328160604, + "num_tokens": 2894390591.0, + "step": 17265 + }, + { + "entropy": 1.7133949995040894, + "epoch": 1.8967619675372827, + "grad_norm": 0.5840614438056946, + "learning_rate": 2.125818584420232e-06, + "loss": 1.4147, + "mean_token_accuracy": 0.6490117361148199, + "num_tokens": 2894561880.0, + "step": 17266 + }, + { + "entropy": 1.715239018201828, + "epoch": 1.8968718244486555, + "grad_norm": 0.6776023507118225, + "learning_rate": 2.125551935039579e-06, + "loss": 1.4306, + "mean_token_accuracy": 0.6536256372928619, + "num_tokens": 2894752290.0, + "step": 17267 + }, + { + "entropy": 1.6557459632555644, + "epoch": 1.8969816813600286, + "grad_norm": 0.6310899257659912, + "learning_rate": 2.1252855665312084e-06, + "loss": 1.3285, + "mean_token_accuracy": 0.6622317085663477, + "num_tokens": 2894940916.0, + "step": 17268 + }, + { + "entropy": 1.7279532651106517, + "epoch": 1.8970915382714015, + "grad_norm": 0.6145720481872559, + "learning_rate": 2.1250194789035518e-06, + "loss": 1.4792, + "mean_token_accuracy": 0.6527701765298843, + "num_tokens": 2895121962.0, + "step": 17269 + }, + { + "entropy": 1.7092354695002239, + "epoch": 1.8972013951827744, + "grad_norm": 0.6521205902099609, + "learning_rate": 2.1247536721650283e-06, + "loss": 1.4655, + "mean_token_accuracy": 0.6464730401833853, + "num_tokens": 2895276814.0, + "step": 17270 + }, + { + "entropy": 1.7156803806622822, + "epoch": 1.8973112520941475, + "grad_norm": 0.6255699992179871, + "learning_rate": 2.1244881463240525e-06, + "loss": 1.4412, + "mean_token_accuracy": 0.6423149853944778, + "num_tokens": 2895474372.0, + "step": 17271 + }, + { + "entropy": 1.7619168758392334, + "epoch": 1.8974211090055202, + "grad_norm": 0.6236703991889954, + "learning_rate": 2.1242229013890277e-06, + "loss": 1.4911, + "mean_token_accuracy": 0.6392246186733246, + "num_tokens": 2895664516.0, + "step": 17272 + }, + { + "entropy": 1.6980822086334229, + "epoch": 1.8975309659168933, + "grad_norm": 0.8708395957946777, + "learning_rate": 2.1239579373683485e-06, + "loss": 1.2633, + "mean_token_accuracy": 0.6819048374891281, + "num_tokens": 2895813180.0, + "step": 17273 + }, + { + "entropy": 1.6919432481129963, + "epoch": 1.8976408228282662, + "grad_norm": 0.6261935830116272, + "learning_rate": 2.1236932542703996e-06, + "loss": 1.4297, + "mean_token_accuracy": 0.6447741687297821, + "num_tokens": 2895979054.0, + "step": 17274 + }, + { + "entropy": 1.67582102616628, + "epoch": 1.897750679739639, + "grad_norm": 0.6235146522521973, + "learning_rate": 2.1234288521035594e-06, + "loss": 1.5338, + "mean_token_accuracy": 0.660733645160993, + "num_tokens": 2896154771.0, + "step": 17275 + }, + { + "entropy": 1.7051470975081127, + "epoch": 1.8978605366510122, + "grad_norm": 0.7629257440567017, + "learning_rate": 2.1231647308761976e-06, + "loss": 1.3684, + "mean_token_accuracy": 0.6604376882314682, + "num_tokens": 2896305092.0, + "step": 17276 + }, + { + "entropy": 1.7228100895881653, + "epoch": 1.8979703935623848, + "grad_norm": 0.5867047309875488, + "learning_rate": 2.1229008905966725e-06, + "loss": 1.4371, + "mean_token_accuracy": 0.6493526001771291, + "num_tokens": 2896471668.0, + "step": 17277 + }, + { + "entropy": 1.6839666068553925, + "epoch": 1.898080250473758, + "grad_norm": 0.5775178670883179, + "learning_rate": 2.1226373312733327e-06, + "loss": 1.4035, + "mean_token_accuracy": 0.6348609228928884, + "num_tokens": 2896683909.0, + "step": 17278 + }, + { + "entropy": 1.7211569050947826, + "epoch": 1.8981901073851308, + "grad_norm": 0.6018611192703247, + "learning_rate": 2.1223740529145217e-06, + "loss": 1.3831, + "mean_token_accuracy": 0.6495659500360489, + "num_tokens": 2896856820.0, + "step": 17279 + }, + { + "entropy": 1.7634968161582947, + "epoch": 1.8982999642965037, + "grad_norm": 0.6888352632522583, + "learning_rate": 2.1221110555285705e-06, + "loss": 1.3899, + "mean_token_accuracy": 0.6544539431730906, + "num_tokens": 2897043615.0, + "step": 17280 + }, + { + "entropy": 1.7279618481794994, + "epoch": 1.8984098212078768, + "grad_norm": 0.6864139437675476, + "learning_rate": 2.1218483391238056e-06, + "loss": 1.4239, + "mean_token_accuracy": 0.6641011635462443, + "num_tokens": 2897196622.0, + "step": 17281 + }, + { + "entropy": 1.7093985974788666, + "epoch": 1.8985196781192497, + "grad_norm": 0.7060381770133972, + "learning_rate": 2.1215859037085396e-06, + "loss": 1.3693, + "mean_token_accuracy": 0.6542820632457733, + "num_tokens": 2897350359.0, + "step": 17282 + }, + { + "entropy": 1.7037651638189952, + "epoch": 1.8986295350306226, + "grad_norm": 0.5937888026237488, + "learning_rate": 2.121323749291078e-06, + "loss": 1.3839, + "mean_token_accuracy": 0.6539882570505142, + "num_tokens": 2897504289.0, + "step": 17283 + }, + { + "entropy": 1.69916437069575, + "epoch": 1.8987393919419957, + "grad_norm": 0.7481595873832703, + "learning_rate": 2.1210618758797206e-06, + "loss": 1.3909, + "mean_token_accuracy": 0.6524476011594137, + "num_tokens": 2897655290.0, + "step": 17284 + }, + { + "entropy": 1.7476255595684052, + "epoch": 1.8988492488533684, + "grad_norm": 0.8031712174415588, + "learning_rate": 2.1208002834827533e-06, + "loss": 1.4292, + "mean_token_accuracy": 0.6464576125144958, + "num_tokens": 2897853586.0, + "step": 17285 + }, + { + "entropy": 1.6887069741884868, + "epoch": 1.8989591057647415, + "grad_norm": 0.6965251564979553, + "learning_rate": 2.1205389721084556e-06, + "loss": 1.4859, + "mean_token_accuracy": 0.6459216872851054, + "num_tokens": 2898098470.0, + "step": 17286 + }, + { + "entropy": 1.7607675989468892, + "epoch": 1.8990689626761144, + "grad_norm": 0.7362844347953796, + "learning_rate": 2.1202779417650975e-06, + "loss": 1.3171, + "mean_token_accuracy": 0.6696411470572153, + "num_tokens": 2898208883.0, + "step": 17287 + }, + { + "entropy": 1.6586427589257557, + "epoch": 1.8991788195874872, + "grad_norm": 0.5607314705848694, + "learning_rate": 2.120017192460943e-06, + "loss": 1.2891, + "mean_token_accuracy": 0.6717559099197388, + "num_tokens": 2898361274.0, + "step": 17288 + }, + { + "entropy": 1.741514007250468, + "epoch": 1.8992886764988604, + "grad_norm": 0.7974133491516113, + "learning_rate": 2.119756724204242e-06, + "loss": 1.2662, + "mean_token_accuracy": 0.6750031113624573, + "num_tokens": 2898462903.0, + "step": 17289 + }, + { + "entropy": 1.6692986885706584, + "epoch": 1.899398533410233, + "grad_norm": 0.6954519748687744, + "learning_rate": 2.1194965370032384e-06, + "loss": 1.2814, + "mean_token_accuracy": 0.6717743625243505, + "num_tokens": 2898658678.0, + "step": 17290 + }, + { + "entropy": 1.6970256865024567, + "epoch": 1.8995083903216061, + "grad_norm": 0.9171543121337891, + "learning_rate": 2.119236630866169e-06, + "loss": 1.344, + "mean_token_accuracy": 0.6659214198589325, + "num_tokens": 2898834450.0, + "step": 17291 + }, + { + "entropy": 1.7186993459860485, + "epoch": 1.899618247232979, + "grad_norm": 0.694511353969574, + "learning_rate": 2.1189770058012575e-06, + "loss": 1.5021, + "mean_token_accuracy": 0.6403380235036215, + "num_tokens": 2899016730.0, + "step": 17292 + }, + { + "entropy": 1.7705208857854207, + "epoch": 1.899728104144352, + "grad_norm": 0.65798020362854, + "learning_rate": 2.118717661816723e-06, + "loss": 1.3265, + "mean_token_accuracy": 0.6581917802492777, + "num_tokens": 2899142112.0, + "step": 17293 + }, + { + "entropy": 1.6541693210601807, + "epoch": 1.899837961055725, + "grad_norm": 0.5420558452606201, + "learning_rate": 2.1184585989207723e-06, + "loss": 1.0458, + "mean_token_accuracy": 0.6923287163178126, + "num_tokens": 2899325969.0, + "step": 17294 + }, + { + "entropy": 1.7057847082614899, + "epoch": 1.8999478179670979, + "grad_norm": 0.6451363563537598, + "learning_rate": 2.118199817121604e-06, + "loss": 1.3285, + "mean_token_accuracy": 0.6623325844605764, + "num_tokens": 2899500885.0, + "step": 17295 + }, + { + "entropy": 1.6700185736020405, + "epoch": 1.9000576748784708, + "grad_norm": 0.7210705280303955, + "learning_rate": 2.1179413164274095e-06, + "loss": 1.4972, + "mean_token_accuracy": 0.6420283913612366, + "num_tokens": 2899716939.0, + "step": 17296 + }, + { + "entropy": 1.685696393251419, + "epoch": 1.9001675317898439, + "grad_norm": 0.6625770926475525, + "learning_rate": 2.117683096846371e-06, + "loss": 1.5663, + "mean_token_accuracy": 0.6434602290391922, + "num_tokens": 2899951366.0, + "step": 17297 + }, + { + "entropy": 1.6919982035954793, + "epoch": 1.9002773887012165, + "grad_norm": 0.977072536945343, + "learning_rate": 2.117425158386659e-06, + "loss": 1.395, + "mean_token_accuracy": 0.6762153804302216, + "num_tokens": 2900112592.0, + "step": 17298 + }, + { + "entropy": 1.6969668567180634, + "epoch": 1.9003872456125896, + "grad_norm": 0.7278903722763062, + "learning_rate": 2.1171675010564374e-06, + "loss": 1.5714, + "mean_token_accuracy": 0.6418847143650055, + "num_tokens": 2900304765.0, + "step": 17299 + }, + { + "entropy": 1.6995855470498402, + "epoch": 1.9004971025239625, + "grad_norm": 0.7209764122962952, + "learning_rate": 2.116910124863863e-06, + "loss": 1.269, + "mean_token_accuracy": 0.6693304081757864, + "num_tokens": 2900414824.0, + "step": 17300 + }, + { + "entropy": 1.6842413942019145, + "epoch": 1.9006069594353354, + "grad_norm": 0.6563640236854553, + "learning_rate": 2.1166530298170803e-06, + "loss": 1.5027, + "mean_token_accuracy": 0.6496880004803339, + "num_tokens": 2900593430.0, + "step": 17301 + }, + { + "entropy": 1.692998468875885, + "epoch": 1.9007168163467085, + "grad_norm": 0.6235043406486511, + "learning_rate": 2.1163962159242257e-06, + "loss": 1.3154, + "mean_token_accuracy": 0.6697773188352585, + "num_tokens": 2900747780.0, + "step": 17302 + }, + { + "entropy": 1.6582288543383281, + "epoch": 1.9008266732580812, + "grad_norm": 0.6133668422698975, + "learning_rate": 2.1161396831934276e-06, + "loss": 1.3683, + "mean_token_accuracy": 0.6610623051722845, + "num_tokens": 2900929126.0, + "step": 17303 + }, + { + "entropy": 1.696847399075826, + "epoch": 1.9009365301694543, + "grad_norm": 0.7132181525230408, + "learning_rate": 2.1158834316328057e-06, + "loss": 1.3959, + "mean_token_accuracy": 0.6614312728246053, + "num_tokens": 2901113026.0, + "step": 17304 + }, + { + "entropy": 1.7614192068576813, + "epoch": 1.9010463870808272, + "grad_norm": 0.666845440864563, + "learning_rate": 2.1156274612504707e-06, + "loss": 1.6487, + "mean_token_accuracy": 0.613468810915947, + "num_tokens": 2901324692.0, + "step": 17305 + }, + { + "entropy": 1.6704783340295155, + "epoch": 1.9011562439922, + "grad_norm": 0.5894295573234558, + "learning_rate": 2.115371772054523e-06, + "loss": 1.458, + "mean_token_accuracy": 0.6370960672696432, + "num_tokens": 2901536912.0, + "step": 17306 + }, + { + "entropy": 1.739585965871811, + "epoch": 1.9012661009035732, + "grad_norm": 0.8946515321731567, + "learning_rate": 2.115116364053054e-06, + "loss": 1.4145, + "mean_token_accuracy": 0.6544796675443649, + "num_tokens": 2901672711.0, + "step": 17307 + }, + { + "entropy": 1.7094165285428364, + "epoch": 1.901375957814946, + "grad_norm": 0.753603994846344, + "learning_rate": 2.1148612372541494e-06, + "loss": 1.4601, + "mean_token_accuracy": 0.6673130393028259, + "num_tokens": 2901797776.0, + "step": 17308 + }, + { + "entropy": 1.7277058760325115, + "epoch": 1.901485814726319, + "grad_norm": 0.6706650257110596, + "learning_rate": 2.114606391665883e-06, + "loss": 1.4088, + "mean_token_accuracy": 0.6417536189158758, + "num_tokens": 2901976272.0, + "step": 17309 + }, + { + "entropy": 1.7154739300409954, + "epoch": 1.901595671637692, + "grad_norm": 0.7305334210395813, + "learning_rate": 2.114351827296319e-06, + "loss": 1.3598, + "mean_token_accuracy": 0.6586563885211945, + "num_tokens": 2902107800.0, + "step": 17310 + }, + { + "entropy": 1.6918523510297139, + "epoch": 1.9017055285490647, + "grad_norm": 0.6615371108055115, + "learning_rate": 2.1140975441535173e-06, + "loss": 1.4371, + "mean_token_accuracy": 0.6612310359875361, + "num_tokens": 2902296754.0, + "step": 17311 + }, + { + "entropy": 1.7673652370770772, + "epoch": 1.9018153854604378, + "grad_norm": 0.7220476865768433, + "learning_rate": 2.1138435422455237e-06, + "loss": 1.4816, + "mean_token_accuracy": 0.6466121921936671, + "num_tokens": 2902468097.0, + "step": 17312 + }, + { + "entropy": 1.7256468534469604, + "epoch": 1.9019252423718107, + "grad_norm": 0.7970626354217529, + "learning_rate": 2.113589821580378e-06, + "loss": 1.2991, + "mean_token_accuracy": 0.6718382885058721, + "num_tokens": 2902588100.0, + "step": 17313 + }, + { + "entropy": 1.682048757870992, + "epoch": 1.9020350992831836, + "grad_norm": 0.578278660774231, + "learning_rate": 2.1133363821661097e-06, + "loss": 1.4743, + "mean_token_accuracy": 0.6461637963851293, + "num_tokens": 2902759596.0, + "step": 17314 + }, + { + "entropy": 1.7191002070903778, + "epoch": 1.9021449561945567, + "grad_norm": 0.6966040134429932, + "learning_rate": 2.113083224010741e-06, + "loss": 1.3934, + "mean_token_accuracy": 0.6677719354629517, + "num_tokens": 2902921113.0, + "step": 17315 + }, + { + "entropy": 1.7440748512744904, + "epoch": 1.9022548131059294, + "grad_norm": 0.9760518074035645, + "learning_rate": 2.112830347122284e-06, + "loss": 1.4498, + "mean_token_accuracy": 0.6512205849091212, + "num_tokens": 2903037716.0, + "step": 17316 + }, + { + "entropy": 1.709779401620229, + "epoch": 1.9023646700173025, + "grad_norm": 0.6506473422050476, + "learning_rate": 2.1125777515087405e-06, + "loss": 1.4735, + "mean_token_accuracy": 0.6514460444450378, + "num_tokens": 2903208041.0, + "step": 17317 + }, + { + "entropy": 1.7440835038820903, + "epoch": 1.9024745269286754, + "grad_norm": 0.6512756943702698, + "learning_rate": 2.1123254371781072e-06, + "loss": 1.408, + "mean_token_accuracy": 0.6547217865784963, + "num_tokens": 2903370726.0, + "step": 17318 + }, + { + "entropy": 1.6938308576742809, + "epoch": 1.9025843838400482, + "grad_norm": 0.7320882081985474, + "learning_rate": 2.1120734041383693e-06, + "loss": 1.3902, + "mean_token_accuracy": 0.6558147321144739, + "num_tokens": 2903533604.0, + "step": 17319 + }, + { + "entropy": 1.7045827706654866, + "epoch": 1.9026942407514214, + "grad_norm": 0.6136589050292969, + "learning_rate": 2.1118216523975033e-06, + "loss": 1.3388, + "mean_token_accuracy": 0.667515754699707, + "num_tokens": 2903687713.0, + "step": 17320 + }, + { + "entropy": 1.6572574277718861, + "epoch": 1.9028040976627942, + "grad_norm": 0.642920196056366, + "learning_rate": 2.111570181963476e-06, + "loss": 1.2703, + "mean_token_accuracy": 0.6777136772871017, + "num_tokens": 2903848962.0, + "step": 17321 + }, + { + "entropy": 1.6885874370733898, + "epoch": 1.9029139545741671, + "grad_norm": 0.6778194904327393, + "learning_rate": 2.1113189928442474e-06, + "loss": 1.4339, + "mean_token_accuracy": 0.6557305157184601, + "num_tokens": 2904026770.0, + "step": 17322 + }, + { + "entropy": 1.7227852642536163, + "epoch": 1.9030238114855402, + "grad_norm": 0.7432954907417297, + "learning_rate": 2.1110680850477677e-06, + "loss": 1.2885, + "mean_token_accuracy": 0.6714919358491898, + "num_tokens": 2904155217.0, + "step": 17323 + }, + { + "entropy": 1.632401277621587, + "epoch": 1.903133668396913, + "grad_norm": 0.6605057716369629, + "learning_rate": 2.1108174585819766e-06, + "loss": 1.4722, + "mean_token_accuracy": 0.6337202688058218, + "num_tokens": 2904366981.0, + "step": 17324 + }, + { + "entropy": 1.7007041573524475, + "epoch": 1.903243525308286, + "grad_norm": 0.7226251363754272, + "learning_rate": 2.1105671134548095e-06, + "loss": 1.4875, + "mean_token_accuracy": 0.652341494957606, + "num_tokens": 2904540754.0, + "step": 17325 + }, + { + "entropy": 1.6867063740889232, + "epoch": 1.9033533822196589, + "grad_norm": 1.4981513023376465, + "learning_rate": 2.110317049674186e-06, + "loss": 1.2325, + "mean_token_accuracy": 0.6909955491622289, + "num_tokens": 2904719674.0, + "step": 17326 + }, + { + "entropy": 1.6712439060211182, + "epoch": 1.9034632391310318, + "grad_norm": 0.717866837978363, + "learning_rate": 2.110067267248022e-06, + "loss": 1.3748, + "mean_token_accuracy": 0.6473964601755142, + "num_tokens": 2904849892.0, + "step": 17327 + }, + { + "entropy": 1.626634528239568, + "epoch": 1.9035730960424049, + "grad_norm": 0.8814289569854736, + "learning_rate": 2.109817766184224e-06, + "loss": 1.3151, + "mean_token_accuracy": 0.6663583666086197, + "num_tokens": 2905045049.0, + "step": 17328 + }, + { + "entropy": 1.7585350374380748, + "epoch": 1.9036829529537775, + "grad_norm": 0.6172336339950562, + "learning_rate": 2.1095685464906867e-06, + "loss": 1.3216, + "mean_token_accuracy": 0.6797197361787161, + "num_tokens": 2905187276.0, + "step": 17329 + }, + { + "entropy": 1.7158283491929371, + "epoch": 1.9037928098651506, + "grad_norm": 0.6529079079627991, + "learning_rate": 2.1093196081753005e-06, + "loss": 1.4517, + "mean_token_accuracy": 0.646214579542478, + "num_tokens": 2905407433.0, + "step": 17330 + }, + { + "entropy": 1.6329115728537242, + "epoch": 1.9039026667765235, + "grad_norm": 0.6197842359542847, + "learning_rate": 2.1090709512459403e-06, + "loss": 1.4375, + "mean_token_accuracy": 0.6615285128355026, + "num_tokens": 2905645926.0, + "step": 17331 + }, + { + "entropy": 1.7400815387566884, + "epoch": 1.9040125236878964, + "grad_norm": 0.7766255736351013, + "learning_rate": 2.1088225757104797e-06, + "loss": 1.3882, + "mean_token_accuracy": 0.6602704524993896, + "num_tokens": 2905831337.0, + "step": 17332 + }, + { + "entropy": 1.6611623167991638, + "epoch": 1.9041223805992695, + "grad_norm": 0.8019027709960938, + "learning_rate": 2.108574481576778e-06, + "loss": 1.372, + "mean_token_accuracy": 0.6643891384204229, + "num_tokens": 2905988768.0, + "step": 17333 + }, + { + "entropy": 1.6944166123867035, + "epoch": 1.9042322375106424, + "grad_norm": 0.6995406150817871, + "learning_rate": 2.1083266688526864e-06, + "loss": 1.3526, + "mean_token_accuracy": 0.6618950814008713, + "num_tokens": 2906191608.0, + "step": 17334 + }, + { + "entropy": 1.7486229836940765, + "epoch": 1.9043420944220153, + "grad_norm": 0.7345391511917114, + "learning_rate": 2.1080791375460497e-06, + "loss": 1.3983, + "mean_token_accuracy": 0.661010871330897, + "num_tokens": 2906317489.0, + "step": 17335 + }, + { + "entropy": 1.6621031761169434, + "epoch": 1.9044519513333884, + "grad_norm": 0.6560261845588684, + "learning_rate": 2.1078318876647008e-06, + "loss": 1.2522, + "mean_token_accuracy": 0.6751103301843008, + "num_tokens": 2906467343.0, + "step": 17336 + }, + { + "entropy": 1.7779687742392223, + "epoch": 1.904561808244761, + "grad_norm": 0.6154184937477112, + "learning_rate": 2.107584919216467e-06, + "loss": 1.5128, + "mean_token_accuracy": 0.6438574666778246, + "num_tokens": 2906658202.0, + "step": 17337 + }, + { + "entropy": 1.8077290554841359, + "epoch": 1.9046716651561342, + "grad_norm": 0.7357187867164612, + "learning_rate": 2.1073382322091633e-06, + "loss": 1.5264, + "mean_token_accuracy": 0.6432707210381826, + "num_tokens": 2906818347.0, + "step": 17338 + }, + { + "entropy": 1.7658953468004863, + "epoch": 1.904781522067507, + "grad_norm": 0.615322470664978, + "learning_rate": 2.107091826650596e-06, + "loss": 1.4196, + "mean_token_accuracy": 0.6580002655585607, + "num_tokens": 2906975280.0, + "step": 17339 + }, + { + "entropy": 1.7291893462340038, + "epoch": 1.90489137897888, + "grad_norm": 0.6034359931945801, + "learning_rate": 2.106845702548567e-06, + "loss": 1.6408, + "mean_token_accuracy": 0.6236142565806707, + "num_tokens": 2907236930.0, + "step": 17340 + }, + { + "entropy": 1.6925783356030781, + "epoch": 1.905001235890253, + "grad_norm": 0.6417155861854553, + "learning_rate": 2.1065998599108627e-06, + "loss": 1.5223, + "mean_token_accuracy": 0.631607269247373, + "num_tokens": 2907449207.0, + "step": 17341 + }, + { + "entropy": 1.6567042768001556, + "epoch": 1.9051110928016257, + "grad_norm": 0.6812300682067871, + "learning_rate": 2.106354298745266e-06, + "loss": 1.3955, + "mean_token_accuracy": 0.6692457795143127, + "num_tokens": 2907609264.0, + "step": 17342 + }, + { + "entropy": 1.707110603650411, + "epoch": 1.9052209497129988, + "grad_norm": 0.59761643409729, + "learning_rate": 2.1061090190595484e-06, + "loss": 1.5329, + "mean_token_accuracy": 0.6419897625843684, + "num_tokens": 2907816900.0, + "step": 17343 + }, + { + "entropy": 1.7284224132696788, + "epoch": 1.9053308066243717, + "grad_norm": 0.7119576930999756, + "learning_rate": 2.1058640208614723e-06, + "loss": 1.3361, + "mean_token_accuracy": 0.6653526822725931, + "num_tokens": 2907986068.0, + "step": 17344 + }, + { + "entropy": 1.69676540295283, + "epoch": 1.9054406635357446, + "grad_norm": 0.7282069325447083, + "learning_rate": 2.1056193041587924e-06, + "loss": 1.2982, + "mean_token_accuracy": 0.6773037711779276, + "num_tokens": 2908134262.0, + "step": 17345 + }, + { + "entropy": 1.678769161303838, + "epoch": 1.9055505204471177, + "grad_norm": 0.6445490717887878, + "learning_rate": 2.105374868959253e-06, + "loss": 1.3609, + "mean_token_accuracy": 0.6605921387672424, + "num_tokens": 2908292260.0, + "step": 17346 + }, + { + "entropy": 1.7496456503868103, + "epoch": 1.9056603773584906, + "grad_norm": 0.8550807237625122, + "learning_rate": 2.105130715270591e-06, + "loss": 1.3914, + "mean_token_accuracy": 0.6650789976119995, + "num_tokens": 2908453413.0, + "step": 17347 + }, + { + "entropy": 1.743078887462616, + "epoch": 1.9057702342698635, + "grad_norm": 0.747218906879425, + "learning_rate": 2.104886843100534e-06, + "loss": 1.4484, + "mean_token_accuracy": 0.6520692358414332, + "num_tokens": 2908624898.0, + "step": 17348 + }, + { + "entropy": 1.767656107743581, + "epoch": 1.9058800911812366, + "grad_norm": 0.6700419783592224, + "learning_rate": 2.104643252456801e-06, + "loss": 1.498, + "mean_token_accuracy": 0.6458103656768799, + "num_tokens": 2908776179.0, + "step": 17349 + }, + { + "entropy": 1.7707056005795796, + "epoch": 1.9059899480926092, + "grad_norm": 0.7064527869224548, + "learning_rate": 2.1043999433471006e-06, + "loss": 1.3796, + "mean_token_accuracy": 0.6523715605338415, + "num_tokens": 2908908378.0, + "step": 17350 + }, + { + "entropy": 1.7640493313471477, + "epoch": 1.9060998050039824, + "grad_norm": 0.69893479347229, + "learning_rate": 2.1041569157791325e-06, + "loss": 1.356, + "mean_token_accuracy": 0.6554395059744517, + "num_tokens": 2909045131.0, + "step": 17351 + }, + { + "entropy": 1.7937356928984325, + "epoch": 1.9062096619153552, + "grad_norm": 0.7464211583137512, + "learning_rate": 2.10391416976059e-06, + "loss": 1.4799, + "mean_token_accuracy": 0.6518764893213908, + "num_tokens": 2909200452.0, + "step": 17352 + }, + { + "entropy": 1.7353100379308064, + "epoch": 1.9063195188267281, + "grad_norm": 0.6664573550224304, + "learning_rate": 2.103671705299156e-06, + "loss": 1.4749, + "mean_token_accuracy": 0.6581431378920873, + "num_tokens": 2909359220.0, + "step": 17353 + }, + { + "entropy": 1.7203071018060048, + "epoch": 1.9064293757381012, + "grad_norm": 0.7875437140464783, + "learning_rate": 2.103429522402502e-06, + "loss": 1.5624, + "mean_token_accuracy": 0.6487771024306616, + "num_tokens": 2909556852.0, + "step": 17354 + }, + { + "entropy": 1.6935315628846486, + "epoch": 1.9065392326494741, + "grad_norm": 0.6662817001342773, + "learning_rate": 2.1031876210782954e-06, + "loss": 1.2577, + "mean_token_accuracy": 0.6786133150259653, + "num_tokens": 2909700221.0, + "step": 17355 + }, + { + "entropy": 1.670400321483612, + "epoch": 1.906649089560847, + "grad_norm": 0.6516803503036499, + "learning_rate": 2.1029460013341927e-06, + "loss": 1.4057, + "mean_token_accuracy": 0.6678726325432459, + "num_tokens": 2909895492.0, + "step": 17356 + }, + { + "entropy": 1.6352357765038807, + "epoch": 1.9067589464722199, + "grad_norm": 0.6111500263214111, + "learning_rate": 2.1027046631778395e-06, + "loss": 1.3753, + "mean_token_accuracy": 0.6627530604600906, + "num_tokens": 2910096151.0, + "step": 17357 + }, + { + "entropy": 1.6735480030377705, + "epoch": 1.9068688033835928, + "grad_norm": 0.6414183974266052, + "learning_rate": 2.1024636066168734e-06, + "loss": 1.4689, + "mean_token_accuracy": 0.6363677581151327, + "num_tokens": 2910260269.0, + "step": 17358 + }, + { + "entropy": 1.7215021948019664, + "epoch": 1.9069786602949659, + "grad_norm": 0.6086724996566772, + "learning_rate": 2.102222831658926e-06, + "loss": 1.4963, + "mean_token_accuracy": 0.6482528100411097, + "num_tokens": 2910447473.0, + "step": 17359 + }, + { + "entropy": 1.642045497894287, + "epoch": 1.9070885172063388, + "grad_norm": 0.7172538042068481, + "learning_rate": 2.1019823383116163e-06, + "loss": 1.4312, + "mean_token_accuracy": 0.6497194568316141, + "num_tokens": 2910609340.0, + "step": 17360 + }, + { + "entropy": 1.710667649904887, + "epoch": 1.9071983741177116, + "grad_norm": 0.8276099562644958, + "learning_rate": 2.1017421265825557e-06, + "loss": 1.3401, + "mean_token_accuracy": 0.6725934545199076, + "num_tokens": 2910731278.0, + "step": 17361 + }, + { + "entropy": 1.733361969391505, + "epoch": 1.9073082310290848, + "grad_norm": 0.7834944128990173, + "learning_rate": 2.101502196479348e-06, + "loss": 1.3967, + "mean_token_accuracy": 0.658065527677536, + "num_tokens": 2910897871.0, + "step": 17362 + }, + { + "entropy": 1.7039579351743062, + "epoch": 1.9074180879404574, + "grad_norm": 0.7555103302001953, + "learning_rate": 2.1012625480095844e-06, + "loss": 1.3922, + "mean_token_accuracy": 0.6700689966479937, + "num_tokens": 2911030541.0, + "step": 17363 + }, + { + "entropy": 1.7056597967942555, + "epoch": 1.9075279448518305, + "grad_norm": 0.6581346988677979, + "learning_rate": 2.1010231811808534e-06, + "loss": 1.4952, + "mean_token_accuracy": 0.6401078750689825, + "num_tokens": 2911278929.0, + "step": 17364 + }, + { + "entropy": 1.6974429786205292, + "epoch": 1.9076378017632034, + "grad_norm": 0.573140561580658, + "learning_rate": 2.1007840960007274e-06, + "loss": 1.5212, + "mean_token_accuracy": 0.6344274332125982, + "num_tokens": 2911502264.0, + "step": 17365 + }, + { + "entropy": 1.679697851339976, + "epoch": 1.9077476586745763, + "grad_norm": 0.7497307658195496, + "learning_rate": 2.1005452924767745e-06, + "loss": 1.3362, + "mean_token_accuracy": 0.659656897187233, + "num_tokens": 2911637934.0, + "step": 17366 + }, + { + "entropy": 1.6486320694287617, + "epoch": 1.9078575155859494, + "grad_norm": 0.6654718518257141, + "learning_rate": 2.1003067706165534e-06, + "loss": 1.4975, + "mean_token_accuracy": 0.6403248061736425, + "num_tokens": 2911856512.0, + "step": 17367 + }, + { + "entropy": 1.712807983160019, + "epoch": 1.9079673724973223, + "grad_norm": 0.7178590297698975, + "learning_rate": 2.1000685304276123e-06, + "loss": 1.258, + "mean_token_accuracy": 0.6742016822099686, + "num_tokens": 2911978469.0, + "step": 17368 + }, + { + "entropy": 1.699036826690038, + "epoch": 1.9080772294086952, + "grad_norm": 0.7930957674980164, + "learning_rate": 2.0998305719174924e-06, + "loss": 1.3113, + "mean_token_accuracy": 0.6729649156332016, + "num_tokens": 2912136257.0, + "step": 17369 + }, + { + "entropy": 1.764205386241277, + "epoch": 1.908187086320068, + "grad_norm": 0.7400210499763489, + "learning_rate": 2.0995928950937237e-06, + "loss": 1.6815, + "mean_token_accuracy": 0.6208955893913904, + "num_tokens": 2912335968.0, + "step": 17370 + }, + { + "entropy": 1.6935129761695862, + "epoch": 1.908296943231441, + "grad_norm": 0.696378767490387, + "learning_rate": 2.09935549996383e-06, + "loss": 1.3272, + "mean_token_accuracy": 0.6687455127636591, + "num_tokens": 2912508242.0, + "step": 17371 + }, + { + "entropy": 1.725334147612254, + "epoch": 1.908406800142814, + "grad_norm": 0.7115861773490906, + "learning_rate": 2.099118386535323e-06, + "loss": 1.4895, + "mean_token_accuracy": 0.6452667613824209, + "num_tokens": 2912694485.0, + "step": 17372 + }, + { + "entropy": 1.714306155840556, + "epoch": 1.908516657054187, + "grad_norm": 0.7995036840438843, + "learning_rate": 2.09888155481571e-06, + "loss": 1.2769, + "mean_token_accuracy": 0.6667713671922684, + "num_tokens": 2912828341.0, + "step": 17373 + }, + { + "entropy": 1.6857503950595856, + "epoch": 1.9086265139655598, + "grad_norm": 0.7703235745429993, + "learning_rate": 2.0986450048124836e-06, + "loss": 1.4397, + "mean_token_accuracy": 0.654585580031077, + "num_tokens": 2912996677.0, + "step": 17374 + }, + { + "entropy": 1.6876949568589528, + "epoch": 1.908736370876933, + "grad_norm": 0.778724193572998, + "learning_rate": 2.0984087365331315e-06, + "loss": 1.501, + "mean_token_accuracy": 0.6460902194182078, + "num_tokens": 2913167483.0, + "step": 17375 + }, + { + "entropy": 1.6780872146288555, + "epoch": 1.9088462277883056, + "grad_norm": 0.6744540929794312, + "learning_rate": 2.0981727499851326e-06, + "loss": 1.5221, + "mean_token_accuracy": 0.6462369163831075, + "num_tokens": 2913400005.0, + "step": 17376 + }, + { + "entropy": 1.6881005962689717, + "epoch": 1.9089560846996787, + "grad_norm": 0.6781927347183228, + "learning_rate": 2.097937045175954e-06, + "loss": 1.4824, + "mean_token_accuracy": 0.6568097323179245, + "num_tokens": 2913569420.0, + "step": 17377 + }, + { + "entropy": 1.7392517030239105, + "epoch": 1.9090659416110516, + "grad_norm": 0.6244411468505859, + "learning_rate": 2.0977016221130565e-06, + "loss": 1.5132, + "mean_token_accuracy": 0.6272874772548676, + "num_tokens": 2913830881.0, + "step": 17378 + }, + { + "entropy": 1.6990625858306885, + "epoch": 1.9091757985224245, + "grad_norm": 0.7350092530250549, + "learning_rate": 2.097466480803892e-06, + "loss": 1.3652, + "mean_token_accuracy": 0.6684706459442774, + "num_tokens": 2914056190.0, + "step": 17379 + }, + { + "entropy": 1.7045234441757202, + "epoch": 1.9092856554337976, + "grad_norm": 0.7276471257209778, + "learning_rate": 2.097231621255901e-06, + "loss": 1.3865, + "mean_token_accuracy": 0.6614778786897659, + "num_tokens": 2914209787.0, + "step": 17380 + }, + { + "entropy": 1.6718364854653676, + "epoch": 1.9093955123451705, + "grad_norm": 0.7682698369026184, + "learning_rate": 2.096997043476519e-06, + "loss": 1.3389, + "mean_token_accuracy": 0.6800911873579025, + "num_tokens": 2914413262.0, + "step": 17381 + }, + { + "entropy": 1.7149316171805065, + "epoch": 1.9095053692565434, + "grad_norm": 0.7124384045600891, + "learning_rate": 2.096762747473168e-06, + "loss": 1.4091, + "mean_token_accuracy": 0.6653865824143091, + "num_tokens": 2914541576.0, + "step": 17382 + }, + { + "entropy": 1.6883401771386464, + "epoch": 1.9096152261679162, + "grad_norm": 0.6308038234710693, + "learning_rate": 2.0965287332532634e-06, + "loss": 1.3913, + "mean_token_accuracy": 0.6438700606425604, + "num_tokens": 2914680841.0, + "step": 17383 + }, + { + "entropy": 1.7368865112463634, + "epoch": 1.9097250830792891, + "grad_norm": 0.6777101755142212, + "learning_rate": 2.0962950008242124e-06, + "loss": 1.3046, + "mean_token_accuracy": 0.6656106561422348, + "num_tokens": 2914857873.0, + "step": 17384 + }, + { + "entropy": 1.673662155866623, + "epoch": 1.9098349399906622, + "grad_norm": 0.8075534105300903, + "learning_rate": 2.096061550193414e-06, + "loss": 1.3713, + "mean_token_accuracy": 0.6667961577574412, + "num_tokens": 2915027062.0, + "step": 17385 + }, + { + "entropy": 1.6893903613090515, + "epoch": 1.9099447969020351, + "grad_norm": 0.7893344759941101, + "learning_rate": 2.0958283813682538e-06, + "loss": 1.3099, + "mean_token_accuracy": 0.668051486214002, + "num_tokens": 2915178403.0, + "step": 17386 + }, + { + "entropy": 1.6581451892852783, + "epoch": 1.910054653813408, + "grad_norm": 0.6381314396858215, + "learning_rate": 2.095595494356113e-06, + "loss": 1.3248, + "mean_token_accuracy": 0.6689875473578771, + "num_tokens": 2915377301.0, + "step": 17387 + }, + { + "entropy": 1.7033253610134125, + "epoch": 1.910164510724781, + "grad_norm": 0.7965177893638611, + "learning_rate": 2.0953628891643645e-06, + "loss": 1.3608, + "mean_token_accuracy": 0.6760942687590917, + "num_tokens": 2915496356.0, + "step": 17388 + }, + { + "entropy": 1.715836187203725, + "epoch": 1.9102743676361538, + "grad_norm": 0.7303998470306396, + "learning_rate": 2.0951305658003655e-06, + "loss": 1.4864, + "mean_token_accuracy": 0.6432332595189413, + "num_tokens": 2915680397.0, + "step": 17389 + }, + { + "entropy": 1.6246002614498138, + "epoch": 1.9103842245475269, + "grad_norm": 0.5866871476173401, + "learning_rate": 2.094898524271473e-06, + "loss": 1.3568, + "mean_token_accuracy": 0.6729725748300552, + "num_tokens": 2915856875.0, + "step": 17390 + }, + { + "entropy": 1.6950217187404633, + "epoch": 1.9104940814588998, + "grad_norm": 0.604824423789978, + "learning_rate": 2.094666764585028e-06, + "loss": 1.5893, + "mean_token_accuracy": 0.6251017103592554, + "num_tokens": 2916071725.0, + "step": 17391 + }, + { + "entropy": 1.6907562216122944, + "epoch": 1.9106039383702726, + "grad_norm": 0.6250636577606201, + "learning_rate": 2.0944352867483685e-06, + "loss": 1.3843, + "mean_token_accuracy": 0.6660982569058737, + "num_tokens": 2916229565.0, + "step": 17392 + }, + { + "entropy": 1.6932495137055714, + "epoch": 1.9107137952816458, + "grad_norm": 0.7064518928527832, + "learning_rate": 2.0942040907688184e-06, + "loss": 1.3489, + "mean_token_accuracy": 0.6507859379053116, + "num_tokens": 2916382240.0, + "step": 17393 + }, + { + "entropy": 1.7231053411960602, + "epoch": 1.9108236521930186, + "grad_norm": 0.6741702556610107, + "learning_rate": 2.0939731766536963e-06, + "loss": 1.5839, + "mean_token_accuracy": 0.6615618417660395, + "num_tokens": 2916544920.0, + "step": 17394 + }, + { + "entropy": 1.7370639046033223, + "epoch": 1.9109335091043915, + "grad_norm": 0.6713312268257141, + "learning_rate": 2.0937425444103105e-06, + "loss": 1.3087, + "mean_token_accuracy": 0.6731932461261749, + "num_tokens": 2916669826.0, + "step": 17395 + }, + { + "entropy": 1.7642224729061127, + "epoch": 1.9110433660157646, + "grad_norm": 0.7128596305847168, + "learning_rate": 2.0935121940459595e-06, + "loss": 1.4046, + "mean_token_accuracy": 0.6427824894587199, + "num_tokens": 2916817048.0, + "step": 17396 + }, + { + "entropy": 1.6907521684964497, + "epoch": 1.9111532229271373, + "grad_norm": 0.7318049073219299, + "learning_rate": 2.0932821255679337e-06, + "loss": 1.3763, + "mean_token_accuracy": 0.6755510369936625, + "num_tokens": 2916967274.0, + "step": 17397 + }, + { + "entropy": 1.6487935086091359, + "epoch": 1.9112630798385104, + "grad_norm": 0.7536101341247559, + "learning_rate": 2.0930523389835154e-06, + "loss": 1.584, + "mean_token_accuracy": 0.6465408056974411, + "num_tokens": 2917128574.0, + "step": 17398 + }, + { + "entropy": 1.740364799896876, + "epoch": 1.9113729367498833, + "grad_norm": 0.6966463923454285, + "learning_rate": 2.0928228342999764e-06, + "loss": 1.4856, + "mean_token_accuracy": 0.6480376496911049, + "num_tokens": 2917283340.0, + "step": 17399 + }, + { + "entropy": 1.6265077789624531, + "epoch": 1.9114827936612562, + "grad_norm": 0.6639081835746765, + "learning_rate": 2.092593611524582e-06, + "loss": 1.4039, + "mean_token_accuracy": 0.6535753359397253, + "num_tokens": 2917514395.0, + "step": 17400 + }, + { + "entropy": 1.768510530392329, + "epoch": 1.9115926505726293, + "grad_norm": 0.6785690188407898, + "learning_rate": 2.092364670664586e-06, + "loss": 1.4136, + "mean_token_accuracy": 0.6424920608599981, + "num_tokens": 2917680252.0, + "step": 17401 + }, + { + "entropy": 1.6863116323947906, + "epoch": 1.911702507484002, + "grad_norm": 0.6685127019882202, + "learning_rate": 2.0921360117272334e-06, + "loss": 1.4231, + "mean_token_accuracy": 0.6529277910788854, + "num_tokens": 2917812834.0, + "step": 17402 + }, + { + "entropy": 1.6903614699840546, + "epoch": 1.911812364395375, + "grad_norm": 0.8102334141731262, + "learning_rate": 2.0919076347197622e-06, + "loss": 1.3383, + "mean_token_accuracy": 0.6616157094637553, + "num_tokens": 2917957581.0, + "step": 17403 + }, + { + "entropy": 1.7386558850606282, + "epoch": 1.911922221306748, + "grad_norm": 0.7050533294677734, + "learning_rate": 2.091679539649401e-06, + "loss": 1.4781, + "mean_token_accuracy": 0.6581172744433085, + "num_tokens": 2918097568.0, + "step": 17404 + }, + { + "entropy": 1.692396640777588, + "epoch": 1.9120320782181208, + "grad_norm": 0.8380081057548523, + "learning_rate": 2.091451726523368e-06, + "loss": 1.4178, + "mean_token_accuracy": 0.6531778971354166, + "num_tokens": 2918232997.0, + "step": 17405 + }, + { + "entropy": 1.774149735768636, + "epoch": 1.912141935129494, + "grad_norm": 0.7412464618682861, + "learning_rate": 2.0912241953488736e-06, + "loss": 1.4631, + "mean_token_accuracy": 0.6348550717035929, + "num_tokens": 2918390827.0, + "step": 17406 + }, + { + "entropy": 1.7355511287848155, + "epoch": 1.9122517920408668, + "grad_norm": 0.6477778553962708, + "learning_rate": 2.0909969461331185e-06, + "loss": 1.4172, + "mean_token_accuracy": 0.6531795511643091, + "num_tokens": 2918560623.0, + "step": 17407 + }, + { + "entropy": 1.6789377927780151, + "epoch": 1.9123616489522397, + "grad_norm": 0.6781573295593262, + "learning_rate": 2.0907699788832962e-06, + "loss": 1.4929, + "mean_token_accuracy": 0.6445153504610062, + "num_tokens": 2918738819.0, + "step": 17408 + }, + { + "entropy": 1.6440903345743816, + "epoch": 1.9124715058636128, + "grad_norm": 0.6116029620170593, + "learning_rate": 2.0905432936065895e-06, + "loss": 1.3995, + "mean_token_accuracy": 0.6445668091376623, + "num_tokens": 2918932198.0, + "step": 17409 + }, + { + "entropy": 1.72769961754481, + "epoch": 1.9125813627749855, + "grad_norm": 0.749138355255127, + "learning_rate": 2.090316890310172e-06, + "loss": 1.3863, + "mean_token_accuracy": 0.6565118928750356, + "num_tokens": 2919077877.0, + "step": 17410 + }, + { + "entropy": 1.7584010362625122, + "epoch": 1.9126912196863586, + "grad_norm": 0.6772981286048889, + "learning_rate": 2.0900907690012095e-06, + "loss": 1.2869, + "mean_token_accuracy": 0.6611250092585882, + "num_tokens": 2919201070.0, + "step": 17411 + }, + { + "entropy": 1.669664631287257, + "epoch": 1.9128010765977315, + "grad_norm": 0.7344382405281067, + "learning_rate": 2.089864929686861e-06, + "loss": 1.4473, + "mean_token_accuracy": 0.6602032780647278, + "num_tokens": 2919334440.0, + "step": 17412 + }, + { + "entropy": 1.6799374123414357, + "epoch": 1.9129109335091043, + "grad_norm": 0.5712904334068298, + "learning_rate": 2.0896393723742725e-06, + "loss": 1.3804, + "mean_token_accuracy": 0.6711891492207845, + "num_tokens": 2919517894.0, + "step": 17413 + }, + { + "entropy": 1.6531173884868622, + "epoch": 1.9130207904204775, + "grad_norm": 0.6494702696800232, + "learning_rate": 2.089414097070581e-06, + "loss": 1.2276, + "mean_token_accuracy": 0.6815090030431747, + "num_tokens": 2919684705.0, + "step": 17414 + }, + { + "entropy": 1.6996674636999767, + "epoch": 1.9131306473318501, + "grad_norm": 0.6836813688278198, + "learning_rate": 2.0891891037829204e-06, + "loss": 1.4035, + "mean_token_accuracy": 0.6582774519920349, + "num_tokens": 2919839562.0, + "step": 17415 + }, + { + "entropy": 1.6734414498011272, + "epoch": 1.9132405042432232, + "grad_norm": 0.804311215877533, + "learning_rate": 2.0889643925184073e-06, + "loss": 1.3143, + "mean_token_accuracy": 0.6627353529135386, + "num_tokens": 2919960685.0, + "step": 17416 + }, + { + "entropy": 1.669149398803711, + "epoch": 1.9133503611545961, + "grad_norm": 0.6251264214515686, + "learning_rate": 2.0887399632841578e-06, + "loss": 1.4457, + "mean_token_accuracy": 0.6435550649960836, + "num_tokens": 2920181248.0, + "step": 17417 + }, + { + "entropy": 1.7074171503384907, + "epoch": 1.913460218065969, + "grad_norm": 0.6743486523628235, + "learning_rate": 2.0885158160872717e-06, + "loss": 1.4268, + "mean_token_accuracy": 0.654527614514033, + "num_tokens": 2920389161.0, + "step": 17418 + }, + { + "entropy": 1.7007141311963399, + "epoch": 1.913570074977342, + "grad_norm": 0.6802394986152649, + "learning_rate": 2.088291950934844e-06, + "loss": 1.4994, + "mean_token_accuracy": 0.634449248512586, + "num_tokens": 2920611442.0, + "step": 17419 + }, + { + "entropy": 1.7454398373762767, + "epoch": 1.913679931888715, + "grad_norm": 0.7798505425453186, + "learning_rate": 2.088068367833961e-06, + "loss": 1.3725, + "mean_token_accuracy": 0.6632242302099863, + "num_tokens": 2920737237.0, + "step": 17420 + }, + { + "entropy": 1.7217259307702382, + "epoch": 1.9137897888000879, + "grad_norm": 0.6991998553276062, + "learning_rate": 2.0878450667916983e-06, + "loss": 1.4191, + "mean_token_accuracy": 0.642817402879397, + "num_tokens": 2920907848.0, + "step": 17421 + }, + { + "entropy": 1.6675984263420105, + "epoch": 1.913899645711461, + "grad_norm": 0.7746340036392212, + "learning_rate": 2.0876220478151233e-06, + "loss": 1.337, + "mean_token_accuracy": 0.6696319133043289, + "num_tokens": 2921027405.0, + "step": 17422 + }, + { + "entropy": 1.6807717482248943, + "epoch": 1.9140095026228336, + "grad_norm": 0.6704512238502502, + "learning_rate": 2.0873993109112943e-06, + "loss": 1.2705, + "mean_token_accuracy": 0.6693969368934631, + "num_tokens": 2921204488.0, + "step": 17423 + }, + { + "entropy": 1.6527533928553264, + "epoch": 1.9141193595342068, + "grad_norm": 0.7245538830757141, + "learning_rate": 2.087176856087261e-06, + "loss": 1.3807, + "mean_token_accuracy": 0.671676109234492, + "num_tokens": 2921333040.0, + "step": 17424 + }, + { + "entropy": 1.6895228326320648, + "epoch": 1.9142292164455796, + "grad_norm": 0.5984308123588562, + "learning_rate": 2.086954683350064e-06, + "loss": 1.3878, + "mean_token_accuracy": 0.6493009428183237, + "num_tokens": 2921582051.0, + "step": 17425 + }, + { + "entropy": 1.7049407164255779, + "epoch": 1.9143390733569525, + "grad_norm": 0.7265485525131226, + "learning_rate": 2.086732792706735e-06, + "loss": 1.326, + "mean_token_accuracy": 0.6612226913372675, + "num_tokens": 2921738662.0, + "step": 17426 + }, + { + "entropy": 1.7597126563390095, + "epoch": 1.9144489302683256, + "grad_norm": 0.8590186238288879, + "learning_rate": 2.086511184164297e-06, + "loss": 1.5987, + "mean_token_accuracy": 0.6419850587844849, + "num_tokens": 2921942676.0, + "step": 17427 + }, + { + "entropy": 1.7027497589588165, + "epoch": 1.9145587871796983, + "grad_norm": 0.6106962561607361, + "learning_rate": 2.0862898577297636e-06, + "loss": 1.3344, + "mean_token_accuracy": 0.6656178931395212, + "num_tokens": 2922083937.0, + "step": 17428 + }, + { + "entropy": 1.7116191983222961, + "epoch": 1.9146686440910714, + "grad_norm": 0.7605423927307129, + "learning_rate": 2.0860688134101394e-06, + "loss": 1.2908, + "mean_token_accuracy": 0.6651194790999094, + "num_tokens": 2922221769.0, + "step": 17429 + }, + { + "entropy": 1.7398067712783813, + "epoch": 1.9147785010024443, + "grad_norm": 0.6844810247421265, + "learning_rate": 2.0858480512124205e-06, + "loss": 1.3323, + "mean_token_accuracy": 0.6669246157010397, + "num_tokens": 2922352941.0, + "step": 17430 + }, + { + "entropy": 1.6831025381882985, + "epoch": 1.9148883579138172, + "grad_norm": 0.6590069532394409, + "learning_rate": 2.0856275711435934e-06, + "loss": 1.4787, + "mean_token_accuracy": 0.6421498209238052, + "num_tokens": 2922541900.0, + "step": 17431 + }, + { + "entropy": 1.7204078237215679, + "epoch": 1.9149982148251903, + "grad_norm": 0.7026862502098083, + "learning_rate": 2.085407373210637e-06, + "loss": 1.3757, + "mean_token_accuracy": 0.645499716202418, + "num_tokens": 2922689747.0, + "step": 17432 + }, + { + "entropy": 1.704828808705012, + "epoch": 1.9151080717365632, + "grad_norm": 0.7054374814033508, + "learning_rate": 2.0851874574205206e-06, + "loss": 1.4807, + "mean_token_accuracy": 0.6405527790387472, + "num_tokens": 2922909076.0, + "step": 17433 + }, + { + "entropy": 1.7403566241264343, + "epoch": 1.915217928647936, + "grad_norm": 0.6240404844284058, + "learning_rate": 2.084967823780204e-06, + "loss": 1.3858, + "mean_token_accuracy": 0.6522353092829386, + "num_tokens": 2923086459.0, + "step": 17434 + }, + { + "entropy": 1.6699285606543224, + "epoch": 1.9153277855593092, + "grad_norm": 0.6833006739616394, + "learning_rate": 2.0847484722966383e-06, + "loss": 1.3314, + "mean_token_accuracy": 0.6654851237932841, + "num_tokens": 2923264919.0, + "step": 17435 + }, + { + "entropy": 1.7221463322639465, + "epoch": 1.9154376424706818, + "grad_norm": 0.6218920946121216, + "learning_rate": 2.0845294029767665e-06, + "loss": 1.4466, + "mean_token_accuracy": 0.6374649703502655, + "num_tokens": 2923481855.0, + "step": 17436 + }, + { + "entropy": 1.7145523428916931, + "epoch": 1.915547499382055, + "grad_norm": 0.8189231157302856, + "learning_rate": 2.084310615827522e-06, + "loss": 1.7038, + "mean_token_accuracy": 0.6330908884604772, + "num_tokens": 2923683139.0, + "step": 17437 + }, + { + "entropy": 1.7109653453032176, + "epoch": 1.9156573562934278, + "grad_norm": 0.6383715271949768, + "learning_rate": 2.0840921108558277e-06, + "loss": 1.2914, + "mean_token_accuracy": 0.664964367945989, + "num_tokens": 2923815046.0, + "step": 17438 + }, + { + "entropy": 1.6905694603919983, + "epoch": 1.9157672132048007, + "grad_norm": 0.8235413432121277, + "learning_rate": 2.0838738880686023e-06, + "loss": 1.3632, + "mean_token_accuracy": 0.6752482801675797, + "num_tokens": 2923945485.0, + "step": 17439 + }, + { + "entropy": 1.6911144355932872, + "epoch": 1.9158770701161738, + "grad_norm": 0.794511616230011, + "learning_rate": 2.083655947472749e-06, + "loss": 1.3981, + "mean_token_accuracy": 0.6572174479564031, + "num_tokens": 2924111825.0, + "step": 17440 + }, + { + "entropy": 1.6802450319131215, + "epoch": 1.9159869270275465, + "grad_norm": 0.6762140989303589, + "learning_rate": 2.0834382890751675e-06, + "loss": 1.395, + "mean_token_accuracy": 0.6544107298056284, + "num_tokens": 2924227529.0, + "step": 17441 + }, + { + "entropy": 1.7163665493329365, + "epoch": 1.9160967839389196, + "grad_norm": 0.7602096199989319, + "learning_rate": 2.0832209128827475e-06, + "loss": 1.4227, + "mean_token_accuracy": 0.6661782662073771, + "num_tokens": 2924395248.0, + "step": 17442 + }, + { + "entropy": 1.6846363445123036, + "epoch": 1.9162066408502925, + "grad_norm": 0.6908214092254639, + "learning_rate": 2.0830038189023657e-06, + "loss": 1.6045, + "mean_token_accuracy": 0.637446328997612, + "num_tokens": 2924571785.0, + "step": 17443 + }, + { + "entropy": 1.6963723401228588, + "epoch": 1.9163164977616653, + "grad_norm": 0.7088649868965149, + "learning_rate": 2.0827870071408965e-06, + "loss": 1.2535, + "mean_token_accuracy": 0.678161750237147, + "num_tokens": 2924703162.0, + "step": 17444 + }, + { + "entropy": 1.6324211259682972, + "epoch": 1.9164263546730385, + "grad_norm": 0.6246728301048279, + "learning_rate": 2.0825704776052e-06, + "loss": 1.3837, + "mean_token_accuracy": 0.66278408964475, + "num_tokens": 2924980915.0, + "step": 17445 + }, + { + "entropy": 1.7229852279027302, + "epoch": 1.9165362115844113, + "grad_norm": 0.754826009273529, + "learning_rate": 2.082354230302129e-06, + "loss": 1.3334, + "mean_token_accuracy": 0.6602563957373301, + "num_tokens": 2925088795.0, + "step": 17446 + }, + { + "entropy": 1.6306418975194295, + "epoch": 1.9166460684957842, + "grad_norm": 0.6160972714424133, + "learning_rate": 2.0821382652385284e-06, + "loss": 1.4224, + "mean_token_accuracy": 0.6546281178792318, + "num_tokens": 2925322433.0, + "step": 17447 + }, + { + "entropy": 1.6297888457775116, + "epoch": 1.9167559254071573, + "grad_norm": 0.6668198704719543, + "learning_rate": 2.081922582421233e-06, + "loss": 1.4147, + "mean_token_accuracy": 0.6672234535217285, + "num_tokens": 2925496843.0, + "step": 17448 + }, + { + "entropy": 1.7703557411829631, + "epoch": 1.91686578231853, + "grad_norm": 0.7170999050140381, + "learning_rate": 2.081707181857071e-06, + "loss": 1.3329, + "mean_token_accuracy": 0.6564952532450358, + "num_tokens": 2925647583.0, + "step": 17449 + }, + { + "entropy": 1.7169082860151927, + "epoch": 1.916975639229903, + "grad_norm": 0.6741853356361389, + "learning_rate": 2.0814920635528563e-06, + "loss": 1.2811, + "mean_token_accuracy": 0.6681034664312998, + "num_tokens": 2925770751.0, + "step": 17450 + }, + { + "entropy": 1.6565737128257751, + "epoch": 1.917085496141276, + "grad_norm": 0.6688545346260071, + "learning_rate": 2.081277227515399e-06, + "loss": 1.2302, + "mean_token_accuracy": 0.6802611798048019, + "num_tokens": 2925890003.0, + "step": 17451 + }, + { + "entropy": 1.687977929910024, + "epoch": 1.9171953530526489, + "grad_norm": 0.7067691683769226, + "learning_rate": 2.081062673751499e-06, + "loss": 1.2822, + "mean_token_accuracy": 0.6621923645337423, + "num_tokens": 2926063066.0, + "step": 17452 + }, + { + "entropy": 1.6485174397627513, + "epoch": 1.917305209964022, + "grad_norm": 0.6077693104743958, + "learning_rate": 2.0808484022679467e-06, + "loss": 1.459, + "mean_token_accuracy": 0.6410925338665644, + "num_tokens": 2926254114.0, + "step": 17453 + }, + { + "entropy": 1.6806750198205311, + "epoch": 1.9174150668753946, + "grad_norm": 0.6232016682624817, + "learning_rate": 2.0806344130715233e-06, + "loss": 1.5307, + "mean_token_accuracy": 0.6372226725021998, + "num_tokens": 2926443709.0, + "step": 17454 + }, + { + "entropy": 1.6603333155314128, + "epoch": 1.9175249237867678, + "grad_norm": 0.7298293709754944, + "learning_rate": 2.080420706169001e-06, + "loss": 1.3589, + "mean_token_accuracy": 0.6681396961212158, + "num_tokens": 2926597936.0, + "step": 17455 + }, + { + "entropy": 1.82164399822553, + "epoch": 1.9176347806981406, + "grad_norm": 0.6539393663406372, + "learning_rate": 2.080207281567144e-06, + "loss": 1.6118, + "mean_token_accuracy": 0.6109184970458349, + "num_tokens": 2926871518.0, + "step": 17456 + }, + { + "entropy": 1.6787754893302917, + "epoch": 1.9177446376095135, + "grad_norm": 0.6502023339271545, + "learning_rate": 2.079994139272708e-06, + "loss": 1.3738, + "mean_token_accuracy": 0.6680939892927805, + "num_tokens": 2927064929.0, + "step": 17457 + }, + { + "entropy": 1.6514959534009297, + "epoch": 1.9178544945208866, + "grad_norm": 0.7282174825668335, + "learning_rate": 2.0797812792924372e-06, + "loss": 1.3208, + "mean_token_accuracy": 0.6738225072622299, + "num_tokens": 2927212785.0, + "step": 17458 + }, + { + "entropy": 1.6412135660648346, + "epoch": 1.9179643514322595, + "grad_norm": 0.6706090569496155, + "learning_rate": 2.079568701633071e-06, + "loss": 1.2853, + "mean_token_accuracy": 0.6833283007144928, + "num_tokens": 2927400983.0, + "step": 17459 + }, + { + "entropy": 1.6410308082898457, + "epoch": 1.9180742083436324, + "grad_norm": 0.7416619658470154, + "learning_rate": 2.0793564063013337e-06, + "loss": 1.41, + "mean_token_accuracy": 0.6614979207515717, + "num_tokens": 2927587509.0, + "step": 17460 + }, + { + "entropy": 1.6796442766984303, + "epoch": 1.9181840652550055, + "grad_norm": 0.613105058670044, + "learning_rate": 2.0791443933039477e-06, + "loss": 1.3794, + "mean_token_accuracy": 0.6581423729658127, + "num_tokens": 2927770429.0, + "step": 17461 + }, + { + "entropy": 1.6736660699049632, + "epoch": 1.9182939221663782, + "grad_norm": 0.6586642861366272, + "learning_rate": 2.0789326626476213e-06, + "loss": 1.3009, + "mean_token_accuracy": 0.6772599170605341, + "num_tokens": 2927929359.0, + "step": 17462 + }, + { + "entropy": 1.7245589395364125, + "epoch": 1.9184037790777513, + "grad_norm": 0.7125158905982971, + "learning_rate": 2.078721214339057e-06, + "loss": 1.2919, + "mean_token_accuracy": 0.6627042591571808, + "num_tokens": 2928041715.0, + "step": 17463 + }, + { + "entropy": 1.6420990029970806, + "epoch": 1.9185136359891242, + "grad_norm": 0.6651259660720825, + "learning_rate": 2.078510048384944e-06, + "loss": 1.3234, + "mean_token_accuracy": 0.6715792467196783, + "num_tokens": 2928211841.0, + "step": 17464 + }, + { + "entropy": 1.680494636297226, + "epoch": 1.918623492900497, + "grad_norm": 0.5542195439338684, + "learning_rate": 2.0782991647919707e-06, + "loss": 1.1542, + "mean_token_accuracy": 0.6779807110627493, + "num_tokens": 2928375456.0, + "step": 17465 + }, + { + "entropy": 1.6585955023765564, + "epoch": 1.9187333498118702, + "grad_norm": 0.6564247012138367, + "learning_rate": 2.0780885635668067e-06, + "loss": 1.3989, + "mean_token_accuracy": 0.6611930181582769, + "num_tokens": 2928528830.0, + "step": 17466 + }, + { + "entropy": 1.6445672412713368, + "epoch": 1.9188432067232428, + "grad_norm": 0.6655113101005554, + "learning_rate": 2.0778782447161197e-06, + "loss": 1.4603, + "mean_token_accuracy": 0.6606913854678472, + "num_tokens": 2928741569.0, + "step": 17467 + }, + { + "entropy": 1.7159535090128581, + "epoch": 1.918953063634616, + "grad_norm": 0.7392234206199646, + "learning_rate": 2.077668208246567e-06, + "loss": 1.3667, + "mean_token_accuracy": 0.6548206061124802, + "num_tokens": 2928905957.0, + "step": 17468 + }, + { + "entropy": 1.7086364229520161, + "epoch": 1.9190629205459888, + "grad_norm": 0.7043489813804626, + "learning_rate": 2.0774584541647944e-06, + "loss": 1.229, + "mean_token_accuracy": 0.6727901895840963, + "num_tokens": 2928997788.0, + "step": 17469 + }, + { + "entropy": 1.6513133843739827, + "epoch": 1.9191727774573617, + "grad_norm": 0.676826536655426, + "learning_rate": 2.0772489824774392e-06, + "loss": 1.437, + "mean_token_accuracy": 0.6636106073856354, + "num_tokens": 2929237876.0, + "step": 17470 + }, + { + "entropy": 1.714188575744629, + "epoch": 1.9192826343687348, + "grad_norm": 0.7203949689865112, + "learning_rate": 2.0770397931911355e-06, + "loss": 1.3942, + "mean_token_accuracy": 0.662879596153895, + "num_tokens": 2929369246.0, + "step": 17471 + }, + { + "entropy": 1.7575792769591014, + "epoch": 1.9193924912801077, + "grad_norm": 0.6798261404037476, + "learning_rate": 2.0768308863125003e-06, + "loss": 1.3794, + "mean_token_accuracy": 0.6597266445557276, + "num_tokens": 2929534671.0, + "step": 17472 + }, + { + "entropy": 1.6687467495600383, + "epoch": 1.9195023481914806, + "grad_norm": 0.619688868522644, + "learning_rate": 2.0766222618481476e-06, + "loss": 1.4953, + "mean_token_accuracy": 0.6502044051885605, + "num_tokens": 2929715709.0, + "step": 17473 + }, + { + "entropy": 1.714370201031367, + "epoch": 1.9196122051028537, + "grad_norm": 0.7117909789085388, + "learning_rate": 2.076413919804679e-06, + "loss": 1.4366, + "mean_token_accuracy": 0.6580507506926855, + "num_tokens": 2929870289.0, + "step": 17474 + }, + { + "entropy": 1.696961522102356, + "epoch": 1.9197220620142263, + "grad_norm": 0.7254346013069153, + "learning_rate": 2.0762058601886882e-06, + "loss": 1.3134, + "mean_token_accuracy": 0.6692439218362173, + "num_tokens": 2930053304.0, + "step": 17475 + }, + { + "entropy": 1.7915849188963573, + "epoch": 1.9198319189255995, + "grad_norm": 0.7773795127868652, + "learning_rate": 2.0759980830067615e-06, + "loss": 1.5147, + "mean_token_accuracy": 0.6480228255192438, + "num_tokens": 2930244054.0, + "step": 17476 + }, + { + "entropy": 1.7063364485899608, + "epoch": 1.9199417758369723, + "grad_norm": 0.6918734312057495, + "learning_rate": 2.0757905882654744e-06, + "loss": 1.355, + "mean_token_accuracy": 0.6601203779379526, + "num_tokens": 2930396665.0, + "step": 17477 + }, + { + "entropy": 1.7182010610898335, + "epoch": 1.9200516327483452, + "grad_norm": 0.7387691140174866, + "learning_rate": 2.0755833759713935e-06, + "loss": 1.4699, + "mean_token_accuracy": 0.6380101641019186, + "num_tokens": 2930590121.0, + "step": 17478 + }, + { + "entropy": 1.7257728974024455, + "epoch": 1.9201614896597183, + "grad_norm": 0.6251614689826965, + "learning_rate": 2.075376446131076e-06, + "loss": 1.3573, + "mean_token_accuracy": 0.6537232995033264, + "num_tokens": 2930743975.0, + "step": 17479 + }, + { + "entropy": 1.75346240401268, + "epoch": 1.920271346571091, + "grad_norm": 0.8177134990692139, + "learning_rate": 2.0751697987510747e-06, + "loss": 1.3432, + "mean_token_accuracy": 0.6525897781054179, + "num_tokens": 2930879478.0, + "step": 17480 + }, + { + "entropy": 1.6926167905330658, + "epoch": 1.920381203482464, + "grad_norm": 0.6901217699050903, + "learning_rate": 2.0749634338379268e-06, + "loss": 1.3196, + "mean_token_accuracy": 0.6637932906548182, + "num_tokens": 2931011772.0, + "step": 17481 + }, + { + "entropy": 1.725586086511612, + "epoch": 1.920491060393837, + "grad_norm": 0.7865347266197205, + "learning_rate": 2.0747573513981635e-06, + "loss": 1.4176, + "mean_token_accuracy": 0.65843033293883, + "num_tokens": 2931162551.0, + "step": 17482 + }, + { + "entropy": 1.685064325730006, + "epoch": 1.9206009173052099, + "grad_norm": 0.6249054074287415, + "learning_rate": 2.0745515514383088e-06, + "loss": 1.5084, + "mean_token_accuracy": 0.6323985556761423, + "num_tokens": 2931392882.0, + "step": 17483 + }, + { + "entropy": 1.6556347211201985, + "epoch": 1.920710774216583, + "grad_norm": 0.6788010001182556, + "learning_rate": 2.0743460339648753e-06, + "loss": 1.3561, + "mean_token_accuracy": 0.6696719378232956, + "num_tokens": 2931555075.0, + "step": 17484 + }, + { + "entropy": 1.6232224702835083, + "epoch": 1.9208206311279559, + "grad_norm": 0.651375412940979, + "learning_rate": 2.074140798984369e-06, + "loss": 1.3004, + "mean_token_accuracy": 0.6754371821880341, + "num_tokens": 2931698300.0, + "step": 17485 + }, + { + "entropy": 1.796891490618388, + "epoch": 1.9209304880393288, + "grad_norm": 0.6899563074111938, + "learning_rate": 2.0739358465032837e-06, + "loss": 1.4438, + "mean_token_accuracy": 0.6399559328953425, + "num_tokens": 2931835023.0, + "step": 17486 + }, + { + "entropy": 1.6822420060634613, + "epoch": 1.9210403449507019, + "grad_norm": 1.8057386875152588, + "learning_rate": 2.0737311765281066e-06, + "loss": 1.1202, + "mean_token_accuracy": 0.6863191624482473, + "num_tokens": 2931948990.0, + "step": 17487 + }, + { + "entropy": 1.6927362382411957, + "epoch": 1.9211502018620745, + "grad_norm": 0.7195414304733276, + "learning_rate": 2.0735267890653154e-06, + "loss": 1.3814, + "mean_token_accuracy": 0.658441017071406, + "num_tokens": 2932114658.0, + "step": 17488 + }, + { + "entropy": 1.57031911611557, + "epoch": 1.9212600587734476, + "grad_norm": 0.8102070093154907, + "learning_rate": 2.0733226841213792e-06, + "loss": 1.3443, + "mean_token_accuracy": 0.6727907160917918, + "num_tokens": 2932274915.0, + "step": 17489 + }, + { + "entropy": 1.6582297484079997, + "epoch": 1.9213699156848205, + "grad_norm": 0.7032040953636169, + "learning_rate": 2.0731188617027572e-06, + "loss": 1.399, + "mean_token_accuracy": 0.66445920864741, + "num_tokens": 2932448501.0, + "step": 17490 + }, + { + "entropy": 1.6773069600264232, + "epoch": 1.9214797725961934, + "grad_norm": 0.6263803839683533, + "learning_rate": 2.072915321815901e-06, + "loss": 1.5152, + "mean_token_accuracy": 0.6362102230389913, + "num_tokens": 2932722036.0, + "step": 17491 + }, + { + "entropy": 1.6817654371261597, + "epoch": 1.9215896295075665, + "grad_norm": 0.6429359912872314, + "learning_rate": 2.072712064467252e-06, + "loss": 1.5055, + "mean_token_accuracy": 0.6553362160921097, + "num_tokens": 2932910588.0, + "step": 17492 + }, + { + "entropy": 1.7349775632222493, + "epoch": 1.9216994864189392, + "grad_norm": 0.6507477164268494, + "learning_rate": 2.0725090896632436e-06, + "loss": 1.3682, + "mean_token_accuracy": 0.6527557075023651, + "num_tokens": 2933061736.0, + "step": 17493 + }, + { + "entropy": 1.6733063260714214, + "epoch": 1.9218093433303123, + "grad_norm": 0.7544586658477783, + "learning_rate": 2.0723063974102996e-06, + "loss": 1.4147, + "mean_token_accuracy": 0.6568095733722051, + "num_tokens": 2933234118.0, + "step": 17494 + }, + { + "entropy": 1.7533384064833324, + "epoch": 1.9219192002416852, + "grad_norm": 0.7645695209503174, + "learning_rate": 2.072103987714835e-06, + "loss": 1.3646, + "mean_token_accuracy": 0.6804790943861008, + "num_tokens": 2933344246.0, + "step": 17495 + }, + { + "entropy": 1.7520137230555217, + "epoch": 1.922029057153058, + "grad_norm": 0.8002681732177734, + "learning_rate": 2.071901860583257e-06, + "loss": 1.5351, + "mean_token_accuracy": 0.6434406936168671, + "num_tokens": 2933540169.0, + "step": 17496 + }, + { + "entropy": 1.7390229205290477, + "epoch": 1.9221389140644312, + "grad_norm": 0.7108039855957031, + "learning_rate": 2.071700016021961e-06, + "loss": 1.4703, + "mean_token_accuracy": 0.6398574312527975, + "num_tokens": 2933720785.0, + "step": 17497 + }, + { + "entropy": 1.6974186301231384, + "epoch": 1.922248770975804, + "grad_norm": 0.7148029804229736, + "learning_rate": 2.0714984540373373e-06, + "loss": 1.3062, + "mean_token_accuracy": 0.6633280366659164, + "num_tokens": 2933842641.0, + "step": 17498 + }, + { + "entropy": 1.6941316624482472, + "epoch": 1.922358627887177, + "grad_norm": 0.5549025535583496, + "learning_rate": 2.071297174635763e-06, + "loss": 1.4171, + "mean_token_accuracy": 0.6544789026180903, + "num_tokens": 2934021044.0, + "step": 17499 + }, + { + "entropy": 1.7065569758415222, + "epoch": 1.92246848479855, + "grad_norm": 0.6217483878135681, + "learning_rate": 2.071096177823611e-06, + "loss": 1.4027, + "mean_token_accuracy": 0.6564749876658121, + "num_tokens": 2934255736.0, + "step": 17500 + }, + { + "entropy": 1.6603321035703023, + "epoch": 1.9225783417099227, + "grad_norm": 0.734339714050293, + "learning_rate": 2.070895463607242e-06, + "loss": 1.3017, + "mean_token_accuracy": 0.6692576507727305, + "num_tokens": 2934400859.0, + "step": 17501 + }, + { + "entropy": 1.704683502515157, + "epoch": 1.9226881986212958, + "grad_norm": 0.8344931602478027, + "learning_rate": 2.070695031993006e-06, + "loss": 1.5092, + "mean_token_accuracy": 0.6664090702931086, + "num_tokens": 2934582181.0, + "step": 17502 + }, + { + "entropy": 1.7220360140005748, + "epoch": 1.9227980555326687, + "grad_norm": 0.7285640835762024, + "learning_rate": 2.070494882987249e-06, + "loss": 1.4179, + "mean_token_accuracy": 0.6631739139556885, + "num_tokens": 2934740982.0, + "step": 17503 + }, + { + "entropy": 1.6735987563927968, + "epoch": 1.9229079124440416, + "grad_norm": 0.6303743124008179, + "learning_rate": 2.0702950165963066e-06, + "loss": 1.3832, + "mean_token_accuracy": 0.6726168394088745, + "num_tokens": 2934932553.0, + "step": 17504 + }, + { + "entropy": 1.6929684579372406, + "epoch": 1.9230177693554147, + "grad_norm": 0.7466548681259155, + "learning_rate": 2.0700954328265024e-06, + "loss": 1.266, + "mean_token_accuracy": 0.6844168156385422, + "num_tokens": 2935095166.0, + "step": 17505 + }, + { + "entropy": 1.736617624759674, + "epoch": 1.9231276262667873, + "grad_norm": 0.7594635486602783, + "learning_rate": 2.069896131684154e-06, + "loss": 1.3552, + "mean_token_accuracy": 0.6446435898542404, + "num_tokens": 2935284625.0, + "step": 17506 + }, + { + "entropy": 1.6730614403883617, + "epoch": 1.9232374831781605, + "grad_norm": 0.6581339836120605, + "learning_rate": 2.069697113175569e-06, + "loss": 1.3928, + "mean_token_accuracy": 0.663309171795845, + "num_tokens": 2935444299.0, + "step": 17507 + }, + { + "entropy": 1.7582378685474396, + "epoch": 1.9233473400895333, + "grad_norm": 0.8001452088356018, + "learning_rate": 2.069498377307045e-06, + "loss": 1.4882, + "mean_token_accuracy": 0.6397890994946162, + "num_tokens": 2935579811.0, + "step": 17508 + }, + { + "entropy": 1.739981472492218, + "epoch": 1.9234571970009062, + "grad_norm": 0.721105694770813, + "learning_rate": 2.0692999240848744e-06, + "loss": 1.3548, + "mean_token_accuracy": 0.6565016210079193, + "num_tokens": 2935794799.0, + "step": 17509 + }, + { + "entropy": 1.7093225916226704, + "epoch": 1.9235670539122793, + "grad_norm": 0.6602572798728943, + "learning_rate": 2.0691017535153375e-06, + "loss": 1.4042, + "mean_token_accuracy": 0.648267442981402, + "num_tokens": 2935960630.0, + "step": 17510 + }, + { + "entropy": 1.6835937400658925, + "epoch": 1.9236769108236522, + "grad_norm": 0.6169915199279785, + "learning_rate": 2.0689038656047046e-06, + "loss": 1.4336, + "mean_token_accuracy": 0.652891164024671, + "num_tokens": 2936153886.0, + "step": 17511 + }, + { + "entropy": 1.691537966330846, + "epoch": 1.923786767735025, + "grad_norm": 0.6025387644767761, + "learning_rate": 2.0687062603592407e-06, + "loss": 1.3336, + "mean_token_accuracy": 0.662538543343544, + "num_tokens": 2936319156.0, + "step": 17512 + }, + { + "entropy": 1.68320166071256, + "epoch": 1.9238966246463982, + "grad_norm": 0.6816161274909973, + "learning_rate": 2.068508937785198e-06, + "loss": 1.3839, + "mean_token_accuracy": 0.6539181371529897, + "num_tokens": 2936475278.0, + "step": 17513 + }, + { + "entropy": 1.7054178714752197, + "epoch": 1.9240064815577709, + "grad_norm": 0.6919237375259399, + "learning_rate": 2.0683118978888243e-06, + "loss": 1.313, + "mean_token_accuracy": 0.662996177872022, + "num_tokens": 2936649177.0, + "step": 17514 + }, + { + "entropy": 1.6796445548534393, + "epoch": 1.924116338469144, + "grad_norm": 0.9301538467407227, + "learning_rate": 2.0681151406763533e-06, + "loss": 1.3744, + "mean_token_accuracy": 0.6774726808071136, + "num_tokens": 2936795606.0, + "step": 17515 + }, + { + "entropy": 1.746860404809316, + "epoch": 1.9242261953805169, + "grad_norm": 0.7453663349151611, + "learning_rate": 2.067918666154014e-06, + "loss": 1.337, + "mean_token_accuracy": 0.6566027700901031, + "num_tokens": 2936929561.0, + "step": 17516 + }, + { + "entropy": 1.6719463368256886, + "epoch": 1.9243360522918898, + "grad_norm": 0.695083498954773, + "learning_rate": 2.067722474328024e-06, + "loss": 1.3896, + "mean_token_accuracy": 0.668315589427948, + "num_tokens": 2937090524.0, + "step": 17517 + }, + { + "entropy": 1.7075058122475941, + "epoch": 1.9244459092032629, + "grad_norm": 0.6535371541976929, + "learning_rate": 2.067526565204592e-06, + "loss": 1.3489, + "mean_token_accuracy": 0.6604385624329249, + "num_tokens": 2937238065.0, + "step": 17518 + }, + { + "entropy": 1.8017792999744415, + "epoch": 1.9245557661146355, + "grad_norm": 0.723039448261261, + "learning_rate": 2.0673309387899187e-06, + "loss": 1.4437, + "mean_token_accuracy": 0.6451696256796519, + "num_tokens": 2937417366.0, + "step": 17519 + }, + { + "entropy": 1.67741854985555, + "epoch": 1.9246656230260086, + "grad_norm": 0.7515210509300232, + "learning_rate": 2.067135595090197e-06, + "loss": 1.3528, + "mean_token_accuracy": 0.6660284996032715, + "num_tokens": 2937572016.0, + "step": 17520 + }, + { + "entropy": 1.6874766151110332, + "epoch": 1.9247754799373815, + "grad_norm": 0.6784387230873108, + "learning_rate": 2.0669405341116092e-06, + "loss": 1.3964, + "mean_token_accuracy": 0.6592771311601003, + "num_tokens": 2937720468.0, + "step": 17521 + }, + { + "entropy": 1.6902848482131958, + "epoch": 1.9248853368487544, + "grad_norm": 0.5754929184913635, + "learning_rate": 2.0667457558603264e-06, + "loss": 1.4538, + "mean_token_accuracy": 0.6473627537488937, + "num_tokens": 2937920848.0, + "step": 17522 + }, + { + "entropy": 1.6405058304468791, + "epoch": 1.9249951937601275, + "grad_norm": 0.565492570400238, + "learning_rate": 2.066551260342516e-06, + "loss": 1.4986, + "mean_token_accuracy": 0.6333438704411188, + "num_tokens": 2938144092.0, + "step": 17523 + }, + { + "entropy": 1.7087134818236034, + "epoch": 1.9251050506715004, + "grad_norm": 0.624536395072937, + "learning_rate": 2.0663570475643323e-06, + "loss": 1.3191, + "mean_token_accuracy": 0.6491807202498118, + "num_tokens": 2938330146.0, + "step": 17524 + }, + { + "entropy": 1.6821011404196422, + "epoch": 1.9252149075828733, + "grad_norm": 0.6043448448181152, + "learning_rate": 2.066163117531923e-06, + "loss": 1.4004, + "mean_token_accuracy": 0.6572328259547552, + "num_tokens": 2938490177.0, + "step": 17525 + }, + { + "entropy": 1.697978417078654, + "epoch": 1.9253247644942464, + "grad_norm": 0.7227047085762024, + "learning_rate": 2.065969470251425e-06, + "loss": 1.3753, + "mean_token_accuracy": 0.659215713540713, + "num_tokens": 2938668497.0, + "step": 17526 + }, + { + "entropy": 1.6828550398349762, + "epoch": 1.925434621405619, + "grad_norm": 0.6420347690582275, + "learning_rate": 2.065776105728967e-06, + "loss": 1.5599, + "mean_token_accuracy": 0.6333623677492142, + "num_tokens": 2938924531.0, + "step": 17527 + }, + { + "entropy": 1.7999260822931926, + "epoch": 1.9255444783169922, + "grad_norm": 0.6861507892608643, + "learning_rate": 2.0655830239706702e-06, + "loss": 1.4962, + "mean_token_accuracy": 0.6513161609570185, + "num_tokens": 2939093357.0, + "step": 17528 + }, + { + "entropy": 1.7336215178171794, + "epoch": 1.925654335228365, + "grad_norm": 0.6525002717971802, + "learning_rate": 2.0653902249826445e-06, + "loss": 1.4093, + "mean_token_accuracy": 0.6464171608289083, + "num_tokens": 2939226820.0, + "step": 17529 + }, + { + "entropy": 1.7006452282269795, + "epoch": 1.925764192139738, + "grad_norm": 0.6565628051757812, + "learning_rate": 2.065197708770992e-06, + "loss": 1.5153, + "mean_token_accuracy": 0.6620665639638901, + "num_tokens": 2939408770.0, + "step": 17530 + }, + { + "entropy": 1.7232566873232524, + "epoch": 1.925874049051111, + "grad_norm": 0.6713189482688904, + "learning_rate": 2.065005475341805e-06, + "loss": 1.5064, + "mean_token_accuracy": 0.6384440114100774, + "num_tokens": 2939574469.0, + "step": 17531 + }, + { + "entropy": 1.6672783493995667, + "epoch": 1.9259839059624837, + "grad_norm": 0.63712477684021, + "learning_rate": 2.06481352470117e-06, + "loss": 1.4419, + "mean_token_accuracy": 0.6587218890587488, + "num_tokens": 2939782494.0, + "step": 17532 + }, + { + "entropy": 1.7159414490063984, + "epoch": 1.9260937628738568, + "grad_norm": 0.7770838141441345, + "learning_rate": 2.064621856855161e-06, + "loss": 1.4086, + "mean_token_accuracy": 0.6606285522381464, + "num_tokens": 2939922605.0, + "step": 17533 + }, + { + "entropy": 1.6527531743049622, + "epoch": 1.9262036197852297, + "grad_norm": 0.6143233776092529, + "learning_rate": 2.064430471809843e-06, + "loss": 1.2611, + "mean_token_accuracy": 0.6733989963928858, + "num_tokens": 2940107503.0, + "step": 17534 + }, + { + "entropy": 1.717937578757604, + "epoch": 1.9263134766966026, + "grad_norm": 0.7945669293403625, + "learning_rate": 2.064239369571273e-06, + "loss": 1.302, + "mean_token_accuracy": 0.6729937593142191, + "num_tokens": 2940246262.0, + "step": 17535 + }, + { + "entropy": 1.7034448285897572, + "epoch": 1.9264233336079757, + "grad_norm": 0.6210589408874512, + "learning_rate": 2.064048550145502e-06, + "loss": 1.3828, + "mean_token_accuracy": 0.6582538237174352, + "num_tokens": 2940382994.0, + "step": 17536 + }, + { + "entropy": 1.7313305934270222, + "epoch": 1.9265331905193486, + "grad_norm": 0.6636930108070374, + "learning_rate": 2.0638580135385676e-06, + "loss": 1.4957, + "mean_token_accuracy": 0.641208882133166, + "num_tokens": 2940615045.0, + "step": 17537 + }, + { + "entropy": 1.716033011674881, + "epoch": 1.9266430474307215, + "grad_norm": 0.6111452579498291, + "learning_rate": 2.0636677597565e-06, + "loss": 1.3465, + "mean_token_accuracy": 0.6583975950876871, + "num_tokens": 2940785009.0, + "step": 17538 + }, + { + "entropy": 1.7833648025989532, + "epoch": 1.9267529043420946, + "grad_norm": 0.6030783653259277, + "learning_rate": 2.0634777888053214e-06, + "loss": 1.4208, + "mean_token_accuracy": 0.6543482542037964, + "num_tokens": 2940945824.0, + "step": 17539 + }, + { + "entropy": 1.739495297273, + "epoch": 1.9268627612534672, + "grad_norm": 0.7100062966346741, + "learning_rate": 2.063288100691043e-06, + "loss": 1.4085, + "mean_token_accuracy": 0.6565575549999872, + "num_tokens": 2941091153.0, + "step": 17540 + }, + { + "entropy": 1.708542416493098, + "epoch": 1.9269726181648403, + "grad_norm": 0.6001754403114319, + "learning_rate": 2.063098695419669e-06, + "loss": 1.408, + "mean_token_accuracy": 0.6558292259772619, + "num_tokens": 2941308528.0, + "step": 17541 + }, + { + "entropy": 1.7160709202289581, + "epoch": 1.9270824750762132, + "grad_norm": 0.7005263566970825, + "learning_rate": 2.0629095729971956e-06, + "loss": 1.3614, + "mean_token_accuracy": 0.6518668631712595, + "num_tokens": 2941457810.0, + "step": 17542 + }, + { + "entropy": 1.7111981709798176, + "epoch": 1.927192331987586, + "grad_norm": 0.6936342120170593, + "learning_rate": 2.0627207334296065e-06, + "loss": 1.2869, + "mean_token_accuracy": 0.6690366715192795, + "num_tokens": 2941608696.0, + "step": 17543 + }, + { + "entropy": 1.7328161001205444, + "epoch": 1.9273021888989592, + "grad_norm": 0.6630276441574097, + "learning_rate": 2.0625321767228782e-06, + "loss": 1.3247, + "mean_token_accuracy": 0.6659030715624491, + "num_tokens": 2941744217.0, + "step": 17544 + }, + { + "entropy": 1.676056981086731, + "epoch": 1.927412045810332, + "grad_norm": 0.6332334280014038, + "learning_rate": 2.062343902882981e-06, + "loss": 1.3053, + "mean_token_accuracy": 0.6878760854403178, + "num_tokens": 2941870064.0, + "step": 17545 + }, + { + "entropy": 1.7297570407390594, + "epoch": 1.927521902721705, + "grad_norm": 0.8926072120666504, + "learning_rate": 2.0621559119158707e-06, + "loss": 1.3512, + "mean_token_accuracy": 0.6621742248535156, + "num_tokens": 2942009195.0, + "step": 17546 + }, + { + "entropy": 1.6616042951742809, + "epoch": 1.9276317596330779, + "grad_norm": 0.7044560313224792, + "learning_rate": 2.061968203827498e-06, + "loss": 1.1998, + "mean_token_accuracy": 0.6850334058205286, + "num_tokens": 2942128712.0, + "step": 17547 + }, + { + "entropy": 1.6350234846274059, + "epoch": 1.9277416165444508, + "grad_norm": 0.7069318294525146, + "learning_rate": 2.0617807786238036e-06, + "loss": 1.3889, + "mean_token_accuracy": 0.662718782822291, + "num_tokens": 2942299164.0, + "step": 17548 + }, + { + "entropy": 1.671046882867813, + "epoch": 1.9278514734558239, + "grad_norm": 0.5789463520050049, + "learning_rate": 2.061593636310722e-06, + "loss": 1.345, + "mean_token_accuracy": 0.6666370083888372, + "num_tokens": 2942465119.0, + "step": 17549 + }, + { + "entropy": 1.6353905896345775, + "epoch": 1.9279613303671967, + "grad_norm": 0.7177479267120361, + "learning_rate": 2.061406776894172e-06, + "loss": 1.3278, + "mean_token_accuracy": 0.6639865090449651, + "num_tokens": 2942607010.0, + "step": 17550 + }, + { + "entropy": 1.700663298368454, + "epoch": 1.9280711872785696, + "grad_norm": 0.6942954659461975, + "learning_rate": 2.061220200380071e-06, + "loss": 1.2797, + "mean_token_accuracy": 0.6732474565505981, + "num_tokens": 2942712681.0, + "step": 17551 + }, + { + "entropy": 1.7159783045450847, + "epoch": 1.9281810441899427, + "grad_norm": 0.6486497521400452, + "learning_rate": 2.0610339067743213e-06, + "loss": 1.4483, + "mean_token_accuracy": 0.6449883927901586, + "num_tokens": 2942933437.0, + "step": 17552 + }, + { + "entropy": 1.6851761241753895, + "epoch": 1.9282909011013154, + "grad_norm": 0.7932791113853455, + "learning_rate": 2.060847896082822e-06, + "loss": 1.2722, + "mean_token_accuracy": 0.6713943233092626, + "num_tokens": 2943051659.0, + "step": 17553 + }, + { + "entropy": 1.7198482652505238, + "epoch": 1.9284007580126885, + "grad_norm": 0.5689072012901306, + "learning_rate": 2.0606621683114583e-06, + "loss": 1.4662, + "mean_token_accuracy": 0.639824832479159, + "num_tokens": 2943252511.0, + "step": 17554 + }, + { + "entropy": 1.6842391788959503, + "epoch": 1.9285106149240614, + "grad_norm": 0.8211684823036194, + "learning_rate": 2.0604767234661086e-06, + "loss": 1.428, + "mean_token_accuracy": 0.642557273308436, + "num_tokens": 2943430385.0, + "step": 17555 + }, + { + "entropy": 1.7061157524585724, + "epoch": 1.9286204718354343, + "grad_norm": 0.7843329310417175, + "learning_rate": 2.0602915615526418e-06, + "loss": 1.4143, + "mean_token_accuracy": 0.6773807754119238, + "num_tokens": 2943557103.0, + "step": 17556 + }, + { + "entropy": 1.6724831461906433, + "epoch": 1.9287303287468074, + "grad_norm": 0.6326301693916321, + "learning_rate": 2.06010668257692e-06, + "loss": 1.2568, + "mean_token_accuracy": 0.6788825293382009, + "num_tokens": 2943675826.0, + "step": 17557 + }, + { + "entropy": 1.7057609955469768, + "epoch": 1.9288401856581803, + "grad_norm": 0.6408417820930481, + "learning_rate": 2.0599220865447924e-06, + "loss": 1.4451, + "mean_token_accuracy": 0.6486535221338272, + "num_tokens": 2943822102.0, + "step": 17558 + }, + { + "entropy": 1.7137981454531352, + "epoch": 1.9289500425695532, + "grad_norm": 0.6233062148094177, + "learning_rate": 2.059737773462102e-06, + "loss": 1.3929, + "mean_token_accuracy": 0.6523498793443044, + "num_tokens": 2943978771.0, + "step": 17559 + }, + { + "entropy": 1.7529114683469136, + "epoch": 1.929059899480926, + "grad_norm": 0.6746161580085754, + "learning_rate": 2.059553743334683e-06, + "loss": 1.4265, + "mean_token_accuracy": 0.6609595467646917, + "num_tokens": 2944172280.0, + "step": 17560 + }, + { + "entropy": 1.7099245886007945, + "epoch": 1.929169756392299, + "grad_norm": 0.6567638516426086, + "learning_rate": 2.0593699961683594e-06, + "loss": 1.4972, + "mean_token_accuracy": 0.6532185673713684, + "num_tokens": 2944307513.0, + "step": 17561 + }, + { + "entropy": 1.715930829445521, + "epoch": 1.929279613303672, + "grad_norm": 0.6228388547897339, + "learning_rate": 2.059186531968946e-06, + "loss": 1.3633, + "mean_token_accuracy": 0.6599201112985611, + "num_tokens": 2944488590.0, + "step": 17562 + }, + { + "entropy": 1.7236754894256592, + "epoch": 1.929389470215045, + "grad_norm": 0.6078210473060608, + "learning_rate": 2.059003350742251e-06, + "loss": 1.3334, + "mean_token_accuracy": 0.6531506727139155, + "num_tokens": 2944658053.0, + "step": 17563 + }, + { + "entropy": 1.7208663125832875, + "epoch": 1.9294993271264178, + "grad_norm": 0.6693648099899292, + "learning_rate": 2.0588204524940702e-06, + "loss": 1.4905, + "mean_token_accuracy": 0.6456576486428579, + "num_tokens": 2944876779.0, + "step": 17564 + }, + { + "entropy": 1.6756204068660736, + "epoch": 1.929609184037791, + "grad_norm": 0.6165689826011658, + "learning_rate": 2.0586378372301948e-06, + "loss": 1.4901, + "mean_token_accuracy": 0.6373255352179209, + "num_tokens": 2945070191.0, + "step": 17565 + }, + { + "entropy": 1.767015238602956, + "epoch": 1.9297190409491636, + "grad_norm": 0.8223422765731812, + "learning_rate": 2.0584555049564012e-06, + "loss": 1.3183, + "mean_token_accuracy": 0.6613618781169256, + "num_tokens": 2945195660.0, + "step": 17566 + }, + { + "entropy": 1.6678463419278462, + "epoch": 1.9298288978605367, + "grad_norm": 0.7065117955207825, + "learning_rate": 2.0582734556784618e-06, + "loss": 1.4749, + "mean_token_accuracy": 0.6527349551518759, + "num_tokens": 2945350628.0, + "step": 17567 + }, + { + "entropy": 1.59239661693573, + "epoch": 1.9299387547719096, + "grad_norm": 0.7592599987983704, + "learning_rate": 2.0580916894021383e-06, + "loss": 1.2039, + "mean_token_accuracy": 0.6815276394287745, + "num_tokens": 2945494933.0, + "step": 17568 + }, + { + "entropy": 1.770630935827891, + "epoch": 1.9300486116832825, + "grad_norm": 0.6339170932769775, + "learning_rate": 2.0579102061331847e-06, + "loss": 1.4645, + "mean_token_accuracy": 0.6460703512032827, + "num_tokens": 2945689644.0, + "step": 17569 + }, + { + "entropy": 1.6582429707050323, + "epoch": 1.9301584685946556, + "grad_norm": 0.6860533952713013, + "learning_rate": 2.0577290058773418e-06, + "loss": 1.3485, + "mean_token_accuracy": 0.6733825008074442, + "num_tokens": 2945818254.0, + "step": 17570 + }, + { + "entropy": 1.6789084871610005, + "epoch": 1.9302683255060284, + "grad_norm": 0.6942259073257446, + "learning_rate": 2.057548088640347e-06, + "loss": 1.2834, + "mean_token_accuracy": 0.6708969473838806, + "num_tokens": 2945953707.0, + "step": 17571 + }, + { + "entropy": 1.6964404781659443, + "epoch": 1.9303781824174013, + "grad_norm": 0.6454761624336243, + "learning_rate": 2.0573674544279264e-06, + "loss": 1.3878, + "mean_token_accuracy": 0.6570356140534083, + "num_tokens": 2946100936.0, + "step": 17572 + }, + { + "entropy": 1.6695491174856822, + "epoch": 1.9304880393287742, + "grad_norm": 0.6851876378059387, + "learning_rate": 2.0571871032457957e-06, + "loss": 1.6392, + "mean_token_accuracy": 0.6317041863997778, + "num_tokens": 2946309464.0, + "step": 17573 + }, + { + "entropy": 1.7130326131979625, + "epoch": 1.930597896240147, + "grad_norm": 0.8214697241783142, + "learning_rate": 2.057007035099663e-06, + "loss": 1.3649, + "mean_token_accuracy": 0.673834909995397, + "num_tokens": 2946427385.0, + "step": 17574 + }, + { + "entropy": 1.729885309934616, + "epoch": 1.9307077531515202, + "grad_norm": 0.6711888909339905, + "learning_rate": 2.056827249995229e-06, + "loss": 1.3746, + "mean_token_accuracy": 0.661648154258728, + "num_tokens": 2946587424.0, + "step": 17575 + }, + { + "entropy": 1.6910810470581055, + "epoch": 1.930817610062893, + "grad_norm": 0.5932799577713013, + "learning_rate": 2.0566477479381818e-06, + "loss": 1.3763, + "mean_token_accuracy": 0.6474411884943644, + "num_tokens": 2946719017.0, + "step": 17576 + }, + { + "entropy": 1.62163445353508, + "epoch": 1.930927466974266, + "grad_norm": 0.5636860728263855, + "learning_rate": 2.0564685289342035e-06, + "loss": 1.4351, + "mean_token_accuracy": 0.6378550479809443, + "num_tokens": 2946923068.0, + "step": 17577 + }, + { + "entropy": 1.7062998513380687, + "epoch": 1.931037323885639, + "grad_norm": 0.8094304800033569, + "learning_rate": 2.0562895929889665e-06, + "loss": 1.32, + "mean_token_accuracy": 0.6633241772651672, + "num_tokens": 2947031556.0, + "step": 17578 + }, + { + "entropy": 1.6850249568621318, + "epoch": 1.9311471807970118, + "grad_norm": 0.7753397226333618, + "learning_rate": 2.0561109401081326e-06, + "loss": 1.4007, + "mean_token_accuracy": 0.6599368900060654, + "num_tokens": 2947165163.0, + "step": 17579 + }, + { + "entropy": 1.7386715511480968, + "epoch": 1.9312570377083849, + "grad_norm": 0.610440731048584, + "learning_rate": 2.055932570297359e-06, + "loss": 1.3973, + "mean_token_accuracy": 0.6602314561605453, + "num_tokens": 2947361719.0, + "step": 17580 + }, + { + "entropy": 1.7325500547885895, + "epoch": 1.9313668946197577, + "grad_norm": 0.8086016178131104, + "learning_rate": 2.0557544835622885e-06, + "loss": 1.6061, + "mean_token_accuracy": 0.6291801979144415, + "num_tokens": 2947510879.0, + "step": 17581 + }, + { + "entropy": 1.701207419236501, + "epoch": 1.9314767515311306, + "grad_norm": 0.7074292898178101, + "learning_rate": 2.055576679908558e-06, + "loss": 1.3229, + "mean_token_accuracy": 0.6661451806624731, + "num_tokens": 2947660460.0, + "step": 17582 + }, + { + "entropy": 1.710488885641098, + "epoch": 1.9315866084425037, + "grad_norm": 0.6005741953849792, + "learning_rate": 2.0553991593417954e-06, + "loss": 1.4012, + "mean_token_accuracy": 0.6515051871538162, + "num_tokens": 2947880490.0, + "step": 17583 + }, + { + "entropy": 1.7140753865242004, + "epoch": 1.9316964653538766, + "grad_norm": 0.75965416431427, + "learning_rate": 2.0552219218676184e-06, + "loss": 1.416, + "mean_token_accuracy": 0.655417412519455, + "num_tokens": 2948057336.0, + "step": 17584 + }, + { + "entropy": 1.6573958098888397, + "epoch": 1.9318063222652495, + "grad_norm": 0.5710785388946533, + "learning_rate": 2.0550449674916374e-06, + "loss": 1.3928, + "mean_token_accuracy": 0.6540137430032095, + "num_tokens": 2948288883.0, + "step": 17585 + }, + { + "entropy": 1.7147573431332905, + "epoch": 1.9319161791766224, + "grad_norm": 0.709555447101593, + "learning_rate": 2.0548682962194525e-06, + "loss": 1.4287, + "mean_token_accuracy": 0.6520878771940867, + "num_tokens": 2948492498.0, + "step": 17586 + }, + { + "entropy": 1.7575667103131611, + "epoch": 1.9320260360879953, + "grad_norm": 0.5812540650367737, + "learning_rate": 2.0546919080566545e-06, + "loss": 1.4646, + "mean_token_accuracy": 0.6403382619222006, + "num_tokens": 2948692105.0, + "step": 17587 + }, + { + "entropy": 1.7437999844551086, + "epoch": 1.9321358929993684, + "grad_norm": 0.7731236219406128, + "learning_rate": 2.054515803008827e-06, + "loss": 1.3402, + "mean_token_accuracy": 0.66108538210392, + "num_tokens": 2948809266.0, + "step": 17588 + }, + { + "entropy": 1.6447148124376934, + "epoch": 1.9322457499107413, + "grad_norm": 0.6328097581863403, + "learning_rate": 2.0543399810815448e-06, + "loss": 1.3588, + "mean_token_accuracy": 0.6679736226797104, + "num_tokens": 2948943265.0, + "step": 17589 + }, + { + "entropy": 1.6997552712758381, + "epoch": 1.9323556068221142, + "grad_norm": 0.7128564119338989, + "learning_rate": 2.05416444228037e-06, + "loss": 1.2944, + "mean_token_accuracy": 0.6734185715516409, + "num_tokens": 2949056782.0, + "step": 17590 + }, + { + "entropy": 1.7652290364106495, + "epoch": 1.9324654637334873, + "grad_norm": 0.6556487679481506, + "learning_rate": 2.053989186610859e-06, + "loss": 1.4006, + "mean_token_accuracy": 0.6583968649307886, + "num_tokens": 2949200473.0, + "step": 17591 + }, + { + "entropy": 1.7773662507534027, + "epoch": 1.93257532064486, + "grad_norm": 0.7734547853469849, + "learning_rate": 2.0538142140785604e-06, + "loss": 1.567, + "mean_token_accuracy": 0.6377990494171778, + "num_tokens": 2949430212.0, + "step": 17592 + }, + { + "entropy": 1.6884727974732716, + "epoch": 1.932685177556233, + "grad_norm": 0.6802954077720642, + "learning_rate": 2.0536395246890104e-06, + "loss": 1.2043, + "mean_token_accuracy": 0.6867648412783941, + "num_tokens": 2949532312.0, + "step": 17593 + }, + { + "entropy": 1.707016219695409, + "epoch": 1.932795034467606, + "grad_norm": 0.9521844983100891, + "learning_rate": 2.0534651184477376e-06, + "loss": 1.3584, + "mean_token_accuracy": 0.6677818149328232, + "num_tokens": 2949680088.0, + "step": 17594 + }, + { + "entropy": 1.7301967144012451, + "epoch": 1.9329048913789788, + "grad_norm": 0.7132523059844971, + "learning_rate": 2.053290995360262e-06, + "loss": 1.3355, + "mean_token_accuracy": 0.6592816412448883, + "num_tokens": 2949816290.0, + "step": 17595 + }, + { + "entropy": 1.6510724425315857, + "epoch": 1.933014748290352, + "grad_norm": 0.7266356348991394, + "learning_rate": 2.053117155432095e-06, + "loss": 1.2963, + "mean_token_accuracy": 0.6727642168601354, + "num_tokens": 2949962938.0, + "step": 17596 + }, + { + "entropy": 1.6770485142866771, + "epoch": 1.9331246052017248, + "grad_norm": 0.6671058535575867, + "learning_rate": 2.052943598668739e-06, + "loss": 1.2558, + "mean_token_accuracy": 0.6702116926511129, + "num_tokens": 2950094455.0, + "step": 17597 + }, + { + "entropy": 1.6965230802694957, + "epoch": 1.9332344621130977, + "grad_norm": 0.7873666286468506, + "learning_rate": 2.0527703250756874e-06, + "loss": 1.3483, + "mean_token_accuracy": 0.6622929026683172, + "num_tokens": 2950236947.0, + "step": 17598 + }, + { + "entropy": 1.7883230050404866, + "epoch": 1.9333443190244708, + "grad_norm": 0.6915072798728943, + "learning_rate": 2.0525973346584215e-06, + "loss": 1.5227, + "mean_token_accuracy": 0.6453968932231268, + "num_tokens": 2950483692.0, + "step": 17599 + }, + { + "entropy": 1.6910318632920582, + "epoch": 1.9334541759358435, + "grad_norm": 0.6855702996253967, + "learning_rate": 2.0524246274224193e-06, + "loss": 1.4729, + "mean_token_accuracy": 0.6422146856784821, + "num_tokens": 2950643969.0, + "step": 17600 + }, + { + "entropy": 1.7280892829100292, + "epoch": 1.9335640328472166, + "grad_norm": 0.6929906606674194, + "learning_rate": 2.0522522033731457e-06, + "loss": 1.4143, + "mean_token_accuracy": 0.6582867602507273, + "num_tokens": 2950797242.0, + "step": 17601 + }, + { + "entropy": 1.6679367621739705, + "epoch": 1.9336738897585894, + "grad_norm": 0.5708891153335571, + "learning_rate": 2.052080062516057e-06, + "loss": 1.4377, + "mean_token_accuracy": 0.6401932090520859, + "num_tokens": 2951025260.0, + "step": 17602 + }, + { + "entropy": 1.7221278150876362, + "epoch": 1.9337837466699623, + "grad_norm": 0.7946709990501404, + "learning_rate": 2.0519082048566026e-06, + "loss": 1.2156, + "mean_token_accuracy": 0.6832669277985891, + "num_tokens": 2951122533.0, + "step": 17603 + }, + { + "entropy": 1.721358944972356, + "epoch": 1.9338936035813354, + "grad_norm": 0.7960524559020996, + "learning_rate": 2.0517366304002225e-06, + "loss": 1.3881, + "mean_token_accuracy": 0.6513563940922419, + "num_tokens": 2951259461.0, + "step": 17604 + }, + { + "entropy": 1.623354434967041, + "epoch": 1.934003460492708, + "grad_norm": 0.6415337920188904, + "learning_rate": 2.0515653391523454e-06, + "loss": 1.2289, + "mean_token_accuracy": 0.678653672337532, + "num_tokens": 2951398780.0, + "step": 17605 + }, + { + "entropy": 1.6779876053333282, + "epoch": 1.9341133174040812, + "grad_norm": 0.7085760235786438, + "learning_rate": 2.051394331118392e-06, + "loss": 1.3341, + "mean_token_accuracy": 0.6740488906701406, + "num_tokens": 2951548624.0, + "step": 17606 + }, + { + "entropy": 1.7369104822476704, + "epoch": 1.934223174315454, + "grad_norm": 0.6106285452842712, + "learning_rate": 2.0512236063037767e-06, + "loss": 1.4817, + "mean_token_accuracy": 0.6403156071901321, + "num_tokens": 2951760517.0, + "step": 17607 + }, + { + "entropy": 1.744108925263087, + "epoch": 1.934333031226827, + "grad_norm": 0.6894066333770752, + "learning_rate": 2.051053164713902e-06, + "loss": 1.4239, + "mean_token_accuracy": 0.6464538921912512, + "num_tokens": 2951967261.0, + "step": 17608 + }, + { + "entropy": 1.7185207108656566, + "epoch": 1.9344428881382, + "grad_norm": 0.7298069596290588, + "learning_rate": 2.0508830063541615e-06, + "loss": 1.398, + "mean_token_accuracy": 0.6780803749958674, + "num_tokens": 2952102298.0, + "step": 17609 + }, + { + "entropy": 1.7548142075538635, + "epoch": 1.934552745049573, + "grad_norm": 0.7205964922904968, + "learning_rate": 2.0507131312299423e-06, + "loss": 1.4849, + "mean_token_accuracy": 0.6329950988292694, + "num_tokens": 2952322155.0, + "step": 17610 + }, + { + "entropy": 1.6941269437472026, + "epoch": 1.9346626019609459, + "grad_norm": 0.6685055494308472, + "learning_rate": 2.0505435393466183e-06, + "loss": 1.3536, + "mean_token_accuracy": 0.6650911470254263, + "num_tokens": 2952502975.0, + "step": 17611 + }, + { + "entropy": 1.7216095228989918, + "epoch": 1.934772458872319, + "grad_norm": 0.687439501285553, + "learning_rate": 2.0503742307095593e-06, + "loss": 1.3819, + "mean_token_accuracy": 0.6688835869232813, + "num_tokens": 2952660501.0, + "step": 17612 + }, + { + "entropy": 1.7403018077214558, + "epoch": 1.9348823157836916, + "grad_norm": 0.7653110027313232, + "learning_rate": 2.050205205324123e-06, + "loss": 1.3546, + "mean_token_accuracy": 0.6541765530904134, + "num_tokens": 2952802470.0, + "step": 17613 + }, + { + "entropy": 1.7342408001422882, + "epoch": 1.9349921726950647, + "grad_norm": 0.8154670596122742, + "learning_rate": 2.050036463195659e-06, + "loss": 1.4267, + "mean_token_accuracy": 0.6695893357197443, + "num_tokens": 2952917329.0, + "step": 17614 + }, + { + "entropy": 1.6820714871088664, + "epoch": 1.9351020296064376, + "grad_norm": 0.7906708717346191, + "learning_rate": 2.049868004329508e-06, + "loss": 1.3398, + "mean_token_accuracy": 0.6582440783580145, + "num_tokens": 2953082322.0, + "step": 17615 + }, + { + "entropy": 1.7514929175376892, + "epoch": 1.9352118865178105, + "grad_norm": 0.7457329630851746, + "learning_rate": 2.049699828731002e-06, + "loss": 1.5964, + "mean_token_accuracy": 0.6357724542419115, + "num_tokens": 2953248248.0, + "step": 17616 + }, + { + "entropy": 1.665393054485321, + "epoch": 1.9353217434291836, + "grad_norm": 0.6951583027839661, + "learning_rate": 2.049531936405464e-06, + "loss": 1.5821, + "mean_token_accuracy": 0.6351954390605291, + "num_tokens": 2953467876.0, + "step": 17617 + }, + { + "entropy": 1.7500004569689434, + "epoch": 1.9354316003405563, + "grad_norm": 0.6640505790710449, + "learning_rate": 2.0493643273582057e-06, + "loss": 1.4366, + "mean_token_accuracy": 0.6397050768136978, + "num_tokens": 2953701201.0, + "step": 17618 + }, + { + "entropy": 1.7141669193903606, + "epoch": 1.9355414572519294, + "grad_norm": 0.654679536819458, + "learning_rate": 2.049197001594534e-06, + "loss": 1.5257, + "mean_token_accuracy": 0.6353826026121775, + "num_tokens": 2953900373.0, + "step": 17619 + }, + { + "entropy": 1.7072954674561818, + "epoch": 1.9356513141633023, + "grad_norm": 0.6300878524780273, + "learning_rate": 2.0490299591197426e-06, + "loss": 1.4607, + "mean_token_accuracy": 0.6487697213888168, + "num_tokens": 2954088615.0, + "step": 17620 + }, + { + "entropy": 1.7204040785630543, + "epoch": 1.9357611710746752, + "grad_norm": 0.6421894431114197, + "learning_rate": 2.048863199939121e-06, + "loss": 1.3859, + "mean_token_accuracy": 0.6516237407922745, + "num_tokens": 2954246097.0, + "step": 17621 + }, + { + "entropy": 1.7502519289652507, + "epoch": 1.9358710279860483, + "grad_norm": 0.6403509378433228, + "learning_rate": 2.048696724057945e-06, + "loss": 1.4453, + "mean_token_accuracy": 0.6476548612117767, + "num_tokens": 2954416854.0, + "step": 17622 + }, + { + "entropy": 1.7479767203330994, + "epoch": 1.9359808848974212, + "grad_norm": 0.6511620283126831, + "learning_rate": 2.0485305314814843e-06, + "loss": 1.5819, + "mean_token_accuracy": 0.6273590077956518, + "num_tokens": 2954650460.0, + "step": 17623 + }, + { + "entropy": 1.746782938639323, + "epoch": 1.936090741808794, + "grad_norm": 0.7770575284957886, + "learning_rate": 2.0483646222149993e-06, + "loss": 1.3677, + "mean_token_accuracy": 0.6582979708909988, + "num_tokens": 2954788939.0, + "step": 17624 + }, + { + "entropy": 1.7274068494637806, + "epoch": 1.9362005987201671, + "grad_norm": 0.6287463307380676, + "learning_rate": 2.0481989962637393e-06, + "loss": 1.3752, + "mean_token_accuracy": 0.6612710257371267, + "num_tokens": 2954960018.0, + "step": 17625 + }, + { + "entropy": 1.732554425795873, + "epoch": 1.9363104556315398, + "grad_norm": 0.6919560432434082, + "learning_rate": 2.048033653632947e-06, + "loss": 1.4181, + "mean_token_accuracy": 0.651200125614802, + "num_tokens": 2955097173.0, + "step": 17626 + }, + { + "entropy": 1.663443386554718, + "epoch": 1.936420312542913, + "grad_norm": 0.6868497729301453, + "learning_rate": 2.0478685943278565e-06, + "loss": 1.4677, + "mean_token_accuracy": 0.6552320023377737, + "num_tokens": 2955304001.0, + "step": 17627 + }, + { + "entropy": 1.7057395776112874, + "epoch": 1.9365301694542858, + "grad_norm": 0.6456490159034729, + "learning_rate": 2.0477038183536913e-06, + "loss": 1.369, + "mean_token_accuracy": 0.6559461901585261, + "num_tokens": 2955487842.0, + "step": 17628 + }, + { + "entropy": 1.7742679218451183, + "epoch": 1.9366400263656587, + "grad_norm": 0.6736863851547241, + "learning_rate": 2.0475393257156655e-06, + "loss": 1.4416, + "mean_token_accuracy": 0.6455845187107722, + "num_tokens": 2955688361.0, + "step": 17629 + }, + { + "entropy": 1.7152503629525502, + "epoch": 1.9367498832770318, + "grad_norm": 0.7249388098716736, + "learning_rate": 2.0473751164189866e-06, + "loss": 1.2432, + "mean_token_accuracy": 0.6813737452030182, + "num_tokens": 2955803129.0, + "step": 17630 + }, + { + "entropy": 1.6919386585553486, + "epoch": 1.9368597401884045, + "grad_norm": 0.6506887674331665, + "learning_rate": 2.047211190468851e-06, + "loss": 1.3881, + "mean_token_accuracy": 0.6640054186185201, + "num_tokens": 2955999765.0, + "step": 17631 + }, + { + "entropy": 1.7553547322750092, + "epoch": 1.9369695970997776, + "grad_norm": 0.7061113715171814, + "learning_rate": 2.0470475478704465e-06, + "loss": 1.3973, + "mean_token_accuracy": 0.6518440246582031, + "num_tokens": 2956153926.0, + "step": 17632 + }, + { + "entropy": 1.690263569355011, + "epoch": 1.9370794540111504, + "grad_norm": 0.6797465682029724, + "learning_rate": 2.0468841886289534e-06, + "loss": 1.3509, + "mean_token_accuracy": 0.6615539789199829, + "num_tokens": 2956278409.0, + "step": 17633 + }, + { + "entropy": 1.7189227143923442, + "epoch": 1.9371893109225233, + "grad_norm": 0.7588323950767517, + "learning_rate": 2.0467211127495413e-06, + "loss": 1.3427, + "mean_token_accuracy": 0.680439700682958, + "num_tokens": 2956426012.0, + "step": 17634 + }, + { + "entropy": 1.6739676396052043, + "epoch": 1.9372991678338964, + "grad_norm": 0.6452463269233704, + "learning_rate": 2.0465583202373717e-06, + "loss": 1.2856, + "mean_token_accuracy": 0.6756146202484766, + "num_tokens": 2956554851.0, + "step": 17635 + }, + { + "entropy": 1.6939974129199982, + "epoch": 1.9374090247452693, + "grad_norm": 0.7644104361534119, + "learning_rate": 2.0463958110975957e-06, + "loss": 1.2794, + "mean_token_accuracy": 0.6747647374868393, + "num_tokens": 2956682401.0, + "step": 17636 + }, + { + "entropy": 1.7220764855543773, + "epoch": 1.9375188816566422, + "grad_norm": 0.6523967385292053, + "learning_rate": 2.046233585335359e-06, + "loss": 1.4107, + "mean_token_accuracy": 0.6561575432618459, + "num_tokens": 2956840065.0, + "step": 17637 + }, + { + "entropy": 1.72719939549764, + "epoch": 1.9376287385680153, + "grad_norm": 0.5423910021781921, + "learning_rate": 2.0460716429557937e-06, + "loss": 1.5145, + "mean_token_accuracy": 0.6321031053860983, + "num_tokens": 2957092643.0, + "step": 17638 + }, + { + "entropy": 1.6563969254493713, + "epoch": 1.937738595479388, + "grad_norm": 0.634699821472168, + "learning_rate": 2.045909983964027e-06, + "loss": 1.262, + "mean_token_accuracy": 0.6715675294399261, + "num_tokens": 2957223272.0, + "step": 17639 + }, + { + "entropy": 1.7614007492860158, + "epoch": 1.937848452390761, + "grad_norm": 0.8633323311805725, + "learning_rate": 2.045748608365174e-06, + "loss": 1.2673, + "mean_token_accuracy": 0.6737097253402075, + "num_tokens": 2957385470.0, + "step": 17640 + }, + { + "entropy": 1.6651588181654613, + "epoch": 1.937958309302134, + "grad_norm": 0.716803252696991, + "learning_rate": 2.045587516164342e-06, + "loss": 1.3767, + "mean_token_accuracy": 0.6795123418172201, + "num_tokens": 2957501108.0, + "step": 17641 + }, + { + "entropy": 1.7195215821266174, + "epoch": 1.9380681662135069, + "grad_norm": 0.639150857925415, + "learning_rate": 2.0454267073666314e-06, + "loss": 1.4592, + "mean_token_accuracy": 0.6458094666401545, + "num_tokens": 2957718885.0, + "step": 17642 + }, + { + "entropy": 1.6857140560944874, + "epoch": 1.93817802312488, + "grad_norm": 0.6419286727905273, + "learning_rate": 2.045266181977129e-06, + "loss": 1.3571, + "mean_token_accuracy": 0.6528473595778147, + "num_tokens": 2957910681.0, + "step": 17643 + }, + { + "entropy": 1.7539623578389485, + "epoch": 1.9382878800362526, + "grad_norm": 0.8385192155838013, + "learning_rate": 2.0451059400009183e-06, + "loss": 1.4932, + "mean_token_accuracy": 0.6529499888420105, + "num_tokens": 2958064304.0, + "step": 17644 + }, + { + "entropy": 1.7804806133111317, + "epoch": 1.9383977369476257, + "grad_norm": 0.731391429901123, + "learning_rate": 2.044945981443069e-06, + "loss": 1.5301, + "mean_token_accuracy": 0.6367037196954092, + "num_tokens": 2958232801.0, + "step": 17645 + }, + { + "entropy": 1.6474103232224782, + "epoch": 1.9385075938589986, + "grad_norm": 0.7180584073066711, + "learning_rate": 2.0447863063086444e-06, + "loss": 1.3229, + "mean_token_accuracy": 0.6761461794376373, + "num_tokens": 2958390527.0, + "step": 17646 + }, + { + "entropy": 1.6555694937705994, + "epoch": 1.9386174507703715, + "grad_norm": 0.7365303635597229, + "learning_rate": 2.0446269146026973e-06, + "loss": 1.4973, + "mean_token_accuracy": 0.6524901290734609, + "num_tokens": 2958559788.0, + "step": 17647 + }, + { + "entropy": 1.6348630885283153, + "epoch": 1.9387273076817446, + "grad_norm": 0.7571955323219299, + "learning_rate": 2.044467806330273e-06, + "loss": 1.3723, + "mean_token_accuracy": 0.6655997534592947, + "num_tokens": 2958782862.0, + "step": 17648 + }, + { + "entropy": 1.7069834967454274, + "epoch": 1.9388371645931175, + "grad_norm": 0.6963647603988647, + "learning_rate": 2.0443089814964074e-06, + "loss": 1.3579, + "mean_token_accuracy": 0.6717394888401031, + "num_tokens": 2958926571.0, + "step": 17649 + }, + { + "entropy": 1.739360918601354, + "epoch": 1.9389470215044904, + "grad_norm": 0.7900950908660889, + "learning_rate": 2.044150440106126e-06, + "loss": 1.547, + "mean_token_accuracy": 0.6345634957154592, + "num_tokens": 2959092134.0, + "step": 17650 + }, + { + "entropy": 1.732648919026057, + "epoch": 1.9390568784158635, + "grad_norm": 0.7327719330787659, + "learning_rate": 2.04399218216445e-06, + "loss": 1.4618, + "mean_token_accuracy": 0.6432109524806341, + "num_tokens": 2959228429.0, + "step": 17651 + }, + { + "entropy": 1.7227603793144226, + "epoch": 1.9391667353272362, + "grad_norm": 0.6913332343101501, + "learning_rate": 2.043834207676384e-06, + "loss": 1.3656, + "mean_token_accuracy": 0.6713483432928721, + "num_tokens": 2959344802.0, + "step": 17652 + }, + { + "entropy": 1.7791978816191356, + "epoch": 1.9392765922386093, + "grad_norm": 0.8063234686851501, + "learning_rate": 2.04367651664693e-06, + "loss": 1.5213, + "mean_token_accuracy": 0.6414127200841904, + "num_tokens": 2959482823.0, + "step": 17653 + }, + { + "entropy": 1.7187303205331166, + "epoch": 1.9393864491499822, + "grad_norm": 0.6150225400924683, + "learning_rate": 2.043519109081078e-06, + "loss": 1.4355, + "mean_token_accuracy": 0.6597120662530264, + "num_tokens": 2959665866.0, + "step": 17654 + }, + { + "entropy": 1.6703492403030396, + "epoch": 1.939496306061355, + "grad_norm": 0.7310093641281128, + "learning_rate": 2.04336198498381e-06, + "loss": 1.3601, + "mean_token_accuracy": 0.6774055063724518, + "num_tokens": 2959809976.0, + "step": 17655 + }, + { + "entropy": 1.6999173561731975, + "epoch": 1.9396061629727281, + "grad_norm": 0.6536582112312317, + "learning_rate": 2.0432051443601e-06, + "loss": 1.4258, + "mean_token_accuracy": 0.6664847979942957, + "num_tokens": 2959974099.0, + "step": 17656 + }, + { + "entropy": 1.706270545721054, + "epoch": 1.9397160198841008, + "grad_norm": 0.5601027607917786, + "learning_rate": 2.0430485872149117e-06, + "loss": 1.2432, + "mean_token_accuracy": 0.6715459475914637, + "num_tokens": 2960154880.0, + "step": 17657 + }, + { + "entropy": 1.6991495788097382, + "epoch": 1.939825876795474, + "grad_norm": 0.6836209297180176, + "learning_rate": 2.0428923135531984e-06, + "loss": 1.2991, + "mean_token_accuracy": 0.6632804969946543, + "num_tokens": 2960313055.0, + "step": 17658 + }, + { + "entropy": 1.7432080507278442, + "epoch": 1.9399357337068468, + "grad_norm": 0.91303950548172, + "learning_rate": 2.042736323379907e-06, + "loss": 1.4199, + "mean_token_accuracy": 0.6468232373396555, + "num_tokens": 2960456172.0, + "step": 17659 + }, + { + "entropy": 1.727357546488444, + "epoch": 1.9400455906182197, + "grad_norm": 0.752682626247406, + "learning_rate": 2.0425806166999755e-06, + "loss": 1.3344, + "mean_token_accuracy": 0.6625064412752787, + "num_tokens": 2960584571.0, + "step": 17660 + }, + { + "entropy": 1.7152182559172313, + "epoch": 1.9401554475295928, + "grad_norm": 0.7578296661376953, + "learning_rate": 2.0424251935183316e-06, + "loss": 1.2797, + "mean_token_accuracy": 0.6746835658947626, + "num_tokens": 2960717321.0, + "step": 17661 + }, + { + "entropy": 1.7040897111097972, + "epoch": 1.9402653044409657, + "grad_norm": 0.6826738119125366, + "learning_rate": 2.042270053839893e-06, + "loss": 1.5189, + "mean_token_accuracy": 0.6396296223004659, + "num_tokens": 2960883517.0, + "step": 17662 + }, + { + "entropy": 1.7391219735145569, + "epoch": 1.9403751613523386, + "grad_norm": 0.7768815755844116, + "learning_rate": 2.042115197669571e-06, + "loss": 1.4045, + "mean_token_accuracy": 0.6670640160640081, + "num_tokens": 2961004315.0, + "step": 17663 + }, + { + "entropy": 1.7473591566085815, + "epoch": 1.9404850182637117, + "grad_norm": 0.6485081315040588, + "learning_rate": 2.0419606250122666e-06, + "loss": 1.5027, + "mean_token_accuracy": 0.6461313168207804, + "num_tokens": 2961219255.0, + "step": 17664 + }, + { + "entropy": 1.686641921599706, + "epoch": 1.9405948751750843, + "grad_norm": 0.7350051403045654, + "learning_rate": 2.041806335872872e-06, + "loss": 1.3163, + "mean_token_accuracy": 0.6799498746792475, + "num_tokens": 2961376511.0, + "step": 17665 + }, + { + "entropy": 1.7102410693963368, + "epoch": 1.9407047320864574, + "grad_norm": 0.7319943308830261, + "learning_rate": 2.04165233025627e-06, + "loss": 1.4536, + "mean_token_accuracy": 0.647271086772283, + "num_tokens": 2961578557.0, + "step": 17666 + }, + { + "entropy": 1.7241827249526978, + "epoch": 1.9408145889978303, + "grad_norm": 0.7803419232368469, + "learning_rate": 2.041498608167335e-06, + "loss": 1.4855, + "mean_token_accuracy": 0.6466056903203329, + "num_tokens": 2961771057.0, + "step": 17667 + }, + { + "entropy": 1.653838684161504, + "epoch": 1.9409244459092032, + "grad_norm": 0.7193467617034912, + "learning_rate": 2.0413451696109315e-06, + "loss": 1.2023, + "mean_token_accuracy": 0.6801579395929972, + "num_tokens": 2961886643.0, + "step": 17668 + }, + { + "entropy": 1.6649916072686513, + "epoch": 1.9410343028205763, + "grad_norm": 0.6423388123512268, + "learning_rate": 2.0411920145919186e-06, + "loss": 1.3242, + "mean_token_accuracy": 0.6613106826941172, + "num_tokens": 2962044517.0, + "step": 17669 + }, + { + "entropy": 1.7756304542223613, + "epoch": 1.941144159731949, + "grad_norm": 0.765924334526062, + "learning_rate": 2.0410391431151396e-06, + "loss": 1.2838, + "mean_token_accuracy": 0.6603503326574961, + "num_tokens": 2962163822.0, + "step": 17670 + }, + { + "entropy": 1.7474627792835236, + "epoch": 1.941254016643322, + "grad_norm": 0.7282141447067261, + "learning_rate": 2.040886555185435e-06, + "loss": 1.3865, + "mean_token_accuracy": 0.6600453555583954, + "num_tokens": 2962301011.0, + "step": 17671 + }, + { + "entropy": 1.6974960267543793, + "epoch": 1.941363873554695, + "grad_norm": 0.7170888781547546, + "learning_rate": 2.040734250807634e-06, + "loss": 1.4518, + "mean_token_accuracy": 0.6686425358057022, + "num_tokens": 2962491509.0, + "step": 17672 + }, + { + "entropy": 1.673610270023346, + "epoch": 1.9414737304660679, + "grad_norm": 0.597626805305481, + "learning_rate": 2.0405822299865577e-06, + "loss": 1.2537, + "mean_token_accuracy": 0.6842442254225413, + "num_tokens": 2962673019.0, + "step": 17673 + }, + { + "entropy": 1.7404329578081768, + "epoch": 1.941583587377441, + "grad_norm": 0.8017789721488953, + "learning_rate": 2.0404304927270156e-06, + "loss": 1.3374, + "mean_token_accuracy": 0.6645545810461044, + "num_tokens": 2962792301.0, + "step": 17674 + }, + { + "entropy": 1.664712945620219, + "epoch": 1.9416934442888139, + "grad_norm": 0.6364064812660217, + "learning_rate": 2.040279039033812e-06, + "loss": 1.5154, + "mean_token_accuracy": 0.6475819051265717, + "num_tokens": 2962974839.0, + "step": 17675 + }, + { + "entropy": 1.690699468056361, + "epoch": 1.9418033012001867, + "grad_norm": 0.681605339050293, + "learning_rate": 2.0401278689117387e-06, + "loss": 1.3548, + "mean_token_accuracy": 0.6689165979623795, + "num_tokens": 2963119502.0, + "step": 17676 + }, + { + "entropy": 1.714266578356425, + "epoch": 1.9419131581115598, + "grad_norm": 0.7001098394393921, + "learning_rate": 2.039976982365581e-06, + "loss": 1.3567, + "mean_token_accuracy": 0.659160926938057, + "num_tokens": 2963294588.0, + "step": 17677 + }, + { + "entropy": 1.7354275782903035, + "epoch": 1.9420230150229325, + "grad_norm": 0.7794528007507324, + "learning_rate": 2.0398263794001142e-06, + "loss": 1.315, + "mean_token_accuracy": 0.6574402799208959, + "num_tokens": 2963406921.0, + "step": 17678 + }, + { + "entropy": 1.693873792886734, + "epoch": 1.9421328719343056, + "grad_norm": 0.7015413045883179, + "learning_rate": 2.0396760600201053e-06, + "loss": 1.4379, + "mean_token_accuracy": 0.6484410464763641, + "num_tokens": 2963573114.0, + "step": 17679 + }, + { + "entropy": 1.7558051943778992, + "epoch": 1.9422427288456785, + "grad_norm": 0.7721998691558838, + "learning_rate": 2.0395260242303113e-06, + "loss": 1.6748, + "mean_token_accuracy": 0.6339837138851484, + "num_tokens": 2963763093.0, + "step": 17680 + }, + { + "entropy": 1.6947738925615947, + "epoch": 1.9423525857570514, + "grad_norm": 0.6746589541435242, + "learning_rate": 2.0393762720354816e-06, + "loss": 1.3065, + "mean_token_accuracy": 0.6745160073041916, + "num_tokens": 2963898566.0, + "step": 17681 + }, + { + "entropy": 1.6936275362968445, + "epoch": 1.9424624426684245, + "grad_norm": 0.6480662226676941, + "learning_rate": 2.0392268034403545e-06, + "loss": 1.3829, + "mean_token_accuracy": 0.6479221681753794, + "num_tokens": 2964098765.0, + "step": 17682 + }, + { + "entropy": 1.7492527961730957, + "epoch": 1.9425722995797972, + "grad_norm": 0.6564264893531799, + "learning_rate": 2.0390776184496606e-06, + "loss": 1.3571, + "mean_token_accuracy": 0.6510317424933115, + "num_tokens": 2964246008.0, + "step": 17683 + }, + { + "entropy": 1.6724448998769124, + "epoch": 1.9426821564911703, + "grad_norm": 0.6479511857032776, + "learning_rate": 2.0389287170681226e-06, + "loss": 1.3041, + "mean_token_accuracy": 0.6661087870597839, + "num_tokens": 2964399518.0, + "step": 17684 + }, + { + "entropy": 1.731959581375122, + "epoch": 1.9427920134025431, + "grad_norm": 0.7445155382156372, + "learning_rate": 2.0387800993004534e-06, + "loss": 1.329, + "mean_token_accuracy": 0.659634068608284, + "num_tokens": 2964555472.0, + "step": 17685 + }, + { + "entropy": 1.699200451374054, + "epoch": 1.942901870313916, + "grad_norm": 0.7213234305381775, + "learning_rate": 2.0386317651513557e-06, + "loss": 1.2045, + "mean_token_accuracy": 0.6754323144753774, + "num_tokens": 2964684843.0, + "step": 17686 + }, + { + "entropy": 1.7045519252618153, + "epoch": 1.9430117272252891, + "grad_norm": 0.6406714916229248, + "learning_rate": 2.038483714625525e-06, + "loss": 1.339, + "mean_token_accuracy": 0.6636529515186945, + "num_tokens": 2964860646.0, + "step": 17687 + }, + { + "entropy": 1.713558445374171, + "epoch": 1.943121584136662, + "grad_norm": 0.7353739142417908, + "learning_rate": 2.038335947727646e-06, + "loss": 1.4388, + "mean_token_accuracy": 0.6637803067763647, + "num_tokens": 2965024257.0, + "step": 17688 + }, + { + "entropy": 1.6618661483128865, + "epoch": 1.943231441048035, + "grad_norm": 0.6706850528717041, + "learning_rate": 2.0381884644623956e-06, + "loss": 1.3865, + "mean_token_accuracy": 0.6615625272194544, + "num_tokens": 2965227115.0, + "step": 17689 + }, + { + "entropy": 1.705021212498347, + "epoch": 1.943341297959408, + "grad_norm": 0.714533269405365, + "learning_rate": 2.0380412648344426e-06, + "loss": 1.418, + "mean_token_accuracy": 0.6533076216777166, + "num_tokens": 2965392439.0, + "step": 17690 + }, + { + "entropy": 1.6992263595263164, + "epoch": 1.9434511548707807, + "grad_norm": 0.6608665585517883, + "learning_rate": 2.037894348848445e-06, + "loss": 1.3918, + "mean_token_accuracy": 0.6552961965401968, + "num_tokens": 2965554340.0, + "step": 17691 + }, + { + "entropy": 1.6828101476033528, + "epoch": 1.9435610117821538, + "grad_norm": 0.663170576095581, + "learning_rate": 2.0377477165090535e-06, + "loss": 1.4066, + "mean_token_accuracy": 0.6510103195905685, + "num_tokens": 2965750001.0, + "step": 17692 + }, + { + "entropy": 1.6746436854203541, + "epoch": 1.9436708686935267, + "grad_norm": 0.6782816648483276, + "learning_rate": 2.0376013678209085e-06, + "loss": 1.3206, + "mean_token_accuracy": 0.6747565368811289, + "num_tokens": 2965884732.0, + "step": 17693 + }, + { + "entropy": 1.6658462782700856, + "epoch": 1.9437807256048996, + "grad_norm": 0.641159176826477, + "learning_rate": 2.03745530278864e-06, + "loss": 1.3442, + "mean_token_accuracy": 0.6628729601701101, + "num_tokens": 2966090536.0, + "step": 17694 + }, + { + "entropy": 1.7376565237840016, + "epoch": 1.9438905825162727, + "grad_norm": 0.7169440388679504, + "learning_rate": 2.0373095214168737e-06, + "loss": 1.4649, + "mean_token_accuracy": 0.6538771440585455, + "num_tokens": 2966257301.0, + "step": 17695 + }, + { + "entropy": 1.73634934425354, + "epoch": 1.9440004394276453, + "grad_norm": 0.7864576578140259, + "learning_rate": 2.037164023710222e-06, + "loss": 1.4079, + "mean_token_accuracy": 0.6538835118214289, + "num_tokens": 2966401908.0, + "step": 17696 + }, + { + "entropy": 1.7507271766662598, + "epoch": 1.9441102963390184, + "grad_norm": 0.819874107837677, + "learning_rate": 2.0370188096732905e-06, + "loss": 1.2977, + "mean_token_accuracy": 0.660680502653122, + "num_tokens": 2966499432.0, + "step": 17697 + }, + { + "entropy": 1.655652830998103, + "epoch": 1.9442201532503913, + "grad_norm": 0.6456592082977295, + "learning_rate": 2.0368738793106745e-06, + "loss": 1.3016, + "mean_token_accuracy": 0.678799549738566, + "num_tokens": 2966687978.0, + "step": 17698 + }, + { + "entropy": 1.6859602630138397, + "epoch": 1.9443300101617642, + "grad_norm": 0.7051413059234619, + "learning_rate": 2.036729232626962e-06, + "loss": 1.4748, + "mean_token_accuracy": 0.6505583177010218, + "num_tokens": 2966839033.0, + "step": 17699 + }, + { + "entropy": 1.6403660873572032, + "epoch": 1.9444398670731373, + "grad_norm": 0.6287828683853149, + "learning_rate": 2.03658486962673e-06, + "loss": 1.3748, + "mean_token_accuracy": 0.6574486494064331, + "num_tokens": 2967024711.0, + "step": 17700 + }, + { + "entropy": 1.719410906235377, + "epoch": 1.9445497239845102, + "grad_norm": 0.6784718036651611, + "learning_rate": 2.036440790314548e-06, + "loss": 1.3411, + "mean_token_accuracy": 0.6652787824471792, + "num_tokens": 2967151451.0, + "step": 17701 + }, + { + "entropy": 1.7372966408729553, + "epoch": 1.944659580895883, + "grad_norm": 0.8029721975326538, + "learning_rate": 2.0362969946949755e-06, + "loss": 1.4529, + "mean_token_accuracy": 0.6361254553000132, + "num_tokens": 2967375721.0, + "step": 17702 + }, + { + "entropy": 1.7620983918507893, + "epoch": 1.9447694378072562, + "grad_norm": 0.575481116771698, + "learning_rate": 2.0361534827725636e-06, + "loss": 1.5031, + "mean_token_accuracy": 0.6388354301452637, + "num_tokens": 2967592752.0, + "step": 17703 + }, + { + "entropy": 1.7115374505519867, + "epoch": 1.9448792947186289, + "grad_norm": 0.6605756878852844, + "learning_rate": 2.0360102545518557e-06, + "loss": 1.39, + "mean_token_accuracy": 0.653022920091947, + "num_tokens": 2967736489.0, + "step": 17704 + }, + { + "entropy": 1.6519458691279094, + "epoch": 1.944989151630002, + "grad_norm": 0.6345553398132324, + "learning_rate": 2.035867310037384e-06, + "loss": 1.4154, + "mean_token_accuracy": 0.6511341631412506, + "num_tokens": 2967887788.0, + "step": 17705 + }, + { + "entropy": 1.737346072991689, + "epoch": 1.9450990085413749, + "grad_norm": 0.7456694841384888, + "learning_rate": 2.0357246492336716e-06, + "loss": 1.313, + "mean_token_accuracy": 0.6586676637331644, + "num_tokens": 2968015984.0, + "step": 17706 + }, + { + "entropy": 1.714151293039322, + "epoch": 1.9452088654527477, + "grad_norm": 0.747020959854126, + "learning_rate": 2.0355822721452358e-06, + "loss": 1.6276, + "mean_token_accuracy": 0.6316531747579575, + "num_tokens": 2968206188.0, + "step": 17707 + }, + { + "entropy": 1.7121087312698364, + "epoch": 1.9453187223641208, + "grad_norm": 0.6129441857337952, + "learning_rate": 2.03544017877658e-06, + "loss": 1.339, + "mean_token_accuracy": 0.6570817331473032, + "num_tokens": 2968360647.0, + "step": 17708 + }, + { + "entropy": 1.6801585257053375, + "epoch": 1.9454285792754935, + "grad_norm": 0.7115065455436707, + "learning_rate": 2.0352983691322046e-06, + "loss": 1.3085, + "mean_token_accuracy": 0.6725891331831614, + "num_tokens": 2968553740.0, + "step": 17709 + }, + { + "entropy": 1.6956307987372081, + "epoch": 1.9455384361868666, + "grad_norm": 0.6248717904090881, + "learning_rate": 2.035156843216596e-06, + "loss": 1.4046, + "mean_token_accuracy": 0.649314617117246, + "num_tokens": 2968758358.0, + "step": 17710 + }, + { + "entropy": 1.7073156734307606, + "epoch": 1.9456482930982395, + "grad_norm": 0.7282310724258423, + "learning_rate": 2.035015601034233e-06, + "loss": 1.4184, + "mean_token_accuracy": 0.6510714391867319, + "num_tokens": 2968907627.0, + "step": 17711 + }, + { + "entropy": 1.767699142297109, + "epoch": 1.9457581500096124, + "grad_norm": 0.74590665102005, + "learning_rate": 2.0348746425895865e-06, + "loss": 1.3263, + "mean_token_accuracy": 0.6632108837366104, + "num_tokens": 2969049681.0, + "step": 17712 + }, + { + "entropy": 1.73310982187589, + "epoch": 1.9458680069209855, + "grad_norm": 0.9105587005615234, + "learning_rate": 2.034733967887119e-06, + "loss": 1.3848, + "mean_token_accuracy": 0.6630599250396093, + "num_tokens": 2969205626.0, + "step": 17713 + }, + { + "entropy": 1.7039326230684917, + "epoch": 1.9459778638323584, + "grad_norm": 0.6730808019638062, + "learning_rate": 2.03459357693128e-06, + "loss": 1.2393, + "mean_token_accuracy": 0.6768955588340759, + "num_tokens": 2969315930.0, + "step": 17714 + }, + { + "entropy": 1.695899059375127, + "epoch": 1.9460877207437313, + "grad_norm": 1.4940348863601685, + "learning_rate": 2.0344534697265152e-06, + "loss": 1.3153, + "mean_token_accuracy": 0.6758741736412048, + "num_tokens": 2969509322.0, + "step": 17715 + }, + { + "entropy": 1.7427550554275513, + "epoch": 1.9461975776551044, + "grad_norm": 0.6874081492424011, + "learning_rate": 2.0343136462772583e-06, + "loss": 1.4501, + "mean_token_accuracy": 0.6438677261273066, + "num_tokens": 2969660188.0, + "step": 17716 + }, + { + "entropy": 1.7365627984205882, + "epoch": 1.946307434566477, + "grad_norm": 0.6163883805274963, + "learning_rate": 2.034174106587934e-06, + "loss": 1.5149, + "mean_token_accuracy": 0.6376241246859232, + "num_tokens": 2969893508.0, + "step": 17717 + }, + { + "entropy": 1.6431179742018382, + "epoch": 1.9464172914778501, + "grad_norm": 0.606504499912262, + "learning_rate": 2.0340348506629586e-06, + "loss": 1.5092, + "mean_token_accuracy": 0.6399561762809753, + "num_tokens": 2970104649.0, + "step": 17718 + }, + { + "entropy": 1.7281967997550964, + "epoch": 1.946527148389223, + "grad_norm": 0.6497771739959717, + "learning_rate": 2.033895878506742e-06, + "loss": 1.5915, + "mean_token_accuracy": 0.6247910012801489, + "num_tokens": 2970337768.0, + "step": 17719 + }, + { + "entropy": 1.7278837660948436, + "epoch": 1.946637005300596, + "grad_norm": 0.7515074610710144, + "learning_rate": 2.033757190123679e-06, + "loss": 1.3733, + "mean_token_accuracy": 0.6671270777781805, + "num_tokens": 2970488423.0, + "step": 17720 + }, + { + "entropy": 1.755313863356908, + "epoch": 1.946746862211969, + "grad_norm": 0.7204878926277161, + "learning_rate": 2.0336187855181603e-06, + "loss": 1.4981, + "mean_token_accuracy": 0.6442046463489532, + "num_tokens": 2970643331.0, + "step": 17721 + }, + { + "entropy": 1.6647318204243977, + "epoch": 1.9468567191233417, + "grad_norm": 0.6584035158157349, + "learning_rate": 2.033480664694568e-06, + "loss": 1.4157, + "mean_token_accuracy": 0.6540108720461527, + "num_tokens": 2970795323.0, + "step": 17722 + }, + { + "entropy": 1.765694946050644, + "epoch": 1.9469665760347148, + "grad_norm": 0.8152051568031311, + "learning_rate": 2.033342827657271e-06, + "loss": 1.4476, + "mean_token_accuracy": 0.6522760838270187, + "num_tokens": 2970956293.0, + "step": 17723 + }, + { + "entropy": 1.776330480972926, + "epoch": 1.9470764329460877, + "grad_norm": 0.817134439945221, + "learning_rate": 2.033205274410634e-06, + "loss": 1.4082, + "mean_token_accuracy": 0.6540219734112421, + "num_tokens": 2971111548.0, + "step": 17724 + }, + { + "entropy": 1.7007002929846446, + "epoch": 1.9471862898574606, + "grad_norm": 0.6617542505264282, + "learning_rate": 2.0330680049590095e-06, + "loss": 1.4518, + "mean_token_accuracy": 0.661642129222552, + "num_tokens": 2971264294.0, + "step": 17725 + }, + { + "entropy": 1.6825316647688549, + "epoch": 1.9472961467688337, + "grad_norm": 0.6800007820129395, + "learning_rate": 2.032931019306741e-06, + "loss": 1.4463, + "mean_token_accuracy": 0.6541166653235754, + "num_tokens": 2971450963.0, + "step": 17726 + }, + { + "entropy": 1.7201205591360729, + "epoch": 1.9474060036802066, + "grad_norm": 0.6191043853759766, + "learning_rate": 2.0327943174581663e-06, + "loss": 1.4436, + "mean_token_accuracy": 0.6575228720903397, + "num_tokens": 2971650421.0, + "step": 17727 + }, + { + "entropy": 1.7219129304091136, + "epoch": 1.9475158605915794, + "grad_norm": 0.7525007128715515, + "learning_rate": 2.0326578994176104e-06, + "loss": 1.3096, + "mean_token_accuracy": 0.6700589607159296, + "num_tokens": 2971787284.0, + "step": 17728 + }, + { + "entropy": 1.7868116199970245, + "epoch": 1.9476257175029525, + "grad_norm": 0.6978966593742371, + "learning_rate": 2.032521765189391e-06, + "loss": 1.4937, + "mean_token_accuracy": 0.6415872623523077, + "num_tokens": 2971987111.0, + "step": 17729 + }, + { + "entropy": 1.7060894866784413, + "epoch": 1.9477355744143252, + "grad_norm": 1.1793681383132935, + "learning_rate": 2.032385914777817e-06, + "loss": 1.5346, + "mean_token_accuracy": 0.642043317357699, + "num_tokens": 2972138435.0, + "step": 17730 + }, + { + "entropy": 1.7055272956689198, + "epoch": 1.9478454313256983, + "grad_norm": 0.7151913642883301, + "learning_rate": 2.032250348187188e-06, + "loss": 1.3737, + "mean_token_accuracy": 0.6631810615460078, + "num_tokens": 2972288505.0, + "step": 17731 + }, + { + "entropy": 1.7625197768211365, + "epoch": 1.9479552882370712, + "grad_norm": 0.9355411529541016, + "learning_rate": 2.032115065421794e-06, + "loss": 1.4241, + "mean_token_accuracy": 0.6463347425063452, + "num_tokens": 2972489591.0, + "step": 17732 + }, + { + "entropy": 1.7217676838239033, + "epoch": 1.948065145148444, + "grad_norm": 0.7413977980613708, + "learning_rate": 2.0319800664859175e-06, + "loss": 1.4565, + "mean_token_accuracy": 0.6653625269730886, + "num_tokens": 2972644196.0, + "step": 17733 + }, + { + "entropy": 1.7668897410233815, + "epoch": 1.9481750020598172, + "grad_norm": 0.6924734711647034, + "learning_rate": 2.031845351383831e-06, + "loss": 1.3263, + "mean_token_accuracy": 0.6588727583487829, + "num_tokens": 2972777302.0, + "step": 17734 + }, + { + "entropy": 1.7585100928942363, + "epoch": 1.9482848589711899, + "grad_norm": 0.7000848650932312, + "learning_rate": 2.0317109201197978e-06, + "loss": 1.4577, + "mean_token_accuracy": 0.6405526846647263, + "num_tokens": 2972939201.0, + "step": 17735 + }, + { + "entropy": 1.6759247382481892, + "epoch": 1.948394715882563, + "grad_norm": 0.7132837176322937, + "learning_rate": 2.0315767726980726e-06, + "loss": 1.5039, + "mean_token_accuracy": 0.6526048630475998, + "num_tokens": 2973129903.0, + "step": 17736 + }, + { + "entropy": 1.668695737918218, + "epoch": 1.9485045727939359, + "grad_norm": 0.6133973598480225, + "learning_rate": 2.031442909122902e-06, + "loss": 1.3299, + "mean_token_accuracy": 0.669144387046496, + "num_tokens": 2973289265.0, + "step": 17737 + }, + { + "entropy": 1.6713014245033264, + "epoch": 1.9486144297053087, + "grad_norm": 0.6235535144805908, + "learning_rate": 2.031309329398521e-06, + "loss": 1.399, + "mean_token_accuracy": 0.6442220707734426, + "num_tokens": 2973478673.0, + "step": 17738 + }, + { + "entropy": 1.6983527541160583, + "epoch": 1.9487242866166818, + "grad_norm": 0.6092813014984131, + "learning_rate": 2.031176033529158e-06, + "loss": 1.4028, + "mean_token_accuracy": 0.636608416835467, + "num_tokens": 2973675888.0, + "step": 17739 + }, + { + "entropy": 1.6300262808799744, + "epoch": 1.9488341435280547, + "grad_norm": 0.7199864983558655, + "learning_rate": 2.0310430215190336e-06, + "loss": 1.3337, + "mean_token_accuracy": 0.6639452030261358, + "num_tokens": 2973853010.0, + "step": 17740 + }, + { + "entropy": 1.7593192954858143, + "epoch": 1.9489440004394276, + "grad_norm": 0.7048966884613037, + "learning_rate": 2.0309102933723555e-06, + "loss": 1.3223, + "mean_token_accuracy": 0.6694928755362829, + "num_tokens": 2973982382.0, + "step": 17741 + }, + { + "entropy": 1.6991233627001445, + "epoch": 1.9490538573508007, + "grad_norm": 0.6124697923660278, + "learning_rate": 2.0307778490933245e-06, + "loss": 1.3468, + "mean_token_accuracy": 0.6536738177140554, + "num_tokens": 2974186796.0, + "step": 17742 + }, + { + "entropy": 1.6812581419944763, + "epoch": 1.9491637142621734, + "grad_norm": 0.6754594445228577, + "learning_rate": 2.0306456886861333e-06, + "loss": 1.3569, + "mean_token_accuracy": 0.6634483486413956, + "num_tokens": 2974362784.0, + "step": 17743 + }, + { + "entropy": 1.7027353048324585, + "epoch": 1.9492735711735465, + "grad_norm": 0.7747242450714111, + "learning_rate": 2.030513812154964e-06, + "loss": 1.2265, + "mean_token_accuracy": 0.682536577184995, + "num_tokens": 2974489565.0, + "step": 17744 + }, + { + "entropy": 1.631873478492101, + "epoch": 1.9493834280849194, + "grad_norm": 0.6376305222511292, + "learning_rate": 2.030382219503991e-06, + "loss": 1.3407, + "mean_token_accuracy": 0.6745425860087076, + "num_tokens": 2974644841.0, + "step": 17745 + }, + { + "entropy": 1.6536372005939484, + "epoch": 1.9494932849962923, + "grad_norm": 0.812867283821106, + "learning_rate": 2.03025091073738e-06, + "loss": 1.4305, + "mean_token_accuracy": 0.6665123303731283, + "num_tokens": 2974819612.0, + "step": 17746 + }, + { + "entropy": 1.741923948129018, + "epoch": 1.9496031419076654, + "grad_norm": 0.6799856424331665, + "learning_rate": 2.0301198858592847e-06, + "loss": 1.4306, + "mean_token_accuracy": 0.6507015228271484, + "num_tokens": 2974975514.0, + "step": 17747 + }, + { + "entropy": 1.7347366710503895, + "epoch": 1.9497129988190383, + "grad_norm": 0.8397426605224609, + "learning_rate": 2.029989144873853e-06, + "loss": 1.2693, + "mean_token_accuracy": 0.6738801846901575, + "num_tokens": 2975106527.0, + "step": 17748 + }, + { + "entropy": 1.7347449858983357, + "epoch": 1.9498228557304111, + "grad_norm": 0.6037880182266235, + "learning_rate": 2.0298586877852233e-06, + "loss": 1.463, + "mean_token_accuracy": 0.6357159316539764, + "num_tokens": 2975283443.0, + "step": 17749 + }, + { + "entropy": 1.7268792192141216, + "epoch": 1.949932712641784, + "grad_norm": 0.698653519153595, + "learning_rate": 2.0297285145975243e-06, + "loss": 1.2884, + "mean_token_accuracy": 0.6732809692621231, + "num_tokens": 2975397640.0, + "step": 17750 + }, + { + "entropy": 1.7239871521790822, + "epoch": 1.950042569553157, + "grad_norm": 0.7377451658248901, + "learning_rate": 2.0295986253148748e-06, + "loss": 1.5272, + "mean_token_accuracy": 0.6336576888958613, + "num_tokens": 2975554478.0, + "step": 17751 + }, + { + "entropy": 1.7457629640897114, + "epoch": 1.95015242646453, + "grad_norm": 0.8000925779342651, + "learning_rate": 2.029469019941387e-06, + "loss": 1.4273, + "mean_token_accuracy": 0.6509995808204015, + "num_tokens": 2975691139.0, + "step": 17752 + }, + { + "entropy": 1.7045822242895763, + "epoch": 1.950262283375903, + "grad_norm": 0.6363489627838135, + "learning_rate": 2.029339698481164e-06, + "loss": 1.3028, + "mean_token_accuracy": 0.6681992560625076, + "num_tokens": 2975879366.0, + "step": 17753 + }, + { + "entropy": 1.723255564769109, + "epoch": 1.9503721402872758, + "grad_norm": 0.7007434368133545, + "learning_rate": 2.029210660938295e-06, + "loss": 1.5731, + "mean_token_accuracy": 0.6275994380315145, + "num_tokens": 2976062063.0, + "step": 17754 + }, + { + "entropy": 1.7033185164133708, + "epoch": 1.950481997198649, + "grad_norm": 0.6987881064414978, + "learning_rate": 2.0290819073168673e-06, + "loss": 1.2802, + "mean_token_accuracy": 0.6664392650127411, + "num_tokens": 2976182351.0, + "step": 17755 + }, + { + "entropy": 1.6668557822704315, + "epoch": 1.9505918541100216, + "grad_norm": 0.6711578965187073, + "learning_rate": 2.028953437620955e-06, + "loss": 1.2469, + "mean_token_accuracy": 0.6852605938911438, + "num_tokens": 2976305690.0, + "step": 17756 + }, + { + "entropy": 1.7156360646088917, + "epoch": 1.9507017110213947, + "grad_norm": 0.7528457045555115, + "learning_rate": 2.0288252518546247e-06, + "loss": 1.5161, + "mean_token_accuracy": 0.64786363641421, + "num_tokens": 2976493784.0, + "step": 17757 + }, + { + "entropy": 1.669191300868988, + "epoch": 1.9508115679327676, + "grad_norm": 0.6685511469841003, + "learning_rate": 2.0286973500219315e-06, + "loss": 1.4391, + "mean_token_accuracy": 0.6614227841297785, + "num_tokens": 2976692124.0, + "step": 17758 + }, + { + "entropy": 1.6809686024983723, + "epoch": 1.9509214248441404, + "grad_norm": 0.6204544901847839, + "learning_rate": 2.028569732126924e-06, + "loss": 1.5254, + "mean_token_accuracy": 0.6390020251274109, + "num_tokens": 2976904174.0, + "step": 17759 + }, + { + "entropy": 1.7139968574047089, + "epoch": 1.9510312817555135, + "grad_norm": 0.5688869953155518, + "learning_rate": 2.0284423981736432e-06, + "loss": 1.3492, + "mean_token_accuracy": 0.663325771689415, + "num_tokens": 2977049188.0, + "step": 17760 + }, + { + "entropy": 1.7387417654196422, + "epoch": 1.9511411386668864, + "grad_norm": 0.6545831561088562, + "learning_rate": 2.028315348166117e-06, + "loss": 1.387, + "mean_token_accuracy": 0.6539622743924459, + "num_tokens": 2977201898.0, + "step": 17761 + }, + { + "entropy": 1.6949077546596527, + "epoch": 1.9512509955782593, + "grad_norm": 0.7409424781799316, + "learning_rate": 2.028188582108368e-06, + "loss": 1.3519, + "mean_token_accuracy": 0.6653612554073334, + "num_tokens": 2977333297.0, + "step": 17762 + }, + { + "entropy": 1.735666275024414, + "epoch": 1.9513608524896322, + "grad_norm": 0.604129433631897, + "learning_rate": 2.0280621000044065e-06, + "loss": 1.4387, + "mean_token_accuracy": 0.652505616346995, + "num_tokens": 2977536291.0, + "step": 17763 + }, + { + "entropy": 1.6692261199156444, + "epoch": 1.951470709401005, + "grad_norm": 0.6267274022102356, + "learning_rate": 2.0279359018582377e-06, + "loss": 1.4824, + "mean_token_accuracy": 0.6410986383756002, + "num_tokens": 2977726406.0, + "step": 17764 + }, + { + "entropy": 1.7290876011053722, + "epoch": 1.9515805663123782, + "grad_norm": 0.7559896111488342, + "learning_rate": 2.0278099876738543e-06, + "loss": 1.3264, + "mean_token_accuracy": 0.6726896514495214, + "num_tokens": 2977841878.0, + "step": 17765 + }, + { + "entropy": 1.716409037510554, + "epoch": 1.951690423223751, + "grad_norm": 0.6588131785392761, + "learning_rate": 2.0276843574552425e-06, + "loss": 1.2942, + "mean_token_accuracy": 0.6628698209921519, + "num_tokens": 2977963873.0, + "step": 17766 + }, + { + "entropy": 1.717410941918691, + "epoch": 1.951800280135124, + "grad_norm": 0.7476875185966492, + "learning_rate": 2.027559011206377e-06, + "loss": 1.319, + "mean_token_accuracy": 0.6718118588129679, + "num_tokens": 2978082887.0, + "step": 17767 + }, + { + "entropy": 1.6688204904397328, + "epoch": 1.951910137046497, + "grad_norm": 0.6238333582878113, + "learning_rate": 2.0274339489312252e-06, + "loss": 1.4476, + "mean_token_accuracy": 0.6366531451543173, + "num_tokens": 2978319052.0, + "step": 17768 + }, + { + "entropy": 1.6899834473927815, + "epoch": 1.9520199939578697, + "grad_norm": 0.7009279131889343, + "learning_rate": 2.0273091706337467e-06, + "loss": 1.2801, + "mean_token_accuracy": 0.6724486152331034, + "num_tokens": 2978430156.0, + "step": 17769 + }, + { + "entropy": 1.7222119470437367, + "epoch": 1.9521298508692428, + "grad_norm": 0.6706877946853638, + "learning_rate": 2.0271846763178895e-06, + "loss": 1.3487, + "mean_token_accuracy": 0.6513159523407618, + "num_tokens": 2978603591.0, + "step": 17770 + }, + { + "entropy": 1.711435745159785, + "epoch": 1.9522397077806157, + "grad_norm": 0.6107102632522583, + "learning_rate": 2.0270604659875943e-06, + "loss": 1.374, + "mean_token_accuracy": 0.6598343700170517, + "num_tokens": 2978796689.0, + "step": 17771 + }, + { + "entropy": 1.771813799937566, + "epoch": 1.9523495646919886, + "grad_norm": 0.6782006621360779, + "learning_rate": 2.026936539646792e-06, + "loss": 1.5024, + "mean_token_accuracy": 0.6387060980002085, + "num_tokens": 2978999550.0, + "step": 17772 + }, + { + "entropy": 1.713943600654602, + "epoch": 1.9524594216033617, + "grad_norm": 0.8408421277999878, + "learning_rate": 2.0268128972994044e-06, + "loss": 1.401, + "mean_token_accuracy": 0.6681250631809235, + "num_tokens": 2979123400.0, + "step": 17773 + }, + { + "entropy": 1.7296073734760284, + "epoch": 1.9525692785147346, + "grad_norm": 0.8583366274833679, + "learning_rate": 2.0266895389493456e-06, + "loss": 1.4684, + "mean_token_accuracy": 0.6323349376519521, + "num_tokens": 2979299769.0, + "step": 17774 + }, + { + "entropy": 1.7292738854885101, + "epoch": 1.9526791354261075, + "grad_norm": 0.6199796199798584, + "learning_rate": 2.0265664646005194e-06, + "loss": 1.4296, + "mean_token_accuracy": 0.6407396892706553, + "num_tokens": 2979543584.0, + "step": 17775 + }, + { + "entropy": 1.6723608275254567, + "epoch": 1.9527889923374804, + "grad_norm": 0.6568523049354553, + "learning_rate": 2.0264436742568204e-06, + "loss": 1.4326, + "mean_token_accuracy": 0.6430952151616415, + "num_tokens": 2979713882.0, + "step": 17776 + }, + { + "entropy": 1.6405765612920125, + "epoch": 1.9528988492488533, + "grad_norm": 0.7078863382339478, + "learning_rate": 2.0263211679221358e-06, + "loss": 1.4032, + "mean_token_accuracy": 0.6723550657431284, + "num_tokens": 2979909442.0, + "step": 17777 + }, + { + "entropy": 1.6554110149542491, + "epoch": 1.9530087061602264, + "grad_norm": 0.6638414859771729, + "learning_rate": 2.0261989456003436e-06, + "loss": 1.2885, + "mean_token_accuracy": 0.6878614326318105, + "num_tokens": 2980088903.0, + "step": 17778 + }, + { + "entropy": 1.7123227616151173, + "epoch": 1.9531185630715993, + "grad_norm": 0.5971532464027405, + "learning_rate": 2.02607700729531e-06, + "loss": 1.4228, + "mean_token_accuracy": 0.6444319188594818, + "num_tokens": 2980281354.0, + "step": 17779 + }, + { + "entropy": 1.7631987730662029, + "epoch": 1.9532284199829721, + "grad_norm": 0.5625388622283936, + "learning_rate": 2.025955353010896e-06, + "loss": 1.4602, + "mean_token_accuracy": 0.6468930691480637, + "num_tokens": 2980500165.0, + "step": 17780 + }, + { + "entropy": 1.7447159190972645, + "epoch": 1.9533382768943452, + "grad_norm": 0.6851442456245422, + "learning_rate": 2.0258339827509506e-06, + "loss": 1.4913, + "mean_token_accuracy": 0.6319667845964432, + "num_tokens": 2980713289.0, + "step": 17781 + }, + { + "entropy": 1.7208144168059032, + "epoch": 1.953448133805718, + "grad_norm": 0.629928708076477, + "learning_rate": 2.0257128965193165e-06, + "loss": 1.4998, + "mean_token_accuracy": 0.6411555955807368, + "num_tokens": 2980902416.0, + "step": 17782 + }, + { + "entropy": 1.7709160546461742, + "epoch": 1.953557990717091, + "grad_norm": 0.7798748016357422, + "learning_rate": 2.0255920943198244e-06, + "loss": 1.3797, + "mean_token_accuracy": 0.6637091686328253, + "num_tokens": 2981033075.0, + "step": 17783 + }, + { + "entropy": 1.6994303961594899, + "epoch": 1.953667847628464, + "grad_norm": 0.7806686758995056, + "learning_rate": 2.0254715761562998e-06, + "loss": 1.5518, + "mean_token_accuracy": 0.6446986546119055, + "num_tokens": 2981192101.0, + "step": 17784 + }, + { + "entropy": 1.7072394092877705, + "epoch": 1.9537777045398368, + "grad_norm": 0.6744371056556702, + "learning_rate": 2.0253513420325545e-06, + "loss": 1.4875, + "mean_token_accuracy": 0.6371971815824509, + "num_tokens": 2981426393.0, + "step": 17785 + }, + { + "entropy": 1.6127947370211284, + "epoch": 1.95388756145121, + "grad_norm": 0.7404419779777527, + "learning_rate": 2.025231391952396e-06, + "loss": 1.346, + "mean_token_accuracy": 0.6637389014164606, + "num_tokens": 2981579481.0, + "step": 17786 + }, + { + "entropy": 1.7959074278672535, + "epoch": 1.9539974183625828, + "grad_norm": 0.7481608986854553, + "learning_rate": 2.0251117259196202e-06, + "loss": 1.3442, + "mean_token_accuracy": 0.6640120794375738, + "num_tokens": 2981726089.0, + "step": 17787 + }, + { + "entropy": 1.713577965895335, + "epoch": 1.9541072752739557, + "grad_norm": 0.835660994052887, + "learning_rate": 2.0249923439380127e-06, + "loss": 1.3407, + "mean_token_accuracy": 0.6656025052070618, + "num_tokens": 2981849837.0, + "step": 17788 + }, + { + "entropy": 1.696219692627589, + "epoch": 1.9542171321853288, + "grad_norm": 0.7188278436660767, + "learning_rate": 2.024873246011354e-06, + "loss": 1.2623, + "mean_token_accuracy": 0.6720642745494843, + "num_tokens": 2982006826.0, + "step": 17789 + }, + { + "entropy": 1.7328607738018036, + "epoch": 1.9543269890967014, + "grad_norm": 0.7431653738021851, + "learning_rate": 2.0247544321434136e-06, + "loss": 1.4974, + "mean_token_accuracy": 0.6494582444429398, + "num_tokens": 2982194388.0, + "step": 17790 + }, + { + "entropy": 1.713299572467804, + "epoch": 1.9544368460080745, + "grad_norm": 0.6786489486694336, + "learning_rate": 2.02463590233795e-06, + "loss": 1.3613, + "mean_token_accuracy": 0.6588152448336283, + "num_tokens": 2982336529.0, + "step": 17791 + }, + { + "entropy": 1.721155156691869, + "epoch": 1.9545467029194474, + "grad_norm": 0.6759476661682129, + "learning_rate": 2.024517656598716e-06, + "loss": 1.5028, + "mean_token_accuracy": 0.6534711370865504, + "num_tokens": 2982487665.0, + "step": 17792 + }, + { + "entropy": 1.7625728845596313, + "epoch": 1.9546565598308203, + "grad_norm": 0.683269739151001, + "learning_rate": 2.0243996949294543e-06, + "loss": 1.4255, + "mean_token_accuracy": 0.6441936790943146, + "num_tokens": 2982725011.0, + "step": 17793 + }, + { + "entropy": 1.7418459355831146, + "epoch": 1.9547664167421934, + "grad_norm": 0.7607301473617554, + "learning_rate": 2.0242820173338963e-06, + "loss": 1.3563, + "mean_token_accuracy": 0.6651049753030142, + "num_tokens": 2982847742.0, + "step": 17794 + }, + { + "entropy": 1.7185759842395782, + "epoch": 1.954876273653566, + "grad_norm": 0.6495237350463867, + "learning_rate": 2.024164623815769e-06, + "loss": 1.3563, + "mean_token_accuracy": 0.6601082533597946, + "num_tokens": 2982984398.0, + "step": 17795 + }, + { + "entropy": 1.6946265896161397, + "epoch": 1.9549861305649392, + "grad_norm": 0.8139024972915649, + "learning_rate": 2.024047514378787e-06, + "loss": 1.5517, + "mean_token_accuracy": 0.6357475270827612, + "num_tokens": 2983145945.0, + "step": 17796 + }, + { + "entropy": 1.6650099456310272, + "epoch": 1.955095987476312, + "grad_norm": 0.5920116305351257, + "learning_rate": 2.0239306890266558e-06, + "loss": 1.3403, + "mean_token_accuracy": 0.6662998845179876, + "num_tokens": 2983303994.0, + "step": 17797 + }, + { + "entropy": 1.6227064232031505, + "epoch": 1.955205844387685, + "grad_norm": 0.6600332260131836, + "learning_rate": 2.0238141477630744e-06, + "loss": 1.4115, + "mean_token_accuracy": 0.6637008314331373, + "num_tokens": 2983460489.0, + "step": 17798 + }, + { + "entropy": 1.6318263411521912, + "epoch": 1.955315701299058, + "grad_norm": 0.7498336434364319, + "learning_rate": 2.0236978905917296e-06, + "loss": 1.3756, + "mean_token_accuracy": 0.6501528173685074, + "num_tokens": 2983644135.0, + "step": 17799 + }, + { + "entropy": 1.7046960294246674, + "epoch": 1.955425558210431, + "grad_norm": 0.7306966781616211, + "learning_rate": 2.0235819175163017e-06, + "loss": 1.3088, + "mean_token_accuracy": 0.6869508425394694, + "num_tokens": 2983803611.0, + "step": 17800 + }, + { + "entropy": 1.7258172035217285, + "epoch": 1.9555354151218038, + "grad_norm": 0.7059761881828308, + "learning_rate": 2.0234662285404617e-06, + "loss": 1.3583, + "mean_token_accuracy": 0.6637235432863235, + "num_tokens": 2983979414.0, + "step": 17801 + }, + { + "entropy": 1.769530753294627, + "epoch": 1.955645272033177, + "grad_norm": 0.6218457818031311, + "learning_rate": 2.0233508236678702e-06, + "loss": 1.4673, + "mean_token_accuracy": 0.6336728036403656, + "num_tokens": 2984189537.0, + "step": 17802 + }, + { + "entropy": 1.6664655307928722, + "epoch": 1.9557551289445496, + "grad_norm": 0.7295881509780884, + "learning_rate": 2.023235702902181e-06, + "loss": 1.5348, + "mean_token_accuracy": 0.6525272379318873, + "num_tokens": 2984350464.0, + "step": 17803 + }, + { + "entropy": 1.7370224396387737, + "epoch": 1.9558649858559227, + "grad_norm": 0.5997971892356873, + "learning_rate": 2.0231208662470357e-06, + "loss": 1.4465, + "mean_token_accuracy": 0.6436833242575327, + "num_tokens": 2984529796.0, + "step": 17804 + }, + { + "entropy": 1.6230102678140004, + "epoch": 1.9559748427672956, + "grad_norm": 0.6606541275978088, + "learning_rate": 2.023006313706071e-06, + "loss": 1.315, + "mean_token_accuracy": 0.6677302569150925, + "num_tokens": 2984680008.0, + "step": 17805 + }, + { + "entropy": 1.7209392488002777, + "epoch": 1.9560846996786685, + "grad_norm": 0.6199936866760254, + "learning_rate": 2.0228920452829103e-06, + "loss": 1.352, + "mean_token_accuracy": 0.6574979374806086, + "num_tokens": 2984848335.0, + "step": 17806 + }, + { + "entropy": 1.692920833826065, + "epoch": 1.9561945565900416, + "grad_norm": 0.653578519821167, + "learning_rate": 2.022778060981172e-06, + "loss": 1.424, + "mean_token_accuracy": 0.6595415671666464, + "num_tokens": 2985045996.0, + "step": 17807 + }, + { + "entropy": 1.6915589074293773, + "epoch": 1.9563044135014143, + "grad_norm": 0.6038907766342163, + "learning_rate": 2.0226643608044624e-06, + "loss": 1.3544, + "mean_token_accuracy": 0.6625373015801111, + "num_tokens": 2985204883.0, + "step": 17808 + }, + { + "entropy": 1.691193660100301, + "epoch": 1.9564142704127874, + "grad_norm": 0.6831693053245544, + "learning_rate": 2.022550944756381e-06, + "loss": 1.5571, + "mean_token_accuracy": 0.6408517857392629, + "num_tokens": 2985381113.0, + "step": 17809 + }, + { + "entropy": 1.7213842968146007, + "epoch": 1.9565241273241603, + "grad_norm": 0.7153857946395874, + "learning_rate": 2.0224378128405157e-06, + "loss": 1.4151, + "mean_token_accuracy": 0.6612313389778137, + "num_tokens": 2985523825.0, + "step": 17810 + }, + { + "entropy": 1.7010469138622284, + "epoch": 1.9566339842355331, + "grad_norm": 0.6271878480911255, + "learning_rate": 2.0223249650604493e-06, + "loss": 1.4303, + "mean_token_accuracy": 0.645938828587532, + "num_tokens": 2985672895.0, + "step": 17811 + }, + { + "entropy": 1.6880601942539215, + "epoch": 1.9567438411469062, + "grad_norm": 0.8414680361747742, + "learning_rate": 2.022212401419752e-06, + "loss": 1.5198, + "mean_token_accuracy": 0.6524382879336675, + "num_tokens": 2985824112.0, + "step": 17812 + }, + { + "entropy": 1.697400947411855, + "epoch": 1.9568536980582791, + "grad_norm": 0.7417037487030029, + "learning_rate": 2.0221001219219877e-06, + "loss": 1.427, + "mean_token_accuracy": 0.6512279262145361, + "num_tokens": 2985971235.0, + "step": 17813 + }, + { + "entropy": 1.6493543187777202, + "epoch": 1.956963554969652, + "grad_norm": 0.6581324934959412, + "learning_rate": 2.0219881265707077e-06, + "loss": 1.3354, + "mean_token_accuracy": 0.6625111848115921, + "num_tokens": 2986149318.0, + "step": 17814 + }, + { + "entropy": 1.7380223472913106, + "epoch": 1.9570734118810251, + "grad_norm": 0.60667884349823, + "learning_rate": 2.0218764153694586e-06, + "loss": 1.4921, + "mean_token_accuracy": 0.6427192886670431, + "num_tokens": 2986347470.0, + "step": 17815 + }, + { + "entropy": 1.7578493853410084, + "epoch": 1.9571832687923978, + "grad_norm": 0.6818580031394958, + "learning_rate": 2.0217649883217746e-06, + "loss": 1.4195, + "mean_token_accuracy": 0.64846271276474, + "num_tokens": 2986555340.0, + "step": 17816 + }, + { + "entropy": 1.6980695327123005, + "epoch": 1.957293125703771, + "grad_norm": 0.6845924258232117, + "learning_rate": 2.0216538454311836e-06, + "loss": 1.3064, + "mean_token_accuracy": 0.6753988613684972, + "num_tokens": 2986682716.0, + "step": 17817 + }, + { + "entropy": 1.711136023203532, + "epoch": 1.9574029826151438, + "grad_norm": 0.6631032824516296, + "learning_rate": 2.0215429867012017e-06, + "loss": 1.3439, + "mean_token_accuracy": 0.6630576600631078, + "num_tokens": 2986821237.0, + "step": 17818 + }, + { + "entropy": 1.7483221391836803, + "epoch": 1.9575128395265167, + "grad_norm": 0.6929804086685181, + "learning_rate": 2.0214324121353403e-06, + "loss": 1.5989, + "mean_token_accuracy": 0.6324756195147833, + "num_tokens": 2986991133.0, + "step": 17819 + }, + { + "entropy": 1.7035737733046215, + "epoch": 1.9576226964378898, + "grad_norm": 0.6949917078018188, + "learning_rate": 2.021322121737095e-06, + "loss": 1.3302, + "mean_token_accuracy": 0.6612950712442398, + "num_tokens": 2987114817.0, + "step": 17820 + }, + { + "entropy": 1.7653352518876393, + "epoch": 1.9577325533492624, + "grad_norm": 0.7462692260742188, + "learning_rate": 2.0212121155099607e-06, + "loss": 1.337, + "mean_token_accuracy": 0.6608709494272867, + "num_tokens": 2987231442.0, + "step": 17821 + }, + { + "entropy": 1.7688710192839305, + "epoch": 1.9578424102606355, + "grad_norm": 0.651204526424408, + "learning_rate": 2.0211023934574157e-06, + "loss": 1.5705, + "mean_token_accuracy": 0.6285947610934576, + "num_tokens": 2987437378.0, + "step": 17822 + }, + { + "entropy": 1.7017524043718975, + "epoch": 1.9579522671720084, + "grad_norm": 0.9739505052566528, + "learning_rate": 2.0209929555829346e-06, + "loss": 1.3126, + "mean_token_accuracy": 0.6632164816061655, + "num_tokens": 2987588804.0, + "step": 17823 + }, + { + "entropy": 1.6945142149925232, + "epoch": 1.9580621240833813, + "grad_norm": 0.6685092449188232, + "learning_rate": 2.02088380188998e-06, + "loss": 1.259, + "mean_token_accuracy": 0.6727628062168757, + "num_tokens": 2987716913.0, + "step": 17824 + }, + { + "entropy": 1.674852301677068, + "epoch": 1.9581719809947544, + "grad_norm": 0.6597932577133179, + "learning_rate": 2.020774932382007e-06, + "loss": 1.4541, + "mean_token_accuracy": 0.6472754130760828, + "num_tokens": 2987941322.0, + "step": 17825 + }, + { + "entropy": 1.6793397963047028, + "epoch": 1.9582818379061273, + "grad_norm": 0.5924980640411377, + "learning_rate": 2.0206663470624615e-06, + "loss": 1.2984, + "mean_token_accuracy": 0.678433025876681, + "num_tokens": 2988134589.0, + "step": 17826 + }, + { + "entropy": 1.7250191171964009, + "epoch": 1.9583916948175002, + "grad_norm": 0.723518967628479, + "learning_rate": 2.0205580459347796e-06, + "loss": 1.4791, + "mean_token_accuracy": 0.6549117714166641, + "num_tokens": 2988285766.0, + "step": 17827 + }, + { + "entropy": 1.6623546183109283, + "epoch": 1.9585015517288733, + "grad_norm": 1.5038546323776245, + "learning_rate": 2.0204500290023898e-06, + "loss": 1.2749, + "mean_token_accuracy": 0.6662939141194025, + "num_tokens": 2988537104.0, + "step": 17828 + }, + { + "entropy": 1.6771051188309987, + "epoch": 1.958611408640246, + "grad_norm": 0.6059991717338562, + "learning_rate": 2.0203422962687107e-06, + "loss": 1.474, + "mean_token_accuracy": 0.650149792432785, + "num_tokens": 2988725414.0, + "step": 17829 + }, + { + "entropy": 1.757999986410141, + "epoch": 1.958721265551619, + "grad_norm": 0.7955412268638611, + "learning_rate": 2.0202348477371504e-06, + "loss": 1.4515, + "mean_token_accuracy": 0.659949521223704, + "num_tokens": 2988875269.0, + "step": 17830 + }, + { + "entropy": 1.7157710095246632, + "epoch": 1.958831122462992, + "grad_norm": 0.6813088059425354, + "learning_rate": 2.0201276834111118e-06, + "loss": 1.3589, + "mean_token_accuracy": 0.6719192713499069, + "num_tokens": 2989020852.0, + "step": 17831 + }, + { + "entropy": 1.7082193493843079, + "epoch": 1.9589409793743648, + "grad_norm": 0.7664187550544739, + "learning_rate": 2.020020803293985e-06, + "loss": 1.2593, + "mean_token_accuracy": 0.6711249748865763, + "num_tokens": 2989151390.0, + "step": 17832 + }, + { + "entropy": 1.770639955997467, + "epoch": 1.959050836285738, + "grad_norm": 0.6521178483963013, + "learning_rate": 2.0199142073891527e-06, + "loss": 1.3776, + "mean_token_accuracy": 0.6535575886567434, + "num_tokens": 2989329801.0, + "step": 17833 + }, + { + "entropy": 1.6498074233531952, + "epoch": 1.9591606931971106, + "grad_norm": 0.7866834998130798, + "learning_rate": 2.019807895699991e-06, + "loss": 1.4535, + "mean_token_accuracy": 0.6591331660747528, + "num_tokens": 2989475469.0, + "step": 17834 + }, + { + "entropy": 1.6990808149178822, + "epoch": 1.9592705501084837, + "grad_norm": 0.656196653842926, + "learning_rate": 2.0197018682298614e-06, + "loss": 1.551, + "mean_token_accuracy": 0.6299006740252177, + "num_tokens": 2989681045.0, + "step": 17835 + }, + { + "entropy": 1.6446273624897003, + "epoch": 1.9593804070198566, + "grad_norm": 0.6230295300483704, + "learning_rate": 2.019596124982121e-06, + "loss": 1.4115, + "mean_token_accuracy": 0.6549782206614813, + "num_tokens": 2989876665.0, + "step": 17836 + }, + { + "entropy": 1.6909184356530507, + "epoch": 1.9594902639312295, + "grad_norm": 0.674351692199707, + "learning_rate": 2.0194906659601184e-06, + "loss": 1.4357, + "mean_token_accuracy": 0.6496634483337402, + "num_tokens": 2990066264.0, + "step": 17837 + }, + { + "entropy": 1.762929618358612, + "epoch": 1.9596001208426026, + "grad_norm": 0.8096222281455994, + "learning_rate": 2.0193854911671875e-06, + "loss": 1.432, + "mean_token_accuracy": 0.64181949198246, + "num_tokens": 2990181928.0, + "step": 17838 + }, + { + "entropy": 1.728385289510091, + "epoch": 1.9597099777539755, + "grad_norm": 0.81138676404953, + "learning_rate": 2.0192806006066588e-06, + "loss": 1.3549, + "mean_token_accuracy": 0.6485754102468491, + "num_tokens": 2990336143.0, + "step": 17839 + }, + { + "entropy": 1.7208243906497955, + "epoch": 1.9598198346653484, + "grad_norm": 0.6620015501976013, + "learning_rate": 2.019175994281854e-06, + "loss": 1.4155, + "mean_token_accuracy": 0.662138968706131, + "num_tokens": 2990511656.0, + "step": 17840 + }, + { + "entropy": 1.6964669227600098, + "epoch": 1.9599296915767215, + "grad_norm": 0.614718496799469, + "learning_rate": 2.019071672196081e-06, + "loss": 1.4276, + "mean_token_accuracy": 0.6447423497835795, + "num_tokens": 2990694853.0, + "step": 17841 + }, + { + "entropy": 1.708295355240504, + "epoch": 1.9600395484880941, + "grad_norm": 0.6118777990341187, + "learning_rate": 2.0189676343526424e-06, + "loss": 1.4314, + "mean_token_accuracy": 0.6555359264214834, + "num_tokens": 2990875634.0, + "step": 17842 + }, + { + "entropy": 1.7391878565152485, + "epoch": 1.9601494053994672, + "grad_norm": 0.6601645946502686, + "learning_rate": 2.0188638807548327e-06, + "loss": 1.4738, + "mean_token_accuracy": 0.6468467364708582, + "num_tokens": 2991041250.0, + "step": 17843 + }, + { + "entropy": 1.734427313009898, + "epoch": 1.9602592623108401, + "grad_norm": 0.7099672555923462, + "learning_rate": 2.0187604114059326e-06, + "loss": 1.4544, + "mean_token_accuracy": 0.648030087351799, + "num_tokens": 2991231154.0, + "step": 17844 + }, + { + "entropy": 1.6761779586474101, + "epoch": 1.960369119222213, + "grad_norm": 0.8135700225830078, + "learning_rate": 2.01865722630922e-06, + "loss": 1.493, + "mean_token_accuracy": 0.6484993646542231, + "num_tokens": 2991397922.0, + "step": 17845 + }, + { + "entropy": 1.6813652515411377, + "epoch": 1.9604789761335861, + "grad_norm": 0.616743266582489, + "learning_rate": 2.0185543254679576e-06, + "loss": 1.4241, + "mean_token_accuracy": 0.649142454067866, + "num_tokens": 2991592641.0, + "step": 17846 + }, + { + "entropy": 1.7229444285233815, + "epoch": 1.9605888330449588, + "grad_norm": 0.7352148294448853, + "learning_rate": 2.0184517088854044e-06, + "loss": 1.4267, + "mean_token_accuracy": 0.6816410024960836, + "num_tokens": 2991728524.0, + "step": 17847 + }, + { + "entropy": 1.737259527047475, + "epoch": 1.960698689956332, + "grad_norm": 0.7628870606422424, + "learning_rate": 2.0183493765648073e-06, + "loss": 1.3125, + "mean_token_accuracy": 0.6667766869068146, + "num_tokens": 2991859087.0, + "step": 17848 + }, + { + "entropy": 1.7120140492916107, + "epoch": 1.9608085468677048, + "grad_norm": 0.8102467060089111, + "learning_rate": 2.018247328509405e-06, + "loss": 1.4574, + "mean_token_accuracy": 0.6617083897193273, + "num_tokens": 2992026244.0, + "step": 17849 + }, + { + "entropy": 1.7056652307510376, + "epoch": 1.9609184037790777, + "grad_norm": 0.6154736280441284, + "learning_rate": 2.018145564722428e-06, + "loss": 1.3339, + "mean_token_accuracy": 0.6581693887710571, + "num_tokens": 2992195285.0, + "step": 17850 + }, + { + "entropy": 1.7515573799610138, + "epoch": 1.9610282606904508, + "grad_norm": 0.662531852722168, + "learning_rate": 2.0180440852070963e-06, + "loss": 1.4603, + "mean_token_accuracy": 0.6410819639762243, + "num_tokens": 2992437084.0, + "step": 17851 + }, + { + "entropy": 1.7056761781374614, + "epoch": 1.9611381176018237, + "grad_norm": 0.6547462344169617, + "learning_rate": 2.017942889966621e-06, + "loss": 1.2562, + "mean_token_accuracy": 0.676432599623998, + "num_tokens": 2992561421.0, + "step": 17852 + }, + { + "entropy": 1.7316773037115734, + "epoch": 1.9612479745131965, + "grad_norm": 0.6276424527168274, + "learning_rate": 2.0178419790042067e-06, + "loss": 1.3315, + "mean_token_accuracy": 0.6649932016928991, + "num_tokens": 2992734585.0, + "step": 17853 + }, + { + "entropy": 1.6133818924427032, + "epoch": 1.9613578314245697, + "grad_norm": 0.6731429100036621, + "learning_rate": 2.017741352323046e-06, + "loss": 1.1471, + "mean_token_accuracy": 0.7009394268194834, + "num_tokens": 2992844477.0, + "step": 17854 + }, + { + "entropy": 1.6940800249576569, + "epoch": 1.9614676883359423, + "grad_norm": 0.7534624934196472, + "learning_rate": 2.0176410099263245e-06, + "loss": 1.1898, + "mean_token_accuracy": 0.6858446151018143, + "num_tokens": 2992951432.0, + "step": 17855 + }, + { + "entropy": 1.7091482083002727, + "epoch": 1.9615775452473154, + "grad_norm": 0.6614341735839844, + "learning_rate": 2.017540951817217e-06, + "loss": 1.4209, + "mean_token_accuracy": 0.6604591459035873, + "num_tokens": 2993157963.0, + "step": 17856 + }, + { + "entropy": 1.743918001651764, + "epoch": 1.9616874021586883, + "grad_norm": 0.8729313015937805, + "learning_rate": 2.017441177998892e-06, + "loss": 1.5282, + "mean_token_accuracy": 0.6517240107059479, + "num_tokens": 2993348516.0, + "step": 17857 + }, + { + "entropy": 1.6870131293932598, + "epoch": 1.9617972590700612, + "grad_norm": 1.8149455785751343, + "learning_rate": 2.017341688474505e-06, + "loss": 1.0478, + "mean_token_accuracy": 0.6817310601472855, + "num_tokens": 2993514126.0, + "step": 17858 + }, + { + "entropy": 1.7509052058060963, + "epoch": 1.9619071159814343, + "grad_norm": 0.7601586580276489, + "learning_rate": 2.017242483247206e-06, + "loss": 1.4875, + "mean_token_accuracy": 0.6382629126310349, + "num_tokens": 2993744695.0, + "step": 17859 + }, + { + "entropy": 1.7065601646900177, + "epoch": 1.962016972892807, + "grad_norm": 0.5603963732719421, + "learning_rate": 2.017143562320135e-06, + "loss": 1.4044, + "mean_token_accuracy": 0.6478400429089864, + "num_tokens": 2993936539.0, + "step": 17860 + }, + { + "entropy": 1.6877718269824982, + "epoch": 1.96212682980418, + "grad_norm": 0.7344236969947815, + "learning_rate": 2.0170449256964217e-06, + "loss": 1.3698, + "mean_token_accuracy": 0.6719866941372553, + "num_tokens": 2994087478.0, + "step": 17861 + }, + { + "entropy": 1.6488263805707295, + "epoch": 1.962236686715553, + "grad_norm": 0.6478110551834106, + "learning_rate": 2.0169465733791895e-06, + "loss": 1.403, + "mean_token_accuracy": 0.6502639551957449, + "num_tokens": 2994304002.0, + "step": 17862 + }, + { + "entropy": 1.7499266862869263, + "epoch": 1.9623465436269258, + "grad_norm": 0.8414444327354431, + "learning_rate": 2.0168485053715497e-06, + "loss": 1.4985, + "mean_token_accuracy": 0.6633632381757101, + "num_tokens": 2994442860.0, + "step": 17863 + }, + { + "entropy": 1.6683284640312195, + "epoch": 1.962456400538299, + "grad_norm": 0.8371381759643555, + "learning_rate": 2.0167507216766076e-06, + "loss": 1.2807, + "mean_token_accuracy": 0.6784952729940414, + "num_tokens": 2994570920.0, + "step": 17864 + }, + { + "entropy": 1.7751424014568329, + "epoch": 1.9625662574496718, + "grad_norm": 0.6342650651931763, + "learning_rate": 2.0166532222974564e-06, + "loss": 1.464, + "mean_token_accuracy": 0.6453549315532049, + "num_tokens": 2994786840.0, + "step": 17865 + }, + { + "entropy": 1.7355596522490184, + "epoch": 1.9626761143610447, + "grad_norm": 0.776604175567627, + "learning_rate": 2.0165560072371824e-06, + "loss": 1.4411, + "mean_token_accuracy": 0.6572767297426859, + "num_tokens": 2994965451.0, + "step": 17866 + }, + { + "entropy": 1.6895995835463207, + "epoch": 1.9627859712724178, + "grad_norm": 0.7442336678504944, + "learning_rate": 2.0164590764988637e-06, + "loss": 1.4932, + "mean_token_accuracy": 0.6494418730338415, + "num_tokens": 2995177256.0, + "step": 17867 + }, + { + "entropy": 1.7676254113515217, + "epoch": 1.9628958281837905, + "grad_norm": 0.6468122005462646, + "learning_rate": 2.016362430085567e-06, + "loss": 1.5374, + "mean_token_accuracy": 0.6308100124200186, + "num_tokens": 2995357688.0, + "step": 17868 + }, + { + "entropy": 1.7291751305262248, + "epoch": 1.9630056850951636, + "grad_norm": 0.7243047952651978, + "learning_rate": 2.016266068000351e-06, + "loss": 1.4871, + "mean_token_accuracy": 0.6339818388223648, + "num_tokens": 2995565257.0, + "step": 17869 + }, + { + "entropy": 1.7408881783485413, + "epoch": 1.9631155420065365, + "grad_norm": 0.7614642977714539, + "learning_rate": 2.0161699902462664e-06, + "loss": 1.5102, + "mean_token_accuracy": 0.6478269298871359, + "num_tokens": 2995720018.0, + "step": 17870 + }, + { + "entropy": 1.6972165405750275, + "epoch": 1.9632253989179094, + "grad_norm": 0.7591689229011536, + "learning_rate": 2.016074196826353e-06, + "loss": 1.3704, + "mean_token_accuracy": 0.6604073345661163, + "num_tokens": 2995871780.0, + "step": 17871 + }, + { + "entropy": 1.6704501807689667, + "epoch": 1.9633352558292825, + "grad_norm": 0.665471076965332, + "learning_rate": 2.0159786877436425e-06, + "loss": 1.3698, + "mean_token_accuracy": 0.6625976065794627, + "num_tokens": 2996074778.0, + "step": 17872 + }, + { + "entropy": 1.6610515713691711, + "epoch": 1.9634451127406551, + "grad_norm": 0.6701338887214661, + "learning_rate": 2.015883463001159e-06, + "loss": 1.3937, + "mean_token_accuracy": 0.6479357580343882, + "num_tokens": 2996245890.0, + "step": 17873 + }, + { + "entropy": 1.6841512620449066, + "epoch": 1.9635549696520282, + "grad_norm": 0.7010718584060669, + "learning_rate": 2.015788522601915e-06, + "loss": 1.368, + "mean_token_accuracy": 0.6470450113217036, + "num_tokens": 2996414751.0, + "step": 17874 + }, + { + "entropy": 1.678158462047577, + "epoch": 1.9636648265634011, + "grad_norm": 0.8353567123413086, + "learning_rate": 2.0156938665489163e-06, + "loss": 1.3558, + "mean_token_accuracy": 0.6673894474903742, + "num_tokens": 2996608436.0, + "step": 17875 + }, + { + "entropy": 1.6845565140247345, + "epoch": 1.963774683474774, + "grad_norm": 0.5820711851119995, + "learning_rate": 2.0155994948451575e-06, + "loss": 1.3492, + "mean_token_accuracy": 0.6662445664405823, + "num_tokens": 2996777414.0, + "step": 17876 + }, + { + "entropy": 1.692328284184138, + "epoch": 1.9638845403861471, + "grad_norm": 0.7429178357124329, + "learning_rate": 2.015505407493627e-06, + "loss": 1.3524, + "mean_token_accuracy": 0.6588386446237564, + "num_tokens": 2996917973.0, + "step": 17877 + }, + { + "entropy": 1.7147633930047352, + "epoch": 1.96399439729752, + "grad_norm": 0.6130802035331726, + "learning_rate": 2.0154116044973023e-06, + "loss": 1.3864, + "mean_token_accuracy": 0.6510560760895411, + "num_tokens": 2997075428.0, + "step": 17878 + }, + { + "entropy": 1.674227237701416, + "epoch": 1.964104254208893, + "grad_norm": 0.6673222184181213, + "learning_rate": 2.015318085859151e-06, + "loss": 1.2579, + "mean_token_accuracy": 0.6707002917925516, + "num_tokens": 2997213096.0, + "step": 17879 + }, + { + "entropy": 1.669933428366979, + "epoch": 1.964214111120266, + "grad_norm": 0.781856894493103, + "learning_rate": 2.0152248515821334e-06, + "loss": 1.3723, + "mean_token_accuracy": 0.6608141760031382, + "num_tokens": 2997395284.0, + "step": 17880 + }, + { + "entropy": 1.7202060023943584, + "epoch": 1.9643239680316387, + "grad_norm": 1.0455927848815918, + "learning_rate": 2.0151319016692005e-06, + "loss": 1.5199, + "mean_token_accuracy": 0.6652625252803167, + "num_tokens": 2997573918.0, + "step": 17881 + }, + { + "entropy": 1.700150231520335, + "epoch": 1.9644338249430118, + "grad_norm": 0.718952476978302, + "learning_rate": 2.015039236123295e-06, + "loss": 1.3763, + "mean_token_accuracy": 0.649369607369105, + "num_tokens": 2997731551.0, + "step": 17882 + }, + { + "entropy": 1.7306404809157054, + "epoch": 1.9645436818543847, + "grad_norm": 0.6521731615066528, + "learning_rate": 2.014946854947349e-06, + "loss": 1.3775, + "mean_token_accuracy": 0.6443797498941422, + "num_tokens": 2997883899.0, + "step": 17883 + }, + { + "entropy": 1.767806778351466, + "epoch": 1.9646535387657575, + "grad_norm": 0.9187655448913574, + "learning_rate": 2.014854758144286e-06, + "loss": 1.6006, + "mean_token_accuracy": 0.6336929003397623, + "num_tokens": 2998040914.0, + "step": 17884 + }, + { + "entropy": 1.73748313387235, + "epoch": 1.9647633956771307, + "grad_norm": 0.7256967425346375, + "learning_rate": 2.0147629457170213e-06, + "loss": 1.2618, + "mean_token_accuracy": 0.6701177606980006, + "num_tokens": 2998217430.0, + "step": 17885 + }, + { + "entropy": 1.6992026666800182, + "epoch": 1.9648732525885033, + "grad_norm": 0.6321195363998413, + "learning_rate": 2.01467141766846e-06, + "loss": 1.3484, + "mean_token_accuracy": 0.6581118901570638, + "num_tokens": 2998385433.0, + "step": 17886 + }, + { + "entropy": 1.7310790121555328, + "epoch": 1.9649831094998764, + "grad_norm": 0.7075302600860596, + "learning_rate": 2.0145801740015e-06, + "loss": 1.5002, + "mean_token_accuracy": 0.6584192862113317, + "num_tokens": 2998560209.0, + "step": 17887 + }, + { + "entropy": 1.6907731493314107, + "epoch": 1.9650929664112493, + "grad_norm": 0.7162296772003174, + "learning_rate": 2.014489214719028e-06, + "loss": 1.2948, + "mean_token_accuracy": 0.6692901601394018, + "num_tokens": 2998710324.0, + "step": 17888 + }, + { + "entropy": 1.7298618853092194, + "epoch": 1.9652028233226222, + "grad_norm": 0.6216132044792175, + "learning_rate": 2.0143985398239234e-06, + "loss": 1.4547, + "mean_token_accuracy": 0.6533665706713995, + "num_tokens": 2998882685.0, + "step": 17889 + }, + { + "entropy": 1.6897225081920624, + "epoch": 1.9653126802339953, + "grad_norm": 0.6112697720527649, + "learning_rate": 2.0143081493190567e-06, + "loss": 1.4371, + "mean_token_accuracy": 0.6600435972213745, + "num_tokens": 2999088643.0, + "step": 17890 + }, + { + "entropy": 1.688838044802348, + "epoch": 1.9654225371453682, + "grad_norm": 0.7275003790855408, + "learning_rate": 2.0142180432072876e-06, + "loss": 1.4302, + "mean_token_accuracy": 0.6603851070006689, + "num_tokens": 2999265814.0, + "step": 17891 + }, + { + "entropy": 1.6855897307395935, + "epoch": 1.965532394056741, + "grad_norm": 0.6274251937866211, + "learning_rate": 2.0141282214914685e-06, + "loss": 1.2999, + "mean_token_accuracy": 0.6625367701053619, + "num_tokens": 2999421528.0, + "step": 17892 + }, + { + "entropy": 1.6572815577189128, + "epoch": 1.9656422509681142, + "grad_norm": 0.766179084777832, + "learning_rate": 2.014038684174442e-06, + "loss": 1.3068, + "mean_token_accuracy": 0.6744897613922755, + "num_tokens": 2999571936.0, + "step": 17893 + }, + { + "entropy": 1.7280305624008179, + "epoch": 1.9657521078794868, + "grad_norm": 0.7325267195701599, + "learning_rate": 2.0139494312590415e-06, + "loss": 1.4753, + "mean_token_accuracy": 0.6507488141457239, + "num_tokens": 2999749565.0, + "step": 17894 + }, + { + "entropy": 1.6773878633975983, + "epoch": 1.96586196479086, + "grad_norm": 0.6621171236038208, + "learning_rate": 2.013860462748093e-06, + "loss": 1.4302, + "mean_token_accuracy": 0.6595263083775839, + "num_tokens": 2999909764.0, + "step": 17895 + }, + { + "entropy": 1.6768188774585724, + "epoch": 1.9659718217022328, + "grad_norm": 0.6506795883178711, + "learning_rate": 2.0137717786444112e-06, + "loss": 1.4475, + "mean_token_accuracy": 0.6549390902121862, + "num_tokens": 3000064739.0, + "step": 17896 + }, + { + "entropy": 1.7129511932531993, + "epoch": 1.9660816786136057, + "grad_norm": 0.643744707107544, + "learning_rate": 2.0136833789508033e-06, + "loss": 1.48, + "mean_token_accuracy": 0.6619319965442022, + "num_tokens": 3000241947.0, + "step": 17897 + }, + { + "entropy": 1.700930525859197, + "epoch": 1.9661915355249788, + "grad_norm": 0.625928521156311, + "learning_rate": 2.0135952636700674e-06, + "loss": 1.2884, + "mean_token_accuracy": 0.6703586975733439, + "num_tokens": 3000384757.0, + "step": 17898 + }, + { + "entropy": 1.697757363319397, + "epoch": 1.9663013924363515, + "grad_norm": 0.6434378623962402, + "learning_rate": 2.0135074328049923e-06, + "loss": 1.3812, + "mean_token_accuracy": 0.6578892767429352, + "num_tokens": 3000562786.0, + "step": 17899 + }, + { + "entropy": 1.6748135387897491, + "epoch": 1.9664112493477246, + "grad_norm": 0.6851823329925537, + "learning_rate": 2.0134198863583563e-06, + "loss": 1.4003, + "mean_token_accuracy": 0.6644940574963888, + "num_tokens": 3000707563.0, + "step": 17900 + }, + { + "entropy": 1.7217712998390198, + "epoch": 1.9665211062590975, + "grad_norm": 0.6717512011528015, + "learning_rate": 2.0133326243329327e-06, + "loss": 1.2687, + "mean_token_accuracy": 0.6760751704374949, + "num_tokens": 3000863358.0, + "step": 17901 + }, + { + "entropy": 1.6838585337003071, + "epoch": 1.9666309631704704, + "grad_norm": 0.7156099677085876, + "learning_rate": 2.0132456467314814e-06, + "loss": 1.4277, + "mean_token_accuracy": 0.649495929479599, + "num_tokens": 3001021623.0, + "step": 17902 + }, + { + "entropy": 1.6566158632437389, + "epoch": 1.9667408200818435, + "grad_norm": 0.6351657509803772, + "learning_rate": 2.0131589535567566e-06, + "loss": 1.4425, + "mean_token_accuracy": 0.6564101775487264, + "num_tokens": 3001189910.0, + "step": 17903 + }, + { + "entropy": 1.6897284885247548, + "epoch": 1.9668506769932164, + "grad_norm": 0.6042178273200989, + "learning_rate": 2.0130725448115005e-06, + "loss": 1.3028, + "mean_token_accuracy": 0.6670292864243189, + "num_tokens": 3001334695.0, + "step": 17904 + }, + { + "entropy": 1.6312000652154286, + "epoch": 1.9669605339045892, + "grad_norm": 0.59149169921875, + "learning_rate": 2.012986420498449e-06, + "loss": 1.3407, + "mean_token_accuracy": 0.6634353597958883, + "num_tokens": 3001513880.0, + "step": 17905 + }, + { + "entropy": 1.697037806113561, + "epoch": 1.9670703908159624, + "grad_norm": 0.6991804242134094, + "learning_rate": 2.0129005806203278e-06, + "loss": 1.3814, + "mean_token_accuracy": 0.6621117989222208, + "num_tokens": 3001658835.0, + "step": 17906 + }, + { + "entropy": 1.747629165649414, + "epoch": 1.967180247727335, + "grad_norm": 0.7504332065582275, + "learning_rate": 2.0128150251798533e-06, + "loss": 1.3286, + "mean_token_accuracy": 0.6650111377239227, + "num_tokens": 3001809013.0, + "step": 17907 + }, + { + "entropy": 1.7131927410761516, + "epoch": 1.9672901046387081, + "grad_norm": 0.7261272668838501, + "learning_rate": 2.0127297541797336e-06, + "loss": 1.3738, + "mean_token_accuracy": 0.6725722004969915, + "num_tokens": 3001948343.0, + "step": 17908 + }, + { + "entropy": 1.6718792418638866, + "epoch": 1.967399961550081, + "grad_norm": 0.6443445086479187, + "learning_rate": 2.0126447676226678e-06, + "loss": 1.3358, + "mean_token_accuracy": 0.6656797925631205, + "num_tokens": 3002112297.0, + "step": 17909 + }, + { + "entropy": 1.6951535542805989, + "epoch": 1.967509818461454, + "grad_norm": 0.7346131205558777, + "learning_rate": 2.012560065511345e-06, + "loss": 1.318, + "mean_token_accuracy": 0.6694110383590063, + "num_tokens": 3002236731.0, + "step": 17910 + }, + { + "entropy": 1.7302409609158833, + "epoch": 1.967619675372827, + "grad_norm": 0.625469982624054, + "learning_rate": 2.012475647848446e-06, + "loss": 1.3161, + "mean_token_accuracy": 0.6690036505460739, + "num_tokens": 3002373310.0, + "step": 17911 + }, + { + "entropy": 1.6581771274407704, + "epoch": 1.9677295322841997, + "grad_norm": 0.5975332260131836, + "learning_rate": 2.0123915146366434e-06, + "loss": 1.3223, + "mean_token_accuracy": 0.6745662887891134, + "num_tokens": 3002542926.0, + "step": 17912 + }, + { + "entropy": 1.6342376867930095, + "epoch": 1.9678393891955728, + "grad_norm": 0.6779927611351013, + "learning_rate": 2.012307665878599e-06, + "loss": 1.3618, + "mean_token_accuracy": 0.6652511854966482, + "num_tokens": 3002702125.0, + "step": 17913 + }, + { + "entropy": 1.714419464270274, + "epoch": 1.9679492461069457, + "grad_norm": 0.6036478877067566, + "learning_rate": 2.0122241015769676e-06, + "loss": 1.378, + "mean_token_accuracy": 0.6464525610208511, + "num_tokens": 3002865175.0, + "step": 17914 + }, + { + "entropy": 1.7365634739398956, + "epoch": 1.9680591030183185, + "grad_norm": 0.9775516390800476, + "learning_rate": 2.0121408217343923e-06, + "loss": 1.3371, + "mean_token_accuracy": 0.6688741395870844, + "num_tokens": 3002997739.0, + "step": 17915 + }, + { + "entropy": 1.7164349257946014, + "epoch": 1.9681689599296917, + "grad_norm": 0.6772252321243286, + "learning_rate": 2.0120578263535116e-06, + "loss": 1.4787, + "mean_token_accuracy": 0.6568474372227987, + "num_tokens": 3003173190.0, + "step": 17916 + }, + { + "entropy": 1.6874643166859944, + "epoch": 1.9682788168410645, + "grad_norm": 0.6911616325378418, + "learning_rate": 2.01197511543695e-06, + "loss": 1.4895, + "mean_token_accuracy": 0.6461264938116074, + "num_tokens": 3003386472.0, + "step": 17917 + }, + { + "entropy": 1.7422433296839397, + "epoch": 1.9683886737524374, + "grad_norm": 0.7325913906097412, + "learning_rate": 2.011892688987325e-06, + "loss": 1.5443, + "mean_token_accuracy": 0.6517143944899241, + "num_tokens": 3003544907.0, + "step": 17918 + }, + { + "entropy": 1.7027767598628998, + "epoch": 1.9684985306638105, + "grad_norm": 0.6650202870368958, + "learning_rate": 2.011810547007247e-06, + "loss": 1.3508, + "mean_token_accuracy": 0.6672835250695547, + "num_tokens": 3003683798.0, + "step": 17919 + }, + { + "entropy": 1.6662018199761708, + "epoch": 1.9686083875751832, + "grad_norm": 0.8393872976303101, + "learning_rate": 2.0117286894993153e-06, + "loss": 1.2116, + "mean_token_accuracy": 0.684177945057551, + "num_tokens": 3003813303.0, + "step": 17920 + }, + { + "entropy": 1.6738957564036052, + "epoch": 1.9687182444865563, + "grad_norm": 0.6645524501800537, + "learning_rate": 2.01164711646612e-06, + "loss": 1.4325, + "mean_token_accuracy": 0.6557613164186478, + "num_tokens": 3003993607.0, + "step": 17921 + }, + { + "entropy": 1.6993304590384166, + "epoch": 1.9688281013979292, + "grad_norm": 0.8092979788780212, + "learning_rate": 2.0115658279102425e-06, + "loss": 1.5485, + "mean_token_accuracy": 0.632763127485911, + "num_tokens": 3004208309.0, + "step": 17922 + }, + { + "entropy": 1.7931698858737946, + "epoch": 1.968937958309302, + "grad_norm": 0.6377851963043213, + "learning_rate": 2.011484823834258e-06, + "loss": 1.3844, + "mean_token_accuracy": 0.6499971399704615, + "num_tokens": 3004364405.0, + "step": 17923 + }, + { + "entropy": 1.7573831876118977, + "epoch": 1.9690478152206752, + "grad_norm": 0.729793131351471, + "learning_rate": 2.0114041042407263e-06, + "loss": 1.381, + "mean_token_accuracy": 0.6636466036240259, + "num_tokens": 3004561554.0, + "step": 17924 + }, + { + "entropy": 1.6667506992816925, + "epoch": 1.9691576721320478, + "grad_norm": 0.685034990310669, + "learning_rate": 2.0113236691322057e-06, + "loss": 1.5663, + "mean_token_accuracy": 0.6583135426044464, + "num_tokens": 3004756452.0, + "step": 17925 + }, + { + "entropy": 1.6636810302734375, + "epoch": 1.969267529043421, + "grad_norm": 0.660313606262207, + "learning_rate": 2.0112435185112403e-06, + "loss": 1.423, + "mean_token_accuracy": 0.6492910335461298, + "num_tokens": 3004946559.0, + "step": 17926 + }, + { + "entropy": 1.6946007212003071, + "epoch": 1.9693773859547938, + "grad_norm": 0.6375925540924072, + "learning_rate": 2.0111636523803675e-06, + "loss": 1.3594, + "mean_token_accuracy": 0.6636908402045568, + "num_tokens": 3005102722.0, + "step": 17927 + }, + { + "entropy": 1.6661100486914318, + "epoch": 1.9694872428661667, + "grad_norm": 0.6003373265266418, + "learning_rate": 2.011084070742114e-06, + "loss": 1.5099, + "mean_token_accuracy": 0.657312293847402, + "num_tokens": 3005269206.0, + "step": 17928 + }, + { + "entropy": 1.7043770054976146, + "epoch": 1.9695970997775398, + "grad_norm": 0.6125054359436035, + "learning_rate": 2.0110047735989994e-06, + "loss": 1.474, + "mean_token_accuracy": 0.6454812387625376, + "num_tokens": 3005453743.0, + "step": 17929 + }, + { + "entropy": 1.71139990290006, + "epoch": 1.9697069566889127, + "grad_norm": 0.6473939418792725, + "learning_rate": 2.0109257609535333e-06, + "loss": 1.4996, + "mean_token_accuracy": 0.6278078705072403, + "num_tokens": 3005689388.0, + "step": 17930 + }, + { + "entropy": 1.654092291990916, + "epoch": 1.9698168136002856, + "grad_norm": 0.6603448390960693, + "learning_rate": 2.010847032808216e-06, + "loss": 1.1989, + "mean_token_accuracy": 0.6887961675723394, + "num_tokens": 3005819944.0, + "step": 17931 + }, + { + "entropy": 1.6703099111715953, + "epoch": 1.9699266705116587, + "grad_norm": 0.7166707515716553, + "learning_rate": 2.0107685891655396e-06, + "loss": 1.3184, + "mean_token_accuracy": 0.6670674930016199, + "num_tokens": 3005962932.0, + "step": 17932 + }, + { + "entropy": 1.6925211747487385, + "epoch": 1.9700365274230314, + "grad_norm": 0.8018051981925964, + "learning_rate": 2.0106904300279875e-06, + "loss": 1.1981, + "mean_token_accuracy": 0.6955769310394923, + "num_tokens": 3006075267.0, + "step": 17933 + }, + { + "entropy": 1.6854363183180492, + "epoch": 1.9701463843344045, + "grad_norm": 0.6480568647384644, + "learning_rate": 2.010612555398032e-06, + "loss": 1.3139, + "mean_token_accuracy": 0.6723464528719584, + "num_tokens": 3006253222.0, + "step": 17934 + }, + { + "entropy": 1.6880821188290913, + "epoch": 1.9702562412457774, + "grad_norm": 0.7775602340698242, + "learning_rate": 2.0105349652781383e-06, + "loss": 1.2487, + "mean_token_accuracy": 0.6716853429873785, + "num_tokens": 3006374921.0, + "step": 17935 + }, + { + "entropy": 1.725311279296875, + "epoch": 1.9703660981571502, + "grad_norm": 0.8793063163757324, + "learning_rate": 2.0104576596707627e-06, + "loss": 1.451, + "mean_token_accuracy": 0.6542079498370489, + "num_tokens": 3006567762.0, + "step": 17936 + }, + { + "entropy": 1.6732623775800068, + "epoch": 1.9704759550685234, + "grad_norm": 0.6772112846374512, + "learning_rate": 2.0103806385783504e-06, + "loss": 1.2661, + "mean_token_accuracy": 0.6756617873907089, + "num_tokens": 3006709794.0, + "step": 17937 + }, + { + "entropy": 1.7369797627131145, + "epoch": 1.9705858119798962, + "grad_norm": 0.7336527109146118, + "learning_rate": 2.0103039020033403e-06, + "loss": 1.4932, + "mean_token_accuracy": 0.6467774361371994, + "num_tokens": 3006873792.0, + "step": 17938 + }, + { + "entropy": 1.671160767475764, + "epoch": 1.9706956688912691, + "grad_norm": 0.684969961643219, + "learning_rate": 2.0102274499481617e-06, + "loss": 1.3979, + "mean_token_accuracy": 0.6610272874434789, + "num_tokens": 3007044737.0, + "step": 17939 + }, + { + "entropy": 1.6652365227540333, + "epoch": 1.970805525802642, + "grad_norm": 0.6617401242256165, + "learning_rate": 2.010151282415233e-06, + "loss": 1.3237, + "mean_token_accuracy": 0.6622043897708257, + "num_tokens": 3007183324.0, + "step": 17940 + }, + { + "entropy": 1.6508016188939412, + "epoch": 1.970915382714015, + "grad_norm": 0.7206847667694092, + "learning_rate": 2.010075399406965e-06, + "loss": 1.4783, + "mean_token_accuracy": 0.6530425846576691, + "num_tokens": 3007373061.0, + "step": 17941 + }, + { + "entropy": 1.7173553705215454, + "epoch": 1.971025239625388, + "grad_norm": 0.741521954536438, + "learning_rate": 2.00999980092576e-06, + "loss": 1.4746, + "mean_token_accuracy": 0.6368623872598013, + "num_tokens": 3007552632.0, + "step": 17942 + }, + { + "entropy": 1.7114310661951702, + "epoch": 1.9711350965367609, + "grad_norm": 0.6318982243537903, + "learning_rate": 2.0099244869740097e-06, + "loss": 1.3902, + "mean_token_accuracy": 0.6469387610753378, + "num_tokens": 3007779921.0, + "step": 17943 + }, + { + "entropy": 1.7579089105129242, + "epoch": 1.9712449534481338, + "grad_norm": 0.7655471563339233, + "learning_rate": 2.0098494575540984e-06, + "loss": 1.3765, + "mean_token_accuracy": 0.6498502790927887, + "num_tokens": 3007926071.0, + "step": 17944 + }, + { + "entropy": 1.6568682293097179, + "epoch": 1.9713548103595069, + "grad_norm": 0.8164061903953552, + "learning_rate": 2.009774712668402e-06, + "loss": 1.373, + "mean_token_accuracy": 0.6605587005615234, + "num_tokens": 3008099242.0, + "step": 17945 + }, + { + "entropy": 1.680535574754079, + "epoch": 1.9714646672708795, + "grad_norm": 0.6372302174568176, + "learning_rate": 2.009700252319283e-06, + "loss": 1.3498, + "mean_token_accuracy": 0.6511732886234919, + "num_tokens": 3008321670.0, + "step": 17946 + }, + { + "entropy": 1.7110346754391987, + "epoch": 1.9715745241822527, + "grad_norm": 0.7086535096168518, + "learning_rate": 2.0096260765091015e-06, + "loss": 1.3739, + "mean_token_accuracy": 0.6551353732744852, + "num_tokens": 3008445047.0, + "step": 17947 + }, + { + "entropy": 1.7076995074748993, + "epoch": 1.9716843810936255, + "grad_norm": 0.7128680348396301, + "learning_rate": 2.0095521852402027e-06, + "loss": 1.3066, + "mean_token_accuracy": 0.6718765745560328, + "num_tokens": 3008565086.0, + "step": 17948 + }, + { + "entropy": 1.6902350385983784, + "epoch": 1.9717942380049984, + "grad_norm": 0.7834108471870422, + "learning_rate": 2.0094785785149257e-06, + "loss": 1.4292, + "mean_token_accuracy": 0.6641406814257304, + "num_tokens": 3008720383.0, + "step": 17949 + }, + { + "entropy": 1.642638107140859, + "epoch": 1.9719040949163715, + "grad_norm": 0.6965975761413574, + "learning_rate": 2.009405256335602e-06, + "loss": 1.2824, + "mean_token_accuracy": 0.6902331511179606, + "num_tokens": 3008836709.0, + "step": 17950 + }, + { + "entropy": 1.769042044878006, + "epoch": 1.9720139518277444, + "grad_norm": 0.9151806235313416, + "learning_rate": 2.0093322187045495e-06, + "loss": 1.5366, + "mean_token_accuracy": 0.6416715830564499, + "num_tokens": 3009021041.0, + "step": 17951 + }, + { + "entropy": 1.705027828613917, + "epoch": 1.9721238087391173, + "grad_norm": 0.6604277491569519, + "learning_rate": 2.0092594656240805e-06, + "loss": 1.2852, + "mean_token_accuracy": 0.6765786459048589, + "num_tokens": 3009177630.0, + "step": 17952 + }, + { + "entropy": 1.740889310836792, + "epoch": 1.9722336656504902, + "grad_norm": 0.7900790572166443, + "learning_rate": 2.0091869970965e-06, + "loss": 1.2679, + "mean_token_accuracy": 0.6788128217061361, + "num_tokens": 3009308901.0, + "step": 17953 + }, + { + "entropy": 1.6999844014644623, + "epoch": 1.972343522561863, + "grad_norm": 0.6160433888435364, + "learning_rate": 2.0091148131240973e-06, + "loss": 1.4133, + "mean_token_accuracy": 0.6537296175956726, + "num_tokens": 3009523642.0, + "step": 17954 + }, + { + "entropy": 1.7226817508538563, + "epoch": 1.9724533794732362, + "grad_norm": 0.7243252992630005, + "learning_rate": 2.0090429137091604e-06, + "loss": 1.3226, + "mean_token_accuracy": 0.6633811742067337, + "num_tokens": 3009666666.0, + "step": 17955 + }, + { + "entropy": 1.7036002576351166, + "epoch": 1.972563236384609, + "grad_norm": 0.6371243596076965, + "learning_rate": 2.0089712988539647e-06, + "loss": 1.3227, + "mean_token_accuracy": 0.6615711351235708, + "num_tokens": 3009817269.0, + "step": 17956 + }, + { + "entropy": 1.6567615171273549, + "epoch": 1.972673093295982, + "grad_norm": 0.7171679735183716, + "learning_rate": 2.008899968560774e-06, + "loss": 1.4099, + "mean_token_accuracy": 0.649737944205602, + "num_tokens": 3009963221.0, + "step": 17957 + }, + { + "entropy": 1.7603488365809123, + "epoch": 1.972782950207355, + "grad_norm": 0.7773517966270447, + "learning_rate": 2.0088289228318493e-06, + "loss": 1.2555, + "mean_token_accuracy": 0.6756314287583033, + "num_tokens": 3010058260.0, + "step": 17958 + }, + { + "entropy": 1.7130048672358196, + "epoch": 1.9728928071187277, + "grad_norm": 0.7091876864433289, + "learning_rate": 2.008758161669438e-06, + "loss": 1.4677, + "mean_token_accuracy": 0.6474677075942358, + "num_tokens": 3010235324.0, + "step": 17959 + }, + { + "entropy": 1.7637153267860413, + "epoch": 1.9730026640301008, + "grad_norm": 0.8148097395896912, + "learning_rate": 2.008687685075778e-06, + "loss": 1.3503, + "mean_token_accuracy": 0.6695531010627747, + "num_tokens": 3010358374.0, + "step": 17960 + }, + { + "entropy": 1.6533841292063396, + "epoch": 1.9731125209414737, + "grad_norm": 0.6595221161842346, + "learning_rate": 2.0086174930531026e-06, + "loss": 1.2779, + "mean_token_accuracy": 0.6797658701737722, + "num_tokens": 3010515100.0, + "step": 17961 + }, + { + "entropy": 1.6787741879622142, + "epoch": 1.9732223778528466, + "grad_norm": 0.6817485690116882, + "learning_rate": 2.0085475856036317e-06, + "loss": 1.6186, + "mean_token_accuracy": 0.6448809107144674, + "num_tokens": 3010711557.0, + "step": 17962 + }, + { + "entropy": 1.7178953389326732, + "epoch": 1.9733322347642197, + "grad_norm": 0.710960865020752, + "learning_rate": 2.0084779627295764e-06, + "loss": 1.4679, + "mean_token_accuracy": 0.64095505575339, + "num_tokens": 3010862378.0, + "step": 17963 + }, + { + "entropy": 1.7367713054021199, + "epoch": 1.9734420916755926, + "grad_norm": 0.7530184388160706, + "learning_rate": 2.008408624433144e-06, + "loss": 1.4309, + "mean_token_accuracy": 0.6414470473925272, + "num_tokens": 3011047947.0, + "step": 17964 + }, + { + "entropy": 1.6682301461696625, + "epoch": 1.9735519485869655, + "grad_norm": 0.747166633605957, + "learning_rate": 2.008339570716525e-06, + "loss": 1.3236, + "mean_token_accuracy": 0.6601849645376205, + "num_tokens": 3011180016.0, + "step": 17965 + }, + { + "entropy": 1.7371576726436615, + "epoch": 1.9736618054983384, + "grad_norm": 0.7257885932922363, + "learning_rate": 2.0082708015819084e-06, + "loss": 1.2824, + "mean_token_accuracy": 0.6802943547566732, + "num_tokens": 3011352111.0, + "step": 17966 + }, + { + "entropy": 1.7696191271146138, + "epoch": 1.9737716624097112, + "grad_norm": 0.6771504878997803, + "learning_rate": 2.008202317031469e-06, + "loss": 1.4747, + "mean_token_accuracy": 0.6517404715220133, + "num_tokens": 3011523618.0, + "step": 17967 + }, + { + "entropy": 1.7489943603674571, + "epoch": 1.9738815193210844, + "grad_norm": 0.6868027448654175, + "learning_rate": 2.0081341170673733e-06, + "loss": 1.4084, + "mean_token_accuracy": 0.6525389303763708, + "num_tokens": 3011696831.0, + "step": 17968 + }, + { + "entropy": 1.7214332520961761, + "epoch": 1.9739913762324572, + "grad_norm": 0.6243865489959717, + "learning_rate": 2.0080662016917824e-06, + "loss": 1.4266, + "mean_token_accuracy": 0.6496342072884241, + "num_tokens": 3011899761.0, + "step": 17969 + }, + { + "entropy": 1.6936425268650055, + "epoch": 1.9741012331438301, + "grad_norm": 0.6805530786514282, + "learning_rate": 2.007998570906844e-06, + "loss": 1.4454, + "mean_token_accuracy": 0.6591909031073252, + "num_tokens": 3012066461.0, + "step": 17970 + }, + { + "entropy": 1.6858153243859608, + "epoch": 1.9742110900552032, + "grad_norm": 0.8352508544921875, + "learning_rate": 2.007931224714698e-06, + "loss": 1.2945, + "mean_token_accuracy": 0.6747691531976064, + "num_tokens": 3012255327.0, + "step": 17971 + }, + { + "entropy": 1.6527644395828247, + "epoch": 1.974320946966576, + "grad_norm": 0.6151508092880249, + "learning_rate": 2.0078641631174775e-06, + "loss": 1.2585, + "mean_token_accuracy": 0.6740232904752096, + "num_tokens": 3012413394.0, + "step": 17972 + }, + { + "entropy": 1.7410860856374104, + "epoch": 1.974430803877949, + "grad_norm": 0.7390371561050415, + "learning_rate": 2.007797386117304e-06, + "loss": 1.38, + "mean_token_accuracy": 0.6596208562453588, + "num_tokens": 3012562797.0, + "step": 17973 + }, + { + "entropy": 1.6805489460627239, + "epoch": 1.9745406607893219, + "grad_norm": 0.5240894556045532, + "learning_rate": 2.007730893716292e-06, + "loss": 1.4129, + "mean_token_accuracy": 0.6490062524875005, + "num_tokens": 3012788257.0, + "step": 17974 + }, + { + "entropy": 1.6937835117181141, + "epoch": 1.9746505177006948, + "grad_norm": 0.7052786350250244, + "learning_rate": 2.0076646859165442e-06, + "loss": 1.4543, + "mean_token_accuracy": 0.6538620889186859, + "num_tokens": 3012948228.0, + "step": 17975 + }, + { + "entropy": 1.68635560075442, + "epoch": 1.9747603746120679, + "grad_norm": 0.6069852709770203, + "learning_rate": 2.0075987627201576e-06, + "loss": 1.4386, + "mean_token_accuracy": 0.6424238681793213, + "num_tokens": 3013179738.0, + "step": 17976 + }, + { + "entropy": 1.720237821340561, + "epoch": 1.9748702315234408, + "grad_norm": 0.6068885922431946, + "learning_rate": 2.007533124129218e-06, + "loss": 1.4261, + "mean_token_accuracy": 0.6477613896131516, + "num_tokens": 3013348074.0, + "step": 17977 + }, + { + "entropy": 1.7363394598166149, + "epoch": 1.9749800884348137, + "grad_norm": 0.7410934567451477, + "learning_rate": 2.0074677701458028e-06, + "loss": 1.4191, + "mean_token_accuracy": 0.6523391604423523, + "num_tokens": 3013487387.0, + "step": 17978 + }, + { + "entropy": 1.6087459822495778, + "epoch": 1.9750899453461865, + "grad_norm": 0.7028157711029053, + "learning_rate": 2.007402700771981e-06, + "loss": 1.3504, + "mean_token_accuracy": 0.6695465197165807, + "num_tokens": 3013654113.0, + "step": 17979 + }, + { + "entropy": 1.7600714067618053, + "epoch": 1.9751998022575594, + "grad_norm": 0.8189085721969604, + "learning_rate": 2.007337916009811e-06, + "loss": 1.4994, + "mean_token_accuracy": 0.6365965008735657, + "num_tokens": 3013843992.0, + "step": 17980 + }, + { + "entropy": 1.75289652744929, + "epoch": 1.9753096591689325, + "grad_norm": 0.7521695494651794, + "learning_rate": 2.0072734158613445e-06, + "loss": 1.3926, + "mean_token_accuracy": 0.6658550798892975, + "num_tokens": 3014006944.0, + "step": 17981 + }, + { + "entropy": 1.6855365534623463, + "epoch": 1.9754195160803054, + "grad_norm": 0.6774551272392273, + "learning_rate": 2.0072092003286216e-06, + "loss": 1.2929, + "mean_token_accuracy": 0.680366670091947, + "num_tokens": 3014157643.0, + "step": 17982 + }, + { + "entropy": 1.7333262066046398, + "epoch": 1.9755293729916783, + "grad_norm": 1.2397245168685913, + "learning_rate": 2.0071452694136757e-06, + "loss": 1.1822, + "mean_token_accuracy": 0.6727334012587866, + "num_tokens": 3014341117.0, + "step": 17983 + }, + { + "entropy": 1.7674992481867473, + "epoch": 1.9756392299030514, + "grad_norm": 0.7200416922569275, + "learning_rate": 2.0070816231185293e-06, + "loss": 1.4271, + "mean_token_accuracy": 0.6562386403481165, + "num_tokens": 3014516310.0, + "step": 17984 + }, + { + "entropy": 1.6975335478782654, + "epoch": 1.975749086814424, + "grad_norm": 0.6550208926200867, + "learning_rate": 2.007018261445197e-06, + "loss": 1.2928, + "mean_token_accuracy": 0.6839944074551264, + "num_tokens": 3014672547.0, + "step": 17985 + }, + { + "entropy": 1.7610229949156444, + "epoch": 1.9758589437257972, + "grad_norm": 0.6302772164344788, + "learning_rate": 2.0069551843956847e-06, + "loss": 1.4284, + "mean_token_accuracy": 0.6451925585667292, + "num_tokens": 3014823710.0, + "step": 17986 + }, + { + "entropy": 1.7303914825121562, + "epoch": 1.97596880063717, + "grad_norm": 0.6791033744812012, + "learning_rate": 2.006892391971989e-06, + "loss": 1.3366, + "mean_token_accuracy": 0.6641490111748377, + "num_tokens": 3014969094.0, + "step": 17987 + }, + { + "entropy": 1.7095533609390259, + "epoch": 1.976078657548543, + "grad_norm": 0.6710312962532043, + "learning_rate": 2.0068298841760956e-06, + "loss": 1.4577, + "mean_token_accuracy": 0.631788025299708, + "num_tokens": 3015156304.0, + "step": 17988 + }, + { + "entropy": 1.7411635220050812, + "epoch": 1.976188514459916, + "grad_norm": 0.6896677017211914, + "learning_rate": 2.006767661009985e-06, + "loss": 1.4357, + "mean_token_accuracy": 0.6591566403706869, + "num_tokens": 3015356842.0, + "step": 17989 + }, + { + "entropy": 1.7042207817236583, + "epoch": 1.976298371371289, + "grad_norm": 0.6808891892433167, + "learning_rate": 2.0067057224756247e-06, + "loss": 1.5334, + "mean_token_accuracy": 0.6491953035195669, + "num_tokens": 3015582556.0, + "step": 17990 + }, + { + "entropy": 1.6519914468129475, + "epoch": 1.9764082282826618, + "grad_norm": 0.6816950440406799, + "learning_rate": 2.006644068574976e-06, + "loss": 1.2486, + "mean_token_accuracy": 0.6812132398287455, + "num_tokens": 3015709616.0, + "step": 17991 + }, + { + "entropy": 1.6635911564032237, + "epoch": 1.976518085194035, + "grad_norm": 0.8687704801559448, + "learning_rate": 2.00658269930999e-06, + "loss": 1.3443, + "mean_token_accuracy": 0.6642593095699946, + "num_tokens": 3015864905.0, + "step": 17992 + }, + { + "entropy": 1.7611885865529378, + "epoch": 1.9766279421054076, + "grad_norm": 0.6940531134605408, + "learning_rate": 2.00652161468261e-06, + "loss": 1.3961, + "mean_token_accuracy": 0.6467735171318054, + "num_tokens": 3016015646.0, + "step": 17993 + }, + { + "entropy": 1.6912155350049336, + "epoch": 1.9767377990167807, + "grad_norm": 0.6218036413192749, + "learning_rate": 2.0064608146947675e-06, + "loss": 1.3728, + "mean_token_accuracy": 0.6538327733675638, + "num_tokens": 3016178510.0, + "step": 17994 + }, + { + "entropy": 1.788610190153122, + "epoch": 1.9768476559281536, + "grad_norm": 0.7761731147766113, + "learning_rate": 2.006400299348387e-06, + "loss": 1.2656, + "mean_token_accuracy": 0.6742220024267832, + "num_tokens": 3016306071.0, + "step": 17995 + }, + { + "entropy": 1.7301820814609528, + "epoch": 1.9769575128395265, + "grad_norm": 0.6517966389656067, + "learning_rate": 2.006340068645385e-06, + "loss": 1.3778, + "mean_token_accuracy": 0.6603502233823141, + "num_tokens": 3016492205.0, + "step": 17996 + }, + { + "entropy": 1.7159747183322906, + "epoch": 1.9770673697508996, + "grad_norm": 0.7238568067550659, + "learning_rate": 2.0062801225876675e-06, + "loss": 1.3778, + "mean_token_accuracy": 0.6613292147715887, + "num_tokens": 3016667084.0, + "step": 17997 + }, + { + "entropy": 1.7366797228654225, + "epoch": 1.9771772266622722, + "grad_norm": 0.6958896517753601, + "learning_rate": 2.0062204611771306e-06, + "loss": 1.4078, + "mean_token_accuracy": 0.6479227592547735, + "num_tokens": 3016828560.0, + "step": 17998 + }, + { + "entropy": 1.7239971260229747, + "epoch": 1.9772870835736454, + "grad_norm": 0.6808164119720459, + "learning_rate": 2.006161084415664e-06, + "loss": 1.5164, + "mean_token_accuracy": 0.6513014584779739, + "num_tokens": 3017060646.0, + "step": 17999 + }, + { + "entropy": 1.7256098488966625, + "epoch": 1.9773969404850182, + "grad_norm": 0.7273125052452087, + "learning_rate": 2.006101992305146e-06, + "loss": 1.3934, + "mean_token_accuracy": 0.6515825539827347, + "num_tokens": 3017199524.0, + "step": 18000 + }, + { + "entropy": 1.7242956161499023, + "epoch": 1.9775067973963911, + "grad_norm": 0.6841662526130676, + "learning_rate": 2.0060431848474487e-06, + "loss": 1.6029, + "mean_token_accuracy": 0.6310764849185944, + "num_tokens": 3017376477.0, + "step": 18001 + }, + { + "entropy": 1.6526127556959789, + "epoch": 1.9776166543077642, + "grad_norm": 0.7398411631584167, + "learning_rate": 2.0059846620444303e-06, + "loss": 1.3041, + "mean_token_accuracy": 0.6696018973986307, + "num_tokens": 3017536030.0, + "step": 18002 + }, + { + "entropy": 1.6591049631436665, + "epoch": 1.9777265112191371, + "grad_norm": 0.7715206146240234, + "learning_rate": 2.0059264238979447e-06, + "loss": 1.3687, + "mean_token_accuracy": 0.6526884287595749, + "num_tokens": 3017710213.0, + "step": 18003 + }, + { + "entropy": 1.698662171761195, + "epoch": 1.97783636813051, + "grad_norm": 0.7030515074729919, + "learning_rate": 2.005868470409835e-06, + "loss": 1.4247, + "mean_token_accuracy": 0.6606688896814982, + "num_tokens": 3017879975.0, + "step": 18004 + }, + { + "entropy": 1.6552705466747284, + "epoch": 1.977946225041883, + "grad_norm": 0.6308731436729431, + "learning_rate": 2.0058108015819362e-06, + "loss": 1.2674, + "mean_token_accuracy": 0.6792994836966196, + "num_tokens": 3018008327.0, + "step": 18005 + }, + { + "entropy": 1.7466795146465302, + "epoch": 1.9780560819532558, + "grad_norm": 0.7210440635681152, + "learning_rate": 2.0057534174160713e-06, + "loss": 1.3281, + "mean_token_accuracy": 0.673799475034078, + "num_tokens": 3018133227.0, + "step": 18006 + }, + { + "entropy": 1.7657952308654785, + "epoch": 1.9781659388646289, + "grad_norm": 0.9165833592414856, + "learning_rate": 2.0056963179140585e-06, + "loss": 1.4521, + "mean_token_accuracy": 0.6580042143662771, + "num_tokens": 3018281843.0, + "step": 18007 + }, + { + "entropy": 1.697232147057851, + "epoch": 1.9782757957760018, + "grad_norm": 0.6184810996055603, + "learning_rate": 2.005639503077705e-06, + "loss": 1.4727, + "mean_token_accuracy": 0.6428412993748983, + "num_tokens": 3018460132.0, + "step": 18008 + }, + { + "entropy": 1.6890581647555034, + "epoch": 1.9783856526873747, + "grad_norm": 0.6864956617355347, + "learning_rate": 2.005582972908807e-06, + "loss": 1.4036, + "mean_token_accuracy": 0.6568064391613007, + "num_tokens": 3018656885.0, + "step": 18009 + }, + { + "entropy": 1.715958833694458, + "epoch": 1.9784955095987478, + "grad_norm": 0.6373485326766968, + "learning_rate": 2.0055267274091552e-06, + "loss": 1.3331, + "mean_token_accuracy": 0.6668579330046972, + "num_tokens": 3018801487.0, + "step": 18010 + }, + { + "entropy": 1.6817961037158966, + "epoch": 1.9786053665101204, + "grad_norm": 0.7401055097579956, + "learning_rate": 2.0054707665805303e-06, + "loss": 1.3845, + "mean_token_accuracy": 0.661645824710528, + "num_tokens": 3018946548.0, + "step": 18011 + }, + { + "entropy": 1.663213074207306, + "epoch": 1.9787152234214935, + "grad_norm": 0.670312762260437, + "learning_rate": 2.0054150904247017e-06, + "loss": 1.4159, + "mean_token_accuracy": 0.6704086015621821, + "num_tokens": 3019135682.0, + "step": 18012 + }, + { + "entropy": 1.6933028101921082, + "epoch": 1.9788250803328664, + "grad_norm": 0.6959567666053772, + "learning_rate": 2.0053596989434325e-06, + "loss": 1.4025, + "mean_token_accuracy": 0.6613306552171707, + "num_tokens": 3019288636.0, + "step": 18013 + }, + { + "entropy": 1.65764586130778, + "epoch": 1.9789349372442393, + "grad_norm": 0.68830406665802, + "learning_rate": 2.0053045921384766e-06, + "loss": 1.4326, + "mean_token_accuracy": 0.6502135346333185, + "num_tokens": 3019479477.0, + "step": 18014 + }, + { + "entropy": 1.6973117391268413, + "epoch": 1.9790447941556124, + "grad_norm": 0.6733773946762085, + "learning_rate": 2.005249770011576e-06, + "loss": 1.5697, + "mean_token_accuracy": 0.6302972286939621, + "num_tokens": 3019680116.0, + "step": 18015 + }, + { + "entropy": 1.654345730940501, + "epoch": 1.9791546510669853, + "grad_norm": 0.6176797151565552, + "learning_rate": 2.005195232564469e-06, + "loss": 1.3239, + "mean_token_accuracy": 0.6690139671166738, + "num_tokens": 3019863791.0, + "step": 18016 + }, + { + "entropy": 1.7217604120572407, + "epoch": 1.9792645079783582, + "grad_norm": 0.8068521022796631, + "learning_rate": 2.005140979798878e-06, + "loss": 1.4333, + "mean_token_accuracy": 0.6493235329786936, + "num_tokens": 3020037805.0, + "step": 18017 + }, + { + "entropy": 1.7371099591255188, + "epoch": 1.9793743648897313, + "grad_norm": 0.7657269239425659, + "learning_rate": 2.005087011716523e-06, + "loss": 1.4369, + "mean_token_accuracy": 0.6623478010296822, + "num_tokens": 3020184002.0, + "step": 18018 + }, + { + "entropy": 1.7969795564810436, + "epoch": 1.979484221801104, + "grad_norm": 0.7154170274734497, + "learning_rate": 2.0050333283191096e-06, + "loss": 1.5782, + "mean_token_accuracy": 0.6230311791102091, + "num_tokens": 3020363052.0, + "step": 18019 + }, + { + "entropy": 1.7187202374140422, + "epoch": 1.979594078712477, + "grad_norm": 0.576965868473053, + "learning_rate": 2.0049799296083384e-06, + "loss": 1.1483, + "mean_token_accuracy": 0.676199659705162, + "num_tokens": 3020557205.0, + "step": 18020 + }, + { + "entropy": 1.707470417022705, + "epoch": 1.97970393562385, + "grad_norm": 0.759087324142456, + "learning_rate": 2.0049268155859003e-06, + "loss": 1.4009, + "mean_token_accuracy": 0.6503723512093226, + "num_tokens": 3020700036.0, + "step": 18021 + }, + { + "entropy": 1.6869849860668182, + "epoch": 1.9798137925352228, + "grad_norm": 0.7738426923751831, + "learning_rate": 2.0048739862534737e-06, + "loss": 1.3752, + "mean_token_accuracy": 0.6719378630320231, + "num_tokens": 3020861112.0, + "step": 18022 + }, + { + "entropy": 1.7128291428089142, + "epoch": 1.979923649446596, + "grad_norm": 0.6039331555366516, + "learning_rate": 2.004821441612733e-06, + "loss": 1.4135, + "mean_token_accuracy": 0.6554146458705267, + "num_tokens": 3021038362.0, + "step": 18023 + }, + { + "entropy": 1.7349358598391216, + "epoch": 1.9800335063579686, + "grad_norm": 0.6787173748016357, + "learning_rate": 2.0047691816653407e-06, + "loss": 1.4621, + "mean_token_accuracy": 0.6508260667324066, + "num_tokens": 3021217797.0, + "step": 18024 + }, + { + "entropy": 1.6685168147087097, + "epoch": 1.9801433632693417, + "grad_norm": 0.705312967300415, + "learning_rate": 2.0047172064129493e-06, + "loss": 1.4366, + "mean_token_accuracy": 0.660369485616684, + "num_tokens": 3021410023.0, + "step": 18025 + }, + { + "entropy": 1.7754539052645366, + "epoch": 1.9802532201807146, + "grad_norm": 0.5943109393119812, + "learning_rate": 2.004665515857206e-06, + "loss": 1.3934, + "mean_token_accuracy": 0.6560729245344797, + "num_tokens": 3021580962.0, + "step": 18026 + }, + { + "entropy": 1.680273950099945, + "epoch": 1.9803630770920875, + "grad_norm": 0.6903275847434998, + "learning_rate": 2.004614109999745e-06, + "loss": 1.3749, + "mean_token_accuracy": 0.6597232023874918, + "num_tokens": 3021788227.0, + "step": 18027 + }, + { + "entropy": 1.670421948035558, + "epoch": 1.9804729340034606, + "grad_norm": 0.6176491379737854, + "learning_rate": 2.0045629888421937e-06, + "loss": 1.3391, + "mean_token_accuracy": 0.6674363017082214, + "num_tokens": 3021959993.0, + "step": 18028 + }, + { + "entropy": 1.7221202949682872, + "epoch": 1.9805827909148335, + "grad_norm": 0.8627282977104187, + "learning_rate": 2.004512152386172e-06, + "loss": 1.4449, + "mean_token_accuracy": 0.6531087706486384, + "num_tokens": 3022159424.0, + "step": 18029 + }, + { + "entropy": 1.7343119382858276, + "epoch": 1.9806926478262064, + "grad_norm": 0.6736763119697571, + "learning_rate": 2.0044616006332864e-06, + "loss": 1.3932, + "mean_token_accuracy": 0.6408475587765375, + "num_tokens": 3022327707.0, + "step": 18030 + }, + { + "entropy": 1.710096687078476, + "epoch": 1.9808025047375795, + "grad_norm": 0.6935316324234009, + "learning_rate": 2.0044113335851365e-06, + "loss": 1.4266, + "mean_token_accuracy": 0.6543708691994349, + "num_tokens": 3022506535.0, + "step": 18031 + }, + { + "entropy": 1.7249768376350403, + "epoch": 1.9809123616489521, + "grad_norm": 0.7204832434654236, + "learning_rate": 2.004361351243316e-06, + "loss": 1.368, + "mean_token_accuracy": 0.6743978013594946, + "num_tokens": 3022623060.0, + "step": 18032 + }, + { + "entropy": 1.7260689040025075, + "epoch": 1.9810222185603252, + "grad_norm": 0.5968881845474243, + "learning_rate": 2.004311653609404e-06, + "loss": 1.531, + "mean_token_accuracy": 0.6207184543212255, + "num_tokens": 3022847515.0, + "step": 18033 + }, + { + "entropy": 1.7605952223141987, + "epoch": 1.9811320754716981, + "grad_norm": 0.677291989326477, + "learning_rate": 2.004262240684976e-06, + "loss": 1.3908, + "mean_token_accuracy": 0.6491524130105972, + "num_tokens": 3023004730.0, + "step": 18034 + }, + { + "entropy": 1.7027316590150197, + "epoch": 1.981241932383071, + "grad_norm": 0.590280294418335, + "learning_rate": 2.004213112471593e-06, + "loss": 1.4213, + "mean_token_accuracy": 0.6485264748334885, + "num_tokens": 3023213682.0, + "step": 18035 + }, + { + "entropy": 1.7386829058329265, + "epoch": 1.981351789294444, + "grad_norm": 0.7665061950683594, + "learning_rate": 2.004164268970812e-06, + "loss": 1.379, + "mean_token_accuracy": 0.6652245422204336, + "num_tokens": 3023388373.0, + "step": 18036 + }, + { + "entropy": 1.7038521766662598, + "epoch": 1.9814616462058168, + "grad_norm": 0.7388865947723389, + "learning_rate": 2.004115710184179e-06, + "loss": 1.3043, + "mean_token_accuracy": 0.6681536138057709, + "num_tokens": 3023530065.0, + "step": 18037 + }, + { + "entropy": 1.7088390787442524, + "epoch": 1.9815715031171899, + "grad_norm": 0.8848956227302551, + "learning_rate": 2.004067436113229e-06, + "loss": 1.3557, + "mean_token_accuracy": 0.6594817042350769, + "num_tokens": 3023709403.0, + "step": 18038 + }, + { + "entropy": 1.6985073586304982, + "epoch": 1.9816813600285628, + "grad_norm": 0.5689802169799805, + "learning_rate": 2.004019446759491e-06, + "loss": 1.3975, + "mean_token_accuracy": 0.663287435968717, + "num_tokens": 3023913464.0, + "step": 18039 + }, + { + "entropy": 1.6951737900575001, + "epoch": 1.9817912169399357, + "grad_norm": 0.7058504819869995, + "learning_rate": 2.0039717421244838e-06, + "loss": 1.3111, + "mean_token_accuracy": 0.6672380814949671, + "num_tokens": 3024026724.0, + "step": 18040 + }, + { + "entropy": 1.733831246693929, + "epoch": 1.9819010738513088, + "grad_norm": 0.7383568286895752, + "learning_rate": 2.003924322209718e-06, + "loss": 1.2293, + "mean_token_accuracy": 0.6893104861179987, + "num_tokens": 3024117143.0, + "step": 18041 + }, + { + "entropy": 1.6926162540912628, + "epoch": 1.9820109307626816, + "grad_norm": 0.7105924487113953, + "learning_rate": 2.0038771870166933e-06, + "loss": 1.278, + "mean_token_accuracy": 0.6817958305279413, + "num_tokens": 3024256149.0, + "step": 18042 + }, + { + "entropy": 1.722558597723643, + "epoch": 1.9821207876740545, + "grad_norm": 0.6348571181297302, + "learning_rate": 2.0038303365469026e-06, + "loss": 1.398, + "mean_token_accuracy": 0.6517468144496282, + "num_tokens": 3024410077.0, + "step": 18043 + }, + { + "entropy": 1.6495929559071858, + "epoch": 1.9822306445854276, + "grad_norm": 0.6759068965911865, + "learning_rate": 2.0037837708018268e-06, + "loss": 1.438, + "mean_token_accuracy": 0.6489085604747137, + "num_tokens": 3024594527.0, + "step": 18044 + }, + { + "entropy": 1.7154331902662914, + "epoch": 1.9823405014968003, + "grad_norm": 0.7698425054550171, + "learning_rate": 2.0037374897829413e-06, + "loss": 1.4302, + "mean_token_accuracy": 0.6621577441692352, + "num_tokens": 3024748417.0, + "step": 18045 + }, + { + "entropy": 1.694651484489441, + "epoch": 1.9824503584081734, + "grad_norm": 0.7667627930641174, + "learning_rate": 2.0036914934917106e-06, + "loss": 1.4109, + "mean_token_accuracy": 0.6532551348209381, + "num_tokens": 3024893648.0, + "step": 18046 + }, + { + "entropy": 1.6467284560203552, + "epoch": 1.9825602153195463, + "grad_norm": 0.6645065546035767, + "learning_rate": 2.00364578192959e-06, + "loss": 1.4022, + "mean_token_accuracy": 0.6770640710989634, + "num_tokens": 3025055067.0, + "step": 18047 + }, + { + "entropy": 1.7726899286111195, + "epoch": 1.9826700722309192, + "grad_norm": 0.6508256793022156, + "learning_rate": 2.003600355098027e-06, + "loss": 1.4051, + "mean_token_accuracy": 0.6635939379533132, + "num_tokens": 3025246791.0, + "step": 18048 + }, + { + "entropy": 1.7325632472832997, + "epoch": 1.9827799291422923, + "grad_norm": 0.7848646640777588, + "learning_rate": 2.0035552129984595e-06, + "loss": 1.4272, + "mean_token_accuracy": 0.6580934077501297, + "num_tokens": 3025385806.0, + "step": 18049 + }, + { + "entropy": 1.7530264457066853, + "epoch": 1.982889786053665, + "grad_norm": 0.6446986794471741, + "learning_rate": 2.003510355632314e-06, + "loss": 1.3292, + "mean_token_accuracy": 0.6607964038848877, + "num_tokens": 3025565951.0, + "step": 18050 + }, + { + "entropy": 1.6674051980177562, + "epoch": 1.982999642965038, + "grad_norm": 0.7394304275512695, + "learning_rate": 2.003465783001013e-06, + "loss": 1.3453, + "mean_token_accuracy": 0.6573042770226797, + "num_tokens": 3025704826.0, + "step": 18051 + }, + { + "entropy": 1.6773662368456523, + "epoch": 1.983109499876411, + "grad_norm": 0.558399498462677, + "learning_rate": 2.003421495105966e-06, + "loss": 1.4961, + "mean_token_accuracy": 0.6545028338829676, + "num_tokens": 3025925349.0, + "step": 18052 + }, + { + "entropy": 1.7445982893308003, + "epoch": 1.9832193567877838, + "grad_norm": 0.7014199495315552, + "learning_rate": 2.003377491948574e-06, + "loss": 1.2774, + "mean_token_accuracy": 0.6735482960939407, + "num_tokens": 3026050899.0, + "step": 18053 + }, + { + "entropy": 1.7237263023853302, + "epoch": 1.983329213699157, + "grad_norm": 0.7317463755607605, + "learning_rate": 2.0033337735302303e-06, + "loss": 1.285, + "mean_token_accuracy": 0.6770526617765427, + "num_tokens": 3026212822.0, + "step": 18054 + }, + { + "entropy": 1.6821326514085133, + "epoch": 1.9834390706105298, + "grad_norm": 0.793610692024231, + "learning_rate": 2.003290339852319e-06, + "loss": 1.242, + "mean_token_accuracy": 0.6760970403750738, + "num_tokens": 3026380803.0, + "step": 18055 + }, + { + "entropy": 1.6886110802491505, + "epoch": 1.9835489275219027, + "grad_norm": 0.621466338634491, + "learning_rate": 2.003247190916215e-06, + "loss": 1.2854, + "mean_token_accuracy": 0.6679765482743582, + "num_tokens": 3026512384.0, + "step": 18056 + }, + { + "entropy": 1.7125110626220703, + "epoch": 1.9836587844332758, + "grad_norm": 0.7082045674324036, + "learning_rate": 2.0032043267232827e-06, + "loss": 1.2844, + "mean_token_accuracy": 0.6690214524666468, + "num_tokens": 3026634106.0, + "step": 18057 + }, + { + "entropy": 1.6765822370847066, + "epoch": 1.9837686413446485, + "grad_norm": 0.7026678919792175, + "learning_rate": 2.00316174727488e-06, + "loss": 1.3097, + "mean_token_accuracy": 0.6639251758654913, + "num_tokens": 3026780481.0, + "step": 18058 + }, + { + "entropy": 1.7151767710844676, + "epoch": 1.9838784982560216, + "grad_norm": 0.7121601104736328, + "learning_rate": 2.0031194525723535e-06, + "loss": 1.3883, + "mean_token_accuracy": 0.6660854717095693, + "num_tokens": 3026923173.0, + "step": 18059 + }, + { + "entropy": 1.6836934685707092, + "epoch": 1.9839883551673945, + "grad_norm": 0.7249175906181335, + "learning_rate": 2.003077442617042e-06, + "loss": 1.394, + "mean_token_accuracy": 0.6518472333749136, + "num_tokens": 3027118633.0, + "step": 18060 + }, + { + "entropy": 1.6882561047871907, + "epoch": 1.9840982120787674, + "grad_norm": 0.797842800617218, + "learning_rate": 2.0030357174102765e-06, + "loss": 1.311, + "mean_token_accuracy": 0.6653191695610682, + "num_tokens": 3027259226.0, + "step": 18061 + }, + { + "entropy": 1.622244934240977, + "epoch": 1.9842080689901405, + "grad_norm": 0.6981305480003357, + "learning_rate": 2.002994276953375e-06, + "loss": 1.2685, + "mean_token_accuracy": 0.6686497926712036, + "num_tokens": 3027400721.0, + "step": 18062 + }, + { + "entropy": 1.6579966247081757, + "epoch": 1.9843179259015131, + "grad_norm": 0.7618954181671143, + "learning_rate": 2.002953121247651e-06, + "loss": 1.2649, + "mean_token_accuracy": 0.6726798812548319, + "num_tokens": 3027564629.0, + "step": 18063 + }, + { + "entropy": 1.66702335079511, + "epoch": 1.9844277828128862, + "grad_norm": 0.7515163421630859, + "learning_rate": 2.0029122502944063e-06, + "loss": 1.2735, + "mean_token_accuracy": 0.6686499267816544, + "num_tokens": 3027759663.0, + "step": 18064 + }, + { + "entropy": 1.6984266340732574, + "epoch": 1.9845376397242591, + "grad_norm": 0.6969068050384521, + "learning_rate": 2.002871664094935e-06, + "loss": 1.2867, + "mean_token_accuracy": 0.6645700335502625, + "num_tokens": 3027883746.0, + "step": 18065 + }, + { + "entropy": 1.7177335818608601, + "epoch": 1.984647496635632, + "grad_norm": 0.5522650480270386, + "learning_rate": 2.0028313626505215e-06, + "loss": 1.0616, + "mean_token_accuracy": 0.6834283471107483, + "num_tokens": 3028051127.0, + "step": 18066 + }, + { + "entropy": 1.6780750850836437, + "epoch": 1.984757353547005, + "grad_norm": 0.6429264545440674, + "learning_rate": 2.002791345962441e-06, + "loss": 1.3301, + "mean_token_accuracy": 0.6632246325413386, + "num_tokens": 3028212961.0, + "step": 18067 + }, + { + "entropy": 1.684230109055837, + "epoch": 1.984867210458378, + "grad_norm": 0.6792988777160645, + "learning_rate": 2.0027516140319604e-06, + "loss": 1.212, + "mean_token_accuracy": 0.6855142414569855, + "num_tokens": 3028319131.0, + "step": 18068 + }, + { + "entropy": 1.6362544397513072, + "epoch": 1.9849770673697509, + "grad_norm": 0.5678415298461914, + "learning_rate": 2.0027121668603362e-06, + "loss": 1.3782, + "mean_token_accuracy": 0.6610343605279922, + "num_tokens": 3028505641.0, + "step": 18069 + }, + { + "entropy": 1.668990820646286, + "epoch": 1.985086924281124, + "grad_norm": 0.6381085515022278, + "learning_rate": 2.0026730044488184e-06, + "loss": 1.3541, + "mean_token_accuracy": 0.6664966394503912, + "num_tokens": 3028655036.0, + "step": 18070 + }, + { + "entropy": 1.6200170914332073, + "epoch": 1.9851967811924967, + "grad_norm": 0.6306815147399902, + "learning_rate": 2.0026341267986454e-06, + "loss": 1.2853, + "mean_token_accuracy": 0.6690742274125417, + "num_tokens": 3028820136.0, + "step": 18071 + }, + { + "entropy": 1.7141570250193279, + "epoch": 1.9853066381038698, + "grad_norm": 0.6135541200637817, + "learning_rate": 2.0025955339110474e-06, + "loss": 1.4239, + "mean_token_accuracy": 0.6421345720688502, + "num_tokens": 3029017262.0, + "step": 18072 + }, + { + "entropy": 1.713169127702713, + "epoch": 1.9854164950152426, + "grad_norm": 0.7709214687347412, + "learning_rate": 2.0025572257872475e-06, + "loss": 1.3818, + "mean_token_accuracy": 0.6472178449233373, + "num_tokens": 3029195490.0, + "step": 18073 + }, + { + "entropy": 1.7533264259497325, + "epoch": 1.9855263519266155, + "grad_norm": 0.6595629453659058, + "learning_rate": 2.002519202428457e-06, + "loss": 1.4027, + "mean_token_accuracy": 0.6588217069705328, + "num_tokens": 3029344302.0, + "step": 18074 + }, + { + "entropy": 1.702713559071223, + "epoch": 1.9856362088379886, + "grad_norm": 0.6595112085342407, + "learning_rate": 2.0024814638358793e-06, + "loss": 1.4347, + "mean_token_accuracy": 0.6574101795752844, + "num_tokens": 3029555592.0, + "step": 18075 + }, + { + "entropy": 1.6746285160382588, + "epoch": 1.9857460657493613, + "grad_norm": 0.7601661086082458, + "learning_rate": 2.002444010010708e-06, + "loss": 1.3132, + "mean_token_accuracy": 0.6680941929419836, + "num_tokens": 3029696305.0, + "step": 18076 + }, + { + "entropy": 1.704234351714452, + "epoch": 1.9858559226607344, + "grad_norm": 0.6155273914337158, + "learning_rate": 2.0024068409541304e-06, + "loss": 1.3917, + "mean_token_accuracy": 0.662379855910937, + "num_tokens": 3029881772.0, + "step": 18077 + }, + { + "entropy": 1.6934775014718373, + "epoch": 1.9859657795721073, + "grad_norm": 0.6397415399551392, + "learning_rate": 2.0023699566673213e-06, + "loss": 1.2694, + "mean_token_accuracy": 0.6719277749458948, + "num_tokens": 3030039562.0, + "step": 18078 + }, + { + "entropy": 1.7212519546349843, + "epoch": 1.9860756364834802, + "grad_norm": 0.6646420955657959, + "learning_rate": 2.0023333571514483e-06, + "loss": 1.4844, + "mean_token_accuracy": 0.6389352331558863, + "num_tokens": 3030268176.0, + "step": 18079 + }, + { + "entropy": 1.6160194476445515, + "epoch": 1.9861854933948533, + "grad_norm": 0.7115574479103088, + "learning_rate": 2.0022970424076705e-06, + "loss": 1.3326, + "mean_token_accuracy": 0.667738159497579, + "num_tokens": 3030445462.0, + "step": 18080 + }, + { + "entropy": 1.729399710893631, + "epoch": 1.9862953503062262, + "grad_norm": 0.5949446558952332, + "learning_rate": 2.002261012437137e-06, + "loss": 1.4196, + "mean_token_accuracy": 0.6431524356206259, + "num_tokens": 3030648761.0, + "step": 18081 + }, + { + "entropy": 1.71317191918691, + "epoch": 1.986405207217599, + "grad_norm": 3.0127835273742676, + "learning_rate": 2.002225267240988e-06, + "loss": 1.3224, + "mean_token_accuracy": 0.6609684824943542, + "num_tokens": 3030837875.0, + "step": 18082 + }, + { + "entropy": 1.7283783555030823, + "epoch": 1.9865150641289722, + "grad_norm": 0.8065574169158936, + "learning_rate": 2.0021898068203545e-06, + "loss": 1.422, + "mean_token_accuracy": 0.6655691017707189, + "num_tokens": 3030997256.0, + "step": 18083 + }, + { + "entropy": 1.6900490025679271, + "epoch": 1.9866249210403448, + "grad_norm": 0.6390381455421448, + "learning_rate": 2.00215463117636e-06, + "loss": 1.2966, + "mean_token_accuracy": 0.6703857729832331, + "num_tokens": 3031132200.0, + "step": 18084 + }, + { + "entropy": 1.660315861304601, + "epoch": 1.986734777951718, + "grad_norm": 0.6758151054382324, + "learning_rate": 2.0021197403101156e-06, + "loss": 1.319, + "mean_token_accuracy": 0.6526903261741003, + "num_tokens": 3031266341.0, + "step": 18085 + }, + { + "entropy": 1.724764307339986, + "epoch": 1.9868446348630908, + "grad_norm": 0.7246851325035095, + "learning_rate": 2.002085134222728e-06, + "loss": 1.358, + "mean_token_accuracy": 0.6704638799031576, + "num_tokens": 3031400811.0, + "step": 18086 + }, + { + "entropy": 1.7192512452602386, + "epoch": 1.9869544917744637, + "grad_norm": 0.6231260895729065, + "learning_rate": 2.002050812915291e-06, + "loss": 1.5262, + "mean_token_accuracy": 0.6408623903989792, + "num_tokens": 3031597704.0, + "step": 18087 + }, + { + "entropy": 1.7471778094768524, + "epoch": 1.9870643486858368, + "grad_norm": 0.7257117033004761, + "learning_rate": 2.0020167763888905e-06, + "loss": 1.6226, + "mean_token_accuracy": 0.6368986219167709, + "num_tokens": 3031759715.0, + "step": 18088 + }, + { + "entropy": 1.6563644409179688, + "epoch": 1.9871742055972095, + "grad_norm": 0.7969918251037598, + "learning_rate": 2.001983024644605e-06, + "loss": 1.1957, + "mean_token_accuracy": 0.68258864680926, + "num_tokens": 3031879721.0, + "step": 18089 + }, + { + "entropy": 1.7332975169022877, + "epoch": 1.9872840625085826, + "grad_norm": 0.7111884951591492, + "learning_rate": 2.0019495576835017e-06, + "loss": 1.2027, + "mean_token_accuracy": 0.678931881984075, + "num_tokens": 3031980175.0, + "step": 18090 + }, + { + "entropy": 1.677456219991048, + "epoch": 1.9873939194199555, + "grad_norm": 0.7319856882095337, + "learning_rate": 2.0019163755066414e-06, + "loss": 1.4709, + "mean_token_accuracy": 0.6506858567396799, + "num_tokens": 3032174655.0, + "step": 18091 + }, + { + "entropy": 1.7617920140425365, + "epoch": 1.9875037763313284, + "grad_norm": 0.7048560976982117, + "learning_rate": 2.0018834781150714e-06, + "loss": 1.3913, + "mean_token_accuracy": 0.6561016142368317, + "num_tokens": 3032322569.0, + "step": 18092 + }, + { + "entropy": 1.6850563287734985, + "epoch": 1.9876136332427015, + "grad_norm": 0.7205668091773987, + "learning_rate": 2.001850865509836e-06, + "loss": 1.2585, + "mean_token_accuracy": 0.6678894609212875, + "num_tokens": 3032456059.0, + "step": 18093 + }, + { + "entropy": 1.6681140661239624, + "epoch": 1.9877234901540743, + "grad_norm": 0.7361934185028076, + "learning_rate": 2.0018185376919665e-06, + "loss": 1.4023, + "mean_token_accuracy": 0.6583593388398489, + "num_tokens": 3032622180.0, + "step": 18094 + }, + { + "entropy": 1.8122372031211853, + "epoch": 1.9878333470654472, + "grad_norm": 0.7276337146759033, + "learning_rate": 2.0017864946624848e-06, + "loss": 1.6099, + "mean_token_accuracy": 0.6246108015378317, + "num_tokens": 3032802456.0, + "step": 18095 + }, + { + "entropy": 1.6701407432556152, + "epoch": 1.9879432039768203, + "grad_norm": 0.6742632985115051, + "learning_rate": 2.001754736422406e-06, + "loss": 1.2446, + "mean_token_accuracy": 0.6852086037397385, + "num_tokens": 3032910500.0, + "step": 18096 + }, + { + "entropy": 1.7621448735396068, + "epoch": 1.988053060888193, + "grad_norm": 0.849463164806366, + "learning_rate": 2.0017232629727345e-06, + "loss": 1.5072, + "mean_token_accuracy": 0.6401193489631017, + "num_tokens": 3033111853.0, + "step": 18097 + }, + { + "entropy": 1.6534665822982788, + "epoch": 1.988162917799566, + "grad_norm": 0.6665313839912415, + "learning_rate": 2.0016920743144674e-06, + "loss": 1.272, + "mean_token_accuracy": 0.6797701468070348, + "num_tokens": 3033255604.0, + "step": 18098 + }, + { + "entropy": 1.7228951156139374, + "epoch": 1.988272774710939, + "grad_norm": 0.6837633848190308, + "learning_rate": 2.0016611704485922e-06, + "loss": 1.2836, + "mean_token_accuracy": 0.671315535902977, + "num_tokens": 3033393490.0, + "step": 18099 + }, + { + "entropy": 1.8056779702504475, + "epoch": 1.9883826316223119, + "grad_norm": 0.7126280069351196, + "learning_rate": 2.001630551376086e-06, + "loss": 1.3957, + "mean_token_accuracy": 0.6486673851807913, + "num_tokens": 3033543350.0, + "step": 18100 + }, + { + "entropy": 1.6797227362791698, + "epoch": 1.988492488533685, + "grad_norm": 0.6651612520217896, + "learning_rate": 2.0016002170979173e-06, + "loss": 1.4704, + "mean_token_accuracy": 0.6411794424057007, + "num_tokens": 3033771985.0, + "step": 18101 + }, + { + "entropy": 1.7329282363255818, + "epoch": 1.9886023454450577, + "grad_norm": 0.7120084762573242, + "learning_rate": 2.0015701676150475e-06, + "loss": 1.4927, + "mean_token_accuracy": 0.6449198176463445, + "num_tokens": 3033916379.0, + "step": 18102 + }, + { + "entropy": 1.7090756595134735, + "epoch": 1.9887122023564308, + "grad_norm": 0.866844117641449, + "learning_rate": 2.001540402928426e-06, + "loss": 1.3771, + "mean_token_accuracy": 0.6641785850127538, + "num_tokens": 3034065801.0, + "step": 18103 + }, + { + "entropy": 1.7364407777786255, + "epoch": 1.9888220592678036, + "grad_norm": 0.7089415788650513, + "learning_rate": 2.001510923038997e-06, + "loss": 1.4758, + "mean_token_accuracy": 0.6599543740351995, + "num_tokens": 3034236864.0, + "step": 18104 + }, + { + "entropy": 1.71084330479304, + "epoch": 1.9889319161791765, + "grad_norm": 0.6918189525604248, + "learning_rate": 2.0014817279476928e-06, + "loss": 1.377, + "mean_token_accuracy": 0.645952895283699, + "num_tokens": 3034406382.0, + "step": 18105 + }, + { + "entropy": 1.7627890010674794, + "epoch": 1.9890417730905496, + "grad_norm": 0.8844940662384033, + "learning_rate": 2.0014528176554367e-06, + "loss": 1.4876, + "mean_token_accuracy": 0.6449083288510641, + "num_tokens": 3034578259.0, + "step": 18106 + }, + { + "entropy": 1.74430717031161, + "epoch": 1.9891516300019225, + "grad_norm": 0.5609480738639832, + "learning_rate": 2.0014241921631433e-06, + "loss": 1.4494, + "mean_token_accuracy": 0.6505746444066366, + "num_tokens": 3034771034.0, + "step": 18107 + }, + { + "entropy": 1.7141135434309642, + "epoch": 1.9892614869132954, + "grad_norm": 0.7115533947944641, + "learning_rate": 2.0013958514717206e-06, + "loss": 1.3258, + "mean_token_accuracy": 0.6693545977274576, + "num_tokens": 3034960817.0, + "step": 18108 + }, + { + "entropy": 1.7084451814492543, + "epoch": 1.9893713438246685, + "grad_norm": 0.6754060983657837, + "learning_rate": 2.001367795582063e-06, + "loss": 1.5179, + "mean_token_accuracy": 0.6482407848040262, + "num_tokens": 3035162145.0, + "step": 18109 + }, + { + "entropy": 1.7450725734233856, + "epoch": 1.9894812007360412, + "grad_norm": 0.7590783834457397, + "learning_rate": 2.001340024495061e-06, + "loss": 1.4425, + "mean_token_accuracy": 0.6480189065138499, + "num_tokens": 3035344376.0, + "step": 18110 + }, + { + "entropy": 1.7174135446548462, + "epoch": 1.9895910576474143, + "grad_norm": 0.6129758358001709, + "learning_rate": 2.0013125382115915e-06, + "loss": 1.2797, + "mean_token_accuracy": 0.669946551322937, + "num_tokens": 3035521562.0, + "step": 18111 + }, + { + "entropy": 1.7383268078168232, + "epoch": 1.9897009145587872, + "grad_norm": 0.7468062043190002, + "learning_rate": 2.0012853367325268e-06, + "loss": 1.3131, + "mean_token_accuracy": 0.6609462102254232, + "num_tokens": 3035705803.0, + "step": 18112 + }, + { + "entropy": 1.7733658452828724, + "epoch": 1.98981077147016, + "grad_norm": 0.665949821472168, + "learning_rate": 2.001258420058725e-06, + "loss": 1.4654, + "mean_token_accuracy": 0.6438464025656382, + "num_tokens": 3035834345.0, + "step": 18113 + }, + { + "entropy": 1.7397722403208415, + "epoch": 1.9899206283815332, + "grad_norm": 0.6292708516120911, + "learning_rate": 2.0012317881910387e-06, + "loss": 1.4591, + "mean_token_accuracy": 0.6419312010208765, + "num_tokens": 3036056500.0, + "step": 18114 + }, + { + "entropy": 1.7050531804561615, + "epoch": 1.9900304852929058, + "grad_norm": 0.6399442553520203, + "learning_rate": 2.0012054411303124e-06, + "loss": 1.4583, + "mean_token_accuracy": 0.6630610624949137, + "num_tokens": 3036230464.0, + "step": 18115 + }, + { + "entropy": 1.6535969475905101, + "epoch": 1.990140342204279, + "grad_norm": 0.8072215914726257, + "learning_rate": 2.0011793788773787e-06, + "loss": 1.1218, + "mean_token_accuracy": 0.6984556714693705, + "num_tokens": 3036333949.0, + "step": 18116 + }, + { + "entropy": 1.7108362515767415, + "epoch": 1.9902501991156518, + "grad_norm": 0.6812421083450317, + "learning_rate": 2.0011536014330627e-06, + "loss": 1.2873, + "mean_token_accuracy": 0.6654827296733856, + "num_tokens": 3036471958.0, + "step": 18117 + }, + { + "entropy": 1.708745191494624, + "epoch": 1.9903600560270247, + "grad_norm": 0.6712941527366638, + "learning_rate": 2.0011281087981796e-06, + "loss": 1.3927, + "mean_token_accuracy": 0.6592844178279241, + "num_tokens": 3036634049.0, + "step": 18118 + }, + { + "entropy": 1.673752874135971, + "epoch": 1.9904699129383978, + "grad_norm": 0.7544751763343811, + "learning_rate": 2.001102900973538e-06, + "loss": 1.3683, + "mean_token_accuracy": 0.667888343334198, + "num_tokens": 3036777513.0, + "step": 18119 + }, + { + "entropy": 1.6410714586575825, + "epoch": 1.9905797698497707, + "grad_norm": 0.6688563227653503, + "learning_rate": 2.0010779779599342e-06, + "loss": 1.4143, + "mean_token_accuracy": 0.6570560932159424, + "num_tokens": 3036987258.0, + "step": 18120 + }, + { + "entropy": 1.7408175667126973, + "epoch": 1.9906896267611436, + "grad_norm": 0.7752634882926941, + "learning_rate": 2.001053339758156e-06, + "loss": 1.346, + "mean_token_accuracy": 0.6612565120061239, + "num_tokens": 3037148408.0, + "step": 18121 + }, + { + "entropy": 1.662128746509552, + "epoch": 1.9907994836725167, + "grad_norm": 0.6829171180725098, + "learning_rate": 2.0010289863689857e-06, + "loss": 1.3705, + "mean_token_accuracy": 0.6696845690409342, + "num_tokens": 3037348031.0, + "step": 18122 + }, + { + "entropy": 1.728913923104604, + "epoch": 1.9909093405838894, + "grad_norm": 0.7454518675804138, + "learning_rate": 2.0010049177931933e-06, + "loss": 1.4135, + "mean_token_accuracy": 0.6618677377700806, + "num_tokens": 3037516275.0, + "step": 18123 + }, + { + "entropy": 1.7775543729464214, + "epoch": 1.9910191974952625, + "grad_norm": 0.8814812898635864, + "learning_rate": 2.0009811340315405e-06, + "loss": 1.3856, + "mean_token_accuracy": 0.6594171871741613, + "num_tokens": 3037647651.0, + "step": 18124 + }, + { + "entropy": 1.7491299907366435, + "epoch": 1.9911290544066353, + "grad_norm": 0.5548111796379089, + "learning_rate": 2.000957635084779e-06, + "loss": 1.4133, + "mean_token_accuracy": 0.6463166773319244, + "num_tokens": 3037834677.0, + "step": 18125 + }, + { + "entropy": 1.695239543914795, + "epoch": 1.9912389113180082, + "grad_norm": 0.7862319946289062, + "learning_rate": 2.0009344209536533e-06, + "loss": 1.5992, + "mean_token_accuracy": 0.6537997101744016, + "num_tokens": 3037978289.0, + "step": 18126 + }, + { + "entropy": 1.7685978809992473, + "epoch": 1.9913487682293813, + "grad_norm": 0.632785439491272, + "learning_rate": 2.000911491638899e-06, + "loss": 1.5919, + "mean_token_accuracy": 0.6281165331602097, + "num_tokens": 3038171790.0, + "step": 18127 + }, + { + "entropy": 1.6535864472389221, + "epoch": 1.991458625140754, + "grad_norm": 0.7036636471748352, + "learning_rate": 2.00088884714124e-06, + "loss": 1.3068, + "mean_token_accuracy": 0.6650771498680115, + "num_tokens": 3038379611.0, + "step": 18128 + }, + { + "entropy": 1.6982588171958923, + "epoch": 1.991568482052127, + "grad_norm": 0.6188250184059143, + "learning_rate": 2.000866487461393e-06, + "loss": 1.5187, + "mean_token_accuracy": 0.622700423002243, + "num_tokens": 3038630107.0, + "step": 18129 + }, + { + "entropy": 1.7043708562850952, + "epoch": 1.9916783389635, + "grad_norm": 0.7103232145309448, + "learning_rate": 2.000844412600068e-06, + "loss": 1.3381, + "mean_token_accuracy": 0.6587745447953542, + "num_tokens": 3038829736.0, + "step": 18130 + }, + { + "entropy": 1.6450840930143993, + "epoch": 1.9917881958748729, + "grad_norm": 0.8464440107345581, + "learning_rate": 2.0008226225579614e-06, + "loss": 1.2032, + "mean_token_accuracy": 0.6772677054007848, + "num_tokens": 3039029703.0, + "step": 18131 + }, + { + "entropy": 1.687016874551773, + "epoch": 1.991898052786246, + "grad_norm": 0.5829222202301025, + "learning_rate": 2.0008011173357644e-06, + "loss": 1.2675, + "mean_token_accuracy": 0.6710990617672602, + "num_tokens": 3039166171.0, + "step": 18132 + }, + { + "entropy": 1.750480592250824, + "epoch": 1.9920079096976189, + "grad_norm": 0.6543905138969421, + "learning_rate": 2.0007798969341565e-06, + "loss": 1.4109, + "mean_token_accuracy": 0.6614688485860825, + "num_tokens": 3039302322.0, + "step": 18133 + }, + { + "entropy": 1.7635838687419891, + "epoch": 1.9921177666089918, + "grad_norm": 0.582114577293396, + "learning_rate": 2.0007589613538104e-06, + "loss": 1.4239, + "mean_token_accuracy": 0.6485844204823176, + "num_tokens": 3039512616.0, + "step": 18134 + }, + { + "entropy": 1.6652332345644634, + "epoch": 1.9922276235203649, + "grad_norm": 0.7193676233291626, + "learning_rate": 2.000738310595387e-06, + "loss": 1.3869, + "mean_token_accuracy": 0.6623914440472921, + "num_tokens": 3039667577.0, + "step": 18135 + }, + { + "entropy": 1.6600812375545502, + "epoch": 1.9923374804317375, + "grad_norm": 0.6071199774742126, + "learning_rate": 2.0007179446595414e-06, + "loss": 1.3844, + "mean_token_accuracy": 0.6484141399463018, + "num_tokens": 3039865053.0, + "step": 18136 + }, + { + "entropy": 1.7320877810319264, + "epoch": 1.9924473373431106, + "grad_norm": 0.7393842339515686, + "learning_rate": 2.0006978635469175e-06, + "loss": 1.3568, + "mean_token_accuracy": 0.6677570939064026, + "num_tokens": 3040001768.0, + "step": 18137 + }, + { + "entropy": 1.694648305575053, + "epoch": 1.9925571942544835, + "grad_norm": 0.7653499245643616, + "learning_rate": 2.000678067258151e-06, + "loss": 1.4478, + "mean_token_accuracy": 0.644600714246432, + "num_tokens": 3040212474.0, + "step": 18138 + }, + { + "entropy": 1.7461239794890087, + "epoch": 1.9926670511658564, + "grad_norm": 0.6602531671524048, + "learning_rate": 2.000658555793869e-06, + "loss": 1.3958, + "mean_token_accuracy": 0.6600701163212458, + "num_tokens": 3040389094.0, + "step": 18139 + }, + { + "entropy": 1.7250058154265087, + "epoch": 1.9927769080772295, + "grad_norm": 0.7628946900367737, + "learning_rate": 2.0006393291546883e-06, + "loss": 1.3985, + "mean_token_accuracy": 0.6515516539414724, + "num_tokens": 3040533558.0, + "step": 18140 + }, + { + "entropy": 1.6692057152589161, + "epoch": 1.9928867649886024, + "grad_norm": 0.6552335619926453, + "learning_rate": 2.0006203873412174e-06, + "loss": 1.5968, + "mean_token_accuracy": 0.6354374637206396, + "num_tokens": 3040772004.0, + "step": 18141 + }, + { + "entropy": 1.7071496148904164, + "epoch": 1.9929966218999753, + "grad_norm": 0.5911110639572144, + "learning_rate": 2.000601730354056e-06, + "loss": 1.3592, + "mean_token_accuracy": 0.6535786141951879, + "num_tokens": 3040998379.0, + "step": 18142 + }, + { + "entropy": 1.7665246824423473, + "epoch": 1.9931064788113482, + "grad_norm": 0.699324369430542, + "learning_rate": 2.000583358193795e-06, + "loss": 1.4768, + "mean_token_accuracy": 0.6513043691714605, + "num_tokens": 3041134684.0, + "step": 18143 + }, + { + "entropy": 1.7260617713133495, + "epoch": 1.993216335722721, + "grad_norm": 0.7378421425819397, + "learning_rate": 2.0005652708610145e-06, + "loss": 1.3723, + "mean_token_accuracy": 0.6644292175769806, + "num_tokens": 3041295927.0, + "step": 18144 + }, + { + "entropy": 1.696333905061086, + "epoch": 1.9933261926340942, + "grad_norm": 0.6919421553611755, + "learning_rate": 2.000547468356289e-06, + "loss": 1.4383, + "mean_token_accuracy": 0.6629768361647924, + "num_tokens": 3041453675.0, + "step": 18145 + }, + { + "entropy": 1.7212949494520824, + "epoch": 1.993436049545467, + "grad_norm": 0.770926296710968, + "learning_rate": 2.0005299506801808e-06, + "loss": 1.4902, + "mean_token_accuracy": 0.6531381358702978, + "num_tokens": 3041633288.0, + "step": 18146 + }, + { + "entropy": 1.6915223002433777, + "epoch": 1.99354590645684, + "grad_norm": 0.7864291071891785, + "learning_rate": 2.000512717833244e-06, + "loss": 1.4262, + "mean_token_accuracy": 0.6496833662192026, + "num_tokens": 3041787947.0, + "step": 18147 + }, + { + "entropy": 1.6634085575739543, + "epoch": 1.993655763368213, + "grad_norm": 0.7216148376464844, + "learning_rate": 2.0004957698160243e-06, + "loss": 1.351, + "mean_token_accuracy": 0.6673944791158041, + "num_tokens": 3041922058.0, + "step": 18148 + }, + { + "entropy": 1.7852267622947693, + "epoch": 1.9937656202795857, + "grad_norm": 0.6994895339012146, + "learning_rate": 2.0004791066290583e-06, + "loss": 1.5275, + "mean_token_accuracy": 0.648076981306076, + "num_tokens": 3042114828.0, + "step": 18149 + }, + { + "entropy": 1.7273716727892559, + "epoch": 1.9938754771909588, + "grad_norm": 0.7003465890884399, + "learning_rate": 2.000462728272874e-06, + "loss": 1.3692, + "mean_token_accuracy": 0.6554417014122009, + "num_tokens": 3042260808.0, + "step": 18150 + }, + { + "entropy": 1.640130211909612, + "epoch": 1.9939853341023317, + "grad_norm": 0.7557557821273804, + "learning_rate": 2.000446634747988e-06, + "loss": 1.4792, + "mean_token_accuracy": 0.6581311722596487, + "num_tokens": 3042433542.0, + "step": 18151 + }, + { + "entropy": 1.6444950600465138, + "epoch": 1.9940951910137046, + "grad_norm": 0.6619048714637756, + "learning_rate": 2.0004308260549116e-06, + "loss": 1.4316, + "mean_token_accuracy": 0.6499272088209788, + "num_tokens": 3042630077.0, + "step": 18152 + }, + { + "entropy": 1.7205499112606049, + "epoch": 1.9942050479250777, + "grad_norm": 0.7403351664543152, + "learning_rate": 2.0004153021941435e-06, + "loss": 1.2772, + "mean_token_accuracy": 0.6660661300023397, + "num_tokens": 3042738444.0, + "step": 18153 + }, + { + "entropy": 1.642647961775462, + "epoch": 1.9943149048364506, + "grad_norm": 0.6520124673843384, + "learning_rate": 2.0004000631661763e-06, + "loss": 1.3813, + "mean_token_accuracy": 0.6548943569262823, + "num_tokens": 3042898607.0, + "step": 18154 + }, + { + "entropy": 1.762593497832616, + "epoch": 1.9944247617478235, + "grad_norm": 0.6629482507705688, + "learning_rate": 2.0003851089714914e-06, + "loss": 1.3936, + "mean_token_accuracy": 0.6553806563218435, + "num_tokens": 3043072996.0, + "step": 18155 + }, + { + "entropy": 1.6303976476192474, + "epoch": 1.9945346186591963, + "grad_norm": 0.6785465478897095, + "learning_rate": 2.000370439610563e-06, + "loss": 1.2432, + "mean_token_accuracy": 0.6743980348110199, + "num_tokens": 3043205963.0, + "step": 18156 + }, + { + "entropy": 1.694928377866745, + "epoch": 1.9946444755705692, + "grad_norm": 0.622810959815979, + "learning_rate": 2.000356055083854e-06, + "loss": 1.4332, + "mean_token_accuracy": 0.6510835389296213, + "num_tokens": 3043413561.0, + "step": 18157 + }, + { + "entropy": 1.6337503294150035, + "epoch": 1.9947543324819423, + "grad_norm": 0.5835052132606506, + "learning_rate": 2.000341955391821e-06, + "loss": 1.4437, + "mean_token_accuracy": 0.646688754359881, + "num_tokens": 3043625964.0, + "step": 18158 + }, + { + "entropy": 1.7825499673684437, + "epoch": 1.9948641893933152, + "grad_norm": 0.652887761592865, + "learning_rate": 2.0003281405349095e-06, + "loss": 1.6008, + "mean_token_accuracy": 0.6128579080104828, + "num_tokens": 3043865035.0, + "step": 18159 + }, + { + "entropy": 1.7153000434239705, + "epoch": 1.994974046304688, + "grad_norm": 0.6086410880088806, + "learning_rate": 2.0003146105135573e-06, + "loss": 1.1742, + "mean_token_accuracy": 0.6773505012194315, + "num_tokens": 3044061042.0, + "step": 18160 + }, + { + "entropy": 1.6450629631678264, + "epoch": 1.9950839032160612, + "grad_norm": 0.6745823621749878, + "learning_rate": 2.0003013653281926e-06, + "loss": 1.3636, + "mean_token_accuracy": 0.6562095880508423, + "num_tokens": 3044210757.0, + "step": 18161 + }, + { + "entropy": 1.7687378525733948, + "epoch": 1.9951937601274339, + "grad_norm": 0.7521758675575256, + "learning_rate": 2.000288404979235e-06, + "loss": 1.5204, + "mean_token_accuracy": 0.6322930653889974, + "num_tokens": 3044409106.0, + "step": 18162 + }, + { + "entropy": 1.7188294629255931, + "epoch": 1.995303617038807, + "grad_norm": 0.6091630458831787, + "learning_rate": 2.0002757294670926e-06, + "loss": 1.337, + "mean_token_accuracy": 0.6657722691694895, + "num_tokens": 3044596447.0, + "step": 18163 + }, + { + "entropy": 1.7012285093466442, + "epoch": 1.9954134739501799, + "grad_norm": 0.6440872550010681, + "learning_rate": 2.0002633387921676e-06, + "loss": 1.3507, + "mean_token_accuracy": 0.6604795008897781, + "num_tokens": 3044790314.0, + "step": 18164 + }, + { + "entropy": 1.7051705221335094, + "epoch": 1.9955233308615528, + "grad_norm": 0.7219937443733215, + "learning_rate": 2.000251232954854e-06, + "loss": 1.291, + "mean_token_accuracy": 0.6619381904602051, + "num_tokens": 3044914619.0, + "step": 18165 + }, + { + "entropy": 1.6888903081417084, + "epoch": 1.9956331877729259, + "grad_norm": 0.637630820274353, + "learning_rate": 2.0002394119555326e-06, + "loss": 1.389, + "mean_token_accuracy": 0.6619627823432287, + "num_tokens": 3045098000.0, + "step": 18166 + }, + { + "entropy": 1.7443795800209045, + "epoch": 1.9957430446842988, + "grad_norm": 0.6435023546218872, + "learning_rate": 2.000227875794579e-06, + "loss": 1.5764, + "mean_token_accuracy": 0.620304211974144, + "num_tokens": 3045323953.0, + "step": 18167 + }, + { + "entropy": 1.6719048420588176, + "epoch": 1.9958529015956716, + "grad_norm": 0.6005743741989136, + "learning_rate": 2.0002166244723573e-06, + "loss": 1.4482, + "mean_token_accuracy": 0.653143455584844, + "num_tokens": 3045542928.0, + "step": 18168 + }, + { + "entropy": 1.713512271642685, + "epoch": 1.9959627585070445, + "grad_norm": 0.7789661884307861, + "learning_rate": 2.000205657989225e-06, + "loss": 1.3143, + "mean_token_accuracy": 0.6610560963551203, + "num_tokens": 3045703434.0, + "step": 18169 + }, + { + "entropy": 1.7187098960081737, + "epoch": 1.9960726154184174, + "grad_norm": 0.6709672808647156, + "learning_rate": 2.000194976345527e-06, + "loss": 1.3616, + "mean_token_accuracy": 0.6639542629321417, + "num_tokens": 3045898367.0, + "step": 18170 + }, + { + "entropy": 1.7733473777770996, + "epoch": 1.9961824723297905, + "grad_norm": 0.7715442776679993, + "learning_rate": 2.0001845795416034e-06, + "loss": 1.4116, + "mean_token_accuracy": 0.6565881470839182, + "num_tokens": 3046044074.0, + "step": 18171 + }, + { + "entropy": 1.7301356891791027, + "epoch": 1.9962923292411634, + "grad_norm": 0.8577042818069458, + "learning_rate": 2.0001744675777812e-06, + "loss": 1.4882, + "mean_token_accuracy": 0.6440813392400742, + "num_tokens": 3046206087.0, + "step": 18172 + }, + { + "entropy": 1.6903728346029918, + "epoch": 1.9964021861525363, + "grad_norm": 0.5791963934898376, + "learning_rate": 2.000164640454383e-06, + "loss": 1.3326, + "mean_token_accuracy": 0.661042665441831, + "num_tokens": 3046378472.0, + "step": 18173 + }, + { + "entropy": 1.687019368012746, + "epoch": 1.9965120430639094, + "grad_norm": 0.6625068187713623, + "learning_rate": 2.000155098171718e-06, + "loss": 1.4011, + "mean_token_accuracy": 0.6618767331043879, + "num_tokens": 3046510726.0, + "step": 18174 + }, + { + "entropy": 1.759124368429184, + "epoch": 1.996621899975282, + "grad_norm": 0.7094929218292236, + "learning_rate": 2.000145840730089e-06, + "loss": 1.3934, + "mean_token_accuracy": 0.6440586149692535, + "num_tokens": 3046658513.0, + "step": 18175 + }, + { + "entropy": 1.691829909880956, + "epoch": 1.9967317568866552, + "grad_norm": 0.690881073474884, + "learning_rate": 2.000136868129788e-06, + "loss": 1.428, + "mean_token_accuracy": 0.6513862907886505, + "num_tokens": 3046862461.0, + "step": 18176 + }, + { + "entropy": 1.7729640205701191, + "epoch": 1.996841613798028, + "grad_norm": 0.7034747004508972, + "learning_rate": 2.0001281803711007e-06, + "loss": 1.358, + "mean_token_accuracy": 0.6604643066724142, + "num_tokens": 3047004414.0, + "step": 18177 + }, + { + "entropy": 1.694592813650767, + "epoch": 1.996951470709401, + "grad_norm": 0.6533588767051697, + "learning_rate": 2.0001197774543004e-06, + "loss": 1.3068, + "mean_token_accuracy": 0.6626057177782059, + "num_tokens": 3047148671.0, + "step": 18178 + }, + { + "entropy": 1.7134621640046437, + "epoch": 1.997061327620774, + "grad_norm": 0.5928846597671509, + "learning_rate": 2.000111659379654e-06, + "loss": 1.4752, + "mean_token_accuracy": 0.6483317414919535, + "num_tokens": 3047315839.0, + "step": 18179 + }, + { + "entropy": 1.7521715660889943, + "epoch": 1.997171184532147, + "grad_norm": 0.7548753619194031, + "learning_rate": 2.000103826147418e-06, + "loss": 1.3642, + "mean_token_accuracy": 0.6631999164819717, + "num_tokens": 3047443079.0, + "step": 18180 + }, + { + "entropy": 1.747659554084142, + "epoch": 1.9972810414435198, + "grad_norm": 0.7508565187454224, + "learning_rate": 2.0000962777578404e-06, + "loss": 1.3458, + "mean_token_accuracy": 0.659741202990214, + "num_tokens": 3047566548.0, + "step": 18181 + }, + { + "entropy": 1.700925201177597, + "epoch": 1.997390898354893, + "grad_norm": 0.7700899839401245, + "learning_rate": 2.0000890142111605e-06, + "loss": 1.1955, + "mean_token_accuracy": 0.6900986135005951, + "num_tokens": 3047664506.0, + "step": 18182 + }, + { + "entropy": 1.6882247428099315, + "epoch": 1.9975007552662656, + "grad_norm": 0.7430019378662109, + "learning_rate": 2.0000820355076072e-06, + "loss": 1.4729, + "mean_token_accuracy": 0.6475649029016495, + "num_tokens": 3047869269.0, + "step": 18183 + }, + { + "entropy": 1.6784042815367382, + "epoch": 1.9976106121776387, + "grad_norm": 0.618037760257721, + "learning_rate": 2.000075341647402e-06, + "loss": 1.3216, + "mean_token_accuracy": 0.6691079139709473, + "num_tokens": 3048027510.0, + "step": 18184 + }, + { + "entropy": 1.718860884507497, + "epoch": 1.9977204690890116, + "grad_norm": 0.7390254139900208, + "learning_rate": 2.0000689326307567e-06, + "loss": 1.2494, + "mean_token_accuracy": 0.6714848627646764, + "num_tokens": 3048152184.0, + "step": 18185 + }, + { + "entropy": 1.6419156392415364, + "epoch": 1.9978303260003845, + "grad_norm": 0.7709030508995056, + "learning_rate": 2.000062808457875e-06, + "loss": 1.3404, + "mean_token_accuracy": 0.6696422000726064, + "num_tokens": 3048287917.0, + "step": 18186 + }, + { + "entropy": 1.68554683526357, + "epoch": 1.9979401829117576, + "grad_norm": 0.591776967048645, + "learning_rate": 2.0000569691289495e-06, + "loss": 1.4674, + "mean_token_accuracy": 0.6471003343661627, + "num_tokens": 3048539631.0, + "step": 18187 + }, + { + "entropy": 1.698345571756363, + "epoch": 1.9980500398231302, + "grad_norm": 0.7942776083946228, + "learning_rate": 2.0000514146441654e-06, + "loss": 1.5863, + "mean_token_accuracy": 0.6430133432149887, + "num_tokens": 3048720202.0, + "step": 18188 + }, + { + "entropy": 1.6566093067328136, + "epoch": 1.9981598967345033, + "grad_norm": 0.6725602746009827, + "learning_rate": 2.0000461450036985e-06, + "loss": 1.2891, + "mean_token_accuracy": 0.6772060046593348, + "num_tokens": 3048914194.0, + "step": 18189 + }, + { + "entropy": 1.7242831885814667, + "epoch": 1.9982697536458762, + "grad_norm": 0.6108221411705017, + "learning_rate": 2.0000411602077163e-06, + "loss": 1.461, + "mean_token_accuracy": 0.6506116489569346, + "num_tokens": 3049099713.0, + "step": 18190 + }, + { + "entropy": 1.7347515324751537, + "epoch": 1.998379610557249, + "grad_norm": 0.7071492075920105, + "learning_rate": 2.0000364602563753e-06, + "loss": 1.3625, + "mean_token_accuracy": 0.6629375716050466, + "num_tokens": 3049259887.0, + "step": 18191 + }, + { + "entropy": 1.7372618913650513, + "epoch": 1.9984894674686222, + "grad_norm": 0.6914885640144348, + "learning_rate": 2.000032045149825e-06, + "loss": 1.4312, + "mean_token_accuracy": 0.650190552075704, + "num_tokens": 3049424470.0, + "step": 18192 + }, + { + "entropy": 1.7188272774219513, + "epoch": 1.998599324379995, + "grad_norm": 0.7079371213912964, + "learning_rate": 2.0000279148882053e-06, + "loss": 1.6254, + "mean_token_accuracy": 0.6389360229174296, + "num_tokens": 3049646174.0, + "step": 18193 + }, + { + "entropy": 1.6750788291295369, + "epoch": 1.998709181291368, + "grad_norm": 0.7202898859977722, + "learning_rate": 2.000024069471646e-06, + "loss": 1.2239, + "mean_token_accuracy": 0.6829964170853297, + "num_tokens": 3049800566.0, + "step": 18194 + }, + { + "entropy": 1.686879813671112, + "epoch": 1.998819038202741, + "grad_norm": 0.7090538740158081, + "learning_rate": 2.0000205089002696e-06, + "loss": 1.3509, + "mean_token_accuracy": 0.67240938047568, + "num_tokens": 3049949036.0, + "step": 18195 + }, + { + "entropy": 1.6966327925523121, + "epoch": 1.9989288951141138, + "grad_norm": 0.6627102494239807, + "learning_rate": 2.000017233174189e-06, + "loss": 1.3099, + "mean_token_accuracy": 0.6580508897701899, + "num_tokens": 3050113118.0, + "step": 18196 + }, + { + "entropy": 1.7072912355264027, + "epoch": 1.9990387520254869, + "grad_norm": 0.689150869846344, + "learning_rate": 2.0000142422935068e-06, + "loss": 1.4376, + "mean_token_accuracy": 0.6440875480572382, + "num_tokens": 3050310659.0, + "step": 18197 + }, + { + "entropy": 1.6617674827575684, + "epoch": 1.9991486089368598, + "grad_norm": 0.6578044295310974, + "learning_rate": 2.000011536258319e-06, + "loss": 1.3804, + "mean_token_accuracy": 0.6541756838560104, + "num_tokens": 3050446831.0, + "step": 18198 + }, + { + "entropy": 1.661964366833369, + "epoch": 1.9992584658482326, + "grad_norm": 0.7953215837478638, + "learning_rate": 2.00000911506871e-06, + "loss": 1.3325, + "mean_token_accuracy": 0.6677337139844894, + "num_tokens": 3050595454.0, + "step": 18199 + }, + { + "entropy": 1.7314130862553914, + "epoch": 1.9993683227596057, + "grad_norm": 0.6476884484291077, + "learning_rate": 2.0000069787247574e-06, + "loss": 1.282, + "mean_token_accuracy": 0.6776565164327621, + "num_tokens": 3050722494.0, + "step": 18200 + }, + { + "entropy": 1.7134939829508464, + "epoch": 1.9994781796709784, + "grad_norm": 0.7247772812843323, + "learning_rate": 2.0000051272265275e-06, + "loss": 1.4396, + "mean_token_accuracy": 0.6534018168846766, + "num_tokens": 3050881167.0, + "step": 18201 + }, + { + "entropy": 1.6716128786404927, + "epoch": 1.9995880365823515, + "grad_norm": 0.6074260473251343, + "learning_rate": 2.000003560574081e-06, + "loss": 1.3665, + "mean_token_accuracy": 0.6494092990954717, + "num_tokens": 3051103681.0, + "step": 18202 + }, + { + "entropy": 1.7386046648025513, + "epoch": 1.9996978934937244, + "grad_norm": 0.7610368728637695, + "learning_rate": 2.000002278767466e-06, + "loss": 1.4719, + "mean_token_accuracy": 0.6267879009246826, + "num_tokens": 3051336600.0, + "step": 18203 + }, + { + "entropy": 1.6595459183057149, + "epoch": 1.9998077504050973, + "grad_norm": 1.1902509927749634, + "learning_rate": 2.000001281806723e-06, + "loss": 1.319, + "mean_token_accuracy": 0.6607537617286047, + "num_tokens": 3051506942.0, + "step": 18204 + }, + { + "entropy": 1.6532653272151947, + "epoch": 1.9999176073164704, + "grad_norm": 0.6938499212265015, + "learning_rate": 2.000000569691885e-06, + "loss": 1.3618, + "mean_token_accuracy": 0.6603017499049505, + "num_tokens": 3051631616.0, + "step": 18205 + }, + { + "entropy": 1.7311393817265828, + "epoch": 2.0, + "grad_norm": 0.7615280747413635, + "learning_rate": 2.0000001424229725e-06, + "loss": 1.4697, + "mean_token_accuracy": 0.6398886442184448, + "num_tokens": 3051740039.0, + "step": 18206 + }, + { + "epoch": 2.0, + "step": 18206, + "total_flos": 3.1404917922249834e+19, + "train_loss": 1.4016287254459527, + "train_runtime": 304386.5826, + "train_samples_per_second": 7.177, + "train_steps_per_second": 0.06 + } + ], + "logging_steps": 1, + "max_steps": 18206, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 2000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 3.1404917922249834e+19, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}