[ { "loss": 1.5327, "grad_norm": 0.5706859827041626, "learning_rate": 4.5e-05, "entropy": 1.2147093176841737, "num_tokens": 156309.0, "mean_token_accuracy": 0.659215685725212, "epoch": 0.03355704697986577, "step": 10 }, { "loss": 1.1979, "grad_norm": 1.02685546875, "learning_rate": 9.5e-05, "entropy": 1.2502837777137756, "num_tokens": 307512.0, "mean_token_accuracy": 0.7190202981233597, "epoch": 0.06711409395973154, "step": 20 }, { "loss": 0.6687, "grad_norm": 0.5571576356887817, "learning_rate": 9.999075138471951e-05, "entropy": 0.6448976010084152, "num_tokens": 461226.0, "mean_token_accuracy": 0.8424648404121399, "epoch": 0.10067114093959731, "step": 30 }, { "loss": 0.4812, "grad_norm": 0.38987967371940613, "learning_rate": 9.995878525539525e-05, "entropy": 0.47405652403831483, "num_tokens": 618054.0, "mean_token_accuracy": 0.8851036787033081, "epoch": 0.1342281879194631, "step": 40 }, { "loss": 0.3701, "grad_norm": 0.2643287777900696, "learning_rate": 9.990400202763563e-05, "entropy": 0.3693989284336567, "num_tokens": 775209.0, "mean_token_accuracy": 0.9097828209400177, "epoch": 0.16778523489932887, "step": 50 }, { "loss": 0.3193, "grad_norm": 0.27373382449150085, "learning_rate": 9.982642672195092e-05, "entropy": 0.32156657576560976, "num_tokens": 931365.0, "mean_token_accuracy": 0.920613020658493, "epoch": 0.20134228187919462, "step": 60 }, { "loss": 0.2866, "grad_norm": 0.3094562292098999, "learning_rate": 9.972609476841367e-05, "entropy": 0.292499927431345, "num_tokens": 1086695.0, "mean_token_accuracy": 0.9282213032245636, "epoch": 0.2348993288590604, "step": 70 }, { "loss": 0.2489, "grad_norm": 0.2749999761581421, "learning_rate": 9.960305199047712e-05, "entropy": 0.2526983417570591, "num_tokens": 1241470.0, "mean_token_accuracy": 0.9362666577100753, "epoch": 0.2684563758389262, "step": 80 }, { "loss": 0.2459, "grad_norm": 0.2721326947212219, "learning_rate": 9.945735458404681e-05, "entropy": 0.24655402898788453, "num_tokens": 1396709.0, "mean_token_accuracy": 0.9367562144994735, "epoch": 0.30201342281879195, "step": 90 }, { "loss": 0.2435, "grad_norm": 0.21973158419132233, "learning_rate": 9.928906909181481e-05, "entropy": 0.24917888268828392, "num_tokens": 1555025.0, "mean_token_accuracy": 0.9364570289850235, "epoch": 0.33557046979865773, "step": 100 }, { "loss": 0.2347, "grad_norm": 0.26128071546554565, "learning_rate": 9.909827237286849e-05, "entropy": 0.23597020357847215, "num_tokens": 1706763.0, "mean_token_accuracy": 0.9403788715600967, "epoch": 0.3691275167785235, "step": 110 }, { "loss": 0.2478, "grad_norm": 0.20566503703594208, "learning_rate": 9.888505156758759e-05, "entropy": 0.2498910292983055, "num_tokens": 1867753.0, "mean_token_accuracy": 0.9367677628993988, "epoch": 0.40268456375838924, "step": 120 }, { "loss": 0.2417, "grad_norm": 0.190927654504776, "learning_rate": 9.864950405784551e-05, "entropy": 0.24509735256433487, "num_tokens": 2027436.0, "mean_token_accuracy": 0.9376339435577392, "epoch": 0.436241610738255, "step": 130 }, { "loss": 0.2196, "grad_norm": 0.1996077448129654, "learning_rate": 9.839173742253334e-05, "entropy": 0.22244628816843032, "num_tokens": 2182978.0, "mean_token_accuracy": 0.9424097180366516, "epoch": 0.4697986577181208, "step": 140 }, { "loss": 0.2534, "grad_norm": 0.21034187078475952, "learning_rate": 9.811186938842645e-05, "entropy": 0.2574092894792557, "num_tokens": 2343108.0, "mean_token_accuracy": 0.9341461777687072, "epoch": 0.5033557046979866, "step": 150 }, { "loss": 0.2369, "grad_norm": 0.20116521418094635, "learning_rate": 9.781002777641664e-05, "entropy": 0.23710015863180162, "num_tokens": 2503180.0, "mean_token_accuracy": 0.9385264277458191, "epoch": 0.5369127516778524, "step": 160 }, { "loss": 0.2194, "grad_norm": 0.20917288959026337, "learning_rate": 9.748635044313386e-05, "entropy": 0.22300637513399124, "num_tokens": 2658840.0, "mean_token_accuracy": 0.9410306721925735, "epoch": 0.5704697986577181, "step": 170 }, { "loss": 0.2295, "grad_norm": 0.19810253381729126, "learning_rate": 9.714098521798465e-05, "entropy": 0.23066828176379203, "num_tokens": 2818656.0, "mean_token_accuracy": 0.9388932436704636, "epoch": 0.6040268456375839, "step": 180 }, { "loss": 0.231, "grad_norm": 0.17845278978347778, "learning_rate": 9.677408983563565e-05, "entropy": 0.23543039262294768, "num_tokens": 2975473.0, "mean_token_accuracy": 0.9391510993242264, "epoch": 0.6375838926174496, "step": 190 }, { "loss": 0.2152, "grad_norm": 0.23437312245368958, "learning_rate": 9.638583186397331e-05, "entropy": 0.21993101984262467, "num_tokens": 3130452.0, "mean_token_accuracy": 0.9423492521047592, "epoch": 0.6711409395973155, "step": 200 }, { "loss": 0.204, "grad_norm": 0.2377660572528839, "learning_rate": 9.597638862757255e-05, "entropy": 0.2064872995018959, "num_tokens": 3287490.0, "mean_token_accuracy": 0.9451317518949509, "epoch": 0.7046979865771812, "step": 210 }, { "loss": 0.2158, "grad_norm": 0.17862877249717712, "learning_rate": 9.554594712670926e-05, "entropy": 0.2166791968047619, "num_tokens": 3446266.0, "mean_token_accuracy": 0.9421544283628464, "epoch": 0.738255033557047, "step": 220 }, { "loss": 0.2253, "grad_norm": 0.1511947214603424, "learning_rate": 9.509470395195399e-05, "entropy": 0.22676953300833702, "num_tokens": 3602351.0, "mean_token_accuracy": 0.939826962351799, "epoch": 0.7718120805369127, "step": 230 }, { "loss": 0.2201, "grad_norm": 0.26649826765060425, "learning_rate": 9.46228651943853e-05, "entropy": 0.22365730181336402, "num_tokens": 3757932.0, "mean_token_accuracy": 0.9410541921854019, "epoch": 0.8053691275167785, "step": 240 }, { "loss": 0.2026, "grad_norm": 0.19634084403514862, "learning_rate": 9.413064635146418e-05, "entropy": 0.20426617376506329, "num_tokens": 3912346.0, "mean_token_accuracy": 0.9445015460252761, "epoch": 0.8389261744966443, "step": 250 }, { "loss": 0.2201, "grad_norm": 0.19683024287223816, "learning_rate": 9.361827222861241e-05, "entropy": 0.22703441381454467, "num_tokens": 4071117.0, "mean_token_accuracy": 0.9421917259693146, "epoch": 0.87248322147651, "step": 260 }, { "loss": 0.2048, "grad_norm": 0.184284046292305, "learning_rate": 9.308597683653975e-05, "entropy": 0.20578452795743943, "num_tokens": 4226750.0, "mean_token_accuracy": 0.9451743960380554, "epoch": 0.9060402684563759, "step": 270 }, { "loss": 0.2181, "grad_norm": 0.19832965731620789, "learning_rate": 9.253400328436699e-05, "entropy": 0.22092150747776032, "num_tokens": 4385096.0, "mean_token_accuracy": 0.9426127344369888, "epoch": 0.9395973154362416, "step": 280 }, { "loss": 0.2178, "grad_norm": 0.16237328946590424, "learning_rate": 9.196260366859342e-05, "entropy": 0.22047539427876472, "num_tokens": 4544552.0, "mean_token_accuracy": 0.9419336885213851, "epoch": 0.9731543624161074, "step": 290 }, { "loss": 0.2188, "grad_norm": 0.19606098532676697, "learning_rate": 9.137203895795983e-05, "entropy": 0.2207832932472229, "num_tokens": 4701426.0, "mean_token_accuracy": 0.941154745221138, "epoch": 1.0067114093959733, "step": 300 }, { "loss": 0.1948, "grad_norm": 0.2281929850578308, "learning_rate": 9.076257887425923e-05, "entropy": 0.19738901741802692, "num_tokens": 4858846.0, "mean_token_accuracy": 0.9471817642450333, "epoch": 1.0402684563758389, "step": 310 }, { "loss": 0.2037, "grad_norm": 0.19096426665782928, "learning_rate": 9.01345017691499e-05, "entropy": 0.2047410562634468, "num_tokens": 5015590.0, "mean_token_accuracy": 0.9454032570123673, "epoch": 1.0738255033557047, "step": 320 }, { "loss": 0.2023, "grad_norm": 0.19078002870082855, "learning_rate": 8.948809449702711e-05, "entropy": 0.21062878221273423, "num_tokens": 5173149.0, "mean_token_accuracy": 0.9446896910667419, "epoch": 1.1073825503355705, "step": 330 }, { "loss": 0.1997, "grad_norm": 0.26884037256240845, "learning_rate": 8.882365228401139e-05, "entropy": 0.20285834297537803, "num_tokens": 5331307.0, "mean_token_accuracy": 0.9449720442295074, "epoch": 1.1409395973154361, "step": 340 }, { "loss": 0.1967, "grad_norm": 0.23782381415367126, "learning_rate": 8.814147859311332e-05, "entropy": 0.1966269500553608, "num_tokens": 5486558.0, "mean_token_accuracy": 0.9466675192117691, "epoch": 1.174496644295302, "step": 350 }, { "loss": 0.2135, "grad_norm": 0.19140051305294037, "learning_rate": 8.744188498563641e-05, "entropy": 0.21856041625142097, "num_tokens": 5643311.0, "mean_token_accuracy": 0.9413786977529526, "epoch": 1.2080536912751678, "step": 360 }, { "loss": 0.1956, "grad_norm": 0.2523305118083954, "learning_rate": 8.672519097888126e-05, "entropy": 0.19831475540995597, "num_tokens": 5798511.0, "mean_token_accuracy": 0.946366423368454, "epoch": 1.2416107382550337, "step": 370 }, { "loss": 0.1895, "grad_norm": 0.16881771385669708, "learning_rate": 8.599172390021615e-05, "entropy": 0.1935427539050579, "num_tokens": 5954868.0, "mean_token_accuracy": 0.9482783049345016, "epoch": 1.2751677852348993, "step": 380 }, { "loss": 0.1857, "grad_norm": 0.15226209163665771, "learning_rate": 8.524181873758059e-05, "entropy": 0.18582973293960095, "num_tokens": 6108894.0, "mean_token_accuracy": 0.9488643199205399, "epoch": 1.308724832214765, "step": 390 }, { "loss": 0.2078, "grad_norm": 0.19812646508216858, "learning_rate": 8.447581798649014e-05, "entropy": 0.20552278831601142, "num_tokens": 6267764.0, "mean_token_accuracy": 0.9442088425159454, "epoch": 1.342281879194631, "step": 400 }, { "loss": 0.2074, "grad_norm": 0.188148632645607, "learning_rate": 8.369407149361241e-05, "entropy": 0.21127415373921393, "num_tokens": 6424716.0, "mean_token_accuracy": 0.9449205726385117, "epoch": 1.3758389261744965, "step": 410 }, { "loss": 0.1948, "grad_norm": 0.19974718987941742, "learning_rate": 8.289693629698564e-05, "entropy": 0.19758303910493852, "num_tokens": 6579760.0, "mean_token_accuracy": 0.9469632744789124, "epoch": 1.4093959731543624, "step": 420 }, { "loss": 0.2144, "grad_norm": 0.2088315635919571, "learning_rate": 8.208477646295277e-05, "entropy": 0.21340604722499848, "num_tokens": 6737911.0, "mean_token_accuracy": 0.9427065312862396, "epoch": 1.4429530201342282, "step": 430 }, { "loss": 0.2145, "grad_norm": 0.31156837940216064, "learning_rate": 8.125796291988577e-05, "entropy": 0.21349589377641678, "num_tokens": 6894903.0, "mean_token_accuracy": 0.9425385951995849, "epoch": 1.476510067114094, "step": 440 }, { "loss": 0.2159, "grad_norm": 0.18016791343688965, "learning_rate": 8.041687328877567e-05, "entropy": 0.2204777255654335, "num_tokens": 7054943.0, "mean_token_accuracy": 0.9415547668933868, "epoch": 1.5100671140939599, "step": 450 }, { "loss": 0.202, "grad_norm": 0.2058987319469452, "learning_rate": 7.956189171076616e-05, "entropy": 0.20117317102849483, "num_tokens": 7211170.0, "mean_token_accuracy": 0.9456082373857498, "epoch": 1.5436241610738255, "step": 460 }, { "loss": 0.1939, "grad_norm": 0.15302836894989014, "learning_rate": 7.869340867170928e-05, "entropy": 0.19479563012719153, "num_tokens": 7369121.0, "mean_token_accuracy": 0.9471449553966522, "epoch": 1.5771812080536913, "step": 470 }, { "loss": 0.1886, "grad_norm": 0.16370812058448792, "learning_rate": 7.781182082382325e-05, "entropy": 0.18975077904760837, "num_tokens": 7525693.0, "mean_token_accuracy": 0.9485057532787323, "epoch": 1.610738255033557, "step": 480 }, { "loss": 0.2023, "grad_norm": 0.1960698664188385, "learning_rate": 7.691753080453412e-05, "entropy": 0.2045104220509529, "num_tokens": 7683086.0, "mean_token_accuracy": 0.9456699937582016, "epoch": 1.6442953020134228, "step": 490 }, { "loss": 0.1991, "grad_norm": 0.18973101675510406, "learning_rate": 7.60109470525839e-05, "entropy": 0.20317464768886567, "num_tokens": 7840651.0, "mean_token_accuracy": 0.9453895062208175, "epoch": 1.6778523489932886, "step": 500 }, { "loss": 0.1939, "grad_norm": 0.19162558019161224, "learning_rate": 7.509248362148889e-05, "entropy": 0.19502840861678122, "num_tokens": 7997627.0, "mean_token_accuracy": 0.9481672704219818, "epoch": 1.7114093959731544, "step": 510 }, { "loss": 0.1921, "grad_norm": 0.18042920529842377, "learning_rate": 7.416255999043401e-05, "entropy": 0.19183044098317623, "num_tokens": 8153075.0, "mean_token_accuracy": 0.9467999368906022, "epoch": 1.7449664429530203, "step": 520 }, { "loss": 0.1788, "grad_norm": 0.1608305722475052, "learning_rate": 7.322160087268877e-05, "entropy": 0.18251866959035395, "num_tokens": 8307904.0, "mean_token_accuracy": 0.9506349295377732, "epoch": 1.778523489932886, "step": 530 }, { "loss": 0.1808, "grad_norm": 0.18992868065834045, "learning_rate": 7.227003602163295e-05, "entropy": 0.1834562122821808, "num_tokens": 8464236.0, "mean_token_accuracy": 0.9505033463239669, "epoch": 1.8120805369127517, "step": 540 }, { "loss": 0.1868, "grad_norm": 0.21314968168735504, "learning_rate": 7.130830003448032e-05, "entropy": 0.1886416744440794, "num_tokens": 8622442.0, "mean_token_accuracy": 0.9492894530296325, "epoch": 1.8456375838926173, "step": 550 }, { "loss": 0.19, "grad_norm": 0.196988046169281, "learning_rate": 7.033683215379002e-05, "entropy": 0.1898799568414688, "num_tokens": 8778712.0, "mean_token_accuracy": 0.9487443953752518, "epoch": 1.8791946308724832, "step": 560 }, { "loss": 0.1872, "grad_norm": 0.19480957090854645, "learning_rate": 6.935607606685642e-05, "entropy": 0.1890778660774231, "num_tokens": 8934261.0, "mean_token_accuracy": 0.9492935538291931, "epoch": 1.912751677852349, "step": 570 }, { "loss": 0.1923, "grad_norm": 0.1902640014886856, "learning_rate": 6.836647970306894e-05, "entropy": 0.1931080285459757, "num_tokens": 9093573.0, "mean_token_accuracy": 0.9487350136041641, "epoch": 1.9463087248322148, "step": 580 }, { "loss": 0.1817, "grad_norm": 0.19666995108127594, "learning_rate": 6.736849502933452e-05, "entropy": 0.18354742750525474, "num_tokens": 9247665.0, "mean_token_accuracy": 0.9502300530672073, "epoch": 1.9798657718120807, "step": 590 }, { "loss": 0.1859, "grad_norm": 0.2059757262468338, "learning_rate": 6.636257784365584e-05, "entropy": 0.18999958783388138, "num_tokens": 9402433.0, "mean_token_accuracy": 0.9492688685655594, "epoch": 2.0134228187919465, "step": 600 }, { "loss": 0.1847, "grad_norm": 0.1790536344051361, "learning_rate": 6.53491875669601e-05, "entropy": 0.18768733143806457, "num_tokens": 9559144.0, "mean_token_accuracy": 0.949212983250618, "epoch": 2.046979865771812, "step": 610 }, { "loss": 0.1642, "grad_norm": 0.21339593827724457, "learning_rate": 6.432878703327298e-05, "entropy": 0.16429513394832612, "num_tokens": 9714865.0, "mean_token_accuracy": 0.9541611701250077, "epoch": 2.0805369127516777, "step": 620 }, { "loss": 0.1855, "grad_norm": 0.21740688383579254, "learning_rate": 6.330184227833376e-05, "entropy": 0.18604125529527665, "num_tokens": 9872250.0, "mean_token_accuracy": 0.9479648023843765, "epoch": 2.1140939597315436, "step": 630 }, { "loss": 0.1898, "grad_norm": 0.1908544898033142, "learning_rate": 6.226882232674825e-05, "entropy": 0.19123471677303314, "num_tokens": 10026954.0, "mean_token_accuracy": 0.9476721823215485, "epoch": 2.1476510067114094, "step": 640 }, { "loss": 0.1652, "grad_norm": 0.2003369778394699, "learning_rate": 6.123019897777657e-05, "entropy": 0.16700895316898823, "num_tokens": 10177873.0, "mean_token_accuracy": 0.9538896352052688, "epoch": 2.1812080536912752, "step": 650 }, { "loss": 0.2036, "grad_norm": 0.22211240231990814, "learning_rate": 6.0186446589853784e-05, "entropy": 0.20571780651807786, "num_tokens": 10338481.0, "mean_token_accuracy": 0.9439575403928757, "epoch": 2.214765100671141, "step": 660 }, { "loss": 0.2002, "grad_norm": 0.23921112716197968, "learning_rate": 5.9138041863941616e-05, "entropy": 0.19964271709322928, "num_tokens": 10495720.0, "mean_token_accuracy": 0.9459815502166748, "epoch": 2.248322147651007, "step": 670 }, { "loss": 0.1989, "grad_norm": 0.20520520210266113, "learning_rate": 5.808546362581032e-05, "entropy": 0.20052680410444737, "num_tokens": 10655224.0, "mean_token_accuracy": 0.9459189057350159, "epoch": 2.2818791946308723, "step": 680 }, { "loss": 0.1803, "grad_norm": 0.20859530568122864, "learning_rate": 5.7029192607350146e-05, "entropy": 0.18437107987701892, "num_tokens": 10812393.0, "mean_token_accuracy": 0.9512098997831344, "epoch": 2.315436241610738, "step": 690 }, { "loss": 0.1625, "grad_norm": 0.21114759147167206, "learning_rate": 5.596971122701221e-05, "entropy": 0.16424274519085885, "num_tokens": 10965653.0, "mean_token_accuracy": 0.9543471187353134, "epoch": 2.348993288590604, "step": 700 }, { "loss": 0.164, "grad_norm": 0.19510741531848907, "learning_rate": 5.4907503369479116e-05, "entropy": 0.16679177805781364, "num_tokens": 11120539.0, "mean_token_accuracy": 0.954514691233635, "epoch": 2.38255033557047, "step": 710 }, { "loss": 0.1772, "grad_norm": 0.244350403547287, "learning_rate": 5.384305416466584e-05, "entropy": 0.17797317542135715, "num_tokens": 11275790.0, "mean_token_accuracy": 0.950087907910347, "epoch": 2.4161073825503356, "step": 720 }, { "loss": 0.1835, "grad_norm": 0.19845031201839447, "learning_rate": 5.2776849766152e-05, "entropy": 0.18638909384608268, "num_tokens": 11430204.0, "mean_token_accuracy": 0.9490520358085632, "epoch": 2.4496644295302015, "step": 730 }, { "loss": 0.1883, "grad_norm": 0.20039933919906616, "learning_rate": 5.170937712914655e-05, "entropy": 0.18770763501524926, "num_tokens": 11592345.0, "mean_token_accuracy": 0.9483928889036178, "epoch": 2.4832214765100673, "step": 740 }, { "loss": 0.1528, "grad_norm": 0.20752805471420288, "learning_rate": 5.064112378808637e-05, "entropy": 0.15445428304374217, "num_tokens": 11744570.0, "mean_token_accuracy": 0.9564135998487473, "epoch": 2.5167785234899327, "step": 750 }, { "loss": 0.1793, "grad_norm": 0.2342364341020584, "learning_rate": 4.957257763397024e-05, "entropy": 0.18113909810781478, "num_tokens": 11901062.0, "mean_token_accuracy": 0.9501085758209229, "epoch": 2.5503355704697985, "step": 760 }, { "loss": 0.1572, "grad_norm": 0.20083309710025787, "learning_rate": 4.850422669153009e-05, "entropy": 0.16168876476585864, "num_tokens": 12053753.0, "mean_token_accuracy": 0.9557742238044739, "epoch": 2.5838926174496644, "step": 770 }, { "loss": 0.1625, "grad_norm": 0.20891757309436798, "learning_rate": 4.743655889634105e-05, "entropy": 0.16362526044249534, "num_tokens": 12208995.0, "mean_token_accuracy": 0.9552077889442444, "epoch": 2.61744966442953, "step": 780 }, { "loss": 0.1567, "grad_norm": 0.21302001178264618, "learning_rate": 4.6370061871972326e-05, "entropy": 0.15993254519999028, "num_tokens": 12364530.0, "mean_token_accuracy": 0.955506107211113, "epoch": 2.651006711409396, "step": 790 }, { "loss": 0.1705, "grad_norm": 0.19647525250911713, "learning_rate": 4.530522270728048e-05, "entropy": 0.1731419090181589, "num_tokens": 12522512.0, "mean_token_accuracy": 0.9523056238889694, "epoch": 2.684563758389262, "step": 800 }, { "loss": 0.1656, "grad_norm": 0.23564544320106506, "learning_rate": 4.424252773394704e-05, "entropy": 0.16778925359249114, "num_tokens": 12678022.0, "mean_token_accuracy": 0.9539547711610794, "epoch": 2.7181208053691277, "step": 810 }, { "loss": 0.1862, "grad_norm": 0.19549453258514404, "learning_rate": 4.318246230436174e-05, "entropy": 0.1878517519682646, "num_tokens": 12838063.0, "mean_token_accuracy": 0.949842843413353, "epoch": 2.751677852348993, "step": 820 }, { "loss": 0.1687, "grad_norm": 0.23449555039405823, "learning_rate": 4.212551056995323e-05, "entropy": 0.17150973305106162, "num_tokens": 12993073.0, "mean_token_accuracy": 0.953241491317749, "epoch": 2.785234899328859, "step": 830 }, { "loss": 0.181, "grad_norm": 0.19433574378490448, "learning_rate": 4.107215526006817e-05, "entropy": 0.1791513752192259, "num_tokens": 13151000.0, "mean_token_accuracy": 0.9498830765485764, "epoch": 2.8187919463087248, "step": 840 }, { "loss": 0.1782, "grad_norm": 0.31362298130989075, "learning_rate": 4.00228774614998e-05, "entropy": 0.18104747980833052, "num_tokens": 13306740.0, "mean_token_accuracy": 0.9511455327272416, "epoch": 2.8523489932885906, "step": 850 }, { "loss": 0.1841, "grad_norm": 0.19440777599811554, "learning_rate": 3.897815639876673e-05, "entropy": 0.183981429412961, "num_tokens": 13465413.0, "mean_token_accuracy": 0.9499198466539382, "epoch": 2.8859060402684564, "step": 860 }, { "loss": 0.1727, "grad_norm": 0.22845511138439178, "learning_rate": 3.793846921524237e-05, "entropy": 0.17417334467172624, "num_tokens": 13625300.0, "mean_token_accuracy": 0.9522460520267486, "epoch": 2.9194630872483223, "step": 870 }, { "loss": 0.1917, "grad_norm": 0.17185327410697937, "learning_rate": 3.6904290755234604e-05, "entropy": 0.19201257564127444, "num_tokens": 13787633.0, "mean_token_accuracy": 0.9470277667045593, "epoch": 2.953020134228188, "step": 880 }, { "loss": 0.181, "grad_norm": 0.22177894413471222, "learning_rate": 3.587609334711576e-05, "entropy": 0.18371602855622768, "num_tokens": 13946986.0, "mean_token_accuracy": 0.9502452671527862, "epoch": 2.9865771812080535, "step": 890 }, { "loss": 0.1639, "grad_norm": 0.20057617127895355, "learning_rate": 3.48543465876014e-05, "entropy": 0.16733385995030403, "num_tokens": 14102933.0, "mean_token_accuracy": 0.9548174262046814, "epoch": 3.0201342281879193, "step": 900 }, { "loss": 0.1657, "grad_norm": 0.234299898147583, "learning_rate": 3.383951712727701e-05, "entropy": 0.1675445105880499, "num_tokens": 14259753.0, "mean_token_accuracy": 0.9535206586122513, "epoch": 3.053691275167785, "step": 910 }, { "loss": 0.1583, "grad_norm": 0.23432078957557678, "learning_rate": 3.2832068457469945e-05, "entropy": 0.1605758022516966, "num_tokens": 14418531.0, "mean_token_accuracy": 0.955757224559784, "epoch": 3.087248322147651, "step": 920 }, { "loss": 0.1504, "grad_norm": 0.20643459260463715, "learning_rate": 3.183246069856443e-05, "entropy": 0.15479738190770148, "num_tokens": 14573081.0, "mean_token_accuracy": 0.9577652394771576, "epoch": 3.120805369127517, "step": 930 }, { "loss": 0.1616, "grad_norm": 0.2060076892375946, "learning_rate": 3.0841150389856125e-05, "entropy": 0.16488183476030827, "num_tokens": 14727805.0, "mean_token_accuracy": 0.9542657196521759, "epoch": 3.1543624161073827, "step": 940 }, { "loss": 0.1727, "grad_norm": 0.25925537943840027, "learning_rate": 2.9858590281042152e-05, "entropy": 0.17419612146914004, "num_tokens": 14883574.0, "mean_token_accuracy": 0.9522079944610595, "epoch": 3.1879194630872485, "step": 950 }, { "loss": 0.1444, "grad_norm": 0.22550174593925476, "learning_rate": 2.888522912544202e-05, "entropy": 0.1470055654644966, "num_tokens": 15038732.0, "mean_token_accuracy": 0.9591749131679534, "epoch": 3.221476510067114, "step": 960 }, { "loss": 0.1453, "grad_norm": 0.22862796485424042, "learning_rate": 2.792151147504366e-05, "entropy": 0.14832949005067347, "num_tokens": 15193067.0, "mean_token_accuracy": 0.9592784523963929, "epoch": 3.2550335570469797, "step": 970 }, { "loss": 0.1592, "grad_norm": 0.21343277394771576, "learning_rate": 2.6967877477468397e-05, "entropy": 0.16025195196270942, "num_tokens": 15348668.0, "mean_token_accuracy": 0.9546813875436783, "epoch": 3.2885906040268456, "step": 980 }, { "loss": 0.1842, "grad_norm": 0.27152112126350403, "learning_rate": 2.6024762674947313e-05, "entropy": 0.18718509674072265, "num_tokens": 15507728.0, "mean_token_accuracy": 0.9490089803934098, "epoch": 3.3221476510067114, "step": 990 }, { "loss": 0.1613, "grad_norm": 0.20685099065303802, "learning_rate": 2.509259780540118e-05, "entropy": 0.16506212055683137, "num_tokens": 15663068.0, "mean_token_accuracy": 0.9536582648754119, "epoch": 3.3557046979865772, "step": 1000 }, { "loss": 0.1557, "grad_norm": 0.24159930646419525, "learning_rate": 2.4171808605714504e-05, "entropy": 0.15720976293087005, "num_tokens": 15820651.0, "mean_token_accuracy": 0.9559206962585449, "epoch": 3.389261744966443, "step": 1010 }, { "loss": 0.1775, "grad_norm": 0.23764103651046753, "learning_rate": 2.3262815617293517e-05, "entropy": 0.1807922873646021, "num_tokens": 15979358.0, "mean_token_accuracy": 0.9511760205030442, "epoch": 3.422818791946309, "step": 1020 }, { "loss": 0.1713, "grad_norm": 0.22108638286590576, "learning_rate": 2.2366033993997344e-05, "entropy": 0.17095785699784755, "num_tokens": 16135558.0, "mean_token_accuracy": 0.95192691385746, "epoch": 3.4563758389261743, "step": 1030 }, { "loss": 0.1682, "grad_norm": 0.24025718867778778, "learning_rate": 2.1481873312529426e-05, "entropy": 0.17155840508639814, "num_tokens": 16289493.0, "mean_token_accuracy": 0.9537844061851501, "epoch": 3.48993288590604, "step": 1040 }, { "loss": 0.1645, "grad_norm": 0.22691352665424347, "learning_rate": 2.061073738537635e-05, "entropy": 0.1673688843846321, "num_tokens": 16444253.0, "mean_token_accuracy": 0.9540765851736068, "epoch": 3.523489932885906, "step": 1050 }, { "loss": 0.1711, "grad_norm": 0.260170042514801, "learning_rate": 1.975302407637929e-05, "entropy": 0.17190844528377056, "num_tokens": 16605899.0, "mean_token_accuracy": 0.9533313482999801, "epoch": 3.557046979865772, "step": 1060 }, { "loss": 0.1703, "grad_norm": 0.22882592678070068, "learning_rate": 1.8909125119022136e-05, "entropy": 0.17202796116471292, "num_tokens": 16762980.0, "mean_token_accuracy": 0.9525971114635468, "epoch": 3.5906040268456376, "step": 1070 }, { "loss": 0.1751, "grad_norm": 0.2285274714231491, "learning_rate": 1.807942593751973e-05, "entropy": 0.17774682715535164, "num_tokens": 16923345.0, "mean_token_accuracy": 0.9512902319431304, "epoch": 3.6241610738255035, "step": 1080 }, { "loss": 0.1577, "grad_norm": 0.24551482498645782, "learning_rate": 1.7264305470787363e-05, "entropy": 0.1609206810593605, "num_tokens": 17077331.0, "mean_token_accuracy": 0.955593678355217, "epoch": 3.6577181208053693, "step": 1090 }, { "loss": 0.1558, "grad_norm": 0.22661259770393372, "learning_rate": 1.6464135999372537e-05, "entropy": 0.1576618704944849, "num_tokens": 17231136.0, "mean_token_accuracy": 0.9559727132320404, "epoch": 3.6912751677852347, "step": 1100 }, { "loss": 0.187, "grad_norm": 0.23124344646930695, "learning_rate": 1.567928297542749e-05, "entropy": 0.18841546326875686, "num_tokens": 17390871.0, "mean_token_accuracy": 0.9487693756818771, "epoch": 3.7248322147651005, "step": 1110 }, { "loss": 0.1368, "grad_norm": 0.2734222710132599, "learning_rate": 1.4910104855800427e-05, "entropy": 0.14009847678244114, "num_tokens": 17544815.0, "mean_token_accuracy": 0.9597841829061509, "epoch": 3.7583892617449663, "step": 1120 }, { "loss": 0.1557, "grad_norm": 0.25066235661506653, "learning_rate": 1.4156952938321798e-05, "entropy": 0.1570753049105406, "num_tokens": 17701337.0, "mean_token_accuracy": 0.9568644523620605, "epoch": 3.791946308724832, "step": 1130 }, { "loss": 0.1582, "grad_norm": 0.24784211814403534, "learning_rate": 1.3420171201359933e-05, "entropy": 0.15978324972093105, "num_tokens": 17855471.0, "mean_token_accuracy": 0.9565310895442962, "epoch": 3.825503355704698, "step": 1140 }, { "loss": 0.1583, "grad_norm": 0.22498124837875366, "learning_rate": 1.2700096146719931e-05, "entropy": 0.16122703738510608, "num_tokens": 18017269.0, "mean_token_accuracy": 0.955768808722496, "epoch": 3.859060402684564, "step": 1150 }, { "loss": 0.1729, "grad_norm": 0.22251562774181366, "learning_rate": 1.1997056645956967e-05, "entropy": 0.17401745431125165, "num_tokens": 18176540.0, "mean_token_accuracy": 0.9513998061418534, "epoch": 3.8926174496644297, "step": 1160 }, { "loss": 0.1429, "grad_norm": 0.2645820677280426, "learning_rate": 1.1311373790174657e-05, "entropy": 0.14712858721613883, "num_tokens": 18331179.0, "mean_token_accuracy": 0.9599408686161042, "epoch": 3.926174496644295, "step": 1170 }, { "loss": 0.1778, "grad_norm": 0.22513671219348907, "learning_rate": 1.0643360743376829e-05, "entropy": 0.178417731449008, "num_tokens": 18491744.0, "mean_token_accuracy": 0.9512795448303223, "epoch": 3.959731543624161, "step": 1180 }, { "loss": 0.1751, "grad_norm": 0.23683220148086548, "learning_rate": 9.993322599439692e-06, "entropy": 0.1774978566914797, "num_tokens": 18647922.0, "mean_token_accuracy": 0.9514397591352463, "epoch": 3.9932885906040267, "step": 1190 }, { "loss": 0.1343, "grad_norm": 0.23946121335029602, "learning_rate": 9.36155624276987e-06, "entropy": 0.13949108868837357, "num_tokens": 18799741.0, "mean_token_accuracy": 0.961316853761673, "epoch": 4.026845637583893, "step": 1200 }, { "loss": 0.1448, "grad_norm": 0.26739007234573364, "learning_rate": 8.748350212711853e-06, "entropy": 0.14928585700690747, "num_tokens": 18954239.0, "mean_token_accuracy": 0.9586472451686859, "epoch": 4.060402684563758, "step": 1210 }, { "loss": 0.1589, "grad_norm": 0.2560782730579376, "learning_rate": 8.153984571766699e-06, "entropy": 0.1610208224505186, "num_tokens": 19113299.0, "mean_token_accuracy": 0.955417987704277, "epoch": 4.093959731543624, "step": 1220 }, { "loss": 0.1566, "grad_norm": 0.2503839135169983, "learning_rate": 7.578730777682386e-06, "entropy": 0.15893662311136722, "num_tokens": 19269704.0, "mean_token_accuracy": 0.9564489126205444, "epoch": 4.12751677852349, "step": 1230 }, { "loss": 0.1729, "grad_norm": 0.2269158661365509, "learning_rate": 7.022851559473964e-06, "entropy": 0.17561149559915065, "num_tokens": 19430349.0, "mean_token_accuracy": 0.95240318775177, "epoch": 4.1610738255033555, "step": 1240 }, { "loss": 0.168, "grad_norm": 0.2869897186756134, "learning_rate": 6.486600797430492e-06, "entropy": 0.1709251943975687, "num_tokens": 19585660.0, "mean_token_accuracy": 0.9534720987081527, "epoch": 4.194630872483222, "step": 1250 }, { "loss": 0.1457, "grad_norm": 0.25000014901161194, "learning_rate": 5.9702234071631e-06, "entropy": 0.14829469434916973, "num_tokens": 19744182.0, "mean_token_accuracy": 0.9593781590461731, "epoch": 4.228187919463087, "step": 1260 }, { "loss": 0.16, "grad_norm": 0.24967129528522491, "learning_rate": 5.473955227747623e-06, "entropy": 0.16518744193017482, "num_tokens": 19902824.0, "mean_token_accuracy": 0.9548085004091262, "epoch": 4.261744966442953, "step": 1270 }, { "loss": 0.1508, "grad_norm": 0.27587053179740906, "learning_rate": 4.9980229140124905e-06, "entropy": 0.15380522198975086, "num_tokens": 20057944.0, "mean_token_accuracy": 0.9581666350364685, "epoch": 4.295302013422819, "step": 1280 }, { "loss": 0.1497, "grad_norm": 0.2772783935070038, "learning_rate": 4.542643833021254e-06, "entropy": 0.15263673029839991, "num_tokens": 20215072.0, "mean_token_accuracy": 0.9581062227487565, "epoch": 4.328859060402684, "step": 1290 }, { "loss": 0.1313, "grad_norm": 0.25381705164909363, "learning_rate": 4.108025964797135e-06, "entropy": 0.13392407521605493, "num_tokens": 20371415.0, "mean_token_accuracy": 0.9618801653385163, "epoch": 4.3624161073825505, "step": 1300 }, { "loss": 0.1442, "grad_norm": 0.318499892950058, "learning_rate": 3.69436780733462e-06, "entropy": 0.1467492014169693, "num_tokens": 20525796.0, "mean_token_accuracy": 0.9589717149734497, "epoch": 4.395973154362416, "step": 1310 }, { "loss": 0.164, "grad_norm": 0.29794761538505554, "learning_rate": 3.3018582859418446e-06, "entropy": 0.16693559624254703, "num_tokens": 20683012.0, "mean_token_accuracy": 0.9544567495584488, "epoch": 4.429530201342282, "step": 1320 }, { "loss": 0.1585, "grad_norm": 0.26992982625961304, "learning_rate": 2.930676666954846e-06, "entropy": 0.16096150763332845, "num_tokens": 20836880.0, "mean_token_accuracy": 0.955687940120697, "epoch": 4.4630872483221475, "step": 1330 }, { "loss": 0.1545, "grad_norm": 0.2633298635482788, "learning_rate": 2.580992475863381e-06, "entropy": 0.15808030292391778, "num_tokens": 20991520.0, "mean_token_accuracy": 0.9564832031726838, "epoch": 4.496644295302014, "step": 1340 }, { "loss": 0.1574, "grad_norm": 0.2697800099849701, "learning_rate": 2.2529654198854835e-06, "entropy": 0.16020409800112248, "num_tokens": 21149049.0, "mean_token_accuracy": 0.9553818762302398, "epoch": 4.530201342281879, "step": 1350 }, { "loss": 0.1434, "grad_norm": 0.26713675260543823, "learning_rate": 1.9467453150262327e-06, "entropy": 0.1468834660947323, "num_tokens": 21303482.0, "mean_token_accuracy": 0.9584787577390671, "epoch": 4.563758389261745, "step": 1360 }, { "loss": 0.1676, "grad_norm": 0.25141027569770813, "learning_rate": 1.6624720176540265e-06, "entropy": 0.17059922069311143, "num_tokens": 21462214.0, "mean_token_accuracy": 0.9529773116111755, "epoch": 4.597315436241611, "step": 1370 }, { "loss": 0.1631, "grad_norm": 0.22781124711036682, "learning_rate": 1.400275360625608e-06, "entropy": 0.16469677537679672, "num_tokens": 21617768.0, "mean_token_accuracy": 0.9540845274925231, "epoch": 4.630872483221476, "step": 1380 }, { "loss": 0.1405, "grad_norm": 0.26300156116485596, "learning_rate": 1.1602750939889774e-06, "entropy": 0.14349478296935558, "num_tokens": 21775827.0, "mean_token_accuracy": 0.9608874082565307, "epoch": 4.6644295302013425, "step": 1390 }, { "loss": 0.1648, "grad_norm": 0.28018462657928467, "learning_rate": 9.42580830291373e-07, "entropy": 0.168378734216094, "num_tokens": 21933598.0, "mean_token_accuracy": 0.9548741400241851, "epoch": 4.697986577181208, "step": 1400 }, { "loss": 0.1579, "grad_norm": 0.23573631048202515, "learning_rate": 7.472919945171631e-07, "entropy": 0.16024879328906536, "num_tokens": 22092356.0, "mean_token_accuracy": 0.9563170611858368, "epoch": 4.731543624161074, "step": 1410 }, { "loss": 0.1486, "grad_norm": 0.26423606276512146, "learning_rate": 5.74497778678662e-07, "entropy": 0.153883695602417, "num_tokens": 22245245.0, "mean_token_accuracy": 0.9581821829080581, "epoch": 4.76510067114094, "step": 1420 }, { "loss": 0.1668, "grad_norm": 0.24437865614891052, "learning_rate": 4.242771010804558e-07, "entropy": 0.16868923045694828, "num_tokens": 22403545.0, "mean_token_accuracy": 0.9531532049179077, "epoch": 4.798657718120805, "step": 1430 }, { "loss": 0.1497, "grad_norm": 0.2679818868637085, "learning_rate": 2.966985702759828e-07, "entropy": 0.15332721956074238, "num_tokens": 22559658.0, "mean_token_accuracy": 0.9572977095842361, "epoch": 4.832214765100671, "step": 1440 }, { "loss": 0.1487, "grad_norm": 0.25875014066696167, "learning_rate": 1.9182045373273838e-07, "entropy": 0.15049569718539715, "num_tokens": 22718226.0, "mean_token_accuracy": 0.9580355733633041, "epoch": 4.865771812080537, "step": 1450 }, { "loss": 0.1444, "grad_norm": 0.25537535548210144, "learning_rate": 1.0969065122041766e-07, "entropy": 0.14743491858243943, "num_tokens": 22873841.0, "mean_token_accuracy": 0.9597749501466751, "epoch": 4.899328859060403, "step": 1460 }, { "loss": 0.1584, "grad_norm": 0.28133252263069153, "learning_rate": 5.0346672934270534e-08, "entropy": 0.16207893192768097, "num_tokens": 23032944.0, "mean_token_accuracy": 0.9557940989732743, "epoch": 4.932885906040268, "step": 1470 }, { "loss": 0.1577, "grad_norm": 0.25087353587150574, "learning_rate": 1.3815622363427815e-08, "entropy": 0.16037070676684378, "num_tokens": 23191641.0, "mean_token_accuracy": 0.9560932219028473, "epoch": 4.966442953020135, "step": 1480 }, { "loss": 0.1779, "grad_norm": 0.307130366563797, "learning_rate": 1.141839123142141e-10, "entropy": 0.18243111334741116, "num_tokens": 23348865.0, "mean_token_accuracy": 0.9507445961236953, "epoch": 5.0, "step": 1490 }, { "train_runtime": 10055.6246, "train_samples_per_second": 4.74, "train_steps_per_second": 0.148, "total_flos": 1.0796462610038784e+19, "train_loss": 0.2056295836531876, "epoch": 5.0, "step": 1490 } ]