{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 2.0, "eval_steps": 500, "global_step": 33504, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0002984718242597899, "grad_norm": 2.5109126567840576, "learning_rate": 1.1933174224343676e-07, "loss": 0.849, "mean_token_accuracy": 0.7716509580612183, "num_tokens": 167680.0, "step": 5 }, { "epoch": 0.0005969436485195798, "grad_norm": 2.5402281284332275, "learning_rate": 2.684964200477327e-07, "loss": 0.8594, "mean_token_accuracy": 0.7681856036186219, "num_tokens": 335360.0, "step": 10 }, { "epoch": 0.0008954154727793697, "grad_norm": 2.270322799682617, "learning_rate": 4.1766109785202865e-07, "loss": 0.8854, "mean_token_accuracy": 0.7613682389259339, "num_tokens": 503040.0, "step": 15 }, { "epoch": 0.0011938872970391596, "grad_norm": 2.388043165206909, "learning_rate": 5.668257756563246e-07, "loss": 0.8881, "mean_token_accuracy": 0.7604795455932617, "num_tokens": 670720.0, "step": 20 }, { "epoch": 0.0014923591212989494, "grad_norm": 2.682539224624634, "learning_rate": 7.159904534606206e-07, "loss": 0.8831, "mean_token_accuracy": 0.7617738366127014, "num_tokens": 838400.0, "step": 25 }, { "epoch": 0.0017908309455587394, "grad_norm": 1.9359753131866455, "learning_rate": 8.651551312649164e-07, "loss": 0.8246, "mean_token_accuracy": 0.7766312837600708, "num_tokens": 1006080.0, "step": 30 }, { "epoch": 0.002089302769818529, "grad_norm": 1.921677589416504, "learning_rate": 1.0143198090692125e-06, "loss": 0.8854, "mean_token_accuracy": 0.7587021350860595, "num_tokens": 1173760.0, "step": 35 }, { "epoch": 0.002387774594078319, "grad_norm": 1.6539034843444824, "learning_rate": 1.1634844868735085e-06, "loss": 0.881, "mean_token_accuracy": 0.7584575891494751, "num_tokens": 1341440.0, "step": 40 }, { "epoch": 0.0026862464183381087, "grad_norm": 1.6672691106796265, "learning_rate": 1.3126491646778044e-06, "loss": 0.8263, "mean_token_accuracy": 0.7719491720199585, "num_tokens": 1509120.0, "step": 45 }, { "epoch": 0.0029847182425978987, "grad_norm": 1.7613340616226196, "learning_rate": 1.4618138424821004e-06, "loss": 0.8306, "mean_token_accuracy": 0.7692890286445617, "num_tokens": 1676800.0, "step": 50 }, { "epoch": 0.0032831900668576887, "grad_norm": 1.4340014457702637, "learning_rate": 1.6109785202863964e-06, "loss": 0.7541, "mean_token_accuracy": 0.7874746680259704, "num_tokens": 1844480.0, "step": 55 }, { "epoch": 0.0035816618911174787, "grad_norm": 1.2414655685424805, "learning_rate": 1.7601431980906923e-06, "loss": 0.8135, "mean_token_accuracy": 0.7701061725616455, "num_tokens": 2012160.0, "step": 60 }, { "epoch": 0.0038801337153772683, "grad_norm": 1.2400470972061157, "learning_rate": 1.909307875894988e-06, "loss": 0.7695, "mean_token_accuracy": 0.7819217443466187, "num_tokens": 2179840.0, "step": 65 }, { "epoch": 0.004178605539637058, "grad_norm": 1.1827894449234009, "learning_rate": 2.0584725536992843e-06, "loss": 0.767, "mean_token_accuracy": 0.7819277167320251, "num_tokens": 2347520.0, "step": 70 }, { "epoch": 0.004477077363896848, "grad_norm": 1.2285350561141968, "learning_rate": 2.20763723150358e-06, "loss": 0.7641, "mean_token_accuracy": 0.7814624905586243, "num_tokens": 2515200.0, "step": 75 }, { "epoch": 0.004775549188156638, "grad_norm": 1.2129645347595215, "learning_rate": 2.3568019093078762e-06, "loss": 0.7559, "mean_token_accuracy": 0.7816056251525879, "num_tokens": 2682880.0, "step": 80 }, { "epoch": 0.005074021012416428, "grad_norm": 1.2340443134307861, "learning_rate": 2.505966587112172e-06, "loss": 0.759, "mean_token_accuracy": 0.7816712260246277, "num_tokens": 2850560.0, "step": 85 }, { "epoch": 0.0053724928366762174, "grad_norm": 1.1642875671386719, "learning_rate": 2.6551312649164677e-06, "loss": 0.7329, "mean_token_accuracy": 0.7863831520080566, "num_tokens": 3018240.0, "step": 90 }, { "epoch": 0.005670964660936008, "grad_norm": 1.1461609601974487, "learning_rate": 2.804295942720764e-06, "loss": 0.7503, "mean_token_accuracy": 0.7809912919998169, "num_tokens": 3185920.0, "step": 95 }, { "epoch": 0.0059694364851957974, "grad_norm": 1.0481009483337402, "learning_rate": 2.9534606205250597e-06, "loss": 0.7112, "mean_token_accuracy": 0.7925742626190185, "num_tokens": 3353600.0, "step": 100 }, { "epoch": 0.006267908309455587, "grad_norm": 1.0805094242095947, "learning_rate": 3.1026252983293554e-06, "loss": 0.7714, "mean_token_accuracy": 0.7766133904457092, "num_tokens": 3521280.0, "step": 105 }, { "epoch": 0.0065663801337153774, "grad_norm": 0.9588642716407776, "learning_rate": 3.251789976133652e-06, "loss": 0.7113, "mean_token_accuracy": 0.7901944398880005, "num_tokens": 3688960.0, "step": 110 }, { "epoch": 0.006864851957975167, "grad_norm": 1.172989845275879, "learning_rate": 3.4009546539379474e-06, "loss": 0.749, "mean_token_accuracy": 0.78075270652771, "num_tokens": 3856640.0, "step": 115 }, { "epoch": 0.0071633237822349575, "grad_norm": 1.03142511844635, "learning_rate": 3.5501193317422436e-06, "loss": 0.6881, "mean_token_accuracy": 0.7986937999725342, "num_tokens": 4024320.0, "step": 120 }, { "epoch": 0.007461795606494747, "grad_norm": 1.1663661003112793, "learning_rate": 3.6992840095465393e-06, "loss": 0.7456, "mean_token_accuracy": 0.781778609752655, "num_tokens": 4192000.0, "step": 125 }, { "epoch": 0.007760267430754537, "grad_norm": 1.00513756275177, "learning_rate": 3.8484486873508355e-06, "loss": 0.6995, "mean_token_accuracy": 0.7933019280433655, "num_tokens": 4359680.0, "step": 130 }, { "epoch": 0.008058739255014326, "grad_norm": 1.2561825513839722, "learning_rate": 3.997613365155131e-06, "loss": 0.686, "mean_token_accuracy": 0.797393536567688, "num_tokens": 4527360.0, "step": 135 }, { "epoch": 0.008357211079274117, "grad_norm": 1.193806529045105, "learning_rate": 4.146778042959427e-06, "loss": 0.7801, "mean_token_accuracy": 0.7724979162216187, "num_tokens": 4695040.0, "step": 140 }, { "epoch": 0.008655682903533907, "grad_norm": 1.072721242904663, "learning_rate": 4.295942720763723e-06, "loss": 0.7001, "mean_token_accuracy": 0.7940415143966675, "num_tokens": 4862720.0, "step": 145 }, { "epoch": 0.008954154727793696, "grad_norm": 1.1867351531982422, "learning_rate": 4.445107398568019e-06, "loss": 0.7206, "mean_token_accuracy": 0.7882440567016602, "num_tokens": 5030400.0, "step": 150 }, { "epoch": 0.009252626552053486, "grad_norm": 1.0713008642196655, "learning_rate": 4.594272076372315e-06, "loss": 0.6724, "mean_token_accuracy": 0.8011153697967529, "num_tokens": 5198080.0, "step": 155 }, { "epoch": 0.009551098376313277, "grad_norm": 1.1685636043548584, "learning_rate": 4.743436754176611e-06, "loss": 0.6886, "mean_token_accuracy": 0.7966300845146179, "num_tokens": 5365760.0, "step": 160 }, { "epoch": 0.009849570200573065, "grad_norm": 1.1419373750686646, "learning_rate": 4.8926014319809075e-06, "loss": 0.7177, "mean_token_accuracy": 0.7853572726249695, "num_tokens": 5533440.0, "step": 165 }, { "epoch": 0.010148042024832856, "grad_norm": 1.017261028289795, "learning_rate": 5.041766109785203e-06, "loss": 0.6972, "mean_token_accuracy": 0.7945604085922241, "num_tokens": 5701120.0, "step": 170 }, { "epoch": 0.010446513849092646, "grad_norm": 1.1668633222579956, "learning_rate": 5.190930787589499e-06, "loss": 0.6491, "mean_token_accuracy": 0.8055350184440613, "num_tokens": 5868800.0, "step": 175 }, { "epoch": 0.010744985673352435, "grad_norm": 0.9799959659576416, "learning_rate": 5.340095465393795e-06, "loss": 0.6254, "mean_token_accuracy": 0.8119766235351562, "num_tokens": 6036480.0, "step": 180 }, { "epoch": 0.011043457497612225, "grad_norm": 1.1462510824203491, "learning_rate": 5.489260143198091e-06, "loss": 0.6946, "mean_token_accuracy": 0.7927233815193176, "num_tokens": 6204160.0, "step": 185 }, { "epoch": 0.011341929321872016, "grad_norm": 1.156132459640503, "learning_rate": 5.638424821002386e-06, "loss": 0.7361, "mean_token_accuracy": 0.7793510675430297, "num_tokens": 6371840.0, "step": 190 }, { "epoch": 0.011640401146131804, "grad_norm": 1.0716009140014648, "learning_rate": 5.787589498806683e-06, "loss": 0.7308, "mean_token_accuracy": 0.7823571681976318, "num_tokens": 6539520.0, "step": 195 }, { "epoch": 0.011938872970391595, "grad_norm": 1.0945734977722168, "learning_rate": 5.936754176610979e-06, "loss": 0.6918, "mean_token_accuracy": 0.7942442893981934, "num_tokens": 6707200.0, "step": 200 }, { "epoch": 0.012237344794651385, "grad_norm": 1.0889852046966553, "learning_rate": 6.0859188544152745e-06, "loss": 0.6849, "mean_token_accuracy": 0.795276153087616, "num_tokens": 6874880.0, "step": 205 }, { "epoch": 0.012535816618911174, "grad_norm": 1.1265602111816406, "learning_rate": 6.23508353221957e-06, "loss": 0.6678, "mean_token_accuracy": 0.8014195442199707, "num_tokens": 7042560.0, "step": 210 }, { "epoch": 0.012834288443170964, "grad_norm": 1.0774636268615723, "learning_rate": 6.384248210023866e-06, "loss": 0.6611, "mean_token_accuracy": 0.8024692893028259, "num_tokens": 7210240.0, "step": 215 }, { "epoch": 0.013132760267430755, "grad_norm": 1.0906325578689575, "learning_rate": 6.533412887828163e-06, "loss": 0.6313, "mean_token_accuracy": 0.807944655418396, "num_tokens": 7377920.0, "step": 220 }, { "epoch": 0.013431232091690544, "grad_norm": 1.0445871353149414, "learning_rate": 6.682577565632458e-06, "loss": 0.6426, "mean_token_accuracy": 0.8060420036315918, "num_tokens": 7545600.0, "step": 225 }, { "epoch": 0.013729703915950334, "grad_norm": 1.1006596088409424, "learning_rate": 6.831742243436755e-06, "loss": 0.6281, "mean_token_accuracy": 0.8109686255455018, "num_tokens": 7713280.0, "step": 230 }, { "epoch": 0.014028175740210124, "grad_norm": 1.0418542623519897, "learning_rate": 6.980906921241051e-06, "loss": 0.6862, "mean_token_accuracy": 0.7943993926048278, "num_tokens": 7880960.0, "step": 235 }, { "epoch": 0.014326647564469915, "grad_norm": 1.0069177150726318, "learning_rate": 7.130071599045346e-06, "loss": 0.644, "mean_token_accuracy": 0.8059823513031006, "num_tokens": 8048640.0, "step": 240 }, { "epoch": 0.014625119388729704, "grad_norm": 1.1208637952804565, "learning_rate": 7.279236276849642e-06, "loss": 0.7192, "mean_token_accuracy": 0.7862698316574097, "num_tokens": 8216320.0, "step": 245 }, { "epoch": 0.014923591212989494, "grad_norm": 1.064270257949829, "learning_rate": 7.428400954653939e-06, "loss": 0.6886, "mean_token_accuracy": 0.7932064771652222, "num_tokens": 8384000.0, "step": 250 }, { "epoch": 0.015222063037249284, "grad_norm": 1.0383998155593872, "learning_rate": 7.577565632458235e-06, "loss": 0.6209, "mean_token_accuracy": 0.8120899319648742, "num_tokens": 8551680.0, "step": 255 }, { "epoch": 0.015520534861509073, "grad_norm": 1.0964311361312866, "learning_rate": 7.72673031026253e-06, "loss": 0.634, "mean_token_accuracy": 0.8092150688171387, "num_tokens": 8719360.0, "step": 260 }, { "epoch": 0.015819006685768862, "grad_norm": 1.1319653987884521, "learning_rate": 7.875894988066825e-06, "loss": 0.6299, "mean_token_accuracy": 0.8099904656410217, "num_tokens": 8887040.0, "step": 265 }, { "epoch": 0.016117478510028652, "grad_norm": 1.131912350654602, "learning_rate": 8.025059665871123e-06, "loss": 0.6335, "mean_token_accuracy": 0.8081832528114319, "num_tokens": 9054720.0, "step": 270 }, { "epoch": 0.016415950334288443, "grad_norm": 1.0844351053237915, "learning_rate": 8.174224343675418e-06, "loss": 0.6501, "mean_token_accuracy": 0.8047655940055847, "num_tokens": 9222400.0, "step": 275 }, { "epoch": 0.016714422158548233, "grad_norm": 1.0595623254776, "learning_rate": 8.323389021479713e-06, "loss": 0.6802, "mean_token_accuracy": 0.7956459403038025, "num_tokens": 9390080.0, "step": 280 }, { "epoch": 0.017012893982808024, "grad_norm": 1.1240721940994263, "learning_rate": 8.47255369928401e-06, "loss": 0.6342, "mean_token_accuracy": 0.8102648258209229, "num_tokens": 9557760.0, "step": 285 }, { "epoch": 0.017311365807067814, "grad_norm": 1.173274278640747, "learning_rate": 8.621718377088306e-06, "loss": 0.7119, "mean_token_accuracy": 0.7863771915435791, "num_tokens": 9725440.0, "step": 290 }, { "epoch": 0.0176098376313276, "grad_norm": 1.2010884284973145, "learning_rate": 8.770883054892602e-06, "loss": 0.6609, "mean_token_accuracy": 0.8019265174865723, "num_tokens": 9893120.0, "step": 295 }, { "epoch": 0.01790830945558739, "grad_norm": 1.0363003015518188, "learning_rate": 8.920047732696899e-06, "loss": 0.667, "mean_token_accuracy": 0.7988429069519043, "num_tokens": 10060800.0, "step": 300 }, { "epoch": 0.018206781279847182, "grad_norm": 1.1245827674865723, "learning_rate": 9.069212410501193e-06, "loss": 0.6813, "mean_token_accuracy": 0.7939043283462525, "num_tokens": 10228480.0, "step": 305 }, { "epoch": 0.018505253104106972, "grad_norm": 1.1633771657943726, "learning_rate": 9.218377088305489e-06, "loss": 0.6452, "mean_token_accuracy": 0.8054395914077759, "num_tokens": 10396160.0, "step": 310 }, { "epoch": 0.018803724928366763, "grad_norm": 1.150593638420105, "learning_rate": 9.367541766109786e-06, "loss": 0.7082, "mean_token_accuracy": 0.7869139909744263, "num_tokens": 10563840.0, "step": 315 }, { "epoch": 0.019102196752626553, "grad_norm": 1.0160619020462036, "learning_rate": 9.516706443914082e-06, "loss": 0.6665, "mean_token_accuracy": 0.7976798415184021, "num_tokens": 10731520.0, "step": 320 }, { "epoch": 0.019400668576886344, "grad_norm": 1.0531463623046875, "learning_rate": 9.665871121718378e-06, "loss": 0.6464, "mean_token_accuracy": 0.8056542992591857, "num_tokens": 10899200.0, "step": 325 }, { "epoch": 0.01969914040114613, "grad_norm": 1.1304893493652344, "learning_rate": 9.815035799522674e-06, "loss": 0.6846, "mean_token_accuracy": 0.793683660030365, "num_tokens": 11066880.0, "step": 330 }, { "epoch": 0.01999761222540592, "grad_norm": 1.0940200090408325, "learning_rate": 9.96420047732697e-06, "loss": 0.6519, "mean_token_accuracy": 0.802367889881134, "num_tokens": 11234560.0, "step": 335 }, { "epoch": 0.02029608404966571, "grad_norm": 1.0811877250671387, "learning_rate": 1.0113365155131265e-05, "loss": 0.631, "mean_token_accuracy": 0.8103244781494141, "num_tokens": 11402240.0, "step": 340 }, { "epoch": 0.020594555873925502, "grad_norm": 1.041429877281189, "learning_rate": 1.026252983293556e-05, "loss": 0.6405, "mean_token_accuracy": 0.8054097652435303, "num_tokens": 11569920.0, "step": 345 }, { "epoch": 0.020893027698185292, "grad_norm": 1.0615298748016357, "learning_rate": 1.0411694510739857e-05, "loss": 0.6304, "mean_token_accuracy": 0.809048080444336, "num_tokens": 11737600.0, "step": 350 }, { "epoch": 0.021191499522445083, "grad_norm": 1.08322274684906, "learning_rate": 1.0560859188544154e-05, "loss": 0.681, "mean_token_accuracy": 0.7950972080230713, "num_tokens": 11905280.0, "step": 355 }, { "epoch": 0.02148997134670487, "grad_norm": 1.1768213510513306, "learning_rate": 1.071002386634845e-05, "loss": 0.6674, "mean_token_accuracy": 0.8011630654335022, "num_tokens": 12072960.0, "step": 360 }, { "epoch": 0.02178844317096466, "grad_norm": 1.1367334127426147, "learning_rate": 1.0859188544152746e-05, "loss": 0.6181, "mean_token_accuracy": 0.8141178607940673, "num_tokens": 12240640.0, "step": 365 }, { "epoch": 0.02208691499522445, "grad_norm": 1.0722662210464478, "learning_rate": 1.1008353221957041e-05, "loss": 0.6741, "mean_token_accuracy": 0.7966122031211853, "num_tokens": 12408320.0, "step": 370 }, { "epoch": 0.02238538681948424, "grad_norm": 1.0842481851577759, "learning_rate": 1.1157517899761337e-05, "loss": 0.6307, "mean_token_accuracy": 0.8092866659164428, "num_tokens": 12576000.0, "step": 375 }, { "epoch": 0.02268385864374403, "grad_norm": 1.1080683469772339, "learning_rate": 1.1306682577565633e-05, "loss": 0.6275, "mean_token_accuracy": 0.8088691473007202, "num_tokens": 12743680.0, "step": 380 }, { "epoch": 0.022982330468003822, "grad_norm": 1.1539459228515625, "learning_rate": 1.145584725536993e-05, "loss": 0.6832, "mean_token_accuracy": 0.7943695425987244, "num_tokens": 12911360.0, "step": 385 }, { "epoch": 0.02328080229226361, "grad_norm": 1.051101565361023, "learning_rate": 1.1605011933174224e-05, "loss": 0.695, "mean_token_accuracy": 0.7907431840896606, "num_tokens": 13079040.0, "step": 390 }, { "epoch": 0.0235792741165234, "grad_norm": 1.1492129564285278, "learning_rate": 1.175417661097852e-05, "loss": 0.6588, "mean_token_accuracy": 0.8029404759407044, "num_tokens": 13246720.0, "step": 395 }, { "epoch": 0.02387774594078319, "grad_norm": 1.0017367601394653, "learning_rate": 1.1903341288782818e-05, "loss": 0.6531, "mean_token_accuracy": 0.8003161191940308, "num_tokens": 13414400.0, "step": 400 }, { "epoch": 0.02417621776504298, "grad_norm": 1.1234192848205566, "learning_rate": 1.2052505966587113e-05, "loss": 0.6429, "mean_token_accuracy": 0.805439579486847, "num_tokens": 13582080.0, "step": 405 }, { "epoch": 0.02447468958930277, "grad_norm": 1.051469326019287, "learning_rate": 1.2201670644391407e-05, "loss": 0.6853, "mean_token_accuracy": 0.792443037033081, "num_tokens": 13749760.0, "step": 410 }, { "epoch": 0.02477316141356256, "grad_norm": 1.0647557973861694, "learning_rate": 1.2350835322195705e-05, "loss": 0.6633, "mean_token_accuracy": 0.7992425084114074, "num_tokens": 13917440.0, "step": 415 }, { "epoch": 0.025071633237822348, "grad_norm": 1.1210533380508423, "learning_rate": 1.25e-05, "loss": 0.6623, "mean_token_accuracy": 0.799260425567627, "num_tokens": 14085120.0, "step": 420 }, { "epoch": 0.02537010506208214, "grad_norm": 1.1232095956802368, "learning_rate": 1.2649164677804295e-05, "loss": 0.6428, "mean_token_accuracy": 0.803471314907074, "num_tokens": 14252800.0, "step": 425 }, { "epoch": 0.02566857688634193, "grad_norm": 1.2449058294296265, "learning_rate": 1.2798329355608594e-05, "loss": 0.627, "mean_token_accuracy": 0.8090003490447998, "num_tokens": 14420480.0, "step": 430 }, { "epoch": 0.02596704871060172, "grad_norm": 1.0878791809082031, "learning_rate": 1.2947494033412888e-05, "loss": 0.6265, "mean_token_accuracy": 0.8103662133216858, "num_tokens": 14588160.0, "step": 435 }, { "epoch": 0.02626552053486151, "grad_norm": 1.0709093809127808, "learning_rate": 1.3096658711217185e-05, "loss": 0.6187, "mean_token_accuracy": 0.8117499709129333, "num_tokens": 14755840.0, "step": 440 }, { "epoch": 0.0265639923591213, "grad_norm": 1.0762311220169067, "learning_rate": 1.3245823389021481e-05, "loss": 0.6245, "mean_token_accuracy": 0.8105570912361145, "num_tokens": 14923520.0, "step": 445 }, { "epoch": 0.026862464183381087, "grad_norm": 1.1213608980178833, "learning_rate": 1.3394988066825775e-05, "loss": 0.658, "mean_token_accuracy": 0.8015686511993408, "num_tokens": 15091200.0, "step": 450 }, { "epoch": 0.027160936007640878, "grad_norm": 1.147695779800415, "learning_rate": 1.3544152744630073e-05, "loss": 0.6844, "mean_token_accuracy": 0.7944948196411132, "num_tokens": 15258880.0, "step": 455 }, { "epoch": 0.027459407831900668, "grad_norm": 1.0717793703079224, "learning_rate": 1.3693317422434368e-05, "loss": 0.6508, "mean_token_accuracy": 0.802367889881134, "num_tokens": 15426560.0, "step": 460 }, { "epoch": 0.02775787965616046, "grad_norm": 1.0311919450759888, "learning_rate": 1.3842482100238662e-05, "loss": 0.5861, "mean_token_accuracy": 0.8208457589149475, "num_tokens": 15594240.0, "step": 465 }, { "epoch": 0.02805635148042025, "grad_norm": 1.1428905725479126, "learning_rate": 1.399164677804296e-05, "loss": 0.5845, "mean_token_accuracy": 0.8203686118125916, "num_tokens": 15761920.0, "step": 470 }, { "epoch": 0.02835482330468004, "grad_norm": 1.1685853004455566, "learning_rate": 1.4140811455847256e-05, "loss": 0.6442, "mean_token_accuracy": 0.8049325942993164, "num_tokens": 15929600.0, "step": 475 }, { "epoch": 0.02865329512893983, "grad_norm": 1.172850251197815, "learning_rate": 1.4289976133651553e-05, "loss": 0.632, "mean_token_accuracy": 0.8073124289512634, "num_tokens": 16097280.0, "step": 480 }, { "epoch": 0.028951766953199617, "grad_norm": 0.9494922757148743, "learning_rate": 1.4439140811455847e-05, "loss": 0.619, "mean_token_accuracy": 0.8106346249580383, "num_tokens": 16264960.0, "step": 485 }, { "epoch": 0.029250238777459407, "grad_norm": 1.1580462455749512, "learning_rate": 1.4588305489260143e-05, "loss": 0.7286, "mean_token_accuracy": 0.7803709864616394, "num_tokens": 16432640.0, "step": 490 }, { "epoch": 0.029548710601719198, "grad_norm": 1.0796221494674683, "learning_rate": 1.473747016706444e-05, "loss": 0.6096, "mean_token_accuracy": 0.8138792872428894, "num_tokens": 16600320.0, "step": 495 }, { "epoch": 0.029847182425978988, "grad_norm": 1.0365972518920898, "learning_rate": 1.4886634844868736e-05, "loss": 0.6267, "mean_token_accuracy": 0.8090361475944519, "num_tokens": 16768000.0, "step": 500 }, { "epoch": 0.03014565425023878, "grad_norm": 1.0467289686203003, "learning_rate": 1.5035799522673034e-05, "loss": 0.6976, "mean_token_accuracy": 0.7893355488777161, "num_tokens": 16935680.0, "step": 505 }, { "epoch": 0.03044412607449857, "grad_norm": 1.037341833114624, "learning_rate": 1.5184964200477328e-05, "loss": 0.6205, "mean_token_accuracy": 0.8109686255455018, "num_tokens": 17103360.0, "step": 510 }, { "epoch": 0.030742597898758356, "grad_norm": 1.044974446296692, "learning_rate": 1.5334128878281622e-05, "loss": 0.6095, "mean_token_accuracy": 0.8160264730453491, "num_tokens": 17271040.0, "step": 515 }, { "epoch": 0.031041069723018146, "grad_norm": 1.0821094512939453, "learning_rate": 1.548329355608592e-05, "loss": 0.6583, "mean_token_accuracy": 0.8012883186340332, "num_tokens": 17438720.0, "step": 520 }, { "epoch": 0.03133954154727794, "grad_norm": 1.1046359539031982, "learning_rate": 1.5632458233890217e-05, "loss": 0.637, "mean_token_accuracy": 0.8047178745269775, "num_tokens": 17606400.0, "step": 525 }, { "epoch": 0.031638013371537724, "grad_norm": 0.9659026265144348, "learning_rate": 1.578162291169451e-05, "loss": 0.6691, "mean_token_accuracy": 0.7965585112571716, "num_tokens": 17774080.0, "step": 530 }, { "epoch": 0.03193648519579752, "grad_norm": 1.0919208526611328, "learning_rate": 1.5930787589498808e-05, "loss": 0.7246, "mean_token_accuracy": 0.7839854478836059, "num_tokens": 17941760.0, "step": 535 }, { "epoch": 0.032234957020057305, "grad_norm": 1.0334947109222412, "learning_rate": 1.6079952267303104e-05, "loss": 0.5953, "mean_token_accuracy": 0.8185196161270142, "num_tokens": 18109440.0, "step": 540 }, { "epoch": 0.0325334288443171, "grad_norm": 1.0578919649124146, "learning_rate": 1.62291169451074e-05, "loss": 0.6199, "mean_token_accuracy": 0.8102051854133606, "num_tokens": 18277120.0, "step": 545 }, { "epoch": 0.032831900668576885, "grad_norm": 1.219687581062317, "learning_rate": 1.6378281622911695e-05, "loss": 0.6447, "mean_token_accuracy": 0.8032029032707214, "num_tokens": 18444800.0, "step": 550 }, { "epoch": 0.03313037249283668, "grad_norm": 1.0638045072555542, "learning_rate": 1.652744630071599e-05, "loss": 0.6361, "mean_token_accuracy": 0.8058511257171631, "num_tokens": 18612480.0, "step": 555 }, { "epoch": 0.033428844317096466, "grad_norm": 1.2430840730667114, "learning_rate": 1.6676610978520287e-05, "loss": 0.6715, "mean_token_accuracy": 0.7967221140861511, "num_tokens": 18773299.0, "step": 560 }, { "epoch": 0.03372731614135625, "grad_norm": 1.077887773513794, "learning_rate": 1.6825775656324583e-05, "loss": 0.6505, "mean_token_accuracy": 0.8032744884490967, "num_tokens": 18940979.0, "step": 565 }, { "epoch": 0.03402578796561605, "grad_norm": 1.2823841571807861, "learning_rate": 1.6974940334128882e-05, "loss": 0.6655, "mean_token_accuracy": 0.8005189180374146, "num_tokens": 19108659.0, "step": 570 }, { "epoch": 0.034324259789875834, "grad_norm": 1.1820347309112549, "learning_rate": 1.7124105011933174e-05, "loss": 0.6662, "mean_token_accuracy": 0.7994632124900818, "num_tokens": 19276339.0, "step": 575 }, { "epoch": 0.03462273161413563, "grad_norm": 1.2058839797973633, "learning_rate": 1.727326968973747e-05, "loss": 0.6852, "mean_token_accuracy": 0.7928009033203125, "num_tokens": 19444019.0, "step": 580 }, { "epoch": 0.034921203438395415, "grad_norm": 1.1294993162155151, "learning_rate": 1.742243436754177e-05, "loss": 0.6431, "mean_token_accuracy": 0.8039544343948364, "num_tokens": 19611699.0, "step": 585 }, { "epoch": 0.0352196752626552, "grad_norm": 1.0808647871017456, "learning_rate": 1.757159904534606e-05, "loss": 0.6496, "mean_token_accuracy": 0.8030716896057128, "num_tokens": 19779379.0, "step": 590 }, { "epoch": 0.035518147086914996, "grad_norm": 1.1090660095214844, "learning_rate": 1.7720763723150357e-05, "loss": 0.6256, "mean_token_accuracy": 0.8078014969825744, "num_tokens": 19947059.0, "step": 595 }, { "epoch": 0.03581661891117478, "grad_norm": 1.2074198722839355, "learning_rate": 1.7869928400954656e-05, "loss": 0.6983, "mean_token_accuracy": 0.7884349346160888, "num_tokens": 20114739.0, "step": 600 }, { "epoch": 0.03611509073543458, "grad_norm": 1.1048452854156494, "learning_rate": 1.801909307875895e-05, "loss": 0.5936, "mean_token_accuracy": 0.8177740693092346, "num_tokens": 20282419.0, "step": 605 }, { "epoch": 0.036413562559694364, "grad_norm": 1.1176884174346924, "learning_rate": 1.8168257756563248e-05, "loss": 0.6529, "mean_token_accuracy": 0.8003340005874634, "num_tokens": 20450099.0, "step": 610 }, { "epoch": 0.03671203438395416, "grad_norm": 0.9916646480560303, "learning_rate": 1.8317422434367544e-05, "loss": 0.6221, "mean_token_accuracy": 0.8111475586891175, "num_tokens": 20617779.0, "step": 615 }, { "epoch": 0.037010506208213945, "grad_norm": 0.9652239680290222, "learning_rate": 1.8466587112171836e-05, "loss": 0.6334, "mean_token_accuracy": 0.8067756175994873, "num_tokens": 20785459.0, "step": 620 }, { "epoch": 0.03730897803247373, "grad_norm": 0.9723144173622131, "learning_rate": 1.8615751789976135e-05, "loss": 0.6706, "mean_token_accuracy": 0.7959560990333557, "num_tokens": 20953139.0, "step": 625 }, { "epoch": 0.037607449856733526, "grad_norm": 1.0488835573196411, "learning_rate": 1.876491646778043e-05, "loss": 0.6377, "mean_token_accuracy": 0.8059704065322876, "num_tokens": 21120819.0, "step": 630 }, { "epoch": 0.03790592168099331, "grad_norm": 0.9998219609260559, "learning_rate": 1.8914081145584727e-05, "loss": 0.647, "mean_token_accuracy": 0.800990104675293, "num_tokens": 21288499.0, "step": 635 }, { "epoch": 0.038204393505253106, "grad_norm": 0.9908907413482666, "learning_rate": 1.9063245823389023e-05, "loss": 0.6108, "mean_token_accuracy": 0.8143624067306519, "num_tokens": 21456179.0, "step": 640 }, { "epoch": 0.03850286532951289, "grad_norm": 1.0818791389465332, "learning_rate": 1.9212410501193318e-05, "loss": 0.683, "mean_token_accuracy": 0.7940295815467835, "num_tokens": 21623859.0, "step": 645 }, { "epoch": 0.03880133715377269, "grad_norm": 0.9645847082138062, "learning_rate": 1.9361575178997614e-05, "loss": 0.6214, "mean_token_accuracy": 0.812030303478241, "num_tokens": 21791539.0, "step": 650 }, { "epoch": 0.039099808978032474, "grad_norm": 1.0143963098526, "learning_rate": 1.951073985680191e-05, "loss": 0.6465, "mean_token_accuracy": 0.8018310785293579, "num_tokens": 21959219.0, "step": 655 }, { "epoch": 0.03939828080229226, "grad_norm": 0.9881317615509033, "learning_rate": 1.9659904534606206e-05, "loss": 0.6641, "mean_token_accuracy": 0.7982643365859985, "num_tokens": 22126899.0, "step": 660 }, { "epoch": 0.039696752626552055, "grad_norm": 1.0634610652923584, "learning_rate": 1.98090692124105e-05, "loss": 0.678, "mean_token_accuracy": 0.794172739982605, "num_tokens": 22294579.0, "step": 665 }, { "epoch": 0.03999522445081184, "grad_norm": 1.3175532817840576, "learning_rate": 1.9958233890214797e-05, "loss": 0.6285, "mean_token_accuracy": 0.8092270016670227, "num_tokens": 22462259.0, "step": 670 }, { "epoch": 0.040293696275071636, "grad_norm": 1.6304832696914673, "learning_rate": 2.0107398568019096e-05, "loss": 0.6206, "mean_token_accuracy": 0.8117857575416565, "num_tokens": 22629939.0, "step": 675 }, { "epoch": 0.04059216809933142, "grad_norm": 1.0529224872589111, "learning_rate": 2.025656324582339e-05, "loss": 0.6384, "mean_token_accuracy": 0.8057914733886719, "num_tokens": 22797619.0, "step": 680 }, { "epoch": 0.04089063992359121, "grad_norm": 0.9856318831443787, "learning_rate": 2.0405727923627684e-05, "loss": 0.6364, "mean_token_accuracy": 0.8046761274337768, "num_tokens": 22965299.0, "step": 685 }, { "epoch": 0.041189111747851004, "grad_norm": 1.0770418643951416, "learning_rate": 2.0554892601431983e-05, "loss": 0.6511, "mean_token_accuracy": 0.80174161195755, "num_tokens": 23132979.0, "step": 690 }, { "epoch": 0.04148758357211079, "grad_norm": 1.185211181640625, "learning_rate": 2.0704057279236276e-05, "loss": 0.6757, "mean_token_accuracy": 0.7944113135337829, "num_tokens": 23300659.0, "step": 695 }, { "epoch": 0.041786055396370585, "grad_norm": 1.0646095275878906, "learning_rate": 2.0853221957040575e-05, "loss": 0.71, "mean_token_accuracy": 0.7840629816055298, "num_tokens": 23468339.0, "step": 700 }, { "epoch": 0.04208452722063037, "grad_norm": 1.0418790578842163, "learning_rate": 2.100238663484487e-05, "loss": 0.6645, "mean_token_accuracy": 0.7989860415458679, "num_tokens": 23636019.0, "step": 705 }, { "epoch": 0.042382999044890166, "grad_norm": 1.008773684501648, "learning_rate": 2.1151551312649167e-05, "loss": 0.6334, "mean_token_accuracy": 0.8068233370780945, "num_tokens": 23803699.0, "step": 710 }, { "epoch": 0.04268147086914995, "grad_norm": 1.1428587436676025, "learning_rate": 2.1300715990453462e-05, "loss": 0.6831, "mean_token_accuracy": 0.7940176486968994, "num_tokens": 23971379.0, "step": 715 }, { "epoch": 0.04297994269340974, "grad_norm": 1.1082713603973389, "learning_rate": 2.1449880668257758e-05, "loss": 0.6637, "mean_token_accuracy": 0.7991053223609924, "num_tokens": 24139059.0, "step": 720 }, { "epoch": 0.04327841451766953, "grad_norm": 1.0348970890045166, "learning_rate": 2.1599045346062054e-05, "loss": 0.6329, "mean_token_accuracy": 0.807855200767517, "num_tokens": 24306739.0, "step": 725 }, { "epoch": 0.04357688634192932, "grad_norm": 0.9988929033279419, "learning_rate": 2.174821002386635e-05, "loss": 0.6476, "mean_token_accuracy": 0.8021412372589112, "num_tokens": 24474419.0, "step": 730 }, { "epoch": 0.043875358166189114, "grad_norm": 1.1111482381820679, "learning_rate": 2.1897374701670645e-05, "loss": 0.648, "mean_token_accuracy": 0.8020696640014648, "num_tokens": 24642099.0, "step": 735 }, { "epoch": 0.0441738299904489, "grad_norm": 1.0283012390136719, "learning_rate": 2.204653937947494e-05, "loss": 0.6115, "mean_token_accuracy": 0.8134319424629212, "num_tokens": 24809779.0, "step": 740 }, { "epoch": 0.04447230181470869, "grad_norm": 1.0280516147613525, "learning_rate": 2.2195704057279237e-05, "loss": 0.647, "mean_token_accuracy": 0.8046164870262146, "num_tokens": 24977459.0, "step": 745 }, { "epoch": 0.04477077363896848, "grad_norm": 0.9584552049636841, "learning_rate": 2.2344868735083533e-05, "loss": 0.6557, "mean_token_accuracy": 0.8007097721099854, "num_tokens": 25145139.0, "step": 750 }, { "epoch": 0.04506924546322827, "grad_norm": 1.1966413259506226, "learning_rate": 2.249403341288783e-05, "loss": 0.7648, "mean_token_accuracy": 0.7722533583641052, "num_tokens": 25312819.0, "step": 755 }, { "epoch": 0.04536771728748806, "grad_norm": 1.034635066986084, "learning_rate": 2.2643198090692124e-05, "loss": 0.6048, "mean_token_accuracy": 0.8161875128746032, "num_tokens": 25480499.0, "step": 760 }, { "epoch": 0.04566618911174785, "grad_norm": 0.9553530812263489, "learning_rate": 2.2792362768496423e-05, "loss": 0.7142, "mean_token_accuracy": 0.7846653819084167, "num_tokens": 25648179.0, "step": 765 }, { "epoch": 0.045964660936007644, "grad_norm": 0.9542186856269836, "learning_rate": 2.294152744630072e-05, "loss": 0.6467, "mean_token_accuracy": 0.8023380637168884, "num_tokens": 25815859.0, "step": 770 }, { "epoch": 0.04626313276026743, "grad_norm": 1.106665849685669, "learning_rate": 2.309069212410501e-05, "loss": 0.6754, "mean_token_accuracy": 0.7965465784072876, "num_tokens": 25983539.0, "step": 775 }, { "epoch": 0.04656160458452722, "grad_norm": 1.0282317399978638, "learning_rate": 2.323985680190931e-05, "loss": 0.7013, "mean_token_accuracy": 0.7864428043365479, "num_tokens": 26151219.0, "step": 780 }, { "epoch": 0.04686007640878701, "grad_norm": 1.0247565507888794, "learning_rate": 2.3389021479713606e-05, "loss": 0.6184, "mean_token_accuracy": 0.8110700130462647, "num_tokens": 26318899.0, "step": 785 }, { "epoch": 0.0471585482330468, "grad_norm": 0.9614169597625732, "learning_rate": 2.35381861575179e-05, "loss": 0.6165, "mean_token_accuracy": 0.8113205313682557, "num_tokens": 26486579.0, "step": 790 }, { "epoch": 0.04745702005730659, "grad_norm": 1.0571271181106567, "learning_rate": 2.3687350835322198e-05, "loss": 0.6383, "mean_token_accuracy": 0.8051592469215393, "num_tokens": 26654259.0, "step": 795 }, { "epoch": 0.04775549188156638, "grad_norm": 1.024620532989502, "learning_rate": 2.3836515513126494e-05, "loss": 0.6759, "mean_token_accuracy": 0.7939759016036987, "num_tokens": 26821939.0, "step": 800 }, { "epoch": 0.04805396370582617, "grad_norm": 1.002708077430725, "learning_rate": 2.398568019093079e-05, "loss": 0.6626, "mean_token_accuracy": 0.7992604017257691, "num_tokens": 26989619.0, "step": 805 }, { "epoch": 0.04835243553008596, "grad_norm": 1.1294392347335815, "learning_rate": 2.4134844868735085e-05, "loss": 0.6854, "mean_token_accuracy": 0.7950435400009155, "num_tokens": 27157299.0, "step": 810 }, { "epoch": 0.04865090735434575, "grad_norm": 1.0987032651901245, "learning_rate": 2.428400954653938e-05, "loss": 0.6595, "mean_token_accuracy": 0.7997435331344604, "num_tokens": 27324979.0, "step": 815 }, { "epoch": 0.04894937917860554, "grad_norm": 1.0007015466690063, "learning_rate": 2.4433174224343677e-05, "loss": 0.6433, "mean_token_accuracy": 0.8039246082305909, "num_tokens": 27492659.0, "step": 820 }, { "epoch": 0.04924785100286533, "grad_norm": 1.1488895416259766, "learning_rate": 2.4582338902147972e-05, "loss": 0.6779, "mean_token_accuracy": 0.7940593957901001, "num_tokens": 27660339.0, "step": 825 }, { "epoch": 0.04954632282712512, "grad_norm": 1.0630240440368652, "learning_rate": 2.473150357995227e-05, "loss": 0.6117, "mean_token_accuracy": 0.81239413022995, "num_tokens": 27828019.0, "step": 830 }, { "epoch": 0.04984479465138491, "grad_norm": 1.0150984525680542, "learning_rate": 2.4880668257756564e-05, "loss": 0.6497, "mean_token_accuracy": 0.8021114230155945, "num_tokens": 27995699.0, "step": 835 }, { "epoch": 0.050143266475644696, "grad_norm": 0.9663558006286621, "learning_rate": 2.502983293556086e-05, "loss": 0.5899, "mean_token_accuracy": 0.8176234364509583, "num_tokens": 28158354.0, "step": 840 }, { "epoch": 0.05044173829990449, "grad_norm": 0.8831707239151001, "learning_rate": 2.517899761336516e-05, "loss": 0.6278, "mean_token_accuracy": 0.8061493515968323, "num_tokens": 28326034.0, "step": 845 }, { "epoch": 0.05074021012416428, "grad_norm": 1.0511727333068848, "learning_rate": 2.5328162291169455e-05, "loss": 0.6259, "mean_token_accuracy": 0.8092270135879517, "num_tokens": 28493714.0, "step": 850 }, { "epoch": 0.05103868194842407, "grad_norm": 1.0011943578720093, "learning_rate": 2.5477326968973747e-05, "loss": 0.6973, "mean_token_accuracy": 0.7880174160003662, "num_tokens": 28661394.0, "step": 855 }, { "epoch": 0.05133715377268386, "grad_norm": 1.098341941833496, "learning_rate": 2.5626491646778046e-05, "loss": 0.7285, "mean_token_accuracy": 0.781629490852356, "num_tokens": 28829074.0, "step": 860 }, { "epoch": 0.05163562559694365, "grad_norm": 1.0473344326019287, "learning_rate": 2.5775656324582342e-05, "loss": 0.6242, "mean_token_accuracy": 0.8083561778068542, "num_tokens": 28996754.0, "step": 865 }, { "epoch": 0.05193409742120344, "grad_norm": 1.2244330644607544, "learning_rate": 2.5924821002386634e-05, "loss": 0.639, "mean_token_accuracy": 0.8053978323936463, "num_tokens": 29164434.0, "step": 870 }, { "epoch": 0.052232569245463226, "grad_norm": 1.067139983177185, "learning_rate": 2.6073985680190933e-05, "loss": 0.7022, "mean_token_accuracy": 0.7878802418708801, "num_tokens": 29332114.0, "step": 875 }, { "epoch": 0.05253104106972302, "grad_norm": 0.8787959814071655, "learning_rate": 2.622315035799523e-05, "loss": 0.6115, "mean_token_accuracy": 0.8129667282104492, "num_tokens": 29499794.0, "step": 880 }, { "epoch": 0.05282951289398281, "grad_norm": 0.9647467136383057, "learning_rate": 2.637231503579952e-05, "loss": 0.6358, "mean_token_accuracy": 0.8050101399421692, "num_tokens": 29667474.0, "step": 885 }, { "epoch": 0.0531279847182426, "grad_norm": 1.1024103164672852, "learning_rate": 2.652147971360382e-05, "loss": 0.6797, "mean_token_accuracy": 0.7921806097030639, "num_tokens": 29835154.0, "step": 890 }, { "epoch": 0.05342645654250239, "grad_norm": 0.9410717487335205, "learning_rate": 2.6670644391408116e-05, "loss": 0.6672, "mean_token_accuracy": 0.7984790682792664, "num_tokens": 30002834.0, "step": 895 }, { "epoch": 0.053724928366762174, "grad_norm": 0.9704106450080872, "learning_rate": 2.681980906921241e-05, "loss": 0.6639, "mean_token_accuracy": 0.7983418941497803, "num_tokens": 30170514.0, "step": 900 }, { "epoch": 0.05402340019102197, "grad_norm": 1.0035582780838013, "learning_rate": 2.6968973747016708e-05, "loss": 0.6865, "mean_token_accuracy": 0.7923416495323181, "num_tokens": 30338194.0, "step": 905 }, { "epoch": 0.054321872015281755, "grad_norm": 0.8491612672805786, "learning_rate": 2.7118138424821004e-05, "loss": 0.5953, "mean_token_accuracy": 0.8169628977775574, "num_tokens": 30505874.0, "step": 910 }, { "epoch": 0.05462034383954155, "grad_norm": 1.0254725217819214, "learning_rate": 2.7267303102625303e-05, "loss": 0.6688, "mean_token_accuracy": 0.796892523765564, "num_tokens": 30673554.0, "step": 915 }, { "epoch": 0.054918815663801336, "grad_norm": 0.8602766394615173, "learning_rate": 2.7416467780429595e-05, "loss": 0.6036, "mean_token_accuracy": 0.8152809262275695, "num_tokens": 30841234.0, "step": 920 }, { "epoch": 0.05521728748806113, "grad_norm": 1.065596103668213, "learning_rate": 2.756563245823389e-05, "loss": 0.6769, "mean_token_accuracy": 0.7956161379814148, "num_tokens": 31008914.0, "step": 925 }, { "epoch": 0.05551575931232092, "grad_norm": 0.9849673509597778, "learning_rate": 2.771479713603819e-05, "loss": 0.6844, "mean_token_accuracy": 0.7949421405792236, "num_tokens": 31176594.0, "step": 930 }, { "epoch": 0.055814231136580704, "grad_norm": 0.9741759896278381, "learning_rate": 2.7863961813842482e-05, "loss": 0.7201, "mean_token_accuracy": 0.7840331554412842, "num_tokens": 31344274.0, "step": 935 }, { "epoch": 0.0561127029608405, "grad_norm": 1.0898329019546509, "learning_rate": 2.8013126491646778e-05, "loss": 0.6348, "mean_token_accuracy": 0.8053441405296325, "num_tokens": 31511954.0, "step": 940 }, { "epoch": 0.056411174785100285, "grad_norm": 0.9574422836303711, "learning_rate": 2.8162291169451077e-05, "loss": 0.6288, "mean_token_accuracy": 0.809197187423706, "num_tokens": 31679634.0, "step": 945 }, { "epoch": 0.05670964660936008, "grad_norm": 1.055638313293457, "learning_rate": 2.831145584725537e-05, "loss": 0.6808, "mean_token_accuracy": 0.7933794617652893, "num_tokens": 31847314.0, "step": 950 }, { "epoch": 0.057008118433619866, "grad_norm": 0.9862667322158813, "learning_rate": 2.8460620525059666e-05, "loss": 0.6321, "mean_token_accuracy": 0.807079803943634, "num_tokens": 32014994.0, "step": 955 }, { "epoch": 0.05730659025787966, "grad_norm": 1.005582332611084, "learning_rate": 2.8609785202863965e-05, "loss": 0.658, "mean_token_accuracy": 0.7998329877853394, "num_tokens": 32182674.0, "step": 960 }, { "epoch": 0.05760506208213945, "grad_norm": 0.9280803203582764, "learning_rate": 2.8758949880668257e-05, "loss": 0.5842, "mean_token_accuracy": 0.8203745722770691, "num_tokens": 32350354.0, "step": 965 }, { "epoch": 0.057903533906399234, "grad_norm": 0.9932334423065186, "learning_rate": 2.8908114558472553e-05, "loss": 0.6319, "mean_token_accuracy": 0.8079804301261901, "num_tokens": 32518034.0, "step": 970 }, { "epoch": 0.05820200573065903, "grad_norm": 0.9722811579704285, "learning_rate": 2.9057279236276852e-05, "loss": 0.6078, "mean_token_accuracy": 0.8157580852508545, "num_tokens": 32685714.0, "step": 975 }, { "epoch": 0.058500477554918814, "grad_norm": 0.9054681658744812, "learning_rate": 2.920644391408115e-05, "loss": 0.626, "mean_token_accuracy": 0.8077597498893738, "num_tokens": 32853394.0, "step": 980 }, { "epoch": 0.05879894937917861, "grad_norm": 1.0661559104919434, "learning_rate": 2.935560859188544e-05, "loss": 0.6347, "mean_token_accuracy": 0.8061911106109619, "num_tokens": 33021074.0, "step": 985 }, { "epoch": 0.059097421203438395, "grad_norm": 1.0102673768997192, "learning_rate": 2.950477326968974e-05, "loss": 0.6031, "mean_token_accuracy": 0.8141417145729065, "num_tokens": 33188754.0, "step": 990 }, { "epoch": 0.05939589302769818, "grad_norm": 1.2798126935958862, "learning_rate": 2.965393794749404e-05, "loss": 0.6373, "mean_token_accuracy": 0.8037635564804078, "num_tokens": 33356434.0, "step": 995 }, { "epoch": 0.059694364851957976, "grad_norm": 0.8796353936195374, "learning_rate": 2.980310262529833e-05, "loss": 0.6399, "mean_token_accuracy": 0.8031969428062439, "num_tokens": 33524114.0, "step": 1000 }, { "epoch": 0.05999283667621776, "grad_norm": 1.0169320106506348, "learning_rate": 2.9952267303102627e-05, "loss": 0.6342, "mean_token_accuracy": 0.8077120304107666, "num_tokens": 33691794.0, "step": 1005 }, { "epoch": 0.06029130850047756, "grad_norm": 0.855898380279541, "learning_rate": 3.0101431980906926e-05, "loss": 0.5901, "mean_token_accuracy": 0.81674222946167, "num_tokens": 33859474.0, "step": 1010 }, { "epoch": 0.060589780324737344, "grad_norm": 0.8868576884269714, "learning_rate": 3.0250596658711218e-05, "loss": 0.6453, "mean_token_accuracy": 0.8030895829200745, "num_tokens": 34027154.0, "step": 1015 }, { "epoch": 0.06088825214899714, "grad_norm": 0.9104098677635193, "learning_rate": 3.0399761336515514e-05, "loss": 0.6973, "mean_token_accuracy": 0.7892520546913147, "num_tokens": 34194834.0, "step": 1020 }, { "epoch": 0.061186723973256925, "grad_norm": 0.9279430508613586, "learning_rate": 3.054892601431981e-05, "loss": 0.6034, "mean_token_accuracy": 0.8136287808418274, "num_tokens": 34362514.0, "step": 1025 }, { "epoch": 0.06148519579751671, "grad_norm": 0.972676694393158, "learning_rate": 3.06980906921241e-05, "loss": 0.6801, "mean_token_accuracy": 0.7960455656051636, "num_tokens": 34530194.0, "step": 1030 }, { "epoch": 0.061783667621776506, "grad_norm": 1.1245499849319458, "learning_rate": 3.08472553699284e-05, "loss": 0.6116, "mean_token_accuracy": 0.8137480616569519, "num_tokens": 34697874.0, "step": 1035 }, { "epoch": 0.06208213944603629, "grad_norm": 0.9801743030548096, "learning_rate": 3.09964200477327e-05, "loss": 0.615, "mean_token_accuracy": 0.8098950266838074, "num_tokens": 34865554.0, "step": 1040 }, { "epoch": 0.06238061127029609, "grad_norm": 0.9435410499572754, "learning_rate": 3.1145584725537e-05, "loss": 0.6644, "mean_token_accuracy": 0.7998210787773132, "num_tokens": 35033234.0, "step": 1045 }, { "epoch": 0.06267908309455587, "grad_norm": 0.9696136713027954, "learning_rate": 3.129474940334129e-05, "loss": 0.6557, "mean_token_accuracy": 0.8016775250434875, "num_tokens": 35193806.0, "step": 1050 }, { "epoch": 0.06297755491881567, "grad_norm": 0.8354589939117432, "learning_rate": 3.1443914081145584e-05, "loss": 0.6388, "mean_token_accuracy": 0.8039425134658813, "num_tokens": 35361486.0, "step": 1055 }, { "epoch": 0.06327602674307545, "grad_norm": 0.959191620349884, "learning_rate": 3.159307875894988e-05, "loss": 0.6353, "mean_token_accuracy": 0.8077001094818115, "num_tokens": 35529166.0, "step": 1060 }, { "epoch": 0.06357449856733524, "grad_norm": 0.9490755200386047, "learning_rate": 3.1742243436754176e-05, "loss": 0.6416, "mean_token_accuracy": 0.8047655940055847, "num_tokens": 35696846.0, "step": 1065 }, { "epoch": 0.06387297039159504, "grad_norm": 0.9717972278594971, "learning_rate": 3.1891408114558475e-05, "loss": 0.6697, "mean_token_accuracy": 0.7956936836242676, "num_tokens": 35864526.0, "step": 1070 }, { "epoch": 0.06417144221585483, "grad_norm": 0.8323968648910522, "learning_rate": 3.2040572792362774e-05, "loss": 0.6138, "mean_token_accuracy": 0.8139389395713806, "num_tokens": 36032206.0, "step": 1075 }, { "epoch": 0.06446991404011461, "grad_norm": 0.9752675294876099, "learning_rate": 3.2189737470167066e-05, "loss": 0.673, "mean_token_accuracy": 0.7970953106880188, "num_tokens": 36199886.0, "step": 1080 }, { "epoch": 0.0647683858643744, "grad_norm": 1.1442911624908447, "learning_rate": 3.233890214797136e-05, "loss": 0.7082, "mean_token_accuracy": 0.7877609491348266, "num_tokens": 36367566.0, "step": 1085 }, { "epoch": 0.0650668576886342, "grad_norm": 1.037018060684204, "learning_rate": 3.248806682577566e-05, "loss": 0.6543, "mean_token_accuracy": 0.8013300776481629, "num_tokens": 36535246.0, "step": 1090 }, { "epoch": 0.06536532951289398, "grad_norm": 1.0444648265838623, "learning_rate": 3.263723150357995e-05, "loss": 0.6876, "mean_token_accuracy": 0.7919957041740417, "num_tokens": 36702926.0, "step": 1095 }, { "epoch": 0.06566380133715377, "grad_norm": 1.0123920440673828, "learning_rate": 3.278639618138425e-05, "loss": 0.6297, "mean_token_accuracy": 0.8090182542800903, "num_tokens": 36870606.0, "step": 1100 }, { "epoch": 0.06596227316141356, "grad_norm": 0.96045982837677, "learning_rate": 3.293556085918855e-05, "loss": 0.6734, "mean_token_accuracy": 0.7954491376876831, "num_tokens": 37038286.0, "step": 1105 }, { "epoch": 0.06626074498567336, "grad_norm": 0.9951871633529663, "learning_rate": 3.308472553699285e-05, "loss": 0.6348, "mean_token_accuracy": 0.8046761274337768, "num_tokens": 37205966.0, "step": 1110 }, { "epoch": 0.06655921680993314, "grad_norm": 1.0533862113952637, "learning_rate": 3.323389021479713e-05, "loss": 0.6228, "mean_token_accuracy": 0.810300612449646, "num_tokens": 37373646.0, "step": 1115 }, { "epoch": 0.06685768863419293, "grad_norm": 0.9938288927078247, "learning_rate": 3.338305489260143e-05, "loss": 0.6618, "mean_token_accuracy": 0.7987295746803283, "num_tokens": 37541326.0, "step": 1120 }, { "epoch": 0.06715616045845273, "grad_norm": 0.910653293132782, "learning_rate": 3.353221957040573e-05, "loss": 0.6518, "mean_token_accuracy": 0.802618408203125, "num_tokens": 37709006.0, "step": 1125 }, { "epoch": 0.0674546322827125, "grad_norm": 0.8506009578704834, "learning_rate": 3.3681384248210024e-05, "loss": 0.636, "mean_token_accuracy": 0.8066980719566346, "num_tokens": 37876686.0, "step": 1130 }, { "epoch": 0.0677531041069723, "grad_norm": 0.8333922028541565, "learning_rate": 3.383054892601432e-05, "loss": 0.6315, "mean_token_accuracy": 0.8075211763381958, "num_tokens": 38044366.0, "step": 1135 }, { "epoch": 0.0680515759312321, "grad_norm": 0.8634788393974304, "learning_rate": 3.397971360381862e-05, "loss": 0.6177, "mean_token_accuracy": 0.813157594203949, "num_tokens": 38212046.0, "step": 1140 }, { "epoch": 0.06835004775549189, "grad_norm": 1.0894731283187866, "learning_rate": 3.4128878281622915e-05, "loss": 0.6542, "mean_token_accuracy": 0.8000238537788391, "num_tokens": 38379726.0, "step": 1145 }, { "epoch": 0.06864851957975167, "grad_norm": 0.877989649772644, "learning_rate": 3.427804295942721e-05, "loss": 0.6432, "mean_token_accuracy": 0.8051831126213074, "num_tokens": 38547406.0, "step": 1150 }, { "epoch": 0.06894699140401146, "grad_norm": 1.0099589824676514, "learning_rate": 3.4427207637231506e-05, "loss": 0.6757, "mean_token_accuracy": 0.7953238725662232, "num_tokens": 38715086.0, "step": 1155 }, { "epoch": 0.06924546322827126, "grad_norm": 0.991763710975647, "learning_rate": 3.45763723150358e-05, "loss": 0.6467, "mean_token_accuracy": 0.8043301939964295, "num_tokens": 38882766.0, "step": 1160 }, { "epoch": 0.06954393505253104, "grad_norm": 0.9633587598800659, "learning_rate": 3.47255369928401e-05, "loss": 0.6407, "mean_token_accuracy": 0.8049006700515747, "num_tokens": 39050412.0, "step": 1165 }, { "epoch": 0.06984240687679083, "grad_norm": 0.8255019187927246, "learning_rate": 3.48747016706444e-05, "loss": 0.6417, "mean_token_accuracy": 0.8049206852912902, "num_tokens": 39218092.0, "step": 1170 }, { "epoch": 0.07014087870105062, "grad_norm": 0.8424438238143921, "learning_rate": 3.502386634844869e-05, "loss": 0.6699, "mean_token_accuracy": 0.7992604017257691, "num_tokens": 39385772.0, "step": 1175 }, { "epoch": 0.0704393505253104, "grad_norm": 0.9406603574752808, "learning_rate": 3.517303102625298e-05, "loss": 0.6198, "mean_token_accuracy": 0.8089884281158447, "num_tokens": 39553452.0, "step": 1180 }, { "epoch": 0.0707378223495702, "grad_norm": 0.9525007605552673, "learning_rate": 3.532219570405728e-05, "loss": 0.6857, "mean_token_accuracy": 0.7931349277496338, "num_tokens": 39721132.0, "step": 1185 }, { "epoch": 0.07103629417382999, "grad_norm": 0.9634766578674316, "learning_rate": 3.547136038186158e-05, "loss": 0.6574, "mean_token_accuracy": 0.7991649746894837, "num_tokens": 39888812.0, "step": 1190 }, { "epoch": 0.07133476599808979, "grad_norm": 1.0032249689102173, "learning_rate": 3.562052505966587e-05, "loss": 0.7442, "mean_token_accuracy": 0.7792198657989502, "num_tokens": 40056492.0, "step": 1195 }, { "epoch": 0.07163323782234957, "grad_norm": 0.8983254432678223, "learning_rate": 3.576968973747017e-05, "loss": 0.6583, "mean_token_accuracy": 0.8014374256134034, "num_tokens": 40224172.0, "step": 1200 }, { "epoch": 0.07193170964660936, "grad_norm": 0.9061633348464966, "learning_rate": 3.5918854415274464e-05, "loss": 0.6116, "mean_token_accuracy": 0.8129309177398681, "num_tokens": 40391852.0, "step": 1205 }, { "epoch": 0.07223018147086915, "grad_norm": 0.8976230621337891, "learning_rate": 3.6068019093078756e-05, "loss": 0.6341, "mean_token_accuracy": 0.8084218025207519, "num_tokens": 40559532.0, "step": 1210 }, { "epoch": 0.07252865329512893, "grad_norm": 0.8993825316429138, "learning_rate": 3.6217183770883055e-05, "loss": 0.6783, "mean_token_accuracy": 0.7957413792610168, "num_tokens": 40727212.0, "step": 1215 }, { "epoch": 0.07282712511938873, "grad_norm": 0.9418155550956726, "learning_rate": 3.6366348448687354e-05, "loss": 0.5908, "mean_token_accuracy": 0.8176905632019043, "num_tokens": 40894892.0, "step": 1220 }, { "epoch": 0.07312559694364852, "grad_norm": 0.9506223201751709, "learning_rate": 3.651551312649165e-05, "loss": 0.6605, "mean_token_accuracy": 0.7986281871795654, "num_tokens": 41062572.0, "step": 1225 }, { "epoch": 0.07342406876790832, "grad_norm": 0.9626055955886841, "learning_rate": 3.6664677804295946e-05, "loss": 0.6432, "mean_token_accuracy": 0.8047894597053528, "num_tokens": 41230252.0, "step": 1230 }, { "epoch": 0.0737225405921681, "grad_norm": 0.9560362696647644, "learning_rate": 3.681384248210024e-05, "loss": 0.6858, "mean_token_accuracy": 0.7895235776901245, "num_tokens": 41390132.0, "step": 1235 }, { "epoch": 0.07402101241642789, "grad_norm": 0.8743177652359009, "learning_rate": 3.696300715990454e-05, "loss": 0.6634, "mean_token_accuracy": 0.7970774173736572, "num_tokens": 41557812.0, "step": 1240 }, { "epoch": 0.07431948424068768, "grad_norm": 0.831082284450531, "learning_rate": 3.711217183770883e-05, "loss": 0.6038, "mean_token_accuracy": 0.8142013549804688, "num_tokens": 41725492.0, "step": 1245 }, { "epoch": 0.07461795606494746, "grad_norm": 0.9832303524017334, "learning_rate": 3.726133651551313e-05, "loss": 0.6486, "mean_token_accuracy": 0.802982223033905, "num_tokens": 41893172.0, "step": 1250 }, { "epoch": 0.07491642788920726, "grad_norm": 0.9175562262535095, "learning_rate": 3.741050119331743e-05, "loss": 0.6443, "mean_token_accuracy": 0.8030120491981506, "num_tokens": 42060852.0, "step": 1255 }, { "epoch": 0.07521489971346705, "grad_norm": 0.9284515976905823, "learning_rate": 3.755966587112172e-05, "loss": 0.6633, "mean_token_accuracy": 0.7998687863349915, "num_tokens": 42228532.0, "step": 1260 }, { "epoch": 0.07551337153772684, "grad_norm": 1.1153919696807861, "learning_rate": 3.770883054892602e-05, "loss": 0.6454, "mean_token_accuracy": 0.8018296599388123, "num_tokens": 42389862.0, "step": 1265 }, { "epoch": 0.07581184336198662, "grad_norm": 0.9243282079696655, "learning_rate": 3.785799522673031e-05, "loss": 0.675, "mean_token_accuracy": 0.7935524225234986, "num_tokens": 42557542.0, "step": 1270 }, { "epoch": 0.07611031518624642, "grad_norm": 0.8942394256591797, "learning_rate": 3.8007159904534604e-05, "loss": 0.6568, "mean_token_accuracy": 0.800501024723053, "num_tokens": 42725222.0, "step": 1275 }, { "epoch": 0.07640878701050621, "grad_norm": 0.8383049964904785, "learning_rate": 3.8156324582338903e-05, "loss": 0.6224, "mean_token_accuracy": 0.8080102682113648, "num_tokens": 42892902.0, "step": 1280 }, { "epoch": 0.07670725883476599, "grad_norm": 0.926938533782959, "learning_rate": 3.83054892601432e-05, "loss": 0.6166, "mean_token_accuracy": 0.8101336002349854, "num_tokens": 43060582.0, "step": 1285 }, { "epoch": 0.07700573065902579, "grad_norm": 0.8695472478866577, "learning_rate": 3.8454653937947495e-05, "loss": 0.6384, "mean_token_accuracy": 0.8027496099472046, "num_tokens": 43228262.0, "step": 1290 }, { "epoch": 0.07730420248328558, "grad_norm": 0.8851755261421204, "learning_rate": 3.8603818615751794e-05, "loss": 0.6554, "mean_token_accuracy": 0.8046105146408081, "num_tokens": 43395942.0, "step": 1295 }, { "epoch": 0.07760267430754537, "grad_norm": 0.9108031988143921, "learning_rate": 3.8752983293556087e-05, "loss": 0.6103, "mean_token_accuracy": 0.8136168360710144, "num_tokens": 43563622.0, "step": 1300 }, { "epoch": 0.07790114613180515, "grad_norm": 0.9888355135917664, "learning_rate": 3.8902147971360386e-05, "loss": 0.6626, "mean_token_accuracy": 0.7978468298912048, "num_tokens": 43731302.0, "step": 1305 }, { "epoch": 0.07819961795606495, "grad_norm": 0.9139824509620667, "learning_rate": 3.905131264916468e-05, "loss": 0.6671, "mean_token_accuracy": 0.7976440548896789, "num_tokens": 43898982.0, "step": 1310 }, { "epoch": 0.07849808978032474, "grad_norm": 0.8039135336875916, "learning_rate": 3.920047732696898e-05, "loss": 0.616, "mean_token_accuracy": 0.8116008639335632, "num_tokens": 44066662.0, "step": 1315 }, { "epoch": 0.07879656160458452, "grad_norm": 0.8862246870994568, "learning_rate": 3.9349642004773276e-05, "loss": 0.645, "mean_token_accuracy": 0.8045687794685363, "num_tokens": 44234342.0, "step": 1320 }, { "epoch": 0.07909503342884432, "grad_norm": 0.8758423328399658, "learning_rate": 3.949880668257757e-05, "loss": 0.6708, "mean_token_accuracy": 0.7965107917785644, "num_tokens": 44402022.0, "step": 1325 }, { "epoch": 0.07939350525310411, "grad_norm": 0.8791078925132751, "learning_rate": 3.964797136038186e-05, "loss": 0.6026, "mean_token_accuracy": 0.8155731916427612, "num_tokens": 44569702.0, "step": 1330 }, { "epoch": 0.07969197707736389, "grad_norm": 0.8547496199607849, "learning_rate": 3.979713603818616e-05, "loss": 0.6969, "mean_token_accuracy": 0.7891864538192749, "num_tokens": 44737382.0, "step": 1335 }, { "epoch": 0.07999044890162368, "grad_norm": 1.9958553314208984, "learning_rate": 3.994630071599045e-05, "loss": 0.6901, "mean_token_accuracy": 0.7919897437095642, "num_tokens": 44905062.0, "step": 1340 }, { "epoch": 0.08028892072588348, "grad_norm": 0.9849021434783936, "learning_rate": 4.009546539379475e-05, "loss": 0.6632, "mean_token_accuracy": 0.798646068572998, "num_tokens": 45072742.0, "step": 1345 }, { "epoch": 0.08058739255014327, "grad_norm": 0.9506638646125793, "learning_rate": 4.024463007159905e-05, "loss": 0.6586, "mean_token_accuracy": 0.8004473328590394, "num_tokens": 45240422.0, "step": 1350 }, { "epoch": 0.08088586437440305, "grad_norm": 0.8287100791931152, "learning_rate": 4.039379474940334e-05, "loss": 0.6131, "mean_token_accuracy": 0.812656569480896, "num_tokens": 45408102.0, "step": 1355 }, { "epoch": 0.08118433619866285, "grad_norm": 0.850650429725647, "learning_rate": 4.0542959427207636e-05, "loss": 0.6021, "mean_token_accuracy": 0.8153942584991455, "num_tokens": 45575782.0, "step": 1360 }, { "epoch": 0.08148280802292264, "grad_norm": 0.8990254402160645, "learning_rate": 4.0692124105011935e-05, "loss": 0.6228, "mean_token_accuracy": 0.8096624255180359, "num_tokens": 45743462.0, "step": 1365 }, { "epoch": 0.08178127984718242, "grad_norm": 0.9089404940605164, "learning_rate": 4.0841288782816234e-05, "loss": 0.6486, "mean_token_accuracy": 0.802856981754303, "num_tokens": 45911142.0, "step": 1370 }, { "epoch": 0.08207975167144221, "grad_norm": 0.8626326322555542, "learning_rate": 4.0990453460620526e-05, "loss": 0.6474, "mean_token_accuracy": 0.8042645812034607, "num_tokens": 46078822.0, "step": 1375 }, { "epoch": 0.08237822349570201, "grad_norm": 0.969448983669281, "learning_rate": 4.1139618138424825e-05, "loss": 0.6653, "mean_token_accuracy": 0.7979422569274902, "num_tokens": 46246502.0, "step": 1380 }, { "epoch": 0.0826766953199618, "grad_norm": 0.9010327458381653, "learning_rate": 4.1288782816229125e-05, "loss": 0.66, "mean_token_accuracy": 0.7994512677192688, "num_tokens": 46414182.0, "step": 1385 }, { "epoch": 0.08297516714422158, "grad_norm": 0.9436922073364258, "learning_rate": 4.143794749403341e-05, "loss": 0.6776, "mean_token_accuracy": 0.7937075138092041, "num_tokens": 46581862.0, "step": 1390 }, { "epoch": 0.08327363896848138, "grad_norm": 0.7983666658401489, "learning_rate": 4.158711217183771e-05, "loss": 0.6541, "mean_token_accuracy": 0.8005606651306152, "num_tokens": 46749542.0, "step": 1395 }, { "epoch": 0.08357211079274117, "grad_norm": 0.8562968373298645, "learning_rate": 4.173627684964201e-05, "loss": 0.6757, "mean_token_accuracy": 0.7943874716758728, "num_tokens": 46917222.0, "step": 1400 }, { "epoch": 0.08387058261700095, "grad_norm": 0.9290655255317688, "learning_rate": 4.18854415274463e-05, "loss": 0.6422, "mean_token_accuracy": 0.8043600082397461, "num_tokens": 47084902.0, "step": 1405 }, { "epoch": 0.08416905444126074, "grad_norm": 0.8598992228507996, "learning_rate": 4.20346062052506e-05, "loss": 0.5864, "mean_token_accuracy": 0.8187403321266175, "num_tokens": 47252582.0, "step": 1410 }, { "epoch": 0.08446752626552054, "grad_norm": 0.8854230046272278, "learning_rate": 4.21837708830549e-05, "loss": 0.6655, "mean_token_accuracy": 0.7973040819168091, "num_tokens": 47420262.0, "step": 1415 }, { "epoch": 0.08476599808978033, "grad_norm": 0.8565343618392944, "learning_rate": 4.233293556085919e-05, "loss": 0.669, "mean_token_accuracy": 0.797894561290741, "num_tokens": 47587942.0, "step": 1420 }, { "epoch": 0.08506446991404011, "grad_norm": 0.8702725768089294, "learning_rate": 4.2482100238663484e-05, "loss": 0.633, "mean_token_accuracy": 0.8055409669876099, "num_tokens": 47755622.0, "step": 1425 }, { "epoch": 0.0853629417382999, "grad_norm": 0.8215453028678894, "learning_rate": 4.263126491646778e-05, "loss": 0.6068, "mean_token_accuracy": 0.8145174741744995, "num_tokens": 47923302.0, "step": 1430 }, { "epoch": 0.0856614135625597, "grad_norm": 0.8121721148490906, "learning_rate": 4.278042959427208e-05, "loss": 0.6683, "mean_token_accuracy": 0.7968984842300415, "num_tokens": 48090982.0, "step": 1435 }, { "epoch": 0.08595988538681948, "grad_norm": 0.8051170706748962, "learning_rate": 4.2929594272076375e-05, "loss": 0.654, "mean_token_accuracy": 0.8024573564529419, "num_tokens": 48258662.0, "step": 1440 }, { "epoch": 0.08625835721107927, "grad_norm": 0.7850134372711182, "learning_rate": 4.3078758949880674e-05, "loss": 0.6628, "mean_token_accuracy": 0.8003817200660706, "num_tokens": 48426342.0, "step": 1445 }, { "epoch": 0.08655682903533907, "grad_norm": 0.8776177763938904, "learning_rate": 4.3227923627684966e-05, "loss": 0.6251, "mean_token_accuracy": 0.808494484424591, "num_tokens": 48588230.0, "step": 1450 }, { "epoch": 0.08685530085959886, "grad_norm": 0.781536340713501, "learning_rate": 4.337708830548926e-05, "loss": 0.6573, "mean_token_accuracy": 0.7996501803398133, "num_tokens": 48752708.0, "step": 1455 }, { "epoch": 0.08715377268385864, "grad_norm": 0.8784968852996826, "learning_rate": 4.352625298329356e-05, "loss": 0.7195, "mean_token_accuracy": 0.7821126103401184, "num_tokens": 48920388.0, "step": 1460 }, { "epoch": 0.08745224450811843, "grad_norm": 0.8595447540283203, "learning_rate": 4.367541766109786e-05, "loss": 0.6471, "mean_token_accuracy": 0.8021114230155945, "num_tokens": 49088068.0, "step": 1465 }, { "epoch": 0.08775071633237823, "grad_norm": 0.8538692593574524, "learning_rate": 4.382458233890215e-05, "loss": 0.6269, "mean_token_accuracy": 0.8087796688079834, "num_tokens": 49255748.0, "step": 1470 }, { "epoch": 0.08804918815663801, "grad_norm": 0.8793528079986572, "learning_rate": 4.397374701670645e-05, "loss": 0.6285, "mean_token_accuracy": 0.8089884281158447, "num_tokens": 49423428.0, "step": 1475 }, { "epoch": 0.0883476599808978, "grad_norm": 0.8972448706626892, "learning_rate": 4.412291169451074e-05, "loss": 0.6615, "mean_token_accuracy": 0.8002743601799012, "num_tokens": 49591108.0, "step": 1480 }, { "epoch": 0.0886461318051576, "grad_norm": 0.8688410520553589, "learning_rate": 4.427207637231503e-05, "loss": 0.6493, "mean_token_accuracy": 0.8021889448165893, "num_tokens": 49758788.0, "step": 1485 }, { "epoch": 0.08894460362941738, "grad_norm": 0.8287907242774963, "learning_rate": 4.442124105011933e-05, "loss": 0.6451, "mean_token_accuracy": 0.8048848867416382, "num_tokens": 49926468.0, "step": 1490 }, { "epoch": 0.08924307545367717, "grad_norm": 0.8825296759605408, "learning_rate": 4.457040572792363e-05, "loss": 0.6704, "mean_token_accuracy": 0.7964332580566407, "num_tokens": 50094148.0, "step": 1495 }, { "epoch": 0.08954154727793696, "grad_norm": 0.8199141621589661, "learning_rate": 4.4719570405727924e-05, "loss": 0.6339, "mean_token_accuracy": 0.8072169899940491, "num_tokens": 50261828.0, "step": 1500 }, { "epoch": 0.08984001910219676, "grad_norm": 0.949461817741394, "learning_rate": 4.486873508353222e-05, "loss": 0.6471, "mean_token_accuracy": 0.802469277381897, "num_tokens": 50429508.0, "step": 1505 }, { "epoch": 0.09013849092645654, "grad_norm": 0.8446555137634277, "learning_rate": 4.5017899761336515e-05, "loss": 0.6793, "mean_token_accuracy": 0.792818808555603, "num_tokens": 50597188.0, "step": 1510 }, { "epoch": 0.09043696275071633, "grad_norm": 0.8719688653945923, "learning_rate": 4.5167064439140814e-05, "loss": 0.6364, "mean_token_accuracy": 0.8059286594390869, "num_tokens": 50764868.0, "step": 1515 }, { "epoch": 0.09073543457497613, "grad_norm": 0.8748178482055664, "learning_rate": 4.531622911694511e-05, "loss": 0.6535, "mean_token_accuracy": 0.8021829843521118, "num_tokens": 50932548.0, "step": 1520 }, { "epoch": 0.0910339063992359, "grad_norm": 0.7743371725082397, "learning_rate": 4.5465393794749406e-05, "loss": 0.6319, "mean_token_accuracy": 0.8074018836021424, "num_tokens": 51100228.0, "step": 1525 }, { "epoch": 0.0913323782234957, "grad_norm": 0.7892270684242249, "learning_rate": 4.5614558472553705e-05, "loss": 0.6156, "mean_token_accuracy": 0.8116068243980408, "num_tokens": 51267908.0, "step": 1530 }, { "epoch": 0.0916308500477555, "grad_norm": 0.8279834389686584, "learning_rate": 4.5763723150358e-05, "loss": 0.6131, "mean_token_accuracy": 0.8119527697563171, "num_tokens": 51435588.0, "step": 1535 }, { "epoch": 0.09192932187201529, "grad_norm": 0.8303422331809998, "learning_rate": 4.5912887828162297e-05, "loss": 0.6335, "mean_token_accuracy": 0.8053501129150391, "num_tokens": 51603268.0, "step": 1540 }, { "epoch": 0.09222779369627507, "grad_norm": 0.8752642869949341, "learning_rate": 4.606205250596659e-05, "loss": 0.6315, "mean_token_accuracy": 0.8056185126304627, "num_tokens": 51770948.0, "step": 1545 }, { "epoch": 0.09252626552053486, "grad_norm": 0.8765949010848999, "learning_rate": 4.621121718377088e-05, "loss": 0.6409, "mean_token_accuracy": 0.8050578594207763, "num_tokens": 51938628.0, "step": 1550 }, { "epoch": 0.09282473734479466, "grad_norm": 0.8335886001586914, "learning_rate": 4.636038186157518e-05, "loss": 0.604, "mean_token_accuracy": 0.8138911962509155, "num_tokens": 52106308.0, "step": 1555 }, { "epoch": 0.09312320916905444, "grad_norm": 0.9149099588394165, "learning_rate": 4.650954653937948e-05, "loss": 0.6456, "mean_token_accuracy": 0.8014195322990417, "num_tokens": 52273988.0, "step": 1560 }, { "epoch": 0.09342168099331423, "grad_norm": 0.7945675253868103, "learning_rate": 4.665871121718377e-05, "loss": 0.6676, "mean_token_accuracy": 0.7962125658988952, "num_tokens": 52441668.0, "step": 1565 }, { "epoch": 0.09372015281757402, "grad_norm": 0.7990531325340271, "learning_rate": 4.680787589498807e-05, "loss": 0.6695, "mean_token_accuracy": 0.7956877112388611, "num_tokens": 52609348.0, "step": 1570 }, { "epoch": 0.09401862464183382, "grad_norm": 0.9765162467956543, "learning_rate": 4.6957040572792363e-05, "loss": 0.7025, "mean_token_accuracy": 0.7895502805709839, "num_tokens": 52777028.0, "step": 1575 }, { "epoch": 0.0943170964660936, "grad_norm": 0.8342398405075073, "learning_rate": 4.710620525059666e-05, "loss": 0.636, "mean_token_accuracy": 0.8052308201789856, "num_tokens": 52944708.0, "step": 1580 }, { "epoch": 0.09461556829035339, "grad_norm": 0.9157741665840149, "learning_rate": 4.7255369928400955e-05, "loss": 0.6463, "mean_token_accuracy": 0.8030001282691955, "num_tokens": 53112388.0, "step": 1585 }, { "epoch": 0.09491404011461319, "grad_norm": 0.8565534353256226, "learning_rate": 4.7404534606205254e-05, "loss": 0.6883, "mean_token_accuracy": 0.790916132926941, "num_tokens": 53280068.0, "step": 1590 }, { "epoch": 0.09521251193887297, "grad_norm": 0.9609672427177429, "learning_rate": 4.755369928400955e-05, "loss": 0.686, "mean_token_accuracy": 0.7923416495323181, "num_tokens": 53447748.0, "step": 1595 }, { "epoch": 0.09551098376313276, "grad_norm": 0.8258156776428223, "learning_rate": 4.7702863961813846e-05, "loss": 0.6341, "mean_token_accuracy": 0.8055588722229003, "num_tokens": 53615428.0, "step": 1600 }, { "epoch": 0.09580945558739255, "grad_norm": 0.9373846650123596, "learning_rate": 4.785202863961814e-05, "loss": 0.6391, "mean_token_accuracy": 0.8042407274246216, "num_tokens": 53783108.0, "step": 1605 }, { "epoch": 0.09610792741165235, "grad_norm": 0.8284741044044495, "learning_rate": 4.800119331742244e-05, "loss": 0.6916, "mean_token_accuracy": 0.790802824497223, "num_tokens": 53950788.0, "step": 1610 }, { "epoch": 0.09640639923591213, "grad_norm": 0.8239485025405884, "learning_rate": 4.815035799522673e-05, "loss": 0.6504, "mean_token_accuracy": 0.8026899695396423, "num_tokens": 54118468.0, "step": 1615 }, { "epoch": 0.09670487106017192, "grad_norm": 0.8749241232872009, "learning_rate": 4.829952267303103e-05, "loss": 0.6443, "mean_token_accuracy": 0.8018966794013977, "num_tokens": 54286148.0, "step": 1620 }, { "epoch": 0.09700334288443171, "grad_norm": 0.7732098698616028, "learning_rate": 4.844868735083533e-05, "loss": 0.6376, "mean_token_accuracy": 0.8057556986808777, "num_tokens": 54453828.0, "step": 1625 }, { "epoch": 0.0973018147086915, "grad_norm": 0.8085273504257202, "learning_rate": 4.859785202863962e-05, "loss": 0.6205, "mean_token_accuracy": 0.8094891905784607, "num_tokens": 54613284.0, "step": 1630 }, { "epoch": 0.09760028653295129, "grad_norm": 0.8560582399368286, "learning_rate": 4.874701670644391e-05, "loss": 0.6885, "mean_token_accuracy": 0.7920732498168945, "num_tokens": 54780964.0, "step": 1635 }, { "epoch": 0.09789875835721108, "grad_norm": 0.7586135268211365, "learning_rate": 4.889618138424821e-05, "loss": 0.6188, "mean_token_accuracy": 0.8088154554367065, "num_tokens": 54948644.0, "step": 1640 }, { "epoch": 0.09819723018147086, "grad_norm": 0.9643146395683289, "learning_rate": 4.904534606205251e-05, "loss": 0.6594, "mean_token_accuracy": 0.8002862930297852, "num_tokens": 55116324.0, "step": 1645 }, { "epoch": 0.09849570200573066, "grad_norm": 0.8251019716262817, "learning_rate": 4.91945107398568e-05, "loss": 0.5937, "mean_token_accuracy": 0.8172074437141419, "num_tokens": 55284004.0, "step": 1650 }, { "epoch": 0.09879417382999045, "grad_norm": 0.8162339329719543, "learning_rate": 4.93436754176611e-05, "loss": 0.6679, "mean_token_accuracy": 0.7967016577720643, "num_tokens": 55451684.0, "step": 1655 }, { "epoch": 0.09909264565425024, "grad_norm": 0.7879742383956909, "learning_rate": 4.9492840095465395e-05, "loss": 0.6224, "mean_token_accuracy": 0.8104497194290161, "num_tokens": 55619364.0, "step": 1660 }, { "epoch": 0.09939111747851002, "grad_norm": 0.8442058563232422, "learning_rate": 4.964200477326969e-05, "loss": 0.6383, "mean_token_accuracy": 0.8046403408050538, "num_tokens": 55787044.0, "step": 1665 }, { "epoch": 0.09968958930276982, "grad_norm": 0.8841798305511475, "learning_rate": 4.9791169451073986e-05, "loss": 0.6689, "mean_token_accuracy": 0.798007869720459, "num_tokens": 55954724.0, "step": 1670 }, { "epoch": 0.09998806112702961, "grad_norm": 0.8683707118034363, "learning_rate": 4.9940334128878285e-05, "loss": 0.677, "mean_token_accuracy": 0.7960396051406861, "num_tokens": 56122404.0, "step": 1675 }, { "epoch": 0.10028653295128939, "grad_norm": 0.7513850927352905, "learning_rate": 4.999999901354773e-05, "loss": 0.6123, "mean_token_accuracy": 0.8120779991149902, "num_tokens": 56290084.0, "step": 1680 }, { "epoch": 0.10058500477554919, "grad_norm": 0.8978567719459534, "learning_rate": 4.9999992985228636e-05, "loss": 0.6895, "mean_token_accuracy": 0.7923416495323181, "num_tokens": 56457764.0, "step": 1685 }, { "epoch": 0.10088347659980898, "grad_norm": 0.820931613445282, "learning_rate": 4.9999981476620944e-05, "loss": 0.5774, "mean_token_accuracy": 0.8212751984596253, "num_tokens": 56625444.0, "step": 1690 }, { "epoch": 0.10118194842406877, "grad_norm": 0.7980391383171082, "learning_rate": 4.9999964487727464e-05, "loss": 0.5925, "mean_token_accuracy": 0.8184659481048584, "num_tokens": 56793124.0, "step": 1695 }, { "epoch": 0.10148042024832855, "grad_norm": 0.7825395464897156, "learning_rate": 4.999994201855233e-05, "loss": 0.6576, "mean_token_accuracy": 0.8001013994216919, "num_tokens": 56960804.0, "step": 1700 }, { "epoch": 0.10177889207258835, "grad_norm": 0.8230543732643127, "learning_rate": 4.9999914069101016e-05, "loss": 0.623, "mean_token_accuracy": 0.8087916135787964, "num_tokens": 57128484.0, "step": 1705 }, { "epoch": 0.10207736389684814, "grad_norm": 0.7690154314041138, "learning_rate": 4.999988063938033e-05, "loss": 0.6198, "mean_token_accuracy": 0.8091315865516663, "num_tokens": 57296164.0, "step": 1710 }, { "epoch": 0.10237583572110792, "grad_norm": 0.8304437398910522, "learning_rate": 4.999984172939841e-05, "loss": 0.6461, "mean_token_accuracy": 0.8028092741966247, "num_tokens": 57463844.0, "step": 1715 }, { "epoch": 0.10267430754536772, "grad_norm": 0.8990681171417236, "learning_rate": 4.999979733916474e-05, "loss": 0.6983, "mean_token_accuracy": 0.785589873790741, "num_tokens": 57631524.0, "step": 1720 }, { "epoch": 0.10297277936962751, "grad_norm": 0.7804028391838074, "learning_rate": 4.999974746869013e-05, "loss": 0.6916, "mean_token_accuracy": 0.7889180541038513, "num_tokens": 57799204.0, "step": 1725 }, { "epoch": 0.1032712511938873, "grad_norm": 0.8725268840789795, "learning_rate": 4.999969211798672e-05, "loss": 0.646, "mean_token_accuracy": 0.8023380756378173, "num_tokens": 57966884.0, "step": 1730 }, { "epoch": 0.10356972301814708, "grad_norm": 0.8713625073432922, "learning_rate": 4.9999631287068004e-05, "loss": 0.7015, "mean_token_accuracy": 0.7879935503005981, "num_tokens": 58134564.0, "step": 1735 }, { "epoch": 0.10386819484240688, "grad_norm": 0.808779239654541, "learning_rate": 4.999956497594879e-05, "loss": 0.6943, "mean_token_accuracy": 0.7902600526809692, "num_tokens": 58302244.0, "step": 1740 }, { "epoch": 0.10416666666666667, "grad_norm": 0.7280786633491516, "learning_rate": 4.999949318464524e-05, "loss": 0.6228, "mean_token_accuracy": 0.8087319493293762, "num_tokens": 58469924.0, "step": 1745 }, { "epoch": 0.10446513849092645, "grad_norm": 0.8153372406959534, "learning_rate": 4.9999415913174825e-05, "loss": 0.6974, "mean_token_accuracy": 0.7893594145774842, "num_tokens": 58637604.0, "step": 1750 }, { "epoch": 0.10476361031518625, "grad_norm": 0.7588002681732178, "learning_rate": 4.9999333161556374e-05, "loss": 0.6347, "mean_token_accuracy": 0.8061724901199341, "num_tokens": 58804558.0, "step": 1755 }, { "epoch": 0.10506208213944604, "grad_norm": 0.7470396161079407, "learning_rate": 4.999924492981004e-05, "loss": 0.641, "mean_token_accuracy": 0.8059107780456543, "num_tokens": 58972238.0, "step": 1760 }, { "epoch": 0.10536055396370583, "grad_norm": 0.8653382062911987, "learning_rate": 4.999915121795732e-05, "loss": 0.6801, "mean_token_accuracy": 0.7929261684417724, "num_tokens": 59139918.0, "step": 1765 }, { "epoch": 0.10565902578796561, "grad_norm": 0.9816977977752686, "learning_rate": 4.9999052026021024e-05, "loss": 0.6644, "mean_token_accuracy": 0.7990134358406067, "num_tokens": 59305447.0, "step": 1770 }, { "epoch": 0.10595749761222541, "grad_norm": 0.8733177185058594, "learning_rate": 4.999894735402533e-05, "loss": 0.6167, "mean_token_accuracy": 0.8126625418663025, "num_tokens": 59473127.0, "step": 1775 }, { "epoch": 0.1062559694364852, "grad_norm": 0.7908738255500793, "learning_rate": 4.999883720199572e-05, "loss": 0.6168, "mean_token_accuracy": 0.8099069476127625, "num_tokens": 59640807.0, "step": 1780 }, { "epoch": 0.10655444126074498, "grad_norm": 0.8156537413597107, "learning_rate": 4.9998721569959035e-05, "loss": 0.6945, "mean_token_accuracy": 0.7903017997741699, "num_tokens": 59808487.0, "step": 1785 }, { "epoch": 0.10685291308500477, "grad_norm": 0.8189343214035034, "learning_rate": 4.999860045794343e-05, "loss": 0.6788, "mean_token_accuracy": 0.7944590210914612, "num_tokens": 59976167.0, "step": 1790 }, { "epoch": 0.10715138490926457, "grad_norm": 0.9364219307899475, "learning_rate": 4.999847386597841e-05, "loss": 0.6401, "mean_token_accuracy": 0.8055946588516235, "num_tokens": 60143847.0, "step": 1795 }, { "epoch": 0.10744985673352435, "grad_norm": 0.8389230966567993, "learning_rate": 4.9998341794094795e-05, "loss": 0.6314, "mean_token_accuracy": 0.8070320963859559, "num_tokens": 60311527.0, "step": 1800 }, { "epoch": 0.10774832855778414, "grad_norm": 0.8747956156730652, "learning_rate": 4.999820424232477e-05, "loss": 0.7154, "mean_token_accuracy": 0.7825838088989258, "num_tokens": 60479207.0, "step": 1805 }, { "epoch": 0.10804680038204394, "grad_norm": 4.572124004364014, "learning_rate": 4.9998061210701836e-05, "loss": 0.6549, "mean_token_accuracy": 0.8010378003120422, "num_tokens": 60646887.0, "step": 1810 }, { "epoch": 0.10834527220630373, "grad_norm": 0.7478983998298645, "learning_rate": 4.999791269926082e-05, "loss": 0.5921, "mean_token_accuracy": 0.8204623460769653, "num_tokens": 60810262.0, "step": 1815 }, { "epoch": 0.10864374403056351, "grad_norm": 1.0214107036590576, "learning_rate": 4.9997758708037906e-05, "loss": 0.671, "mean_token_accuracy": 0.7963855385780334, "num_tokens": 60977942.0, "step": 1820 }, { "epoch": 0.1089422158548233, "grad_norm": 0.9769614934921265, "learning_rate": 4.999759923707061e-05, "loss": 0.6606, "mean_token_accuracy": 0.7990695357322692, "num_tokens": 61145622.0, "step": 1825 }, { "epoch": 0.1092406876790831, "grad_norm": 0.7749030590057373, "learning_rate": 4.9997434286397746e-05, "loss": 0.6264, "mean_token_accuracy": 0.808421790599823, "num_tokens": 61313302.0, "step": 1830 }, { "epoch": 0.10953915950334288, "grad_norm": 1.0878808498382568, "learning_rate": 4.999726385605951e-05, "loss": 0.7611, "mean_token_accuracy": 0.7697423338890076, "num_tokens": 61480982.0, "step": 1835 }, { "epoch": 0.10983763132760267, "grad_norm": 0.7937517166137695, "learning_rate": 4.999708794609741e-05, "loss": 0.6456, "mean_token_accuracy": 0.8015149712562561, "num_tokens": 61648662.0, "step": 1840 }, { "epoch": 0.11013610315186247, "grad_norm": 0.8066228032112122, "learning_rate": 4.999690655655429e-05, "loss": 0.6776, "mean_token_accuracy": 0.792681610584259, "num_tokens": 61816342.0, "step": 1845 }, { "epoch": 0.11043457497612226, "grad_norm": 0.8846402764320374, "learning_rate": 4.999671968747434e-05, "loss": 0.6363, "mean_token_accuracy": 0.8043779015541077, "num_tokens": 61984022.0, "step": 1850 }, { "epoch": 0.11073304680038204, "grad_norm": 0.7564229369163513, "learning_rate": 4.999652733890306e-05, "loss": 0.6465, "mean_token_accuracy": 0.8019324660301208, "num_tokens": 62151702.0, "step": 1855 }, { "epoch": 0.11103151862464183, "grad_norm": 0.7707486152648926, "learning_rate": 4.999632951088732e-05, "loss": 0.6413, "mean_token_accuracy": 0.8040021538734436, "num_tokens": 62319382.0, "step": 1860 }, { "epoch": 0.11132999044890163, "grad_norm": 0.8063837885856628, "learning_rate": 4.999612620347528e-05, "loss": 0.6425, "mean_token_accuracy": 0.8032983303070068, "num_tokens": 62487062.0, "step": 1865 }, { "epoch": 0.11162846227316141, "grad_norm": 0.842122495174408, "learning_rate": 4.999591741671648e-05, "loss": 0.6961, "mean_token_accuracy": 0.789222252368927, "num_tokens": 62654742.0, "step": 1870 }, { "epoch": 0.1119269340974212, "grad_norm": 0.8580625057220459, "learning_rate": 4.9995703150661765e-05, "loss": 0.6505, "mean_token_accuracy": 0.8024335026741027, "num_tokens": 62822422.0, "step": 1875 }, { "epoch": 0.112225405921681, "grad_norm": 0.7586648464202881, "learning_rate": 4.999548340536332e-05, "loss": 0.6036, "mean_token_accuracy": 0.8125909566879272, "num_tokens": 62990102.0, "step": 1880 }, { "epoch": 0.11252387774594079, "grad_norm": 0.7322880029678345, "learning_rate": 4.9995258180874674e-05, "loss": 0.5983, "mean_token_accuracy": 0.8164380311965942, "num_tokens": 63157782.0, "step": 1885 }, { "epoch": 0.11282234957020057, "grad_norm": 0.75962233543396, "learning_rate": 4.9995027477250684e-05, "loss": 0.5895, "mean_token_accuracy": 0.8182989478111267, "num_tokens": 63325462.0, "step": 1890 }, { "epoch": 0.11312082139446036, "grad_norm": 0.7311409115791321, "learning_rate": 4.999479129454755e-05, "loss": 0.6697, "mean_token_accuracy": 0.796403431892395, "num_tokens": 63493142.0, "step": 1895 }, { "epoch": 0.11341929321872016, "grad_norm": 0.8264614343643188, "learning_rate": 4.999454963282278e-05, "loss": 0.6556, "mean_token_accuracy": 0.8048133254051208, "num_tokens": 63660822.0, "step": 1900 }, { "epoch": 0.11371776504297994, "grad_norm": 0.7244938015937805, "learning_rate": 4.9994302492135245e-05, "loss": 0.5858, "mean_token_accuracy": 0.8215078234672546, "num_tokens": 63828502.0, "step": 1905 }, { "epoch": 0.11401623686723973, "grad_norm": 0.825389564037323, "learning_rate": 4.999404987254514e-05, "loss": 0.6417, "mean_token_accuracy": 0.8050161004066467, "num_tokens": 63996182.0, "step": 1910 }, { "epoch": 0.11431470869149953, "grad_norm": 0.8485185503959656, "learning_rate": 4.9993791774114e-05, "loss": 0.6604, "mean_token_accuracy": 0.7995884537696838, "num_tokens": 64163862.0, "step": 1915 }, { "epoch": 0.11461318051575932, "grad_norm": 0.8129671812057495, "learning_rate": 4.999352819690469e-05, "loss": 0.6404, "mean_token_accuracy": 0.8028509974479675, "num_tokens": 64331542.0, "step": 1920 }, { "epoch": 0.1149116523400191, "grad_norm": 0.735074520111084, "learning_rate": 4.99932591409814e-05, "loss": 0.6448, "mean_token_accuracy": 0.8026720762252808, "num_tokens": 64499222.0, "step": 1925 }, { "epoch": 0.1152101241642789, "grad_norm": 0.8927582502365112, "learning_rate": 4.999298460640967e-05, "loss": 0.6991, "mean_token_accuracy": 0.7883752703666687, "num_tokens": 64666902.0, "step": 1930 }, { "epoch": 0.11550859598853869, "grad_norm": 0.761263906955719, "learning_rate": 4.9992704593256365e-05, "loss": 0.6513, "mean_token_accuracy": 0.7997017741203308, "num_tokens": 64834582.0, "step": 1935 }, { "epoch": 0.11580706781279847, "grad_norm": 0.7414693236351013, "learning_rate": 4.999241910158968e-05, "loss": 0.6103, "mean_token_accuracy": 0.8118871569633483, "num_tokens": 65002262.0, "step": 1940 }, { "epoch": 0.11610553963705826, "grad_norm": 0.7668074369430542, "learning_rate": 4.9992128131479167e-05, "loss": 0.6318, "mean_token_accuracy": 0.8079303145408631, "num_tokens": 65165118.0, "step": 1945 }, { "epoch": 0.11640401146131805, "grad_norm": 0.6964201331138611, "learning_rate": 4.9991831682995696e-05, "loss": 0.6713, "mean_token_accuracy": 0.7963974714279175, "num_tokens": 65332798.0, "step": 1950 }, { "epoch": 0.11670248328557783, "grad_norm": 0.7373561859130859, "learning_rate": 4.999152975621146e-05, "loss": 0.6682, "mean_token_accuracy": 0.7958904862403869, "num_tokens": 65500478.0, "step": 1955 }, { "epoch": 0.11700095510983763, "grad_norm": 0.91972815990448, "learning_rate": 4.999122235120001e-05, "loss": 0.6689, "mean_token_accuracy": 0.7971668839454651, "num_tokens": 65668158.0, "step": 1960 }, { "epoch": 0.11729942693409742, "grad_norm": 0.8223519325256348, "learning_rate": 4.999090946803621e-05, "loss": 0.6544, "mean_token_accuracy": 0.8011093854904174, "num_tokens": 65835838.0, "step": 1965 }, { "epoch": 0.11759789875835722, "grad_norm": 0.8795048594474792, "learning_rate": 4.9990591106796276e-05, "loss": 0.6549, "mean_token_accuracy": 0.7998031735420227, "num_tokens": 66003518.0, "step": 1970 }, { "epoch": 0.117896370582617, "grad_norm": 0.715836763381958, "learning_rate": 4.999026726755775e-05, "loss": 0.6149, "mean_token_accuracy": 0.8110819458961487, "num_tokens": 66171198.0, "step": 1975 }, { "epoch": 0.11819484240687679, "grad_norm": 0.7795296311378479, "learning_rate": 4.998993795039951e-05, "loss": 0.6072, "mean_token_accuracy": 0.8121853709220886, "num_tokens": 66338878.0, "step": 1980 }, { "epoch": 0.11849331423113658, "grad_norm": 0.8171116709709167, "learning_rate": 4.998960315540177e-05, "loss": 0.6071, "mean_token_accuracy": 0.8135213971138, "num_tokens": 66506558.0, "step": 1985 }, { "epoch": 0.11879178605539636, "grad_norm": 0.8781904578208923, "learning_rate": 4.998926288264606e-05, "loss": 0.5904, "mean_token_accuracy": 0.8168376445770263, "num_tokens": 66674238.0, "step": 1990 }, { "epoch": 0.11909025787965616, "grad_norm": 0.8734174370765686, "learning_rate": 4.998891713221528e-05, "loss": 0.6321, "mean_token_accuracy": 0.8071036696434021, "num_tokens": 66841918.0, "step": 1995 }, { "epoch": 0.11938872970391595, "grad_norm": 0.7292535901069641, "learning_rate": 4.998856590419363e-05, "loss": 0.6519, "mean_token_accuracy": 0.8018191576004028, "num_tokens": 67009598.0, "step": 2000 }, { "epoch": 0.11968720152817575, "grad_norm": 0.7165786027908325, "learning_rate": 4.9988209198666667e-05, "loss": 0.6151, "mean_token_accuracy": 0.8122390508651733, "num_tokens": 67177278.0, "step": 2005 }, { "epoch": 0.11998567335243553, "grad_norm": 0.694393515586853, "learning_rate": 4.998784701572127e-05, "loss": 0.6438, "mean_token_accuracy": 0.8043719410896302, "num_tokens": 67344958.0, "step": 2010 }, { "epoch": 0.12028414517669532, "grad_norm": 0.7211229205131531, "learning_rate": 4.9987479355445645e-05, "loss": 0.6312, "mean_token_accuracy": 0.8063461899757385, "num_tokens": 67512638.0, "step": 2015 }, { "epoch": 0.12058261700095511, "grad_norm": 0.8628489375114441, "learning_rate": 4.998710621792936e-05, "loss": 0.6962, "mean_token_accuracy": 0.788106894493103, "num_tokens": 67680318.0, "step": 2020 }, { "epoch": 0.1208810888252149, "grad_norm": 0.7633472681045532, "learning_rate": 4.998672760326329e-05, "loss": 0.6548, "mean_token_accuracy": 0.7996182680130005, "num_tokens": 67847998.0, "step": 2025 }, { "epoch": 0.12117956064947469, "grad_norm": 0.8130160570144653, "learning_rate": 4.9986343511539653e-05, "loss": 0.6258, "mean_token_accuracy": 0.8070142030715942, "num_tokens": 68015678.0, "step": 2030 }, { "epoch": 0.12147803247373448, "grad_norm": 0.6947306394577026, "learning_rate": 4.9985953942852e-05, "loss": 0.6495, "mean_token_accuracy": 0.801479172706604, "num_tokens": 68183358.0, "step": 2035 }, { "epoch": 0.12177650429799428, "grad_norm": 0.7634063363075256, "learning_rate": 4.998555889729523e-05, "loss": 0.6555, "mean_token_accuracy": 0.8009125709533691, "num_tokens": 68351038.0, "step": 2040 }, { "epoch": 0.12207497612225406, "grad_norm": 0.778538167476654, "learning_rate": 4.9985158374965543e-05, "loss": 0.5638, "mean_token_accuracy": 0.8260646462440491, "num_tokens": 68518718.0, "step": 2045 }, { "epoch": 0.12237344794651385, "grad_norm": 0.7564401030540466, "learning_rate": 4.998475237596051e-05, "loss": 0.6481, "mean_token_accuracy": 0.8013062119483948, "num_tokens": 68686398.0, "step": 2050 }, { "epoch": 0.12267191977077364, "grad_norm": 0.6965293288230896, "learning_rate": 4.9984340900379026e-05, "loss": 0.6503, "mean_token_accuracy": 0.8002326130867005, "num_tokens": 68854078.0, "step": 2055 }, { "epoch": 0.12297039159503342, "grad_norm": 0.7579342722892761, "learning_rate": 4.9983923948321287e-05, "loss": 0.639, "mean_token_accuracy": 0.804849112033844, "num_tokens": 69021758.0, "step": 2060 }, { "epoch": 0.12326886341929322, "grad_norm": 0.74488765001297, "learning_rate": 4.998350151988887e-05, "loss": 0.6642, "mean_token_accuracy": 0.7954431653022767, "num_tokens": 69189438.0, "step": 2065 }, { "epoch": 0.12356733524355301, "grad_norm": 0.8010881543159485, "learning_rate": 4.998307361518466e-05, "loss": 0.6219, "mean_token_accuracy": 0.8085888147354126, "num_tokens": 69357118.0, "step": 2070 }, { "epoch": 0.1238658070678128, "grad_norm": 0.7767967581748962, "learning_rate": 4.9982640234312875e-05, "loss": 0.6268, "mean_token_accuracy": 0.8063640713691711, "num_tokens": 69524798.0, "step": 2075 }, { "epoch": 0.12416427889207259, "grad_norm": 0.8799981474876404, "learning_rate": 4.998220137737909e-05, "loss": 0.6772, "mean_token_accuracy": 0.7931885957717896, "num_tokens": 69692478.0, "step": 2080 }, { "epoch": 0.12446275071633238, "grad_norm": 0.7806699275970459, "learning_rate": 4.998175704449019e-05, "loss": 0.6151, "mean_token_accuracy": 0.8097220540046692, "num_tokens": 69860158.0, "step": 2085 }, { "epoch": 0.12476122254059217, "grad_norm": 0.7636421918869019, "learning_rate": 4.998130723575438e-05, "loss": 0.6421, "mean_token_accuracy": 0.8033281683921814, "num_tokens": 70027838.0, "step": 2090 }, { "epoch": 0.12505969436485195, "grad_norm": 0.8024376630783081, "learning_rate": 4.998085195128124e-05, "loss": 0.6787, "mean_token_accuracy": 0.7936836361885071, "num_tokens": 70195518.0, "step": 2095 }, { "epoch": 0.12535816618911175, "grad_norm": 0.7960630059242249, "learning_rate": 4.998039119118166e-05, "loss": 0.6845, "mean_token_accuracy": 0.791202437877655, "num_tokens": 70363198.0, "step": 2100 }, { "epoch": 0.12565663801337154, "grad_norm": 0.8377149701118469, "learning_rate": 4.997992495556788e-05, "loss": 0.6462, "mean_token_accuracy": 0.8019623160362244, "num_tokens": 70530878.0, "step": 2105 }, { "epoch": 0.12595510983763133, "grad_norm": 0.8119000792503357, "learning_rate": 4.9979453244553424e-05, "loss": 0.6686, "mean_token_accuracy": 0.7952165126800537, "num_tokens": 70698558.0, "step": 2110 }, { "epoch": 0.12625358166189113, "grad_norm": 0.8080458045005798, "learning_rate": 4.9978976058253205e-05, "loss": 0.6991, "mean_token_accuracy": 0.789210295677185, "num_tokens": 70866238.0, "step": 2115 }, { "epoch": 0.1265520534861509, "grad_norm": 0.7718464732170105, "learning_rate": 4.997849339678347e-05, "loss": 0.6106, "mean_token_accuracy": 0.8126744508743287, "num_tokens": 71033918.0, "step": 2120 }, { "epoch": 0.1268505253104107, "grad_norm": 0.7288238406181335, "learning_rate": 4.997800526026176e-05, "loss": 0.649, "mean_token_accuracy": 0.8037635564804078, "num_tokens": 71201598.0, "step": 2125 }, { "epoch": 0.12714899713467048, "grad_norm": 0.7069281935691833, "learning_rate": 4.997751164880696e-05, "loss": 0.6353, "mean_token_accuracy": 0.8050817131996155, "num_tokens": 71369278.0, "step": 2130 }, { "epoch": 0.12744746895893028, "grad_norm": 0.7088195085525513, "learning_rate": 4.997701256253932e-05, "loss": 0.629, "mean_token_accuracy": 0.807306456565857, "num_tokens": 71536958.0, "step": 2135 }, { "epoch": 0.12774594078319007, "grad_norm": 0.7803517580032349, "learning_rate": 4.997650800158039e-05, "loss": 0.625, "mean_token_accuracy": 0.809537160396576, "num_tokens": 71704638.0, "step": 2140 }, { "epoch": 0.12804441260744986, "grad_norm": 0.8309139609336853, "learning_rate": 4.997599796605306e-05, "loss": 0.6059, "mean_token_accuracy": 0.8137361168861389, "num_tokens": 71872318.0, "step": 2145 }, { "epoch": 0.12834288443170966, "grad_norm": 0.754838228225708, "learning_rate": 4.997548245608158e-05, "loss": 0.6338, "mean_token_accuracy": 0.8056423664093018, "num_tokens": 72039998.0, "step": 2150 }, { "epoch": 0.12864135625596942, "grad_norm": 0.8363931179046631, "learning_rate": 4.9974961471791485e-05, "loss": 0.7028, "mean_token_accuracy": 0.787719190120697, "num_tokens": 72207678.0, "step": 2155 }, { "epoch": 0.12893982808022922, "grad_norm": 0.738895058631897, "learning_rate": 4.997443501330969e-05, "loss": 0.6208, "mean_token_accuracy": 0.8090898275375367, "num_tokens": 72375358.0, "step": 2160 }, { "epoch": 0.129238299904489, "grad_norm": 0.757001519203186, "learning_rate": 4.9973903080764416e-05, "loss": 0.6735, "mean_token_accuracy": 0.7951807141304016, "num_tokens": 72543038.0, "step": 2165 }, { "epoch": 0.1295367717287488, "grad_norm": 0.7282401919364929, "learning_rate": 4.997336567428523e-05, "loss": 0.6415, "mean_token_accuracy": 0.8038709282875061, "num_tokens": 72710718.0, "step": 2170 }, { "epoch": 0.1298352435530086, "grad_norm": 0.7109334468841553, "learning_rate": 4.9972822794003024e-05, "loss": 0.5997, "mean_token_accuracy": 0.8166050314903259, "num_tokens": 72878398.0, "step": 2175 }, { "epoch": 0.1301337153772684, "grad_norm": 0.7727491855621338, "learning_rate": 4.997227444005001e-05, "loss": 0.6291, "mean_token_accuracy": 0.807592761516571, "num_tokens": 73046078.0, "step": 2180 }, { "epoch": 0.1304321872015282, "grad_norm": 0.7181538939476013, "learning_rate": 4.997172061255978e-05, "loss": 0.6396, "mean_token_accuracy": 0.8033043026924134, "num_tokens": 73213758.0, "step": 2185 }, { "epoch": 0.13073065902578795, "grad_norm": 0.7119318842887878, "learning_rate": 4.9971161311667217e-05, "loss": 0.6073, "mean_token_accuracy": 0.812895143032074, "num_tokens": 73381438.0, "step": 2190 }, { "epoch": 0.13102913085004775, "grad_norm": 0.7757542133331299, "learning_rate": 4.9970596537508536e-05, "loss": 0.6207, "mean_token_accuracy": 0.8105809330940247, "num_tokens": 73549118.0, "step": 2195 }, { "epoch": 0.13132760267430754, "grad_norm": 0.7838243842124939, "learning_rate": 4.997002629022132e-05, "loss": 0.6521, "mean_token_accuracy": 0.8015567302703858, "num_tokens": 73716798.0, "step": 2200 }, { "epoch": 0.13162607449856734, "grad_norm": 0.7373713254928589, "learning_rate": 4.9969450569944435e-05, "loss": 0.6685, "mean_token_accuracy": 0.7966241240501404, "num_tokens": 73884478.0, "step": 2205 }, { "epoch": 0.13192454632282713, "grad_norm": 0.7715480327606201, "learning_rate": 4.996886937681814e-05, "loss": 0.6697, "mean_token_accuracy": 0.7969402313232422, "num_tokens": 74052158.0, "step": 2210 }, { "epoch": 0.13222301814708692, "grad_norm": 0.9639972448348999, "learning_rate": 4.996828271098398e-05, "loss": 0.6559, "mean_token_accuracy": 0.8007336139678956, "num_tokens": 74219838.0, "step": 2215 }, { "epoch": 0.13252148997134672, "grad_norm": 0.8003249764442444, "learning_rate": 4.996769057258484e-05, "loss": 0.6856, "mean_token_accuracy": 0.7893832802772522, "num_tokens": 74387518.0, "step": 2220 }, { "epoch": 0.13281996179560648, "grad_norm": 0.7151038646697998, "learning_rate": 4.996709296176496e-05, "loss": 0.6566, "mean_token_accuracy": 0.7986639738082886, "num_tokens": 74555198.0, "step": 2225 }, { "epoch": 0.13311843361986628, "grad_norm": 0.8242704272270203, "learning_rate": 4.9966489878669885e-05, "loss": 0.6345, "mean_token_accuracy": 0.8052427530288696, "num_tokens": 74722878.0, "step": 2230 }, { "epoch": 0.13341690544412607, "grad_norm": 0.7722378969192505, "learning_rate": 4.9965881323446534e-05, "loss": 0.6528, "mean_token_accuracy": 0.8037099003791809, "num_tokens": 74890558.0, "step": 2235 }, { "epoch": 0.13371537726838587, "grad_norm": 0.7419425845146179, "learning_rate": 4.99652672962431e-05, "loss": 0.6253, "mean_token_accuracy": 0.8088989734649659, "num_tokens": 75058238.0, "step": 2240 }, { "epoch": 0.13401384909264566, "grad_norm": 0.8640178442001343, "learning_rate": 4.996464779720916e-05, "loss": 0.6495, "mean_token_accuracy": 0.8026601433753967, "num_tokens": 75225918.0, "step": 2245 }, { "epoch": 0.13431232091690545, "grad_norm": 0.821284830570221, "learning_rate": 4.99640228264956e-05, "loss": 0.6683, "mean_token_accuracy": 0.7976380825042725, "num_tokens": 75393598.0, "step": 2250 }, { "epoch": 0.13461079274116525, "grad_norm": 0.6626023054122925, "learning_rate": 4.9963392384254635e-05, "loss": 0.6217, "mean_token_accuracy": 0.8092627882957458, "num_tokens": 75561278.0, "step": 2255 }, { "epoch": 0.134909264565425, "grad_norm": 0.7161763906478882, "learning_rate": 4.9962756470639846e-05, "loss": 0.6096, "mean_token_accuracy": 0.8110342264175415, "num_tokens": 75728958.0, "step": 2260 }, { "epoch": 0.1352077363896848, "grad_norm": 0.7436273097991943, "learning_rate": 4.9962115085806086e-05, "loss": 0.619, "mean_token_accuracy": 0.8112489461898804, "num_tokens": 75896638.0, "step": 2265 }, { "epoch": 0.1355062082139446, "grad_norm": 0.7392136454582214, "learning_rate": 4.99614682299096e-05, "loss": 0.6556, "mean_token_accuracy": 0.8005606532096863, "num_tokens": 76064318.0, "step": 2270 }, { "epoch": 0.1358046800382044, "grad_norm": 0.7133291959762573, "learning_rate": 4.996081590310794e-05, "loss": 0.6379, "mean_token_accuracy": 0.8034057021141052, "num_tokens": 76231998.0, "step": 2275 }, { "epoch": 0.1361031518624642, "grad_norm": 0.6754491925239563, "learning_rate": 4.996015810555997e-05, "loss": 0.5799, "mean_token_accuracy": 0.822205638885498, "num_tokens": 76399678.0, "step": 2280 }, { "epoch": 0.13640162368672398, "grad_norm": 0.7440774440765381, "learning_rate": 4.995949483742594e-05, "loss": 0.6735, "mean_token_accuracy": 0.7971609234809875, "num_tokens": 76567358.0, "step": 2285 }, { "epoch": 0.13670009551098378, "grad_norm": 0.7381521463394165, "learning_rate": 4.995882609886739e-05, "loss": 0.6281, "mean_token_accuracy": 0.8080042839050293, "num_tokens": 76735038.0, "step": 2290 }, { "epoch": 0.13699856733524354, "grad_norm": 0.8954285979270935, "learning_rate": 4.9958151890047196e-05, "loss": 0.6434, "mean_token_accuracy": 0.803107476234436, "num_tokens": 76902718.0, "step": 2295 }, { "epoch": 0.13729703915950334, "grad_norm": 0.7358289361000061, "learning_rate": 4.995747221112958e-05, "loss": 0.6329, "mean_token_accuracy": 0.8068173646926879, "num_tokens": 77070398.0, "step": 2300 }, { "epoch": 0.13759551098376313, "grad_norm": 0.7513579726219177, "learning_rate": 4.995678706228009e-05, "loss": 0.6164, "mean_token_accuracy": 0.8093999862670899, "num_tokens": 77238078.0, "step": 2305 }, { "epoch": 0.13789398280802292, "grad_norm": 0.7520550489425659, "learning_rate": 4.995609644366561e-05, "loss": 0.6325, "mean_token_accuracy": 0.8071692705154419, "num_tokens": 77405758.0, "step": 2310 }, { "epoch": 0.13819245463228272, "grad_norm": 0.7596616148948669, "learning_rate": 4.995540035545435e-05, "loss": 0.6234, "mean_token_accuracy": 0.8089466691017151, "num_tokens": 77573438.0, "step": 2315 }, { "epoch": 0.1384909264565425, "grad_norm": 0.7286480069160461, "learning_rate": 4.995469879781584e-05, "loss": 0.6152, "mean_token_accuracy": 0.8094775080680847, "num_tokens": 77741118.0, "step": 2320 }, { "epoch": 0.13878939828080228, "grad_norm": 0.7319132089614868, "learning_rate": 4.9953991770920984e-05, "loss": 0.7041, "mean_token_accuracy": 0.7839556336402893, "num_tokens": 77908798.0, "step": 2325 }, { "epoch": 0.13908787010506207, "grad_norm": 0.7207380533218384, "learning_rate": 4.995327927494198e-05, "loss": 0.6218, "mean_token_accuracy": 0.8090957880020142, "num_tokens": 78076478.0, "step": 2330 }, { "epoch": 0.13938634192932187, "grad_norm": 0.728185772895813, "learning_rate": 4.9952561310052365e-05, "loss": 0.5835, "mean_token_accuracy": 0.819336760044098, "num_tokens": 78244158.0, "step": 2335 }, { "epoch": 0.13968481375358166, "grad_norm": 0.7341372966766357, "learning_rate": 4.995183787642701e-05, "loss": 0.65, "mean_token_accuracy": 0.8011988639831543, "num_tokens": 78411838.0, "step": 2340 }, { "epoch": 0.13998328557784145, "grad_norm": 0.8130307793617249, "learning_rate": 4.9951108974242136e-05, "loss": 0.5887, "mean_token_accuracy": 0.816390311717987, "num_tokens": 78579518.0, "step": 2345 }, { "epoch": 0.14028175740210125, "grad_norm": 0.7904885411262512, "learning_rate": 4.995037460367527e-05, "loss": 0.6605, "mean_token_accuracy": 0.7984969615936279, "num_tokens": 78747198.0, "step": 2350 }, { "epoch": 0.14058022922636104, "grad_norm": 0.7222793698310852, "learning_rate": 4.9949634764905276e-05, "loss": 0.5814, "mean_token_accuracy": 0.8193964004516602, "num_tokens": 78914878.0, "step": 2355 }, { "epoch": 0.1408787010506208, "grad_norm": 0.6837400197982788, "learning_rate": 4.994888945811237e-05, "loss": 0.636, "mean_token_accuracy": 0.8061791658401489, "num_tokens": 79082558.0, "step": 2360 }, { "epoch": 0.1411771728748806, "grad_norm": 0.7385311722755432, "learning_rate": 4.9948138683478076e-05, "loss": 0.5857, "mean_token_accuracy": 0.8195335865020752, "num_tokens": 79250238.0, "step": 2365 }, { "epoch": 0.1414756446991404, "grad_norm": 0.7428717613220215, "learning_rate": 4.9947382441185255e-05, "loss": 0.6286, "mean_token_accuracy": 0.8067935109138489, "num_tokens": 79417918.0, "step": 2370 }, { "epoch": 0.1417741165234002, "grad_norm": 0.6756755113601685, "learning_rate": 4.9946620731418115e-05, "loss": 0.624, "mean_token_accuracy": 0.807807457447052, "num_tokens": 79585598.0, "step": 2375 }, { "epoch": 0.14207258834765998, "grad_norm": 0.6760066151618958, "learning_rate": 4.9945853554362177e-05, "loss": 0.6122, "mean_token_accuracy": 0.8115948915481568, "num_tokens": 79753278.0, "step": 2380 }, { "epoch": 0.14237106017191978, "grad_norm": 0.7346377968788147, "learning_rate": 4.99450809102043e-05, "loss": 0.6596, "mean_token_accuracy": 0.798007869720459, "num_tokens": 79920958.0, "step": 2385 }, { "epoch": 0.14266953199617957, "grad_norm": 0.717136561870575, "learning_rate": 4.994430279913268e-05, "loss": 0.6748, "mean_token_accuracy": 0.7956221103668213, "num_tokens": 80088638.0, "step": 2390 }, { "epoch": 0.14296800382043934, "grad_norm": 0.7092469930648804, "learning_rate": 4.9943519221336844e-05, "loss": 0.6876, "mean_token_accuracy": 0.7916497588157654, "num_tokens": 80256318.0, "step": 2395 }, { "epoch": 0.14326647564469913, "grad_norm": 0.7810541987419128, "learning_rate": 4.994273017700764e-05, "loss": 0.6212, "mean_token_accuracy": 0.8079028964042664, "num_tokens": 80423998.0, "step": 2400 }, { "epoch": 0.14356494746895893, "grad_norm": 0.7001931667327881, "learning_rate": 4.994193566633725e-05, "loss": 0.6196, "mean_token_accuracy": 0.8095968127250671, "num_tokens": 80591678.0, "step": 2405 }, { "epoch": 0.14386341929321872, "grad_norm": 0.7972292900085449, "learning_rate": 4.9941135689519206e-05, "loss": 0.6592, "mean_token_accuracy": 0.7987116694450378, "num_tokens": 80759358.0, "step": 2410 }, { "epoch": 0.1441618911174785, "grad_norm": 0.6630339622497559, "learning_rate": 4.9940330246748343e-05, "loss": 0.6221, "mean_token_accuracy": 0.8100262522697449, "num_tokens": 80927038.0, "step": 2415 }, { "epoch": 0.1444603629417383, "grad_norm": 0.8520267009735107, "learning_rate": 4.993951933822085e-05, "loss": 0.6998, "mean_token_accuracy": 0.7876178026199341, "num_tokens": 81094718.0, "step": 2420 }, { "epoch": 0.1447588347659981, "grad_norm": 0.7509779334068298, "learning_rate": 4.993870296413424e-05, "loss": 0.6515, "mean_token_accuracy": 0.802505087852478, "num_tokens": 81262398.0, "step": 2425 }, { "epoch": 0.14505730659025787, "grad_norm": 0.8092514276504517, "learning_rate": 4.993788112468734e-05, "loss": 0.6536, "mean_token_accuracy": 0.8006859183311462, "num_tokens": 81430078.0, "step": 2430 }, { "epoch": 0.14535577841451766, "grad_norm": 0.7288460731506348, "learning_rate": 4.993705382008035e-05, "loss": 0.6407, "mean_token_accuracy": 0.8037635803222656, "num_tokens": 81597758.0, "step": 2435 }, { "epoch": 0.14565425023877746, "grad_norm": 0.9207749962806702, "learning_rate": 4.993622105051475e-05, "loss": 0.6298, "mean_token_accuracy": 0.8070499777793885, "num_tokens": 81765438.0, "step": 2440 }, { "epoch": 0.14595272206303725, "grad_norm": 0.6609447598457336, "learning_rate": 4.993538281619339e-05, "loss": 0.5507, "mean_token_accuracy": 0.8290647864341736, "num_tokens": 81933118.0, "step": 2445 }, { "epoch": 0.14625119388729704, "grad_norm": 0.7385303974151611, "learning_rate": 4.993453911732045e-05, "loss": 0.6671, "mean_token_accuracy": 0.7958666324615479, "num_tokens": 82100798.0, "step": 2450 }, { "epoch": 0.14654966571155684, "grad_norm": 0.7430198192596436, "learning_rate": 4.993368995410139e-05, "loss": 0.6335, "mean_token_accuracy": 0.8055111527442932, "num_tokens": 82268478.0, "step": 2455 }, { "epoch": 0.14684813753581663, "grad_norm": 0.6414621472358704, "learning_rate": 4.993283532674308e-05, "loss": 0.6154, "mean_token_accuracy": 0.8096445202827454, "num_tokens": 82436158.0, "step": 2460 }, { "epoch": 0.1471466093600764, "grad_norm": 0.7025129199028015, "learning_rate": 4.9931975235453666e-05, "loss": 0.6248, "mean_token_accuracy": 0.8093164682388305, "num_tokens": 82603838.0, "step": 2465 }, { "epoch": 0.1474450811843362, "grad_norm": 0.690983235836029, "learning_rate": 4.9931109680442625e-05, "loss": 0.6328, "mean_token_accuracy": 0.8053441405296325, "num_tokens": 82771518.0, "step": 2470 }, { "epoch": 0.14774355300859598, "grad_norm": 0.7963226437568665, "learning_rate": 4.993023866192079e-05, "loss": 0.6131, "mean_token_accuracy": 0.81239413022995, "num_tokens": 82939198.0, "step": 2475 }, { "epoch": 0.14804202483285578, "grad_norm": 0.823704183101654, "learning_rate": 4.9929362180100326e-05, "loss": 0.6226, "mean_token_accuracy": 0.811153519153595, "num_tokens": 83106878.0, "step": 2480 }, { "epoch": 0.14834049665711557, "grad_norm": 0.6541733741760254, "learning_rate": 4.99284802351947e-05, "loss": 0.6085, "mean_token_accuracy": 0.8119646906852722, "num_tokens": 83274558.0, "step": 2485 }, { "epoch": 0.14863896848137537, "grad_norm": 0.7201209664344788, "learning_rate": 4.9927592827418725e-05, "loss": 0.6959, "mean_token_accuracy": 0.7875044703483581, "num_tokens": 83442238.0, "step": 2490 }, { "epoch": 0.14893744030563516, "grad_norm": 0.7420956492424011, "learning_rate": 4.992669995698855e-05, "loss": 0.6343, "mean_token_accuracy": 0.8059763789176941, "num_tokens": 83609918.0, "step": 2495 }, { "epoch": 0.14923591212989493, "grad_norm": 0.7263451218605042, "learning_rate": 4.9925801624121657e-05, "loss": 0.6304, "mean_token_accuracy": 0.8056423664093018, "num_tokens": 83777598.0, "step": 2500 }, { "epoch": 0.14953438395415472, "grad_norm": 0.7237770557403564, "learning_rate": 4.992489782903684e-05, "loss": 0.6519, "mean_token_accuracy": 0.8010855317115784, "num_tokens": 83945278.0, "step": 2505 }, { "epoch": 0.14983285577841451, "grad_norm": 0.6878868937492371, "learning_rate": 4.992398857195425e-05, "loss": 0.601, "mean_token_accuracy": 0.8140462875366211, "num_tokens": 84112958.0, "step": 2510 }, { "epoch": 0.1501313276026743, "grad_norm": 0.698435366153717, "learning_rate": 4.9923073853095334e-05, "loss": 0.6264, "mean_token_accuracy": 0.8081713080406189, "num_tokens": 84280638.0, "step": 2515 }, { "epoch": 0.1504297994269341, "grad_norm": 0.7715688943862915, "learning_rate": 4.992215367268291e-05, "loss": 0.5669, "mean_token_accuracy": 0.8235536098480225, "num_tokens": 84448318.0, "step": 2520 }, { "epoch": 0.1507282712511939, "grad_norm": 0.718703031539917, "learning_rate": 4.9921228030941086e-05, "loss": 0.6642, "mean_token_accuracy": 0.7946916341781616, "num_tokens": 84615998.0, "step": 2525 }, { "epoch": 0.1510267430754537, "grad_norm": 0.696151077747345, "learning_rate": 4.992029692809532e-05, "loss": 0.6378, "mean_token_accuracy": 0.8031790614128113, "num_tokens": 84783678.0, "step": 2530 }, { "epoch": 0.15132521489971346, "grad_norm": 0.6987526416778564, "learning_rate": 4.991936036437241e-05, "loss": 0.6225, "mean_token_accuracy": 0.8104570865631103, "num_tokens": 84945647.0, "step": 2535 }, { "epoch": 0.15162368672397325, "grad_norm": 0.7180252075195312, "learning_rate": 4.991841834000048e-05, "loss": 0.6288, "mean_token_accuracy": 0.8090063214302063, "num_tokens": 85113327.0, "step": 2540 }, { "epoch": 0.15192215854823304, "grad_norm": 0.7295783162117004, "learning_rate": 4.991747085520895e-05, "loss": 0.6599, "mean_token_accuracy": 0.7979124426841736, "num_tokens": 85281007.0, "step": 2545 }, { "epoch": 0.15222063037249284, "grad_norm": 0.7051554322242737, "learning_rate": 4.991651791022863e-05, "loss": 0.6556, "mean_token_accuracy": 0.7968507647514343, "num_tokens": 85448687.0, "step": 2550 }, { "epoch": 0.15251910219675263, "grad_norm": 0.7344266176223755, "learning_rate": 4.9915559505291604e-05, "loss": 0.611, "mean_token_accuracy": 0.8117201447486877, "num_tokens": 85616367.0, "step": 2555 }, { "epoch": 0.15281757402101243, "grad_norm": 0.6918289661407471, "learning_rate": 4.991459564063132e-05, "loss": 0.6469, "mean_token_accuracy": 0.8017654657363892, "num_tokens": 85784047.0, "step": 2560 }, { "epoch": 0.15311604584527222, "grad_norm": 0.650274932384491, "learning_rate": 4.991362631648254e-05, "loss": 0.6114, "mean_token_accuracy": 0.8120899438858032, "num_tokens": 85951727.0, "step": 2565 }, { "epoch": 0.15341451766953199, "grad_norm": 0.7291713953018188, "learning_rate": 4.991265153308136e-05, "loss": 0.6401, "mean_token_accuracy": 0.8030537962913513, "num_tokens": 86119407.0, "step": 2570 }, { "epoch": 0.15371298949379178, "grad_norm": 0.66012042760849, "learning_rate": 4.99116712906652e-05, "loss": 0.6298, "mean_token_accuracy": 0.8075092554092407, "num_tokens": 86287087.0, "step": 2575 }, { "epoch": 0.15401146131805157, "grad_norm": 0.7682079672813416, "learning_rate": 4.991068558947284e-05, "loss": 0.6605, "mean_token_accuracy": 0.7970118045806884, "num_tokens": 86454767.0, "step": 2580 }, { "epoch": 0.15430993314231137, "grad_norm": 0.637602686882019, "learning_rate": 4.9909694429744346e-05, "loss": 0.6247, "mean_token_accuracy": 0.8069128036499024, "num_tokens": 86622447.0, "step": 2585 }, { "epoch": 0.15460840496657116, "grad_norm": 0.7632467150688171, "learning_rate": 4.990869781172114e-05, "loss": 0.6424, "mean_token_accuracy": 0.800876772403717, "num_tokens": 86790127.0, "step": 2590 }, { "epoch": 0.15490687679083096, "grad_norm": 0.67705237865448, "learning_rate": 4.990769573564596e-05, "loss": 0.6103, "mean_token_accuracy": 0.8119527578353882, "num_tokens": 86957807.0, "step": 2595 }, { "epoch": 0.15520534861509075, "grad_norm": 0.6863157153129578, "learning_rate": 4.990668820176289e-05, "loss": 0.6296, "mean_token_accuracy": 0.8071275234222413, "num_tokens": 87125487.0, "step": 2600 }, { "epoch": 0.15550382043935052, "grad_norm": 0.7088929414749146, "learning_rate": 4.990567521031733e-05, "loss": 0.6324, "mean_token_accuracy": 0.808038866519928, "num_tokens": 87286740.0, "step": 2605 }, { "epoch": 0.1558022922636103, "grad_norm": 0.7194373607635498, "learning_rate": 4.9904656761556e-05, "loss": 0.5995, "mean_token_accuracy": 0.8147023677825928, "num_tokens": 87454420.0, "step": 2610 }, { "epoch": 0.1561007640878701, "grad_norm": 0.6910497546195984, "learning_rate": 4.9903632855726983e-05, "loss": 0.606, "mean_token_accuracy": 0.8130621552467346, "num_tokens": 87622100.0, "step": 2615 }, { "epoch": 0.1563992359121299, "grad_norm": 0.794522225856781, "learning_rate": 4.990260349307966e-05, "loss": 0.6257, "mean_token_accuracy": 0.8078551769256592, "num_tokens": 87789780.0, "step": 2620 }, { "epoch": 0.1566977077363897, "grad_norm": 0.7447571754455566, "learning_rate": 4.990156867386475e-05, "loss": 0.6338, "mean_token_accuracy": 0.8046463012695313, "num_tokens": 87957460.0, "step": 2625 }, { "epoch": 0.15699617956064948, "grad_norm": 0.6099318265914917, "learning_rate": 4.9900528398334304e-05, "loss": 0.6216, "mean_token_accuracy": 0.8102111458778382, "num_tokens": 88125140.0, "step": 2630 }, { "epoch": 0.15729465138490925, "grad_norm": 0.7270940542221069, "learning_rate": 4.989948266674169e-05, "loss": 0.6803, "mean_token_accuracy": 0.7926100611686706, "num_tokens": 88292820.0, "step": 2635 }, { "epoch": 0.15759312320916904, "grad_norm": 0.8718281984329224, "learning_rate": 4.989843147934164e-05, "loss": 0.6381, "mean_token_accuracy": 0.8051175117492676, "num_tokens": 88460500.0, "step": 2640 }, { "epoch": 0.15789159503342884, "grad_norm": 0.7283470630645752, "learning_rate": 4.989737483639018e-05, "loss": 0.6376, "mean_token_accuracy": 0.8044673800468445, "num_tokens": 88628180.0, "step": 2645 }, { "epoch": 0.15819006685768863, "grad_norm": 0.7733397483825684, "learning_rate": 4.989631273814465e-05, "loss": 0.6539, "mean_token_accuracy": 0.7983180046081543, "num_tokens": 88795860.0, "step": 2650 }, { "epoch": 0.15848853868194843, "grad_norm": 0.7224664092063904, "learning_rate": 4.9895245184863775e-05, "loss": 0.6389, "mean_token_accuracy": 0.8033221960067749, "num_tokens": 88963540.0, "step": 2655 }, { "epoch": 0.15878701050620822, "grad_norm": 0.8396055102348328, "learning_rate": 4.989417217680757e-05, "loss": 0.6487, "mean_token_accuracy": 0.803220796585083, "num_tokens": 89131220.0, "step": 2660 }, { "epoch": 0.15908548233046801, "grad_norm": 0.7238900065422058, "learning_rate": 4.9893093714237384e-05, "loss": 0.6222, "mean_token_accuracy": 0.8101693868637085, "num_tokens": 89298900.0, "step": 2665 }, { "epoch": 0.15938395415472778, "grad_norm": 0.7223188877105713, "learning_rate": 4.9892009797415886e-05, "loss": 0.6257, "mean_token_accuracy": 0.8076583623886109, "num_tokens": 89466580.0, "step": 2670 }, { "epoch": 0.15968242597898757, "grad_norm": 0.7806656360626221, "learning_rate": 4.989092042660709e-05, "loss": 0.619, "mean_token_accuracy": 0.8088154673576355, "num_tokens": 89634260.0, "step": 2675 }, { "epoch": 0.15998089780324737, "grad_norm": 0.7205828428268433, "learning_rate": 4.988982560207635e-05, "loss": 0.6064, "mean_token_accuracy": 0.8118513584136963, "num_tokens": 89801940.0, "step": 2680 }, { "epoch": 0.16027936962750716, "grad_norm": 0.8668790459632874, "learning_rate": 4.98887253240903e-05, "loss": 0.6109, "mean_token_accuracy": 0.8123344779014587, "num_tokens": 89969620.0, "step": 2685 }, { "epoch": 0.16057784145176696, "grad_norm": 0.8031144738197327, "learning_rate": 4.9887619592916956e-05, "loss": 0.6615, "mean_token_accuracy": 0.7974710822105407, "num_tokens": 90137300.0, "step": 2690 }, { "epoch": 0.16087631327602675, "grad_norm": 0.7694268226623535, "learning_rate": 4.9886508408825635e-05, "loss": 0.5922, "mean_token_accuracy": 0.8175295233726502, "num_tokens": 90304980.0, "step": 2695 }, { "epoch": 0.16117478510028654, "grad_norm": 0.659078061580658, "learning_rate": 4.988539177208698e-05, "loss": 0.5937, "mean_token_accuracy": 0.8154300451278687, "num_tokens": 90472660.0, "step": 2700 }, { "epoch": 0.1614732569245463, "grad_norm": 0.7620792984962463, "learning_rate": 4.988426968297297e-05, "loss": 0.6588, "mean_token_accuracy": 0.7991291880607605, "num_tokens": 90640340.0, "step": 2705 }, { "epoch": 0.1617717287488061, "grad_norm": 0.7885122895240784, "learning_rate": 4.9883142141756925e-05, "loss": 0.6346, "mean_token_accuracy": 0.8048073530197144, "num_tokens": 90808020.0, "step": 2710 }, { "epoch": 0.1620702005730659, "grad_norm": 0.8176653385162354, "learning_rate": 4.9882009148713445e-05, "loss": 0.6411, "mean_token_accuracy": 0.8032267689704895, "num_tokens": 90975700.0, "step": 2715 }, { "epoch": 0.1623686723973257, "grad_norm": 0.7681753039360046, "learning_rate": 4.988087070411853e-05, "loss": 0.6167, "mean_token_accuracy": 0.8108433604240417, "num_tokens": 91143380.0, "step": 2720 }, { "epoch": 0.16266714422158549, "grad_norm": 0.9150328636169434, "learning_rate": 4.987972680824944e-05, "loss": 0.5778, "mean_token_accuracy": 0.822557556629181, "num_tokens": 91311060.0, "step": 2725 }, { "epoch": 0.16296561604584528, "grad_norm": 0.6954326033592224, "learning_rate": 4.987857746138481e-05, "loss": 0.5959, "mean_token_accuracy": 0.8174579501152038, "num_tokens": 91478740.0, "step": 2730 }, { "epoch": 0.16326408787010507, "grad_norm": 0.6668336391448975, "learning_rate": 4.9877422663804574e-05, "loss": 0.6131, "mean_token_accuracy": 0.8127937555313111, "num_tokens": 91646420.0, "step": 2735 }, { "epoch": 0.16356255969436484, "grad_norm": 0.7585785388946533, "learning_rate": 4.987626241579001e-05, "loss": 0.6386, "mean_token_accuracy": 0.8051234602928161, "num_tokens": 91814100.0, "step": 2740 }, { "epoch": 0.16386103151862463, "grad_norm": 0.7316513061523438, "learning_rate": 4.9875096717623716e-05, "loss": 0.6573, "mean_token_accuracy": 0.7992961883544922, "num_tokens": 91981780.0, "step": 2745 }, { "epoch": 0.16415950334288443, "grad_norm": 0.7090036869049072, "learning_rate": 4.987392556958962e-05, "loss": 0.5774, "mean_token_accuracy": 0.8201240777969361, "num_tokens": 92149460.0, "step": 2750 }, { "epoch": 0.16445797516714422, "grad_norm": 0.6841456294059753, "learning_rate": 4.9872748971972976e-05, "loss": 0.6074, "mean_token_accuracy": 0.8100679874420166, "num_tokens": 92317140.0, "step": 2755 }, { "epoch": 0.16475644699140402, "grad_norm": 0.6879178285598755, "learning_rate": 4.987156692506037e-05, "loss": 0.6216, "mean_token_accuracy": 0.8083144426345825, "num_tokens": 92484820.0, "step": 2760 }, { "epoch": 0.1650549188156638, "grad_norm": 0.7845852375030518, "learning_rate": 4.9870379429139694e-05, "loss": 0.6308, "mean_token_accuracy": 0.8065370440483093, "num_tokens": 92652500.0, "step": 2765 }, { "epoch": 0.1653533906399236, "grad_norm": 0.7859224677085876, "learning_rate": 4.986918648450021e-05, "loss": 0.6197, "mean_token_accuracy": 0.8071454048156739, "num_tokens": 92820180.0, "step": 2770 }, { "epoch": 0.16565186246418337, "grad_norm": 0.7830347418785095, "learning_rate": 4.9867988091432475e-05, "loss": 0.6003, "mean_token_accuracy": 0.8133305430412292, "num_tokens": 92987860.0, "step": 2775 }, { "epoch": 0.16595033428844316, "grad_norm": 0.733372688293457, "learning_rate": 4.986678425022836e-05, "loss": 0.5964, "mean_token_accuracy": 0.816903269290924, "num_tokens": 93155540.0, "step": 2780 }, { "epoch": 0.16624880611270296, "grad_norm": 0.8022065758705139, "learning_rate": 4.986557496118111e-05, "loss": 0.7015, "mean_token_accuracy": 0.785088860988617, "num_tokens": 93323220.0, "step": 2785 }, { "epoch": 0.16654727793696275, "grad_norm": 0.7595603466033936, "learning_rate": 4.986436022458525e-05, "loss": 0.6271, "mean_token_accuracy": 0.8079506158828735, "num_tokens": 93490900.0, "step": 2790 }, { "epoch": 0.16684574976122254, "grad_norm": 0.7025103569030762, "learning_rate": 4.9863140040736664e-05, "loss": 0.626, "mean_token_accuracy": 0.8075271368026733, "num_tokens": 93658580.0, "step": 2795 }, { "epoch": 0.16714422158548234, "grad_norm": 0.7312909364700317, "learning_rate": 4.9861914409932534e-05, "loss": 0.5975, "mean_token_accuracy": 0.815901231765747, "num_tokens": 93826260.0, "step": 2800 }, { "epoch": 0.16744269340974213, "grad_norm": 0.6187756061553955, "learning_rate": 4.9860683332471405e-05, "loss": 0.602, "mean_token_accuracy": 0.8137599945068359, "num_tokens": 93993940.0, "step": 2805 }, { "epoch": 0.1677411652340019, "grad_norm": 0.6624660491943359, "learning_rate": 4.985944680865312e-05, "loss": 0.6563, "mean_token_accuracy": 0.7977514147758484, "num_tokens": 94161620.0, "step": 2810 }, { "epoch": 0.1680396370582617, "grad_norm": 0.7377040386199951, "learning_rate": 4.985820483877885e-05, "loss": 0.6423, "mean_token_accuracy": 0.8036800622940063, "num_tokens": 94329300.0, "step": 2815 }, { "epoch": 0.1683381088825215, "grad_norm": 0.6923050880432129, "learning_rate": 4.985695742315111e-05, "loss": 0.6382, "mean_token_accuracy": 0.8044733285903931, "num_tokens": 94496980.0, "step": 2820 }, { "epoch": 0.16863658070678128, "grad_norm": 0.6681461334228516, "learning_rate": 4.9855704562073725e-05, "loss": 0.6192, "mean_token_accuracy": 0.8077597379684448, "num_tokens": 94664660.0, "step": 2825 }, { "epoch": 0.16893505253104107, "grad_norm": 0.6664122939109802, "learning_rate": 4.985444625585186e-05, "loss": 0.6101, "mean_token_accuracy": 0.8125134229660034, "num_tokens": 94832340.0, "step": 2830 }, { "epoch": 0.16923352435530087, "grad_norm": 0.7340320944786072, "learning_rate": 4.985318250479199e-05, "loss": 0.6455, "mean_token_accuracy": 0.8028092503547668, "num_tokens": 95000020.0, "step": 2835 }, { "epoch": 0.16953199617956066, "grad_norm": 0.7466009259223938, "learning_rate": 4.985191330920193e-05, "loss": 0.6375, "mean_token_accuracy": 0.8048819541931153, "num_tokens": 95166202.0, "step": 2840 }, { "epoch": 0.16983046800382043, "grad_norm": 0.678203284740448, "learning_rate": 4.9850638669390816e-05, "loss": 0.5868, "mean_token_accuracy": 0.8190087080001831, "num_tokens": 95333882.0, "step": 2845 }, { "epoch": 0.17012893982808022, "grad_norm": 0.6863088607788086, "learning_rate": 4.98493585856691e-05, "loss": 0.6066, "mean_token_accuracy": 0.8119885325431824, "num_tokens": 95501562.0, "step": 2850 }, { "epoch": 0.17042741165234002, "grad_norm": 0.7498096227645874, "learning_rate": 4.984807305834859e-05, "loss": 0.6694, "mean_token_accuracy": 0.7951270341873169, "num_tokens": 95669242.0, "step": 2855 }, { "epoch": 0.1707258834765998, "grad_norm": 0.6950475573539734, "learning_rate": 4.9846782087742384e-05, "loss": 0.6398, "mean_token_accuracy": 0.8023857831954956, "num_tokens": 95836922.0, "step": 2860 }, { "epoch": 0.1710243553008596, "grad_norm": 0.7693982124328613, "learning_rate": 4.984548567416493e-05, "loss": 0.62, "mean_token_accuracy": 0.8073959231376648, "num_tokens": 96004602.0, "step": 2865 }, { "epoch": 0.1713228271251194, "grad_norm": 0.6745275855064392, "learning_rate": 4.9844183817931985e-05, "loss": 0.6382, "mean_token_accuracy": 0.80477055311203, "num_tokens": 96166897.0, "step": 2870 }, { "epoch": 0.1716212989493792, "grad_norm": 0.6974903345108032, "learning_rate": 4.9842876519360635e-05, "loss": 0.5975, "mean_token_accuracy": 0.8148157000541687, "num_tokens": 96334577.0, "step": 2875 }, { "epoch": 0.17191977077363896, "grad_norm": 0.723353385925293, "learning_rate": 4.984156377876932e-05, "loss": 0.6336, "mean_token_accuracy": 0.8037397146224976, "num_tokens": 96502257.0, "step": 2880 }, { "epoch": 0.17221824259789875, "grad_norm": 0.6630491018295288, "learning_rate": 4.984024559647777e-05, "loss": 0.6047, "mean_token_accuracy": 0.813408088684082, "num_tokens": 96669937.0, "step": 2885 }, { "epoch": 0.17251671442215855, "grad_norm": 0.6247820258140564, "learning_rate": 4.983892197280705e-05, "loss": 0.6116, "mean_token_accuracy": 0.8108195066452026, "num_tokens": 96837617.0, "step": 2890 }, { "epoch": 0.17281518624641834, "grad_norm": 0.6820305585861206, "learning_rate": 4.9837592908079544e-05, "loss": 0.5859, "mean_token_accuracy": 0.8177740693092346, "num_tokens": 97005297.0, "step": 2895 }, { "epoch": 0.17311365807067813, "grad_norm": 0.6570559740066528, "learning_rate": 4.983625840261899e-05, "loss": 0.6351, "mean_token_accuracy": 0.8049326062202453, "num_tokens": 97172977.0, "step": 2900 }, { "epoch": 0.17341212989493793, "grad_norm": 0.7028390169143677, "learning_rate": 4.9834918456750417e-05, "loss": 0.6003, "mean_token_accuracy": 0.8141059160232544, "num_tokens": 97340657.0, "step": 2905 }, { "epoch": 0.17371060171919772, "grad_norm": 0.7124725580215454, "learning_rate": 4.98335730708002e-05, "loss": 0.6173, "mean_token_accuracy": 0.8082548022270203, "num_tokens": 97508337.0, "step": 2910 }, { "epoch": 0.1740090735434575, "grad_norm": 0.6809436082839966, "learning_rate": 4.983222224509604e-05, "loss": 0.6272, "mean_token_accuracy": 0.8079923629760742, "num_tokens": 97676017.0, "step": 2915 }, { "epoch": 0.17430754536771728, "grad_norm": 0.7032147645950317, "learning_rate": 4.983086597996694e-05, "loss": 0.6416, "mean_token_accuracy": 0.8048073410987854, "num_tokens": 97843697.0, "step": 2920 }, { "epoch": 0.17460601719197708, "grad_norm": 0.7811126708984375, "learning_rate": 4.982950427574325e-05, "loss": 0.6362, "mean_token_accuracy": 0.8042168617248535, "num_tokens": 98011377.0, "step": 2925 }, { "epoch": 0.17490448901623687, "grad_norm": 0.8064723014831543, "learning_rate": 4.982813713275664e-05, "loss": 0.593, "mean_token_accuracy": 0.8176905512809753, "num_tokens": 98179057.0, "step": 2930 }, { "epoch": 0.17520296084049666, "grad_norm": 0.7125324606895447, "learning_rate": 4.982676455134009e-05, "loss": 0.5986, "mean_token_accuracy": 0.8163604974746704, "num_tokens": 98346737.0, "step": 2935 }, { "epoch": 0.17550143266475646, "grad_norm": 0.6731593012809753, "learning_rate": 4.982538653182794e-05, "loss": 0.6518, "mean_token_accuracy": 0.8014255046844483, "num_tokens": 98514417.0, "step": 2940 }, { "epoch": 0.17579990448901622, "grad_norm": 0.6540849208831787, "learning_rate": 4.982400307455582e-05, "loss": 0.5967, "mean_token_accuracy": 0.8152034044265747, "num_tokens": 98682097.0, "step": 2945 }, { "epoch": 0.17609837631327602, "grad_norm": 0.6249152421951294, "learning_rate": 4.982261417986068e-05, "loss": 0.5809, "mean_token_accuracy": 0.8181856155395508, "num_tokens": 98849777.0, "step": 2950 }, { "epoch": 0.1763968481375358, "grad_norm": 0.6893585324287415, "learning_rate": 4.9821219848080844e-05, "loss": 0.584, "mean_token_accuracy": 0.8165036320686341, "num_tokens": 99017457.0, "step": 2955 }, { "epoch": 0.1766953199617956, "grad_norm": 0.6397733688354492, "learning_rate": 4.981982007955592e-05, "loss": 0.5348, "mean_token_accuracy": 0.8326851844787597, "num_tokens": 99185137.0, "step": 2960 }, { "epoch": 0.1769937917860554, "grad_norm": 0.6592968106269836, "learning_rate": 4.981841487462683e-05, "loss": 0.6095, "mean_token_accuracy": 0.8119587302207947, "num_tokens": 99352817.0, "step": 2965 }, { "epoch": 0.1772922636103152, "grad_norm": 0.6834815740585327, "learning_rate": 4.981700423363584e-05, "loss": 0.6473, "mean_token_accuracy": 0.8011988520622253, "num_tokens": 99520497.0, "step": 2970 }, { "epoch": 0.177590735434575, "grad_norm": 0.8949568867683411, "learning_rate": 4.9815588156926553e-05, "loss": 0.6138, "mean_token_accuracy": 0.8123881816864014, "num_tokens": 99688177.0, "step": 2975 }, { "epoch": 0.17788920725883475, "grad_norm": 0.6223493814468384, "learning_rate": 4.981416664484387e-05, "loss": 0.5837, "mean_token_accuracy": 0.8190564274787903, "num_tokens": 99855857.0, "step": 2980 }, { "epoch": 0.17818767908309455, "grad_norm": 0.6669052839279175, "learning_rate": 4.9812739697734024e-05, "loss": 0.6004, "mean_token_accuracy": 0.8143504738807679, "num_tokens": 100023537.0, "step": 2985 }, { "epoch": 0.17848615090735434, "grad_norm": 0.6864182949066162, "learning_rate": 4.9811307315944585e-05, "loss": 0.6337, "mean_token_accuracy": 0.8044136762619019, "num_tokens": 100191217.0, "step": 2990 }, { "epoch": 0.17878462273161413, "grad_norm": 0.6830667853355408, "learning_rate": 4.980986949982443e-05, "loss": 0.6015, "mean_token_accuracy": 0.8124478101730347, "num_tokens": 100358897.0, "step": 2995 }, { "epoch": 0.17908309455587393, "grad_norm": 0.6315885186195374, "learning_rate": 4.980842624972376e-05, "loss": 0.5787, "mean_token_accuracy": 0.820577347278595, "num_tokens": 100526577.0, "step": 3000 }, { "epoch": 0.17938156638013372, "grad_norm": 0.6943024396896362, "learning_rate": 4.9806977565994113e-05, "loss": 0.5997, "mean_token_accuracy": 0.8127636313438416, "num_tokens": 100688735.0, "step": 3005 }, { "epoch": 0.17968003820439352, "grad_norm": 0.7130090594291687, "learning_rate": 4.980552344898834e-05, "loss": 0.5806, "mean_token_accuracy": 0.8186448812484741, "num_tokens": 100856415.0, "step": 3010 }, { "epoch": 0.17997851002865328, "grad_norm": 0.7095332741737366, "learning_rate": 4.980406389906062e-05, "loss": 0.5687, "mean_token_accuracy": 0.8244721531867981, "num_tokens": 101024095.0, "step": 3015 }, { "epoch": 0.18027698185291308, "grad_norm": 0.7395371198654175, "learning_rate": 4.980259891656644e-05, "loss": 0.6091, "mean_token_accuracy": 0.8115829706192017, "num_tokens": 101191775.0, "step": 3020 }, { "epoch": 0.18057545367717287, "grad_norm": 0.6890239715576172, "learning_rate": 4.980112850186265e-05, "loss": 0.5699, "mean_token_accuracy": 0.8231659293174743, "num_tokens": 101359455.0, "step": 3025 }, { "epoch": 0.18087392550143266, "grad_norm": 0.614687979221344, "learning_rate": 4.979965265530738e-05, "loss": 0.599, "mean_token_accuracy": 0.8175116300582885, "num_tokens": 101527135.0, "step": 3030 }, { "epoch": 0.18117239732569246, "grad_norm": 0.6636961698532104, "learning_rate": 4.979817137726009e-05, "loss": 0.5665, "mean_token_accuracy": 0.8247405409812927, "num_tokens": 101694815.0, "step": 3035 }, { "epoch": 0.18147086914995225, "grad_norm": 0.6953255534172058, "learning_rate": 4.97966846680816e-05, "loss": 0.5906, "mean_token_accuracy": 0.8145651936531066, "num_tokens": 101862495.0, "step": 3040 }, { "epoch": 0.18176934097421205, "grad_norm": 0.7299449443817139, "learning_rate": 4.979519252813399e-05, "loss": 0.607, "mean_token_accuracy": 0.8143743276596069, "num_tokens": 102030175.0, "step": 3045 }, { "epoch": 0.1820678127984718, "grad_norm": 0.644241988658905, "learning_rate": 4.979369495778074e-05, "loss": 0.5778, "mean_token_accuracy": 0.8193546414375306, "num_tokens": 102197855.0, "step": 3050 }, { "epoch": 0.1823662846227316, "grad_norm": 0.7699316740036011, "learning_rate": 4.979219195738658e-05, "loss": 0.5912, "mean_token_accuracy": 0.815036392211914, "num_tokens": 102365535.0, "step": 3055 }, { "epoch": 0.1826647564469914, "grad_norm": 0.7355851531028748, "learning_rate": 4.9790683527317594e-05, "loss": 0.604, "mean_token_accuracy": 0.8115889430046082, "num_tokens": 102533215.0, "step": 3060 }, { "epoch": 0.1829632282712512, "grad_norm": 0.7343382239341736, "learning_rate": 4.9789169667941196e-05, "loss": 0.6401, "mean_token_accuracy": 0.8039067268371582, "num_tokens": 102700895.0, "step": 3065 }, { "epoch": 0.183261700095511, "grad_norm": 0.7042121887207031, "learning_rate": 4.978765037962612e-05, "loss": 0.603, "mean_token_accuracy": 0.8136764883995056, "num_tokens": 102868575.0, "step": 3070 }, { "epoch": 0.18356017191977078, "grad_norm": 0.6647803783416748, "learning_rate": 4.978612566274241e-05, "loss": 0.6811, "mean_token_accuracy": 0.7922700762748718, "num_tokens": 103036255.0, "step": 3075 }, { "epoch": 0.18385864374403058, "grad_norm": 0.7692590355873108, "learning_rate": 4.9784595517661445e-05, "loss": 0.6412, "mean_token_accuracy": 0.8045568346977234, "num_tokens": 103203935.0, "step": 3080 }, { "epoch": 0.18415711556829034, "grad_norm": 0.6967044472694397, "learning_rate": 4.978305994475591e-05, "loss": 0.6041, "mean_token_accuracy": 0.8128474235534668, "num_tokens": 103371615.0, "step": 3085 }, { "epoch": 0.18445558739255014, "grad_norm": 0.6589059233665466, "learning_rate": 4.978151894439983e-05, "loss": 0.6225, "mean_token_accuracy": 0.8084814548492432, "num_tokens": 103539295.0, "step": 3090 }, { "epoch": 0.18475405921680993, "grad_norm": 0.6136396527290344, "learning_rate": 4.977997251696854e-05, "loss": 0.5857, "mean_token_accuracy": 0.8194679617881775, "num_tokens": 103706975.0, "step": 3095 }, { "epoch": 0.18505253104106972, "grad_norm": 0.6578737497329712, "learning_rate": 4.97784206628387e-05, "loss": 0.6558, "mean_token_accuracy": 0.7991649985313416, "num_tokens": 103874655.0, "step": 3100 }, { "epoch": 0.18535100286532952, "grad_norm": 0.6422058343887329, "learning_rate": 4.9776863382388294e-05, "loss": 0.6234, "mean_token_accuracy": 0.8081474423408508, "num_tokens": 104042335.0, "step": 3105 }, { "epoch": 0.1856494746895893, "grad_norm": 0.7381056547164917, "learning_rate": 4.977530067599664e-05, "loss": 0.6141, "mean_token_accuracy": 0.8085709333419799, "num_tokens": 104210015.0, "step": 3110 }, { "epoch": 0.1859479465138491, "grad_norm": 0.7627916932106018, "learning_rate": 4.977373254404434e-05, "loss": 0.6362, "mean_token_accuracy": 0.8067756175994873, "num_tokens": 104377695.0, "step": 3115 }, { "epoch": 0.18624641833810887, "grad_norm": 0.6562960147857666, "learning_rate": 4.977215898691335e-05, "loss": 0.6414, "mean_token_accuracy": 0.8028689026832581, "num_tokens": 104545375.0, "step": 3120 }, { "epoch": 0.18654489016236867, "grad_norm": 0.7071141004562378, "learning_rate": 4.977058000498696e-05, "loss": 0.6396, "mean_token_accuracy": 0.8021650910377502, "num_tokens": 104713055.0, "step": 3125 }, { "epoch": 0.18684336198662846, "grad_norm": 0.6516364216804504, "learning_rate": 4.976899559864972e-05, "loss": 0.6281, "mean_token_accuracy": 0.805702006816864, "num_tokens": 104880735.0, "step": 3130 }, { "epoch": 0.18714183381088825, "grad_norm": 0.6863721013069153, "learning_rate": 4.9767405768287575e-05, "loss": 0.641, "mean_token_accuracy": 0.8012048006057739, "num_tokens": 105048415.0, "step": 3135 }, { "epoch": 0.18744030563514805, "grad_norm": 0.6565121412277222, "learning_rate": 4.9765810514287734e-05, "loss": 0.643, "mean_token_accuracy": 0.8026303172111511, "num_tokens": 105216095.0, "step": 3140 }, { "epoch": 0.18773877745940784, "grad_norm": 0.6837484836578369, "learning_rate": 4.976420983703878e-05, "loss": 0.65, "mean_token_accuracy": 0.8001610398292541, "num_tokens": 105383775.0, "step": 3145 }, { "epoch": 0.18803724928366763, "grad_norm": 0.6963145732879639, "learning_rate": 4.976260373693056e-05, "loss": 0.6584, "mean_token_accuracy": 0.8001789212226867, "num_tokens": 105551455.0, "step": 3150 }, { "epoch": 0.1883357211079274, "grad_norm": 0.7384754419326782, "learning_rate": 4.976099221435428e-05, "loss": 0.6259, "mean_token_accuracy": 0.808672308921814, "num_tokens": 105719135.0, "step": 3155 }, { "epoch": 0.1886341929321872, "grad_norm": 0.7297983765602112, "learning_rate": 4.9759375269702455e-05, "loss": 0.637, "mean_token_accuracy": 0.8058511257171631, "num_tokens": 105886815.0, "step": 3160 }, { "epoch": 0.188932664756447, "grad_norm": 0.6801038384437561, "learning_rate": 4.975775290336893e-05, "loss": 0.649, "mean_token_accuracy": 0.8018847703933716, "num_tokens": 106054495.0, "step": 3165 }, { "epoch": 0.18923113658070678, "grad_norm": 0.6962680220603943, "learning_rate": 4.975612511574884e-05, "loss": 0.6377, "mean_token_accuracy": 0.8036621570587158, "num_tokens": 106222175.0, "step": 3170 }, { "epoch": 0.18952960840496658, "grad_norm": 0.6041184067726135, "learning_rate": 4.975449190723869e-05, "loss": 0.5969, "mean_token_accuracy": 0.8162590861320496, "num_tokens": 106389855.0, "step": 3175 }, { "epoch": 0.18982808022922637, "grad_norm": 0.7075260281562805, "learning_rate": 4.9752853278236256e-05, "loss": 0.6197, "mean_token_accuracy": 0.8071334838867188, "num_tokens": 106557535.0, "step": 3180 }, { "epoch": 0.19012655205348616, "grad_norm": 0.7273681163787842, "learning_rate": 4.975120922914067e-05, "loss": 0.6314, "mean_token_accuracy": 0.8052725791931152, "num_tokens": 106725215.0, "step": 3185 }, { "epoch": 0.19042502387774593, "grad_norm": 0.8485504388809204, "learning_rate": 4.974955976035236e-05, "loss": 0.6224, "mean_token_accuracy": 0.8077001094818115, "num_tokens": 106892895.0, "step": 3190 }, { "epoch": 0.19072349570200572, "grad_norm": 0.7034767270088196, "learning_rate": 4.97479048722731e-05, "loss": 0.5787, "mean_token_accuracy": 0.8210724115371704, "num_tokens": 107060575.0, "step": 3195 }, { "epoch": 0.19102196752626552, "grad_norm": 0.7601812481880188, "learning_rate": 4.9746244565305956e-05, "loss": 0.6551, "mean_token_accuracy": 0.8004055857658386, "num_tokens": 107228255.0, "step": 3200 }, { "epoch": 0.1913204393505253, "grad_norm": 0.5988834500312805, "learning_rate": 4.974457883985533e-05, "loss": 0.5668, "mean_token_accuracy": 0.8238458752632141, "num_tokens": 107395935.0, "step": 3205 }, { "epoch": 0.1916189111747851, "grad_norm": 0.6376057267189026, "learning_rate": 4.9742907696326935e-05, "loss": 0.5963, "mean_token_accuracy": 0.8158354997634888, "num_tokens": 107558903.0, "step": 3210 }, { "epoch": 0.1919173829990449, "grad_norm": 0.6359529495239258, "learning_rate": 4.974123113512782e-05, "loss": 0.574, "mean_token_accuracy": 0.8197363615036011, "num_tokens": 107726583.0, "step": 3215 }, { "epoch": 0.1922158548233047, "grad_norm": 0.7011116743087769, "learning_rate": 4.9739549156666334e-05, "loss": 0.6012, "mean_token_accuracy": 0.8162233233451843, "num_tokens": 107894263.0, "step": 3220 }, { "epoch": 0.19251432664756446, "grad_norm": 0.7204210162162781, "learning_rate": 4.973786176135215e-05, "loss": 0.6283, "mean_token_accuracy": 0.8073661088943481, "num_tokens": 108061943.0, "step": 3225 }, { "epoch": 0.19281279847182425, "grad_norm": 0.6594974994659424, "learning_rate": 4.9736168949596276e-05, "loss": 0.5804, "mean_token_accuracy": 0.819086229801178, "num_tokens": 108229623.0, "step": 3230 }, { "epoch": 0.19311127029608405, "grad_norm": 0.6742432713508606, "learning_rate": 4.973447072181102e-05, "loss": 0.5868, "mean_token_accuracy": 0.8188238143920898, "num_tokens": 108397303.0, "step": 3235 }, { "epoch": 0.19340974212034384, "grad_norm": 0.6995094418525696, "learning_rate": 4.973276707841001e-05, "loss": 0.6454, "mean_token_accuracy": 0.8005725860595703, "num_tokens": 108564983.0, "step": 3240 }, { "epoch": 0.19370821394460364, "grad_norm": 0.6854166984558105, "learning_rate": 4.9731058019808225e-05, "loss": 0.5931, "mean_token_accuracy": 0.816115927696228, "num_tokens": 108732663.0, "step": 3245 }, { "epoch": 0.19400668576886343, "grad_norm": 0.7256622314453125, "learning_rate": 4.972934354642191e-05, "loss": 0.5934, "mean_token_accuracy": 0.8162113785743713, "num_tokens": 108900343.0, "step": 3250 }, { "epoch": 0.1943051575931232, "grad_norm": 0.6493568420410156, "learning_rate": 4.9727623658668674e-05, "loss": 0.6268, "mean_token_accuracy": 0.8067100048065186, "num_tokens": 109068023.0, "step": 3255 }, { "epoch": 0.194603629417383, "grad_norm": 0.8499940633773804, "learning_rate": 4.972589835696742e-05, "loss": 0.6285, "mean_token_accuracy": 0.8055290460586548, "num_tokens": 109235703.0, "step": 3260 }, { "epoch": 0.19490210124164278, "grad_norm": 0.6785118579864502, "learning_rate": 4.9724167641738375e-05, "loss": 0.6376, "mean_token_accuracy": 0.8027734637260437, "num_tokens": 109403383.0, "step": 3265 }, { "epoch": 0.19520057306590258, "grad_norm": 0.6749463677406311, "learning_rate": 4.972243151340309e-05, "loss": 0.5759, "mean_token_accuracy": 0.8217225313186646, "num_tokens": 109571063.0, "step": 3270 }, { "epoch": 0.19549904489016237, "grad_norm": 0.6525623202323914, "learning_rate": 4.9720689972384435e-05, "loss": 0.6043, "mean_token_accuracy": 0.8143862724304199, "num_tokens": 109738743.0, "step": 3275 }, { "epoch": 0.19579751671442217, "grad_norm": 0.6807870864868164, "learning_rate": 4.971894301910658e-05, "loss": 0.6436, "mean_token_accuracy": 0.80174161195755, "num_tokens": 109906423.0, "step": 3280 }, { "epoch": 0.19609598853868196, "grad_norm": 0.6803872585296631, "learning_rate": 4.971719065399505e-05, "loss": 0.5847, "mean_token_accuracy": 0.8211976647377014, "num_tokens": 110074103.0, "step": 3285 }, { "epoch": 0.19639446036294173, "grad_norm": 0.6901847124099731, "learning_rate": 4.971543287747665e-05, "loss": 0.6033, "mean_token_accuracy": 0.8121905088424682, "num_tokens": 110233485.0, "step": 3290 }, { "epoch": 0.19669293218720152, "grad_norm": 0.7427123785018921, "learning_rate": 4.971366968997951e-05, "loss": 0.6064, "mean_token_accuracy": 0.8110998511314392, "num_tokens": 110401165.0, "step": 3295 }, { "epoch": 0.1969914040114613, "grad_norm": 0.7191442847251892, "learning_rate": 4.971190109193312e-05, "loss": 0.5735, "mean_token_accuracy": 0.8219611287117005, "num_tokens": 110568845.0, "step": 3300 }, { "epoch": 0.1972898758357211, "grad_norm": 0.66484534740448, "learning_rate": 4.971012708376822e-05, "loss": 0.6363, "mean_token_accuracy": 0.8046343803405762, "num_tokens": 110736525.0, "step": 3305 }, { "epoch": 0.1975883476599809, "grad_norm": 0.6512221693992615, "learning_rate": 4.970834766591692e-05, "loss": 0.5841, "mean_token_accuracy": 0.8190087080001831, "num_tokens": 110904205.0, "step": 3310 }, { "epoch": 0.1978868194842407, "grad_norm": 0.5992953181266785, "learning_rate": 4.9706562838812634e-05, "loss": 0.6738, "mean_token_accuracy": 0.7961708307266235, "num_tokens": 111071885.0, "step": 3315 }, { "epoch": 0.1981852913085005, "grad_norm": 0.6900875568389893, "learning_rate": 4.9704772602890075e-05, "loss": 0.6207, "mean_token_accuracy": 0.8099308133125305, "num_tokens": 111239565.0, "step": 3320 }, { "epoch": 0.19848376313276025, "grad_norm": 0.6812499165534973, "learning_rate": 4.97029769585853e-05, "loss": 0.6232, "mean_token_accuracy": 0.8077537894248963, "num_tokens": 111407245.0, "step": 3325 }, { "epoch": 0.19878223495702005, "grad_norm": 0.6209660172462463, "learning_rate": 4.9701175906335665e-05, "loss": 0.6258, "mean_token_accuracy": 0.806203031539917, "num_tokens": 111574925.0, "step": 3330 }, { "epoch": 0.19908070678127984, "grad_norm": 0.7421411275863647, "learning_rate": 4.969936944657985e-05, "loss": 0.6517, "mean_token_accuracy": 0.8002862930297852, "num_tokens": 111742605.0, "step": 3335 }, { "epoch": 0.19937917860553964, "grad_norm": 0.6757756471633911, "learning_rate": 4.969755757975785e-05, "loss": 0.6736, "mean_token_accuracy": 0.7945544481277466, "num_tokens": 111910285.0, "step": 3340 }, { "epoch": 0.19967765042979943, "grad_norm": 0.6517324447631836, "learning_rate": 4.969574030631098e-05, "loss": 0.6072, "mean_token_accuracy": 0.8123881578445434, "num_tokens": 112077965.0, "step": 3345 }, { "epoch": 0.19997612225405922, "grad_norm": 0.7713308930397034, "learning_rate": 4.969391762668188e-05, "loss": 0.5744, "mean_token_accuracy": 0.8211737990379333, "num_tokens": 112245645.0, "step": 3350 }, { "epoch": 0.20027459407831902, "grad_norm": 0.6571708917617798, "learning_rate": 4.9692089541314485e-05, "loss": 0.5806, "mean_token_accuracy": 0.8187880158424378, "num_tokens": 112413325.0, "step": 3355 }, { "epoch": 0.20057306590257878, "grad_norm": 0.5897857546806335, "learning_rate": 4.969025605065407e-05, "loss": 0.5863, "mean_token_accuracy": 0.8179649233818054, "num_tokens": 112581005.0, "step": 3360 }, { "epoch": 0.20087153772683858, "grad_norm": 0.6495668292045593, "learning_rate": 4.9688417155147207e-05, "loss": 0.5984, "mean_token_accuracy": 0.8132231712341309, "num_tokens": 112748685.0, "step": 3365 }, { "epoch": 0.20117000955109837, "grad_norm": 0.7834144830703735, "learning_rate": 4.9686572855241784e-05, "loss": 0.6144, "mean_token_accuracy": 0.8103304386138916, "num_tokens": 112916365.0, "step": 3370 }, { "epoch": 0.20146848137535817, "grad_norm": 0.7511405348777771, "learning_rate": 4.968472315138703e-05, "loss": 0.5892, "mean_token_accuracy": 0.8186700940132141, "num_tokens": 113079045.0, "step": 3375 }, { "epoch": 0.20176695319961796, "grad_norm": 0.6783428192138672, "learning_rate": 4.968286804403347e-05, "loss": 0.6151, "mean_token_accuracy": 0.8093164801597595, "num_tokens": 113246725.0, "step": 3380 }, { "epoch": 0.20206542502387775, "grad_norm": 0.6980903148651123, "learning_rate": 4.968100753363295e-05, "loss": 0.6271, "mean_token_accuracy": 0.8070857763290405, "num_tokens": 113414405.0, "step": 3385 }, { "epoch": 0.20236389684813755, "grad_norm": 0.6501882672309875, "learning_rate": 4.9679141620638634e-05, "loss": 0.6166, "mean_token_accuracy": 0.8123693943023682, "num_tokens": 113575308.0, "step": 3390 }, { "epoch": 0.20266236867239731, "grad_norm": 0.6438986659049988, "learning_rate": 4.9677270305504995e-05, "loss": 0.6031, "mean_token_accuracy": 0.8148276209831238, "num_tokens": 113742988.0, "step": 3395 }, { "epoch": 0.2029608404966571, "grad_norm": 0.7259078025817871, "learning_rate": 4.967539358868783e-05, "loss": 0.603, "mean_token_accuracy": 0.8132351279258728, "num_tokens": 113910668.0, "step": 3400 }, { "epoch": 0.2032593123209169, "grad_norm": 0.659095048904419, "learning_rate": 4.9673511470644236e-05, "loss": 0.6468, "mean_token_accuracy": 0.8014254927635193, "num_tokens": 114078348.0, "step": 3405 }, { "epoch": 0.2035577841451767, "grad_norm": 0.6263108849525452, "learning_rate": 4.9671623951832644e-05, "loss": 0.5949, "mean_token_accuracy": 0.8181438565254211, "num_tokens": 114246028.0, "step": 3410 }, { "epoch": 0.2038562559694365, "grad_norm": 0.8719925880432129, "learning_rate": 4.9669731032712814e-05, "loss": 0.6868, "mean_token_accuracy": 0.7903018116950988, "num_tokens": 114413708.0, "step": 3415 }, { "epoch": 0.20415472779369628, "grad_norm": 0.7731387615203857, "learning_rate": 4.9667832713745774e-05, "loss": 0.6717, "mean_token_accuracy": 0.7944709539413453, "num_tokens": 114581388.0, "step": 3420 }, { "epoch": 0.20445319961795608, "grad_norm": 0.6292768716812134, "learning_rate": 4.9665928995393904e-05, "loss": 0.5709, "mean_token_accuracy": 0.823332941532135, "num_tokens": 114749068.0, "step": 3425 }, { "epoch": 0.20475167144221584, "grad_norm": 0.6553537845611572, "learning_rate": 4.9664019878120896e-05, "loss": 0.6282, "mean_token_accuracy": 0.8055290460586548, "num_tokens": 114916748.0, "step": 3430 }, { "epoch": 0.20505014326647564, "grad_norm": 0.6622532606124878, "learning_rate": 4.966210536239175e-05, "loss": 0.6241, "mean_token_accuracy": 0.8065608859062194, "num_tokens": 115084428.0, "step": 3435 }, { "epoch": 0.20534861509073543, "grad_norm": 0.6557806730270386, "learning_rate": 4.9660185448672764e-05, "loss": 0.6005, "mean_token_accuracy": 0.8137540221214294, "num_tokens": 115252108.0, "step": 3440 }, { "epoch": 0.20564708691499523, "grad_norm": 0.6983974575996399, "learning_rate": 4.9658260137431586e-05, "loss": 0.6516, "mean_token_accuracy": 0.8017356514930725, "num_tokens": 115419788.0, "step": 3445 }, { "epoch": 0.20594555873925502, "grad_norm": 0.7253179550170898, "learning_rate": 4.965632942913717e-05, "loss": 0.6372, "mean_token_accuracy": 0.8035786628723145, "num_tokens": 115587468.0, "step": 3450 }, { "epoch": 0.2062440305635148, "grad_norm": 0.6325120329856873, "learning_rate": 4.965439332425975e-05, "loss": 0.622, "mean_token_accuracy": 0.8079983353614807, "num_tokens": 115755148.0, "step": 3455 }, { "epoch": 0.2065425023877746, "grad_norm": 0.6540398001670837, "learning_rate": 4.965245182327092e-05, "loss": 0.6245, "mean_token_accuracy": 0.8069963097572327, "num_tokens": 115922828.0, "step": 3460 }, { "epoch": 0.20684097421203437, "grad_norm": 0.6008408069610596, "learning_rate": 4.9650504926643554e-05, "loss": 0.5675, "mean_token_accuracy": 0.8241619944572449, "num_tokens": 116090508.0, "step": 3465 }, { "epoch": 0.20713944603629417, "grad_norm": 1.1994218826293945, "learning_rate": 4.9648552634851864e-05, "loss": 0.6094, "mean_token_accuracy": 0.8137122631072998, "num_tokens": 116258188.0, "step": 3470 }, { "epoch": 0.20743791786055396, "grad_norm": 0.6906866431236267, "learning_rate": 4.964659494837136e-05, "loss": 0.6335, "mean_token_accuracy": 0.8043421149253845, "num_tokens": 116425868.0, "step": 3475 }, { "epoch": 0.20773638968481375, "grad_norm": 0.6637681126594543, "learning_rate": 4.964463186767888e-05, "loss": 0.6357, "mean_token_accuracy": 0.804962420463562, "num_tokens": 116593548.0, "step": 3480 }, { "epoch": 0.20803486150907355, "grad_norm": 0.6916055083274841, "learning_rate": 4.964266339325257e-05, "loss": 0.6221, "mean_token_accuracy": 0.806465458869934, "num_tokens": 116761228.0, "step": 3485 }, { "epoch": 0.20833333333333334, "grad_norm": 0.6898741722106934, "learning_rate": 4.964068952557188e-05, "loss": 0.6408, "mean_token_accuracy": 0.8034593820571899, "num_tokens": 116928908.0, "step": 3490 }, { "epoch": 0.20863180515759314, "grad_norm": 0.6792036294937134, "learning_rate": 4.963871026511759e-05, "loss": 0.6599, "mean_token_accuracy": 0.7973160028457642, "num_tokens": 117096588.0, "step": 3495 }, { "epoch": 0.2089302769818529, "grad_norm": 0.6463459730148315, "learning_rate": 4.963672561237177e-05, "loss": 0.6135, "mean_token_accuracy": 0.8103841066360473, "num_tokens": 117264268.0, "step": 3500 }, { "epoch": 0.2092287488061127, "grad_norm": 0.6414973735809326, "learning_rate": 4.9634735567817843e-05, "loss": 0.6016, "mean_token_accuracy": 0.8125849962234497, "num_tokens": 117431948.0, "step": 3505 }, { "epoch": 0.2095272206303725, "grad_norm": 0.6723977327346802, "learning_rate": 4.9632740131940506e-05, "loss": 0.6232, "mean_token_accuracy": 0.8084635496139526, "num_tokens": 117599628.0, "step": 3510 }, { "epoch": 0.20982569245463228, "grad_norm": 0.7419140338897705, "learning_rate": 4.963073930522578e-05, "loss": 0.6242, "mean_token_accuracy": 0.8072468042373657, "num_tokens": 117767308.0, "step": 3515 }, { "epoch": 0.21012416427889208, "grad_norm": 0.6374903321266174, "learning_rate": 4.962873308816101e-05, "loss": 0.5918, "mean_token_accuracy": 0.8160920739173889, "num_tokens": 117934988.0, "step": 3520 }, { "epoch": 0.21042263610315187, "grad_norm": 0.6475487947463989, "learning_rate": 4.962672148123486e-05, "loss": 0.584, "mean_token_accuracy": 0.8184003353118896, "num_tokens": 118102668.0, "step": 3525 }, { "epoch": 0.21072110792741167, "grad_norm": 0.6644144654273987, "learning_rate": 4.962470448493726e-05, "loss": 0.5722, "mean_token_accuracy": 0.8227364778518677, "num_tokens": 118270348.0, "step": 3530 }, { "epoch": 0.21101957975167143, "grad_norm": 0.6092865467071533, "learning_rate": 4.962268209975952e-05, "loss": 0.5983, "mean_token_accuracy": 0.8152511119842529, "num_tokens": 118438028.0, "step": 3535 }, { "epoch": 0.21131805157593123, "grad_norm": 0.6891133785247803, "learning_rate": 4.9620654326194216e-05, "loss": 0.6472, "mean_token_accuracy": 0.8025468230247498, "num_tokens": 118605708.0, "step": 3540 }, { "epoch": 0.21161652340019102, "grad_norm": 0.7612316012382507, "learning_rate": 4.961862116473525e-05, "loss": 0.6036, "mean_token_accuracy": 0.8143266320228577, "num_tokens": 118773388.0, "step": 3545 }, { "epoch": 0.21191499522445081, "grad_norm": 0.6373063325881958, "learning_rate": 4.961658261587783e-05, "loss": 0.6244, "mean_token_accuracy": 0.8092806816101075, "num_tokens": 118941068.0, "step": 3550 }, { "epoch": 0.2122134670487106, "grad_norm": 0.7420687675476074, "learning_rate": 4.961453868011849e-05, "loss": 0.6076, "mean_token_accuracy": 0.8117857575416565, "num_tokens": 119108748.0, "step": 3555 }, { "epoch": 0.2125119388729704, "grad_norm": 0.6555312871932983, "learning_rate": 4.9612489357955076e-05, "loss": 0.5866, "mean_token_accuracy": 0.8182929873466491, "num_tokens": 119276428.0, "step": 3560 }, { "epoch": 0.21281041069723017, "grad_norm": 0.6671750545501709, "learning_rate": 4.961043464988673e-05, "loss": 0.6288, "mean_token_accuracy": 0.8062447786331177, "num_tokens": 119444108.0, "step": 3565 }, { "epoch": 0.21310888252148996, "grad_norm": 0.6754035353660583, "learning_rate": 4.96083745564139e-05, "loss": 0.5585, "mean_token_accuracy": 0.8251043796539307, "num_tokens": 119611788.0, "step": 3570 }, { "epoch": 0.21340735434574976, "grad_norm": 0.602658212184906, "learning_rate": 4.960630907803838e-05, "loss": 0.609, "mean_token_accuracy": 0.8133842349052429, "num_tokens": 119779468.0, "step": 3575 }, { "epoch": 0.21370582617000955, "grad_norm": 0.7140071392059326, "learning_rate": 4.960423821526325e-05, "loss": 0.6103, "mean_token_accuracy": 0.811177396774292, "num_tokens": 119947148.0, "step": 3580 }, { "epoch": 0.21400429799426934, "grad_norm": 0.6357149481773376, "learning_rate": 4.96021619685929e-05, "loss": 0.6191, "mean_token_accuracy": 0.8099486947059631, "num_tokens": 120114828.0, "step": 3585 }, { "epoch": 0.21430276981852914, "grad_norm": 0.6583658456802368, "learning_rate": 4.9600080338533046e-05, "loss": 0.5944, "mean_token_accuracy": 0.8169867515563964, "num_tokens": 120282508.0, "step": 3590 }, { "epoch": 0.21460124164278893, "grad_norm": 0.7762123346328735, "learning_rate": 4.959799332559071e-05, "loss": 0.6848, "mean_token_accuracy": 0.7904628276824951, "num_tokens": 120450188.0, "step": 3595 }, { "epoch": 0.2148997134670487, "grad_norm": 0.6695089340209961, "learning_rate": 4.9595900930274206e-05, "loss": 0.6312, "mean_token_accuracy": 0.8058749794960022, "num_tokens": 120617868.0, "step": 3600 }, { "epoch": 0.2151981852913085, "grad_norm": 0.645816445350647, "learning_rate": 4.95938031530932e-05, "loss": 0.6231, "mean_token_accuracy": 0.8087200284004211, "num_tokens": 120785548.0, "step": 3605 }, { "epoch": 0.21549665711556829, "grad_norm": 1.1988598108291626, "learning_rate": 4.959169999455862e-05, "loss": 0.6063, "mean_token_accuracy": 0.8146964073181152, "num_tokens": 120953228.0, "step": 3610 }, { "epoch": 0.21579512893982808, "grad_norm": 0.7384743094444275, "learning_rate": 4.958959145518275e-05, "loss": 0.618, "mean_token_accuracy": 0.8079864025115967, "num_tokens": 121120908.0, "step": 3615 }, { "epoch": 0.21609360076408787, "grad_norm": 0.7345996499061584, "learning_rate": 4.958747753547915e-05, "loss": 0.629, "mean_token_accuracy": 0.8074317097663879, "num_tokens": 121288588.0, "step": 3620 }, { "epoch": 0.21639207258834767, "grad_norm": 0.6977571249008179, "learning_rate": 4.9585358235962704e-05, "loss": 0.6473, "mean_token_accuracy": 0.800614333152771, "num_tokens": 121456268.0, "step": 3625 }, { "epoch": 0.21669054441260746, "grad_norm": 0.7001806497573853, "learning_rate": 4.9583233557149616e-05, "loss": 0.6342, "mean_token_accuracy": 0.8061672568321228, "num_tokens": 121623948.0, "step": 3630 }, { "epoch": 0.21698901623686723, "grad_norm": 0.6496277451515198, "learning_rate": 4.958110349955738e-05, "loss": 0.6127, "mean_token_accuracy": 0.8109745860099793, "num_tokens": 121791628.0, "step": 3635 }, { "epoch": 0.21728748806112702, "grad_norm": 0.7329492568969727, "learning_rate": 4.957896806370482e-05, "loss": 0.6363, "mean_token_accuracy": 0.8030239701271057, "num_tokens": 121959308.0, "step": 3640 }, { "epoch": 0.21758595988538681, "grad_norm": 0.6131376028060913, "learning_rate": 4.9576827250112055e-05, "loss": 0.5979, "mean_token_accuracy": 0.8146188855171204, "num_tokens": 122126988.0, "step": 3645 }, { "epoch": 0.2178844317096466, "grad_norm": 0.6058253049850464, "learning_rate": 4.9574681059300525e-05, "loss": 0.5701, "mean_token_accuracy": 0.822330904006958, "num_tokens": 122294668.0, "step": 3650 }, { "epoch": 0.2181829035339064, "grad_norm": 0.6051797270774841, "learning_rate": 4.9572529491792966e-05, "loss": 0.5564, "mean_token_accuracy": 0.8265835642814636, "num_tokens": 122462348.0, "step": 3655 }, { "epoch": 0.2184813753581662, "grad_norm": 0.6974071860313416, "learning_rate": 4.957037254811344e-05, "loss": 0.6101, "mean_token_accuracy": 0.8135273575782775, "num_tokens": 122630028.0, "step": 3660 }, { "epoch": 0.218779847182426, "grad_norm": 0.6367104053497314, "learning_rate": 4.956821022878731e-05, "loss": 0.6145, "mean_token_accuracy": 0.8118096113204956, "num_tokens": 122797708.0, "step": 3665 }, { "epoch": 0.21907831900668576, "grad_norm": 0.6408621668815613, "learning_rate": 4.956604253434124e-05, "loss": 0.5942, "mean_token_accuracy": 0.8145830750465393, "num_tokens": 122965388.0, "step": 3670 }, { "epoch": 0.21937679083094555, "grad_norm": 0.6505680680274963, "learning_rate": 4.956386946530323e-05, "loss": 0.5939, "mean_token_accuracy": 0.8148634195327759, "num_tokens": 123133068.0, "step": 3675 }, { "epoch": 0.21967526265520534, "grad_norm": 0.646426260471344, "learning_rate": 4.9561691022202555e-05, "loss": 0.6046, "mean_token_accuracy": 0.8117618918418884, "num_tokens": 123300748.0, "step": 3680 }, { "epoch": 0.21997373447946514, "grad_norm": 0.6544671654701233, "learning_rate": 4.955950720556981e-05, "loss": 0.6677, "mean_token_accuracy": 0.7957055926322937, "num_tokens": 123468428.0, "step": 3685 }, { "epoch": 0.22027220630372493, "grad_norm": 0.683693528175354, "learning_rate": 4.955731801593692e-05, "loss": 0.6198, "mean_token_accuracy": 0.8088333606719971, "num_tokens": 123636108.0, "step": 3690 }, { "epoch": 0.22057067812798473, "grad_norm": 0.6387953162193298, "learning_rate": 4.95551234538371e-05, "loss": 0.5699, "mean_token_accuracy": 0.8228319406509399, "num_tokens": 123803788.0, "step": 3695 }, { "epoch": 0.22086914995224452, "grad_norm": 0.6805548071861267, "learning_rate": 4.955292351980487e-05, "loss": 0.6391, "mean_token_accuracy": 0.8038768887519836, "num_tokens": 123971468.0, "step": 3700 }, { "epoch": 0.2211676217765043, "grad_norm": 0.5999314188957214, "learning_rate": 4.955071821437605e-05, "loss": 0.5867, "mean_token_accuracy": 0.8186746954917907, "num_tokens": 124139148.0, "step": 3705 }, { "epoch": 0.22146609360076408, "grad_norm": 0.7274695634841919, "learning_rate": 4.954850753808782e-05, "loss": 0.6324, "mean_token_accuracy": 0.8037397027015686, "num_tokens": 124306828.0, "step": 3710 }, { "epoch": 0.22176456542502387, "grad_norm": 0.6396534442901611, "learning_rate": 4.95462914914786e-05, "loss": 0.6739, "mean_token_accuracy": 0.7941429018974304, "num_tokens": 124474508.0, "step": 3715 }, { "epoch": 0.22206303724928367, "grad_norm": 0.6167986989021301, "learning_rate": 4.9544070075088166e-05, "loss": 0.6011, "mean_token_accuracy": 0.815537405014038, "num_tokens": 124642188.0, "step": 3720 }, { "epoch": 0.22236150907354346, "grad_norm": 0.5847646594047546, "learning_rate": 4.954184328945757e-05, "loss": 0.6379, "mean_token_accuracy": 0.802719795703888, "num_tokens": 124809868.0, "step": 3725 }, { "epoch": 0.22265998089780326, "grad_norm": 1.0940500497817993, "learning_rate": 4.9539611135129205e-05, "loss": 0.6194, "mean_token_accuracy": 0.8100858807563782, "num_tokens": 124977548.0, "step": 3730 }, { "epoch": 0.22295845272206305, "grad_norm": 0.7148606181144714, "learning_rate": 4.953737361264674e-05, "loss": 0.6337, "mean_token_accuracy": 0.8053322196006775, "num_tokens": 125145228.0, "step": 3735 }, { "epoch": 0.22325692454632282, "grad_norm": 0.7344508171081543, "learning_rate": 4.953513072255516e-05, "loss": 0.574, "mean_token_accuracy": 0.8218537449836731, "num_tokens": 125312908.0, "step": 3740 }, { "epoch": 0.2235553963705826, "grad_norm": 0.7283851504325867, "learning_rate": 4.9532882465400774e-05, "loss": 0.6111, "mean_token_accuracy": 0.8125074505805969, "num_tokens": 125480588.0, "step": 3745 }, { "epoch": 0.2238538681948424, "grad_norm": 0.7857192158699036, "learning_rate": 4.9530628841731185e-05, "loss": 0.6268, "mean_token_accuracy": 0.8052487254142762, "num_tokens": 125648268.0, "step": 3750 }, { "epoch": 0.2241523400191022, "grad_norm": 0.6330496668815613, "learning_rate": 4.952836985209531e-05, "loss": 0.6035, "mean_token_accuracy": 0.8128772377967834, "num_tokens": 125815948.0, "step": 3755 }, { "epoch": 0.224450811843362, "grad_norm": 0.6812946200370789, "learning_rate": 4.952610549704336e-05, "loss": 0.5655, "mean_token_accuracy": 0.823708713054657, "num_tokens": 125983628.0, "step": 3760 }, { "epoch": 0.22474928366762179, "grad_norm": 0.6766478419303894, "learning_rate": 4.9523835777126855e-05, "loss": 0.6546, "mean_token_accuracy": 0.7995705604553223, "num_tokens": 126151308.0, "step": 3765 }, { "epoch": 0.22504775549188158, "grad_norm": 0.8571595549583435, "learning_rate": 4.952156069289863e-05, "loss": 0.6042, "mean_token_accuracy": 0.8122211694717407, "num_tokens": 126318988.0, "step": 3770 }, { "epoch": 0.22534622731614135, "grad_norm": 0.605312705039978, "learning_rate": 4.9519280244912825e-05, "loss": 0.5674, "mean_token_accuracy": 0.8233567953109742, "num_tokens": 126486668.0, "step": 3775 }, { "epoch": 0.22564469914040114, "grad_norm": 0.6619325876235962, "learning_rate": 4.951699443372489e-05, "loss": 0.616, "mean_token_accuracy": 0.8100679874420166, "num_tokens": 126654348.0, "step": 3780 }, { "epoch": 0.22594317096466093, "grad_norm": 0.6574784517288208, "learning_rate": 4.951470325989158e-05, "loss": 0.5418, "mean_token_accuracy": 0.8305976390838623, "num_tokens": 126822028.0, "step": 3785 }, { "epoch": 0.22624164278892073, "grad_norm": 0.5957261919975281, "learning_rate": 4.9512406723970946e-05, "loss": 0.5998, "mean_token_accuracy": 0.8132172226905823, "num_tokens": 126989708.0, "step": 3790 }, { "epoch": 0.22654011461318052, "grad_norm": 0.8026285171508789, "learning_rate": 4.951010482652234e-05, "loss": 0.599, "mean_token_accuracy": 0.8172134160995483, "num_tokens": 127157388.0, "step": 3795 }, { "epoch": 0.22683858643744031, "grad_norm": 0.6566646099090576, "learning_rate": 4.9507797568106454e-05, "loss": 0.649, "mean_token_accuracy": 0.8028092503547668, "num_tokens": 127325068.0, "step": 3800 }, { "epoch": 0.2271370582617001, "grad_norm": 0.6030519604682922, "learning_rate": 4.950548494928525e-05, "loss": 0.5754, "mean_token_accuracy": 0.8199928522109985, "num_tokens": 127492748.0, "step": 3805 }, { "epoch": 0.22743553008595987, "grad_norm": 0.7191128730773926, "learning_rate": 4.950316697062201e-05, "loss": 0.6478, "mean_token_accuracy": 0.8025825977325439, "num_tokens": 127660428.0, "step": 3810 }, { "epoch": 0.22773400191021967, "grad_norm": 0.7019906044006348, "learning_rate": 4.950084363268133e-05, "loss": 0.6325, "mean_token_accuracy": 0.8053620457649231, "num_tokens": 127828108.0, "step": 3815 }, { "epoch": 0.22803247373447946, "grad_norm": 0.6270615458488464, "learning_rate": 4.949851493602909e-05, "loss": 0.6229, "mean_token_accuracy": 0.8078253626823425, "num_tokens": 127995788.0, "step": 3820 }, { "epoch": 0.22833094555873926, "grad_norm": 0.6706055402755737, "learning_rate": 4.9496180881232486e-05, "loss": 0.5751, "mean_token_accuracy": 0.8208576798439026, "num_tokens": 128163468.0, "step": 3825 }, { "epoch": 0.22862941738299905, "grad_norm": 0.6419352889060974, "learning_rate": 4.9493841468860034e-05, "loss": 0.6171, "mean_token_accuracy": 0.8114517450332641, "num_tokens": 128331148.0, "step": 3830 }, { "epoch": 0.22892788920725884, "grad_norm": 0.5844231843948364, "learning_rate": 4.949149669948152e-05, "loss": 0.6126, "mean_token_accuracy": 0.8111296653747558, "num_tokens": 128498828.0, "step": 3835 }, { "epoch": 0.22922636103151864, "grad_norm": 0.657459020614624, "learning_rate": 4.948914657366808e-05, "loss": 0.6175, "mean_token_accuracy": 0.8085530281066895, "num_tokens": 128666508.0, "step": 3840 }, { "epoch": 0.2295248328557784, "grad_norm": 0.7376065850257874, "learning_rate": 4.948679109199212e-05, "loss": 0.64, "mean_token_accuracy": 0.8036323666572571, "num_tokens": 128834188.0, "step": 3845 }, { "epoch": 0.2298233046800382, "grad_norm": 0.7029840350151062, "learning_rate": 4.948443025502735e-05, "loss": 0.5621, "mean_token_accuracy": 0.8264105796813965, "num_tokens": 129001868.0, "step": 3850 }, { "epoch": 0.230121776504298, "grad_norm": 0.6539315581321716, "learning_rate": 4.948206406334881e-05, "loss": 0.6402, "mean_token_accuracy": 0.8021770238876342, "num_tokens": 129169548.0, "step": 3855 }, { "epoch": 0.2304202483285578, "grad_norm": 0.6318440437316895, "learning_rate": 4.9479692517532826e-05, "loss": 0.6407, "mean_token_accuracy": 0.800864839553833, "num_tokens": 129337228.0, "step": 3860 }, { "epoch": 0.23071872015281758, "grad_norm": 0.6926705837249756, "learning_rate": 4.9477315618157024e-05, "loss": 0.6569, "mean_token_accuracy": 0.7985446691513062, "num_tokens": 129504908.0, "step": 3865 }, { "epoch": 0.23101719197707737, "grad_norm": 0.6548948884010315, "learning_rate": 4.947493336580036e-05, "loss": 0.5607, "mean_token_accuracy": 0.8236311554908753, "num_tokens": 129672588.0, "step": 3870 }, { "epoch": 0.23131566380133714, "grad_norm": 0.7093825340270996, "learning_rate": 4.947254576104306e-05, "loss": 0.6198, "mean_token_accuracy": 0.8086961507797241, "num_tokens": 129840268.0, "step": 3875 }, { "epoch": 0.23161413562559693, "grad_norm": 0.7560344338417053, "learning_rate": 4.947015280446667e-05, "loss": 0.6544, "mean_token_accuracy": 0.7972981214523316, "num_tokens": 130007948.0, "step": 3880 }, { "epoch": 0.23191260744985673, "grad_norm": 0.6496684551239014, "learning_rate": 4.946775449665404e-05, "loss": 0.6367, "mean_token_accuracy": 0.8032744884490967, "num_tokens": 130175628.0, "step": 3885 }, { "epoch": 0.23221107927411652, "grad_norm": 0.6434205770492554, "learning_rate": 4.9465350838189326e-05, "loss": 0.6565, "mean_token_accuracy": 0.7995884537696838, "num_tokens": 130343308.0, "step": 3890 }, { "epoch": 0.23250955109837632, "grad_norm": 0.6018290519714355, "learning_rate": 4.9462941829657987e-05, "loss": 0.6235, "mean_token_accuracy": 0.8085709333419799, "num_tokens": 130510988.0, "step": 3895 }, { "epoch": 0.2328080229226361, "grad_norm": 0.573374330997467, "learning_rate": 4.9460527471646775e-05, "loss": 0.6335, "mean_token_accuracy": 0.8070976972579956, "num_tokens": 130678668.0, "step": 3900 }, { "epoch": 0.2331064947468959, "grad_norm": 0.6306058764457703, "learning_rate": 4.945810776474375e-05, "loss": 0.5795, "mean_token_accuracy": 0.8211857318878174, "num_tokens": 130846348.0, "step": 3905 }, { "epoch": 0.23340496657115567, "grad_norm": 0.652961790561676, "learning_rate": 4.945568270953827e-05, "loss": 0.6291, "mean_token_accuracy": 0.8067636847496032, "num_tokens": 131014028.0, "step": 3910 }, { "epoch": 0.23370343839541546, "grad_norm": 0.7349520325660706, "learning_rate": 4.9453252306621026e-05, "loss": 0.6554, "mean_token_accuracy": 0.7977335095405579, "num_tokens": 131181708.0, "step": 3915 }, { "epoch": 0.23400191021967526, "grad_norm": 0.6300112009048462, "learning_rate": 4.9450816556583955e-05, "loss": 0.5391, "mean_token_accuracy": 0.831003212928772, "num_tokens": 131349388.0, "step": 3920 }, { "epoch": 0.23430038204393505, "grad_norm": 0.6644827127456665, "learning_rate": 4.944837546002036e-05, "loss": 0.6238, "mean_token_accuracy": 0.8076285362243653, "num_tokens": 131517068.0, "step": 3925 }, { "epoch": 0.23459885386819485, "grad_norm": 0.5999575257301331, "learning_rate": 4.944592901752479e-05, "loss": 0.5803, "mean_token_accuracy": 0.8205177187919617, "num_tokens": 131684748.0, "step": 3930 }, { "epoch": 0.23489732569245464, "grad_norm": 0.5984506607055664, "learning_rate": 4.944347722969314e-05, "loss": 0.5472, "mean_token_accuracy": 0.829023027420044, "num_tokens": 131852428.0, "step": 3935 }, { "epoch": 0.23519579751671443, "grad_norm": 0.6539884805679321, "learning_rate": 4.9441020097122584e-05, "loss": 0.6394, "mean_token_accuracy": 0.8013181447982788, "num_tokens": 132020108.0, "step": 3940 }, { "epoch": 0.2354942693409742, "grad_norm": 0.6690188050270081, "learning_rate": 4.943855762041159e-05, "loss": 0.5877, "mean_token_accuracy": 0.8185554146766663, "num_tokens": 132187788.0, "step": 3945 }, { "epoch": 0.235792741165234, "grad_norm": 0.7017703652381897, "learning_rate": 4.9436089800159956e-05, "loss": 0.6142, "mean_token_accuracy": 0.8101097464561462, "num_tokens": 132355468.0, "step": 3950 }, { "epoch": 0.2360912129894938, "grad_norm": 0.6887523531913757, "learning_rate": 4.943361663696875e-05, "loss": 0.6289, "mean_token_accuracy": 0.8065489649772644, "num_tokens": 132523148.0, "step": 3955 }, { "epoch": 0.23638968481375358, "grad_norm": 0.6603471636772156, "learning_rate": 4.9431138131440376e-05, "loss": 0.6517, "mean_token_accuracy": 0.8001968145370484, "num_tokens": 132690828.0, "step": 3960 }, { "epoch": 0.23668815663801338, "grad_norm": 0.602262020111084, "learning_rate": 4.94286542841785e-05, "loss": 0.6092, "mean_token_accuracy": 0.812429916858673, "num_tokens": 132858508.0, "step": 3965 }, { "epoch": 0.23698662846227317, "grad_norm": 0.5906766057014465, "learning_rate": 4.9426165095788105e-05, "loss": 0.5855, "mean_token_accuracy": 0.8167839646339417, "num_tokens": 133026188.0, "step": 3970 }, { "epoch": 0.23728510028653296, "grad_norm": 0.7193766832351685, "learning_rate": 4.942367056687551e-05, "loss": 0.6409, "mean_token_accuracy": 0.8023977041244507, "num_tokens": 133193868.0, "step": 3975 }, { "epoch": 0.23758357211079273, "grad_norm": 0.6253636479377747, "learning_rate": 4.942117069804827e-05, "loss": 0.6299, "mean_token_accuracy": 0.8059704303741455, "num_tokens": 133361548.0, "step": 3980 }, { "epoch": 0.23788204393505252, "grad_norm": 0.6552958488464355, "learning_rate": 4.941866548991529e-05, "loss": 0.6087, "mean_token_accuracy": 0.8109447598457337, "num_tokens": 133529228.0, "step": 3985 }, { "epoch": 0.23818051575931232, "grad_norm": 0.6816372871398926, "learning_rate": 4.9416154943086765e-05, "loss": 0.6263, "mean_token_accuracy": 0.8083025217056274, "num_tokens": 133696908.0, "step": 3990 }, { "epoch": 0.2384789875835721, "grad_norm": 0.613618791103363, "learning_rate": 4.941363905817417e-05, "loss": 0.6176, "mean_token_accuracy": 0.811302638053894, "num_tokens": 133864588.0, "step": 3995 }, { "epoch": 0.2387774594078319, "grad_norm": 0.6847855448722839, "learning_rate": 4.9411117835790304e-05, "loss": 0.5766, "mean_token_accuracy": 0.8193307876586914, "num_tokens": 134032268.0, "step": 4000 }, { "epoch": 0.2390759312320917, "grad_norm": 0.682997465133667, "learning_rate": 4.9408591276549266e-05, "loss": 0.6139, "mean_token_accuracy": 0.8106763601303101, "num_tokens": 134199948.0, "step": 4005 }, { "epoch": 0.2393744030563515, "grad_norm": 0.7522417306900024, "learning_rate": 4.940605938106643e-05, "loss": 0.5928, "mean_token_accuracy": 0.8140880346298218, "num_tokens": 134367628.0, "step": 4010 }, { "epoch": 0.23967287488061126, "grad_norm": 0.7058261632919312, "learning_rate": 4.9403522149958485e-05, "loss": 0.6209, "mean_token_accuracy": 0.8093761205673218, "num_tokens": 134535308.0, "step": 4015 }, { "epoch": 0.23997134670487105, "grad_norm": 0.5665675401687622, "learning_rate": 4.940097958384343e-05, "loss": 0.5954, "mean_token_accuracy": 0.8134498476982117, "num_tokens": 134702988.0, "step": 4020 }, { "epoch": 0.24026981852913085, "grad_norm": 0.6552174091339111, "learning_rate": 4.939843168334056e-05, "loss": 0.6219, "mean_token_accuracy": 0.8063819646835327, "num_tokens": 134870668.0, "step": 4025 }, { "epoch": 0.24056829035339064, "grad_norm": 0.7946946620941162, "learning_rate": 4.939587844907045e-05, "loss": 0.592, "mean_token_accuracy": 0.8169211506843567, "num_tokens": 135038348.0, "step": 4030 }, { "epoch": 0.24086676217765043, "grad_norm": 0.6194802522659302, "learning_rate": 4.9393319881655e-05, "loss": 0.6307, "mean_token_accuracy": 0.8075032711029053, "num_tokens": 135206028.0, "step": 4035 }, { "epoch": 0.24116523400191023, "grad_norm": 0.6646754741668701, "learning_rate": 4.939075598171737e-05, "loss": 0.6366, "mean_token_accuracy": 0.8035011291503906, "num_tokens": 135373708.0, "step": 4040 }, { "epoch": 0.24146370582617002, "grad_norm": 0.6231975555419922, "learning_rate": 4.9388186749882074e-05, "loss": 0.6283, "mean_token_accuracy": 0.8051354050636291, "num_tokens": 135541388.0, "step": 4045 }, { "epoch": 0.2417621776504298, "grad_norm": 0.6142847537994385, "learning_rate": 4.9385612186774895e-05, "loss": 0.5823, "mean_token_accuracy": 0.8176905632019043, "num_tokens": 135709068.0, "step": 4050 }, { "epoch": 0.24206064947468958, "grad_norm": 0.6034107804298401, "learning_rate": 4.938303229302289e-05, "loss": 0.5621, "mean_token_accuracy": 0.8236550092697144, "num_tokens": 135876748.0, "step": 4055 }, { "epoch": 0.24235912129894938, "grad_norm": 0.6483427882194519, "learning_rate": 4.9380447069254465e-05, "loss": 0.5761, "mean_token_accuracy": 0.818370521068573, "num_tokens": 136044428.0, "step": 4060 }, { "epoch": 0.24265759312320917, "grad_norm": 0.6459857821464539, "learning_rate": 4.937785651609929e-05, "loss": 0.6138, "mean_token_accuracy": 0.8086663603782653, "num_tokens": 136212108.0, "step": 4065 }, { "epoch": 0.24295606494746896, "grad_norm": 0.7154399156570435, "learning_rate": 4.937526063418834e-05, "loss": 0.5977, "mean_token_accuracy": 0.813998568058014, "num_tokens": 136379788.0, "step": 4070 }, { "epoch": 0.24325453677172876, "grad_norm": 0.7276933789253235, "learning_rate": 4.937265942415388e-05, "loss": 0.6303, "mean_token_accuracy": 0.8070678591728211, "num_tokens": 136547468.0, "step": 4075 }, { "epoch": 0.24355300859598855, "grad_norm": 0.6441953182220459, "learning_rate": 4.937005288662951e-05, "loss": 0.6076, "mean_token_accuracy": 0.8098473072052002, "num_tokens": 136715148.0, "step": 4080 }, { "epoch": 0.24385148042024832, "grad_norm": 0.6848450303077698, "learning_rate": 4.9367441022250075e-05, "loss": 0.6334, "mean_token_accuracy": 0.8028569698333741, "num_tokens": 136882828.0, "step": 4085 }, { "epoch": 0.2441499522445081, "grad_norm": 0.69948810338974, "learning_rate": 4.936482383165175e-05, "loss": 0.6412, "mean_token_accuracy": 0.8035249829292297, "num_tokens": 137050508.0, "step": 4090 }, { "epoch": 0.2444484240687679, "grad_norm": 0.6532810926437378, "learning_rate": 4.936220131547201e-05, "loss": 0.6337, "mean_token_accuracy": 0.8033341407775879, "num_tokens": 137218188.0, "step": 4095 }, { "epoch": 0.2447468958930277, "grad_norm": 0.6094919443130493, "learning_rate": 4.9359573474349605e-05, "loss": 0.5844, "mean_token_accuracy": 0.8183228015899658, "num_tokens": 137385868.0, "step": 4100 }, { "epoch": 0.2450453677172875, "grad_norm": 0.6443773508071899, "learning_rate": 4.93569403089246e-05, "loss": 0.6021, "mean_token_accuracy": 0.8111415982246399, "num_tokens": 137553548.0, "step": 4105 }, { "epoch": 0.2453438395415473, "grad_norm": 0.6929174065589905, "learning_rate": 4.9354301819838364e-05, "loss": 0.605, "mean_token_accuracy": 0.812895143032074, "num_tokens": 137721228.0, "step": 4110 }, { "epoch": 0.24564231136580708, "grad_norm": 0.6365860104560852, "learning_rate": 4.935165800773352e-05, "loss": 0.6466, "mean_token_accuracy": 0.7997017621994018, "num_tokens": 137888908.0, "step": 4115 }, { "epoch": 0.24594078319006685, "grad_norm": 0.636452317237854, "learning_rate": 4.934900887325404e-05, "loss": 0.6349, "mean_token_accuracy": 0.8062028527259827, "num_tokens": 138049187.0, "step": 4120 }, { "epoch": 0.24623925501432664, "grad_norm": 0.6648581624031067, "learning_rate": 4.934635441704515e-05, "loss": 0.5835, "mean_token_accuracy": 0.8195753335952759, "num_tokens": 138216867.0, "step": 4125 }, { "epoch": 0.24653772683858644, "grad_norm": 0.6769698858261108, "learning_rate": 4.934369463975341e-05, "loss": 0.5938, "mean_token_accuracy": 0.8174996972084045, "num_tokens": 138384547.0, "step": 4130 }, { "epoch": 0.24683619866284623, "grad_norm": 0.5818044543266296, "learning_rate": 4.9341029542026656e-05, "loss": 0.6322, "mean_token_accuracy": 0.8060300588607788, "num_tokens": 138552227.0, "step": 4135 }, { "epoch": 0.24713467048710602, "grad_norm": 1.1713244915008545, "learning_rate": 4.9338359124514016e-05, "loss": 0.685, "mean_token_accuracy": 0.7939281821250915, "num_tokens": 138719907.0, "step": 4140 }, { "epoch": 0.24743314231136582, "grad_norm": 0.8352034091949463, "learning_rate": 4.9335683387865924e-05, "loss": 0.6586, "mean_token_accuracy": 0.7974352955818176, "num_tokens": 138887587.0, "step": 4145 }, { "epoch": 0.2477316141356256, "grad_norm": 0.6547311544418335, "learning_rate": 4.93330023327341e-05, "loss": 0.5677, "mean_token_accuracy": 0.8241619825363159, "num_tokens": 139055267.0, "step": 4150 }, { "epoch": 0.24803008595988538, "grad_norm": 0.6543726921081543, "learning_rate": 4.933031595977157e-05, "loss": 0.5974, "mean_token_accuracy": 0.8156030058860779, "num_tokens": 139222947.0, "step": 4155 }, { "epoch": 0.24832855778414517, "grad_norm": 0.6652186512947083, "learning_rate": 4.932762426963264e-05, "loss": 0.6245, "mean_token_accuracy": 0.8084337472915649, "num_tokens": 139390627.0, "step": 4160 }, { "epoch": 0.24862702960840496, "grad_norm": 0.7863958477973938, "learning_rate": 4.932492726297293e-05, "loss": 0.64, "mean_token_accuracy": 0.8058869123458863, "num_tokens": 139558307.0, "step": 4165 }, { "epoch": 0.24892550143266476, "grad_norm": 0.6341575384140015, "learning_rate": 4.9322224940449344e-05, "loss": 0.6417, "mean_token_accuracy": 0.8025945425033569, "num_tokens": 139725987.0, "step": 4170 }, { "epoch": 0.24922397325692455, "grad_norm": 0.6730334758758545, "learning_rate": 4.931951730272008e-05, "loss": 0.6251, "mean_token_accuracy": 0.8078849911689758, "num_tokens": 139893667.0, "step": 4175 }, { "epoch": 0.24952244508118435, "grad_norm": 0.6012901663780212, "learning_rate": 4.931680435044464e-05, "loss": 0.5785, "mean_token_accuracy": 0.8199510931968689, "num_tokens": 140061347.0, "step": 4180 }, { "epoch": 0.2498209169054441, "grad_norm": 0.6753756403923035, "learning_rate": 4.931408608428381e-05, "loss": 0.6032, "mean_token_accuracy": 0.8137811779975891, "num_tokens": 140224430.0, "step": 4185 }, { "epoch": 0.2501193887297039, "grad_norm": 0.8917942643165588, "learning_rate": 4.931136250489966e-05, "loss": 0.581, "mean_token_accuracy": 0.8197602391242981, "num_tokens": 140392110.0, "step": 4190 }, { "epoch": 0.2504178605539637, "grad_norm": 0.656954288482666, "learning_rate": 4.930863361295559e-05, "loss": 0.629, "mean_token_accuracy": 0.8068591117858886, "num_tokens": 140559790.0, "step": 4195 }, { "epoch": 0.2507163323782235, "grad_norm": 0.7210436463356018, "learning_rate": 4.930589940911626e-05, "loss": 0.5872, "mean_token_accuracy": 0.8177144169807434, "num_tokens": 140727470.0, "step": 4200 }, { "epoch": 0.2510148042024833, "grad_norm": 0.6364195346832275, "learning_rate": 4.930315989404763e-05, "loss": 0.5738, "mean_token_accuracy": 0.8197661995887756, "num_tokens": 140895150.0, "step": 4205 }, { "epoch": 0.2513132760267431, "grad_norm": 0.6811306476593018, "learning_rate": 4.930041506841698e-05, "loss": 0.586, "mean_token_accuracy": 0.8177382707595825, "num_tokens": 141062830.0, "step": 4210 }, { "epoch": 0.2516117478510029, "grad_norm": 0.6453559398651123, "learning_rate": 4.9297664932892844e-05, "loss": 0.6279, "mean_token_accuracy": 0.805588698387146, "num_tokens": 141230510.0, "step": 4215 }, { "epoch": 0.25191021967526267, "grad_norm": 0.6769728064537048, "learning_rate": 4.929490948814508e-05, "loss": 0.5905, "mean_token_accuracy": 0.8151079773902893, "num_tokens": 141398190.0, "step": 4220 }, { "epoch": 0.25220869149952246, "grad_norm": 0.6134464740753174, "learning_rate": 4.9292148734844816e-05, "loss": 0.5882, "mean_token_accuracy": 0.8175219058990478, "num_tokens": 141559577.0, "step": 4225 }, { "epoch": 0.25250716332378226, "grad_norm": 0.6419605612754822, "learning_rate": 4.92893826736645e-05, "loss": 0.6042, "mean_token_accuracy": 0.8128474235534668, "num_tokens": 141727257.0, "step": 4230 }, { "epoch": 0.25280563514804205, "grad_norm": 0.6950979828834534, "learning_rate": 4.9286611305277844e-05, "loss": 0.6199, "mean_token_accuracy": 0.8077239632606507, "num_tokens": 141894937.0, "step": 4235 }, { "epoch": 0.2531041069723018, "grad_norm": 0.6773786544799805, "learning_rate": 4.9283834630359856e-05, "loss": 0.5812, "mean_token_accuracy": 0.8211857318878174, "num_tokens": 142062617.0, "step": 4240 }, { "epoch": 0.2534025787965616, "grad_norm": 0.7095805406570435, "learning_rate": 4.928105264958687e-05, "loss": 0.667, "mean_token_accuracy": 0.7949660062789917, "num_tokens": 142230297.0, "step": 4245 }, { "epoch": 0.2537010506208214, "grad_norm": 0.7544382214546204, "learning_rate": 4.9278265363636485e-05, "loss": 0.6297, "mean_token_accuracy": 0.8055350065231324, "num_tokens": 142397977.0, "step": 4250 }, { "epoch": 0.25399952244508117, "grad_norm": 0.6471272706985474, "learning_rate": 4.927547277318759e-05, "loss": 0.581, "mean_token_accuracy": 0.8175414562225342, "num_tokens": 142565657.0, "step": 4255 }, { "epoch": 0.25429799426934097, "grad_norm": 0.7374337911605835, "learning_rate": 4.927267487892036e-05, "loss": 0.6279, "mean_token_accuracy": 0.8062562942504883, "num_tokens": 142727426.0, "step": 4260 }, { "epoch": 0.25459646609360076, "grad_norm": 0.7466955780982971, "learning_rate": 4.92698716815163e-05, "loss": 0.6534, "mean_token_accuracy": 0.7972504019737243, "num_tokens": 142895106.0, "step": 4265 }, { "epoch": 0.25489493791786055, "grad_norm": 0.6746529936790466, "learning_rate": 4.926706318165815e-05, "loss": 0.636, "mean_token_accuracy": 0.8036800622940063, "num_tokens": 143062786.0, "step": 4270 }, { "epoch": 0.25519340974212035, "grad_norm": 1.364909052848816, "learning_rate": 4.926424938003e-05, "loss": 0.6147, "mean_token_accuracy": 0.8098294258117675, "num_tokens": 143230466.0, "step": 4275 }, { "epoch": 0.25549188156638014, "grad_norm": 0.6397042274475098, "learning_rate": 4.9261430277317196e-05, "loss": 0.6297, "mean_token_accuracy": 0.8068173646926879, "num_tokens": 143398146.0, "step": 4280 }, { "epoch": 0.25579035339063994, "grad_norm": 0.5720118880271912, "learning_rate": 4.9258605874206364e-05, "loss": 0.6016, "mean_token_accuracy": 0.8148157000541687, "num_tokens": 143565826.0, "step": 4285 }, { "epoch": 0.25608882521489973, "grad_norm": 0.6098594665527344, "learning_rate": 4.925577617138546e-05, "loss": 0.5976, "mean_token_accuracy": 0.8132112741470336, "num_tokens": 143733506.0, "step": 4290 }, { "epoch": 0.2563872970391595, "grad_norm": 0.5886116623878479, "learning_rate": 4.925294116954371e-05, "loss": 0.5775, "mean_token_accuracy": 0.820589280128479, "num_tokens": 143901186.0, "step": 4295 }, { "epoch": 0.2566857688634193, "grad_norm": 0.6583279371261597, "learning_rate": 4.925010086937161e-05, "loss": 0.6053, "mean_token_accuracy": 0.8142192482948303, "num_tokens": 144068866.0, "step": 4300 }, { "epoch": 0.2569842406876791, "grad_norm": 0.6619601845741272, "learning_rate": 4.9247255271560994e-05, "loss": 0.5762, "mean_token_accuracy": 0.8225515842437744, "num_tokens": 144236546.0, "step": 4305 }, { "epoch": 0.25728271251193885, "grad_norm": 0.8144528865814209, "learning_rate": 4.924440437680495e-05, "loss": 0.6199, "mean_token_accuracy": 0.8079506039619446, "num_tokens": 144404226.0, "step": 4310 }, { "epoch": 0.25758118433619864, "grad_norm": 0.6154191493988037, "learning_rate": 4.924154818579786e-05, "loss": 0.5922, "mean_token_accuracy": 0.814887273311615, "num_tokens": 144571906.0, "step": 4315 }, { "epoch": 0.25787965616045844, "grad_norm": 0.6959332823753357, "learning_rate": 4.92386866992354e-05, "loss": 0.6252, "mean_token_accuracy": 0.80547536611557, "num_tokens": 144739586.0, "step": 4320 }, { "epoch": 0.25817812798471823, "grad_norm": 0.642940104007721, "learning_rate": 4.923581991781456e-05, "loss": 0.5916, "mean_token_accuracy": 0.8142550230026245, "num_tokens": 144907266.0, "step": 4325 }, { "epoch": 0.258476599808978, "grad_norm": 0.6554369926452637, "learning_rate": 4.9232947842233567e-05, "loss": 0.5374, "mean_token_accuracy": 0.8312060236930847, "num_tokens": 145074946.0, "step": 4330 }, { "epoch": 0.2587750716332378, "grad_norm": 0.6094810962677002, "learning_rate": 4.9230070473191994e-05, "loss": 0.573, "mean_token_accuracy": 0.820589292049408, "num_tokens": 145242626.0, "step": 4335 }, { "epoch": 0.2590735434574976, "grad_norm": 0.634344220161438, "learning_rate": 4.9227187811390654e-05, "loss": 0.5919, "mean_token_accuracy": 0.8159250855445862, "num_tokens": 145410306.0, "step": 4340 }, { "epoch": 0.2593720152817574, "grad_norm": 0.6004631519317627, "learning_rate": 4.922429985753169e-05, "loss": 0.5849, "mean_token_accuracy": 0.8194620132446289, "num_tokens": 145577986.0, "step": 4345 }, { "epoch": 0.2596704871060172, "grad_norm": 0.6163318157196045, "learning_rate": 4.922140661231852e-05, "loss": 0.5744, "mean_token_accuracy": 0.819724440574646, "num_tokens": 145745666.0, "step": 4350 }, { "epoch": 0.259968958930277, "grad_norm": 0.6415529251098633, "learning_rate": 4.921850807645582e-05, "loss": 0.5937, "mean_token_accuracy": 0.8149588346481323, "num_tokens": 145913346.0, "step": 4355 }, { "epoch": 0.2602674307545368, "grad_norm": 0.7055518627166748, "learning_rate": 4.9215604250649613e-05, "loss": 0.6781, "mean_token_accuracy": 0.7928009033203125, "num_tokens": 146081026.0, "step": 4360 }, { "epoch": 0.2605659025787966, "grad_norm": 0.6350159049034119, "learning_rate": 4.9212695135607166e-05, "loss": 0.5556, "mean_token_accuracy": 0.8243946194648742, "num_tokens": 146248706.0, "step": 4365 }, { "epoch": 0.2608643744030564, "grad_norm": 0.5864571332931519, "learning_rate": 4.920978073203705e-05, "loss": 0.5874, "mean_token_accuracy": 0.8168495893478394, "num_tokens": 146416386.0, "step": 4370 }, { "epoch": 0.2611628462273161, "grad_norm": 0.6074321866035461, "learning_rate": 4.9206861040649116e-05, "loss": 0.6377, "mean_token_accuracy": 0.8045389533042908, "num_tokens": 146584066.0, "step": 4375 }, { "epoch": 0.2614613180515759, "grad_norm": 0.6155579090118408, "learning_rate": 4.920393606215451e-05, "loss": 0.5663, "mean_token_accuracy": 0.8229452490806579, "num_tokens": 146751746.0, "step": 4380 }, { "epoch": 0.2617597898758357, "grad_norm": 0.7060921788215637, "learning_rate": 4.920100579726567e-05, "loss": 0.6268, "mean_token_accuracy": 0.8044196724891662, "num_tokens": 146919426.0, "step": 4385 }, { "epoch": 0.2620582617000955, "grad_norm": 0.743566632270813, "learning_rate": 4.919807024669632e-05, "loss": 0.5814, "mean_token_accuracy": 0.8192711353302002, "num_tokens": 147087106.0, "step": 4390 }, { "epoch": 0.2623567335243553, "grad_norm": 0.6009845733642578, "learning_rate": 4.919512941116145e-05, "loss": 0.5866, "mean_token_accuracy": 0.8185792565345764, "num_tokens": 147254786.0, "step": 4395 }, { "epoch": 0.2626552053486151, "grad_norm": 0.5867632031440735, "learning_rate": 4.919218329137737e-05, "loss": 0.6278, "mean_token_accuracy": 0.8053262591361999, "num_tokens": 147422466.0, "step": 4400 }, { "epoch": 0.2629536771728749, "grad_norm": 0.7206937670707703, "learning_rate": 4.918923188806166e-05, "loss": 0.605, "mean_token_accuracy": 0.8124836087226868, "num_tokens": 147590146.0, "step": 4405 }, { "epoch": 0.26325214899713467, "grad_norm": 0.5884151458740234, "learning_rate": 4.918627520193319e-05, "loss": 0.6189, "mean_token_accuracy": 0.8088274002075195, "num_tokens": 147757826.0, "step": 4410 }, { "epoch": 0.26355062082139447, "grad_norm": 0.7009053826332092, "learning_rate": 4.91833132337121e-05, "loss": 0.6253, "mean_token_accuracy": 0.8057199120521545, "num_tokens": 147925506.0, "step": 4415 }, { "epoch": 0.26384909264565426, "grad_norm": 0.5944790840148926, "learning_rate": 4.918034598411985e-05, "loss": 0.5852, "mean_token_accuracy": 0.8170404195785522, "num_tokens": 148093186.0, "step": 4420 }, { "epoch": 0.26414756446991405, "grad_norm": 0.659602165222168, "learning_rate": 4.917737345387916e-05, "loss": 0.577, "mean_token_accuracy": 0.8198437333106995, "num_tokens": 148260866.0, "step": 4425 }, { "epoch": 0.26444603629417385, "grad_norm": 0.6222054362297058, "learning_rate": 4.9174395643714046e-05, "loss": 0.5624, "mean_token_accuracy": 0.8251282334327698, "num_tokens": 148428546.0, "step": 4430 }, { "epoch": 0.26474450811843364, "grad_norm": 0.6671966910362244, "learning_rate": 4.917141255434982e-05, "loss": 0.6131, "mean_token_accuracy": 0.8097936272621155, "num_tokens": 148596226.0, "step": 4435 }, { "epoch": 0.26504297994269344, "grad_norm": 0.6124293804168701, "learning_rate": 4.916842418651305e-05, "loss": 0.5991, "mean_token_accuracy": 0.813897168636322, "num_tokens": 148763906.0, "step": 4440 }, { "epoch": 0.2653414517669532, "grad_norm": 0.7368993759155273, "learning_rate": 4.916543054093161e-05, "loss": 0.5929, "mean_token_accuracy": 0.8160443663597107, "num_tokens": 148931586.0, "step": 4445 }, { "epoch": 0.26563992359121297, "grad_norm": 0.6545404195785522, "learning_rate": 4.916243161833468e-05, "loss": 0.5994, "mean_token_accuracy": 0.813396155834198, "num_tokens": 149099266.0, "step": 4450 }, { "epoch": 0.26593839541547276, "grad_norm": 0.722064733505249, "learning_rate": 4.9159427419452683e-05, "loss": 0.6337, "mean_token_accuracy": 0.8045628190040588, "num_tokens": 149266946.0, "step": 4455 }, { "epoch": 0.26623686723973256, "grad_norm": 0.6071226596832275, "learning_rate": 4.9156417945017335e-05, "loss": 0.6157, "mean_token_accuracy": 0.8071223258972168, "num_tokens": 149432368.0, "step": 4460 }, { "epoch": 0.26653533906399235, "grad_norm": 0.7124296426773071, "learning_rate": 4.915340319576168e-05, "loss": 0.6045, "mean_token_accuracy": 0.8115591168403625, "num_tokens": 149600048.0, "step": 4465 }, { "epoch": 0.26683381088825214, "grad_norm": 0.6773062348365784, "learning_rate": 4.915038317242e-05, "loss": 0.6336, "mean_token_accuracy": 0.8048848986625672, "num_tokens": 149767728.0, "step": 4470 }, { "epoch": 0.26713228271251194, "grad_norm": 0.5971149206161499, "learning_rate": 4.914735787572787e-05, "loss": 0.611, "mean_token_accuracy": 0.8102946400642395, "num_tokens": 149935408.0, "step": 4475 }, { "epoch": 0.26743075453677173, "grad_norm": 0.6414796710014343, "learning_rate": 4.914432730642217e-05, "loss": 0.606, "mean_token_accuracy": 0.814410126209259, "num_tokens": 150103088.0, "step": 4480 }, { "epoch": 0.2677292263610315, "grad_norm": 0.6498230695724487, "learning_rate": 4.914129146524104e-05, "loss": 0.6119, "mean_token_accuracy": 0.8103244781494141, "num_tokens": 150270768.0, "step": 4485 }, { "epoch": 0.2680276981852913, "grad_norm": 0.7054505944252014, "learning_rate": 4.913825035292392e-05, "loss": 0.6122, "mean_token_accuracy": 0.8097757458686828, "num_tokens": 150438448.0, "step": 4490 }, { "epoch": 0.2683261700095511, "grad_norm": 0.6897138357162476, "learning_rate": 4.913520397021154e-05, "loss": 0.6458, "mean_token_accuracy": 0.7985923767089844, "num_tokens": 150606128.0, "step": 4495 }, { "epoch": 0.2686246418338109, "grad_norm": 0.6120455861091614, "learning_rate": 4.913215231784589e-05, "loss": 0.6003, "mean_token_accuracy": 0.8152928471565246, "num_tokens": 150773808.0, "step": 4500 }, { "epoch": 0.2689231136580707, "grad_norm": 0.7380448579788208, "learning_rate": 4.912909539657026e-05, "loss": 0.65, "mean_token_accuracy": 0.7993319749832153, "num_tokens": 150941488.0, "step": 4505 }, { "epoch": 0.2692215854823305, "grad_norm": 0.6018991470336914, "learning_rate": 4.9126033207129216e-05, "loss": 0.5659, "mean_token_accuracy": 0.8222354888916016, "num_tokens": 151109168.0, "step": 4510 }, { "epoch": 0.26952005730659023, "grad_norm": 0.6238842010498047, "learning_rate": 4.912296575026861e-05, "loss": 0.572, "mean_token_accuracy": 0.8205714106559754, "num_tokens": 151276848.0, "step": 4515 }, { "epoch": 0.26981852913085, "grad_norm": 0.6419805288314819, "learning_rate": 4.911989302673558e-05, "loss": 0.6238, "mean_token_accuracy": 0.8058690190315246, "num_tokens": 151444528.0, "step": 4520 }, { "epoch": 0.2701170009551098, "grad_norm": 0.6212139129638672, "learning_rate": 4.911681503727856e-05, "loss": 0.6277, "mean_token_accuracy": 0.8063998579978943, "num_tokens": 151612208.0, "step": 4525 }, { "epoch": 0.2704154727793696, "grad_norm": 0.6858535408973694, "learning_rate": 4.911373178264723e-05, "loss": 0.5831, "mean_token_accuracy": 0.8192097425460816, "num_tokens": 151772383.0, "step": 4530 }, { "epoch": 0.2707139446036294, "grad_norm": 0.6183403730392456, "learning_rate": 4.911064326359258e-05, "loss": 0.5956, "mean_token_accuracy": 0.8123822093009949, "num_tokens": 151940063.0, "step": 4535 }, { "epoch": 0.2710124164278892, "grad_norm": 0.6655312180519104, "learning_rate": 4.910754948086688e-05, "loss": 0.7022, "mean_token_accuracy": 0.7862400054931641, "num_tokens": 152107743.0, "step": 4540 }, { "epoch": 0.271310888252149, "grad_norm": 0.6107160449028015, "learning_rate": 4.910445043522366e-05, "loss": 0.5805, "mean_token_accuracy": 0.8194679856300354, "num_tokens": 152275423.0, "step": 4545 }, { "epoch": 0.2716093600764088, "grad_norm": 0.6021928191184998, "learning_rate": 4.910134612741777e-05, "loss": 0.6332, "mean_token_accuracy": 0.8057378053665161, "num_tokens": 152443103.0, "step": 4550 }, { "epoch": 0.2719078319006686, "grad_norm": 0.6171747446060181, "learning_rate": 4.909823655820533e-05, "loss": 0.5359, "mean_token_accuracy": 0.8300071597099304, "num_tokens": 152610783.0, "step": 4555 }, { "epoch": 0.2722063037249284, "grad_norm": 0.6629837155342102, "learning_rate": 4.9095121728343703e-05, "loss": 0.6131, "mean_token_accuracy": 0.8109447717666626, "num_tokens": 152778463.0, "step": 4560 }, { "epoch": 0.27250477554918817, "grad_norm": 0.6519883871078491, "learning_rate": 4.909200163859158e-05, "loss": 0.6218, "mean_token_accuracy": 0.8087200284004211, "num_tokens": 152946143.0, "step": 4565 }, { "epoch": 0.27280324737344797, "grad_norm": 0.7325212955474854, "learning_rate": 4.908887628970892e-05, "loss": 0.6511, "mean_token_accuracy": 0.798783254623413, "num_tokens": 153113823.0, "step": 4570 }, { "epoch": 0.27310171919770776, "grad_norm": 0.7060492634773254, "learning_rate": 4.908574568245696e-05, "loss": 0.6135, "mean_token_accuracy": 0.8075808167457581, "num_tokens": 153281503.0, "step": 4575 }, { "epoch": 0.27340019102196755, "grad_norm": 2.399169921875, "learning_rate": 4.90826098175982e-05, "loss": 0.6074, "mean_token_accuracy": 0.8116068243980408, "num_tokens": 153449183.0, "step": 4580 }, { "epoch": 0.2736986628462273, "grad_norm": 0.7301245927810669, "learning_rate": 4.907946869589646e-05, "loss": 0.6652, "mean_token_accuracy": 0.7932243824005127, "num_tokens": 153616863.0, "step": 4585 }, { "epoch": 0.2739971346704871, "grad_norm": 0.8387046456336975, "learning_rate": 4.90763223181168e-05, "loss": 0.638, "mean_token_accuracy": 0.8020458102226258, "num_tokens": 153784543.0, "step": 4590 }, { "epoch": 0.2742956064947469, "grad_norm": 0.6804915070533752, "learning_rate": 4.9073170685025585e-05, "loss": 0.6487, "mean_token_accuracy": 0.7996182918548584, "num_tokens": 153952223.0, "step": 4595 }, { "epoch": 0.2745940783190067, "grad_norm": 0.6785187125205994, "learning_rate": 4.9070013797390466e-05, "loss": 0.6194, "mean_token_accuracy": 0.8091673493385315, "num_tokens": 154119903.0, "step": 4600 }, { "epoch": 0.27489255014326647, "grad_norm": 0.6009164452552795, "learning_rate": 4.906685165598034e-05, "loss": 0.6108, "mean_token_accuracy": 0.8108791589736939, "num_tokens": 154287583.0, "step": 4605 }, { "epoch": 0.27519102196752626, "grad_norm": 0.6155039072036743, "learning_rate": 4.906368426156542e-05, "loss": 0.6083, "mean_token_accuracy": 0.8118394494056702, "num_tokens": 154455263.0, "step": 4610 }, { "epoch": 0.27548949379178606, "grad_norm": 0.779868483543396, "learning_rate": 4.9060511614917174e-05, "loss": 0.6843, "mean_token_accuracy": 0.7925682783126831, "num_tokens": 154622943.0, "step": 4615 }, { "epoch": 0.27578796561604585, "grad_norm": 0.6313185691833496, "learning_rate": 4.905733371680836e-05, "loss": 0.6082, "mean_token_accuracy": 0.8114636778831482, "num_tokens": 154790623.0, "step": 4620 }, { "epoch": 0.27608643744030564, "grad_norm": 0.7433581352233887, "learning_rate": 4.9054150568013015e-05, "loss": 0.6507, "mean_token_accuracy": 0.7985446810722351, "num_tokens": 154958303.0, "step": 4625 }, { "epoch": 0.27638490926456544, "grad_norm": 0.6007549166679382, "learning_rate": 4.905096216930647e-05, "loss": 0.5948, "mean_token_accuracy": 0.8150483131408691, "num_tokens": 155125983.0, "step": 4630 }, { "epoch": 0.27668338108882523, "grad_norm": 0.64377760887146, "learning_rate": 4.904776852146529e-05, "loss": 0.6639, "mean_token_accuracy": 0.7965108036994935, "num_tokens": 155293663.0, "step": 4635 }, { "epoch": 0.276981852913085, "grad_norm": 0.6571340560913086, "learning_rate": 4.904456962526736e-05, "loss": 0.5969, "mean_token_accuracy": 0.8157521247863769, "num_tokens": 155461343.0, "step": 4640 }, { "epoch": 0.2772803247373448, "grad_norm": 0.8081356287002563, "learning_rate": 4.904136548149184e-05, "loss": 0.6486, "mean_token_accuracy": 0.7998512029647827, "num_tokens": 155621705.0, "step": 4645 }, { "epoch": 0.27757879656160456, "grad_norm": 0.5961964726448059, "learning_rate": 4.903815609091915e-05, "loss": 0.5854, "mean_token_accuracy": 0.8178158164024353, "num_tokens": 155789385.0, "step": 4650 }, { "epoch": 0.27787726838586435, "grad_norm": 0.6135856509208679, "learning_rate": 4.903494145433098e-05, "loss": 0.5794, "mean_token_accuracy": 0.8214601039886474, "num_tokens": 155957065.0, "step": 4655 }, { "epoch": 0.27817574021012414, "grad_norm": 0.625612199306488, "learning_rate": 4.903172157251034e-05, "loss": 0.646, "mean_token_accuracy": 0.8011093854904174, "num_tokens": 156124745.0, "step": 4660 }, { "epoch": 0.27847421203438394, "grad_norm": 0.6212636828422546, "learning_rate": 4.9028496446241475e-05, "loss": 0.5763, "mean_token_accuracy": 0.8192711353302002, "num_tokens": 156292425.0, "step": 4665 }, { "epoch": 0.27877268385864373, "grad_norm": 0.6626037359237671, "learning_rate": 4.902526607630994e-05, "loss": 0.6119, "mean_token_accuracy": 0.8083144426345825, "num_tokens": 156460105.0, "step": 4670 }, { "epoch": 0.2790711556829035, "grad_norm": 0.6458653211593628, "learning_rate": 4.9022030463502535e-05, "loss": 0.5655, "mean_token_accuracy": 0.8228319168090821, "num_tokens": 156627785.0, "step": 4675 }, { "epoch": 0.2793696275071633, "grad_norm": 0.7207940220832825, "learning_rate": 4.901878960860736e-05, "loss": 0.6332, "mean_token_accuracy": 0.8043540477752685, "num_tokens": 156795465.0, "step": 4680 }, { "epoch": 0.2796680993314231, "grad_norm": 0.6036509871482849, "learning_rate": 4.9015543512413784e-05, "loss": 0.6246, "mean_token_accuracy": 0.8071394443511963, "num_tokens": 156963145.0, "step": 4685 }, { "epoch": 0.2799665711556829, "grad_norm": 0.6261608600616455, "learning_rate": 4.9012292175712455e-05, "loss": 0.6058, "mean_token_accuracy": 0.8114457845687866, "num_tokens": 157130825.0, "step": 4690 }, { "epoch": 0.2802650429799427, "grad_norm": 0.5815515518188477, "learning_rate": 4.900903559929529e-05, "loss": 0.5283, "mean_token_accuracy": 0.8339675664901733, "num_tokens": 157298505.0, "step": 4695 }, { "epoch": 0.2805635148042025, "grad_norm": 0.8820967078208923, "learning_rate": 4.90057737839555e-05, "loss": 0.6077, "mean_token_accuracy": 0.8129726767539978, "num_tokens": 157466185.0, "step": 4700 }, { "epoch": 0.2808619866284623, "grad_norm": 0.6315600275993347, "learning_rate": 4.900250673048755e-05, "loss": 0.5957, "mean_token_accuracy": 0.8143325924873352, "num_tokens": 157633865.0, "step": 4705 }, { "epoch": 0.2811604584527221, "grad_norm": 0.6613008975982666, "learning_rate": 4.89992344396872e-05, "loss": 0.6525, "mean_token_accuracy": 0.7995705485343934, "num_tokens": 157801545.0, "step": 4710 }, { "epoch": 0.2814589302769819, "grad_norm": 0.6789300441741943, "learning_rate": 4.899595691235147e-05, "loss": 0.6126, "mean_token_accuracy": 0.8100262403488159, "num_tokens": 157969225.0, "step": 4715 }, { "epoch": 0.2817574021012416, "grad_norm": 0.5721486806869507, "learning_rate": 4.8992674149278666e-05, "loss": 0.5895, "mean_token_accuracy": 0.8170821905136109, "num_tokens": 158136905.0, "step": 4720 }, { "epoch": 0.2820558739255014, "grad_norm": 0.658048152923584, "learning_rate": 4.898938615126836e-05, "loss": 0.5828, "mean_token_accuracy": 0.8190743207931519, "num_tokens": 158304585.0, "step": 4725 }, { "epoch": 0.2823543457497612, "grad_norm": 0.6302525997161865, "learning_rate": 4.898609291912141e-05, "loss": 0.5807, "mean_token_accuracy": 0.8197602272033692, "num_tokens": 158472265.0, "step": 4730 }, { "epoch": 0.282652817574021, "grad_norm": 0.7022088766098022, "learning_rate": 4.898279445363994e-05, "loss": 0.6114, "mean_token_accuracy": 0.810401999950409, "num_tokens": 158639945.0, "step": 4735 }, { "epoch": 0.2829512893982808, "grad_norm": 0.686716616153717, "learning_rate": 4.897949075562735e-05, "loss": 0.6057, "mean_token_accuracy": 0.8107419848442078, "num_tokens": 158807625.0, "step": 4740 }, { "epoch": 0.2832497612225406, "grad_norm": 0.6530365347862244, "learning_rate": 4.897618182588832e-05, "loss": 0.6132, "mean_token_accuracy": 0.809799587726593, "num_tokens": 158975305.0, "step": 4745 }, { "epoch": 0.2835482330468004, "grad_norm": 0.6255146861076355, "learning_rate": 4.8972867665228814e-05, "loss": 0.6057, "mean_token_accuracy": 0.8126386761665344, "num_tokens": 159142985.0, "step": 4750 }, { "epoch": 0.2838467048710602, "grad_norm": 0.6031068563461304, "learning_rate": 4.896954827445604e-05, "loss": 0.5822, "mean_token_accuracy": 0.8170821785926818, "num_tokens": 159310665.0, "step": 4755 }, { "epoch": 0.28414517669531997, "grad_norm": 0.5631386637687683, "learning_rate": 4.896622365437849e-05, "loss": 0.5838, "mean_token_accuracy": 0.8180186152458191, "num_tokens": 159478345.0, "step": 4760 }, { "epoch": 0.28444364851957976, "grad_norm": 0.5906351804733276, "learning_rate": 4.896289380580596e-05, "loss": 0.6043, "mean_token_accuracy": 0.8128533959388733, "num_tokens": 159646025.0, "step": 4765 }, { "epoch": 0.28474212034383956, "grad_norm": 0.6804120540618896, "learning_rate": 4.8959558729549474e-05, "loss": 0.5715, "mean_token_accuracy": 0.8215197443962097, "num_tokens": 159813705.0, "step": 4770 }, { "epoch": 0.28504059216809935, "grad_norm": 0.5899435877799988, "learning_rate": 4.8956218426421376e-05, "loss": 0.563, "mean_token_accuracy": 0.8240963816642761, "num_tokens": 159981385.0, "step": 4775 }, { "epoch": 0.28533906399235914, "grad_norm": 0.6063807010650635, "learning_rate": 4.8952872897235235e-05, "loss": 0.6559, "mean_token_accuracy": 0.7989741206169129, "num_tokens": 160149065.0, "step": 4780 }, { "epoch": 0.28563753581661894, "grad_norm": 0.6038150191307068, "learning_rate": 4.894952214280592e-05, "loss": 0.6318, "mean_token_accuracy": 0.804962420463562, "num_tokens": 160316745.0, "step": 4785 }, { "epoch": 0.2859360076408787, "grad_norm": 0.6020172238349915, "learning_rate": 4.894616616394958e-05, "loss": 0.611, "mean_token_accuracy": 0.8092150688171387, "num_tokens": 160484425.0, "step": 4790 }, { "epoch": 0.28623447946513847, "grad_norm": 0.6848268508911133, "learning_rate": 4.8942804961483625e-05, "loss": 0.6041, "mean_token_accuracy": 0.8130681157112122, "num_tokens": 160652105.0, "step": 4795 }, { "epoch": 0.28653295128939826, "grad_norm": 0.6249750256538391, "learning_rate": 4.893943853622672e-05, "loss": 0.6236, "mean_token_accuracy": 0.805588674545288, "num_tokens": 160819785.0, "step": 4800 }, { "epoch": 0.28683142311365806, "grad_norm": 0.5681930184364319, "learning_rate": 4.893606688899884e-05, "loss": 0.5269, "mean_token_accuracy": 0.8348920345306396, "num_tokens": 160987465.0, "step": 4805 }, { "epoch": 0.28712989493791785, "grad_norm": 0.5807148218154907, "learning_rate": 4.893269002062121e-05, "loss": 0.6059, "mean_token_accuracy": 0.8145651936531066, "num_tokens": 161155145.0, "step": 4810 }, { "epoch": 0.28742836676217765, "grad_norm": 0.6384624242782593, "learning_rate": 4.892930793191632e-05, "loss": 0.5722, "mean_token_accuracy": 0.8224919438362122, "num_tokens": 161322825.0, "step": 4815 }, { "epoch": 0.28772683858643744, "grad_norm": 0.6156942248344421, "learning_rate": 4.892592062370795e-05, "loss": 0.6309, "mean_token_accuracy": 0.8054872870445251, "num_tokens": 161490505.0, "step": 4820 }, { "epoch": 0.28802531041069723, "grad_norm": 0.5932487845420837, "learning_rate": 4.892252809682112e-05, "loss": 0.5731, "mean_token_accuracy": 0.820827865600586, "num_tokens": 161658185.0, "step": 4825 }, { "epoch": 0.288323782234957, "grad_norm": 0.5755166411399841, "learning_rate": 4.8919130352082166e-05, "loss": 0.6025, "mean_token_accuracy": 0.8136943817138672, "num_tokens": 161825865.0, "step": 4830 }, { "epoch": 0.2886222540592168, "grad_norm": 0.60075843334198, "learning_rate": 4.8915727390318654e-05, "loss": 0.6399, "mean_token_accuracy": 0.8035786867141723, "num_tokens": 161993545.0, "step": 4835 }, { "epoch": 0.2889207258834766, "grad_norm": 0.7384197115898132, "learning_rate": 4.8912319212359455e-05, "loss": 0.6278, "mean_token_accuracy": 0.8037695288658142, "num_tokens": 162161225.0, "step": 4840 }, { "epoch": 0.2892191977077364, "grad_norm": 0.6177079081535339, "learning_rate": 4.890890581903468e-05, "loss": 0.6294, "mean_token_accuracy": 0.8046045541763306, "num_tokens": 162328905.0, "step": 4845 }, { "epoch": 0.2895176695319962, "grad_norm": 0.6216784715652466, "learning_rate": 4.8905487211175735e-05, "loss": 0.6247, "mean_token_accuracy": 0.8074018836021424, "num_tokens": 162496585.0, "step": 4850 }, { "epoch": 0.289816141356256, "grad_norm": 0.6456221342086792, "learning_rate": 4.890206338961528e-05, "loss": 0.5592, "mean_token_accuracy": 0.8251819133758544, "num_tokens": 162664265.0, "step": 4855 }, { "epoch": 0.29011461318051573, "grad_norm": 0.5484521985054016, "learning_rate": 4.8898634355187253e-05, "loss": 0.5744, "mean_token_accuracy": 0.8210604906082153, "num_tokens": 162831945.0, "step": 4860 }, { "epoch": 0.29041308500477553, "grad_norm": 0.5259699821472168, "learning_rate": 4.889520010872685e-05, "loss": 0.56, "mean_token_accuracy": 0.8252773523330689, "num_tokens": 162999625.0, "step": 4865 }, { "epoch": 0.2907115568290353, "grad_norm": 0.6381258964538574, "learning_rate": 4.889176065107055e-05, "loss": 0.6122, "mean_token_accuracy": 0.8114159584045411, "num_tokens": 163167305.0, "step": 4870 }, { "epoch": 0.2910100286532951, "grad_norm": 0.6049275398254395, "learning_rate": 4.88883159830561e-05, "loss": 0.5841, "mean_token_accuracy": 0.819223427772522, "num_tokens": 163334985.0, "step": 4875 }, { "epoch": 0.2913085004775549, "grad_norm": 0.7412250638008118, "learning_rate": 4.888486610552251e-05, "loss": 0.593, "mean_token_accuracy": 0.8158833503723144, "num_tokens": 163502665.0, "step": 4880 }, { "epoch": 0.2916069723018147, "grad_norm": 0.587427020072937, "learning_rate": 4.888141101931005e-05, "loss": 0.5552, "mean_token_accuracy": 0.8264642596244812, "num_tokens": 163670345.0, "step": 4885 }, { "epoch": 0.2919054441260745, "grad_norm": 0.5680304765701294, "learning_rate": 4.887795072526029e-05, "loss": 0.5707, "mean_token_accuracy": 0.8226350903511047, "num_tokens": 163838025.0, "step": 4890 }, { "epoch": 0.2922039159503343, "grad_norm": 0.6085522770881653, "learning_rate": 4.8874485224216045e-05, "loss": 0.6243, "mean_token_accuracy": 0.8064058303833008, "num_tokens": 164005705.0, "step": 4895 }, { "epoch": 0.2925023877745941, "grad_norm": 0.580993115901947, "learning_rate": 4.887101451702139e-05, "loss": 0.5771, "mean_token_accuracy": 0.821000850200653, "num_tokens": 164173385.0, "step": 4900 }, { "epoch": 0.2928008595988539, "grad_norm": 0.6264471411705017, "learning_rate": 4.886753860452168e-05, "loss": 0.5935, "mean_token_accuracy": 0.8143146872520447, "num_tokens": 164341065.0, "step": 4905 }, { "epoch": 0.2930993314231137, "grad_norm": 0.6031619310379028, "learning_rate": 4.886405748756354e-05, "loss": 0.5942, "mean_token_accuracy": 0.8155314326286316, "num_tokens": 164508745.0, "step": 4910 }, { "epoch": 0.29339780324737347, "grad_norm": 0.6414000391960144, "learning_rate": 4.886057116699488e-05, "loss": 0.6379, "mean_token_accuracy": 0.8034951686859131, "num_tokens": 164676425.0, "step": 4915 }, { "epoch": 0.29369627507163326, "grad_norm": 0.7670672535896301, "learning_rate": 4.8857079643664835e-05, "loss": 0.6457, "mean_token_accuracy": 0.8030001282691955, "num_tokens": 164844105.0, "step": 4920 }, { "epoch": 0.29399474689589306, "grad_norm": 0.7135143876075745, "learning_rate": 4.885358291842383e-05, "loss": 0.6023, "mean_token_accuracy": 0.8126386761665344, "num_tokens": 165011785.0, "step": 4925 }, { "epoch": 0.2942932187201528, "grad_norm": 0.5915950536727905, "learning_rate": 4.8850080992123566e-05, "loss": 0.617, "mean_token_accuracy": 0.8075629234313965, "num_tokens": 165179465.0, "step": 4930 }, { "epoch": 0.2945916905444126, "grad_norm": 0.6435756683349609, "learning_rate": 4.8846573865617e-05, "loss": 0.5822, "mean_token_accuracy": 0.8188059210777283, "num_tokens": 165347145.0, "step": 4935 }, { "epoch": 0.2948901623686724, "grad_norm": 0.6211757063865662, "learning_rate": 4.8843061539758356e-05, "loss": 0.5978, "mean_token_accuracy": 0.81315758228302, "num_tokens": 165514825.0, "step": 4940 }, { "epoch": 0.2951886341929322, "grad_norm": 0.5791261196136475, "learning_rate": 4.8839544015403124e-05, "loss": 0.6208, "mean_token_accuracy": 0.8100761532783508, "num_tokens": 165682265.0, "step": 4945 }, { "epoch": 0.29548710601719197, "grad_norm": 0.5903180837631226, "learning_rate": 4.883602129340807e-05, "loss": 0.5966, "mean_token_accuracy": 0.8133126497268677, "num_tokens": 165849945.0, "step": 4950 }, { "epoch": 0.29578557784145176, "grad_norm": 0.6267656683921814, "learning_rate": 4.883249337463121e-05, "loss": 0.5853, "mean_token_accuracy": 0.8176667094230652, "num_tokens": 166017625.0, "step": 4955 }, { "epoch": 0.29608404966571156, "grad_norm": 0.7100227475166321, "learning_rate": 4.882896025993183e-05, "loss": 0.5917, "mean_token_accuracy": 0.8154658198356628, "num_tokens": 166185305.0, "step": 4960 }, { "epoch": 0.29638252148997135, "grad_norm": 0.6177758574485779, "learning_rate": 4.8825421950170504e-05, "loss": 0.5757, "mean_token_accuracy": 0.8209352374076844, "num_tokens": 166352985.0, "step": 4965 }, { "epoch": 0.29668099331423115, "grad_norm": 0.6195228099822998, "learning_rate": 4.882187844620902e-05, "loss": 0.6513, "mean_token_accuracy": 0.7995347738265991, "num_tokens": 166520665.0, "step": 4970 }, { "epoch": 0.29697946513849094, "grad_norm": 0.6087715029716492, "learning_rate": 4.88183297489105e-05, "loss": 0.6273, "mean_token_accuracy": 0.8037814617156982, "num_tokens": 166688345.0, "step": 4975 }, { "epoch": 0.29727793696275073, "grad_norm": 0.6276805996894836, "learning_rate": 4.8814775859139275e-05, "loss": 0.6761, "mean_token_accuracy": 0.7932720899581909, "num_tokens": 166856025.0, "step": 4980 }, { "epoch": 0.2975764087870105, "grad_norm": 0.6047452688217163, "learning_rate": 4.881121677776097e-05, "loss": 0.5799, "mean_token_accuracy": 0.8176786303520203, "num_tokens": 167023705.0, "step": 4985 }, { "epoch": 0.2978748806112703, "grad_norm": 0.6401591897010803, "learning_rate": 4.880765250564245e-05, "loss": 0.5929, "mean_token_accuracy": 0.8156089663505555, "num_tokens": 167191385.0, "step": 4990 }, { "epoch": 0.29817335243553006, "grad_norm": 0.5788501501083374, "learning_rate": 4.8804083043651866e-05, "loss": 0.5909, "mean_token_accuracy": 0.8146785020828247, "num_tokens": 167359065.0, "step": 4995 }, { "epoch": 0.29847182425978985, "grad_norm": 0.6091424226760864, "learning_rate": 4.880050839265863e-05, "loss": 0.6354, "mean_token_accuracy": 0.8067947149276733, "num_tokens": 167519785.0, "step": 5000 }, { "epoch": 0.29877029608404965, "grad_norm": 0.6005545854568481, "learning_rate": 4.879692855353342e-05, "loss": 0.6073, "mean_token_accuracy": 0.811767864227295, "num_tokens": 167687465.0, "step": 5005 }, { "epoch": 0.29906876790830944, "grad_norm": 0.5670431852340698, "learning_rate": 4.879334352714815e-05, "loss": 0.5295, "mean_token_accuracy": 0.8326553702354431, "num_tokens": 167855145.0, "step": 5010 }, { "epoch": 0.29936723973256923, "grad_norm": 0.6601032018661499, "learning_rate": 4.878975331437605e-05, "loss": 0.5877, "mean_token_accuracy": 0.8166706323623657, "num_tokens": 168022825.0, "step": 5015 }, { "epoch": 0.29966571155682903, "grad_norm": 0.6273783445358276, "learning_rate": 4.8786157916091557e-05, "loss": 0.6016, "mean_token_accuracy": 0.8141297698020935, "num_tokens": 168190505.0, "step": 5020 }, { "epoch": 0.2999641833810888, "grad_norm": 0.6342858672142029, "learning_rate": 4.8782557333170407e-05, "loss": 0.5616, "mean_token_accuracy": 0.8260228872299195, "num_tokens": 168358185.0, "step": 5025 }, { "epoch": 0.3002626552053486, "grad_norm": 0.6288208961486816, "learning_rate": 4.877895156648959e-05, "loss": 0.6302, "mean_token_accuracy": 0.804348087310791, "num_tokens": 168525865.0, "step": 5030 }, { "epoch": 0.3005611270296084, "grad_norm": 0.6230400204658508, "learning_rate": 4.8775340616927357e-05, "loss": 0.5905, "mean_token_accuracy": 0.8134021043777466, "num_tokens": 168693545.0, "step": 5035 }, { "epoch": 0.3008595988538682, "grad_norm": 0.6192215085029602, "learning_rate": 4.877172448536322e-05, "loss": 0.5606, "mean_token_accuracy": 0.8259632587432861, "num_tokens": 168861225.0, "step": 5040 }, { "epoch": 0.301158070678128, "grad_norm": 0.6162911057472229, "learning_rate": 4.8768103172677946e-05, "loss": 0.614, "mean_token_accuracy": 0.8094655871391296, "num_tokens": 169028905.0, "step": 5045 }, { "epoch": 0.3014565425023878, "grad_norm": 0.6504818797111511, "learning_rate": 4.876447667975358e-05, "loss": 0.5949, "mean_token_accuracy": 0.8127400755882264, "num_tokens": 169196585.0, "step": 5050 }, { "epoch": 0.3017550143266476, "grad_norm": 0.5875588059425354, "learning_rate": 4.8760845007473425e-05, "loss": 0.5729, "mean_token_accuracy": 0.8196349740028381, "num_tokens": 169364265.0, "step": 5055 }, { "epoch": 0.3020534861509074, "grad_norm": 0.5812243819236755, "learning_rate": 4.875720815672204e-05, "loss": 0.578, "mean_token_accuracy": 0.8185196161270142, "num_tokens": 169531945.0, "step": 5060 }, { "epoch": 0.3023519579751671, "grad_norm": 0.5958006381988525, "learning_rate": 4.8753566128385245e-05, "loss": 0.6153, "mean_token_accuracy": 0.8077537894248963, "num_tokens": 169699625.0, "step": 5065 }, { "epoch": 0.3026504297994269, "grad_norm": 0.6520281434059143, "learning_rate": 4.8749918923350116e-05, "loss": 0.5923, "mean_token_accuracy": 0.8155552864074707, "num_tokens": 169867305.0, "step": 5070 }, { "epoch": 0.3029489016236867, "grad_norm": 0.6548631191253662, "learning_rate": 4.874626654250502e-05, "loss": 0.6496, "mean_token_accuracy": 0.800375759601593, "num_tokens": 170034985.0, "step": 5075 }, { "epoch": 0.3032473734479465, "grad_norm": 0.5772848725318909, "learning_rate": 4.874260898673953e-05, "loss": 0.6061, "mean_token_accuracy": 0.811040198802948, "num_tokens": 170202665.0, "step": 5080 }, { "epoch": 0.3035458452722063, "grad_norm": 0.6648582220077515, "learning_rate": 4.873894625694453e-05, "loss": 0.6712, "mean_token_accuracy": 0.7928247570991516, "num_tokens": 170370345.0, "step": 5085 }, { "epoch": 0.3038443170964661, "grad_norm": 0.5819846391677856, "learning_rate": 4.873527835401215e-05, "loss": 0.569, "mean_token_accuracy": 0.8232553958892822, "num_tokens": 170538025.0, "step": 5090 }, { "epoch": 0.3041427889207259, "grad_norm": 0.613950252532959, "learning_rate": 4.8731605278835765e-05, "loss": 0.63, "mean_token_accuracy": 0.8058272719383239, "num_tokens": 170705705.0, "step": 5095 }, { "epoch": 0.3044412607449857, "grad_norm": 0.6041838526725769, "learning_rate": 4.872792703231003e-05, "loss": 0.6195, "mean_token_accuracy": 0.8056960463523865, "num_tokens": 170873385.0, "step": 5100 }, { "epoch": 0.30473973256924547, "grad_norm": 0.5953035354614258, "learning_rate": 4.872424361533083e-05, "loss": 0.548, "mean_token_accuracy": 0.8270905375480652, "num_tokens": 171041065.0, "step": 5105 }, { "epoch": 0.30503820439350526, "grad_norm": 0.6270595192909241, "learning_rate": 4.872055502879536e-05, "loss": 0.6101, "mean_token_accuracy": 0.8102767467498779, "num_tokens": 171208745.0, "step": 5110 }, { "epoch": 0.30533667621776506, "grad_norm": 1.1966700553894043, "learning_rate": 4.871686127360202e-05, "loss": 0.6191, "mean_token_accuracy": 0.8083681106567383, "num_tokens": 171376425.0, "step": 5115 }, { "epoch": 0.30563514804202485, "grad_norm": 0.6643148064613342, "learning_rate": 4.8713162350650494e-05, "loss": 0.6257, "mean_token_accuracy": 0.806465458869934, "num_tokens": 171544105.0, "step": 5120 }, { "epoch": 0.30593361986628465, "grad_norm": 0.5752717852592468, "learning_rate": 4.8709458260841726e-05, "loss": 0.5948, "mean_token_accuracy": 0.8158594846725464, "num_tokens": 171711785.0, "step": 5125 }, { "epoch": 0.30623209169054444, "grad_norm": 0.6883658170700073, "learning_rate": 4.870574900507792e-05, "loss": 0.5731, "mean_token_accuracy": 0.8207443714141845, "num_tokens": 171879465.0, "step": 5130 }, { "epoch": 0.3065305635148042, "grad_norm": 0.5872220993041992, "learning_rate": 4.8702034584262526e-05, "loss": 0.5396, "mean_token_accuracy": 0.8304544806480407, "num_tokens": 172047145.0, "step": 5135 }, { "epoch": 0.30682903533906397, "grad_norm": 0.7178948521614075, "learning_rate": 4.869831499930028e-05, "loss": 0.6339, "mean_token_accuracy": 0.8045568346977234, "num_tokens": 172214825.0, "step": 5140 }, { "epoch": 0.30712750716332377, "grad_norm": 0.6904928684234619, "learning_rate": 4.869459025109712e-05, "loss": 0.6241, "mean_token_accuracy": 0.8038709282875061, "num_tokens": 172382505.0, "step": 5145 }, { "epoch": 0.30742597898758356, "grad_norm": 0.5749028325080872, "learning_rate": 4.8690860340560305e-05, "loss": 0.5499, "mean_token_accuracy": 0.8282416939735413, "num_tokens": 172550185.0, "step": 5150 }, { "epoch": 0.30772445081184335, "grad_norm": 0.6001327633857727, "learning_rate": 4.8687125268598313e-05, "loss": 0.555, "mean_token_accuracy": 0.8251520872116089, "num_tokens": 172717865.0, "step": 5155 }, { "epoch": 0.30802292263610315, "grad_norm": 0.5972523093223572, "learning_rate": 4.86833850361209e-05, "loss": 0.5873, "mean_token_accuracy": 0.817756175994873, "num_tokens": 172885545.0, "step": 5160 }, { "epoch": 0.30832139446036294, "grad_norm": 0.5958548784255981, "learning_rate": 4.867963964403906e-05, "loss": 0.5543, "mean_token_accuracy": 0.825915539264679, "num_tokens": 173053225.0, "step": 5165 }, { "epoch": 0.30861986628462273, "grad_norm": 0.6663253903388977, "learning_rate": 4.8675889093265045e-05, "loss": 0.5804, "mean_token_accuracy": 0.8201359868049621, "num_tokens": 173220905.0, "step": 5170 }, { "epoch": 0.30891833810888253, "grad_norm": 0.6232882738113403, "learning_rate": 4.867213338471239e-05, "loss": 0.5736, "mean_token_accuracy": 0.8201598525047302, "num_tokens": 173388585.0, "step": 5175 }, { "epoch": 0.3092168099331423, "grad_norm": 0.6426844000816345, "learning_rate": 4.866837251929585e-05, "loss": 0.6398, "mean_token_accuracy": 0.8025706648826599, "num_tokens": 173556265.0, "step": 5180 }, { "epoch": 0.3095152817574021, "grad_norm": 1.2553555965423584, "learning_rate": 4.866460649793146e-05, "loss": 0.6617, "mean_token_accuracy": 0.7992246150970459, "num_tokens": 173723945.0, "step": 5185 }, { "epoch": 0.3098137535816619, "grad_norm": 0.7143647074699402, "learning_rate": 4.866083532153651e-05, "loss": 0.5855, "mean_token_accuracy": 0.8164678454399109, "num_tokens": 173891625.0, "step": 5190 }, { "epoch": 0.3101122254059217, "grad_norm": 0.672615647315979, "learning_rate": 4.865705899102953e-05, "loss": 0.6071, "mean_token_accuracy": 0.8126148223876953, "num_tokens": 174059305.0, "step": 5195 }, { "epoch": 0.3104106972301815, "grad_norm": 0.6389151215553284, "learning_rate": 4.865327750733031e-05, "loss": 0.6086, "mean_token_accuracy": 0.8121197700500489, "num_tokens": 174226985.0, "step": 5200 }, { "epoch": 0.31070916905444124, "grad_norm": 0.6401156783103943, "learning_rate": 4.864949087135992e-05, "loss": 0.562, "mean_token_accuracy": 0.8238935947418213, "num_tokens": 174394665.0, "step": 5205 }, { "epoch": 0.31100764087870103, "grad_norm": 0.5634213089942932, "learning_rate": 4.8645699084040635e-05, "loss": 0.5407, "mean_token_accuracy": 0.8304485321044922, "num_tokens": 174562345.0, "step": 5210 }, { "epoch": 0.3113061127029608, "grad_norm": 0.6145051717758179, "learning_rate": 4.8641902146296046e-05, "loss": 0.5903, "mean_token_accuracy": 0.8170046448707581, "num_tokens": 174730025.0, "step": 5215 }, { "epoch": 0.3116045845272206, "grad_norm": 0.6142057776451111, "learning_rate": 4.8638100059050954e-05, "loss": 0.5961, "mean_token_accuracy": 0.8149648070335388, "num_tokens": 174897705.0, "step": 5220 }, { "epoch": 0.3119030563514804, "grad_norm": 0.6189945936203003, "learning_rate": 4.863429282323143e-05, "loss": 0.5458, "mean_token_accuracy": 0.8285876154899597, "num_tokens": 175065385.0, "step": 5225 }, { "epoch": 0.3122015281757402, "grad_norm": 0.6174318790435791, "learning_rate": 4.8630480439764795e-05, "loss": 0.5513, "mean_token_accuracy": 0.8268877506256104, "num_tokens": 175233065.0, "step": 5230 }, { "epoch": 0.3125, "grad_norm": 0.7125818133354187, "learning_rate": 4.862666290957962e-05, "loss": 0.6382, "mean_token_accuracy": 0.8030537962913513, "num_tokens": 175400745.0, "step": 5235 }, { "epoch": 0.3127984718242598, "grad_norm": 0.6511279344558716, "learning_rate": 4.862284023360574e-05, "loss": 0.6085, "mean_token_accuracy": 0.8101157069206237, "num_tokens": 175568425.0, "step": 5240 }, { "epoch": 0.3130969436485196, "grad_norm": 0.5827562808990479, "learning_rate": 4.861901241277424e-05, "loss": 0.634, "mean_token_accuracy": 0.8035011410713195, "num_tokens": 175736105.0, "step": 5245 }, { "epoch": 0.3133954154727794, "grad_norm": 0.6435595154762268, "learning_rate": 4.8615179448017455e-05, "loss": 0.6515, "mean_token_accuracy": 0.8005535006523132, "num_tokens": 175903562.0, "step": 5250 }, { "epoch": 0.3136938872970392, "grad_norm": 0.6507923603057861, "learning_rate": 4.8611341340268977e-05, "loss": 0.6437, "mean_token_accuracy": 0.8026899814605712, "num_tokens": 176071242.0, "step": 5255 }, { "epoch": 0.31399235912129897, "grad_norm": 0.5808089375495911, "learning_rate": 4.860749809046364e-05, "loss": 0.6198, "mean_token_accuracy": 0.8061851263046265, "num_tokens": 176238922.0, "step": 5260 }, { "epoch": 0.31429083094555876, "grad_norm": 0.6327818632125854, "learning_rate": 4.860364969953754e-05, "loss": 0.6264, "mean_token_accuracy": 0.8052010059356689, "num_tokens": 176406602.0, "step": 5265 }, { "epoch": 0.3145893027698185, "grad_norm": 0.6396894454956055, "learning_rate": 4.859979616842803e-05, "loss": 0.5811, "mean_token_accuracy": 0.8183526277542115, "num_tokens": 176574282.0, "step": 5270 }, { "epoch": 0.3148877745940783, "grad_norm": 1.0681463479995728, "learning_rate": 4.859593749807371e-05, "loss": 0.6022, "mean_token_accuracy": 0.8144280195236206, "num_tokens": 176741962.0, "step": 5275 }, { "epoch": 0.3151862464183381, "grad_norm": 0.5932404398918152, "learning_rate": 4.8592073689414425e-05, "loss": 0.6206, "mean_token_accuracy": 0.8069306969642639, "num_tokens": 176909642.0, "step": 5280 }, { "epoch": 0.3154847182425979, "grad_norm": 0.624931812286377, "learning_rate": 4.858820474339128e-05, "loss": 0.6432, "mean_token_accuracy": 0.8028987169265747, "num_tokens": 177077322.0, "step": 5285 }, { "epoch": 0.3157831900668577, "grad_norm": 0.6247636079788208, "learning_rate": 4.8584330660946616e-05, "loss": 0.5895, "mean_token_accuracy": 0.8153167128562927, "num_tokens": 177245002.0, "step": 5290 }, { "epoch": 0.31608166189111747, "grad_norm": 0.7553396224975586, "learning_rate": 4.8580451443024055e-05, "loss": 0.6278, "mean_token_accuracy": 0.8084098815917968, "num_tokens": 177412682.0, "step": 5295 }, { "epoch": 0.31638013371537727, "grad_norm": 0.7019616365432739, "learning_rate": 4.8576567090568444e-05, "loss": 0.6217, "mean_token_accuracy": 0.8063640713691711, "num_tokens": 177580362.0, "step": 5300 }, { "epoch": 0.31667860553963706, "grad_norm": 0.6351245045661926, "learning_rate": 4.857267760452589e-05, "loss": 0.6409, "mean_token_accuracy": 0.8039186477661133, "num_tokens": 177748042.0, "step": 5305 }, { "epoch": 0.31697707736389685, "grad_norm": 0.6876610517501831, "learning_rate": 4.856878298584375e-05, "loss": 0.6451, "mean_token_accuracy": 0.8022128224372864, "num_tokens": 177915722.0, "step": 5310 }, { "epoch": 0.31727554918815665, "grad_norm": 0.6411224603652954, "learning_rate": 4.856488323547062e-05, "loss": 0.5567, "mean_token_accuracy": 0.8240248203277588, "num_tokens": 178083402.0, "step": 5315 }, { "epoch": 0.31757402101241644, "grad_norm": 0.5736415982246399, "learning_rate": 4.856097835435637e-05, "loss": 0.5964, "mean_token_accuracy": 0.8160384178161622, "num_tokens": 178251082.0, "step": 5320 }, { "epoch": 0.31787249283667623, "grad_norm": 0.6970974802970886, "learning_rate": 4.85570683434521e-05, "loss": 0.6301, "mean_token_accuracy": 0.8065429925918579, "num_tokens": 178418762.0, "step": 5325 }, { "epoch": 0.31817096466093603, "grad_norm": 0.5778356790542603, "learning_rate": 4.855315320371017e-05, "loss": 0.6304, "mean_token_accuracy": 0.8064356327056885, "num_tokens": 178586442.0, "step": 5330 }, { "epoch": 0.3184694364851958, "grad_norm": 0.6154081225395203, "learning_rate": 4.854923293608418e-05, "loss": 0.5991, "mean_token_accuracy": 0.8138733148574829, "num_tokens": 178754122.0, "step": 5335 }, { "epoch": 0.31876790830945556, "grad_norm": 0.5807360410690308, "learning_rate": 4.8545307541528976e-05, "loss": 0.5316, "mean_token_accuracy": 0.8323934197425842, "num_tokens": 178919522.0, "step": 5340 }, { "epoch": 0.31906638013371535, "grad_norm": 0.5846946239471436, "learning_rate": 4.854137702100068e-05, "loss": 0.6195, "mean_token_accuracy": 0.8068889498710632, "num_tokens": 179087202.0, "step": 5345 }, { "epoch": 0.31936485195797515, "grad_norm": 0.679115355014801, "learning_rate": 4.8537441375456616e-05, "loss": 0.5838, "mean_token_accuracy": 0.8179350972175599, "num_tokens": 179254882.0, "step": 5350 }, { "epoch": 0.31966332378223494, "grad_norm": 0.6877555251121521, "learning_rate": 4.853350060585541e-05, "loss": 0.5909, "mean_token_accuracy": 0.815525472164154, "num_tokens": 179422562.0, "step": 5355 }, { "epoch": 0.31996179560649474, "grad_norm": 0.5969734787940979, "learning_rate": 4.8529554713156894e-05, "loss": 0.5847, "mean_token_accuracy": 0.8171123147010804, "num_tokens": 179587954.0, "step": 5360 }, { "epoch": 0.32026026743075453, "grad_norm": 0.6116472482681274, "learning_rate": 4.852560369832217e-05, "loss": 0.562, "mean_token_accuracy": 0.8221042513847351, "num_tokens": 179755634.0, "step": 5365 }, { "epoch": 0.3205587392550143, "grad_norm": 0.5308366417884827, "learning_rate": 4.852164756231357e-05, "loss": 0.5794, "mean_token_accuracy": 0.8180126547813416, "num_tokens": 179923314.0, "step": 5370 }, { "epoch": 0.3208572110792741, "grad_norm": 0.652714729309082, "learning_rate": 4.85176863060947e-05, "loss": 0.5416, "mean_token_accuracy": 0.8298818945884705, "num_tokens": 180090994.0, "step": 5375 }, { "epoch": 0.3211556829035339, "grad_norm": 0.5556767582893372, "learning_rate": 4.8513719930630384e-05, "loss": 0.5427, "mean_token_accuracy": 0.8290886402130127, "num_tokens": 180258674.0, "step": 5380 }, { "epoch": 0.3214541547277937, "grad_norm": 0.7110474109649658, "learning_rate": 4.850974843688671e-05, "loss": 0.6025, "mean_token_accuracy": 0.8142908215522766, "num_tokens": 180426354.0, "step": 5385 }, { "epoch": 0.3217526265520535, "grad_norm": 0.5711827874183655, "learning_rate": 4.850577182583101e-05, "loss": 0.5814, "mean_token_accuracy": 0.8186866283416748, "num_tokens": 180594034.0, "step": 5390 }, { "epoch": 0.3220510983763133, "grad_norm": 0.5733540654182434, "learning_rate": 4.850179009843185e-05, "loss": 0.5854, "mean_token_accuracy": 0.8165752053260803, "num_tokens": 180761714.0, "step": 5395 }, { "epoch": 0.3223495702005731, "grad_norm": 0.6314049959182739, "learning_rate": 4.849780325565908e-05, "loss": 0.6026, "mean_token_accuracy": 0.8126088500022888, "num_tokens": 180929394.0, "step": 5400 }, { "epoch": 0.3226480420248329, "grad_norm": 0.6054881811141968, "learning_rate": 4.8493811298483734e-05, "loss": 0.5932, "mean_token_accuracy": 0.8160324454307556, "num_tokens": 181097074.0, "step": 5405 }, { "epoch": 0.3229465138490926, "grad_norm": 0.7026839256286621, "learning_rate": 4.848981422787816e-05, "loss": 0.6158, "mean_token_accuracy": 0.8087021350860596, "num_tokens": 181264754.0, "step": 5410 }, { "epoch": 0.3232449856733524, "grad_norm": 0.6497235298156738, "learning_rate": 4.848581204481589e-05, "loss": 0.5707, "mean_token_accuracy": 0.8209889173507691, "num_tokens": 181432434.0, "step": 5415 }, { "epoch": 0.3235434574976122, "grad_norm": 0.6246758699417114, "learning_rate": 4.848180475027175e-05, "loss": 0.5663, "mean_token_accuracy": 0.8242335677146911, "num_tokens": 181600114.0, "step": 5420 }, { "epoch": 0.323841929321872, "grad_norm": 0.5725294351577759, "learning_rate": 4.847779234522179e-05, "loss": 0.5731, "mean_token_accuracy": 0.818722414970398, "num_tokens": 181767794.0, "step": 5425 }, { "epoch": 0.3241404011461318, "grad_norm": 0.6300048828125, "learning_rate": 4.8473774830643285e-05, "loss": 0.5898, "mean_token_accuracy": 0.8144757270812988, "num_tokens": 181935474.0, "step": 5430 }, { "epoch": 0.3244388729703916, "grad_norm": 0.6033457517623901, "learning_rate": 4.84697522075148e-05, "loss": 0.549, "mean_token_accuracy": 0.8275855898857116, "num_tokens": 182103154.0, "step": 5435 }, { "epoch": 0.3247373447946514, "grad_norm": 0.6648246645927429, "learning_rate": 4.84657244768161e-05, "loss": 0.6028, "mean_token_accuracy": 0.8138315558433533, "num_tokens": 182270834.0, "step": 5440 }, { "epoch": 0.3250358166189112, "grad_norm": 0.6584506630897522, "learning_rate": 4.846169163952823e-05, "loss": 0.6648, "mean_token_accuracy": 0.7933377027511597, "num_tokens": 182438514.0, "step": 5445 }, { "epoch": 0.32533428844317097, "grad_norm": 0.6389214992523193, "learning_rate": 4.8457653696633445e-05, "loss": 0.548, "mean_token_accuracy": 0.8272515892982483, "num_tokens": 182606194.0, "step": 5450 }, { "epoch": 0.32563276026743077, "grad_norm": 0.5182211995124817, "learning_rate": 4.845361064911526e-05, "loss": 0.518, "mean_token_accuracy": 0.8367708563804627, "num_tokens": 182773874.0, "step": 5455 }, { "epoch": 0.32593123209169056, "grad_norm": 0.558883547782898, "learning_rate": 4.844956249795846e-05, "loss": 0.5726, "mean_token_accuracy": 0.8197960257530212, "num_tokens": 182941554.0, "step": 5460 }, { "epoch": 0.32622970391595035, "grad_norm": 0.5931894183158875, "learning_rate": 4.844550924414902e-05, "loss": 0.5469, "mean_token_accuracy": 0.8291184544563294, "num_tokens": 183109234.0, "step": 5465 }, { "epoch": 0.32652817574021015, "grad_norm": 0.5589408874511719, "learning_rate": 4.84414508886742e-05, "loss": 0.5301, "mean_token_accuracy": 0.8336275815963745, "num_tokens": 183276914.0, "step": 5470 }, { "epoch": 0.32682664756446994, "grad_norm": 0.5838602781295776, "learning_rate": 4.843738743252247e-05, "loss": 0.5677, "mean_token_accuracy": 0.8215972661972046, "num_tokens": 183444594.0, "step": 5475 }, { "epoch": 0.3271251193887297, "grad_norm": 0.5662837624549866, "learning_rate": 4.843331887668358e-05, "loss": 0.5726, "mean_token_accuracy": 0.8211618661880493, "num_tokens": 183612274.0, "step": 5480 }, { "epoch": 0.3274235912129895, "grad_norm": 0.5298907160758972, "learning_rate": 4.842924522214849e-05, "loss": 0.6208, "mean_token_accuracy": 0.8065310716629028, "num_tokens": 183779954.0, "step": 5485 }, { "epoch": 0.32772206303724927, "grad_norm": 0.6623823642730713, "learning_rate": 4.842516646990941e-05, "loss": 0.5948, "mean_token_accuracy": 0.8137659430503845, "num_tokens": 183947634.0, "step": 5490 }, { "epoch": 0.32802053486150906, "grad_norm": 0.6267683506011963, "learning_rate": 4.842108262095981e-05, "loss": 0.5876, "mean_token_accuracy": 0.8164260983467102, "num_tokens": 184115314.0, "step": 5495 }, { "epoch": 0.32831900668576885, "grad_norm": 0.5885863900184631, "learning_rate": 4.8416993676294376e-05, "loss": 0.577, "mean_token_accuracy": 0.8195395350456238, "num_tokens": 184282994.0, "step": 5500 }, { "epoch": 0.32861747851002865, "grad_norm": 0.6000836491584778, "learning_rate": 4.8412899636909046e-05, "loss": 0.5741, "mean_token_accuracy": 0.8211916923522949, "num_tokens": 184450674.0, "step": 5505 }, { "epoch": 0.32891595033428844, "grad_norm": 0.7105187773704529, "learning_rate": 4.840880050380101e-05, "loss": 0.6194, "mean_token_accuracy": 0.8071275234222413, "num_tokens": 184618354.0, "step": 5510 }, { "epoch": 0.32921442215854824, "grad_norm": 0.5919845700263977, "learning_rate": 4.840469627796867e-05, "loss": 0.5702, "mean_token_accuracy": 0.8209590792655945, "num_tokens": 184786034.0, "step": 5515 }, { "epoch": 0.32951289398280803, "grad_norm": 0.7687093019485474, "learning_rate": 4.84005869604117e-05, "loss": 0.6197, "mean_token_accuracy": 0.8077418565750122, "num_tokens": 184953714.0, "step": 5520 }, { "epoch": 0.3298113658070678, "grad_norm": 0.5766890645027161, "learning_rate": 4.839647255213099e-05, "loss": 0.5799, "mean_token_accuracy": 0.8185673356056213, "num_tokens": 185121394.0, "step": 5525 }, { "epoch": 0.3301098376313276, "grad_norm": 0.5770844221115112, "learning_rate": 4.839235305412869e-05, "loss": 0.5586, "mean_token_accuracy": 0.8244423151016236, "num_tokens": 185289074.0, "step": 5530 }, { "epoch": 0.3304083094555874, "grad_norm": 0.628121018409729, "learning_rate": 4.838822846740818e-05, "loss": 0.6078, "mean_token_accuracy": 0.8108910918235779, "num_tokens": 185456754.0, "step": 5535 }, { "epoch": 0.3307067812798472, "grad_norm": 0.6534087657928467, "learning_rate": 4.838409879297407e-05, "loss": 0.5987, "mean_token_accuracy": 0.8138912081718445, "num_tokens": 185624434.0, "step": 5540 }, { "epoch": 0.33100525310410694, "grad_norm": 0.6166808009147644, "learning_rate": 4.837996403183221e-05, "loss": 0.5865, "mean_token_accuracy": 0.8158236861228942, "num_tokens": 185792114.0, "step": 5545 }, { "epoch": 0.33130372492836674, "grad_norm": 0.6812239289283752, "learning_rate": 4.837582418498972e-05, "loss": 0.5766, "mean_token_accuracy": 0.8192234396934509, "num_tokens": 185959794.0, "step": 5550 }, { "epoch": 0.33160219675262653, "grad_norm": 0.969882607460022, "learning_rate": 4.837167925345493e-05, "loss": 0.5693, "mean_token_accuracy": 0.8214481711387634, "num_tokens": 186127474.0, "step": 5555 }, { "epoch": 0.3319006685768863, "grad_norm": 0.6157154440879822, "learning_rate": 4.8367529238237415e-05, "loss": 0.6312, "mean_token_accuracy": 0.8026601552963257, "num_tokens": 186295154.0, "step": 5560 }, { "epoch": 0.3321991404011461, "grad_norm": 0.5648495554924011, "learning_rate": 4.836337414034798e-05, "loss": 0.5949, "mean_token_accuracy": 0.8149349808692932, "num_tokens": 186462834.0, "step": 5565 }, { "epoch": 0.3324976122254059, "grad_norm": 0.6061586737632751, "learning_rate": 4.835921396079867e-05, "loss": 0.6278, "mean_token_accuracy": 0.8072468042373657, "num_tokens": 186630514.0, "step": 5570 }, { "epoch": 0.3327960840496657, "grad_norm": 0.5938173532485962, "learning_rate": 4.835504870060279e-05, "loss": 0.6261, "mean_token_accuracy": 0.806191086769104, "num_tokens": 186798194.0, "step": 5575 }, { "epoch": 0.3330945558739255, "grad_norm": 0.580461859703064, "learning_rate": 4.835087836077486e-05, "loss": 0.6032, "mean_token_accuracy": 0.8120541572570801, "num_tokens": 186965874.0, "step": 5580 }, { "epoch": 0.3333930276981853, "grad_norm": 0.5872578024864197, "learning_rate": 4.834670294233064e-05, "loss": 0.57, "mean_token_accuracy": 0.8211857318878174, "num_tokens": 187133554.0, "step": 5585 }, { "epoch": 0.3336914995224451, "grad_norm": 0.6603530645370483, "learning_rate": 4.834252244628714e-05, "loss": 0.5981, "mean_token_accuracy": 0.8149349808692932, "num_tokens": 187301234.0, "step": 5590 }, { "epoch": 0.3339899713467049, "grad_norm": 0.587928056716919, "learning_rate": 4.8338336873662575e-05, "loss": 0.5821, "mean_token_accuracy": 0.8193904399871826, "num_tokens": 187468914.0, "step": 5595 }, { "epoch": 0.3342884431709647, "grad_norm": 0.582267165184021, "learning_rate": 4.833414622547643e-05, "loss": 0.6273, "mean_token_accuracy": 0.8049087524414062, "num_tokens": 187636594.0, "step": 5600 }, { "epoch": 0.33458691499522447, "grad_norm": 0.6343382596969604, "learning_rate": 4.832995050274943e-05, "loss": 0.6159, "mean_token_accuracy": 0.8087200284004211, "num_tokens": 187804274.0, "step": 5605 }, { "epoch": 0.33488538681948427, "grad_norm": 0.8218846917152405, "learning_rate": 4.832574970650351e-05, "loss": 0.6026, "mean_token_accuracy": 0.8121257305145264, "num_tokens": 187971954.0, "step": 5610 }, { "epoch": 0.335183858643744, "grad_norm": 0.5850387811660767, "learning_rate": 4.8321543837761845e-05, "loss": 0.5869, "mean_token_accuracy": 0.8166825652122498, "num_tokens": 188139634.0, "step": 5615 }, { "epoch": 0.3354823304680038, "grad_norm": 0.6085715293884277, "learning_rate": 4.8317332897548854e-05, "loss": 0.6095, "mean_token_accuracy": 0.8088845252990723, "num_tokens": 188304602.0, "step": 5620 }, { "epoch": 0.3357808022922636, "grad_norm": 0.6014330983161926, "learning_rate": 4.83131168868902e-05, "loss": 0.5904, "mean_token_accuracy": 0.8155970335006714, "num_tokens": 188472282.0, "step": 5625 }, { "epoch": 0.3360792741165234, "grad_norm": 0.6113285422325134, "learning_rate": 4.830889580681276e-05, "loss": 0.5406, "mean_token_accuracy": 0.8278301358222961, "num_tokens": 188639962.0, "step": 5630 }, { "epoch": 0.3363777459407832, "grad_norm": 0.5936959385871887, "learning_rate": 4.830466965834465e-05, "loss": 0.6193, "mean_token_accuracy": 0.8090242147445679, "num_tokens": 188807642.0, "step": 5635 }, { "epoch": 0.336676217765043, "grad_norm": 0.9623625874519348, "learning_rate": 4.830043844251524e-05, "loss": 0.5776, "mean_token_accuracy": 0.8192413210868835, "num_tokens": 188975322.0, "step": 5640 }, { "epoch": 0.33697468958930277, "grad_norm": 0.6620087623596191, "learning_rate": 4.829620216035511e-05, "loss": 0.6114, "mean_token_accuracy": 0.8087617874145507, "num_tokens": 189143002.0, "step": 5645 }, { "epoch": 0.33727316141356256, "grad_norm": 0.6530949473381042, "learning_rate": 4.82919608128961e-05, "loss": 0.6088, "mean_token_accuracy": 0.808946681022644, "num_tokens": 189310682.0, "step": 5650 }, { "epoch": 0.33757163323782235, "grad_norm": 0.5739794969558716, "learning_rate": 4.828771440117125e-05, "loss": 0.5597, "mean_token_accuracy": 0.8250924468040466, "num_tokens": 189478362.0, "step": 5655 }, { "epoch": 0.33787010506208215, "grad_norm": 0.7298800349235535, "learning_rate": 4.828346292621487e-05, "loss": 0.6046, "mean_token_accuracy": 0.8116187572479248, "num_tokens": 189646042.0, "step": 5660 }, { "epoch": 0.33816857688634194, "grad_norm": 0.8617460131645203, "learning_rate": 4.827920638906246e-05, "loss": 0.6807, "mean_token_accuracy": 0.7946856856346131, "num_tokens": 189813722.0, "step": 5665 }, { "epoch": 0.33846704871060174, "grad_norm": 0.6872591376304626, "learning_rate": 4.82749447907508e-05, "loss": 0.6192, "mean_token_accuracy": 0.8082965612411499, "num_tokens": 189981402.0, "step": 5670 }, { "epoch": 0.33876552053486153, "grad_norm": 0.7133093476295471, "learning_rate": 4.827067813231788e-05, "loss": 0.6134, "mean_token_accuracy": 0.8093224287033081, "num_tokens": 190149082.0, "step": 5675 }, { "epoch": 0.3390639923591213, "grad_norm": 0.627886176109314, "learning_rate": 4.8266406414802916e-05, "loss": 0.5919, "mean_token_accuracy": 0.8165155649185181, "num_tokens": 190316762.0, "step": 5680 }, { "epoch": 0.33936246418338106, "grad_norm": 0.5913046002388, "learning_rate": 4.8262129639246366e-05, "loss": 0.5536, "mean_token_accuracy": 0.8268459916114808, "num_tokens": 190484442.0, "step": 5685 }, { "epoch": 0.33966093600764086, "grad_norm": 0.5674902200698853, "learning_rate": 4.825784780668991e-05, "loss": 0.5862, "mean_token_accuracy": 0.8183526158332824, "num_tokens": 190652122.0, "step": 5690 }, { "epoch": 0.33995940783190065, "grad_norm": 0.5599184632301331, "learning_rate": 4.825356091817648e-05, "loss": 0.5884, "mean_token_accuracy": 0.8159370064735413, "num_tokens": 190819802.0, "step": 5695 }, { "epoch": 0.34025787965616044, "grad_norm": 0.5582353472709656, "learning_rate": 4.8249268974750227e-05, "loss": 0.5732, "mean_token_accuracy": 0.8200942397117614, "num_tokens": 190987482.0, "step": 5700 }, { "epoch": 0.34055635148042024, "grad_norm": 0.6247255206108093, "learning_rate": 4.824497197745653e-05, "loss": 0.5789, "mean_token_accuracy": 0.8181975364685059, "num_tokens": 191155162.0, "step": 5705 }, { "epoch": 0.34085482330468003, "grad_norm": 0.695816159248352, "learning_rate": 4.8240669927341986e-05, "loss": 0.5873, "mean_token_accuracy": 0.8188416957855225, "num_tokens": 191322842.0, "step": 5710 }, { "epoch": 0.3411532951289398, "grad_norm": 0.5786401033401489, "learning_rate": 4.823636282545446e-05, "loss": 0.5789, "mean_token_accuracy": 0.8193904399871826, "num_tokens": 191490522.0, "step": 5715 }, { "epoch": 0.3414517669531996, "grad_norm": 0.5475450754165649, "learning_rate": 4.823205067284301e-05, "loss": 0.5965, "mean_token_accuracy": 0.8146308064460754, "num_tokens": 191658202.0, "step": 5720 }, { "epoch": 0.3417502387774594, "grad_norm": 0.9496316313743591, "learning_rate": 4.822773347055796e-05, "loss": 0.5691, "mean_token_accuracy": 0.823947274684906, "num_tokens": 191825882.0, "step": 5725 }, { "epoch": 0.3420487106017192, "grad_norm": 0.703660786151886, "learning_rate": 4.8223411219650835e-05, "loss": 0.6503, "mean_token_accuracy": 0.8019205451011657, "num_tokens": 191993562.0, "step": 5730 }, { "epoch": 0.342347182425979, "grad_norm": 0.56575608253479, "learning_rate": 4.821908392117439e-05, "loss": 0.5818, "mean_token_accuracy": 0.8168614864349365, "num_tokens": 192161242.0, "step": 5735 }, { "epoch": 0.3426456542502388, "grad_norm": 0.5935403108596802, "learning_rate": 4.8214751576182626e-05, "loss": 0.5999, "mean_token_accuracy": 0.8140880346298218, "num_tokens": 192328922.0, "step": 5740 }, { "epoch": 0.3429441260744986, "grad_norm": 0.5520789623260498, "learning_rate": 4.821041418573077e-05, "loss": 0.5297, "mean_token_accuracy": 0.8318740248680114, "num_tokens": 192496602.0, "step": 5745 }, { "epoch": 0.3432425978987584, "grad_norm": 0.6562302708625793, "learning_rate": 4.820607175087526e-05, "loss": 0.5844, "mean_token_accuracy": 0.816754150390625, "num_tokens": 192664282.0, "step": 5750 }, { "epoch": 0.3435410697230181, "grad_norm": 0.6340336203575134, "learning_rate": 4.820172427267379e-05, "loss": 0.6331, "mean_token_accuracy": 0.805099618434906, "num_tokens": 192831962.0, "step": 5755 }, { "epoch": 0.3438395415472779, "grad_norm": 0.6429221034049988, "learning_rate": 4.819737175218527e-05, "loss": 0.6264, "mean_token_accuracy": 0.8077299356460571, "num_tokens": 192999642.0, "step": 5760 }, { "epoch": 0.3441380133715377, "grad_norm": 1.1211243867874146, "learning_rate": 4.8193014190469815e-05, "loss": 0.555, "mean_token_accuracy": 0.824060595035553, "num_tokens": 193167322.0, "step": 5765 }, { "epoch": 0.3444364851957975, "grad_norm": 0.66719651222229, "learning_rate": 4.818865158858882e-05, "loss": 0.6682, "mean_token_accuracy": 0.7946558475494385, "num_tokens": 193335002.0, "step": 5770 }, { "epoch": 0.3447349570200573, "grad_norm": 0.6126347780227661, "learning_rate": 4.818428394760485e-05, "loss": 0.5907, "mean_token_accuracy": 0.8166944980621338, "num_tokens": 193502682.0, "step": 5775 }, { "epoch": 0.3450334288443171, "grad_norm": 0.627385139465332, "learning_rate": 4.817991126858173e-05, "loss": 0.6272, "mean_token_accuracy": 0.8069843769073486, "num_tokens": 193670362.0, "step": 5780 }, { "epoch": 0.3453319006685769, "grad_norm": 0.6703435182571411, "learning_rate": 4.817553355258453e-05, "loss": 0.6911, "mean_token_accuracy": 0.7877192020416259, "num_tokens": 193838042.0, "step": 5785 }, { "epoch": 0.3456303724928367, "grad_norm": 0.5937191247940063, "learning_rate": 4.81711508006795e-05, "loss": 0.6407, "mean_token_accuracy": 0.8059823393821717, "num_tokens": 194005722.0, "step": 5790 }, { "epoch": 0.3459288443170965, "grad_norm": 0.6775105595588684, "learning_rate": 4.816676301393415e-05, "loss": 0.5873, "mean_token_accuracy": 0.8178337097167969, "num_tokens": 194173402.0, "step": 5795 }, { "epoch": 0.34622731614135627, "grad_norm": 0.6267567873001099, "learning_rate": 4.81623701934172e-05, "loss": 0.5597, "mean_token_accuracy": 0.8242633819580079, "num_tokens": 194341082.0, "step": 5800 }, { "epoch": 0.34652578796561606, "grad_norm": 0.6258201003074646, "learning_rate": 4.8157972340198605e-05, "loss": 0.5549, "mean_token_accuracy": 0.8259095788002014, "num_tokens": 194508762.0, "step": 5805 }, { "epoch": 0.34682425978987586, "grad_norm": 0.5671982169151306, "learning_rate": 4.8153569455349545e-05, "loss": 0.5331, "mean_token_accuracy": 0.831414771080017, "num_tokens": 194676442.0, "step": 5810 }, { "epoch": 0.34712273161413565, "grad_norm": 0.6403409242630005, "learning_rate": 4.814916153994243e-05, "loss": 0.5619, "mean_token_accuracy": 0.8223845720291137, "num_tokens": 194844122.0, "step": 5815 }, { "epoch": 0.34742120343839544, "grad_norm": 0.6637452840805054, "learning_rate": 4.814474859505087e-05, "loss": 0.624, "mean_token_accuracy": 0.8084635496139526, "num_tokens": 195011802.0, "step": 5820 }, { "epoch": 0.3477196752626552, "grad_norm": 0.6146729588508606, "learning_rate": 4.8140330621749735e-05, "loss": 0.5998, "mean_token_accuracy": 0.8148813128471375, "num_tokens": 195179482.0, "step": 5825 }, { "epoch": 0.348018147086915, "grad_norm": 0.5506935715675354, "learning_rate": 4.8135907621115105e-05, "loss": 0.5396, "mean_token_accuracy": 0.8299057602882385, "num_tokens": 195347162.0, "step": 5830 }, { "epoch": 0.34831661891117477, "grad_norm": 0.612335205078125, "learning_rate": 4.813147959422427e-05, "loss": 0.5724, "mean_token_accuracy": 0.821591317653656, "num_tokens": 195514842.0, "step": 5835 }, { "epoch": 0.34861509073543456, "grad_norm": 0.6685415506362915, "learning_rate": 4.812704654215577e-05, "loss": 0.5801, "mean_token_accuracy": 0.818758201599121, "num_tokens": 195682522.0, "step": 5840 }, { "epoch": 0.34891356255969436, "grad_norm": 0.6858870387077332, "learning_rate": 4.812260846598936e-05, "loss": 0.578, "mean_token_accuracy": 0.8179887890815735, "num_tokens": 195850202.0, "step": 5845 }, { "epoch": 0.34921203438395415, "grad_norm": 0.7021246552467346, "learning_rate": 4.8118165366806e-05, "loss": 0.5741, "mean_token_accuracy": 0.8202373862266541, "num_tokens": 196017882.0, "step": 5850 }, { "epoch": 0.34951050620821394, "grad_norm": 0.6232729554176331, "learning_rate": 4.81137172456879e-05, "loss": 0.6022, "mean_token_accuracy": 0.8130025029182434, "num_tokens": 196185562.0, "step": 5855 }, { "epoch": 0.34980897803247374, "grad_norm": 1.3280320167541504, "learning_rate": 4.8109264103718485e-05, "loss": 0.5551, "mean_token_accuracy": 0.8264940977096558, "num_tokens": 196353242.0, "step": 5860 }, { "epoch": 0.35010744985673353, "grad_norm": 0.7067484855651855, "learning_rate": 4.810480594198239e-05, "loss": 0.6009, "mean_token_accuracy": 0.8130800485610962, "num_tokens": 196520922.0, "step": 5865 }, { "epoch": 0.3504059216809933, "grad_norm": 0.5749301314353943, "learning_rate": 4.8100342761565486e-05, "loss": 0.5474, "mean_token_accuracy": 0.8267028570175171, "num_tokens": 196688602.0, "step": 5870 }, { "epoch": 0.3507043935052531, "grad_norm": 0.5796326994895935, "learning_rate": 4.809587456355486e-05, "loss": 0.5897, "mean_token_accuracy": 0.8188715219497681, "num_tokens": 196856282.0, "step": 5875 }, { "epoch": 0.3510028653295129, "grad_norm": 0.6365723013877869, "learning_rate": 4.8091401349038826e-05, "loss": 0.6236, "mean_token_accuracy": 0.8061314463615418, "num_tokens": 197023962.0, "step": 5880 }, { "epoch": 0.3513013371537727, "grad_norm": 0.545111358165741, "learning_rate": 4.808692311910693e-05, "loss": 0.5416, "mean_token_accuracy": 0.8279911637306213, "num_tokens": 197191642.0, "step": 5885 }, { "epoch": 0.35159980897803245, "grad_norm": 0.6369797587394714, "learning_rate": 4.808243987484991e-05, "loss": 0.6055, "mean_token_accuracy": 0.8125849962234497, "num_tokens": 197359322.0, "step": 5890 }, { "epoch": 0.35189828080229224, "grad_norm": 0.5923847556114197, "learning_rate": 4.8077951617359756e-05, "loss": 0.6146, "mean_token_accuracy": 0.8082548022270203, "num_tokens": 197527002.0, "step": 5895 }, { "epoch": 0.35219675262655203, "grad_norm": 0.652245283126831, "learning_rate": 4.8073458347729656e-05, "loss": 0.6368, "mean_token_accuracy": 0.8015030503273011, "num_tokens": 197694682.0, "step": 5900 }, { "epoch": 0.35249522445081183, "grad_norm": 0.5620662569999695, "learning_rate": 4.806896006705403e-05, "loss": 0.5735, "mean_token_accuracy": 0.8208934664726257, "num_tokens": 197862362.0, "step": 5905 }, { "epoch": 0.3527936962750716, "grad_norm": 0.6360174417495728, "learning_rate": 4.806445677642852e-05, "loss": 0.5581, "mean_token_accuracy": 0.8255397796630859, "num_tokens": 198030042.0, "step": 5910 }, { "epoch": 0.3530921680993314, "grad_norm": 0.6562075018882751, "learning_rate": 4.805994847695e-05, "loss": 0.569, "mean_token_accuracy": 0.8215137839317321, "num_tokens": 198197722.0, "step": 5915 }, { "epoch": 0.3533906399235912, "grad_norm": 0.6920533776283264, "learning_rate": 4.8055435169716525e-05, "loss": 0.5723, "mean_token_accuracy": 0.8186985611915588, "num_tokens": 198365402.0, "step": 5920 }, { "epoch": 0.353689111747851, "grad_norm": 0.5866786241531372, "learning_rate": 4.80509168558274e-05, "loss": 0.56, "mean_token_accuracy": 0.8237142205238343, "num_tokens": 198527639.0, "step": 5925 }, { "epoch": 0.3539875835721108, "grad_norm": 0.5532835721969604, "learning_rate": 4.804639353638315e-05, "loss": 0.6067, "mean_token_accuracy": 0.8116366386413574, "num_tokens": 198695319.0, "step": 5930 }, { "epoch": 0.3542860553963706, "grad_norm": 0.5809913277626038, "learning_rate": 4.8041865212485515e-05, "loss": 0.5903, "mean_token_accuracy": 0.8160861253738403, "num_tokens": 198862999.0, "step": 5935 }, { "epoch": 0.3545845272206304, "grad_norm": 0.689055323600769, "learning_rate": 4.8037331885237447e-05, "loss": 0.6276, "mean_token_accuracy": 0.8048729658126831, "num_tokens": 199030679.0, "step": 5940 }, { "epoch": 0.3548829990448902, "grad_norm": 0.5742608904838562, "learning_rate": 4.8032793555743114e-05, "loss": 0.5871, "mean_token_accuracy": 0.8169748306274414, "num_tokens": 199198359.0, "step": 5945 }, { "epoch": 0.35518147086915, "grad_norm": 0.6180241107940674, "learning_rate": 4.802825022510793e-05, "loss": 0.5768, "mean_token_accuracy": 0.8204461455345153, "num_tokens": 199366039.0, "step": 5950 }, { "epoch": 0.35547994269340977, "grad_norm": 1.0495320558547974, "learning_rate": 4.802370189443849e-05, "loss": 0.5744, "mean_token_accuracy": 0.8227603673934937, "num_tokens": 199533719.0, "step": 5955 }, { "epoch": 0.3557784145176695, "grad_norm": 0.6751816868782043, "learning_rate": 4.8019148564842634e-05, "loss": 0.625, "mean_token_accuracy": 0.8050518989562988, "num_tokens": 199701399.0, "step": 5960 }, { "epoch": 0.3560768863419293, "grad_norm": 0.5694411396980286, "learning_rate": 4.8014590237429405e-05, "loss": 0.5916, "mean_token_accuracy": 0.8127042770385742, "num_tokens": 199869079.0, "step": 5965 }, { "epoch": 0.3563753581661891, "grad_norm": 0.5997521281242371, "learning_rate": 4.801002691330906e-05, "loss": 0.6002, "mean_token_accuracy": 0.8116902589797974, "num_tokens": 200036676.0, "step": 5970 }, { "epoch": 0.3566738299904489, "grad_norm": 0.6004998683929443, "learning_rate": 4.80054585935931e-05, "loss": 0.6003, "mean_token_accuracy": 0.8130382895469666, "num_tokens": 200204356.0, "step": 5975 }, { "epoch": 0.3569723018147087, "grad_norm": 0.7464216351509094, "learning_rate": 4.8000885279394206e-05, "loss": 0.5725, "mean_token_accuracy": 0.8201896786689759, "num_tokens": 200372036.0, "step": 5980 }, { "epoch": 0.3572707736389685, "grad_norm": 0.6673312783241272, "learning_rate": 4.79963069718263e-05, "loss": 0.5677, "mean_token_accuracy": 0.8203149199485779, "num_tokens": 200539716.0, "step": 5985 }, { "epoch": 0.35756924546322827, "grad_norm": 0.5537075996398926, "learning_rate": 4.799172367200451e-05, "loss": 0.5545, "mean_token_accuracy": 0.824185848236084, "num_tokens": 200707396.0, "step": 5990 }, { "epoch": 0.35786771728748806, "grad_norm": 0.5983226299285889, "learning_rate": 4.798713538104519e-05, "loss": 0.5804, "mean_token_accuracy": 0.818108081817627, "num_tokens": 200875076.0, "step": 5995 }, { "epoch": 0.35816618911174786, "grad_norm": 0.5817372798919678, "learning_rate": 4.798254210006589e-05, "loss": 0.5443, "mean_token_accuracy": 0.8290826559066773, "num_tokens": 201042756.0, "step": 6000 }, { "epoch": 0.35846466093600765, "grad_norm": 0.5799314975738525, "learning_rate": 4.7977943830185406e-05, "loss": 0.5388, "mean_token_accuracy": 0.8296015620231628, "num_tokens": 201210436.0, "step": 6005 }, { "epoch": 0.35876313276026744, "grad_norm": 0.6342442035675049, "learning_rate": 4.7973340572523724e-05, "loss": 0.6185, "mean_token_accuracy": 0.8091793060302734, "num_tokens": 201378116.0, "step": 6010 }, { "epoch": 0.35906160458452724, "grad_norm": 0.619718611240387, "learning_rate": 4.7968732328202046e-05, "loss": 0.5994, "mean_token_accuracy": 0.8141953945159912, "num_tokens": 201545796.0, "step": 6015 }, { "epoch": 0.35936007640878703, "grad_norm": 0.5960201621055603, "learning_rate": 4.7964119098342804e-05, "loss": 0.5697, "mean_token_accuracy": 0.822354769706726, "num_tokens": 201713476.0, "step": 6020 }, { "epoch": 0.3596585482330468, "grad_norm": 0.5542184114456177, "learning_rate": 4.795950088406962e-05, "loss": 0.5548, "mean_token_accuracy": 0.8247345805168151, "num_tokens": 201881156.0, "step": 6025 }, { "epoch": 0.35995702005730656, "grad_norm": 0.5916186571121216, "learning_rate": 4.795487768650737e-05, "loss": 0.5881, "mean_token_accuracy": 0.8167601108551026, "num_tokens": 202048836.0, "step": 6030 }, { "epoch": 0.36025549188156636, "grad_norm": 0.6291908025741577, "learning_rate": 4.7950249506782104e-05, "loss": 0.6296, "mean_token_accuracy": 0.8032906413078308, "num_tokens": 202213547.0, "step": 6035 }, { "epoch": 0.36055396370582615, "grad_norm": 0.6131892204284668, "learning_rate": 4.794561634602109e-05, "loss": 0.5751, "mean_token_accuracy": 0.8217881441116333, "num_tokens": 202381227.0, "step": 6040 }, { "epoch": 0.36085243553008595, "grad_norm": 0.5936862826347351, "learning_rate": 4.7940978205352845e-05, "loss": 0.5563, "mean_token_accuracy": 0.8248419404029846, "num_tokens": 202548907.0, "step": 6045 }, { "epoch": 0.36115090735434574, "grad_norm": 0.5770553946495056, "learning_rate": 4.7936335085907064e-05, "loss": 0.5773, "mean_token_accuracy": 0.8193128824234008, "num_tokens": 202716587.0, "step": 6050 }, { "epoch": 0.36144937917860553, "grad_norm": 0.6152001023292542, "learning_rate": 4.7931686988814644e-05, "loss": 0.5468, "mean_token_accuracy": 0.8282595753669739, "num_tokens": 202884267.0, "step": 6055 }, { "epoch": 0.36174785100286533, "grad_norm": 0.5983825922012329, "learning_rate": 4.792703391520775e-05, "loss": 0.5844, "mean_token_accuracy": 0.8193784952163696, "num_tokens": 203051947.0, "step": 6060 }, { "epoch": 0.3620463228271251, "grad_norm": 0.6283164620399475, "learning_rate": 4.79223758662197e-05, "loss": 0.5762, "mean_token_accuracy": 0.8198198676109314, "num_tokens": 203219627.0, "step": 6065 }, { "epoch": 0.3623447946513849, "grad_norm": 0.6504350900650024, "learning_rate": 4.791771284298505e-05, "loss": 0.5752, "mean_token_accuracy": 0.8200107336044311, "num_tokens": 203387307.0, "step": 6070 }, { "epoch": 0.3626432664756447, "grad_norm": 0.7191824316978455, "learning_rate": 4.791304484663957e-05, "loss": 0.5735, "mean_token_accuracy": 0.8221042633056641, "num_tokens": 203554987.0, "step": 6075 }, { "epoch": 0.3629417382999045, "grad_norm": 0.5728492140769958, "learning_rate": 4.7908371878320234e-05, "loss": 0.5967, "mean_token_accuracy": 0.8138315677642822, "num_tokens": 203722667.0, "step": 6080 }, { "epoch": 0.3632402101241643, "grad_norm": 0.5692455172538757, "learning_rate": 4.790369393916524e-05, "loss": 0.5384, "mean_token_accuracy": 0.8291065335273743, "num_tokens": 203890347.0, "step": 6085 }, { "epoch": 0.3635386819484241, "grad_norm": 0.5722931623458862, "learning_rate": 4.789901103031397e-05, "loss": 0.5352, "mean_token_accuracy": 0.8326374769210816, "num_tokens": 204058027.0, "step": 6090 }, { "epoch": 0.3638371537726839, "grad_norm": 0.6488029956817627, "learning_rate": 4.789432315290705e-05, "loss": 0.5678, "mean_token_accuracy": 0.8214243054389954, "num_tokens": 204225707.0, "step": 6095 }, { "epoch": 0.3641356255969436, "grad_norm": 0.7144589424133301, "learning_rate": 4.788963030808628e-05, "loss": 0.5958, "mean_token_accuracy": 0.8143087267875672, "num_tokens": 204393387.0, "step": 6100 }, { "epoch": 0.3644340974212034, "grad_norm": 0.6138399839401245, "learning_rate": 4.78849324969947e-05, "loss": 0.5498, "mean_token_accuracy": 0.8279136300086976, "num_tokens": 204561067.0, "step": 6105 }, { "epoch": 0.3647325692454632, "grad_norm": 0.5629082322120667, "learning_rate": 4.7880229720776556e-05, "loss": 0.5881, "mean_token_accuracy": 0.8167958855628967, "num_tokens": 204728747.0, "step": 6110 }, { "epoch": 0.365031041069723, "grad_norm": 0.6037213206291199, "learning_rate": 4.787552198057728e-05, "loss": 0.5968, "mean_token_accuracy": 0.8123762488365174, "num_tokens": 204896427.0, "step": 6115 }, { "epoch": 0.3653295128939828, "grad_norm": 0.5730690360069275, "learning_rate": 4.787080927754354e-05, "loss": 0.543, "mean_token_accuracy": 0.8294703722000122, "num_tokens": 205064107.0, "step": 6120 }, { "epoch": 0.3656279847182426, "grad_norm": 0.5808693170547485, "learning_rate": 4.78660916128232e-05, "loss": 0.6218, "mean_token_accuracy": 0.8055827260017395, "num_tokens": 205231787.0, "step": 6125 }, { "epoch": 0.3659264565425024, "grad_norm": 0.5321030616760254, "learning_rate": 4.786136898756533e-05, "loss": 0.5775, "mean_token_accuracy": 0.8188536286354064, "num_tokens": 205399467.0, "step": 6130 }, { "epoch": 0.3662249283667622, "grad_norm": 0.5482165813446045, "learning_rate": 4.785664140292021e-05, "loss": 0.6068, "mean_token_accuracy": 0.8114696383476258, "num_tokens": 205567147.0, "step": 6135 }, { "epoch": 0.366523400191022, "grad_norm": 0.5584302544593811, "learning_rate": 4.785190886003934e-05, "loss": 0.5869, "mean_token_accuracy": 0.8161099672317504, "num_tokens": 205734827.0, "step": 6140 }, { "epoch": 0.36682187201528177, "grad_norm": 0.5813570618629456, "learning_rate": 4.784717136007542e-05, "loss": 0.584, "mean_token_accuracy": 0.8184301614761352, "num_tokens": 205902507.0, "step": 6145 }, { "epoch": 0.36712034383954156, "grad_norm": 0.57797771692276, "learning_rate": 4.7842428904182346e-05, "loss": 0.5445, "mean_token_accuracy": 0.8281761169433594, "num_tokens": 206065792.0, "step": 6150 }, { "epoch": 0.36741881566380136, "grad_norm": 0.6327470541000366, "learning_rate": 4.783768149351523e-05, "loss": 0.6095, "mean_token_accuracy": 0.8096922397613525, "num_tokens": 206233472.0, "step": 6155 }, { "epoch": 0.36771728748806115, "grad_norm": 0.5569842457771301, "learning_rate": 4.78329291292304e-05, "loss": 0.5963, "mean_token_accuracy": 0.8137421011924744, "num_tokens": 206401152.0, "step": 6160 }, { "epoch": 0.3680157593123209, "grad_norm": 0.6722360849380493, "learning_rate": 4.782817181248537e-05, "loss": 0.5768, "mean_token_accuracy": 0.8195932149887085, "num_tokens": 206568832.0, "step": 6165 }, { "epoch": 0.3683142311365807, "grad_norm": 0.6275756359100342, "learning_rate": 4.782340954443889e-05, "loss": 0.584, "mean_token_accuracy": 0.8166110038757324, "num_tokens": 206736512.0, "step": 6170 }, { "epoch": 0.3686127029608405, "grad_norm": 0.6040167212486267, "learning_rate": 4.7818642326250886e-05, "loss": 0.5611, "mean_token_accuracy": 0.8252415657043457, "num_tokens": 206904192.0, "step": 6175 }, { "epoch": 0.36891117478510027, "grad_norm": 0.5967251062393188, "learning_rate": 4.78138701590825e-05, "loss": 0.5449, "mean_token_accuracy": 0.8273828029632568, "num_tokens": 207071872.0, "step": 6180 }, { "epoch": 0.36920964660936006, "grad_norm": 0.5543979406356812, "learning_rate": 4.780909304409609e-05, "loss": 0.5373, "mean_token_accuracy": 0.8296134948730469, "num_tokens": 207239552.0, "step": 6185 }, { "epoch": 0.36950811843361986, "grad_norm": 0.5790201425552368, "learning_rate": 4.7804310982455206e-05, "loss": 0.6047, "mean_token_accuracy": 0.8101813077926636, "num_tokens": 207407232.0, "step": 6190 }, { "epoch": 0.36980659025787965, "grad_norm": 0.6514462232589722, "learning_rate": 4.779952397532461e-05, "loss": 0.6143, "mean_token_accuracy": 0.8098652124404907, "num_tokens": 207574912.0, "step": 6195 }, { "epoch": 0.37010506208213945, "grad_norm": 0.553922712802887, "learning_rate": 4.779473202387026e-05, "loss": 0.6114, "mean_token_accuracy": 0.8086484551429749, "num_tokens": 207742592.0, "step": 6200 }, { "epoch": 0.37040353390639924, "grad_norm": 0.6184931397438049, "learning_rate": 4.7789935129259325e-05, "loss": 0.6002, "mean_token_accuracy": 0.8134378910064697, "num_tokens": 207910272.0, "step": 6205 }, { "epoch": 0.37070200573065903, "grad_norm": 0.599904477596283, "learning_rate": 4.778513329266018e-05, "loss": 0.5428, "mean_token_accuracy": 0.827180016040802, "num_tokens": 208077952.0, "step": 6210 }, { "epoch": 0.37100047755491883, "grad_norm": 0.5533261299133301, "learning_rate": 4.778032651524241e-05, "loss": 0.547, "mean_token_accuracy": 0.8287009477615357, "num_tokens": 208245632.0, "step": 6215 }, { "epoch": 0.3712989493791786, "grad_norm": 0.63448166847229, "learning_rate": 4.777551479817677e-05, "loss": 0.6222, "mean_token_accuracy": 0.8068472027778626, "num_tokens": 208413312.0, "step": 6220 }, { "epoch": 0.3715974212034384, "grad_norm": 0.593621015548706, "learning_rate": 4.7770698142635274e-05, "loss": 0.5822, "mean_token_accuracy": 0.8181200146675109, "num_tokens": 208580992.0, "step": 6225 }, { "epoch": 0.3718958930276982, "grad_norm": 0.603478729724884, "learning_rate": 4.776587654979108e-05, "loss": 0.5902, "mean_token_accuracy": 0.8145592331886291, "num_tokens": 208748672.0, "step": 6230 }, { "epoch": 0.37219436485195795, "grad_norm": 0.6217207908630371, "learning_rate": 4.776105002081859e-05, "loss": 0.5379, "mean_token_accuracy": 0.8302874684333801, "num_tokens": 208916352.0, "step": 6235 }, { "epoch": 0.37249283667621774, "grad_norm": 0.5804060101509094, "learning_rate": 4.7756218556893396e-05, "loss": 0.5486, "mean_token_accuracy": 0.826667058467865, "num_tokens": 209084032.0, "step": 6240 }, { "epoch": 0.37279130850047754, "grad_norm": 0.5571052432060242, "learning_rate": 4.7751382159192276e-05, "loss": 0.5739, "mean_token_accuracy": 0.8195097208023071, "num_tokens": 209251712.0, "step": 6245 }, { "epoch": 0.37308978032473733, "grad_norm": 1.8832085132598877, "learning_rate": 4.7746540828893246e-05, "loss": 0.5994, "mean_token_accuracy": 0.8152332186698914, "num_tokens": 209419392.0, "step": 6250 }, { "epoch": 0.3733882521489971, "grad_norm": 0.6478602886199951, "learning_rate": 4.774169456717548e-05, "loss": 0.5896, "mean_token_accuracy": 0.8154598593711853, "num_tokens": 209587072.0, "step": 6255 }, { "epoch": 0.3736867239732569, "grad_norm": 0.5612985491752625, "learning_rate": 4.7736843375219384e-05, "loss": 0.5836, "mean_token_accuracy": 0.8176607370376587, "num_tokens": 209754752.0, "step": 6260 }, { "epoch": 0.3739851957975167, "grad_norm": 0.5482649207115173, "learning_rate": 4.7731987254206555e-05, "loss": 0.5573, "mean_token_accuracy": 0.8265477776527405, "num_tokens": 209922432.0, "step": 6265 }, { "epoch": 0.3742836676217765, "grad_norm": 0.5954232215881348, "learning_rate": 4.7727126205319794e-05, "loss": 0.5789, "mean_token_accuracy": 0.8219231247901917, "num_tokens": 210084696.0, "step": 6270 }, { "epoch": 0.3745821394460363, "grad_norm": 0.5713602304458618, "learning_rate": 4.772226022974309e-05, "loss": 0.5537, "mean_token_accuracy": 0.825426459312439, "num_tokens": 210252376.0, "step": 6275 }, { "epoch": 0.3748806112702961, "grad_norm": 0.5526502728462219, "learning_rate": 4.771738932866165e-05, "loss": 0.5746, "mean_token_accuracy": 0.8212215185165406, "num_tokens": 210420056.0, "step": 6280 }, { "epoch": 0.3751790830945559, "grad_norm": 0.5443446636199951, "learning_rate": 4.771251350326187e-05, "loss": 0.5628, "mean_token_accuracy": 0.8232554078102112, "num_tokens": 210587736.0, "step": 6285 }, { "epoch": 0.3754775549188157, "grad_norm": 0.5898823738098145, "learning_rate": 4.7707632754731345e-05, "loss": 0.5655, "mean_token_accuracy": 0.8227961301803589, "num_tokens": 210755416.0, "step": 6290 }, { "epoch": 0.3757760267430755, "grad_norm": 0.5943527221679688, "learning_rate": 4.770274708425887e-05, "loss": 0.62, "mean_token_accuracy": 0.8066026449203492, "num_tokens": 210923096.0, "step": 6295 }, { "epoch": 0.37607449856733527, "grad_norm": 0.5689024329185486, "learning_rate": 4.769785649303445e-05, "loss": 0.547, "mean_token_accuracy": 0.8284027218818665, "num_tokens": 211090776.0, "step": 6300 }, { "epoch": 0.376372970391595, "grad_norm": 0.5758464336395264, "learning_rate": 4.7692960982249265e-05, "loss": 0.5664, "mean_token_accuracy": 0.8225813984870911, "num_tokens": 211258456.0, "step": 6305 }, { "epoch": 0.3766714422158548, "grad_norm": 0.5908043384552002, "learning_rate": 4.768806055309571e-05, "loss": 0.6244, "mean_token_accuracy": 0.804723858833313, "num_tokens": 211426136.0, "step": 6310 }, { "epoch": 0.3769699140401146, "grad_norm": 0.6135730147361755, "learning_rate": 4.7683155206767385e-05, "loss": 0.5471, "mean_token_accuracy": 0.8274305105209351, "num_tokens": 211593816.0, "step": 6315 }, { "epoch": 0.3772683858643744, "grad_norm": 0.5765379667282104, "learning_rate": 4.7678244944459064e-05, "loss": 0.5968, "mean_token_accuracy": 0.814010500907898, "num_tokens": 211761496.0, "step": 6320 }, { "epoch": 0.3775668576886342, "grad_norm": 0.6675630211830139, "learning_rate": 4.767332976736675e-05, "loss": 0.6126, "mean_token_accuracy": 0.8097280144691468, "num_tokens": 211929176.0, "step": 6325 }, { "epoch": 0.377865329512894, "grad_norm": 0.5652087330818176, "learning_rate": 4.766840967668761e-05, "loss": 0.5985, "mean_token_accuracy": 0.8130979418754578, "num_tokens": 212096856.0, "step": 6330 }, { "epoch": 0.37816380133715377, "grad_norm": 0.597262978553772, "learning_rate": 4.7663484673620026e-05, "loss": 0.5925, "mean_token_accuracy": 0.8139210343360901, "num_tokens": 212264536.0, "step": 6335 }, { "epoch": 0.37846227316141356, "grad_norm": 0.5369839668273926, "learning_rate": 4.765855475936357e-05, "loss": 0.5344, "mean_token_accuracy": 0.830138373374939, "num_tokens": 212432216.0, "step": 6340 }, { "epoch": 0.37876074498567336, "grad_norm": 0.5923442840576172, "learning_rate": 4.7653619935119027e-05, "loss": 0.5988, "mean_token_accuracy": 0.8131396889686584, "num_tokens": 212599896.0, "step": 6345 }, { "epoch": 0.37905921680993315, "grad_norm": 0.6336514353752136, "learning_rate": 4.764868020208835e-05, "loss": 0.6201, "mean_token_accuracy": 0.808535122871399, "num_tokens": 212767576.0, "step": 6350 }, { "epoch": 0.37935768863419295, "grad_norm": 0.5950546860694885, "learning_rate": 4.76437355614747e-05, "loss": 0.6075, "mean_token_accuracy": 0.81177978515625, "num_tokens": 212935256.0, "step": 6355 }, { "epoch": 0.37965616045845274, "grad_norm": 0.5491812229156494, "learning_rate": 4.763878601448246e-05, "loss": 0.6138, "mean_token_accuracy": 0.8096564650535584, "num_tokens": 213098151.0, "step": 6360 }, { "epoch": 0.37995463228271253, "grad_norm": 0.5339094400405884, "learning_rate": 4.763383156231716e-05, "loss": 0.5647, "mean_token_accuracy": 0.8233806610107421, "num_tokens": 213265831.0, "step": 6365 }, { "epoch": 0.38025310410697233, "grad_norm": 0.5538163185119629, "learning_rate": 4.762887220618555e-05, "loss": 0.576, "mean_token_accuracy": 0.8189192414283752, "num_tokens": 213433511.0, "step": 6370 }, { "epoch": 0.38055157593123207, "grad_norm": 0.6217353940010071, "learning_rate": 4.762390794729558e-05, "loss": 0.5597, "mean_token_accuracy": 0.8234402894973755, "num_tokens": 213601191.0, "step": 6375 }, { "epoch": 0.38085004775549186, "grad_norm": 0.5830212235450745, "learning_rate": 4.761893878685639e-05, "loss": 0.5635, "mean_token_accuracy": 0.8230645298957825, "num_tokens": 213768871.0, "step": 6380 }, { "epoch": 0.38114851957975165, "grad_norm": 0.7095737457275391, "learning_rate": 4.76139647260783e-05, "loss": 0.6056, "mean_token_accuracy": 0.8110700249671936, "num_tokens": 213936551.0, "step": 6385 }, { "epoch": 0.38144699140401145, "grad_norm": 0.6289616823196411, "learning_rate": 4.7608985766172834e-05, "loss": 0.5641, "mean_token_accuracy": 0.8217463850975036, "num_tokens": 214104231.0, "step": 6390 }, { "epoch": 0.38174546322827124, "grad_norm": 0.6377663016319275, "learning_rate": 4.760400190835273e-05, "loss": 0.6435, "mean_token_accuracy": 0.8008230924606323, "num_tokens": 214271911.0, "step": 6395 }, { "epoch": 0.38204393505253104, "grad_norm": 0.5742936134338379, "learning_rate": 4.7599013153831875e-05, "loss": 0.5704, "mean_token_accuracy": 0.8208159446716309, "num_tokens": 214439591.0, "step": 6400 }, { "epoch": 0.38234240687679083, "grad_norm": 0.548237144947052, "learning_rate": 4.759401950382538e-05, "loss": 0.5801, "mean_token_accuracy": 0.8183406949043274, "num_tokens": 214607271.0, "step": 6405 }, { "epoch": 0.3826408787010506, "grad_norm": 0.6111705899238586, "learning_rate": 4.758902095954954e-05, "loss": 0.5866, "mean_token_accuracy": 0.8150065660476684, "num_tokens": 214774951.0, "step": 6410 }, { "epoch": 0.3829393505253104, "grad_norm": 0.6521761417388916, "learning_rate": 4.758401752222185e-05, "loss": 0.6238, "mean_token_accuracy": 0.80608971118927, "num_tokens": 214942631.0, "step": 6415 }, { "epoch": 0.3832378223495702, "grad_norm": 0.5645994544029236, "learning_rate": 4.757900919306098e-05, "loss": 0.5575, "mean_token_accuracy": 0.8229870080947876, "num_tokens": 215110311.0, "step": 6420 }, { "epoch": 0.38353629417383, "grad_norm": 0.6383602023124695, "learning_rate": 4.757399597328681e-05, "loss": 0.6136, "mean_token_accuracy": 0.8083502411842346, "num_tokens": 215277991.0, "step": 6425 }, { "epoch": 0.3838347659980898, "grad_norm": 0.5976201891899109, "learning_rate": 4.7568977864120395e-05, "loss": 0.6209, "mean_token_accuracy": 0.8083502411842346, "num_tokens": 215445671.0, "step": 6430 }, { "epoch": 0.3841332378223496, "grad_norm": 0.574621856212616, "learning_rate": 4.756395486678399e-05, "loss": 0.5663, "mean_token_accuracy": 0.8204580664634704, "num_tokens": 215613351.0, "step": 6435 }, { "epoch": 0.3844317096466094, "grad_norm": 0.6360025405883789, "learning_rate": 4.755892698250104e-05, "loss": 0.5734, "mean_token_accuracy": 0.8212871313095093, "num_tokens": 215781031.0, "step": 6440 }, { "epoch": 0.3847301814708691, "grad_norm": 0.5911943912506104, "learning_rate": 4.755389421249618e-05, "loss": 0.5859, "mean_token_accuracy": 0.8161934852600098, "num_tokens": 215948711.0, "step": 6445 }, { "epoch": 0.3850286532951289, "grad_norm": 0.6358640789985657, "learning_rate": 4.7548856557995234e-05, "loss": 0.5775, "mean_token_accuracy": 0.8203805208206176, "num_tokens": 216116391.0, "step": 6450 }, { "epoch": 0.3853271251193887, "grad_norm": 0.546363890171051, "learning_rate": 4.75438140202252e-05, "loss": 0.5743, "mean_token_accuracy": 0.8201717615127564, "num_tokens": 216284071.0, "step": 6455 }, { "epoch": 0.3856255969436485, "grad_norm": 0.6215122938156128, "learning_rate": 4.7538766600414304e-05, "loss": 0.5798, "mean_token_accuracy": 0.8191995739936828, "num_tokens": 216451751.0, "step": 6460 }, { "epoch": 0.3859240687679083, "grad_norm": 0.5524929165840149, "learning_rate": 4.753371429979192e-05, "loss": 0.5462, "mean_token_accuracy": 0.8284802556037902, "num_tokens": 216619431.0, "step": 6465 }, { "epoch": 0.3862225405921681, "grad_norm": 0.5880683064460754, "learning_rate": 4.752865711958866e-05, "loss": 0.6087, "mean_token_accuracy": 0.8109447598457337, "num_tokens": 216787111.0, "step": 6470 }, { "epoch": 0.3865210124164279, "grad_norm": 0.5194839239120483, "learning_rate": 4.752359506103626e-05, "loss": 0.5219, "mean_token_accuracy": 0.8333591818809509, "num_tokens": 216954791.0, "step": 6475 }, { "epoch": 0.3868194842406877, "grad_norm": 0.6575645208358765, "learning_rate": 4.7518528125367685e-05, "loss": 0.6105, "mean_token_accuracy": 0.808320415019989, "num_tokens": 217122471.0, "step": 6480 }, { "epoch": 0.3871179560649475, "grad_norm": 0.6227360367774963, "learning_rate": 4.751345631381709e-05, "loss": 0.6125, "mean_token_accuracy": 0.8094178676605225, "num_tokens": 217290151.0, "step": 6485 }, { "epoch": 0.38741642788920727, "grad_norm": 0.5925331711769104, "learning_rate": 4.7508379627619806e-05, "loss": 0.6085, "mean_token_accuracy": 0.8091971755027771, "num_tokens": 217457831.0, "step": 6490 }, { "epoch": 0.38771489971346706, "grad_norm": 0.5787959098815918, "learning_rate": 4.750329806801234e-05, "loss": 0.5641, "mean_token_accuracy": 0.8212453842163085, "num_tokens": 217625511.0, "step": 6495 }, { "epoch": 0.38801337153772686, "grad_norm": 0.5762561559677124, "learning_rate": 4.749821163623242e-05, "loss": 0.5683, "mean_token_accuracy": 0.8214004516601563, "num_tokens": 217793191.0, "step": 6500 }, { "epoch": 0.38831184336198665, "grad_norm": 0.580866813659668, "learning_rate": 4.7493120333518934e-05, "loss": 0.5739, "mean_token_accuracy": 0.8196469068527221, "num_tokens": 217960871.0, "step": 6505 }, { "epoch": 0.3886103151862464, "grad_norm": 0.6367342472076416, "learning_rate": 4.748802416111196e-05, "loss": 0.5974, "mean_token_accuracy": 0.811803650856018, "num_tokens": 218128551.0, "step": 6510 }, { "epoch": 0.3889087870105062, "grad_norm": 0.5629777312278748, "learning_rate": 4.7482923120252754e-05, "loss": 0.5564, "mean_token_accuracy": 0.8240427136421203, "num_tokens": 218296231.0, "step": 6515 }, { "epoch": 0.389207258834766, "grad_norm": 0.6516347527503967, "learning_rate": 4.747781721218379e-05, "loss": 0.6249, "mean_token_accuracy": 0.8065803408622741, "num_tokens": 218461234.0, "step": 6520 }, { "epoch": 0.3895057306590258, "grad_norm": 0.5853041410446167, "learning_rate": 4.747270643814869e-05, "loss": 0.6331, "mean_token_accuracy": 0.8044912338256835, "num_tokens": 218628914.0, "step": 6525 }, { "epoch": 0.38980420248328557, "grad_norm": 0.5444105267524719, "learning_rate": 4.746759079939229e-05, "loss": 0.5686, "mean_token_accuracy": 0.8207026243209838, "num_tokens": 218796594.0, "step": 6530 }, { "epoch": 0.39010267430754536, "grad_norm": 0.627741813659668, "learning_rate": 4.746247029716059e-05, "loss": 0.5789, "mean_token_accuracy": 0.817857563495636, "num_tokens": 218964274.0, "step": 6535 }, { "epoch": 0.39040114613180515, "grad_norm": 0.641633927822113, "learning_rate": 4.745734493270077e-05, "loss": 0.604, "mean_token_accuracy": 0.812417995929718, "num_tokens": 219131954.0, "step": 6540 }, { "epoch": 0.39069961795606495, "grad_norm": 0.7165398001670837, "learning_rate": 4.745221470726124e-05, "loss": 0.5866, "mean_token_accuracy": 0.8165394306182862, "num_tokens": 219299634.0, "step": 6545 }, { "epoch": 0.39099808978032474, "grad_norm": 0.6099076271057129, "learning_rate": 4.7447079622091535e-05, "loss": 0.5649, "mean_token_accuracy": 0.8223189830780029, "num_tokens": 219467314.0, "step": 6550 }, { "epoch": 0.39129656160458454, "grad_norm": 0.5561268329620361, "learning_rate": 4.744193967844241e-05, "loss": 0.5832, "mean_token_accuracy": 0.8191160678863525, "num_tokens": 219634994.0, "step": 6555 }, { "epoch": 0.39159503342884433, "grad_norm": 0.6390789151191711, "learning_rate": 4.7436794877565784e-05, "loss": 0.5905, "mean_token_accuracy": 0.8159906983375549, "num_tokens": 219802674.0, "step": 6560 }, { "epoch": 0.3918935052531041, "grad_norm": 0.604485034942627, "learning_rate": 4.7431645220714775e-05, "loss": 0.5844, "mean_token_accuracy": 0.8169330954551697, "num_tokens": 219970354.0, "step": 6565 }, { "epoch": 0.3921919770773639, "grad_norm": 0.6159762740135193, "learning_rate": 4.742649070914368e-05, "loss": 0.5583, "mean_token_accuracy": 0.8233389019966125, "num_tokens": 220138034.0, "step": 6570 }, { "epoch": 0.3924904489016237, "grad_norm": 0.6343648433685303, "learning_rate": 4.742133134410797e-05, "loss": 0.5981, "mean_token_accuracy": 0.8126863956451416, "num_tokens": 220305714.0, "step": 6575 }, { "epoch": 0.39278892072588345, "grad_norm": 0.5860236287117004, "learning_rate": 4.741616712686431e-05, "loss": 0.6068, "mean_token_accuracy": 0.8143018960952759, "num_tokens": 220466307.0, "step": 6580 }, { "epoch": 0.39308739255014324, "grad_norm": 0.5698738694190979, "learning_rate": 4.741099805867053e-05, "loss": 0.531, "mean_token_accuracy": 0.8326016902923584, "num_tokens": 220633987.0, "step": 6585 }, { "epoch": 0.39338586437440304, "grad_norm": 0.5755841135978699, "learning_rate": 4.740582414078566e-05, "loss": 0.5441, "mean_token_accuracy": 0.8273887634277344, "num_tokens": 220801667.0, "step": 6590 }, { "epoch": 0.39368433619866283, "grad_norm": 0.5453025698661804, "learning_rate": 4.7400645374469896e-05, "loss": 0.5596, "mean_token_accuracy": 0.8226649045944214, "num_tokens": 220969347.0, "step": 6595 }, { "epoch": 0.3939828080229226, "grad_norm": 0.5659153461456299, "learning_rate": 4.739546176098464e-05, "loss": 0.579, "mean_token_accuracy": 0.8178933501243592, "num_tokens": 221137027.0, "step": 6600 }, { "epoch": 0.3942812798471824, "grad_norm": 0.5193234086036682, "learning_rate": 4.7390273301592436e-05, "loss": 0.5313, "mean_token_accuracy": 0.8320708513259888, "num_tokens": 221304707.0, "step": 6605 }, { "epoch": 0.3945797516714422, "grad_norm": 0.6077717542648315, "learning_rate": 4.738507999755704e-05, "loss": 0.6023, "mean_token_accuracy": 0.8145830869674683, "num_tokens": 221472387.0, "step": 6610 }, { "epoch": 0.394878223495702, "grad_norm": 0.5752969980239868, "learning_rate": 4.737988185014337e-05, "loss": 0.5823, "mean_token_accuracy": 0.8173061609268188, "num_tokens": 221637199.0, "step": 6615 }, { "epoch": 0.3951766953199618, "grad_norm": 0.541581928730011, "learning_rate": 4.737467886061753e-05, "loss": 0.5808, "mean_token_accuracy": 0.817368483543396, "num_tokens": 221804879.0, "step": 6620 }, { "epoch": 0.3954751671442216, "grad_norm": 0.7009362578392029, "learning_rate": 4.736947103024682e-05, "loss": 0.6138, "mean_token_accuracy": 0.8095013737678528, "num_tokens": 221972559.0, "step": 6625 }, { "epoch": 0.3957736389684814, "grad_norm": 0.5366577506065369, "learning_rate": 4.7364258360299696e-05, "loss": 0.5713, "mean_token_accuracy": 0.8189550399780273, "num_tokens": 222140239.0, "step": 6630 }, { "epoch": 0.3960721107927412, "grad_norm": 0.6163941621780396, "learning_rate": 4.7359040852045784e-05, "loss": 0.5877, "mean_token_accuracy": 0.8163246989250184, "num_tokens": 222307919.0, "step": 6635 }, { "epoch": 0.396370582617001, "grad_norm": 0.6006356477737427, "learning_rate": 4.735381850675592e-05, "loss": 0.5771, "mean_token_accuracy": 0.8192174673080445, "num_tokens": 222475599.0, "step": 6640 }, { "epoch": 0.39666905444126077, "grad_norm": 0.5882141590118408, "learning_rate": 4.7348591325702096e-05, "loss": 0.633, "mean_token_accuracy": 0.8048968195915223, "num_tokens": 222643279.0, "step": 6645 }, { "epoch": 0.3969675262655205, "grad_norm": 0.725095808506012, "learning_rate": 4.73433593101575e-05, "loss": 0.5831, "mean_token_accuracy": 0.8178635358810424, "num_tokens": 222810959.0, "step": 6650 }, { "epoch": 0.3972659980897803, "grad_norm": 0.5529491305351257, "learning_rate": 4.733812246139647e-05, "loss": 0.579, "mean_token_accuracy": 0.8188297748565674, "num_tokens": 222978639.0, "step": 6655 }, { "epoch": 0.3975644699140401, "grad_norm": 0.5802292823791504, "learning_rate": 4.733288078069455e-05, "loss": 0.5735, "mean_token_accuracy": 0.8197721600532532, "num_tokens": 223146319.0, "step": 6660 }, { "epoch": 0.3978629417382999, "grad_norm": 0.7063968777656555, "learning_rate": 4.732763426932844e-05, "loss": 0.5993, "mean_token_accuracy": 0.8129905819892883, "num_tokens": 223313999.0, "step": 6665 }, { "epoch": 0.3981614135625597, "grad_norm": 0.6847906708717346, "learning_rate": 4.7322382928576033e-05, "loss": 0.5669, "mean_token_accuracy": 0.8226708889007568, "num_tokens": 223481679.0, "step": 6670 }, { "epoch": 0.3984598853868195, "grad_norm": 0.5250869393348694, "learning_rate": 4.731712675971637e-05, "loss": 0.5679, "mean_token_accuracy": 0.8215734362602234, "num_tokens": 223649359.0, "step": 6675 }, { "epoch": 0.3987583572110793, "grad_norm": 0.5753973722457886, "learning_rate": 4.7311865764029716e-05, "loss": 0.5904, "mean_token_accuracy": 0.8140641689300537, "num_tokens": 223817039.0, "step": 6680 }, { "epoch": 0.39905682903533907, "grad_norm": 0.6264418363571167, "learning_rate": 4.730659994279747e-05, "loss": 0.6066, "mean_token_accuracy": 0.8108910918235779, "num_tokens": 223984719.0, "step": 6685 }, { "epoch": 0.39935530085959886, "grad_norm": 0.6425790786743164, "learning_rate": 4.730132929730221e-05, "loss": 0.6017, "mean_token_accuracy": 0.8102588653564453, "num_tokens": 224152399.0, "step": 6690 }, { "epoch": 0.39965377268385865, "grad_norm": 0.612373948097229, "learning_rate": 4.7296053828827716e-05, "loss": 0.6163, "mean_token_accuracy": 0.8088214278221131, "num_tokens": 224320079.0, "step": 6695 }, { "epoch": 0.39995224450811845, "grad_norm": 0.5595587491989136, "learning_rate": 4.7290773538658925e-05, "loss": 0.6089, "mean_token_accuracy": 0.8089130282402038, "num_tokens": 224483175.0, "step": 6700 }, { "epoch": 0.40025071633237824, "grad_norm": 0.555931806564331, "learning_rate": 4.7285488428081934e-05, "loss": 0.5823, "mean_token_accuracy": 0.8164738297462464, "num_tokens": 224650855.0, "step": 6705 }, { "epoch": 0.40054918815663804, "grad_norm": 0.6434721946716309, "learning_rate": 4.728019849838404e-05, "loss": 0.5982, "mean_token_accuracy": 0.8130502343177796, "num_tokens": 224818535.0, "step": 6710 }, { "epoch": 0.40084765998089783, "grad_norm": 0.6346591114997864, "learning_rate": 4.7274903750853696e-05, "loss": 0.5638, "mean_token_accuracy": 0.82160325050354, "num_tokens": 224986215.0, "step": 6715 }, { "epoch": 0.40114613180515757, "grad_norm": 0.5797615051269531, "learning_rate": 4.7269604186780546e-05, "loss": 0.57, "mean_token_accuracy": 0.8205057978630066, "num_tokens": 225153895.0, "step": 6720 }, { "epoch": 0.40144460362941736, "grad_norm": 0.5798340439796448, "learning_rate": 4.72642998074554e-05, "loss": 0.56, "mean_token_accuracy": 0.8243588209152222, "num_tokens": 225321575.0, "step": 6725 }, { "epoch": 0.40174307545367716, "grad_norm": 0.5631234049797058, "learning_rate": 4.725899061417021e-05, "loss": 0.5925, "mean_token_accuracy": 0.8168436169624329, "num_tokens": 225489255.0, "step": 6730 }, { "epoch": 0.40204154727793695, "grad_norm": 0.5207034945487976, "learning_rate": 4.725367660821815e-05, "loss": 0.5435, "mean_token_accuracy": 0.829875934123993, "num_tokens": 225656935.0, "step": 6735 }, { "epoch": 0.40234001910219674, "grad_norm": 0.6461813449859619, "learning_rate": 4.724835779089355e-05, "loss": 0.5983, "mean_token_accuracy": 0.8127281427383423, "num_tokens": 225824615.0, "step": 6740 }, { "epoch": 0.40263849092645654, "grad_norm": 0.5818511843681335, "learning_rate": 4.72430341634919e-05, "loss": 0.5735, "mean_token_accuracy": 0.8205236911773681, "num_tokens": 225992295.0, "step": 6745 }, { "epoch": 0.40293696275071633, "grad_norm": 0.525959312915802, "learning_rate": 4.723770572730986e-05, "loss": 0.5518, "mean_token_accuracy": 0.8268280982971191, "num_tokens": 226159975.0, "step": 6750 }, { "epoch": 0.4032354345749761, "grad_norm": 0.5492711067199707, "learning_rate": 4.723237248364527e-05, "loss": 0.5415, "mean_token_accuracy": 0.8284027218818665, "num_tokens": 226327655.0, "step": 6755 }, { "epoch": 0.4035339063992359, "grad_norm": 0.6119962930679321, "learning_rate": 4.722703443379714e-05, "loss": 0.5662, "mean_token_accuracy": 0.8214779853820801, "num_tokens": 226495335.0, "step": 6760 }, { "epoch": 0.4038323782234957, "grad_norm": 0.6557056307792664, "learning_rate": 4.722169157906567e-05, "loss": 0.5571, "mean_token_accuracy": 0.8267744302749633, "num_tokens": 226663015.0, "step": 6765 }, { "epoch": 0.4041308500477555, "grad_norm": 0.5651956796646118, "learning_rate": 4.7216343920752185e-05, "loss": 0.5733, "mean_token_accuracy": 0.8198258399963378, "num_tokens": 226830695.0, "step": 6770 }, { "epoch": 0.4044293218720153, "grad_norm": 0.5722317695617676, "learning_rate": 4.721099146015921e-05, "loss": 0.6007, "mean_token_accuracy": 0.812429916858673, "num_tokens": 226998375.0, "step": 6775 }, { "epoch": 0.4047277936962751, "grad_norm": 0.5798092484474182, "learning_rate": 4.7205634198590446e-05, "loss": 0.559, "mean_token_accuracy": 0.8246212601661682, "num_tokens": 227166055.0, "step": 6780 }, { "epoch": 0.40502626552053483, "grad_norm": 0.58822101354599, "learning_rate": 4.720027213735076e-05, "loss": 0.5171, "mean_token_accuracy": 0.8375820040702819, "num_tokens": 227333735.0, "step": 6785 }, { "epoch": 0.40532473734479463, "grad_norm": 0.5558359026908875, "learning_rate": 4.719490527774616e-05, "loss": 0.5695, "mean_token_accuracy": 0.8221817970275879, "num_tokens": 227501415.0, "step": 6790 }, { "epoch": 0.4056232091690544, "grad_norm": 0.5774255990982056, "learning_rate": 4.7189533621083856e-05, "loss": 0.577, "mean_token_accuracy": 0.816628885269165, "num_tokens": 227669095.0, "step": 6795 }, { "epoch": 0.4059216809933142, "grad_norm": 0.6004100441932678, "learning_rate": 4.71841571686722e-05, "loss": 0.6043, "mean_token_accuracy": 0.8094954013824462, "num_tokens": 227836775.0, "step": 6800 }, { "epoch": 0.406220152817574, "grad_norm": 0.622145414352417, "learning_rate": 4.7178775921820754e-05, "loss": 0.6366, "mean_token_accuracy": 0.8022784113883972, "num_tokens": 228004455.0, "step": 6805 }, { "epoch": 0.4065186246418338, "grad_norm": 0.5691177845001221, "learning_rate": 4.71733898818402e-05, "loss": 0.5774, "mean_token_accuracy": 0.819479787349701, "num_tokens": 228169230.0, "step": 6810 }, { "epoch": 0.4068170964660936, "grad_norm": 0.566308856010437, "learning_rate": 4.7167999050042405e-05, "loss": 0.5765, "mean_token_accuracy": 0.8199809193611145, "num_tokens": 228336910.0, "step": 6815 }, { "epoch": 0.4071155682903534, "grad_norm": 0.6092915534973145, "learning_rate": 4.716260342774041e-05, "loss": 0.6006, "mean_token_accuracy": 0.8117917299270629, "num_tokens": 228504590.0, "step": 6820 }, { "epoch": 0.4074140401146132, "grad_norm": 0.5469214916229248, "learning_rate": 4.715720301624843e-05, "loss": 0.5788, "mean_token_accuracy": 0.821227490901947, "num_tokens": 228672270.0, "step": 6825 }, { "epoch": 0.407712511938873, "grad_norm": 0.5775798559188843, "learning_rate": 4.7151797816881823e-05, "loss": 0.5487, "mean_token_accuracy": 0.8256352186203003, "num_tokens": 228839950.0, "step": 6830 }, { "epoch": 0.4080109837631328, "grad_norm": 0.5717394948005676, "learning_rate": 4.714638783095712e-05, "loss": 0.5429, "mean_token_accuracy": 0.8299296140670777, "num_tokens": 229007630.0, "step": 6835 }, { "epoch": 0.40830945558739257, "grad_norm": 0.5832452774047852, "learning_rate": 4.714097305979203e-05, "loss": 0.5517, "mean_token_accuracy": 0.8249135136604309, "num_tokens": 229175310.0, "step": 6840 }, { "epoch": 0.40860792741165236, "grad_norm": 0.6015300750732422, "learning_rate": 4.713555350470542e-05, "loss": 0.5962, "mean_token_accuracy": 0.8129547834396362, "num_tokens": 229342990.0, "step": 6845 }, { "epoch": 0.40890639923591215, "grad_norm": 0.5837094783782959, "learning_rate": 4.713012916701734e-05, "loss": 0.5924, "mean_token_accuracy": 0.8144041538238526, "num_tokens": 229510670.0, "step": 6850 }, { "epoch": 0.4092048710601719, "grad_norm": 0.6850583553314209, "learning_rate": 4.712470004804895e-05, "loss": 0.6467, "mean_token_accuracy": 0.8003697991371155, "num_tokens": 229678350.0, "step": 6855 }, { "epoch": 0.4095033428844317, "grad_norm": 0.6050103902816772, "learning_rate": 4.7119266149122646e-05, "loss": 0.6092, "mean_token_accuracy": 0.8098234534263611, "num_tokens": 229846030.0, "step": 6860 }, { "epoch": 0.4098018147086915, "grad_norm": 0.5681546330451965, "learning_rate": 4.7113827471561934e-05, "loss": 0.5642, "mean_token_accuracy": 0.8228677153587342, "num_tokens": 230013710.0, "step": 6865 }, { "epoch": 0.4101002865329513, "grad_norm": 0.5979629755020142, "learning_rate": 4.710838401669152e-05, "loss": 0.5581, "mean_token_accuracy": 0.8249135136604309, "num_tokens": 230181390.0, "step": 6870 }, { "epoch": 0.41039875835721107, "grad_norm": 0.5612121224403381, "learning_rate": 4.7102935785837244e-05, "loss": 0.5314, "mean_token_accuracy": 0.8320708632469177, "num_tokens": 230349070.0, "step": 6875 }, { "epoch": 0.41069723018147086, "grad_norm": 0.8240800499916077, "learning_rate": 4.7097482780326126e-05, "loss": 0.557, "mean_token_accuracy": 0.8262256979942322, "num_tokens": 230516750.0, "step": 6880 }, { "epoch": 0.41099570200573066, "grad_norm": 0.6205378770828247, "learning_rate": 4.7092025001486344e-05, "loss": 0.609, "mean_token_accuracy": 0.8097220659255981, "num_tokens": 230684430.0, "step": 6885 }, { "epoch": 0.41129417382999045, "grad_norm": 0.5555119514465332, "learning_rate": 4.708656245064726e-05, "loss": 0.5588, "mean_token_accuracy": 0.8247882723808289, "num_tokens": 230852110.0, "step": 6890 }, { "epoch": 0.41159264565425024, "grad_norm": 0.5389045476913452, "learning_rate": 4.708109512913935e-05, "loss": 0.5726, "mean_token_accuracy": 0.8197184681892395, "num_tokens": 231019790.0, "step": 6895 }, { "epoch": 0.41189111747851004, "grad_norm": 0.5762794613838196, "learning_rate": 4.707562303829431e-05, "loss": 0.5791, "mean_token_accuracy": 0.8184605956077575, "num_tokens": 231182870.0, "step": 6900 }, { "epoch": 0.41218958930276983, "grad_norm": 0.5309786796569824, "learning_rate": 4.707014617944495e-05, "loss": 0.5582, "mean_token_accuracy": 0.8255874872207641, "num_tokens": 231350550.0, "step": 6905 }, { "epoch": 0.4124880611270296, "grad_norm": 0.5733486413955688, "learning_rate": 4.706466455392526e-05, "loss": 0.5947, "mean_token_accuracy": 0.8153226852416993, "num_tokens": 231518230.0, "step": 6910 }, { "epoch": 0.4127865329512894, "grad_norm": 0.5413124561309814, "learning_rate": 4.7059178163070395e-05, "loss": 0.5789, "mean_token_accuracy": 0.8188118815422059, "num_tokens": 231685910.0, "step": 6915 }, { "epoch": 0.4130850047755492, "grad_norm": 0.626871645450592, "learning_rate": 4.705368700821667e-05, "loss": 0.5431, "mean_token_accuracy": 0.8269951105117798, "num_tokens": 231853590.0, "step": 6920 }, { "epoch": 0.41338347659980895, "grad_norm": 0.5655709505081177, "learning_rate": 4.704819109070155e-05, "loss": 0.564, "mean_token_accuracy": 0.8230585813522339, "num_tokens": 232021270.0, "step": 6925 }, { "epoch": 0.41368194842406875, "grad_norm": 0.5631151795387268, "learning_rate": 4.7042690411863674e-05, "loss": 0.5781, "mean_token_accuracy": 0.8179649353027344, "num_tokens": 232188950.0, "step": 6930 }, { "epoch": 0.41398042024832854, "grad_norm": 1.1137640476226807, "learning_rate": 4.7037184973042834e-05, "loss": 0.6052, "mean_token_accuracy": 0.8105809450149536, "num_tokens": 232356630.0, "step": 6935 }, { "epoch": 0.41427889207258833, "grad_norm": 0.6091469526290894, "learning_rate": 4.703167477557997e-05, "loss": 0.5846, "mean_token_accuracy": 0.8158236980438233, "num_tokens": 232524310.0, "step": 6940 }, { "epoch": 0.41457736389684813, "grad_norm": 0.7949170470237732, "learning_rate": 4.702615982081722e-05, "loss": 0.6359, "mean_token_accuracy": 0.8052422165870666, "num_tokens": 232686315.0, "step": 6945 }, { "epoch": 0.4148758357211079, "grad_norm": 0.5734235644340515, "learning_rate": 4.702064011009782e-05, "loss": 0.6142, "mean_token_accuracy": 0.8074973225593567, "num_tokens": 232853995.0, "step": 6950 }, { "epoch": 0.4151743075453677, "grad_norm": 0.5585874319076538, "learning_rate": 4.7015115644766214e-05, "loss": 0.574, "mean_token_accuracy": 0.820100212097168, "num_tokens": 233021675.0, "step": 6955 }, { "epoch": 0.4154727793696275, "grad_norm": 0.5752120614051819, "learning_rate": 4.700958642616799e-05, "loss": 0.5851, "mean_token_accuracy": 0.8153346061706543, "num_tokens": 233189355.0, "step": 6960 }, { "epoch": 0.4157712511938873, "grad_norm": 0.6133684515953064, "learning_rate": 4.7004052455649876e-05, "loss": 0.607, "mean_token_accuracy": 0.8133245825767517, "num_tokens": 233357035.0, "step": 6965 }, { "epoch": 0.4160697230181471, "grad_norm": 0.5853560566902161, "learning_rate": 4.699851373455979e-05, "loss": 0.6178, "mean_token_accuracy": 0.8065907239913941, "num_tokens": 233524715.0, "step": 6970 }, { "epoch": 0.4163681948424069, "grad_norm": 0.5792456865310669, "learning_rate": 4.6992970264246775e-05, "loss": 0.546, "mean_token_accuracy": 0.826428484916687, "num_tokens": 233692395.0, "step": 6975 }, { "epoch": 0.4166666666666667, "grad_norm": 0.6258230805397034, "learning_rate": 4.698742204606107e-05, "loss": 0.6231, "mean_token_accuracy": 0.8080758690834046, "num_tokens": 233860075.0, "step": 6980 }, { "epoch": 0.4169651384909265, "grad_norm": 0.5646248459815979, "learning_rate": 4.698186908135401e-05, "loss": 0.5963, "mean_token_accuracy": 0.8156209111213684, "num_tokens": 234027755.0, "step": 6985 }, { "epoch": 0.4172636103151863, "grad_norm": 0.5563027262687683, "learning_rate": 4.697631137147815e-05, "loss": 0.5796, "mean_token_accuracy": 0.8195276021957397, "num_tokens": 234195435.0, "step": 6990 }, { "epoch": 0.417562082139446, "grad_norm": 0.5151941180229187, "learning_rate": 4.697074891778716e-05, "loss": 0.594, "mean_token_accuracy": 0.8130442500114441, "num_tokens": 234363115.0, "step": 6995 }, { "epoch": 0.4178605539637058, "grad_norm": 0.6522302627563477, "learning_rate": 4.696518172163589e-05, "loss": 0.6223, "mean_token_accuracy": 0.8075330972671508, "num_tokens": 234530795.0, "step": 7000 }, { "epoch": 0.4181590257879656, "grad_norm": 0.5927281975746155, "learning_rate": 4.695960978438033e-05, "loss": 0.5562, "mean_token_accuracy": 0.8239830613136292, "num_tokens": 234698475.0, "step": 7005 }, { "epoch": 0.4184574976122254, "grad_norm": 0.6028661727905273, "learning_rate": 4.69540331073776e-05, "loss": 0.5674, "mean_token_accuracy": 0.8212095975875855, "num_tokens": 234866155.0, "step": 7010 }, { "epoch": 0.4187559694364852, "grad_norm": 0.5302370190620422, "learning_rate": 4.6948451691986045e-05, "loss": 0.5316, "mean_token_accuracy": 0.8314267039299011, "num_tokens": 235033835.0, "step": 7015 }, { "epoch": 0.419054441260745, "grad_norm": 0.5614240169525146, "learning_rate": 4.69428655395651e-05, "loss": 0.5691, "mean_token_accuracy": 0.8193367481231689, "num_tokens": 235201515.0, "step": 7020 }, { "epoch": 0.4193529130850048, "grad_norm": 0.632599413394928, "learning_rate": 4.6937274651475385e-05, "loss": 0.6344, "mean_token_accuracy": 0.8035667419433594, "num_tokens": 235369195.0, "step": 7025 }, { "epoch": 0.41965138490926457, "grad_norm": 0.5534483790397644, "learning_rate": 4.693167902907865e-05, "loss": 0.5897, "mean_token_accuracy": 0.8155195116996765, "num_tokens": 235536875.0, "step": 7030 }, { "epoch": 0.41994985673352436, "grad_norm": 0.5887237787246704, "learning_rate": 4.692607867373781e-05, "loss": 0.5653, "mean_token_accuracy": 0.8224442362785339, "num_tokens": 235704555.0, "step": 7035 }, { "epoch": 0.42024832855778416, "grad_norm": 0.6462065577507019, "learning_rate": 4.692047358681696e-05, "loss": 0.625, "mean_token_accuracy": 0.8062925100326538, "num_tokens": 235872235.0, "step": 7040 }, { "epoch": 0.42054680038204395, "grad_norm": 0.5301601886749268, "learning_rate": 4.69148637696813e-05, "loss": 0.5651, "mean_token_accuracy": 0.8209352374076844, "num_tokens": 236039915.0, "step": 7045 }, { "epoch": 0.42084527220630374, "grad_norm": 0.5547974109649658, "learning_rate": 4.6909249223697216e-05, "loss": 0.5528, "mean_token_accuracy": 0.8276810050010681, "num_tokens": 236207595.0, "step": 7050 }, { "epoch": 0.42114374403056354, "grad_norm": 0.6368840932846069, "learning_rate": 4.6903629950232225e-05, "loss": 0.5941, "mean_token_accuracy": 0.8133245825767517, "num_tokens": 236375275.0, "step": 7055 }, { "epoch": 0.42144221585482333, "grad_norm": 0.6618936061859131, "learning_rate": 4.689800595065501e-05, "loss": 0.5829, "mean_token_accuracy": 0.8166944980621338, "num_tokens": 236542955.0, "step": 7060 }, { "epoch": 0.42174068767908307, "grad_norm": 0.630326509475708, "learning_rate": 4.6892377226335405e-05, "loss": 0.592, "mean_token_accuracy": 0.8146367788314819, "num_tokens": 236710635.0, "step": 7065 }, { "epoch": 0.42203915950334286, "grad_norm": 0.6314406394958496, "learning_rate": 4.6886743778644385e-05, "loss": 0.5805, "mean_token_accuracy": 0.8181378960609436, "num_tokens": 236878315.0, "step": 7070 }, { "epoch": 0.42233763132760266, "grad_norm": 0.5336928367614746, "learning_rate": 4.688110560895407e-05, "loss": 0.5381, "mean_token_accuracy": 0.8289812684059144, "num_tokens": 237045995.0, "step": 7075 }, { "epoch": 0.42263610315186245, "grad_norm": 0.5644545555114746, "learning_rate": 4.687546271863776e-05, "loss": 0.5634, "mean_token_accuracy": 0.8218716502189636, "num_tokens": 237213675.0, "step": 7080 }, { "epoch": 0.42293457497612225, "grad_norm": 0.5761260986328125, "learning_rate": 4.686981510906986e-05, "loss": 0.5914, "mean_token_accuracy": 0.8146844744682312, "num_tokens": 237381355.0, "step": 7085 }, { "epoch": 0.42323304680038204, "grad_norm": 0.6816636323928833, "learning_rate": 4.686416278162598e-05, "loss": 0.6247, "mean_token_accuracy": 0.8046164870262146, "num_tokens": 237549035.0, "step": 7090 }, { "epoch": 0.42353151862464183, "grad_norm": 0.5634966492652893, "learning_rate": 4.685850573768283e-05, "loss": 0.5327, "mean_token_accuracy": 0.831766664981842, "num_tokens": 237716715.0, "step": 7095 }, { "epoch": 0.42382999044890163, "grad_norm": 0.5552952885627747, "learning_rate": 4.685284397861828e-05, "loss": 0.5691, "mean_token_accuracy": 0.8212573051452636, "num_tokens": 237884395.0, "step": 7100 }, { "epoch": 0.4241284622731614, "grad_norm": 0.5909750461578369, "learning_rate": 4.684717750581138e-05, "loss": 0.6142, "mean_token_accuracy": 0.8085888147354126, "num_tokens": 238052075.0, "step": 7105 }, { "epoch": 0.4244269340974212, "grad_norm": 0.5910950303077698, "learning_rate": 4.684150632064228e-05, "loss": 0.614, "mean_token_accuracy": 0.8090182662010192, "num_tokens": 238219755.0, "step": 7110 }, { "epoch": 0.424725405921681, "grad_norm": 0.5288212299346924, "learning_rate": 4.683583042449232e-05, "loss": 0.5398, "mean_token_accuracy": 0.8288918137550354, "num_tokens": 238387435.0, "step": 7115 }, { "epoch": 0.4250238777459408, "grad_norm": 0.637802243232727, "learning_rate": 4.6830149818743956e-05, "loss": 0.5676, "mean_token_accuracy": 0.8223010897636414, "num_tokens": 238555115.0, "step": 7120 }, { "epoch": 0.4253223495702006, "grad_norm": 0.6213273406028748, "learning_rate": 4.682446450478082e-05, "loss": 0.597, "mean_token_accuracy": 0.81300847530365, "num_tokens": 238722795.0, "step": 7125 }, { "epoch": 0.42562082139446034, "grad_norm": 0.5779014229774475, "learning_rate": 4.6818774483987655e-05, "loss": 0.6167, "mean_token_accuracy": 0.8072706699371338, "num_tokens": 238890475.0, "step": 7130 }, { "epoch": 0.42591929321872013, "grad_norm": 0.6011877059936523, "learning_rate": 4.6813079757750386e-05, "loss": 0.5877, "mean_token_accuracy": 0.8154956459999084, "num_tokens": 239058155.0, "step": 7135 }, { "epoch": 0.4262177650429799, "grad_norm": 0.6756574511528015, "learning_rate": 4.680738032745606e-05, "loss": 0.5846, "mean_token_accuracy": 0.8168197631835937, "num_tokens": 239225835.0, "step": 7140 }, { "epoch": 0.4265162368672397, "grad_norm": 0.523808479309082, "learning_rate": 4.6801676194492884e-05, "loss": 0.5749, "mean_token_accuracy": 0.8182392954826355, "num_tokens": 239393515.0, "step": 7145 }, { "epoch": 0.4268147086914995, "grad_norm": 0.5292114019393921, "learning_rate": 4.67959673602502e-05, "loss": 0.5163, "mean_token_accuracy": 0.8359119772911072, "num_tokens": 239561195.0, "step": 7150 }, { "epoch": 0.4271131805157593, "grad_norm": 0.6238457560539246, "learning_rate": 4.67902538261185e-05, "loss": 0.5963, "mean_token_accuracy": 0.81266850233078, "num_tokens": 239728875.0, "step": 7155 }, { "epoch": 0.4274116523400191, "grad_norm": 0.5610671639442444, "learning_rate": 4.678453559348943e-05, "loss": 0.6089, "mean_token_accuracy": 0.811314582824707, "num_tokens": 239896555.0, "step": 7160 }, { "epoch": 0.4277101241642789, "grad_norm": 0.569820761680603, "learning_rate": 4.677881266375576e-05, "loss": 0.5406, "mean_token_accuracy": 0.8295717597007751, "num_tokens": 240064235.0, "step": 7165 }, { "epoch": 0.4280085959885387, "grad_norm": 0.5484038591384888, "learning_rate": 4.677308503831143e-05, "loss": 0.5154, "mean_token_accuracy": 0.8342180609703064, "num_tokens": 240231915.0, "step": 7170 }, { "epoch": 0.4283070678127985, "grad_norm": 0.5385307669639587, "learning_rate": 4.6767352718551484e-05, "loss": 0.5743, "mean_token_accuracy": 0.8187880158424378, "num_tokens": 240399595.0, "step": 7175 }, { "epoch": 0.4286055396370583, "grad_norm": 0.5796418786048889, "learning_rate": 4.676161570587216e-05, "loss": 0.5965, "mean_token_accuracy": 0.8141775131225586, "num_tokens": 240567275.0, "step": 7180 }, { "epoch": 0.42890401146131807, "grad_norm": 0.5219092965126038, "learning_rate": 4.675587400167079e-05, "loss": 0.5903, "mean_token_accuracy": 0.8163843512535095, "num_tokens": 240734955.0, "step": 7185 }, { "epoch": 0.42920248328557786, "grad_norm": 0.6324279308319092, "learning_rate": 4.675012760734589e-05, "loss": 0.5435, "mean_token_accuracy": 0.8295657873153687, "num_tokens": 240902635.0, "step": 7190 }, { "epoch": 0.42950095510983766, "grad_norm": 0.5694840550422668, "learning_rate": 4.67443765242971e-05, "loss": 0.5701, "mean_token_accuracy": 0.8202791452407837, "num_tokens": 241070315.0, "step": 7195 }, { "epoch": 0.4297994269340974, "grad_norm": 0.588840901851654, "learning_rate": 4.6738620753925197e-05, "loss": 0.5626, "mean_token_accuracy": 0.8215137720108032, "num_tokens": 241237995.0, "step": 7200 }, { "epoch": 0.4300978987583572, "grad_norm": 0.5361875891685486, "learning_rate": 4.6732860297632095e-05, "loss": 0.5586, "mean_token_accuracy": 0.8234164476394653, "num_tokens": 241405675.0, "step": 7205 }, { "epoch": 0.430396370582617, "grad_norm": 0.63075852394104, "learning_rate": 4.6727095156820874e-05, "loss": 0.5828, "mean_token_accuracy": 0.817517614364624, "num_tokens": 241573355.0, "step": 7210 }, { "epoch": 0.4306948424068768, "grad_norm": 0.6125807166099548, "learning_rate": 4.6721325332895743e-05, "loss": 0.565, "mean_token_accuracy": 0.8224263429641724, "num_tokens": 241741035.0, "step": 7215 }, { "epoch": 0.43099331423113657, "grad_norm": 0.5737380981445312, "learning_rate": 4.671555082726204e-05, "loss": 0.5527, "mean_token_accuracy": 0.8244542479515076, "num_tokens": 241908715.0, "step": 7220 }, { "epoch": 0.43129178605539636, "grad_norm": 0.5836515426635742, "learning_rate": 4.6709771641326244e-05, "loss": 0.5587, "mean_token_accuracy": 0.8233389019966125, "num_tokens": 242076395.0, "step": 7225 }, { "epoch": 0.43159025787965616, "grad_norm": 0.5387793183326721, "learning_rate": 4.6703987776496004e-05, "loss": 0.5468, "mean_token_accuracy": 0.8256829380989075, "num_tokens": 242244075.0, "step": 7230 }, { "epoch": 0.43188872970391595, "grad_norm": 0.5948957800865173, "learning_rate": 4.669819923418008e-05, "loss": 0.6127, "mean_token_accuracy": 0.8066623091697693, "num_tokens": 242411755.0, "step": 7235 }, { "epoch": 0.43218720152817575, "grad_norm": 0.5518760085105896, "learning_rate": 4.669240601578835e-05, "loss": 0.5872, "mean_token_accuracy": 0.8159429788589477, "num_tokens": 242579435.0, "step": 7240 }, { "epoch": 0.43248567335243554, "grad_norm": 0.5337597131729126, "learning_rate": 4.6686608122731906e-05, "loss": 0.5195, "mean_token_accuracy": 0.8343254208564759, "num_tokens": 242747115.0, "step": 7245 }, { "epoch": 0.43278414517669533, "grad_norm": 0.551590085029602, "learning_rate": 4.6680805556422905e-05, "loss": 0.5188, "mean_token_accuracy": 0.8353870749473572, "num_tokens": 242914795.0, "step": 7250 }, { "epoch": 0.43308261700095513, "grad_norm": 0.5611405372619629, "learning_rate": 4.6674998318274674e-05, "loss": 0.5933, "mean_token_accuracy": 0.8142669677734375, "num_tokens": 243082475.0, "step": 7255 }, { "epoch": 0.4333810888252149, "grad_norm": 0.5417499542236328, "learning_rate": 4.666918640970166e-05, "loss": 0.5946, "mean_token_accuracy": 0.8130681157112122, "num_tokens": 243250155.0, "step": 7260 }, { "epoch": 0.4336795606494747, "grad_norm": 0.600557267665863, "learning_rate": 4.666336983211949e-05, "loss": 0.6019, "mean_token_accuracy": 0.8110342264175415, "num_tokens": 243417835.0, "step": 7265 }, { "epoch": 0.43397803247373445, "grad_norm": 0.5825746059417725, "learning_rate": 4.6657548586944884e-05, "loss": 0.5592, "mean_token_accuracy": 0.8250805139541626, "num_tokens": 243585515.0, "step": 7270 }, { "epoch": 0.43427650429799425, "grad_norm": 0.5456596612930298, "learning_rate": 4.66517226755957e-05, "loss": 0.5563, "mean_token_accuracy": 0.8250387787818909, "num_tokens": 243753195.0, "step": 7275 }, { "epoch": 0.43457497612225404, "grad_norm": 0.5647388100624084, "learning_rate": 4.6645892099490963e-05, "loss": 0.5776, "mean_token_accuracy": 0.8185196280479431, "num_tokens": 243920875.0, "step": 7280 }, { "epoch": 0.43487344794651384, "grad_norm": 0.5186007618904114, "learning_rate": 4.6640056860050814e-05, "loss": 0.5511, "mean_token_accuracy": 0.825790286064148, "num_tokens": 244088555.0, "step": 7285 }, { "epoch": 0.43517191977077363, "grad_norm": 0.5805457234382629, "learning_rate": 4.6634216958696534e-05, "loss": 0.5575, "mean_token_accuracy": 0.8240307807922364, "num_tokens": 244256235.0, "step": 7290 }, { "epoch": 0.4354703915950334, "grad_norm": 0.5633774995803833, "learning_rate": 4.662837239685052e-05, "loss": 0.6312, "mean_token_accuracy": 0.8053202867507935, "num_tokens": 244423915.0, "step": 7295 }, { "epoch": 0.4357688634192932, "grad_norm": 0.7506545186042786, "learning_rate": 4.662252317593636e-05, "loss": 0.577, "mean_token_accuracy": 0.8191458821296692, "num_tokens": 244591595.0, "step": 7300 }, { "epoch": 0.436067335243553, "grad_norm": 0.6479071378707886, "learning_rate": 4.6616669297378705e-05, "loss": 0.5864, "mean_token_accuracy": 0.8154181122779847, "num_tokens": 244759275.0, "step": 7305 }, { "epoch": 0.4363658070678128, "grad_norm": 0.5674999356269836, "learning_rate": 4.6610810762603405e-05, "loss": 0.5496, "mean_token_accuracy": 0.8271919369697571, "num_tokens": 244926955.0, "step": 7310 }, { "epoch": 0.4366642788920726, "grad_norm": 0.554707944393158, "learning_rate": 4.6604947573037386e-05, "loss": 0.5783, "mean_token_accuracy": 0.8181140422821045, "num_tokens": 245094635.0, "step": 7315 }, { "epoch": 0.4369627507163324, "grad_norm": 0.5681338906288147, "learning_rate": 4.6599079730108745e-05, "loss": 0.6185, "mean_token_accuracy": 0.8066563248634339, "num_tokens": 245262315.0, "step": 7320 }, { "epoch": 0.4372612225405922, "grad_norm": 0.6053183078765869, "learning_rate": 4.659320723524672e-05, "loss": 0.5287, "mean_token_accuracy": 0.8312119722366333, "num_tokens": 245429995.0, "step": 7325 }, { "epoch": 0.437559694364852, "grad_norm": 0.5857833027839661, "learning_rate": 4.658733008988164e-05, "loss": 0.5705, "mean_token_accuracy": 0.8193784952163696, "num_tokens": 245597675.0, "step": 7330 }, { "epoch": 0.4378581661891118, "grad_norm": 0.5568007826805115, "learning_rate": 4.6581448295445016e-05, "loss": 0.5547, "mean_token_accuracy": 0.8239949822425843, "num_tokens": 245765355.0, "step": 7335 }, { "epoch": 0.4381566380133715, "grad_norm": 0.5761382579803467, "learning_rate": 4.657556185336945e-05, "loss": 0.5857, "mean_token_accuracy": 0.814988660812378, "num_tokens": 245933035.0, "step": 7340 }, { "epoch": 0.4384551098376313, "grad_norm": 0.5609983801841736, "learning_rate": 4.6569670765088703e-05, "loss": 0.595, "mean_token_accuracy": 0.813533341884613, "num_tokens": 246100715.0, "step": 7345 }, { "epoch": 0.4387535816618911, "grad_norm": 0.5667198300361633, "learning_rate": 4.6563775032037654e-05, "loss": 0.5683, "mean_token_accuracy": 0.820055615901947, "num_tokens": 246262211.0, "step": 7350 }, { "epoch": 0.4390520534861509, "grad_norm": 0.5638264417648315, "learning_rate": 4.6557874655652316e-05, "loss": 0.5684, "mean_token_accuracy": 0.8206250667572021, "num_tokens": 246429891.0, "step": 7355 }, { "epoch": 0.4393505253104107, "grad_norm": 0.5304365754127502, "learning_rate": 4.655196963736985e-05, "loss": 0.5498, "mean_token_accuracy": 0.8263509631156921, "num_tokens": 246597571.0, "step": 7360 }, { "epoch": 0.4396489971346705, "grad_norm": 0.6020424365997314, "learning_rate": 4.65460599786285e-05, "loss": 0.5994, "mean_token_accuracy": 0.813634741306305, "num_tokens": 246765251.0, "step": 7365 }, { "epoch": 0.4399474689589303, "grad_norm": 0.5858073830604553, "learning_rate": 4.654014568086771e-05, "loss": 0.6441, "mean_token_accuracy": 0.7982822299003601, "num_tokens": 246932931.0, "step": 7370 }, { "epoch": 0.44024594078319007, "grad_norm": 0.5727011561393738, "learning_rate": 4.653422674552799e-05, "loss": 0.6163, "mean_token_accuracy": 0.8085649490356446, "num_tokens": 247100611.0, "step": 7375 }, { "epoch": 0.44054441260744986, "grad_norm": 0.5156970024108887, "learning_rate": 4.6528303174051015e-05, "loss": 0.5541, "mean_token_accuracy": 0.8256352186203003, "num_tokens": 247268291.0, "step": 7380 }, { "epoch": 0.44084288443170966, "grad_norm": 0.5324041843414307, "learning_rate": 4.652237496787958e-05, "loss": 0.5682, "mean_token_accuracy": 0.8214839696884155, "num_tokens": 247435971.0, "step": 7385 }, { "epoch": 0.44114135625596945, "grad_norm": 0.6281463503837585, "learning_rate": 4.6516442128457604e-05, "loss": 0.5869, "mean_token_accuracy": 0.8148037791252136, "num_tokens": 247603651.0, "step": 7390 }, { "epoch": 0.44143982808022925, "grad_norm": 0.5490525364875793, "learning_rate": 4.6510504657230156e-05, "loss": 0.5446, "mean_token_accuracy": 0.8279553771018981, "num_tokens": 247771331.0, "step": 7395 }, { "epoch": 0.44173829990448904, "grad_norm": 0.6098350882530212, "learning_rate": 4.65045625556434e-05, "loss": 0.5919, "mean_token_accuracy": 0.8160861253738403, "num_tokens": 247939011.0, "step": 7400 }, { "epoch": 0.4420367717287488, "grad_norm": 0.6032289862632751, "learning_rate": 4.6498615825144644e-05, "loss": 0.5706, "mean_token_accuracy": 0.8224740505218506, "num_tokens": 248106691.0, "step": 7405 }, { "epoch": 0.4423352435530086, "grad_norm": 0.5556846261024475, "learning_rate": 4.6492664467182334e-05, "loss": 0.5565, "mean_token_accuracy": 0.825050699710846, "num_tokens": 248274371.0, "step": 7410 }, { "epoch": 0.44263371537726837, "grad_norm": 0.6704667806625366, "learning_rate": 4.648670848320603e-05, "loss": 0.5369, "mean_token_accuracy": 0.8290823459625244, "num_tokens": 248435093.0, "step": 7415 }, { "epoch": 0.44293218720152816, "grad_norm": 0.5523572564125061, "learning_rate": 4.648074787466642e-05, "loss": 0.5641, "mean_token_accuracy": 0.8217821836471557, "num_tokens": 248602773.0, "step": 7420 }, { "epoch": 0.44323065902578795, "grad_norm": 0.5404214859008789, "learning_rate": 4.647478264301532e-05, "loss": 0.562, "mean_token_accuracy": 0.8221996784210205, "num_tokens": 248770453.0, "step": 7425 }, { "epoch": 0.44352913085004775, "grad_norm": 0.5859255194664001, "learning_rate": 4.646881278970566e-05, "loss": 0.612, "mean_token_accuracy": 0.811541211605072, "num_tokens": 248938133.0, "step": 7430 }, { "epoch": 0.44382760267430754, "grad_norm": 0.6272110342979431, "learning_rate": 4.646283831619154e-05, "loss": 0.6051, "mean_token_accuracy": 0.8101753473281861, "num_tokens": 249105813.0, "step": 7435 }, { "epoch": 0.44412607449856734, "grad_norm": 0.671910285949707, "learning_rate": 4.6456859223928135e-05, "loss": 0.6462, "mean_token_accuracy": 0.7992126941680908, "num_tokens": 249273493.0, "step": 7440 }, { "epoch": 0.44442454632282713, "grad_norm": 0.5461686849594116, "learning_rate": 4.645087551437175e-05, "loss": 0.5788, "mean_token_accuracy": 0.8167720317840577, "num_tokens": 249441173.0, "step": 7445 }, { "epoch": 0.4447230181470869, "grad_norm": 0.5986597537994385, "learning_rate": 4.644488718897984e-05, "loss": 0.5986, "mean_token_accuracy": 0.8116545438766479, "num_tokens": 249608853.0, "step": 7450 }, { "epoch": 0.4450214899713467, "grad_norm": 0.5439218878746033, "learning_rate": 4.643889424921098e-05, "loss": 0.5844, "mean_token_accuracy": 0.8167660713195801, "num_tokens": 249776533.0, "step": 7455 }, { "epoch": 0.4453199617956065, "grad_norm": 0.6067100763320923, "learning_rate": 4.643289669652485e-05, "loss": 0.58, "mean_token_accuracy": 0.8171835899353027, "num_tokens": 249944213.0, "step": 7460 }, { "epoch": 0.4456184336198663, "grad_norm": 0.5906630754470825, "learning_rate": 4.6426894532382275e-05, "loss": 0.5721, "mean_token_accuracy": 0.8190743327140808, "num_tokens": 250111893.0, "step": 7465 }, { "epoch": 0.4459169054441261, "grad_norm": 0.5301469564437866, "learning_rate": 4.642088775824517e-05, "loss": 0.6097, "mean_token_accuracy": 0.8108731865882873, "num_tokens": 250279573.0, "step": 7470 }, { "epoch": 0.44621537726838584, "grad_norm": 0.5304574966430664, "learning_rate": 4.6414876375576634e-05, "loss": 0.5532, "mean_token_accuracy": 0.8259871125221252, "num_tokens": 250447253.0, "step": 7475 }, { "epoch": 0.44651384909264563, "grad_norm": 0.6436883211135864, "learning_rate": 4.6408860385840813e-05, "loss": 0.6137, "mean_token_accuracy": 0.8074078321456909, "num_tokens": 250614933.0, "step": 7480 }, { "epoch": 0.4468123209169054, "grad_norm": 0.6863070726394653, "learning_rate": 4.6402839790503035e-05, "loss": 0.5676, "mean_token_accuracy": 0.8223965048789978, "num_tokens": 250782613.0, "step": 7485 }, { "epoch": 0.4471107927411652, "grad_norm": 0.5239238739013672, "learning_rate": 4.6396814591029715e-05, "loss": 0.5508, "mean_token_accuracy": 0.8264463901519775, "num_tokens": 250950293.0, "step": 7490 }, { "epoch": 0.447409264565425, "grad_norm": 0.5771133899688721, "learning_rate": 4.639078478888841e-05, "loss": 0.5565, "mean_token_accuracy": 0.8247763276100158, "num_tokens": 251117973.0, "step": 7495 }, { "epoch": 0.4477077363896848, "grad_norm": 0.5490459203720093, "learning_rate": 4.638475038554778e-05, "loss": 0.5546, "mean_token_accuracy": 0.8253906726837158, "num_tokens": 251285653.0, "step": 7500 }, { "epoch": 0.4480062082139446, "grad_norm": 0.6839123964309692, "learning_rate": 4.6378711382477626e-05, "loss": 0.6101, "mean_token_accuracy": 0.8080877900123596, "num_tokens": 251453333.0, "step": 7505 }, { "epoch": 0.4483046800382044, "grad_norm": 0.573498010635376, "learning_rate": 4.6372667781148855e-05, "loss": 0.6186, "mean_token_accuracy": 0.8067040324211121, "num_tokens": 251621013.0, "step": 7510 }, { "epoch": 0.4486031518624642, "grad_norm": 0.5457788705825806, "learning_rate": 4.636661958303348e-05, "loss": 0.5432, "mean_token_accuracy": 0.8286114811897278, "num_tokens": 251788693.0, "step": 7515 }, { "epoch": 0.448901623686724, "grad_norm": 0.5477657914161682, "learning_rate": 4.636056678960469e-05, "loss": 0.5486, "mean_token_accuracy": 0.8247763276100158, "num_tokens": 251956373.0, "step": 7520 }, { "epoch": 0.4492000955109838, "grad_norm": 0.5851730108261108, "learning_rate": 4.635450940233672e-05, "loss": 0.5535, "mean_token_accuracy": 0.8250268459320068, "num_tokens": 252124053.0, "step": 7525 }, { "epoch": 0.44949856733524357, "grad_norm": 0.5567428469657898, "learning_rate": 4.634844742270497e-05, "loss": 0.5484, "mean_token_accuracy": 0.8279434561729431, "num_tokens": 252291733.0, "step": 7530 }, { "epoch": 0.44979703915950336, "grad_norm": 0.5958724021911621, "learning_rate": 4.6342380852185954e-05, "loss": 0.6196, "mean_token_accuracy": 0.8064833641052246, "num_tokens": 252459413.0, "step": 7535 }, { "epoch": 0.45009551098376316, "grad_norm": 0.6218613982200623, "learning_rate": 4.633630969225729e-05, "loss": 0.6107, "mean_token_accuracy": 0.8095788955688477, "num_tokens": 252627093.0, "step": 7540 }, { "epoch": 0.4503939828080229, "grad_norm": 0.5165412425994873, "learning_rate": 4.633023394439772e-05, "loss": 0.5437, "mean_token_accuracy": 0.8296671986579895, "num_tokens": 252794773.0, "step": 7545 }, { "epoch": 0.4506924546322827, "grad_norm": 0.5493040084838867, "learning_rate": 4.632415361008711e-05, "loss": 0.5434, "mean_token_accuracy": 0.8295299887657166, "num_tokens": 252962453.0, "step": 7550 }, { "epoch": 0.4509909264565425, "grad_norm": 0.5395864844322205, "learning_rate": 4.6318068690806426e-05, "loss": 0.5483, "mean_token_accuracy": 0.8269950985908509, "num_tokens": 253130133.0, "step": 7555 }, { "epoch": 0.4512893982808023, "grad_norm": 0.5615440011024475, "learning_rate": 4.631197918803778e-05, "loss": 0.5749, "mean_token_accuracy": 0.8188894271850586, "num_tokens": 253297813.0, "step": 7560 }, { "epoch": 0.4515878701050621, "grad_norm": 0.6137975454330444, "learning_rate": 4.630588510326437e-05, "loss": 0.5627, "mean_token_accuracy": 0.8240904211997986, "num_tokens": 253465493.0, "step": 7565 }, { "epoch": 0.45188634192932187, "grad_norm": 0.6349193453788757, "learning_rate": 4.6299786437970524e-05, "loss": 0.5685, "mean_token_accuracy": 0.8202373862266541, "num_tokens": 253633173.0, "step": 7570 }, { "epoch": 0.45218481375358166, "grad_norm": 0.9740370512008667, "learning_rate": 4.6293683193641687e-05, "loss": 0.653, "mean_token_accuracy": 0.7981808423995972, "num_tokens": 253800853.0, "step": 7575 }, { "epoch": 0.45248328557784145, "grad_norm": 0.7178189158439636, "learning_rate": 4.628757537176442e-05, "loss": 0.612, "mean_token_accuracy": 0.8102111458778382, "num_tokens": 253968533.0, "step": 7580 }, { "epoch": 0.45278175740210125, "grad_norm": 0.5762429237365723, "learning_rate": 4.628146297382638e-05, "loss": 0.5698, "mean_token_accuracy": 0.820326852798462, "num_tokens": 254136213.0, "step": 7585 }, { "epoch": 0.45308022922636104, "grad_norm": 0.585010290145874, "learning_rate": 4.627534600131639e-05, "loss": 0.573, "mean_token_accuracy": 0.820199978351593, "num_tokens": 254301665.0, "step": 7590 }, { "epoch": 0.45337870105062084, "grad_norm": 0.6211780309677124, "learning_rate": 4.6269224455724305e-05, "loss": 0.552, "mean_token_accuracy": 0.8250626206398011, "num_tokens": 254469345.0, "step": 7595 }, { "epoch": 0.45367717287488063, "grad_norm": 0.5588411688804626, "learning_rate": 4.626309833854118e-05, "loss": 0.5663, "mean_token_accuracy": 0.8222593426704407, "num_tokens": 254637025.0, "step": 7600 }, { "epoch": 0.4539756446991404, "grad_norm": 0.7048266530036926, "learning_rate": 4.625696765125912e-05, "loss": 0.6264, "mean_token_accuracy": 0.807431697845459, "num_tokens": 254804705.0, "step": 7605 }, { "epoch": 0.4542741165234002, "grad_norm": 0.52033931016922, "learning_rate": 4.625083239537137e-05, "loss": 0.5326, "mean_token_accuracy": 0.830889880657196, "num_tokens": 254972385.0, "step": 7610 }, { "epoch": 0.45457258834765996, "grad_norm": 0.6067246794700623, "learning_rate": 4.62446925723723e-05, "loss": 0.5693, "mean_token_accuracy": 0.8232494235038758, "num_tokens": 255140065.0, "step": 7615 }, { "epoch": 0.45487106017191975, "grad_norm": 0.5551836490631104, "learning_rate": 4.6238548183757366e-05, "loss": 0.5984, "mean_token_accuracy": 0.812531316280365, "num_tokens": 255307745.0, "step": 7620 }, { "epoch": 0.45516953199617954, "grad_norm": 0.5718890428543091, "learning_rate": 4.623239923102314e-05, "loss": 0.5818, "mean_token_accuracy": 0.8176667094230652, "num_tokens": 255475425.0, "step": 7625 }, { "epoch": 0.45546800382043934, "grad_norm": 0.6421093940734863, "learning_rate": 4.6226245715667323e-05, "loss": 0.5761, "mean_token_accuracy": 0.8189311861991883, "num_tokens": 255643105.0, "step": 7630 }, { "epoch": 0.45576647564469913, "grad_norm": 0.6722767949104309, "learning_rate": 4.6220087639188713e-05, "loss": 0.5495, "mean_token_accuracy": 0.826792311668396, "num_tokens": 255810785.0, "step": 7635 }, { "epoch": 0.4560649474689589, "grad_norm": 0.5936805009841919, "learning_rate": 4.621392500308723e-05, "loss": 0.5401, "mean_token_accuracy": 0.8289812684059144, "num_tokens": 255978465.0, "step": 7640 }, { "epoch": 0.4563634192932187, "grad_norm": 0.5662449598312378, "learning_rate": 4.620775780886389e-05, "loss": 0.5726, "mean_token_accuracy": 0.8206429600715637, "num_tokens": 256146145.0, "step": 7645 }, { "epoch": 0.4566618911174785, "grad_norm": 0.6059299111366272, "learning_rate": 4.620158605802083e-05, "loss": 0.5853, "mean_token_accuracy": 0.8163962841033936, "num_tokens": 256313825.0, "step": 7650 }, { "epoch": 0.4569603629417383, "grad_norm": 0.5768796801567078, "learning_rate": 4.61954097520613e-05, "loss": 0.5785, "mean_token_accuracy": 0.8180603623390198, "num_tokens": 256481505.0, "step": 7655 }, { "epoch": 0.4572588347659981, "grad_norm": 0.5965715050697327, "learning_rate": 4.618922889248965e-05, "loss": 0.584, "mean_token_accuracy": 0.8151437401771545, "num_tokens": 256649185.0, "step": 7660 }, { "epoch": 0.4575573065902579, "grad_norm": 0.5589231848716736, "learning_rate": 4.6183043480811326e-05, "loss": 0.557, "mean_token_accuracy": 0.8250502586364746, "num_tokens": 256810593.0, "step": 7665 }, { "epoch": 0.4578557784145177, "grad_norm": 0.5479607582092285, "learning_rate": 4.617685351853292e-05, "loss": 0.5593, "mean_token_accuracy": 0.8241798758506775, "num_tokens": 256978273.0, "step": 7670 }, { "epoch": 0.4581542502387775, "grad_norm": 0.5506994724273682, "learning_rate": 4.617065900716211e-05, "loss": 0.6136, "mean_token_accuracy": 0.8089586019515991, "num_tokens": 257145953.0, "step": 7675 }, { "epoch": 0.4584527220630373, "grad_norm": 0.5962391495704651, "learning_rate": 4.616445994820766e-05, "loss": 0.5625, "mean_token_accuracy": 0.8232673168182373, "num_tokens": 257313633.0, "step": 7680 }, { "epoch": 0.458751193887297, "grad_norm": 0.5734214782714844, "learning_rate": 4.615825634317949e-05, "loss": 0.5455, "mean_token_accuracy": 0.8281999230384827, "num_tokens": 257481313.0, "step": 7685 }, { "epoch": 0.4590496657115568, "grad_norm": 0.596531867980957, "learning_rate": 4.615204819358861e-05, "loss": 0.6096, "mean_token_accuracy": 0.8095729351043701, "num_tokens": 257648993.0, "step": 7690 }, { "epoch": 0.4593481375358166, "grad_norm": 0.5632465481758118, "learning_rate": 4.6145835500947107e-05, "loss": 0.5528, "mean_token_accuracy": 0.8254026055335999, "num_tokens": 257816673.0, "step": 7695 }, { "epoch": 0.4596466093600764, "grad_norm": 0.6089497208595276, "learning_rate": 4.61396182667682e-05, "loss": 0.582, "mean_token_accuracy": 0.8167243242263794, "num_tokens": 257984353.0, "step": 7700 }, { "epoch": 0.4599450811843362, "grad_norm": 0.6036685109138489, "learning_rate": 4.613339649256622e-05, "loss": 0.5787, "mean_token_accuracy": 0.8192771077156067, "num_tokens": 258152033.0, "step": 7705 }, { "epoch": 0.460243553008596, "grad_norm": 0.553791880607605, "learning_rate": 4.612717017985659e-05, "loss": 0.5908, "mean_token_accuracy": 0.814762020111084, "num_tokens": 258319713.0, "step": 7710 }, { "epoch": 0.4605420248328558, "grad_norm": 0.563846230506897, "learning_rate": 4.6120939330155846e-05, "loss": 0.566, "mean_token_accuracy": 0.8215078234672546, "num_tokens": 258487393.0, "step": 7715 }, { "epoch": 0.4608404966571156, "grad_norm": 0.5484046936035156, "learning_rate": 4.611470394498162e-05, "loss": 0.5902, "mean_token_accuracy": 0.8158058047294616, "num_tokens": 258655073.0, "step": 7720 }, { "epoch": 0.46113896848137537, "grad_norm": 0.6056272387504578, "learning_rate": 4.610846402585267e-05, "loss": 0.5704, "mean_token_accuracy": 0.821579384803772, "num_tokens": 258822753.0, "step": 7725 }, { "epoch": 0.46143744030563516, "grad_norm": 0.5771185755729675, "learning_rate": 4.6102219574288825e-05, "loss": 0.5905, "mean_token_accuracy": 0.8153763532638549, "num_tokens": 258990433.0, "step": 7730 }, { "epoch": 0.46173591212989495, "grad_norm": 0.6007190942764282, "learning_rate": 4.609597059181104e-05, "loss": 0.5916, "mean_token_accuracy": 0.8155493140220642, "num_tokens": 259158113.0, "step": 7735 }, { "epoch": 0.46203438395415475, "grad_norm": 0.5840270519256592, "learning_rate": 4.608971707994139e-05, "loss": 0.5755, "mean_token_accuracy": 0.8209054112434387, "num_tokens": 259325793.0, "step": 7740 }, { "epoch": 0.46233285577841454, "grad_norm": 0.6028809547424316, "learning_rate": 4.608345904020301e-05, "loss": 0.5914, "mean_token_accuracy": 0.8142132878303527, "num_tokens": 259493473.0, "step": 7745 }, { "epoch": 0.4626313276026743, "grad_norm": 0.5474963188171387, "learning_rate": 4.6077196474120176e-05, "loss": 0.5578, "mean_token_accuracy": 0.8243886470794678, "num_tokens": 259661153.0, "step": 7750 }, { "epoch": 0.4629297994269341, "grad_norm": 0.5470784306526184, "learning_rate": 4.607092938321824e-05, "loss": 0.5923, "mean_token_accuracy": 0.8146009802818298, "num_tokens": 259828833.0, "step": 7755 }, { "epoch": 0.46322827125119387, "grad_norm": 0.6136278510093689, "learning_rate": 4.606465776902367e-05, "loss": 0.534, "mean_token_accuracy": 0.8311582922935485, "num_tokens": 259996513.0, "step": 7760 }, { "epoch": 0.46352674307545366, "grad_norm": 0.5415297746658325, "learning_rate": 4.6058381633064036e-05, "loss": 0.5611, "mean_token_accuracy": 0.8241023659706116, "num_tokens": 260164193.0, "step": 7765 }, { "epoch": 0.46382521489971346, "grad_norm": 0.5938883423805237, "learning_rate": 4.6052100976868006e-05, "loss": 0.6202, "mean_token_accuracy": 0.8048968195915223, "num_tokens": 260331873.0, "step": 7770 }, { "epoch": 0.46412368672397325, "grad_norm": 0.5361422300338745, "learning_rate": 4.604581580196536e-05, "loss": 0.6023, "mean_token_accuracy": 0.8137242197990417, "num_tokens": 260499553.0, "step": 7775 }, { "epoch": 0.46442215854823304, "grad_norm": 0.5868556499481201, "learning_rate": 4.603952610988695e-05, "loss": 0.5991, "mean_token_accuracy": 0.8122450351715088, "num_tokens": 260667233.0, "step": 7780 }, { "epoch": 0.46472063037249284, "grad_norm": 0.6046263575553894, "learning_rate": 4.603323190216476e-05, "loss": 0.5591, "mean_token_accuracy": 0.8242932081222534, "num_tokens": 260834913.0, "step": 7785 }, { "epoch": 0.46501910219675263, "grad_norm": 0.7702558636665344, "learning_rate": 4.602693318033185e-05, "loss": 0.5488, "mean_token_accuracy": 0.8276690721511841, "num_tokens": 261002593.0, "step": 7790 }, { "epoch": 0.4653175740210124, "grad_norm": 0.5694584846496582, "learning_rate": 4.602062994592241e-05, "loss": 0.5991, "mean_token_accuracy": 0.8131158351898193, "num_tokens": 261170273.0, "step": 7795 }, { "epoch": 0.4656160458452722, "grad_norm": 0.5280610918998718, "learning_rate": 4.601432220047168e-05, "loss": 0.5554, "mean_token_accuracy": 0.8255040049552917, "num_tokens": 261337953.0, "step": 7800 }, { "epoch": 0.465914517669532, "grad_norm": 0.5759225487709045, "learning_rate": 4.6008009945516054e-05, "loss": 0.5879, "mean_token_accuracy": 0.816742217540741, "num_tokens": 261505633.0, "step": 7805 }, { "epoch": 0.4662129894937918, "grad_norm": 0.5653066635131836, "learning_rate": 4.600169318259297e-05, "loss": 0.5402, "mean_token_accuracy": 0.827269458770752, "num_tokens": 261673313.0, "step": 7810 }, { "epoch": 0.4665114613180516, "grad_norm": 0.5461359620094299, "learning_rate": 4.599537191324102e-05, "loss": 0.5447, "mean_token_accuracy": 0.8271979093551636, "num_tokens": 261840993.0, "step": 7815 }, { "epoch": 0.46680993314231134, "grad_norm": 0.5427731871604919, "learning_rate": 4.598904613899985e-05, "loss": 0.5378, "mean_token_accuracy": 0.8306722164154052, "num_tokens": 262004519.0, "step": 7820 }, { "epoch": 0.46710840496657113, "grad_norm": 0.5485845804214478, "learning_rate": 4.598271586141024e-05, "loss": 0.5831, "mean_token_accuracy": 0.8175772428512573, "num_tokens": 262172199.0, "step": 7825 }, { "epoch": 0.4674068767908309, "grad_norm": 0.5768746733665466, "learning_rate": 4.5976381082014016e-05, "loss": 0.5683, "mean_token_accuracy": 0.8193904399871826, "num_tokens": 262339879.0, "step": 7830 }, { "epoch": 0.4677053486150907, "grad_norm": 0.5218663215637207, "learning_rate": 4.597004180235415e-05, "loss": 0.5162, "mean_token_accuracy": 0.8376476287841796, "num_tokens": 262507559.0, "step": 7835 }, { "epoch": 0.4680038204393505, "grad_norm": 0.5930300354957581, "learning_rate": 4.596369802397468e-05, "loss": 0.5666, "mean_token_accuracy": 0.8224144220352173, "num_tokens": 262675239.0, "step": 7840 }, { "epoch": 0.4683022922636103, "grad_norm": 0.5332184433937073, "learning_rate": 4.595734974842076e-05, "loss": 0.6273, "mean_token_accuracy": 0.8054932475090026, "num_tokens": 262842919.0, "step": 7845 }, { "epoch": 0.4686007640878701, "grad_norm": 0.5373886227607727, "learning_rate": 4.5950996977238625e-05, "loss": 0.569, "mean_token_accuracy": 0.8213765859603882, "num_tokens": 263010599.0, "step": 7850 }, { "epoch": 0.4688992359121299, "grad_norm": 0.5584951639175415, "learning_rate": 4.594463971197561e-05, "loss": 0.5568, "mean_token_accuracy": 0.8220267176628113, "num_tokens": 263178279.0, "step": 7855 }, { "epoch": 0.4691977077363897, "grad_norm": 0.510094165802002, "learning_rate": 4.593827795418014e-05, "loss": 0.545, "mean_token_accuracy": 0.8298282146453857, "num_tokens": 263345959.0, "step": 7860 }, { "epoch": 0.4694961795606495, "grad_norm": 0.5799757242202759, "learning_rate": 4.593191170540175e-05, "loss": 0.5829, "mean_token_accuracy": 0.8147143006324769, "num_tokens": 263513639.0, "step": 7865 }, { "epoch": 0.4697946513849093, "grad_norm": 1.1775456666946411, "learning_rate": 4.5925540967191044e-05, "loss": 0.6029, "mean_token_accuracy": 0.8124657034873962, "num_tokens": 263681319.0, "step": 7870 }, { "epoch": 0.4700931232091691, "grad_norm": 0.9137069582939148, "learning_rate": 4.591916574109974e-05, "loss": 0.6164, "mean_token_accuracy": 0.8097817063331604, "num_tokens": 263848999.0, "step": 7875 }, { "epoch": 0.47039159503342887, "grad_norm": 0.6295863389968872, "learning_rate": 4.591278602868064e-05, "loss": 0.5946, "mean_token_accuracy": 0.814887273311615, "num_tokens": 264016679.0, "step": 7880 }, { "epoch": 0.47069006685768866, "grad_norm": 0.5366268754005432, "learning_rate": 4.5906401831487644e-05, "loss": 0.5492, "mean_token_accuracy": 0.8271084308624268, "num_tokens": 264184359.0, "step": 7885 }, { "epoch": 0.4709885386819484, "grad_norm": 0.5900691151618958, "learning_rate": 4.590001315107573e-05, "loss": 0.5808, "mean_token_accuracy": 0.8175786972045899, "num_tokens": 264347485.0, "step": 7890 }, { "epoch": 0.4712870105062082, "grad_norm": 0.5910937190055847, "learning_rate": 4.5893619989001005e-05, "loss": 0.5607, "mean_token_accuracy": 0.8234223961830139, "num_tokens": 264515165.0, "step": 7895 }, { "epoch": 0.471585482330468, "grad_norm": 0.5602463483810425, "learning_rate": 4.588722234682061e-05, "loss": 0.5952, "mean_token_accuracy": 0.8131635427474976, "num_tokens": 264682845.0, "step": 7900 }, { "epoch": 0.4718839541547278, "grad_norm": 0.5950137376785278, "learning_rate": 4.5880820226092824e-05, "loss": 0.5476, "mean_token_accuracy": 0.8288262009620666, "num_tokens": 264850525.0, "step": 7905 }, { "epoch": 0.4721824259789876, "grad_norm": 0.4922201633453369, "learning_rate": 4.5874413628377014e-05, "loss": 0.6, "mean_token_accuracy": 0.8148097395896912, "num_tokens": 265018205.0, "step": 7910 }, { "epoch": 0.47248089780324737, "grad_norm": 0.5623324513435364, "learning_rate": 4.5868002555233594e-05, "loss": 0.5545, "mean_token_accuracy": 0.8247047781944274, "num_tokens": 265185885.0, "step": 7915 }, { "epoch": 0.47277936962750716, "grad_norm": 0.5618387460708618, "learning_rate": 4.586158700822413e-05, "loss": 0.5672, "mean_token_accuracy": 0.8208696126937867, "num_tokens": 265353565.0, "step": 7920 }, { "epoch": 0.47307784145176696, "grad_norm": 0.5495238304138184, "learning_rate": 4.585516698891123e-05, "loss": 0.553, "mean_token_accuracy": 0.8243349671363831, "num_tokens": 265521245.0, "step": 7925 }, { "epoch": 0.47337631327602675, "grad_norm": 0.5636260509490967, "learning_rate": 4.584874249885861e-05, "loss": 0.5642, "mean_token_accuracy": 0.8226350903511047, "num_tokens": 265688925.0, "step": 7930 }, { "epoch": 0.47367478510028654, "grad_norm": 0.5569517016410828, "learning_rate": 4.584231353963108e-05, "loss": 0.551, "mean_token_accuracy": 0.8252833008766174, "num_tokens": 265856605.0, "step": 7935 }, { "epoch": 0.47397325692454634, "grad_norm": 0.6061868667602539, "learning_rate": 4.5835880112794524e-05, "loss": 0.5728, "mean_token_accuracy": 0.8192174673080445, "num_tokens": 266024285.0, "step": 7940 }, { "epoch": 0.47427172874880613, "grad_norm": 0.5769259333610535, "learning_rate": 4.582944221991592e-05, "loss": 0.6292, "mean_token_accuracy": 0.8031611561775207, "num_tokens": 266191965.0, "step": 7945 }, { "epoch": 0.4745702005730659, "grad_norm": 0.5785863995552063, "learning_rate": 4.582299986256335e-05, "loss": 0.5473, "mean_token_accuracy": 0.8263986706733704, "num_tokens": 266359645.0, "step": 7950 }, { "epoch": 0.4748686723973257, "grad_norm": 0.5971212387084961, "learning_rate": 4.581655304230596e-05, "loss": 0.586, "mean_token_accuracy": 0.81563880443573, "num_tokens": 266527325.0, "step": 7955 }, { "epoch": 0.47516714422158546, "grad_norm": 0.608715832233429, "learning_rate": 4.581010176071399e-05, "loss": 0.5468, "mean_token_accuracy": 0.8272098302841187, "num_tokens": 266695005.0, "step": 7960 }, { "epoch": 0.47546561604584525, "grad_norm": 0.5763922929763794, "learning_rate": 4.5803646019358764e-05, "loss": 0.5603, "mean_token_accuracy": 0.8245735406875611, "num_tokens": 266862685.0, "step": 7965 }, { "epoch": 0.47576408787010505, "grad_norm": 0.5652427673339844, "learning_rate": 4.5797185819812705e-05, "loss": 0.5712, "mean_token_accuracy": 0.8206310391426086, "num_tokens": 267030365.0, "step": 7970 }, { "epoch": 0.47606255969436484, "grad_norm": 0.5526332855224609, "learning_rate": 4.579072116364932e-05, "loss": 0.5373, "mean_token_accuracy": 0.8294405341148376, "num_tokens": 267198045.0, "step": 7975 }, { "epoch": 0.47636103151862463, "grad_norm": 0.5545656085014343, "learning_rate": 4.578425205244318e-05, "loss": 0.5415, "mean_token_accuracy": 0.8296552538871765, "num_tokens": 267365725.0, "step": 7980 }, { "epoch": 0.4766595033428844, "grad_norm": 0.5866954922676086, "learning_rate": 4.577777848776997e-05, "loss": 0.559, "mean_token_accuracy": 0.8245079278945923, "num_tokens": 267533405.0, "step": 7985 }, { "epoch": 0.4769579751671442, "grad_norm": 0.631314218044281, "learning_rate": 4.577130047120643e-05, "loss": 0.5939, "mean_token_accuracy": 0.8154777526855469, "num_tokens": 267701085.0, "step": 7990 }, { "epoch": 0.477256446991404, "grad_norm": 0.6295937299728394, "learning_rate": 4.576481800433042e-05, "loss": 0.5576, "mean_token_accuracy": 0.8249135136604309, "num_tokens": 267868765.0, "step": 7995 }, { "epoch": 0.4775549188156638, "grad_norm": 0.6489912271499634, "learning_rate": 4.575833108872085e-05, "loss": 0.6152, "mean_token_accuracy": 0.81052725315094, "num_tokens": 268036445.0, "step": 8000 }, { "epoch": 0.4778533906399236, "grad_norm": 0.533629834651947, "learning_rate": 4.575183972595774e-05, "loss": 0.6079, "mean_token_accuracy": 0.806453537940979, "num_tokens": 268204125.0, "step": 8005 }, { "epoch": 0.4781518624641834, "grad_norm": 0.5676308274269104, "learning_rate": 4.574534391762216e-05, "loss": 0.6181, "mean_token_accuracy": 0.8083084940910339, "num_tokens": 268371805.0, "step": 8010 }, { "epoch": 0.4784503342884432, "grad_norm": 0.5404603481292725, "learning_rate": 4.573884366529632e-05, "loss": 0.5824, "mean_token_accuracy": 0.8169569373130798, "num_tokens": 268539485.0, "step": 8015 }, { "epoch": 0.478748806112703, "grad_norm": 0.5885908007621765, "learning_rate": 4.573233897056344e-05, "loss": 0.5595, "mean_token_accuracy": 0.8206489205360412, "num_tokens": 268707165.0, "step": 8020 }, { "epoch": 0.4790472779369627, "grad_norm": 0.5349931716918945, "learning_rate": 4.572582983500787e-05, "loss": 0.6162, "mean_token_accuracy": 0.8094596147537232, "num_tokens": 268874845.0, "step": 8025 }, { "epoch": 0.4793457497612225, "grad_norm": 0.541793704032898, "learning_rate": 4.5719316260215045e-05, "loss": 0.5807, "mean_token_accuracy": 0.8177561640739441, "num_tokens": 269042525.0, "step": 8030 }, { "epoch": 0.4796442215854823, "grad_norm": 0.5695494413375854, "learning_rate": 4.571279824777145e-05, "loss": 0.5976, "mean_token_accuracy": 0.8117559313774109, "num_tokens": 269210205.0, "step": 8035 }, { "epoch": 0.4799426934097421, "grad_norm": 0.5621160864830017, "learning_rate": 4.570627579926468e-05, "loss": 0.6179, "mean_token_accuracy": 0.8079923748970032, "num_tokens": 269377885.0, "step": 8040 }, { "epoch": 0.4802411652340019, "grad_norm": 0.5891603231430054, "learning_rate": 4.569974891628337e-05, "loss": 0.585, "mean_token_accuracy": 0.8176070570945739, "num_tokens": 269545565.0, "step": 8045 }, { "epoch": 0.4805396370582617, "grad_norm": 0.5767423510551453, "learning_rate": 4.56932176004173e-05, "loss": 0.591, "mean_token_accuracy": 0.8160443782806397, "num_tokens": 269713245.0, "step": 8050 }, { "epoch": 0.4808381088825215, "grad_norm": 0.5442712903022766, "learning_rate": 4.568668185325726e-05, "loss": 0.5447, "mean_token_accuracy": 0.8274364709854126, "num_tokens": 269880925.0, "step": 8055 }, { "epoch": 0.4811365807067813, "grad_norm": 0.5646932125091553, "learning_rate": 4.568014167639518e-05, "loss": 0.5992, "mean_token_accuracy": 0.814135754108429, "num_tokens": 270048605.0, "step": 8060 }, { "epoch": 0.4814350525310411, "grad_norm": 0.5659125447273254, "learning_rate": 4.567359707142402e-05, "loss": 0.5365, "mean_token_accuracy": 0.8292675614356995, "num_tokens": 270216285.0, "step": 8065 }, { "epoch": 0.48173352435530087, "grad_norm": 0.5524060726165771, "learning_rate": 4.5667048039937854e-05, "loss": 0.5121, "mean_token_accuracy": 0.8350172877311707, "num_tokens": 270383965.0, "step": 8070 }, { "epoch": 0.48203199617956066, "grad_norm": 0.6202818155288696, "learning_rate": 4.566049458353181e-05, "loss": 0.5439, "mean_token_accuracy": 0.8290289998054504, "num_tokens": 270551645.0, "step": 8075 }, { "epoch": 0.48233046800382046, "grad_norm": 0.6832270622253418, "learning_rate": 4.56539367038021e-05, "loss": 0.5961, "mean_token_accuracy": 0.8125969290733337, "num_tokens": 270719325.0, "step": 8080 }, { "epoch": 0.48262893982808025, "grad_norm": 0.6123245358467102, "learning_rate": 4.564737440234604e-05, "loss": 0.6067, "mean_token_accuracy": 0.8086663484573364, "num_tokens": 270887005.0, "step": 8085 }, { "epoch": 0.48292741165234004, "grad_norm": 0.7667441368103027, "learning_rate": 4.564080768076196e-05, "loss": 0.5875, "mean_token_accuracy": 0.8150602459907532, "num_tokens": 271054685.0, "step": 8090 }, { "epoch": 0.4832258834765998, "grad_norm": 0.5241187214851379, "learning_rate": 4.563423654064934e-05, "loss": 0.5343, "mean_token_accuracy": 0.8302934527397156, "num_tokens": 271222365.0, "step": 8095 }, { "epoch": 0.4835243553008596, "grad_norm": 0.6005362272262573, "learning_rate": 4.5627660983608696e-05, "loss": 0.5878, "mean_token_accuracy": 0.8154359936714173, "num_tokens": 271390045.0, "step": 8100 }, { "epoch": 0.48382282712511937, "grad_norm": 0.6519680023193359, "learning_rate": 4.5621081011241615e-05, "loss": 0.5725, "mean_token_accuracy": 0.8208815455436707, "num_tokens": 271557725.0, "step": 8105 }, { "epoch": 0.48412129894937916, "grad_norm": 0.6114306449890137, "learning_rate": 4.5614496625150774e-05, "loss": 0.6172, "mean_token_accuracy": 0.809811532497406, "num_tokens": 271725405.0, "step": 8110 }, { "epoch": 0.48441977077363896, "grad_norm": 0.527331531047821, "learning_rate": 4.560790782693993e-05, "loss": 0.5839, "mean_token_accuracy": 0.8166587233543396, "num_tokens": 271893085.0, "step": 8115 }, { "epoch": 0.48471824259789875, "grad_norm": 0.5360596179962158, "learning_rate": 4.560131461821389e-05, "loss": 0.563, "mean_token_accuracy": 0.824424433708191, "num_tokens": 272060765.0, "step": 8120 }, { "epoch": 0.48501671442215855, "grad_norm": 0.6610274910926819, "learning_rate": 4.5594717000578577e-05, "loss": 0.565, "mean_token_accuracy": 0.8247763395309449, "num_tokens": 272228445.0, "step": 8125 }, { "epoch": 0.48531518624641834, "grad_norm": 0.583880603313446, "learning_rate": 4.5588114975640944e-05, "loss": 0.5612, "mean_token_accuracy": 0.823434317111969, "num_tokens": 272396125.0, "step": 8130 }, { "epoch": 0.48561365807067813, "grad_norm": 0.5621193647384644, "learning_rate": 4.558150854500903e-05, "loss": 0.5396, "mean_token_accuracy": 0.8299176931381226, "num_tokens": 272563805.0, "step": 8135 }, { "epoch": 0.4859121298949379, "grad_norm": 0.5357057452201843, "learning_rate": 4.5574897710291975e-05, "loss": 0.5575, "mean_token_accuracy": 0.8243170738220215, "num_tokens": 272731485.0, "step": 8140 }, { "epoch": 0.4862106017191977, "grad_norm": 0.5147784948348999, "learning_rate": 4.556828247309995e-05, "loss": 0.5261, "mean_token_accuracy": 0.8347488880157471, "num_tokens": 272899165.0, "step": 8145 }, { "epoch": 0.4865090735434575, "grad_norm": 0.580520510673523, "learning_rate": 4.556166283504424e-05, "loss": 0.576, "mean_token_accuracy": 0.820350706577301, "num_tokens": 273066845.0, "step": 8150 }, { "epoch": 0.4868075453677173, "grad_norm": 0.5211859345436096, "learning_rate": 4.555503879773715e-05, "loss": 0.5627, "mean_token_accuracy": 0.8230347037315369, "num_tokens": 273234525.0, "step": 8155 }, { "epoch": 0.4871060171919771, "grad_norm": 0.5352399945259094, "learning_rate": 4.554841036279212e-05, "loss": 0.5684, "mean_token_accuracy": 0.8206429839134216, "num_tokens": 273402205.0, "step": 8160 }, { "epoch": 0.48740448901623684, "grad_norm": 0.5476245284080505, "learning_rate": 4.55417775318236e-05, "loss": 0.5893, "mean_token_accuracy": 0.8178754568099975, "num_tokens": 273569885.0, "step": 8165 }, { "epoch": 0.48770296084049664, "grad_norm": 0.5912073850631714, "learning_rate": 4.553514030644715e-05, "loss": 0.5861, "mean_token_accuracy": 0.8170821905136109, "num_tokens": 273737565.0, "step": 8170 }, { "epoch": 0.48800143266475643, "grad_norm": 0.5961964726448059, "learning_rate": 4.552849868827939e-05, "loss": 0.5728, "mean_token_accuracy": 0.8198854804039002, "num_tokens": 273905245.0, "step": 8175 }, { "epoch": 0.4882999044890162, "grad_norm": 0.5517949461936951, "learning_rate": 4.5521852678938e-05, "loss": 0.6329, "mean_token_accuracy": 0.8060718059539795, "num_tokens": 274072925.0, "step": 8180 }, { "epoch": 0.488598376313276, "grad_norm": 0.5162144899368286, "learning_rate": 4.551520228004175e-05, "loss": 0.5412, "mean_token_accuracy": 0.8275498151779175, "num_tokens": 274240605.0, "step": 8185 }, { "epoch": 0.4888968481375358, "grad_norm": 0.5340211391448975, "learning_rate": 4.550854749321046e-05, "loss": 0.5828, "mean_token_accuracy": 0.8140701413154602, "num_tokens": 274408285.0, "step": 8190 }, { "epoch": 0.4891953199617956, "grad_norm": 0.5133636593818665, "learning_rate": 4.550188832006503e-05, "loss": 0.568, "mean_token_accuracy": 0.8207324504852295, "num_tokens": 274575965.0, "step": 8195 }, { "epoch": 0.4894937917860554, "grad_norm": 0.6545477509498596, "learning_rate": 4.549522476222742e-05, "loss": 0.6044, "mean_token_accuracy": 0.8120183706283569, "num_tokens": 274743645.0, "step": 8200 }, { "epoch": 0.4897922636103152, "grad_norm": 0.4933609366416931, "learning_rate": 4.5488556821320676e-05, "loss": 0.5966, "mean_token_accuracy": 0.814911139011383, "num_tokens": 274911325.0, "step": 8205 }, { "epoch": 0.490090735434575, "grad_norm": 0.6106254458427429, "learning_rate": 4.548188449896888e-05, "loss": 0.6006, "mean_token_accuracy": 0.8145294070243836, "num_tokens": 275079005.0, "step": 8210 }, { "epoch": 0.4903892072588348, "grad_norm": 0.540390133857727, "learning_rate": 4.5475207796797206e-05, "loss": 0.5864, "mean_token_accuracy": 0.8152689933776855, "num_tokens": 275246685.0, "step": 8215 }, { "epoch": 0.4906876790830946, "grad_norm": 0.5468898415565491, "learning_rate": 4.546852671643189e-05, "loss": 0.5263, "mean_token_accuracy": 0.8331027150154113, "num_tokens": 275414365.0, "step": 8220 }, { "epoch": 0.49098615090735437, "grad_norm": 0.5242897868156433, "learning_rate": 4.546184125950023e-05, "loss": 0.5768, "mean_token_accuracy": 0.8187641620635986, "num_tokens": 275582045.0, "step": 8225 }, { "epoch": 0.49128462273161416, "grad_norm": 0.5289314389228821, "learning_rate": 4.54551514276306e-05, "loss": 0.5213, "mean_token_accuracy": 0.833633553981781, "num_tokens": 275749725.0, "step": 8230 }, { "epoch": 0.4915830945558739, "grad_norm": 0.7056505084037781, "learning_rate": 4.544845722245242e-05, "loss": 0.6329, "mean_token_accuracy": 0.8035309553146363, "num_tokens": 275917405.0, "step": 8235 }, { "epoch": 0.4918815663801337, "grad_norm": 0.560248851776123, "learning_rate": 4.544175864559619e-05, "loss": 0.5685, "mean_token_accuracy": 0.8206310272216797, "num_tokens": 276085085.0, "step": 8240 }, { "epoch": 0.4921800382043935, "grad_norm": 0.7740751504898071, "learning_rate": 4.543505569869348e-05, "loss": 0.5649, "mean_token_accuracy": 0.8229213953018188, "num_tokens": 276252765.0, "step": 8245 }, { "epoch": 0.4924785100286533, "grad_norm": 0.5696920156478882, "learning_rate": 4.54283483833769e-05, "loss": 0.5675, "mean_token_accuracy": 0.8218060255050659, "num_tokens": 276420445.0, "step": 8250 }, { "epoch": 0.4927769818529131, "grad_norm": 0.5968448519706726, "learning_rate": 4.542163670128017e-05, "loss": 0.605, "mean_token_accuracy": 0.8120124220848084, "num_tokens": 276588125.0, "step": 8255 }, { "epoch": 0.49307545367717287, "grad_norm": 0.54146808385849, "learning_rate": 4.5414920654038014e-05, "loss": 0.5598, "mean_token_accuracy": 0.8222295045852661, "num_tokens": 276755805.0, "step": 8260 }, { "epoch": 0.49337392550143266, "grad_norm": 0.555019199848175, "learning_rate": 4.540820024328627e-05, "loss": 0.5739, "mean_token_accuracy": 0.8204163312911987, "num_tokens": 276923485.0, "step": 8265 }, { "epoch": 0.49367239732569246, "grad_norm": 0.5188547968864441, "learning_rate": 4.54014754706618e-05, "loss": 0.6033, "mean_token_accuracy": 0.8122867822647095, "num_tokens": 277091165.0, "step": 8270 }, { "epoch": 0.49397086914995225, "grad_norm": 0.525219738483429, "learning_rate": 4.539474633780256e-05, "loss": 0.5942, "mean_token_accuracy": 0.8140939950942994, "num_tokens": 277258845.0, "step": 8275 }, { "epoch": 0.49426934097421205, "grad_norm": 0.5421309471130371, "learning_rate": 4.538801284634755e-05, "loss": 0.543, "mean_token_accuracy": 0.8292735338211059, "num_tokens": 277426525.0, "step": 8280 }, { "epoch": 0.49456781279847184, "grad_norm": 0.4715145528316498, "learning_rate": 4.538127499793683e-05, "loss": 0.5421, "mean_token_accuracy": 0.8290826678276062, "num_tokens": 277594205.0, "step": 8285 }, { "epoch": 0.49486628462273163, "grad_norm": 0.5990399122238159, "learning_rate": 4.5374532794211544e-05, "loss": 0.6005, "mean_token_accuracy": 0.8137480616569519, "num_tokens": 277761885.0, "step": 8290 }, { "epoch": 0.4951647564469914, "grad_norm": 0.6102491021156311, "learning_rate": 4.5367786236813856e-05, "loss": 0.5773, "mean_token_accuracy": 0.8196528553962708, "num_tokens": 277929565.0, "step": 8295 }, { "epoch": 0.4954632282712512, "grad_norm": 0.5606467127799988, "learning_rate": 4.536103532738704e-05, "loss": 0.5709, "mean_token_accuracy": 0.8191995739936828, "num_tokens": 278097245.0, "step": 8300 }, { "epoch": 0.49576170009551096, "grad_norm": 0.5498657822608948, "learning_rate": 4.535428006757539e-05, "loss": 0.5025, "mean_token_accuracy": 0.8396516799926758, "num_tokens": 278264925.0, "step": 8305 }, { "epoch": 0.49606017191977075, "grad_norm": 0.5730018615722656, "learning_rate": 4.5347520459024285e-05, "loss": 0.6072, "mean_token_accuracy": 0.8089765071868896, "num_tokens": 278432605.0, "step": 8310 }, { "epoch": 0.49635864374403055, "grad_norm": 0.5413969159126282, "learning_rate": 4.534075650338014e-05, "loss": 0.5681, "mean_token_accuracy": 0.8203984379768372, "num_tokens": 278600285.0, "step": 8315 }, { "epoch": 0.49665711556829034, "grad_norm": 0.5423916578292847, "learning_rate": 4.533398820229044e-05, "loss": 0.5756, "mean_token_accuracy": 0.8190504550933838, "num_tokens": 278767965.0, "step": 8320 }, { "epoch": 0.49695558739255014, "grad_norm": 0.5244458317756653, "learning_rate": 4.532721555740374e-05, "loss": 0.5436, "mean_token_accuracy": 0.8271263241767883, "num_tokens": 278935645.0, "step": 8325 }, { "epoch": 0.49725405921680993, "grad_norm": 0.6236850023269653, "learning_rate": 4.5320438570369636e-05, "loss": 0.5309, "mean_token_accuracy": 0.8316831588745117, "num_tokens": 279103325.0, "step": 8330 }, { "epoch": 0.4975525310410697, "grad_norm": 0.543181300163269, "learning_rate": 4.53136572428388e-05, "loss": 0.5671, "mean_token_accuracy": 0.8195753216743469, "num_tokens": 279271005.0, "step": 8335 }, { "epoch": 0.4978510028653295, "grad_norm": 0.5347437262535095, "learning_rate": 4.530687157646293e-05, "loss": 0.5683, "mean_token_accuracy": 0.8222831964492798, "num_tokens": 279438685.0, "step": 8340 }, { "epoch": 0.4981494746895893, "grad_norm": 0.5316510796546936, "learning_rate": 4.530008157289481e-05, "loss": 0.6315, "mean_token_accuracy": 0.8046284198760987, "num_tokens": 279606365.0, "step": 8345 }, { "epoch": 0.4984479465138491, "grad_norm": 0.6096969842910767, "learning_rate": 4.5293287233788276e-05, "loss": 0.5682, "mean_token_accuracy": 0.8230406880378723, "num_tokens": 279774045.0, "step": 8350 }, { "epoch": 0.4987464183381089, "grad_norm": 0.6531689167022705, "learning_rate": 4.528648856079821e-05, "loss": 0.5523, "mean_token_accuracy": 0.8257604598999023, "num_tokens": 279941725.0, "step": 8355 }, { "epoch": 0.4990448901623687, "grad_norm": 0.5208596587181091, "learning_rate": 4.527968555558055e-05, "loss": 0.5436, "mean_token_accuracy": 0.8273291230201721, "num_tokens": 280109405.0, "step": 8360 }, { "epoch": 0.4993433619866285, "grad_norm": 0.5364457368850708, "learning_rate": 4.52728782197923e-05, "loss": 0.5945, "mean_token_accuracy": 0.8145174741744995, "num_tokens": 280277085.0, "step": 8365 }, { "epoch": 0.4996418338108882, "grad_norm": 0.574241578578949, "learning_rate": 4.5266066555091506e-05, "loss": 0.5685, "mean_token_accuracy": 0.8201956391334534, "num_tokens": 280444765.0, "step": 8370 }, { "epoch": 0.499940305635148, "grad_norm": 0.5839861631393433, "learning_rate": 4.5259250563137284e-05, "loss": 0.5826, "mean_token_accuracy": 0.8177144289016723, "num_tokens": 280612445.0, "step": 8375 }, { "epoch": 0.5002387774594078, "grad_norm": 0.6105102896690369, "learning_rate": 4.5252430245589786e-05, "loss": 0.5819, "mean_token_accuracy": 0.8183048963546753, "num_tokens": 280780125.0, "step": 8380 }, { "epoch": 0.5005372492836676, "grad_norm": 0.6635046005249023, "learning_rate": 4.524560560411023e-05, "loss": 0.5976, "mean_token_accuracy": 0.8133543968200684, "num_tokens": 280947805.0, "step": 8385 }, { "epoch": 0.5008357211079274, "grad_norm": 0.6643062233924866, "learning_rate": 4.523877664036088e-05, "loss": 0.5587, "mean_token_accuracy": 0.8244661808013916, "num_tokens": 281115485.0, "step": 8390 }, { "epoch": 0.5011341929321872, "grad_norm": 0.5526153445243835, "learning_rate": 4.523194335600507e-05, "loss": 0.5678, "mean_token_accuracy": 0.8207622408866883, "num_tokens": 281283165.0, "step": 8395 }, { "epoch": 0.501432664756447, "grad_norm": 0.5357623100280762, "learning_rate": 4.522510575270715e-05, "loss": 0.5762, "mean_token_accuracy": 0.8183287620544434, "num_tokens": 281450845.0, "step": 8400 }, { "epoch": 0.5017311365807068, "grad_norm": 0.4978879988193512, "learning_rate": 4.521826383213256e-05, "loss": 0.5427, "mean_token_accuracy": 0.8289156675338745, "num_tokens": 281618525.0, "step": 8405 }, { "epoch": 0.5020296084049666, "grad_norm": 0.5199429392814636, "learning_rate": 4.521141759594776e-05, "loss": 0.5524, "mean_token_accuracy": 0.8254741668701172, "num_tokens": 281786205.0, "step": 8410 }, { "epoch": 0.5023280802292264, "grad_norm": 0.559115469455719, "learning_rate": 4.520456704582031e-05, "loss": 0.5762, "mean_token_accuracy": 0.8197184920310974, "num_tokens": 281953885.0, "step": 8415 }, { "epoch": 0.5026265520534862, "grad_norm": 0.59633868932724, "learning_rate": 4.519771218341876e-05, "loss": 0.6122, "mean_token_accuracy": 0.8071632981300354, "num_tokens": 282121565.0, "step": 8420 }, { "epoch": 0.502925023877746, "grad_norm": 0.6832320690155029, "learning_rate": 4.5190853010412745e-05, "loss": 0.5991, "mean_token_accuracy": 0.8122509837150573, "num_tokens": 282289245.0, "step": 8425 }, { "epoch": 0.5032234957020058, "grad_norm": 0.5692151784896851, "learning_rate": 4.518398952847294e-05, "loss": 0.5938, "mean_token_accuracy": 0.8133544206619263, "num_tokens": 282456925.0, "step": 8430 }, { "epoch": 0.5035219675262655, "grad_norm": 0.5501351952552795, "learning_rate": 4.5177121739271073e-05, "loss": 0.5354, "mean_token_accuracy": 0.8299713730812073, "num_tokens": 282624605.0, "step": 8435 }, { "epoch": 0.5038204393505253, "grad_norm": 0.557152509689331, "learning_rate": 4.5170249644479937e-05, "loss": 0.5753, "mean_token_accuracy": 0.8187999606132508, "num_tokens": 282792285.0, "step": 8440 }, { "epoch": 0.5041189111747851, "grad_norm": 0.6025996208190918, "learning_rate": 4.516337324577333e-05, "loss": 0.6139, "mean_token_accuracy": 0.80866037607193, "num_tokens": 282959965.0, "step": 8445 }, { "epoch": 0.5044173829990449, "grad_norm": 0.5659865140914917, "learning_rate": 4.5156492544826146e-05, "loss": 0.5736, "mean_token_accuracy": 0.8184062838554382, "num_tokens": 283127645.0, "step": 8450 }, { "epoch": 0.5047158548233047, "grad_norm": 0.5491483211517334, "learning_rate": 4.5149607543314306e-05, "loss": 0.541, "mean_token_accuracy": 0.8302159190177918, "num_tokens": 283295325.0, "step": 8455 }, { "epoch": 0.5050143266475645, "grad_norm": 0.6142468452453613, "learning_rate": 4.514271824291477e-05, "loss": 0.579, "mean_token_accuracy": 0.8175414562225342, "num_tokens": 283463005.0, "step": 8460 }, { "epoch": 0.5053127984718243, "grad_norm": 0.6395441889762878, "learning_rate": 4.513582464530556e-05, "loss": 0.5975, "mean_token_accuracy": 0.814022445678711, "num_tokens": 283630685.0, "step": 8465 }, { "epoch": 0.5056112702960841, "grad_norm": 0.5647761821746826, "learning_rate": 4.512892675216574e-05, "loss": 0.5598, "mean_token_accuracy": 0.8255397796630859, "num_tokens": 283798365.0, "step": 8470 }, { "epoch": 0.5059097421203438, "grad_norm": 0.5564920902252197, "learning_rate": 4.5122024565175414e-05, "loss": 0.5952, "mean_token_accuracy": 0.8134021162986755, "num_tokens": 283966045.0, "step": 8475 }, { "epoch": 0.5062082139446036, "grad_norm": 0.5405991673469543, "learning_rate": 4.511511808601574e-05, "loss": 0.5433, "mean_token_accuracy": 0.8270010828971863, "num_tokens": 284133725.0, "step": 8480 }, { "epoch": 0.5065066857688634, "grad_norm": 0.546653151512146, "learning_rate": 4.5108207316368915e-05, "loss": 0.5887, "mean_token_accuracy": 0.8135870218276977, "num_tokens": 284301405.0, "step": 8485 }, { "epoch": 0.5068051575931232, "grad_norm": 0.6025099754333496, "learning_rate": 4.5101292257918184e-05, "loss": 0.5453, "mean_token_accuracy": 0.8282893896102905, "num_tokens": 284469085.0, "step": 8490 }, { "epoch": 0.507103629417383, "grad_norm": 0.5866109132766724, "learning_rate": 4.509437291234786e-05, "loss": 0.6112, "mean_token_accuracy": 0.8091912150382996, "num_tokens": 284636765.0, "step": 8495 }, { "epoch": 0.5074021012416428, "grad_norm": 0.5114835500717163, "learning_rate": 4.508744928134324e-05, "loss": 0.5474, "mean_token_accuracy": 0.8278122425079346, "num_tokens": 284804445.0, "step": 8500 }, { "epoch": 0.5077005730659025, "grad_norm": 0.54168701171875, "learning_rate": 4.5080521366590724e-05, "loss": 0.5361, "mean_token_accuracy": 0.833245849609375, "num_tokens": 284972125.0, "step": 8505 }, { "epoch": 0.5079990448901623, "grad_norm": 0.5267937183380127, "learning_rate": 4.507358916977774e-05, "loss": 0.5503, "mean_token_accuracy": 0.8275855898857116, "num_tokens": 285139805.0, "step": 8510 }, { "epoch": 0.5082975167144221, "grad_norm": 0.5776879787445068, "learning_rate": 4.5066652692592734e-05, "loss": 0.5735, "mean_token_accuracy": 0.8207443714141845, "num_tokens": 285307485.0, "step": 8515 }, { "epoch": 0.5085959885386819, "grad_norm": 0.6104937791824341, "learning_rate": 4.505971193672522e-05, "loss": 0.6164, "mean_token_accuracy": 0.8079148411750794, "num_tokens": 285475165.0, "step": 8520 }, { "epoch": 0.5088944603629417, "grad_norm": 0.6085206866264343, "learning_rate": 4.505276690386575e-05, "loss": 0.559, "mean_token_accuracy": 0.824561607837677, "num_tokens": 285642845.0, "step": 8525 }, { "epoch": 0.5091929321872015, "grad_norm": 0.5954418182373047, "learning_rate": 4.504581759570591e-05, "loss": 0.5224, "mean_token_accuracy": 0.834122633934021, "num_tokens": 285810525.0, "step": 8530 }, { "epoch": 0.5094914040114613, "grad_norm": 0.5252765417098999, "learning_rate": 4.503886401393834e-05, "loss": 0.5973, "mean_token_accuracy": 0.8123881697654725, "num_tokens": 285978205.0, "step": 8535 }, { "epoch": 0.5097898758357211, "grad_norm": 0.6340962052345276, "learning_rate": 4.503190616025672e-05, "loss": 0.5953, "mean_token_accuracy": 0.8133245825767517, "num_tokens": 286145885.0, "step": 8540 }, { "epoch": 0.5100883476599809, "grad_norm": 0.5511323809623718, "learning_rate": 4.5024944036355735e-05, "loss": 0.5703, "mean_token_accuracy": 0.8201662182807923, "num_tokens": 286305497.0, "step": 8545 }, { "epoch": 0.5103868194842407, "grad_norm": 0.6255331635475159, "learning_rate": 4.501797764393116e-05, "loss": 0.601, "mean_token_accuracy": 0.8114756107330322, "num_tokens": 286473177.0, "step": 8550 }, { "epoch": 0.5106852913085005, "grad_norm": 0.5895081758499146, "learning_rate": 4.50110069846798e-05, "loss": 0.5892, "mean_token_accuracy": 0.8159310460090637, "num_tokens": 286640857.0, "step": 8555 }, { "epoch": 0.5109837631327603, "grad_norm": 0.555617094039917, "learning_rate": 4.5004032060299455e-05, "loss": 0.5521, "mean_token_accuracy": 0.8255636334419251, "num_tokens": 286808537.0, "step": 8560 }, { "epoch": 0.5112822349570201, "grad_norm": 0.5435903072357178, "learning_rate": 4.4997052872489023e-05, "loss": 0.5364, "mean_token_accuracy": 0.8305618405342102, "num_tokens": 286976217.0, "step": 8565 }, { "epoch": 0.5115807067812799, "grad_norm": 0.6119329333305359, "learning_rate": 4.49900694229484e-05, "loss": 0.5342, "mean_token_accuracy": 0.8312954783439637, "num_tokens": 287143897.0, "step": 8570 }, { "epoch": 0.5118791786055397, "grad_norm": 0.5741212368011475, "learning_rate": 4.498308171337854e-05, "loss": 0.5755, "mean_token_accuracy": 0.8186866164207458, "num_tokens": 287311577.0, "step": 8575 }, { "epoch": 0.5121776504297995, "grad_norm": 0.5080263614654541, "learning_rate": 4.4976089745481437e-05, "loss": 0.5224, "mean_token_accuracy": 0.8341643571853637, "num_tokens": 287479257.0, "step": 8580 }, { "epoch": 0.5124761222540593, "grad_norm": 0.6037417054176331, "learning_rate": 4.4969093520960096e-05, "loss": 0.5774, "mean_token_accuracy": 0.8187522411346435, "num_tokens": 287646937.0, "step": 8585 }, { "epoch": 0.512774594078319, "grad_norm": 0.5993456840515137, "learning_rate": 4.496209304151859e-05, "loss": 0.5495, "mean_token_accuracy": 0.8264583110809326, "num_tokens": 287814617.0, "step": 8590 }, { "epoch": 0.5130730659025788, "grad_norm": 0.6401481628417969, "learning_rate": 4.495508830886201e-05, "loss": 0.5836, "mean_token_accuracy": 0.8159489393234253, "num_tokens": 287982297.0, "step": 8595 }, { "epoch": 0.5133715377268386, "grad_norm": 0.5762320756912231, "learning_rate": 4.494807932469649e-05, "loss": 0.59, "mean_token_accuracy": 0.813421881198883, "num_tokens": 288141450.0, "step": 8600 }, { "epoch": 0.5136700095510984, "grad_norm": 0.5389105677604675, "learning_rate": 4.494106609072919e-05, "loss": 0.5402, "mean_token_accuracy": 0.829046881198883, "num_tokens": 288309130.0, "step": 8605 }, { "epoch": 0.5139684813753582, "grad_norm": 0.5098772644996643, "learning_rate": 4.493404860866832e-05, "loss": 0.5181, "mean_token_accuracy": 0.8354884862899781, "num_tokens": 288476810.0, "step": 8610 }, { "epoch": 0.5142669531996179, "grad_norm": 0.5440981984138489, "learning_rate": 4.4927026880223117e-05, "loss": 0.5643, "mean_token_accuracy": 0.8219611048698425, "num_tokens": 288644490.0, "step": 8615 }, { "epoch": 0.5145654250238777, "grad_norm": 0.5773564577102661, "learning_rate": 4.492000090710385e-05, "loss": 0.5593, "mean_token_accuracy": 0.8247703671455383, "num_tokens": 288812170.0, "step": 8620 }, { "epoch": 0.5148638968481375, "grad_norm": 0.6357622742652893, "learning_rate": 4.4912970691021824e-05, "loss": 0.5455, "mean_token_accuracy": 0.8289633870124817, "num_tokens": 288979850.0, "step": 8625 }, { "epoch": 0.5151623686723973, "grad_norm": 0.6056704521179199, "learning_rate": 4.490593623368938e-05, "loss": 0.5649, "mean_token_accuracy": 0.8217642784118653, "num_tokens": 289147530.0, "step": 8630 }, { "epoch": 0.5154608404966571, "grad_norm": 0.5196501612663269, "learning_rate": 4.489889753681989e-05, "loss": 0.5532, "mean_token_accuracy": 0.8234880089759826, "num_tokens": 289315210.0, "step": 8635 }, { "epoch": 0.5157593123209169, "grad_norm": 0.5886099338531494, "learning_rate": 4.489185460212777e-05, "loss": 0.5608, "mean_token_accuracy": 0.8236609697341919, "num_tokens": 289482890.0, "step": 8640 }, { "epoch": 0.5160577841451767, "grad_norm": 0.5845518708229065, "learning_rate": 4.488480743132843e-05, "loss": 0.5837, "mean_token_accuracy": 0.8166706323623657, "num_tokens": 289650570.0, "step": 8645 }, { "epoch": 0.5163562559694365, "grad_norm": 0.5307056903839111, "learning_rate": 4.487775602613836e-05, "loss": 0.5353, "mean_token_accuracy": 0.8278703212738037, "num_tokens": 289815987.0, "step": 8650 }, { "epoch": 0.5166547277936963, "grad_norm": 0.5496380925178528, "learning_rate": 4.487070038827505e-05, "loss": 0.5607, "mean_token_accuracy": 0.8231361150741577, "num_tokens": 289983667.0, "step": 8655 }, { "epoch": 0.516953199617956, "grad_norm": 0.5331660509109497, "learning_rate": 4.4863640519457034e-05, "loss": 0.552, "mean_token_accuracy": 0.8254204988479614, "num_tokens": 290151347.0, "step": 8660 }, { "epoch": 0.5172516714422158, "grad_norm": 0.5752771496772766, "learning_rate": 4.485657642140387e-05, "loss": 0.5264, "mean_token_accuracy": 0.8333114743232727, "num_tokens": 290319027.0, "step": 8665 }, { "epoch": 0.5175501432664756, "grad_norm": 0.47481390833854675, "learning_rate": 4.484950809583616e-05, "loss": 0.509, "mean_token_accuracy": 0.8388835072517395, "num_tokens": 290482019.0, "step": 8670 }, { "epoch": 0.5178486150907354, "grad_norm": 0.5849865674972534, "learning_rate": 4.484243554447552e-05, "loss": 0.6146, "mean_token_accuracy": 0.8089228272438049, "num_tokens": 290649699.0, "step": 8675 }, { "epoch": 0.5181470869149952, "grad_norm": 0.6237294673919678, "learning_rate": 4.483535876904459e-05, "loss": 0.5992, "mean_token_accuracy": 0.8107419848442078, "num_tokens": 290817379.0, "step": 8680 }, { "epoch": 0.518445558739255, "grad_norm": 0.554248034954071, "learning_rate": 4.482827777126706e-05, "loss": 0.5654, "mean_token_accuracy": 0.8227603554725647, "num_tokens": 290985059.0, "step": 8685 }, { "epoch": 0.5187440305635148, "grad_norm": 0.6273401975631714, "learning_rate": 4.4821192552867636e-05, "loss": 0.5713, "mean_token_accuracy": 0.8194798946380615, "num_tokens": 291152739.0, "step": 8690 }, { "epoch": 0.5190425023877746, "grad_norm": 0.563341498374939, "learning_rate": 4.481410311557206e-05, "loss": 0.5913, "mean_token_accuracy": 0.814034354686737, "num_tokens": 291320419.0, "step": 8695 }, { "epoch": 0.5193409742120344, "grad_norm": 0.5764652490615845, "learning_rate": 4.480700946110708e-05, "loss": 0.5654, "mean_token_accuracy": 0.8213109850883484, "num_tokens": 291488099.0, "step": 8700 }, { "epoch": 0.5196394460362942, "grad_norm": 0.5594584345817566, "learning_rate": 4.47999115912005e-05, "loss": 0.5275, "mean_token_accuracy": 0.831641411781311, "num_tokens": 291655779.0, "step": 8705 }, { "epoch": 0.519937917860554, "grad_norm": 0.5473382472991943, "learning_rate": 4.4792809507581126e-05, "loss": 0.5885, "mean_token_accuracy": 0.8138613820075988, "num_tokens": 291823459.0, "step": 8710 }, { "epoch": 0.5202363896848138, "grad_norm": 0.5849176049232483, "learning_rate": 4.478570321197881e-05, "loss": 0.5742, "mean_token_accuracy": 0.8186761736869812, "num_tokens": 291986614.0, "step": 8715 }, { "epoch": 0.5205348615090736, "grad_norm": 0.4745195209980011, "learning_rate": 4.4778592706124426e-05, "loss": 0.5296, "mean_token_accuracy": 0.8316831707954406, "num_tokens": 292154294.0, "step": 8720 }, { "epoch": 0.5208333333333334, "grad_norm": 0.5845153331756592, "learning_rate": 4.477147799174986e-05, "loss": 0.6311, "mean_token_accuracy": 0.8032446503639221, "num_tokens": 292321974.0, "step": 8725 }, { "epoch": 0.5211318051575932, "grad_norm": 0.5913403034210205, "learning_rate": 4.476435907058802e-05, "loss": 0.6044, "mean_token_accuracy": 0.8127877831459045, "num_tokens": 292489654.0, "step": 8730 }, { "epoch": 0.521430276981853, "grad_norm": 0.5641769170761108, "learning_rate": 4.475723594437289e-05, "loss": 0.5772, "mean_token_accuracy": 0.8175951361656189, "num_tokens": 292657334.0, "step": 8735 }, { "epoch": 0.5217287488061128, "grad_norm": 0.5563650727272034, "learning_rate": 4.475010861483939e-05, "loss": 0.6175, "mean_token_accuracy": 0.8094894409179687, "num_tokens": 292825014.0, "step": 8740 }, { "epoch": 0.5220272206303725, "grad_norm": 0.5406704545021057, "learning_rate": 4.474297708372355e-05, "loss": 0.579, "mean_token_accuracy": 0.8181498289108277, "num_tokens": 292992694.0, "step": 8745 }, { "epoch": 0.5223256924546322, "grad_norm": 0.6333975195884705, "learning_rate": 4.4735841352762367e-05, "loss": 0.6105, "mean_token_accuracy": 0.8097817182540894, "num_tokens": 293160374.0, "step": 8750 }, { "epoch": 0.522624164278892, "grad_norm": 0.6653600931167603, "learning_rate": 4.472870142369388e-05, "loss": 0.5534, "mean_token_accuracy": 0.8252177119255066, "num_tokens": 293328054.0, "step": 8755 }, { "epoch": 0.5229226361031518, "grad_norm": 0.5845597386360168, "learning_rate": 4.472155729825717e-05, "loss": 0.5636, "mean_token_accuracy": 0.8220207691192627, "num_tokens": 293495734.0, "step": 8760 }, { "epoch": 0.5232211079274116, "grad_norm": 0.459767609834671, "learning_rate": 4.4714408978192294e-05, "loss": 0.5244, "mean_token_accuracy": 0.8326732754707337, "num_tokens": 293663414.0, "step": 8765 }, { "epoch": 0.5235195797516714, "grad_norm": 0.521870493888855, "learning_rate": 4.470725646524038e-05, "loss": 0.5627, "mean_token_accuracy": 0.8234760761260986, "num_tokens": 293831094.0, "step": 8770 }, { "epoch": 0.5238180515759312, "grad_norm": 0.5080561637878418, "learning_rate": 4.4700099761143535e-05, "loss": 0.5466, "mean_token_accuracy": 0.8269712448120117, "num_tokens": 293998774.0, "step": 8775 }, { "epoch": 0.524116523400191, "grad_norm": 0.5342180132865906, "learning_rate": 4.469293886764493e-05, "loss": 0.5558, "mean_token_accuracy": 0.8248896598815918, "num_tokens": 294166454.0, "step": 8780 }, { "epoch": 0.5244149952244508, "grad_norm": 0.599281907081604, "learning_rate": 4.4685773786488704e-05, "loss": 0.5696, "mean_token_accuracy": 0.8206191062927246, "num_tokens": 294334134.0, "step": 8785 }, { "epoch": 0.5247134670487106, "grad_norm": 0.513864278793335, "learning_rate": 4.467860451942006e-05, "loss": 0.5804, "mean_token_accuracy": 0.81725515127182, "num_tokens": 294501814.0, "step": 8790 }, { "epoch": 0.5250119388729704, "grad_norm": 0.4977003335952759, "learning_rate": 4.467143106818521e-05, "loss": 0.5485, "mean_token_accuracy": 0.8274842023849487, "num_tokens": 294669494.0, "step": 8795 }, { "epoch": 0.5253104106972302, "grad_norm": 0.6562310457229614, "learning_rate": 4.4664253434531375e-05, "loss": 0.5537, "mean_token_accuracy": 0.8244960069656372, "num_tokens": 294837174.0, "step": 8800 }, { "epoch": 0.52560888252149, "grad_norm": 0.53802090883255, "learning_rate": 4.46570716202068e-05, "loss": 0.5485, "mean_token_accuracy": 0.8254324197769165, "num_tokens": 295004854.0, "step": 8805 }, { "epoch": 0.5259073543457498, "grad_norm": 0.5729022026062012, "learning_rate": 4.464988562696075e-05, "loss": 0.5364, "mean_token_accuracy": 0.8288023352622986, "num_tokens": 295172534.0, "step": 8810 }, { "epoch": 0.5262058261700095, "grad_norm": 0.5360018014907837, "learning_rate": 4.46426954565435e-05, "loss": 0.5878, "mean_token_accuracy": 0.8148276329040527, "num_tokens": 295340214.0, "step": 8815 }, { "epoch": 0.5265042979942693, "grad_norm": 0.5254443287849426, "learning_rate": 4.463550111070635e-05, "loss": 0.5877, "mean_token_accuracy": 0.8135631561279297, "num_tokens": 295507894.0, "step": 8820 }, { "epoch": 0.5268027698185291, "grad_norm": 0.5107590556144714, "learning_rate": 4.462830259120163e-05, "loss": 0.5163, "mean_token_accuracy": 0.8352976202964782, "num_tokens": 295675574.0, "step": 8825 }, { "epoch": 0.5271012416427889, "grad_norm": 0.5577269792556763, "learning_rate": 4.462109989978265e-05, "loss": 0.5534, "mean_token_accuracy": 0.8250566601753235, "num_tokens": 295843254.0, "step": 8830 }, { "epoch": 0.5273997134670487, "grad_norm": 0.603557288646698, "learning_rate": 4.461389303820378e-05, "loss": 0.5516, "mean_token_accuracy": 0.8260050177574157, "num_tokens": 296010934.0, "step": 8835 }, { "epoch": 0.5276981852913085, "grad_norm": 0.5219425559043884, "learning_rate": 4.460668200822037e-05, "loss": 0.5637, "mean_token_accuracy": 0.8235178351402282, "num_tokens": 296178614.0, "step": 8840 }, { "epoch": 0.5279966571155683, "grad_norm": 0.5341835021972656, "learning_rate": 4.4599466811588805e-05, "loss": 0.5579, "mean_token_accuracy": 0.8254085659980774, "num_tokens": 296346294.0, "step": 8845 }, { "epoch": 0.5282951289398281, "grad_norm": 0.5543561577796936, "learning_rate": 4.459224745006648e-05, "loss": 0.5876, "mean_token_accuracy": 0.8151079535484314, "num_tokens": 296513974.0, "step": 8850 }, { "epoch": 0.5285936007640879, "grad_norm": 0.5444608926773071, "learning_rate": 4.45850239254118e-05, "loss": 0.5706, "mean_token_accuracy": 0.819837772846222, "num_tokens": 296681654.0, "step": 8855 }, { "epoch": 0.5288920725883477, "grad_norm": 0.5355438590049744, "learning_rate": 4.45777962393842e-05, "loss": 0.5358, "mean_token_accuracy": 0.830788505077362, "num_tokens": 296849334.0, "step": 8860 }, { "epoch": 0.5291905444126075, "grad_norm": 0.5682732462882996, "learning_rate": 4.457056439374408e-05, "loss": 0.5848, "mean_token_accuracy": 0.8136884212493897, "num_tokens": 297017014.0, "step": 8865 }, { "epoch": 0.5294890162368673, "grad_norm": 0.5435397624969482, "learning_rate": 4.456332839025293e-05, "loss": 0.5794, "mean_token_accuracy": 0.817117965221405, "num_tokens": 297184694.0, "step": 8870 }, { "epoch": 0.5297874880611271, "grad_norm": 0.5456133484840393, "learning_rate": 4.4556088230673207e-05, "loss": 0.5567, "mean_token_accuracy": 0.82181795835495, "num_tokens": 297352374.0, "step": 8875 }, { "epoch": 0.5300859598853869, "grad_norm": 0.5780700445175171, "learning_rate": 4.4548843916768364e-05, "loss": 0.6033, "mean_token_accuracy": 0.8101037859916687, "num_tokens": 297520054.0, "step": 8880 }, { "epoch": 0.5303844317096467, "grad_norm": 0.5610108971595764, "learning_rate": 4.45415954503029e-05, "loss": 0.5674, "mean_token_accuracy": 0.8212632656097412, "num_tokens": 297687734.0, "step": 8885 }, { "epoch": 0.5306829035339063, "grad_norm": 0.5550732612609863, "learning_rate": 4.453434283304232e-05, "loss": 0.6066, "mean_token_accuracy": 0.810050094127655, "num_tokens": 297855414.0, "step": 8890 }, { "epoch": 0.5309813753581661, "grad_norm": 0.5507266521453857, "learning_rate": 4.452708606675311e-05, "loss": 0.5778, "mean_token_accuracy": 0.8170404314994812, "num_tokens": 298023094.0, "step": 8895 }, { "epoch": 0.5312798471824259, "grad_norm": 0.5527105331420898, "learning_rate": 4.4519825153202824e-05, "loss": 0.6087, "mean_token_accuracy": 0.8090003609657288, "num_tokens": 298190774.0, "step": 8900 }, { "epoch": 0.5315783190066857, "grad_norm": 0.5168611407279968, "learning_rate": 4.451256009415996e-05, "loss": 0.5489, "mean_token_accuracy": 0.8256888866424561, "num_tokens": 298358454.0, "step": 8905 }, { "epoch": 0.5318767908309455, "grad_norm": 0.5142448544502258, "learning_rate": 4.450529089139408e-05, "loss": 0.572, "mean_token_accuracy": 0.8191995859146118, "num_tokens": 298526134.0, "step": 8910 }, { "epoch": 0.5321752626552053, "grad_norm": 0.5494459271430969, "learning_rate": 4.4498017546675714e-05, "loss": 0.5446, "mean_token_accuracy": 0.8277227640151977, "num_tokens": 298693814.0, "step": 8915 }, { "epoch": 0.5324737344794651, "grad_norm": 0.6558916568756104, "learning_rate": 4.449074006177642e-05, "loss": 0.5685, "mean_token_accuracy": 0.8241262197494507, "num_tokens": 298861494.0, "step": 8920 }, { "epoch": 0.5327722063037249, "grad_norm": 0.642120897769928, "learning_rate": 4.448345843846878e-05, "loss": 0.6123, "mean_token_accuracy": 0.8097459197044372, "num_tokens": 299029174.0, "step": 8925 }, { "epoch": 0.5330706781279847, "grad_norm": 0.5801108479499817, "learning_rate": 4.447617267852635e-05, "loss": 0.5795, "mean_token_accuracy": 0.8184778809547424, "num_tokens": 299196854.0, "step": 8930 }, { "epoch": 0.5333691499522445, "grad_norm": 0.5552683472633362, "learning_rate": 4.446888278372373e-05, "loss": 0.5623, "mean_token_accuracy": 0.8219969034194946, "num_tokens": 299364534.0, "step": 8935 }, { "epoch": 0.5336676217765043, "grad_norm": 0.5098156929016113, "learning_rate": 4.446158875583649e-05, "loss": 0.5423, "mean_token_accuracy": 0.8268102049827576, "num_tokens": 299532214.0, "step": 8940 }, { "epoch": 0.5339660936007641, "grad_norm": 0.6488260626792908, "learning_rate": 4.445429059664124e-05, "loss": 0.6259, "mean_token_accuracy": 0.8039365291595459, "num_tokens": 299699894.0, "step": 8945 }, { "epoch": 0.5342645654250239, "grad_norm": 0.531151294708252, "learning_rate": 4.444698830791557e-05, "loss": 0.5675, "mean_token_accuracy": 0.8217225313186646, "num_tokens": 299867574.0, "step": 8950 }, { "epoch": 0.5345630372492837, "grad_norm": 0.5506296157836914, "learning_rate": 4.4439681891438096e-05, "loss": 0.5476, "mean_token_accuracy": 0.8268996834754944, "num_tokens": 300035254.0, "step": 8955 }, { "epoch": 0.5348615090735435, "grad_norm": 0.5589233040809631, "learning_rate": 4.443237134898843e-05, "loss": 0.5473, "mean_token_accuracy": 0.8274662971496582, "num_tokens": 300202934.0, "step": 8960 }, { "epoch": 0.5351599808978033, "grad_norm": 0.5993244647979736, "learning_rate": 4.442505668234718e-05, "loss": 0.5338, "mean_token_accuracy": 0.8296969890594482, "num_tokens": 300370614.0, "step": 8965 }, { "epoch": 0.535458452722063, "grad_norm": 0.5460119843482971, "learning_rate": 4.441773789329598e-05, "loss": 0.6338, "mean_token_accuracy": 0.8036979675292969, "num_tokens": 300538294.0, "step": 8970 }, { "epoch": 0.5357569245463228, "grad_norm": 0.6032574772834778, "learning_rate": 4.4410414983617446e-05, "loss": 0.5501, "mean_token_accuracy": 0.8245258331298828, "num_tokens": 300705974.0, "step": 8975 }, { "epoch": 0.5360553963705826, "grad_norm": 0.6453452110290527, "learning_rate": 4.44030879550952e-05, "loss": 0.5489, "mean_token_accuracy": 0.82506263256073, "num_tokens": 300873654.0, "step": 8980 }, { "epoch": 0.5363538681948424, "grad_norm": 0.5219424366950989, "learning_rate": 4.4395756809513906e-05, "loss": 0.5633, "mean_token_accuracy": 0.8222533583641052, "num_tokens": 301041334.0, "step": 8985 }, { "epoch": 0.5366523400191022, "grad_norm": 0.5612058043479919, "learning_rate": 4.438842154865917e-05, "loss": 0.6, "mean_token_accuracy": 0.8105272650718689, "num_tokens": 301209014.0, "step": 8990 }, { "epoch": 0.536950811843362, "grad_norm": 0.6003037095069885, "learning_rate": 4.438108217431765e-05, "loss": 0.621, "mean_token_accuracy": 0.8066145658493042, "num_tokens": 301376694.0, "step": 8995 }, { "epoch": 0.5372492836676218, "grad_norm": 0.5480248332023621, "learning_rate": 4.437373868827698e-05, "loss": 0.5523, "mean_token_accuracy": 0.8252296328544617, "num_tokens": 301544374.0, "step": 9000 }, { "epoch": 0.5375477554918816, "grad_norm": 0.5819894671440125, "learning_rate": 4.436639109232579e-05, "loss": 0.5816, "mean_token_accuracy": 0.8197363615036011, "num_tokens": 301712054.0, "step": 9005 }, { "epoch": 0.5378462273161414, "grad_norm": 0.5392967462539673, "learning_rate": 4.4359039388253726e-05, "loss": 0.5378, "mean_token_accuracy": 0.8287963747978211, "num_tokens": 301879734.0, "step": 9010 }, { "epoch": 0.5381446991404012, "grad_norm": 0.5332432985305786, "learning_rate": 4.435168357785145e-05, "loss": 0.5237, "mean_token_accuracy": 0.8327225804328918, "num_tokens": 302046313.0, "step": 9015 }, { "epoch": 0.538443170964661, "grad_norm": 0.5505347847938538, "learning_rate": 4.434432366291058e-05, "loss": 0.5227, "mean_token_accuracy": 0.833860182762146, "num_tokens": 302213993.0, "step": 9020 }, { "epoch": 0.5387416427889207, "grad_norm": 0.5260748267173767, "learning_rate": 4.433695964522378e-05, "loss": 0.5976, "mean_token_accuracy": 0.8123762369155884, "num_tokens": 302381673.0, "step": 9025 }, { "epoch": 0.5390401146131805, "grad_norm": 0.5036420226097107, "learning_rate": 4.432959152658469e-05, "loss": 0.5824, "mean_token_accuracy": 0.8154956459999084, "num_tokens": 302549353.0, "step": 9030 }, { "epoch": 0.5393385864374403, "grad_norm": 0.5186771750450134, "learning_rate": 4.432221930878793e-05, "loss": 0.5687, "mean_token_accuracy": 0.820237398147583, "num_tokens": 302717033.0, "step": 9035 }, { "epoch": 0.5396370582617, "grad_norm": 0.5144567489624023, "learning_rate": 4.431484299362916e-05, "loss": 0.5814, "mean_token_accuracy": 0.8172253370285034, "num_tokens": 302884713.0, "step": 9040 }, { "epoch": 0.5399355300859598, "grad_norm": 0.5339194536209106, "learning_rate": 4.430746258290502e-05, "loss": 0.5156, "mean_token_accuracy": 0.835965633392334, "num_tokens": 303052393.0, "step": 9045 }, { "epoch": 0.5402340019102196, "grad_norm": 0.560391366481781, "learning_rate": 4.4300078078413126e-05, "loss": 0.5771, "mean_token_accuracy": 0.8182512283325195, "num_tokens": 303220073.0, "step": 9050 }, { "epoch": 0.5405324737344794, "grad_norm": 0.6061593890190125, "learning_rate": 4.429268948195213e-05, "loss": 0.5785, "mean_token_accuracy": 0.8181557774543762, "num_tokens": 303387753.0, "step": 9055 }, { "epoch": 0.5408309455587392, "grad_norm": 0.6637635231018066, "learning_rate": 4.428529679532165e-05, "loss": 0.5747, "mean_token_accuracy": 0.8197304129600524, "num_tokens": 303555433.0, "step": 9060 }, { "epoch": 0.541129417382999, "grad_norm": 0.5166680812835693, "learning_rate": 4.4277900020322325e-05, "loss": 0.5716, "mean_token_accuracy": 0.8197423338890075, "num_tokens": 303723113.0, "step": 9065 }, { "epoch": 0.5414278892072588, "grad_norm": 0.5843801498413086, "learning_rate": 4.427049915875575e-05, "loss": 0.5736, "mean_token_accuracy": 0.8185255885124206, "num_tokens": 303890793.0, "step": 9070 }, { "epoch": 0.5417263610315186, "grad_norm": 0.5218236446380615, "learning_rate": 4.4263094212424564e-05, "loss": 0.5723, "mean_token_accuracy": 0.8196767210960388, "num_tokens": 304058473.0, "step": 9075 }, { "epoch": 0.5420248328557784, "grad_norm": 0.525384783744812, "learning_rate": 4.425568518313237e-05, "loss": 0.5496, "mean_token_accuracy": 0.8261362314224243, "num_tokens": 304226153.0, "step": 9080 }, { "epoch": 0.5423233046800382, "grad_norm": 0.563209593296051, "learning_rate": 4.424827207268377e-05, "loss": 0.5669, "mean_token_accuracy": 0.8216509580612182, "num_tokens": 304393833.0, "step": 9085 }, { "epoch": 0.542621776504298, "grad_norm": 0.576561450958252, "learning_rate": 4.424085488288438e-05, "loss": 0.5772, "mean_token_accuracy": 0.8167839646339417, "num_tokens": 304561513.0, "step": 9090 }, { "epoch": 0.5429202483285578, "grad_norm": 0.49649518728256226, "learning_rate": 4.423343361554077e-05, "loss": 0.5389, "mean_token_accuracy": 0.829410707950592, "num_tokens": 304729193.0, "step": 9095 }, { "epoch": 0.5432187201528176, "grad_norm": 0.6102119088172913, "learning_rate": 4.422600827246055e-05, "loss": 0.5425, "mean_token_accuracy": 0.8270070314407348, "num_tokens": 304896873.0, "step": 9100 }, { "epoch": 0.5435171919770774, "grad_norm": 0.5200836658477783, "learning_rate": 4.421857885545229e-05, "loss": 0.5408, "mean_token_accuracy": 0.8289395093917846, "num_tokens": 305064553.0, "step": 9105 }, { "epoch": 0.5438156638013372, "grad_norm": 0.5582632422447205, "learning_rate": 4.4211145366325545e-05, "loss": 0.5569, "mean_token_accuracy": 0.823833966255188, "num_tokens": 305232233.0, "step": 9110 }, { "epoch": 0.544114135625597, "grad_norm": 0.5659685730934143, "learning_rate": 4.42037078068909e-05, "loss": 0.5809, "mean_token_accuracy": 0.8184301614761352, "num_tokens": 305399913.0, "step": 9115 }, { "epoch": 0.5444126074498568, "grad_norm": 0.5390270948410034, "learning_rate": 4.419626617895991e-05, "loss": 0.5738, "mean_token_accuracy": 0.8193188667297363, "num_tokens": 305567593.0, "step": 9120 }, { "epoch": 0.5447110792741165, "grad_norm": 0.5471552014350891, "learning_rate": 4.4188820484345116e-05, "loss": 0.5273, "mean_token_accuracy": 0.8319873452186585, "num_tokens": 305735273.0, "step": 9125 }, { "epoch": 0.5450095510983763, "grad_norm": 0.5766529440879822, "learning_rate": 4.418137072486005e-05, "loss": 0.5263, "mean_token_accuracy": 0.83276869058609, "num_tokens": 305902953.0, "step": 9130 }, { "epoch": 0.5453080229226361, "grad_norm": 0.5826560258865356, "learning_rate": 4.417391690231924e-05, "loss": 0.5802, "mean_token_accuracy": 0.8160622596740723, "num_tokens": 306070633.0, "step": 9135 }, { "epoch": 0.5456064947468959, "grad_norm": 0.5199210047721863, "learning_rate": 4.4166459018538196e-05, "loss": 0.5091, "mean_token_accuracy": 0.8379756569862366, "num_tokens": 306238313.0, "step": 9140 }, { "epoch": 0.5459049665711557, "grad_norm": 0.5484384894371033, "learning_rate": 4.415899707533344e-05, "loss": 0.5642, "mean_token_accuracy": 0.8205177307128906, "num_tokens": 306405993.0, "step": 9145 }, { "epoch": 0.5462034383954155, "grad_norm": 0.551522970199585, "learning_rate": 4.415153107452245e-05, "loss": 0.5351, "mean_token_accuracy": 0.8285518169403077, "num_tokens": 306573673.0, "step": 9150 }, { "epoch": 0.5465019102196753, "grad_norm": 0.5362953543663025, "learning_rate": 4.414406101792372e-05, "loss": 0.5528, "mean_token_accuracy": 0.8251699924468994, "num_tokens": 306741353.0, "step": 9155 }, { "epoch": 0.5468003820439351, "grad_norm": 0.5488634705543518, "learning_rate": 4.413658690735671e-05, "loss": 0.544, "mean_token_accuracy": 0.8298699736595154, "num_tokens": 306909033.0, "step": 9160 }, { "epoch": 0.5470988538681948, "grad_norm": 0.5442475080490112, "learning_rate": 4.412910874464189e-05, "loss": 0.5801, "mean_token_accuracy": 0.8183048963546753, "num_tokens": 307076713.0, "step": 9165 }, { "epoch": 0.5473973256924546, "grad_norm": 0.6032072305679321, "learning_rate": 4.412162653160068e-05, "loss": 0.5838, "mean_token_accuracy": 0.8158356428146363, "num_tokens": 307244393.0, "step": 9170 }, { "epoch": 0.5476957975167144, "grad_norm": 0.5607478022575378, "learning_rate": 4.411414027005555e-05, "loss": 0.5784, "mean_token_accuracy": 0.8199689745903015, "num_tokens": 307412073.0, "step": 9175 }, { "epoch": 0.5479942693409742, "grad_norm": 0.5132443904876709, "learning_rate": 4.410664996182989e-05, "loss": 0.554, "mean_token_accuracy": 0.8261720180511475, "num_tokens": 307579753.0, "step": 9180 }, { "epoch": 0.548292741165234, "grad_norm": 0.5602428913116455, "learning_rate": 4.40991556087481e-05, "loss": 0.5751, "mean_token_accuracy": 0.8194560289382935, "num_tokens": 307747433.0, "step": 9185 }, { "epoch": 0.5485912129894938, "grad_norm": 0.5555048584938049, "learning_rate": 4.4091657212635586e-05, "loss": 0.5654, "mean_token_accuracy": 0.8209352254867553, "num_tokens": 307915113.0, "step": 9190 }, { "epoch": 0.5488896848137536, "grad_norm": 0.611984133720398, "learning_rate": 4.40841547753187e-05, "loss": 0.5451, "mean_token_accuracy": 0.827669084072113, "num_tokens": 308082793.0, "step": 9195 }, { "epoch": 0.5491881566380133, "grad_norm": 0.4953276515007019, "learning_rate": 4.4076648298624815e-05, "loss": 0.569, "mean_token_accuracy": 0.8204163312911987, "num_tokens": 308250473.0, "step": 9200 }, { "epoch": 0.5494866284622731, "grad_norm": 0.5716685056686401, "learning_rate": 4.4069137784382266e-05, "loss": 0.5322, "mean_token_accuracy": 0.8322796106338501, "num_tokens": 308418153.0, "step": 9205 }, { "epoch": 0.5497851002865329, "grad_norm": 0.5615285038948059, "learning_rate": 4.406162323442038e-05, "loss": 0.5546, "mean_token_accuracy": 0.8238518476486206, "num_tokens": 308585833.0, "step": 9210 }, { "epoch": 0.5500835721107927, "grad_norm": 0.5403125882148743, "learning_rate": 4.405410465056946e-05, "loss": 0.5466, "mean_token_accuracy": 0.8265835642814636, "num_tokens": 308753513.0, "step": 9215 }, { "epoch": 0.5503820439350525, "grad_norm": 0.5557153820991516, "learning_rate": 4.40465820346608e-05, "loss": 0.5573, "mean_token_accuracy": 0.8229631543159485, "num_tokens": 308921193.0, "step": 9220 }, { "epoch": 0.5506805157593123, "grad_norm": 0.5201708078384399, "learning_rate": 4.4039055388526674e-05, "loss": 0.5721, "mean_token_accuracy": 0.8206608533859253, "num_tokens": 309088873.0, "step": 9225 }, { "epoch": 0.5509789875835721, "grad_norm": 0.5418476462364197, "learning_rate": 4.403152471400033e-05, "loss": 0.5611, "mean_token_accuracy": 0.8228677034378051, "num_tokens": 309256553.0, "step": 9230 }, { "epoch": 0.5512774594078319, "grad_norm": 0.5212776064872742, "learning_rate": 4.4023990012916e-05, "loss": 0.5779, "mean_token_accuracy": 0.8170285105705262, "num_tokens": 309424233.0, "step": 9235 }, { "epoch": 0.5515759312320917, "grad_norm": 0.5821331739425659, "learning_rate": 4.401645128710892e-05, "loss": 0.5783, "mean_token_accuracy": 0.8175652980804443, "num_tokens": 309591913.0, "step": 9240 }, { "epoch": 0.5518744030563515, "grad_norm": 0.5345621109008789, "learning_rate": 4.400890853841525e-05, "loss": 0.5586, "mean_token_accuracy": 0.8244960069656372, "num_tokens": 309759593.0, "step": 9245 }, { "epoch": 0.5521728748806113, "grad_norm": 0.557401716709137, "learning_rate": 4.40013617686722e-05, "loss": 0.5821, "mean_token_accuracy": 0.8174639105796814, "num_tokens": 309927273.0, "step": 9250 }, { "epoch": 0.5524713467048711, "grad_norm": 0.5907803177833557, "learning_rate": 4.399381097971791e-05, "loss": 0.6042, "mean_token_accuracy": 0.8101932406425476, "num_tokens": 310094953.0, "step": 9255 }, { "epoch": 0.5527698185291309, "grad_norm": 0.5467956066131592, "learning_rate": 4.39862561733915e-05, "loss": 0.5613, "mean_token_accuracy": 0.8219969034194946, "num_tokens": 310262633.0, "step": 9260 }, { "epoch": 0.5530682903533907, "grad_norm": 0.49250057339668274, "learning_rate": 4.397869735153312e-05, "loss": 0.5403, "mean_token_accuracy": 0.8290886282920837, "num_tokens": 310430313.0, "step": 9265 }, { "epoch": 0.5533667621776505, "grad_norm": 0.47722843289375305, "learning_rate": 4.3971134515983824e-05, "loss": 0.5538, "mean_token_accuracy": 0.8251759529113769, "num_tokens": 310597993.0, "step": 9270 }, { "epoch": 0.5536652340019103, "grad_norm": 0.5117371082305908, "learning_rate": 4.39635676685857e-05, "loss": 0.5841, "mean_token_accuracy": 0.8160324454307556, "num_tokens": 310765673.0, "step": 9275 }, { "epoch": 0.55396370582617, "grad_norm": 0.5404113531112671, "learning_rate": 4.395599681118178e-05, "loss": 0.5415, "mean_token_accuracy": 0.8286114692687988, "num_tokens": 310933353.0, "step": 9280 }, { "epoch": 0.5542621776504298, "grad_norm": 0.4832151532173157, "learning_rate": 4.3948421945616095e-05, "loss": 0.5158, "mean_token_accuracy": 0.835840392112732, "num_tokens": 311101033.0, "step": 9285 }, { "epoch": 0.5545606494746896, "grad_norm": 0.5472707748413086, "learning_rate": 4.394084307373365e-05, "loss": 0.5756, "mean_token_accuracy": 0.8209352135658264, "num_tokens": 311268713.0, "step": 9290 }, { "epoch": 0.5548591212989494, "grad_norm": 1.157281517982483, "learning_rate": 4.393326019738041e-05, "loss": 0.569, "mean_token_accuracy": 0.8219909429550171, "num_tokens": 311436393.0, "step": 9295 }, { "epoch": 0.5551575931232091, "grad_norm": 0.5492601990699768, "learning_rate": 4.392567331840332e-05, "loss": 0.5624, "mean_token_accuracy": 0.823696768283844, "num_tokens": 311604073.0, "step": 9300 }, { "epoch": 0.5554560649474689, "grad_norm": 0.5792446136474609, "learning_rate": 4.391808243865031e-05, "loss": 0.5631, "mean_token_accuracy": 0.8220625042915344, "num_tokens": 311771753.0, "step": 9305 }, { "epoch": 0.5557545367717287, "grad_norm": 0.493204265832901, "learning_rate": 4.391048755997028e-05, "loss": 0.5343, "mean_token_accuracy": 0.8309495449066162, "num_tokens": 311939433.0, "step": 9310 }, { "epoch": 0.5560530085959885, "grad_norm": 0.5801738500595093, "learning_rate": 4.390288868421311e-05, "loss": 0.542, "mean_token_accuracy": 0.8293331742286683, "num_tokens": 312107113.0, "step": 9315 }, { "epoch": 0.5563514804202483, "grad_norm": 0.6259301900863647, "learning_rate": 4.389528581322963e-05, "loss": 0.6023, "mean_token_accuracy": 0.8118394255638123, "num_tokens": 312274793.0, "step": 9320 }, { "epoch": 0.5566499522445081, "grad_norm": 0.5844714045524597, "learning_rate": 4.388767894887167e-05, "loss": 0.6053, "mean_token_accuracy": 0.8097101211547851, "num_tokens": 312442473.0, "step": 9325 }, { "epoch": 0.5569484240687679, "grad_norm": 0.5438195466995239, "learning_rate": 4.388006809299202e-05, "loss": 0.5275, "mean_token_accuracy": 0.8327627301216125, "num_tokens": 312610153.0, "step": 9330 }, { "epoch": 0.5572468958930277, "grad_norm": 0.5652937889099121, "learning_rate": 4.387245324744444e-05, "loss": 0.5673, "mean_token_accuracy": 0.8214183568954467, "num_tokens": 312777833.0, "step": 9335 }, { "epoch": 0.5575453677172875, "grad_norm": 0.5891305804252625, "learning_rate": 4.386483441408368e-05, "loss": 0.5322, "mean_token_accuracy": 0.8325539708137513, "num_tokens": 312945513.0, "step": 9340 }, { "epoch": 0.5578438395415473, "grad_norm": 0.5884708762168884, "learning_rate": 4.3857211594765435e-05, "loss": 0.6028, "mean_token_accuracy": 0.8108314394950866, "num_tokens": 313113193.0, "step": 9345 }, { "epoch": 0.558142311365807, "grad_norm": 0.5042849183082581, "learning_rate": 4.384958479134638e-05, "loss": 0.5473, "mean_token_accuracy": 0.8275975227355957, "num_tokens": 313280873.0, "step": 9350 }, { "epoch": 0.5584407831900668, "grad_norm": 0.4993573725223541, "learning_rate": 4.384195400568418e-05, "loss": 0.5372, "mean_token_accuracy": 0.8292258024215698, "num_tokens": 313448553.0, "step": 9355 }, { "epoch": 0.5587392550143266, "grad_norm": 0.5659687519073486, "learning_rate": 4.383431923963743e-05, "loss": 0.648, "mean_token_accuracy": 0.7970893502235412, "num_tokens": 313616233.0, "step": 9360 }, { "epoch": 0.5590377268385864, "grad_norm": 0.5630433559417725, "learning_rate": 4.382668049506575e-05, "loss": 0.5855, "mean_token_accuracy": 0.8163246989250184, "num_tokens": 313783913.0, "step": 9365 }, { "epoch": 0.5593361986628462, "grad_norm": 0.6412773728370667, "learning_rate": 4.3819037773829666e-05, "loss": 0.5516, "mean_token_accuracy": 0.8258380055427551, "num_tokens": 313951593.0, "step": 9370 }, { "epoch": 0.559634670487106, "grad_norm": 0.5726149678230286, "learning_rate": 4.381139107779073e-05, "loss": 0.5945, "mean_token_accuracy": 0.8127221822738647, "num_tokens": 314119273.0, "step": 9375 }, { "epoch": 0.5599331423113658, "grad_norm": 0.5358670353889465, "learning_rate": 4.380374040881141e-05, "loss": 0.5302, "mean_token_accuracy": 0.8315400123596192, "num_tokens": 314286953.0, "step": 9380 }, { "epoch": 0.5602316141356256, "grad_norm": 0.4920586943626404, "learning_rate": 4.379608576875519e-05, "loss": 0.5699, "mean_token_accuracy": 0.8202254414558411, "num_tokens": 314454633.0, "step": 9385 }, { "epoch": 0.5605300859598854, "grad_norm": 0.5719877481460571, "learning_rate": 4.378842715948648e-05, "loss": 0.538, "mean_token_accuracy": 0.8295538663864136, "num_tokens": 314622313.0, "step": 9390 }, { "epoch": 0.5608285577841452, "grad_norm": 0.5797923803329468, "learning_rate": 4.37807645828707e-05, "loss": 0.5424, "mean_token_accuracy": 0.8275376915931701, "num_tokens": 314788279.0, "step": 9395 }, { "epoch": 0.561127029608405, "grad_norm": 0.581619918346405, "learning_rate": 4.3773098040774185e-05, "loss": 0.512, "mean_token_accuracy": 0.8385482430458069, "num_tokens": 314955959.0, "step": 9400 }, { "epoch": 0.5614255014326648, "grad_norm": 0.5046367645263672, "learning_rate": 4.3765427535064275e-05, "loss": 0.5298, "mean_token_accuracy": 0.8310807704925537, "num_tokens": 315123639.0, "step": 9405 }, { "epoch": 0.5617239732569246, "grad_norm": 0.6082244515419006, "learning_rate": 4.375775306760927e-05, "loss": 0.5583, "mean_token_accuracy": 0.8237564086914062, "num_tokens": 315291319.0, "step": 9410 }, { "epoch": 0.5620224450811844, "grad_norm": 0.5142558813095093, "learning_rate": 4.3750074640278414e-05, "loss": 0.5056, "mean_token_accuracy": 0.8406477332115173, "num_tokens": 315458999.0, "step": 9415 }, { "epoch": 0.5623209169054442, "grad_norm": 0.5200819373130798, "learning_rate": 4.374239225494195e-05, "loss": 0.5209, "mean_token_accuracy": 0.8346773266792298, "num_tokens": 315626679.0, "step": 9420 }, { "epoch": 0.562619388729704, "grad_norm": 0.466137170791626, "learning_rate": 4.373470591347104e-05, "loss": 0.5232, "mean_token_accuracy": 0.8334725141525269, "num_tokens": 315794359.0, "step": 9425 }, { "epoch": 0.5629178605539638, "grad_norm": 0.6146972179412842, "learning_rate": 4.3727015617737846e-05, "loss": 0.5821, "mean_token_accuracy": 0.8179410696029663, "num_tokens": 315962039.0, "step": 9430 }, { "epoch": 0.5632163323782235, "grad_norm": 0.5308025479316711, "learning_rate": 4.371932136961549e-05, "loss": 0.5152, "mean_token_accuracy": 0.8369080185890198, "num_tokens": 316129719.0, "step": 9435 }, { "epoch": 0.5635148042024832, "grad_norm": 0.7136004567146301, "learning_rate": 4.371162317097804e-05, "loss": 0.5591, "mean_token_accuracy": 0.8240427017211914, "num_tokens": 316297399.0, "step": 9440 }, { "epoch": 0.563813276026743, "grad_norm": 0.6727559566497803, "learning_rate": 4.3703921023700535e-05, "loss": 0.5811, "mean_token_accuracy": 0.8192592024803161, "num_tokens": 316465079.0, "step": 9445 }, { "epoch": 0.5641117478510028, "grad_norm": 0.4987635016441345, "learning_rate": 4.369621492965896e-05, "loss": 0.5335, "mean_token_accuracy": 0.8316951036453247, "num_tokens": 316632759.0, "step": 9450 }, { "epoch": 0.5644102196752626, "grad_norm": 0.5753521919250488, "learning_rate": 4.368850489073031e-05, "loss": 0.5824, "mean_token_accuracy": 0.8179172158241272, "num_tokens": 316800439.0, "step": 9455 }, { "epoch": 0.5647086914995224, "grad_norm": 0.5425297021865845, "learning_rate": 4.368079090879248e-05, "loss": 0.5921, "mean_token_accuracy": 0.8140403270721436, "num_tokens": 316968119.0, "step": 9460 }, { "epoch": 0.5650071633237822, "grad_norm": 0.5226919651031494, "learning_rate": 4.3673072985724364e-05, "loss": 0.5893, "mean_token_accuracy": 0.8144518494606018, "num_tokens": 317135799.0, "step": 9465 }, { "epoch": 0.565305635148042, "grad_norm": 0.5343566536903381, "learning_rate": 4.366535112340579e-05, "loss": 0.5539, "mean_token_accuracy": 0.8235655546188354, "num_tokens": 317303479.0, "step": 9470 }, { "epoch": 0.5656041069723018, "grad_norm": 0.6416198015213013, "learning_rate": 4.365762532371758e-05, "loss": 0.5733, "mean_token_accuracy": 0.8221039295196533, "num_tokens": 317462968.0, "step": 9475 }, { "epoch": 0.5659025787965616, "grad_norm": 0.5042870044708252, "learning_rate": 4.364989558854149e-05, "loss": 0.5394, "mean_token_accuracy": 0.8275856018066406, "num_tokens": 317630648.0, "step": 9480 }, { "epoch": 0.5662010506208214, "grad_norm": 0.5087573528289795, "learning_rate": 4.364216191976024e-05, "loss": 0.5347, "mean_token_accuracy": 0.8297566652297974, "num_tokens": 317798328.0, "step": 9485 }, { "epoch": 0.5664995224450812, "grad_norm": 0.5444625616073608, "learning_rate": 4.3634424319257495e-05, "loss": 0.5488, "mean_token_accuracy": 0.8262435913085937, "num_tokens": 317966008.0, "step": 9490 }, { "epoch": 0.566797994269341, "grad_norm": 0.5101728439331055, "learning_rate": 4.36266827889179e-05, "loss": 0.5384, "mean_token_accuracy": 0.8292794942855835, "num_tokens": 318133688.0, "step": 9495 }, { "epoch": 0.5670964660936008, "grad_norm": 0.5687821507453918, "learning_rate": 4.361893733062705e-05, "loss": 0.5564, "mean_token_accuracy": 0.8238279700279236, "num_tokens": 318301368.0, "step": 9500 }, { "epoch": 0.5673949379178606, "grad_norm": 0.5257329940795898, "learning_rate": 4.3611187946271495e-05, "loss": 0.5781, "mean_token_accuracy": 0.8187760829925537, "num_tokens": 318469048.0, "step": 9505 }, { "epoch": 0.5676934097421203, "grad_norm": 0.6087618470191956, "learning_rate": 4.360343463773873e-05, "loss": 0.5528, "mean_token_accuracy": 0.8239353418350219, "num_tokens": 318636728.0, "step": 9510 }, { "epoch": 0.5679918815663801, "grad_norm": 0.5090599060058594, "learning_rate": 4.359567740691723e-05, "loss": 0.5327, "mean_token_accuracy": 0.8291423320770264, "num_tokens": 318804408.0, "step": 9515 }, { "epoch": 0.5682903533906399, "grad_norm": 0.6896741986274719, "learning_rate": 4.35879162556964e-05, "loss": 0.574, "mean_token_accuracy": 0.8179172039031982, "num_tokens": 318972088.0, "step": 9520 }, { "epoch": 0.5685888252148997, "grad_norm": 0.5273687839508057, "learning_rate": 4.3580151185966625e-05, "loss": 0.5561, "mean_token_accuracy": 0.8233985424041748, "num_tokens": 319139768.0, "step": 9525 }, { "epoch": 0.5688872970391595, "grad_norm": 0.5132014751434326, "learning_rate": 4.357238219961922e-05, "loss": 0.54, "mean_token_accuracy": 0.8274961352348328, "num_tokens": 319307448.0, "step": 9530 }, { "epoch": 0.5691857688634193, "grad_norm": 0.5733283758163452, "learning_rate": 4.3564609298546463e-05, "loss": 0.5573, "mean_token_accuracy": 0.8231062889099121, "num_tokens": 319475128.0, "step": 9535 }, { "epoch": 0.5694842406876791, "grad_norm": 0.5522651672363281, "learning_rate": 4.3556832484641594e-05, "loss": 0.5233, "mean_token_accuracy": 0.8317189574241638, "num_tokens": 319642808.0, "step": 9540 }, { "epoch": 0.5697827125119389, "grad_norm": 0.5674959421157837, "learning_rate": 4.3549051759798805e-05, "loss": 0.5505, "mean_token_accuracy": 0.8244125008583069, "num_tokens": 319810488.0, "step": 9545 }, { "epoch": 0.5700811843361987, "grad_norm": 0.5781137347221375, "learning_rate": 4.354126712591322e-05, "loss": 0.551, "mean_token_accuracy": 0.8258797526359558, "num_tokens": 319978168.0, "step": 9550 }, { "epoch": 0.5703796561604585, "grad_norm": 0.6012808680534363, "learning_rate": 4.3533478584880945e-05, "loss": 0.5749, "mean_token_accuracy": 0.8197065591812134, "num_tokens": 320145848.0, "step": 9555 }, { "epoch": 0.5706781279847183, "grad_norm": 0.5958951711654663, "learning_rate": 4.352568613859901e-05, "loss": 0.558, "mean_token_accuracy": 0.8245258212089539, "num_tokens": 320313528.0, "step": 9560 }, { "epoch": 0.5709765998089781, "grad_norm": 0.564767062664032, "learning_rate": 4.3517889788965426e-05, "loss": 0.555, "mean_token_accuracy": 0.8242634057998657, "num_tokens": 320481208.0, "step": 9565 }, { "epoch": 0.5712750716332379, "grad_norm": 0.5786231160163879, "learning_rate": 4.3510089537879113e-05, "loss": 0.5642, "mean_token_accuracy": 0.8220267176628113, "num_tokens": 320648888.0, "step": 9570 }, { "epoch": 0.5715735434574976, "grad_norm": 0.6010572910308838, "learning_rate": 4.350228538723999e-05, "loss": 0.6235, "mean_token_accuracy": 0.8046940326690674, "num_tokens": 320816568.0, "step": 9575 }, { "epoch": 0.5718720152817574, "grad_norm": 0.6927845478057861, "learning_rate": 4.349447733894889e-05, "loss": 0.5852, "mean_token_accuracy": 0.8170523524284363, "num_tokens": 320984248.0, "step": 9580 }, { "epoch": 0.5721704871060171, "grad_norm": 0.517074465751648, "learning_rate": 4.348666539490761e-05, "loss": 0.5453, "mean_token_accuracy": 0.8278778553009033, "num_tokens": 321151928.0, "step": 9585 }, { "epoch": 0.5724689589302769, "grad_norm": 0.5346642732620239, "learning_rate": 4.34788495570189e-05, "loss": 0.5664, "mean_token_accuracy": 0.8198437333106995, "num_tokens": 321319608.0, "step": 9590 }, { "epoch": 0.5727674307545367, "grad_norm": 0.5233079791069031, "learning_rate": 4.347102982718644e-05, "loss": 0.5575, "mean_token_accuracy": 0.8219968914985657, "num_tokens": 321487288.0, "step": 9595 }, { "epoch": 0.5730659025787965, "grad_norm": 0.5271203517913818, "learning_rate": 4.346320620731487e-05, "loss": 0.5725, "mean_token_accuracy": 0.8205177068710328, "num_tokens": 321654968.0, "step": 9600 }, { "epoch": 0.5733643744030563, "grad_norm": 0.5608517527580261, "learning_rate": 4.345537869930978e-05, "loss": 0.5646, "mean_token_accuracy": 0.8202791333198547, "num_tokens": 321822648.0, "step": 9605 }, { "epoch": 0.5736628462273161, "grad_norm": 0.49211350083351135, "learning_rate": 4.3447547305077716e-05, "loss": 0.5173, "mean_token_accuracy": 0.8362519383430481, "num_tokens": 321990328.0, "step": 9610 }, { "epoch": 0.5739613180515759, "grad_norm": 0.5640000700950623, "learning_rate": 4.3439712026526134e-05, "loss": 0.5237, "mean_token_accuracy": 0.8342419147491456, "num_tokens": 322158008.0, "step": 9615 }, { "epoch": 0.5742597898758357, "grad_norm": 0.5405912399291992, "learning_rate": 4.3431872865563484e-05, "loss": 0.5623, "mean_token_accuracy": 0.821728503704071, "num_tokens": 322325688.0, "step": 9620 }, { "epoch": 0.5745582617000955, "grad_norm": 0.5111109018325806, "learning_rate": 4.3424029824099126e-05, "loss": 0.5811, "mean_token_accuracy": 0.8195753216743469, "num_tokens": 322493368.0, "step": 9625 }, { "epoch": 0.5748567335243553, "grad_norm": 0.48165133595466614, "learning_rate": 4.341618290404338e-05, "loss": 0.5331, "mean_token_accuracy": 0.82977454662323, "num_tokens": 322661048.0, "step": 9630 }, { "epoch": 0.5751552053486151, "grad_norm": 0.6553030610084534, "learning_rate": 4.340833210730751e-05, "loss": 0.5654, "mean_token_accuracy": 0.8210724115371704, "num_tokens": 322828728.0, "step": 9635 }, { "epoch": 0.5754536771728749, "grad_norm": 0.5732831358909607, "learning_rate": 4.340047743580372e-05, "loss": 0.6124, "mean_token_accuracy": 0.8088810801506042, "num_tokens": 322996408.0, "step": 9640 }, { "epoch": 0.5757521489971347, "grad_norm": 0.5754977464675903, "learning_rate": 4.339261889144516e-05, "loss": 0.5964, "mean_token_accuracy": 0.8130919814109803, "num_tokens": 323164088.0, "step": 9645 }, { "epoch": 0.5760506208213945, "grad_norm": 0.5848853588104248, "learning_rate": 4.338475647614593e-05, "loss": 0.5491, "mean_token_accuracy": 0.8263748049736023, "num_tokens": 323331768.0, "step": 9650 }, { "epoch": 0.5763490926456543, "grad_norm": 0.6020246744155884, "learning_rate": 4.3376890191821054e-05, "loss": 0.5781, "mean_token_accuracy": 0.8168436169624329, "num_tokens": 323499448.0, "step": 9655 }, { "epoch": 0.576647564469914, "grad_norm": 0.6079437732696533, "learning_rate": 4.336902004038653e-05, "loss": 0.545, "mean_token_accuracy": 0.8260706305503845, "num_tokens": 323667128.0, "step": 9660 }, { "epoch": 0.5769460362941738, "grad_norm": 0.5589531064033508, "learning_rate": 4.336114602375926e-05, "loss": 0.5385, "mean_token_accuracy": 0.8265060067176819, "num_tokens": 323834808.0, "step": 9665 }, { "epoch": 0.5772445081184336, "grad_norm": 0.5987696647644043, "learning_rate": 4.3353268143857104e-05, "loss": 0.5751, "mean_token_accuracy": 0.8176845788955689, "num_tokens": 324002488.0, "step": 9670 }, { "epoch": 0.5775429799426934, "grad_norm": 0.5677342414855957, "learning_rate": 4.334538640259888e-05, "loss": 0.5416, "mean_token_accuracy": 0.8283788681030273, "num_tokens": 324170168.0, "step": 9675 }, { "epoch": 0.5778414517669532, "grad_norm": 0.526494562625885, "learning_rate": 4.3337500801904336e-05, "loss": 0.6107, "mean_token_accuracy": 0.8081235766410828, "num_tokens": 324337848.0, "step": 9680 }, { "epoch": 0.578139923591213, "grad_norm": 0.5406463742256165, "learning_rate": 4.332961134369413e-05, "loss": 0.569, "mean_token_accuracy": 0.8202552795410156, "num_tokens": 324505528.0, "step": 9685 }, { "epoch": 0.5784383954154728, "grad_norm": 0.6007771492004395, "learning_rate": 4.332171802988991e-05, "loss": 0.5669, "mean_token_accuracy": 0.8211857318878174, "num_tokens": 324673208.0, "step": 9690 }, { "epoch": 0.5787368672397326, "grad_norm": 0.6053233742713928, "learning_rate": 4.3313820862414225e-05, "loss": 0.5868, "mean_token_accuracy": 0.8157401919364929, "num_tokens": 324840888.0, "step": 9695 }, { "epoch": 0.5790353390639924, "grad_norm": 0.6275655627250671, "learning_rate": 4.3305919843190576e-05, "loss": 0.582, "mean_token_accuracy": 0.8180722951889038, "num_tokens": 325008568.0, "step": 9700 }, { "epoch": 0.5793338108882522, "grad_norm": 0.573482096195221, "learning_rate": 4.32980149741434e-05, "loss": 0.5506, "mean_token_accuracy": 0.8243409276008606, "num_tokens": 325176248.0, "step": 9705 }, { "epoch": 0.579632282712512, "grad_norm": 0.5884882211685181, "learning_rate": 4.329010625719809e-05, "loss": 0.5934, "mean_token_accuracy": 0.8156030178070068, "num_tokens": 325343928.0, "step": 9710 }, { "epoch": 0.5799307545367717, "grad_norm": 0.5336214303970337, "learning_rate": 4.328219369428094e-05, "loss": 0.5549, "mean_token_accuracy": 0.8237444877624511, "num_tokens": 325511608.0, "step": 9715 }, { "epoch": 0.5802292263610315, "grad_norm": 0.5747628808021545, "learning_rate": 4.327427728731922e-05, "loss": 0.5199, "mean_token_accuracy": 0.8334784746170044, "num_tokens": 325679288.0, "step": 9720 }, { "epoch": 0.5805276981852913, "grad_norm": 0.5531827807426453, "learning_rate": 4.3266357038241104e-05, "loss": 0.5738, "mean_token_accuracy": 0.8195393085479736, "num_tokens": 325846270.0, "step": 9725 }, { "epoch": 0.5808261700095511, "grad_norm": 0.5761283040046692, "learning_rate": 4.3258432948975714e-05, "loss": 0.5596, "mean_token_accuracy": 0.8210008263587951, "num_tokens": 326013950.0, "step": 9730 }, { "epoch": 0.5811246418338109, "grad_norm": 0.551312267780304, "learning_rate": 4.325050502145311e-05, "loss": 0.577, "mean_token_accuracy": 0.8192174553871154, "num_tokens": 326181630.0, "step": 9735 }, { "epoch": 0.5814231136580706, "grad_norm": 0.6245538592338562, "learning_rate": 4.32425732576043e-05, "loss": 0.5884, "mean_token_accuracy": 0.8152570724487305, "num_tokens": 326349310.0, "step": 9740 }, { "epoch": 0.5817215854823304, "grad_norm": 0.5730269551277161, "learning_rate": 4.3234637659361186e-05, "loss": 0.5705, "mean_token_accuracy": 0.8192592144012452, "num_tokens": 326516990.0, "step": 9745 }, { "epoch": 0.5820200573065902, "grad_norm": 0.5351265072822571, "learning_rate": 4.322669822865665e-05, "loss": 0.5542, "mean_token_accuracy": 0.8247405409812927, "num_tokens": 326684670.0, "step": 9750 }, { "epoch": 0.58231852913085, "grad_norm": 0.48421576619148254, "learning_rate": 4.3218754967424465e-05, "loss": 0.5154, "mean_token_accuracy": 0.8354944586753845, "num_tokens": 326852350.0, "step": 9755 }, { "epoch": 0.5826170009551098, "grad_norm": 0.5339226126670837, "learning_rate": 4.321080787759938e-05, "loss": 0.5855, "mean_token_accuracy": 0.8142848610877991, "num_tokens": 327020030.0, "step": 9760 }, { "epoch": 0.5829154727793696, "grad_norm": 0.46684908866882324, "learning_rate": 4.3202856961117056e-05, "loss": 0.5203, "mean_token_accuracy": 0.8335202217102051, "num_tokens": 327187710.0, "step": 9765 }, { "epoch": 0.5832139446036294, "grad_norm": 0.534252405166626, "learning_rate": 4.3194902219914074e-05, "loss": 0.55, "mean_token_accuracy": 0.82533700466156, "num_tokens": 327355390.0, "step": 9770 }, { "epoch": 0.5835124164278892, "grad_norm": 0.5405399203300476, "learning_rate": 4.318694365592795e-05, "loss": 0.5434, "mean_token_accuracy": 0.826273787021637, "num_tokens": 327518373.0, "step": 9775 }, { "epoch": 0.583810888252149, "grad_norm": 0.5407677292823792, "learning_rate": 4.317898127109717e-05, "loss": 0.5783, "mean_token_accuracy": 0.8166765928268432, "num_tokens": 327686053.0, "step": 9780 }, { "epoch": 0.5841093600764088, "grad_norm": 0.5031781196594238, "learning_rate": 4.3171015067361086e-05, "loss": 0.5592, "mean_token_accuracy": 0.8218119859695434, "num_tokens": 327853733.0, "step": 9785 }, { "epoch": 0.5844078319006686, "grad_norm": 0.5705164670944214, "learning_rate": 4.316304504666004e-05, "loss": 0.575, "mean_token_accuracy": 0.8173446297645569, "num_tokens": 328021413.0, "step": 9790 }, { "epoch": 0.5847063037249284, "grad_norm": 0.5071836113929749, "learning_rate": 4.315507121093525e-05, "loss": 0.5842, "mean_token_accuracy": 0.8162352323532105, "num_tokens": 328189093.0, "step": 9795 }, { "epoch": 0.5850047755491882, "grad_norm": 0.49702900648117065, "learning_rate": 4.314709356212893e-05, "loss": 0.5378, "mean_token_accuracy": 0.8300668120384216, "num_tokens": 328356773.0, "step": 9800 }, { "epoch": 0.585303247373448, "grad_norm": 0.5357098579406738, "learning_rate": 4.313911210218414e-05, "loss": 0.567, "mean_token_accuracy": 0.8208934783935546, "num_tokens": 328524453.0, "step": 9805 }, { "epoch": 0.5856017191977078, "grad_norm": 0.6276393532752991, "learning_rate": 4.313112683304494e-05, "loss": 0.5671, "mean_token_accuracy": 0.8203610420227051, "num_tokens": 328689970.0, "step": 9810 }, { "epoch": 0.5859001910219676, "grad_norm": 0.5248390436172485, "learning_rate": 4.312313775665628e-05, "loss": 0.5563, "mean_token_accuracy": 0.8238339424133301, "num_tokens": 328857650.0, "step": 9815 }, { "epoch": 0.5861986628462273, "grad_norm": 0.5058759450912476, "learning_rate": 4.311514487496405e-05, "loss": 0.5188, "mean_token_accuracy": 0.8338363528251648, "num_tokens": 329025330.0, "step": 9820 }, { "epoch": 0.5864971346704871, "grad_norm": 0.5292270183563232, "learning_rate": 4.310714818991507e-05, "loss": 0.5457, "mean_token_accuracy": 0.8273947358131408, "num_tokens": 329193010.0, "step": 9825 }, { "epoch": 0.5867956064947469, "grad_norm": 0.5985789895057678, "learning_rate": 4.3099147703457065e-05, "loss": 0.5591, "mean_token_accuracy": 0.8240128874778747, "num_tokens": 329360690.0, "step": 9830 }, { "epoch": 0.5870940783190067, "grad_norm": 0.4757097363471985, "learning_rate": 4.3091143417538714e-05, "loss": 0.5555, "mean_token_accuracy": 0.8237862229347229, "num_tokens": 329528370.0, "step": 9835 }, { "epoch": 0.5873925501432665, "grad_norm": 0.550864577293396, "learning_rate": 4.3083135334109595e-05, "loss": 0.5511, "mean_token_accuracy": 0.8260825514793396, "num_tokens": 329696050.0, "step": 9840 }, { "epoch": 0.5876910219675263, "grad_norm": 0.5893641710281372, "learning_rate": 4.307512345512025e-05, "loss": 0.5191, "mean_token_accuracy": 0.8350016593933105, "num_tokens": 329855467.0, "step": 9845 }, { "epoch": 0.5879894937917861, "grad_norm": 0.5553637146949768, "learning_rate": 4.306710778252209e-05, "loss": 0.5652, "mean_token_accuracy": 0.8219551563262939, "num_tokens": 330023147.0, "step": 9850 }, { "epoch": 0.5882879656160458, "grad_norm": 0.5286957025527954, "learning_rate": 4.30590883182675e-05, "loss": 0.5326, "mean_token_accuracy": 0.8308302640914917, "num_tokens": 330190827.0, "step": 9855 }, { "epoch": 0.5885864374403056, "grad_norm": 0.5817766189575195, "learning_rate": 4.305106506430976e-05, "loss": 0.6034, "mean_token_accuracy": 0.8110819578170776, "num_tokens": 330358507.0, "step": 9860 }, { "epoch": 0.5888849092645654, "grad_norm": 0.518365204334259, "learning_rate": 4.3043038022603086e-05, "loss": 0.5446, "mean_token_accuracy": 0.8270607233047486, "num_tokens": 330526187.0, "step": 9865 }, { "epoch": 0.5891833810888252, "grad_norm": 0.5628482103347778, "learning_rate": 4.303500719510261e-05, "loss": 0.5875, "mean_token_accuracy": 0.8142371416091919, "num_tokens": 330693867.0, "step": 9870 }, { "epoch": 0.589481852913085, "grad_norm": 0.5530689358711243, "learning_rate": 4.3026972583764384e-05, "loss": 0.5415, "mean_token_accuracy": 0.827657163143158, "num_tokens": 330861547.0, "step": 9875 }, { "epoch": 0.5897803247373448, "grad_norm": 0.5350573062896729, "learning_rate": 4.3018934190545397e-05, "loss": 0.5369, "mean_token_accuracy": 0.8293570399284362, "num_tokens": 331029227.0, "step": 9880 }, { "epoch": 0.5900787965616046, "grad_norm": 0.4851717948913574, "learning_rate": 4.3010892017403535e-05, "loss": 0.5421, "mean_token_accuracy": 0.8271263360977172, "num_tokens": 331196907.0, "step": 9885 }, { "epoch": 0.5903772683858644, "grad_norm": 0.564995288848877, "learning_rate": 4.300284606629763e-05, "loss": 0.6306, "mean_token_accuracy": 0.8031850099563599, "num_tokens": 331364587.0, "step": 9890 }, { "epoch": 0.5906757402101241, "grad_norm": 0.563106119632721, "learning_rate": 4.299479633918741e-05, "loss": 0.6004, "mean_token_accuracy": 0.8085470557212829, "num_tokens": 331532267.0, "step": 9895 }, { "epoch": 0.5909742120343839, "grad_norm": 0.5216640830039978, "learning_rate": 4.298674283803354e-05, "loss": 0.5502, "mean_token_accuracy": 0.8246331930160522, "num_tokens": 331699947.0, "step": 9900 }, { "epoch": 0.5912726838586437, "grad_norm": 0.5671737194061279, "learning_rate": 4.2978685564797606e-05, "loss": 0.5878, "mean_token_accuracy": 0.815626859664917, "num_tokens": 331867627.0, "step": 9905 }, { "epoch": 0.5915711556829035, "grad_norm": 0.5278947949409485, "learning_rate": 4.297062452144209e-05, "loss": 0.5746, "mean_token_accuracy": 0.8176488161087037, "num_tokens": 332035307.0, "step": 9910 }, { "epoch": 0.5918696275071633, "grad_norm": 0.5447534918785095, "learning_rate": 4.296255970993042e-05, "loss": 0.5442, "mean_token_accuracy": 0.8262256979942322, "num_tokens": 332202987.0, "step": 9915 }, { "epoch": 0.5921680993314231, "grad_norm": 0.5424688458442688, "learning_rate": 4.295449113222693e-05, "loss": 0.5458, "mean_token_accuracy": 0.8272754192352295, "num_tokens": 332370667.0, "step": 9920 }, { "epoch": 0.5924665711556829, "grad_norm": 0.507956326007843, "learning_rate": 4.2946418790296866e-05, "loss": 0.511, "mean_token_accuracy": 0.8354944467544556, "num_tokens": 332538347.0, "step": 9925 }, { "epoch": 0.5927650429799427, "grad_norm": 0.5435249209403992, "learning_rate": 4.2938342686106394e-05, "loss": 0.5337, "mean_token_accuracy": 0.8304545044898987, "num_tokens": 332706027.0, "step": 9930 }, { "epoch": 0.5930635148042025, "grad_norm": 0.5713160634040833, "learning_rate": 4.293026282162261e-05, "loss": 0.5512, "mean_token_accuracy": 0.826142179965973, "num_tokens": 332873707.0, "step": 9935 }, { "epoch": 0.5933619866284623, "grad_norm": 0.578020453453064, "learning_rate": 4.29221791988135e-05, "loss": 0.5546, "mean_token_accuracy": 0.8232732892036438, "num_tokens": 333041387.0, "step": 9940 }, { "epoch": 0.5936604584527221, "grad_norm": 0.5681842565536499, "learning_rate": 4.291409181964799e-05, "loss": 0.5804, "mean_token_accuracy": 0.8164141774177551, "num_tokens": 333209067.0, "step": 9945 }, { "epoch": 0.5939589302769819, "grad_norm": 0.5614805817604065, "learning_rate": 4.290600068609589e-05, "loss": 0.5503, "mean_token_accuracy": 0.8254622459411621, "num_tokens": 333376747.0, "step": 9950 }, { "epoch": 0.5942574021012417, "grad_norm": 0.6109113693237305, "learning_rate": 4.289790580012797e-05, "loss": 0.6079, "mean_token_accuracy": 0.8090242147445679, "num_tokens": 333544427.0, "step": 9955 }, { "epoch": 0.5945558739255015, "grad_norm": 0.530504047870636, "learning_rate": 4.2889807163715886e-05, "loss": 0.555, "mean_token_accuracy": 0.8229869961738586, "num_tokens": 333712107.0, "step": 9960 }, { "epoch": 0.5948543457497613, "grad_norm": 0.5134127140045166, "learning_rate": 4.2881704778832196e-05, "loss": 0.6026, "mean_token_accuracy": 0.8102051734924316, "num_tokens": 333879787.0, "step": 9965 }, { "epoch": 0.595152817574021, "grad_norm": 0.6299505233764648, "learning_rate": 4.2873598647450406e-05, "loss": 0.6735, "mean_token_accuracy": 0.7928307175636291, "num_tokens": 334047467.0, "step": 9970 }, { "epoch": 0.5954512893982808, "grad_norm": 0.5211063027381897, "learning_rate": 4.286548877154489e-05, "loss": 0.5522, "mean_token_accuracy": 0.8243946075439453, "num_tokens": 334215147.0, "step": 9975 }, { "epoch": 0.5957497612225406, "grad_norm": 0.5255116820335388, "learning_rate": 4.285737515309097e-05, "loss": 0.5315, "mean_token_accuracy": 0.8329178094863892, "num_tokens": 334382827.0, "step": 9980 }, { "epoch": 0.5960482330468004, "grad_norm": 0.5419749617576599, "learning_rate": 4.284925779406487e-05, "loss": 0.5967, "mean_token_accuracy": 0.8149648070335388, "num_tokens": 334550507.0, "step": 9985 }, { "epoch": 0.5963467048710601, "grad_norm": 0.602005660533905, "learning_rate": 4.284113669644372e-05, "loss": 0.6162, "mean_token_accuracy": 0.8094417214393616, "num_tokens": 334718187.0, "step": 9990 }, { "epoch": 0.5966451766953199, "grad_norm": 0.5772136449813843, "learning_rate": 4.2833011862205555e-05, "loss": 0.5873, "mean_token_accuracy": 0.8133782625198365, "num_tokens": 334885867.0, "step": 9995 }, { "epoch": 0.5969436485195797, "grad_norm": 0.6226353049278259, "learning_rate": 4.282488329332934e-05, "loss": 0.5509, "mean_token_accuracy": 0.8242872595787049, "num_tokens": 335053547.0, "step": 10000 }, { "epoch": 0.5972421203438395, "grad_norm": 0.5816977620124817, "learning_rate": 4.281675099179494e-05, "loss": 0.5349, "mean_token_accuracy": 0.8323511958122254, "num_tokens": 335221227.0, "step": 10005 }, { "epoch": 0.5975405921680993, "grad_norm": 0.5523040890693665, "learning_rate": 4.2808614959583115e-05, "loss": 0.599, "mean_token_accuracy": 0.8114696383476258, "num_tokens": 335388907.0, "step": 10010 }, { "epoch": 0.5978390639923591, "grad_norm": 0.5386897921562195, "learning_rate": 4.280047519867555e-05, "loss": 0.5731, "mean_token_accuracy": 0.8187999486923218, "num_tokens": 335556587.0, "step": 10015 }, { "epoch": 0.5981375358166189, "grad_norm": 0.5356436967849731, "learning_rate": 4.2792331711054843e-05, "loss": 0.54, "mean_token_accuracy": 0.8271024703979493, "num_tokens": 335724267.0, "step": 10020 }, { "epoch": 0.5984360076408787, "grad_norm": 0.5296046733856201, "learning_rate": 4.2784184498704484e-05, "loss": 0.5563, "mean_token_accuracy": 0.8252773523330689, "num_tokens": 335891947.0, "step": 10025 }, { "epoch": 0.5987344794651385, "grad_norm": 0.5193471312522888, "learning_rate": 4.277603356360887e-05, "loss": 0.5421, "mean_token_accuracy": 0.8285756945610047, "num_tokens": 336059627.0, "step": 10030 }, { "epoch": 0.5990329512893983, "grad_norm": 0.5423997640609741, "learning_rate": 4.2767878907753325e-05, "loss": 0.5492, "mean_token_accuracy": 0.8246033787727356, "num_tokens": 336227307.0, "step": 10035 }, { "epoch": 0.5993314231136581, "grad_norm": 0.536270022392273, "learning_rate": 4.275972053312405e-05, "loss": 0.6006, "mean_token_accuracy": 0.8123165965080261, "num_tokens": 336394987.0, "step": 10040 }, { "epoch": 0.5996298949379179, "grad_norm": 0.5258265137672424, "learning_rate": 4.2751558441708184e-05, "loss": 0.5342, "mean_token_accuracy": 0.8293331742286683, "num_tokens": 336562667.0, "step": 10045 }, { "epoch": 0.5999283667621776, "grad_norm": 0.5426004528999329, "learning_rate": 4.274339263549375e-05, "loss": 0.5603, "mean_token_accuracy": 0.8236192345619202, "num_tokens": 336730347.0, "step": 10050 }, { "epoch": 0.6002268385864374, "grad_norm": 0.509745180606842, "learning_rate": 4.2735223116469666e-05, "loss": 0.5711, "mean_token_accuracy": 0.8194977879524231, "num_tokens": 336898027.0, "step": 10055 }, { "epoch": 0.6005253104106972, "grad_norm": 0.5620666146278381, "learning_rate": 4.2727049886625795e-05, "loss": 0.5621, "mean_token_accuracy": 0.8229034900665283, "num_tokens": 337065707.0, "step": 10060 }, { "epoch": 0.600823782234957, "grad_norm": 0.5186175107955933, "learning_rate": 4.271887294795286e-05, "loss": 0.5713, "mean_token_accuracy": 0.8214362502098084, "num_tokens": 337233387.0, "step": 10065 }, { "epoch": 0.6011222540592168, "grad_norm": 0.5515191555023193, "learning_rate": 4.2710692302442515e-05, "loss": 0.5673, "mean_token_accuracy": 0.8205057978630066, "num_tokens": 337401067.0, "step": 10070 }, { "epoch": 0.6014207258834766, "grad_norm": 0.5776181817054749, "learning_rate": 4.270250795208729e-05, "loss": 0.5578, "mean_token_accuracy": 0.825462257862091, "num_tokens": 337568747.0, "step": 10075 }, { "epoch": 0.6017191977077364, "grad_norm": 0.5542211532592773, "learning_rate": 4.269431989888066e-05, "loss": 0.535, "mean_token_accuracy": 0.8300608396530151, "num_tokens": 337736427.0, "step": 10080 }, { "epoch": 0.6020176695319962, "grad_norm": 0.5703225135803223, "learning_rate": 4.268612814481696e-05, "loss": 0.5956, "mean_token_accuracy": 0.8141119003295898, "num_tokens": 337904107.0, "step": 10085 }, { "epoch": 0.602316141356256, "grad_norm": 0.5758171081542969, "learning_rate": 4.267793269189145e-05, "loss": 0.5867, "mean_token_accuracy": 0.8173765897750854, "num_tokens": 338067957.0, "step": 10090 }, { "epoch": 0.6026146131805158, "grad_norm": 0.5809513330459595, "learning_rate": 4.266973354210028e-05, "loss": 0.5476, "mean_token_accuracy": 0.8254025936126709, "num_tokens": 338235637.0, "step": 10095 }, { "epoch": 0.6029130850047756, "grad_norm": 0.6540355682373047, "learning_rate": 4.26615306974405e-05, "loss": 0.6, "mean_token_accuracy": 0.812793743610382, "num_tokens": 338403317.0, "step": 10100 }, { "epoch": 0.6032115568290354, "grad_norm": 0.5405868291854858, "learning_rate": 4.265332415991007e-05, "loss": 0.5713, "mean_token_accuracy": 0.8203745603561401, "num_tokens": 338570997.0, "step": 10105 }, { "epoch": 0.6035100286532952, "grad_norm": 0.5001184940338135, "learning_rate": 4.264511393150784e-05, "loss": 0.5794, "mean_token_accuracy": 0.8165871262550354, "num_tokens": 338738677.0, "step": 10110 }, { "epoch": 0.603808500477555, "grad_norm": 0.5327947735786438, "learning_rate": 4.263690001423357e-05, "loss": 0.5694, "mean_token_accuracy": 0.8189490675926209, "num_tokens": 338906357.0, "step": 10115 }, { "epoch": 0.6041069723018148, "grad_norm": 0.5442296862602234, "learning_rate": 4.26286824100879e-05, "loss": 0.6141, "mean_token_accuracy": 0.8060837388038635, "num_tokens": 339074037.0, "step": 10120 }, { "epoch": 0.6044054441260746, "grad_norm": 0.49894338846206665, "learning_rate": 4.262046112107239e-05, "loss": 0.5575, "mean_token_accuracy": 0.8232255816459656, "num_tokens": 339241717.0, "step": 10125 }, { "epoch": 0.6047039159503342, "grad_norm": 0.5164963603019714, "learning_rate": 4.261223614918949e-05, "loss": 0.538, "mean_token_accuracy": 0.8288500428199768, "num_tokens": 339409397.0, "step": 10130 }, { "epoch": 0.605002387774594, "grad_norm": 0.8744543194770813, "learning_rate": 4.260400749644252e-05, "loss": 0.5226, "mean_token_accuracy": 0.8351723790168762, "num_tokens": 339577077.0, "step": 10135 }, { "epoch": 0.6053008595988538, "grad_norm": 0.5206270813941956, "learning_rate": 4.2595775164835743e-05, "loss": 0.5927, "mean_token_accuracy": 0.8134796619415283, "num_tokens": 339744757.0, "step": 10140 }, { "epoch": 0.6055993314231136, "grad_norm": 0.5476736426353455, "learning_rate": 4.2587539156374295e-05, "loss": 0.5633, "mean_token_accuracy": 0.8216390252113343, "num_tokens": 339912437.0, "step": 10145 }, { "epoch": 0.6058978032473734, "grad_norm": 0.5553728938102722, "learning_rate": 4.257929947306419e-05, "loss": 0.5881, "mean_token_accuracy": 0.8156865119934082, "num_tokens": 340080117.0, "step": 10150 }, { "epoch": 0.6061962750716332, "grad_norm": 0.5113295316696167, "learning_rate": 4.2571056116912376e-05, "loss": 0.5476, "mean_token_accuracy": 0.8250268578529358, "num_tokens": 340247797.0, "step": 10155 }, { "epoch": 0.606494746895893, "grad_norm": 0.5264196395874023, "learning_rate": 4.2562809089926665e-05, "loss": 0.567, "mean_token_accuracy": 0.819861626625061, "num_tokens": 340415477.0, "step": 10160 }, { "epoch": 0.6067932187201528, "grad_norm": 0.5544784069061279, "learning_rate": 4.255455839411577e-05, "loss": 0.6149, "mean_token_accuracy": 0.8072709560394287, "num_tokens": 340578191.0, "step": 10165 }, { "epoch": 0.6070916905444126, "grad_norm": 0.6234328150749207, "learning_rate": 4.254630403148929e-05, "loss": 0.5562, "mean_token_accuracy": 0.8248061537742615, "num_tokens": 340745871.0, "step": 10170 }, { "epoch": 0.6073901623686724, "grad_norm": 0.5575888752937317, "learning_rate": 4.2538046004057766e-05, "loss": 0.5947, "mean_token_accuracy": 0.8110819578170776, "num_tokens": 340913551.0, "step": 10175 }, { "epoch": 0.6076886341929322, "grad_norm": 0.5839712619781494, "learning_rate": 4.252978431383255e-05, "loss": 0.529, "mean_token_accuracy": 0.8309077858924866, "num_tokens": 341081231.0, "step": 10180 }, { "epoch": 0.607987106017192, "grad_norm": 0.5024023056030273, "learning_rate": 4.2521518962825955e-05, "loss": 0.5259, "mean_token_accuracy": 0.8329063534736634, "num_tokens": 341247330.0, "step": 10185 }, { "epoch": 0.6082855778414518, "grad_norm": 0.5004905462265015, "learning_rate": 4.251324995305114e-05, "loss": 0.558, "mean_token_accuracy": 0.8245437264442443, "num_tokens": 341415010.0, "step": 10190 }, { "epoch": 0.6085840496657116, "grad_norm": 0.5296902060508728, "learning_rate": 4.250497728652219e-05, "loss": 0.5686, "mean_token_accuracy": 0.8214004635810852, "num_tokens": 341582690.0, "step": 10195 }, { "epoch": 0.6088825214899714, "grad_norm": 0.6006523370742798, "learning_rate": 4.249670096525406e-05, "loss": 0.5812, "mean_token_accuracy": 0.8172730445861817, "num_tokens": 341750370.0, "step": 10200 }, { "epoch": 0.6091809933142311, "grad_norm": 0.623622477054596, "learning_rate": 4.2488420991262607e-05, "loss": 0.6192, "mean_token_accuracy": 0.8064117908477784, "num_tokens": 341918050.0, "step": 10205 }, { "epoch": 0.6094794651384909, "grad_norm": 0.5206360220909119, "learning_rate": 4.248013736656457e-05, "loss": 0.5798, "mean_token_accuracy": 0.8177382826805115, "num_tokens": 342085730.0, "step": 10210 }, { "epoch": 0.6097779369627507, "grad_norm": 0.5445969700813293, "learning_rate": 4.247185009317757e-05, "loss": 0.5676, "mean_token_accuracy": 0.8214422106742859, "num_tokens": 342253410.0, "step": 10215 }, { "epoch": 0.6100764087870105, "grad_norm": 0.5614628195762634, "learning_rate": 4.246355917312013e-05, "loss": 0.6046, "mean_token_accuracy": 0.8115829586982727, "num_tokens": 342421090.0, "step": 10220 }, { "epoch": 0.6103748806112703, "grad_norm": 0.49875083565711975, "learning_rate": 4.245526460841165e-05, "loss": 0.5316, "mean_token_accuracy": 0.8295061349868774, "num_tokens": 342588770.0, "step": 10225 }, { "epoch": 0.6106733524355301, "grad_norm": 0.6407276391983032, "learning_rate": 4.2446966401072454e-05, "loss": 0.5682, "mean_token_accuracy": 0.8199511051177979, "num_tokens": 342756450.0, "step": 10230 }, { "epoch": 0.6109718242597899, "grad_norm": 0.5701903700828552, "learning_rate": 4.2438664553123694e-05, "loss": 0.5697, "mean_token_accuracy": 0.8202970385551452, "num_tokens": 342924130.0, "step": 10235 }, { "epoch": 0.6112702960840497, "grad_norm": 0.5019197463989258, "learning_rate": 4.2430359066587436e-05, "loss": 0.6115, "mean_token_accuracy": 0.8076762557029724, "num_tokens": 343091810.0, "step": 10240 }, { "epoch": 0.6115687679083095, "grad_norm": 0.4857781231403351, "learning_rate": 4.242204994348665e-05, "loss": 0.506, "mean_token_accuracy": 0.8365740180015564, "num_tokens": 343259490.0, "step": 10245 }, { "epoch": 0.6118672397325693, "grad_norm": 0.5388410091400146, "learning_rate": 4.241373718584517e-05, "loss": 0.5497, "mean_token_accuracy": 0.8249015808105469, "num_tokens": 343427170.0, "step": 10250 }, { "epoch": 0.6121657115568291, "grad_norm": 0.5654188394546509, "learning_rate": 4.240542079568772e-05, "loss": 0.5509, "mean_token_accuracy": 0.8270547389984131, "num_tokens": 343594850.0, "step": 10255 }, { "epoch": 0.6124641833810889, "grad_norm": 0.5905010104179382, "learning_rate": 4.2397100775039925e-05, "loss": 0.6175, "mean_token_accuracy": 0.8060479521751404, "num_tokens": 343762530.0, "step": 10260 }, { "epoch": 0.6127626552053486, "grad_norm": 0.5358613729476929, "learning_rate": 4.238877712592826e-05, "loss": 0.5519, "mean_token_accuracy": 0.8270965099334717, "num_tokens": 343930210.0, "step": 10265 }, { "epoch": 0.6130611270296084, "grad_norm": 0.5065945386886597, "learning_rate": 4.238044985038011e-05, "loss": 0.518, "mean_token_accuracy": 0.83399738073349, "num_tokens": 344097890.0, "step": 10270 }, { "epoch": 0.6133595988538681, "grad_norm": 0.526066780090332, "learning_rate": 4.237211895042373e-05, "loss": 0.5203, "mean_token_accuracy": 0.833472490310669, "num_tokens": 344265570.0, "step": 10275 }, { "epoch": 0.6136580706781279, "grad_norm": 0.5284095406532288, "learning_rate": 4.2363784428088275e-05, "loss": 0.5978, "mean_token_accuracy": 0.8105749726295471, "num_tokens": 344433250.0, "step": 10280 }, { "epoch": 0.6139565425023877, "grad_norm": 0.5001017451286316, "learning_rate": 4.235544628540378e-05, "loss": 0.5417, "mean_token_accuracy": 0.8277048707008362, "num_tokens": 344600930.0, "step": 10285 }, { "epoch": 0.6142550143266475, "grad_norm": 0.6117889285087585, "learning_rate": 4.234710452440113e-05, "loss": 0.5615, "mean_token_accuracy": 0.8219014644622803, "num_tokens": 344768610.0, "step": 10290 }, { "epoch": 0.6145534861509073, "grad_norm": 0.5988826155662537, "learning_rate": 4.233875914711213e-05, "loss": 0.5669, "mean_token_accuracy": 0.8210246920585632, "num_tokens": 344936290.0, "step": 10295 }, { "epoch": 0.6148519579751671, "grad_norm": 0.5304162502288818, "learning_rate": 4.233041015556945e-05, "loss": 0.5361, "mean_token_accuracy": 0.8294286012649537, "num_tokens": 345103970.0, "step": 10300 }, { "epoch": 0.6151504297994269, "grad_norm": 0.5469595789909363, "learning_rate": 4.2322057551806635e-05, "loss": 0.5844, "mean_token_accuracy": 0.8154956579208374, "num_tokens": 345271650.0, "step": 10305 }, { "epoch": 0.6154489016236867, "grad_norm": 0.5119556784629822, "learning_rate": 4.231370133785812e-05, "loss": 0.5549, "mean_token_accuracy": 0.8262555241584778, "num_tokens": 345439330.0, "step": 10310 }, { "epoch": 0.6157473734479465, "grad_norm": 0.5373770594596863, "learning_rate": 4.2305341515759224e-05, "loss": 0.5816, "mean_token_accuracy": 0.8163604855537414, "num_tokens": 345607010.0, "step": 10315 }, { "epoch": 0.6160458452722063, "grad_norm": 0.5381370782852173, "learning_rate": 4.229697808754612e-05, "loss": 0.522, "mean_token_accuracy": 0.8334307551383973, "num_tokens": 345774690.0, "step": 10320 }, { "epoch": 0.6163443170964661, "grad_norm": 0.5179805159568787, "learning_rate": 4.228861105525587e-05, "loss": 0.5626, "mean_token_accuracy": 0.8212215304374695, "num_tokens": 345942370.0, "step": 10325 }, { "epoch": 0.6166427889207259, "grad_norm": 0.5689610838890076, "learning_rate": 4.228024042092645e-05, "loss": 0.5428, "mean_token_accuracy": 0.8277108550071717, "num_tokens": 346110050.0, "step": 10330 }, { "epoch": 0.6169412607449857, "grad_norm": 0.5430788993835449, "learning_rate": 4.227186618659665e-05, "loss": 0.5552, "mean_token_accuracy": 0.8233269691467285, "num_tokens": 346277730.0, "step": 10335 }, { "epoch": 0.6172397325692455, "grad_norm": 0.56238853931427, "learning_rate": 4.226348835430619e-05, "loss": 0.5572, "mean_token_accuracy": 0.8235536217689514, "num_tokens": 346445410.0, "step": 10340 }, { "epoch": 0.6175382043935053, "grad_norm": 0.5491074323654175, "learning_rate": 4.225510692609564e-05, "loss": 0.5899, "mean_token_accuracy": 0.814523434638977, "num_tokens": 346613090.0, "step": 10345 }, { "epoch": 0.6178366762177651, "grad_norm": 0.5272854566574097, "learning_rate": 4.224672190400645e-05, "loss": 0.5534, "mean_token_accuracy": 0.8263449907302857, "num_tokens": 346780770.0, "step": 10350 }, { "epoch": 0.6181351480420249, "grad_norm": 0.4945986270904541, "learning_rate": 4.2238333290080936e-05, "loss": 0.569, "mean_token_accuracy": 0.82071453332901, "num_tokens": 346948450.0, "step": 10355 }, { "epoch": 0.6184336198662846, "grad_norm": 0.5312017202377319, "learning_rate": 4.222994108636232e-05, "loss": 0.5825, "mean_token_accuracy": 0.8145174860954285, "num_tokens": 347116130.0, "step": 10360 }, { "epoch": 0.6187320916905444, "grad_norm": 0.5052494406700134, "learning_rate": 4.222154529489467e-05, "loss": 0.554, "mean_token_accuracy": 0.8242872357368469, "num_tokens": 347283810.0, "step": 10365 }, { "epoch": 0.6190305635148042, "grad_norm": 0.5967806577682495, "learning_rate": 4.221314591772293e-05, "loss": 0.5797, "mean_token_accuracy": 0.818370521068573, "num_tokens": 347451490.0, "step": 10370 }, { "epoch": 0.619329035339064, "grad_norm": 0.5441157817840576, "learning_rate": 4.2204742956892925e-05, "loss": 0.5403, "mean_token_accuracy": 0.8269712567329407, "num_tokens": 347619170.0, "step": 10375 }, { "epoch": 0.6196275071633238, "grad_norm": 0.61036616563797, "learning_rate": 4.219633641445136e-05, "loss": 0.5407, "mean_token_accuracy": 0.8299654006958008, "num_tokens": 347786850.0, "step": 10380 }, { "epoch": 0.6199259789875836, "grad_norm": 0.5069438815116882, "learning_rate": 4.2187926292445803e-05, "loss": 0.5658, "mean_token_accuracy": 0.8200286269187927, "num_tokens": 347954530.0, "step": 10385 }, { "epoch": 0.6202244508118434, "grad_norm": 0.5739896297454834, "learning_rate": 4.217951259292468e-05, "loss": 0.5697, "mean_token_accuracy": 0.8203745722770691, "num_tokens": 348122210.0, "step": 10390 }, { "epoch": 0.6205229226361032, "grad_norm": 0.5449885725975037, "learning_rate": 4.217109531793731e-05, "loss": 0.5705, "mean_token_accuracy": 0.818972933292389, "num_tokens": 348289890.0, "step": 10395 }, { "epoch": 0.620821394460363, "grad_norm": 0.46956634521484375, "learning_rate": 4.2162674469533894e-05, "loss": 0.559, "mean_token_accuracy": 0.8203805446624756, "num_tokens": 348457570.0, "step": 10400 }, { "epoch": 0.6211198662846227, "grad_norm": 0.550395131111145, "learning_rate": 4.2154250049765456e-05, "loss": 0.553, "mean_token_accuracy": 0.8255338191986084, "num_tokens": 348625250.0, "step": 10405 }, { "epoch": 0.6214183381088825, "grad_norm": 0.5533021688461304, "learning_rate": 4.214582206068394e-05, "loss": 0.5911, "mean_token_accuracy": 0.8165095925331116, "num_tokens": 348792930.0, "step": 10410 }, { "epoch": 0.6217168099331423, "grad_norm": 0.5339484810829163, "learning_rate": 4.213739050434212e-05, "loss": 0.5581, "mean_token_accuracy": 0.8240128755569458, "num_tokens": 348960610.0, "step": 10415 }, { "epoch": 0.6220152817574021, "grad_norm": 0.5239465832710266, "learning_rate": 4.212895538279367e-05, "loss": 0.5512, "mean_token_accuracy": 0.8253310322761536, "num_tokens": 349128290.0, "step": 10420 }, { "epoch": 0.6223137535816619, "grad_norm": 0.5749861598014832, "learning_rate": 4.212051669809312e-05, "loss": 0.5316, "mean_token_accuracy": 0.8337230205535888, "num_tokens": 349295970.0, "step": 10425 }, { "epoch": 0.6226122254059216, "grad_norm": 0.5575063824653625, "learning_rate": 4.211207445229586e-05, "loss": 0.5411, "mean_token_accuracy": 0.8276750564575195, "num_tokens": 349463650.0, "step": 10430 }, { "epoch": 0.6229106972301814, "grad_norm": 0.5194706320762634, "learning_rate": 4.210362864745817e-05, "loss": 0.543, "mean_token_accuracy": 0.8282297611236572, "num_tokens": 349631330.0, "step": 10435 }, { "epoch": 0.6232091690544412, "grad_norm": 0.5498590469360352, "learning_rate": 4.209517928563717e-05, "loss": 0.5916, "mean_token_accuracy": 0.8153167247772217, "num_tokens": 349799010.0, "step": 10440 }, { "epoch": 0.623507640878701, "grad_norm": 0.5149429440498352, "learning_rate": 4.208672636889085e-05, "loss": 0.5607, "mean_token_accuracy": 0.8220565438270568, "num_tokens": 349966690.0, "step": 10445 }, { "epoch": 0.6238061127029608, "grad_norm": 0.644813060760498, "learning_rate": 4.207826989927808e-05, "loss": 0.5805, "mean_token_accuracy": 0.8166110038757324, "num_tokens": 350134370.0, "step": 10450 }, { "epoch": 0.6241045845272206, "grad_norm": 0.5750823616981506, "learning_rate": 4.2069809878858604e-05, "loss": 0.5757, "mean_token_accuracy": 0.8192413210868835, "num_tokens": 350302050.0, "step": 10455 }, { "epoch": 0.6244030563514804, "grad_norm": 0.48424893617630005, "learning_rate": 4.2061346309692995e-05, "loss": 0.554, "mean_token_accuracy": 0.8266253113746643, "num_tokens": 350469730.0, "step": 10460 }, { "epoch": 0.6247015281757402, "grad_norm": 0.5818796753883362, "learning_rate": 4.205287919384272e-05, "loss": 0.5738, "mean_token_accuracy": 0.8195037603378296, "num_tokens": 350637410.0, "step": 10465 }, { "epoch": 0.625, "grad_norm": 0.571145236492157, "learning_rate": 4.20444085333701e-05, "loss": 0.5706, "mean_token_accuracy": 0.8211141586303711, "num_tokens": 350805090.0, "step": 10470 }, { "epoch": 0.6252984718242598, "grad_norm": 0.5542051196098328, "learning_rate": 4.203593433033833e-05, "loss": 0.5609, "mean_token_accuracy": 0.8236907958984375, "num_tokens": 350972770.0, "step": 10475 }, { "epoch": 0.6255969436485196, "grad_norm": 0.4784558415412903, "learning_rate": 4.202745658681145e-05, "loss": 0.5789, "mean_token_accuracy": 0.8159131407737732, "num_tokens": 351140450.0, "step": 10480 }, { "epoch": 0.6258954154727794, "grad_norm": 0.5490285754203796, "learning_rate": 4.201897530485437e-05, "loss": 0.5333, "mean_token_accuracy": 0.8304783463478088, "num_tokens": 351308130.0, "step": 10485 }, { "epoch": 0.6261938872970392, "grad_norm": 0.5519647598266602, "learning_rate": 4.201049048653287e-05, "loss": 0.5832, "mean_token_accuracy": 0.8183048963546753, "num_tokens": 351475810.0, "step": 10490 }, { "epoch": 0.626492359121299, "grad_norm": 0.5074359178543091, "learning_rate": 4.200200213391358e-05, "loss": 0.5563, "mean_token_accuracy": 0.823583447933197, "num_tokens": 351643490.0, "step": 10495 }, { "epoch": 0.6267908309455588, "grad_norm": 0.4795893430709839, "learning_rate": 4.1993510249063985e-05, "loss": 0.5408, "mean_token_accuracy": 0.8303053736686706, "num_tokens": 351811170.0, "step": 10500 }, { "epoch": 0.6270893027698186, "grad_norm": 0.5287749767303467, "learning_rate": 4.1985014834052456e-05, "loss": 0.588, "mean_token_accuracy": 0.8165752053260803, "num_tokens": 351978850.0, "step": 10505 }, { "epoch": 0.6273877745940784, "grad_norm": 0.5833807587623596, "learning_rate": 4.197651589094819e-05, "loss": 0.5891, "mean_token_accuracy": 0.8154717803001403, "num_tokens": 352146530.0, "step": 10510 }, { "epoch": 0.6276862464183381, "grad_norm": 0.50035560131073, "learning_rate": 4.196801342182129e-05, "loss": 0.5339, "mean_token_accuracy": 0.8296791076660156, "num_tokens": 352314210.0, "step": 10515 }, { "epoch": 0.6279847182425979, "grad_norm": 0.5603047013282776, "learning_rate": 4.195950742874267e-05, "loss": 0.5628, "mean_token_accuracy": 0.8212453722953796, "num_tokens": 352481890.0, "step": 10520 }, { "epoch": 0.6282831900668577, "grad_norm": 0.5571617484092712, "learning_rate": 4.195099791378411e-05, "loss": 0.5333, "mean_token_accuracy": 0.8330788493156434, "num_tokens": 352649570.0, "step": 10525 }, { "epoch": 0.6285816618911175, "grad_norm": 0.5076308846473694, "learning_rate": 4.194248487901828e-05, "loss": 0.5507, "mean_token_accuracy": 0.8260773777961731, "num_tokens": 352810928.0, "step": 10530 }, { "epoch": 0.6288801337153773, "grad_norm": 0.6115785241127014, "learning_rate": 4.193396832651868e-05, "loss": 0.5769, "mean_token_accuracy": 0.8177800297737121, "num_tokens": 352978608.0, "step": 10535 }, { "epoch": 0.629178605539637, "grad_norm": 0.5612708926200867, "learning_rate": 4.192544825835968e-05, "loss": 0.5804, "mean_token_accuracy": 0.8171000719070435, "num_tokens": 353146288.0, "step": 10540 }, { "epoch": 0.6294770773638968, "grad_norm": 0.5526641607284546, "learning_rate": 4.191692467661649e-05, "loss": 0.5779, "mean_token_accuracy": 0.8166587114334106, "num_tokens": 353313968.0, "step": 10545 }, { "epoch": 0.6297755491881566, "grad_norm": 0.5095444321632385, "learning_rate": 4.190839758336518e-05, "loss": 0.5053, "mean_token_accuracy": 0.8365501642227173, "num_tokens": 353481648.0, "step": 10550 }, { "epoch": 0.6300740210124164, "grad_norm": 0.529994547367096, "learning_rate": 4.1899866980682693e-05, "loss": 0.5899, "mean_token_accuracy": 0.8148037672042847, "num_tokens": 353649328.0, "step": 10555 }, { "epoch": 0.6303724928366762, "grad_norm": 0.5498631000518799, "learning_rate": 4.189133287064682e-05, "loss": 0.5621, "mean_token_accuracy": 0.8221400499343872, "num_tokens": 353817008.0, "step": 10560 }, { "epoch": 0.630670964660936, "grad_norm": 0.5215694308280945, "learning_rate": 4.188279525533619e-05, "loss": 0.5442, "mean_token_accuracy": 0.8271501779556274, "num_tokens": 353984688.0, "step": 10565 }, { "epoch": 0.6309694364851958, "grad_norm": 0.5187599658966064, "learning_rate": 4.18742541368303e-05, "loss": 0.521, "mean_token_accuracy": 0.8338959813117981, "num_tokens": 354152368.0, "step": 10570 }, { "epoch": 0.6312679083094556, "grad_norm": 0.5384235978126526, "learning_rate": 4.186570951720949e-05, "loss": 0.5305, "mean_token_accuracy": 0.8320947170257569, "num_tokens": 354320048.0, "step": 10575 }, { "epoch": 0.6315663801337154, "grad_norm": 0.4864861071109772, "learning_rate": 4.185716139855497e-05, "loss": 0.5608, "mean_token_accuracy": 0.8226231694221496, "num_tokens": 354487728.0, "step": 10580 }, { "epoch": 0.6318648519579751, "grad_norm": 0.49161964654922485, "learning_rate": 4.184860978294879e-05, "loss": 0.52, "mean_token_accuracy": 0.8338065147399902, "num_tokens": 354655408.0, "step": 10585 }, { "epoch": 0.6321633237822349, "grad_norm": 0.5725266933441162, "learning_rate": 4.184005467247386e-05, "loss": 0.5645, "mean_token_accuracy": 0.8218418240547181, "num_tokens": 354823088.0, "step": 10590 }, { "epoch": 0.6324617956064947, "grad_norm": 0.5377210378646851, "learning_rate": 4.183149606921392e-05, "loss": 0.6134, "mean_token_accuracy": 0.8093761205673218, "num_tokens": 354990768.0, "step": 10595 }, { "epoch": 0.6327602674307545, "grad_norm": 0.534552276134491, "learning_rate": 4.182293397525358e-05, "loss": 0.4982, "mean_token_accuracy": 0.8397769212722779, "num_tokens": 355158448.0, "step": 10600 }, { "epoch": 0.6330587392550143, "grad_norm": 0.4438951909542084, "learning_rate": 4.1814368392678304e-05, "loss": 0.5354, "mean_token_accuracy": 0.8311761736869812, "num_tokens": 355326128.0, "step": 10605 }, { "epoch": 0.6333572110792741, "grad_norm": 0.5355693697929382, "learning_rate": 4.180579932357439e-05, "loss": 0.5512, "mean_token_accuracy": 0.8268519639968872, "num_tokens": 355493808.0, "step": 10610 }, { "epoch": 0.6336556829035339, "grad_norm": 0.5087995529174805, "learning_rate": 4.1797226770029e-05, "loss": 0.5515, "mean_token_accuracy": 0.8248658061027527, "num_tokens": 355661488.0, "step": 10615 }, { "epoch": 0.6339541547277937, "grad_norm": 0.6685255169868469, "learning_rate": 4.178865073413012e-05, "loss": 0.6056, "mean_token_accuracy": 0.8093522667884827, "num_tokens": 355829168.0, "step": 10620 }, { "epoch": 0.6342526265520535, "grad_norm": 0.5254787802696228, "learning_rate": 4.178007121796663e-05, "loss": 0.5419, "mean_token_accuracy": 0.8273947238922119, "num_tokens": 355996848.0, "step": 10625 }, { "epoch": 0.6345510983763133, "grad_norm": 0.5092074275016785, "learning_rate": 4.1771488223628216e-05, "loss": 0.563, "mean_token_accuracy": 0.8221400499343872, "num_tokens": 356164528.0, "step": 10630 }, { "epoch": 0.6348495702005731, "grad_norm": 0.6070569753646851, "learning_rate": 4.1762901753205416e-05, "loss": 0.5889, "mean_token_accuracy": 0.8147679328918457, "num_tokens": 356327278.0, "step": 10635 }, { "epoch": 0.6351480420248329, "grad_norm": 0.5248810052871704, "learning_rate": 4.175431180878964e-05, "loss": 0.602, "mean_token_accuracy": 0.8114099979400635, "num_tokens": 356494958.0, "step": 10640 }, { "epoch": 0.6354465138490927, "grad_norm": 0.5092383623123169, "learning_rate": 4.174571839247311e-05, "loss": 0.5323, "mean_token_accuracy": 0.8320529580116272, "num_tokens": 356662638.0, "step": 10645 }, { "epoch": 0.6357449856733525, "grad_norm": 0.5364642143249512, "learning_rate": 4.173712150634892e-05, "loss": 0.547, "mean_token_accuracy": 0.8257246851921082, "num_tokens": 356830318.0, "step": 10650 }, { "epoch": 0.6360434574976123, "grad_norm": 0.5233772993087769, "learning_rate": 4.1728521152511e-05, "loss": 0.5457, "mean_token_accuracy": 0.826690936088562, "num_tokens": 356997998.0, "step": 10655 }, { "epoch": 0.6363419293218721, "grad_norm": 0.5759458541870117, "learning_rate": 4.171991733305413e-05, "loss": 0.5373, "mean_token_accuracy": 0.8284742951393127, "num_tokens": 357165678.0, "step": 10660 }, { "epoch": 0.6366404011461319, "grad_norm": 0.5486500859260559, "learning_rate": 4.171131005007392e-05, "loss": 0.5473, "mean_token_accuracy": 0.8266968846321106, "num_tokens": 357333358.0, "step": 10665 }, { "epoch": 0.6369388729703916, "grad_norm": 0.5107816457748413, "learning_rate": 4.1702699305666835e-05, "loss": 0.5545, "mean_token_accuracy": 0.8247822880744934, "num_tokens": 357501038.0, "step": 10670 }, { "epoch": 0.6372373447946514, "grad_norm": 0.5290648341178894, "learning_rate": 4.1694085101930187e-05, "loss": 0.5761, "mean_token_accuracy": 0.8195693612098693, "num_tokens": 357668718.0, "step": 10675 }, { "epoch": 0.6375358166189111, "grad_norm": 0.5462967753410339, "learning_rate": 4.168546744096211e-05, "loss": 0.5499, "mean_token_accuracy": 0.8227126240730286, "num_tokens": 357836398.0, "step": 10680 }, { "epoch": 0.6378342884431709, "grad_norm": 0.47816041111946106, "learning_rate": 4.167684632486162e-05, "loss": 0.5247, "mean_token_accuracy": 0.8340212345123291, "num_tokens": 358004078.0, "step": 10685 }, { "epoch": 0.6381327602674307, "grad_norm": 0.6931360960006714, "learning_rate": 4.166822175572852e-05, "loss": 0.5992, "mean_token_accuracy": 0.8122330904006958, "num_tokens": 358171758.0, "step": 10690 }, { "epoch": 0.6384312320916905, "grad_norm": 0.5212947130203247, "learning_rate": 4.16595937356635e-05, "loss": 0.503, "mean_token_accuracy": 0.8401049733161926, "num_tokens": 358339438.0, "step": 10695 }, { "epoch": 0.6387297039159503, "grad_norm": 0.49074992537498474, "learning_rate": 4.1650962266768074e-05, "loss": 0.5247, "mean_token_accuracy": 0.8335381150245667, "num_tokens": 358507118.0, "step": 10700 }, { "epoch": 0.6390281757402101, "grad_norm": 0.5606208443641663, "learning_rate": 4.164232735114458e-05, "loss": 0.5611, "mean_token_accuracy": 0.8220565319061279, "num_tokens": 358674798.0, "step": 10705 }, { "epoch": 0.6393266475644699, "grad_norm": 0.5256921648979187, "learning_rate": 4.1633688990896236e-05, "loss": 0.5841, "mean_token_accuracy": 0.8178038954734802, "num_tokens": 358842478.0, "step": 10710 }, { "epoch": 0.6396251193887297, "grad_norm": 0.4700057804584503, "learning_rate": 4.162504718812705e-05, "loss": 0.5507, "mean_token_accuracy": 0.8245437145233154, "num_tokens": 359010158.0, "step": 10715 }, { "epoch": 0.6399235912129895, "grad_norm": 0.5408313870429993, "learning_rate": 4.16164019449419e-05, "loss": 0.5965, "mean_token_accuracy": 0.8115710258483887, "num_tokens": 359177838.0, "step": 10720 }, { "epoch": 0.6402220630372493, "grad_norm": 0.5639321208000183, "learning_rate": 4.1607753263446515e-05, "loss": 0.5579, "mean_token_accuracy": 0.823833954334259, "num_tokens": 359345518.0, "step": 10725 }, { "epoch": 0.6405205348615091, "grad_norm": 0.5468165278434753, "learning_rate": 4.159910114574741e-05, "loss": 0.5111, "mean_token_accuracy": 0.8363891243934631, "num_tokens": 359513198.0, "step": 10730 }, { "epoch": 0.6408190066857689, "grad_norm": 0.5450208783149719, "learning_rate": 4.1590445593951975e-05, "loss": 0.5808, "mean_token_accuracy": 0.8147978067398072, "num_tokens": 359680878.0, "step": 10735 }, { "epoch": 0.6411174785100286, "grad_norm": 0.46378010511398315, "learning_rate": 4.158178661016845e-05, "loss": 0.5623, "mean_token_accuracy": 0.8223726630210877, "num_tokens": 359848558.0, "step": 10740 }, { "epoch": 0.6414159503342884, "grad_norm": 0.48744845390319824, "learning_rate": 4.1573124196505866e-05, "loss": 0.4943, "mean_token_accuracy": 0.8421329021453857, "num_tokens": 360016238.0, "step": 10745 }, { "epoch": 0.6417144221585482, "grad_norm": 0.5765150189399719, "learning_rate": 4.156445835507413e-05, "loss": 0.5382, "mean_token_accuracy": 0.8284921884536743, "num_tokens": 360183918.0, "step": 10750 }, { "epoch": 0.642012893982808, "grad_norm": 0.5338862538337708, "learning_rate": 4.155578908798396e-05, "loss": 0.5858, "mean_token_accuracy": 0.8138196349143982, "num_tokens": 360351598.0, "step": 10755 }, { "epoch": 0.6423113658070678, "grad_norm": 0.4562791585922241, "learning_rate": 4.154711639734692e-05, "loss": 0.5297, "mean_token_accuracy": 0.8298878788948059, "num_tokens": 360519278.0, "step": 10760 }, { "epoch": 0.6426098376313276, "grad_norm": 0.5389201641082764, "learning_rate": 4.1538440285275396e-05, "loss": 0.548, "mean_token_accuracy": 0.8253071784973145, "num_tokens": 360686958.0, "step": 10765 }, { "epoch": 0.6429083094555874, "grad_norm": 0.5104871392250061, "learning_rate": 4.152976075388263e-05, "loss": 0.5234, "mean_token_accuracy": 0.8333770751953125, "num_tokens": 360854638.0, "step": 10770 }, { "epoch": 0.6432067812798472, "grad_norm": 0.5955930948257446, "learning_rate": 4.1521077805282654e-05, "loss": 0.6227, "mean_token_accuracy": 0.8061076045036316, "num_tokens": 361022318.0, "step": 10775 }, { "epoch": 0.643505253104107, "grad_norm": 0.4999445378780365, "learning_rate": 4.1512391441590396e-05, "loss": 0.5529, "mean_token_accuracy": 0.8257962584495544, "num_tokens": 361189998.0, "step": 10780 }, { "epoch": 0.6438037249283668, "grad_norm": 0.4865644872188568, "learning_rate": 4.150370166492156e-05, "loss": 0.5199, "mean_token_accuracy": 0.8348980069160461, "num_tokens": 361357678.0, "step": 10785 }, { "epoch": 0.6441021967526266, "grad_norm": 0.5511912703514099, "learning_rate": 4.14950084773927e-05, "loss": 0.5518, "mean_token_accuracy": 0.8262495517730712, "num_tokens": 361525358.0, "step": 10790 }, { "epoch": 0.6444006685768864, "grad_norm": 0.5373526215553284, "learning_rate": 4.148631188112121e-05, "loss": 0.5492, "mean_token_accuracy": 0.8248956203460693, "num_tokens": 361693038.0, "step": 10795 }, { "epoch": 0.6446991404011462, "grad_norm": 0.6302651166915894, "learning_rate": 4.147761187822531e-05, "loss": 0.5821, "mean_token_accuracy": 0.8165573358535767, "num_tokens": 361860718.0, "step": 10800 }, { "epoch": 0.644997612225406, "grad_norm": 0.4753219187259674, "learning_rate": 4.146890847082403e-05, "loss": 0.5445, "mean_token_accuracy": 0.827669095993042, "num_tokens": 362028398.0, "step": 10805 }, { "epoch": 0.6452960840496658, "grad_norm": 0.591619074344635, "learning_rate": 4.1460201661037254e-05, "loss": 0.5795, "mean_token_accuracy": 0.8173506021499634, "num_tokens": 362196078.0, "step": 10810 }, { "epoch": 0.6455945558739254, "grad_norm": 0.5333344340324402, "learning_rate": 4.1451491450985694e-05, "loss": 0.55, "mean_token_accuracy": 0.827555775642395, "num_tokens": 362363758.0, "step": 10815 }, { "epoch": 0.6458930276981852, "grad_norm": 0.5531866550445557, "learning_rate": 4.1442777842790884e-05, "loss": 0.5108, "mean_token_accuracy": 0.836365258693695, "num_tokens": 362531438.0, "step": 10820 }, { "epoch": 0.646191499522445, "grad_norm": 0.5012946128845215, "learning_rate": 4.143406083857516e-05, "loss": 0.5928, "mean_token_accuracy": 0.8163425922393799, "num_tokens": 362699118.0, "step": 10825 }, { "epoch": 0.6464899713467048, "grad_norm": 0.5352234840393066, "learning_rate": 4.142534044046173e-05, "loss": 0.5593, "mean_token_accuracy": 0.8233329296112061, "num_tokens": 362866798.0, "step": 10830 }, { "epoch": 0.6467884431709646, "grad_norm": 0.563372790813446, "learning_rate": 4.14166166505746e-05, "loss": 0.5755, "mean_token_accuracy": 0.8181259751319885, "num_tokens": 363034478.0, "step": 10835 }, { "epoch": 0.6470869149952244, "grad_norm": 0.5415008664131165, "learning_rate": 4.1407889471038624e-05, "loss": 0.5324, "mean_token_accuracy": 0.8302338004112244, "num_tokens": 363202158.0, "step": 10840 }, { "epoch": 0.6473853868194842, "grad_norm": 0.48782598972320557, "learning_rate": 4.139915890397946e-05, "loss": 0.5191, "mean_token_accuracy": 0.8339914083480835, "num_tokens": 363369838.0, "step": 10845 }, { "epoch": 0.647683858643744, "grad_norm": 0.6319615840911865, "learning_rate": 4.1390424951523584e-05, "loss": 0.5812, "mean_token_accuracy": 0.8150304079055786, "num_tokens": 363537518.0, "step": 10850 }, { "epoch": 0.6479823304680038, "grad_norm": 0.5104392766952515, "learning_rate": 4.138168761579833e-05, "loss": 0.5683, "mean_token_accuracy": 0.8196170926094055, "num_tokens": 363705198.0, "step": 10855 }, { "epoch": 0.6482808022922636, "grad_norm": 0.5604403018951416, "learning_rate": 4.137294689893182e-05, "loss": 0.5432, "mean_token_accuracy": 0.827191948890686, "num_tokens": 363872878.0, "step": 10860 }, { "epoch": 0.6485792741165234, "grad_norm": 0.5327582955360413, "learning_rate": 4.136420280305304e-05, "loss": 0.5867, "mean_token_accuracy": 0.8157044053077698, "num_tokens": 364040558.0, "step": 10865 }, { "epoch": 0.6488777459407832, "grad_norm": 0.47843194007873535, "learning_rate": 4.135545533029176e-05, "loss": 0.5149, "mean_token_accuracy": 0.8352916717529297, "num_tokens": 364208238.0, "step": 10870 }, { "epoch": 0.649176217765043, "grad_norm": 0.5262420177459717, "learning_rate": 4.134670448277859e-05, "loss": 0.5578, "mean_token_accuracy": 0.8243289947509765, "num_tokens": 364375918.0, "step": 10875 }, { "epoch": 0.6494746895893028, "grad_norm": 0.4862832725048065, "learning_rate": 4.133795026264497e-05, "loss": 0.5439, "mean_token_accuracy": 0.8269652843475341, "num_tokens": 364543598.0, "step": 10880 }, { "epoch": 0.6497731614135626, "grad_norm": 0.5758223533630371, "learning_rate": 4.132919267202313e-05, "loss": 0.5645, "mean_token_accuracy": 0.8212990522384643, "num_tokens": 364711278.0, "step": 10885 }, { "epoch": 0.6500716332378224, "grad_norm": 0.5943029522895813, "learning_rate": 4.132043171304616e-05, "loss": 0.5612, "mean_token_accuracy": 0.8238474488258362, "num_tokens": 364877455.0, "step": 10890 }, { "epoch": 0.6503701050620821, "grad_norm": 0.5892188549041748, "learning_rate": 4.131166738784796e-05, "loss": 0.6368, "mean_token_accuracy": 0.8013360381126404, "num_tokens": 365045135.0, "step": 10895 }, { "epoch": 0.6506685768863419, "grad_norm": 0.5199084281921387, "learning_rate": 4.130289969856322e-05, "loss": 0.5624, "mean_token_accuracy": 0.8210306525230407, "num_tokens": 365212815.0, "step": 10900 }, { "epoch": 0.6509670487106017, "grad_norm": 0.534847617149353, "learning_rate": 4.12941286473275e-05, "loss": 0.5597, "mean_token_accuracy": 0.8198974013328553, "num_tokens": 365380495.0, "step": 10905 }, { "epoch": 0.6512655205348615, "grad_norm": 0.5123786926269531, "learning_rate": 4.1285354236277134e-05, "loss": 0.6039, "mean_token_accuracy": 0.8097638010978698, "num_tokens": 365548175.0, "step": 10910 }, { "epoch": 0.6515639923591213, "grad_norm": 0.5008794665336609, "learning_rate": 4.127657646754929e-05, "loss": 0.4995, "mean_token_accuracy": 0.8405940532684326, "num_tokens": 365715855.0, "step": 10915 }, { "epoch": 0.6518624641833811, "grad_norm": 0.524743914604187, "learning_rate": 4.126779534328196e-05, "loss": 0.5536, "mean_token_accuracy": 0.8236729025840759, "num_tokens": 365883535.0, "step": 10920 }, { "epoch": 0.6521609360076409, "grad_norm": 0.5517908930778503, "learning_rate": 4.125901086561396e-05, "loss": 0.5624, "mean_token_accuracy": 0.8214481711387634, "num_tokens": 366051215.0, "step": 10925 }, { "epoch": 0.6524594078319007, "grad_norm": 0.5227869749069214, "learning_rate": 4.125022303668489e-05, "loss": 0.5565, "mean_token_accuracy": 0.8237325549125671, "num_tokens": 366218895.0, "step": 10930 }, { "epoch": 0.6527578796561605, "grad_norm": 0.5112661123275757, "learning_rate": 4.1241431858635204e-05, "loss": 0.5888, "mean_token_accuracy": 0.8149707794189454, "num_tokens": 366386575.0, "step": 10935 }, { "epoch": 0.6530563514804203, "grad_norm": 0.5125031471252441, "learning_rate": 4.123263733360615e-05, "loss": 0.5368, "mean_token_accuracy": 0.8308600783348083, "num_tokens": 366554255.0, "step": 10940 }, { "epoch": 0.6533548233046801, "grad_norm": 0.5317060947418213, "learning_rate": 4.122383946373981e-05, "loss": 0.5573, "mean_token_accuracy": 0.8230526208877563, "num_tokens": 366721935.0, "step": 10945 }, { "epoch": 0.6536532951289399, "grad_norm": 0.5169796347618103, "learning_rate": 4.121503825117904e-05, "loss": 0.5212, "mean_token_accuracy": 0.8330848097801209, "num_tokens": 366889615.0, "step": 10950 }, { "epoch": 0.6539517669531996, "grad_norm": 0.6890859603881836, "learning_rate": 4.120623369806756e-05, "loss": 0.5837, "mean_token_accuracy": 0.816730284690857, "num_tokens": 367057295.0, "step": 10955 }, { "epoch": 0.6542502387774594, "grad_norm": 0.5225417017936707, "learning_rate": 4.1197425806549876e-05, "loss": 0.5546, "mean_token_accuracy": 0.8240904211997986, "num_tokens": 367224975.0, "step": 10960 }, { "epoch": 0.6545487106017192, "grad_norm": 0.5400025248527527, "learning_rate": 4.11886145787713e-05, "loss": 0.5688, "mean_token_accuracy": 0.8195097208023071, "num_tokens": 367392655.0, "step": 10965 }, { "epoch": 0.654847182425979, "grad_norm": 0.5433284640312195, "learning_rate": 4.117980001687799e-05, "loss": 0.5338, "mean_token_accuracy": 0.8300966262817383, "num_tokens": 367560335.0, "step": 10970 }, { "epoch": 0.6551456542502387, "grad_norm": 0.5311228632926941, "learning_rate": 4.117098212301689e-05, "loss": 0.5347, "mean_token_accuracy": 0.828533935546875, "num_tokens": 367728015.0, "step": 10975 }, { "epoch": 0.6554441260744985, "grad_norm": 0.5552988052368164, "learning_rate": 4.116216089933575e-05, "loss": 0.591, "mean_token_accuracy": 0.8147262215614319, "num_tokens": 367895695.0, "step": 10980 }, { "epoch": 0.6557425978987583, "grad_norm": 0.494136244058609, "learning_rate": 4.1153336347983154e-05, "loss": 0.5532, "mean_token_accuracy": 0.8228140354156495, "num_tokens": 368063375.0, "step": 10985 }, { "epoch": 0.6560410697230181, "grad_norm": 0.5637604594230652, "learning_rate": 4.114450847110848e-05, "loss": 0.5518, "mean_token_accuracy": 0.8252773404121398, "num_tokens": 368231055.0, "step": 10990 }, { "epoch": 0.6563395415472779, "grad_norm": 0.5620095729827881, "learning_rate": 4.11356772708619e-05, "loss": 0.5966, "mean_token_accuracy": 0.8131516218185425, "num_tokens": 368398735.0, "step": 10995 }, { "epoch": 0.6566380133715377, "grad_norm": 0.5555197596549988, "learning_rate": 4.112684274939445e-05, "loss": 0.5342, "mean_token_accuracy": 0.8292198538780212, "num_tokens": 368566415.0, "step": 11000 }, { "epoch": 0.6569364851957975, "grad_norm": 0.5233317613601685, "learning_rate": 4.111800490885793e-05, "loss": 0.5794, "mean_token_accuracy": 0.8216628909111023, "num_tokens": 368734095.0, "step": 11005 }, { "epoch": 0.6572349570200573, "grad_norm": 0.5886181592941284, "learning_rate": 4.110916375140494e-05, "loss": 0.553, "mean_token_accuracy": 0.8228378772735596, "num_tokens": 368901775.0, "step": 11010 }, { "epoch": 0.6575334288443171, "grad_norm": 0.6114769577980042, "learning_rate": 4.110031927918894e-05, "loss": 0.5488, "mean_token_accuracy": 0.8270488023757935, "num_tokens": 369069455.0, "step": 11015 }, { "epoch": 0.6578319006685769, "grad_norm": 0.5818997025489807, "learning_rate": 4.109147149436414e-05, "loss": 0.5491, "mean_token_accuracy": 0.8271859645843506, "num_tokens": 369237135.0, "step": 11020 }, { "epoch": 0.6581303724928367, "grad_norm": 0.4692915380001068, "learning_rate": 4.108262039908558e-05, "loss": 0.5042, "mean_token_accuracy": 0.8374388813972473, "num_tokens": 369404815.0, "step": 11025 }, { "epoch": 0.6584288443170965, "grad_norm": 0.61583012342453, "learning_rate": 4.107376599550912e-05, "loss": 0.5627, "mean_token_accuracy": 0.8220305800437927, "num_tokens": 369566416.0, "step": 11030 }, { "epoch": 0.6587273161413563, "grad_norm": 0.5520955920219421, "learning_rate": 4.106490828579141e-05, "loss": 0.55, "mean_token_accuracy": 0.824323034286499, "num_tokens": 369734096.0, "step": 11035 }, { "epoch": 0.6590257879656161, "grad_norm": 0.5377001166343689, "learning_rate": 4.10560472720899e-05, "loss": 0.5508, "mean_token_accuracy": 0.8276750564575195, "num_tokens": 369901776.0, "step": 11040 }, { "epoch": 0.6593242597898759, "grad_norm": 0.5589100122451782, "learning_rate": 4.104718295656287e-05, "loss": 0.5195, "mean_token_accuracy": 0.8350590467453003, "num_tokens": 370069456.0, "step": 11045 }, { "epoch": 0.6596227316141356, "grad_norm": 0.5749718546867371, "learning_rate": 4.103831534136938e-05, "loss": 0.5403, "mean_token_accuracy": 0.8269652843475341, "num_tokens": 370237136.0, "step": 11050 }, { "epoch": 0.6599212034383954, "grad_norm": 0.5292873382568359, "learning_rate": 4.102944442866929e-05, "loss": 0.5078, "mean_token_accuracy": 0.8370392560958863, "num_tokens": 370404816.0, "step": 11055 }, { "epoch": 0.6602196752626552, "grad_norm": 0.5220714211463928, "learning_rate": 4.102057022062329e-05, "loss": 0.5398, "mean_token_accuracy": 0.8292496681213379, "num_tokens": 370572496.0, "step": 11060 }, { "epoch": 0.660518147086915, "grad_norm": 0.47866925597190857, "learning_rate": 4.101169271939286e-05, "loss": 0.5166, "mean_token_accuracy": 0.8339794754981995, "num_tokens": 370740176.0, "step": 11065 }, { "epoch": 0.6608166189111748, "grad_norm": 0.4852518141269684, "learning_rate": 4.100281192714026e-05, "loss": 0.5341, "mean_token_accuracy": 0.8323750495910645, "num_tokens": 370907856.0, "step": 11070 }, { "epoch": 0.6611150907354346, "grad_norm": 0.6355628371238708, "learning_rate": 4.0993927846028584e-05, "loss": 0.5367, "mean_token_accuracy": 0.8273529648780823, "num_tokens": 371075536.0, "step": 11075 }, { "epoch": 0.6614135625596944, "grad_norm": 0.47717875242233276, "learning_rate": 4.0985040478221715e-05, "loss": 0.5267, "mean_token_accuracy": 0.8321245431900024, "num_tokens": 371243216.0, "step": 11080 }, { "epoch": 0.6617120343839542, "grad_norm": 0.5030310750007629, "learning_rate": 4.097614982588433e-05, "loss": 0.582, "mean_token_accuracy": 0.8168614983558655, "num_tokens": 371410896.0, "step": 11085 }, { "epoch": 0.6620105062082139, "grad_norm": 0.5493571162223816, "learning_rate": 4.096725589118191e-05, "loss": 0.5565, "mean_token_accuracy": 0.8234760880470275, "num_tokens": 371578576.0, "step": 11090 }, { "epoch": 0.6623089780324737, "grad_norm": 0.6388803124427795, "learning_rate": 4.0958358676280743e-05, "loss": 0.5603, "mean_token_accuracy": 0.822569477558136, "num_tokens": 371746256.0, "step": 11095 }, { "epoch": 0.6626074498567335, "grad_norm": 0.5451918840408325, "learning_rate": 4.094945818334791e-05, "loss": 0.5391, "mean_token_accuracy": 0.827806270122528, "num_tokens": 371913936.0, "step": 11100 }, { "epoch": 0.6629059216809933, "grad_norm": 0.5010148882865906, "learning_rate": 4.0940554414551294e-05, "loss": 0.5462, "mean_token_accuracy": 0.8250984191894531, "num_tokens": 372081616.0, "step": 11105 }, { "epoch": 0.6632043935052531, "grad_norm": 0.5547599792480469, "learning_rate": 4.0931647372059556e-05, "loss": 0.5618, "mean_token_accuracy": 0.8234104633331298, "num_tokens": 372249296.0, "step": 11110 }, { "epoch": 0.6635028653295129, "grad_norm": 0.5289527773857117, "learning_rate": 4.092273705804218e-05, "loss": 0.5502, "mean_token_accuracy": 0.8256829261779786, "num_tokens": 372416976.0, "step": 11115 }, { "epoch": 0.6638013371537727, "grad_norm": 0.5747686624526978, "learning_rate": 4.091382347466945e-05, "loss": 0.5508, "mean_token_accuracy": 0.8247942209243775, "num_tokens": 372584656.0, "step": 11120 }, { "epoch": 0.6640998089780324, "grad_norm": 0.5888289213180542, "learning_rate": 4.090490662411241e-05, "loss": 0.5238, "mean_token_accuracy": 0.8328581571578979, "num_tokens": 372752336.0, "step": 11125 }, { "epoch": 0.6643982808022922, "grad_norm": 0.5585777759552002, "learning_rate": 4.089598650854294e-05, "loss": 0.5967, "mean_token_accuracy": 0.8128116369247437, "num_tokens": 372920016.0, "step": 11130 }, { "epoch": 0.664696752626552, "grad_norm": 0.5006683468818665, "learning_rate": 4.088706313013369e-05, "loss": 0.5254, "mean_token_accuracy": 0.8320708513259888, "num_tokens": 373087696.0, "step": 11135 }, { "epoch": 0.6649952244508118, "grad_norm": 0.5076429843902588, "learning_rate": 4.0878136491058125e-05, "loss": 0.4872, "mean_token_accuracy": 0.8438029408454895, "num_tokens": 373255376.0, "step": 11140 }, { "epoch": 0.6652936962750716, "grad_norm": 0.5112064480781555, "learning_rate": 4.0869206593490475e-05, "loss": 0.5505, "mean_token_accuracy": 0.8246809005737304, "num_tokens": 373423056.0, "step": 11145 }, { "epoch": 0.6655921680993314, "grad_norm": 0.5198042392730713, "learning_rate": 4.086027343960581e-05, "loss": 0.5783, "mean_token_accuracy": 0.8192055344581604, "num_tokens": 373590736.0, "step": 11150 }, { "epoch": 0.6658906399235912, "grad_norm": 0.48146018385887146, "learning_rate": 4.085133703157993e-05, "loss": 0.5239, "mean_token_accuracy": 0.8302696108818054, "num_tokens": 373758416.0, "step": 11155 }, { "epoch": 0.666189111747851, "grad_norm": 0.49789804220199585, "learning_rate": 4.0842397371589484e-05, "loss": 0.5291, "mean_token_accuracy": 0.8303948521614075, "num_tokens": 373926096.0, "step": 11160 }, { "epoch": 0.6664875835721108, "grad_norm": 0.5661185383796692, "learning_rate": 4.083345446181188e-05, "loss": 0.5112, "mean_token_accuracy": 0.8365084171295166, "num_tokens": 374093776.0, "step": 11165 }, { "epoch": 0.6667860553963706, "grad_norm": 0.5354464054107666, "learning_rate": 4.082450830442534e-05, "loss": 0.5018, "mean_token_accuracy": 0.8397828936576843, "num_tokens": 374261456.0, "step": 11170 }, { "epoch": 0.6670845272206304, "grad_norm": 0.556145191192627, "learning_rate": 4.081555890160886e-05, "loss": 0.5456, "mean_token_accuracy": 0.8248717546463012, "num_tokens": 374429136.0, "step": 11175 }, { "epoch": 0.6673829990448902, "grad_norm": 0.5707510113716125, "learning_rate": 4.080660625554223e-05, "loss": 0.5323, "mean_token_accuracy": 0.8308541059494019, "num_tokens": 374596816.0, "step": 11180 }, { "epoch": 0.66768147086915, "grad_norm": 0.549965500831604, "learning_rate": 4.0797650368406044e-05, "loss": 0.5618, "mean_token_accuracy": 0.820213520526886, "num_tokens": 374764496.0, "step": 11185 }, { "epoch": 0.6679799426934098, "grad_norm": 0.5679465532302856, "learning_rate": 4.078869124238165e-05, "loss": 0.5411, "mean_token_accuracy": 0.828647255897522, "num_tokens": 374932176.0, "step": 11190 }, { "epoch": 0.6682784145176696, "grad_norm": 0.511679470539093, "learning_rate": 4.077972887965123e-05, "loss": 0.5272, "mean_token_accuracy": 0.8318680644035339, "num_tokens": 375099856.0, "step": 11195 }, { "epoch": 0.6685768863419294, "grad_norm": 0.4986500144004822, "learning_rate": 4.077076328239773e-05, "loss": 0.5589, "mean_token_accuracy": 0.8212096095085144, "num_tokens": 375267536.0, "step": 11200 }, { "epoch": 0.6688753581661891, "grad_norm": 0.6039528846740723, "learning_rate": 4.076179445280488e-05, "loss": 0.5612, "mean_token_accuracy": 0.8201720952987671, "num_tokens": 375433898.0, "step": 11205 }, { "epoch": 0.6691738299904489, "grad_norm": 0.535342276096344, "learning_rate": 4.07528223930572e-05, "loss": 0.5632, "mean_token_accuracy": 0.8205833315849305, "num_tokens": 375601578.0, "step": 11210 }, { "epoch": 0.6694723018147087, "grad_norm": 0.540589451789856, "learning_rate": 4.074384710534001e-05, "loss": 0.5321, "mean_token_accuracy": 0.8301443338394165, "num_tokens": 375769258.0, "step": 11215 }, { "epoch": 0.6697707736389685, "grad_norm": 0.510288417339325, "learning_rate": 4.0734868591839404e-05, "loss": 0.5401, "mean_token_accuracy": 0.8273291230201721, "num_tokens": 375936938.0, "step": 11220 }, { "epoch": 0.6700692454632283, "grad_norm": 0.5015286207199097, "learning_rate": 4.0725886854742266e-05, "loss": 0.5084, "mean_token_accuracy": 0.8369080305099488, "num_tokens": 376104618.0, "step": 11225 }, { "epoch": 0.670367717287488, "grad_norm": 0.5854654908180237, "learning_rate": 4.0716901896236264e-05, "loss": 0.542, "mean_token_accuracy": 0.827794349193573, "num_tokens": 376272298.0, "step": 11230 }, { "epoch": 0.6706661891117478, "grad_norm": 0.6088428497314453, "learning_rate": 4.070791371850985e-05, "loss": 0.5794, "mean_token_accuracy": 0.8176786422729492, "num_tokens": 376439978.0, "step": 11235 }, { "epoch": 0.6709646609360076, "grad_norm": 0.5472153425216675, "learning_rate": 4.069892232375226e-05, "loss": 0.5251, "mean_token_accuracy": 0.8329714894294739, "num_tokens": 376607658.0, "step": 11240 }, { "epoch": 0.6712631327602674, "grad_norm": 0.5092966556549072, "learning_rate": 4.0689927714153514e-05, "loss": 0.5462, "mean_token_accuracy": 0.8273112297058105, "num_tokens": 376775338.0, "step": 11245 }, { "epoch": 0.6715616045845272, "grad_norm": 0.5294577479362488, "learning_rate": 4.068092989190441e-05, "loss": 0.5336, "mean_token_accuracy": 0.831003212928772, "num_tokens": 376943018.0, "step": 11250 }, { "epoch": 0.671860076408787, "grad_norm": 0.5168683528900146, "learning_rate": 4.067192885919654e-05, "loss": 0.5366, "mean_token_accuracy": 0.8299057602882385, "num_tokens": 377110698.0, "step": 11255 }, { "epoch": 0.6721585482330468, "grad_norm": 0.5447584986686707, "learning_rate": 4.0662924618222276e-05, "loss": 0.5544, "mean_token_accuracy": 0.8227782368659973, "num_tokens": 377278378.0, "step": 11260 }, { "epoch": 0.6724570200573066, "grad_norm": 0.5891069173812866, "learning_rate": 4.0653917171174754e-05, "loss": 0.6083, "mean_token_accuracy": 0.8079506158828735, "num_tokens": 377446058.0, "step": 11265 }, { "epoch": 0.6727554918815664, "grad_norm": 0.5154567956924438, "learning_rate": 4.064490652024791e-05, "loss": 0.555, "mean_token_accuracy": 0.8246033549308777, "num_tokens": 377613738.0, "step": 11270 }, { "epoch": 0.6730539637058262, "grad_norm": 0.5120232701301575, "learning_rate": 4.063589266763646e-05, "loss": 0.5024, "mean_token_accuracy": 0.8392043352127075, "num_tokens": 377781418.0, "step": 11275 }, { "epoch": 0.673352435530086, "grad_norm": 0.5150737762451172, "learning_rate": 4.062687561553589e-05, "loss": 0.5392, "mean_token_accuracy": 0.8282416820526123, "num_tokens": 377949098.0, "step": 11280 }, { "epoch": 0.6736509073543457, "grad_norm": 0.6296200156211853, "learning_rate": 4.061785536614248e-05, "loss": 0.5853, "mean_token_accuracy": 0.8143564343452454, "num_tokens": 378116778.0, "step": 11285 }, { "epoch": 0.6739493791786055, "grad_norm": 0.5804410576820374, "learning_rate": 4.060883192165326e-05, "loss": 0.5638, "mean_token_accuracy": 0.8228199720382691, "num_tokens": 378284458.0, "step": 11290 }, { "epoch": 0.6742478510028653, "grad_norm": 0.4958733320236206, "learning_rate": 4.059980528426606e-05, "loss": 0.5386, "mean_token_accuracy": 0.8267982840538025, "num_tokens": 378452138.0, "step": 11295 }, { "epoch": 0.6745463228271251, "grad_norm": 0.5601612329483032, "learning_rate": 4.059077545617948e-05, "loss": 0.5519, "mean_token_accuracy": 0.825187873840332, "num_tokens": 378619818.0, "step": 11300 }, { "epoch": 0.6748447946513849, "grad_norm": 0.5521968603134155, "learning_rate": 4.058174243959292e-05, "loss": 0.552, "mean_token_accuracy": 0.8240665555000305, "num_tokens": 378787498.0, "step": 11305 }, { "epoch": 0.6751432664756447, "grad_norm": 0.5210601687431335, "learning_rate": 4.057270623670653e-05, "loss": 0.5915, "mean_token_accuracy": 0.8142132878303527, "num_tokens": 378955178.0, "step": 11310 }, { "epoch": 0.6754417382999045, "grad_norm": 0.4831472635269165, "learning_rate": 4.0563666849721246e-05, "loss": 0.5379, "mean_token_accuracy": 0.8285160541534424, "num_tokens": 379122858.0, "step": 11315 }, { "epoch": 0.6757402101241643, "grad_norm": 0.5644519925117493, "learning_rate": 4.0554624280838766e-05, "loss": 0.5395, "mean_token_accuracy": 0.8277525901794434, "num_tokens": 379290538.0, "step": 11320 }, { "epoch": 0.6760386819484241, "grad_norm": 0.5638044476509094, "learning_rate": 4.054557853226158e-05, "loss": 0.5692, "mean_token_accuracy": 0.8201180934906006, "num_tokens": 379458218.0, "step": 11325 }, { "epoch": 0.6763371537726839, "grad_norm": 0.4986534118652344, "learning_rate": 4.0536529606192945e-05, "loss": 0.503, "mean_token_accuracy": 0.8384468674659729, "num_tokens": 379625898.0, "step": 11330 }, { "epoch": 0.6766356255969437, "grad_norm": 0.5392152070999146, "learning_rate": 4.05274775048369e-05, "loss": 0.5476, "mean_token_accuracy": 0.8245914459228516, "num_tokens": 379793578.0, "step": 11335 }, { "epoch": 0.6769340974212035, "grad_norm": 0.5276528596878052, "learning_rate": 4.0518422230398256e-05, "loss": 0.5765, "mean_token_accuracy": 0.8172611355781555, "num_tokens": 379961258.0, "step": 11340 }, { "epoch": 0.6772325692454633, "grad_norm": 0.6615886092185974, "learning_rate": 4.0509363785082585e-05, "loss": 0.5776, "mean_token_accuracy": 0.8180245637893677, "num_tokens": 380128938.0, "step": 11345 }, { "epoch": 0.6775310410697231, "grad_norm": 0.6085119247436523, "learning_rate": 4.050030217109623e-05, "loss": 0.5224, "mean_token_accuracy": 0.8313193321228027, "num_tokens": 380296618.0, "step": 11350 }, { "epoch": 0.6778295128939829, "grad_norm": 0.5373436808586121, "learning_rate": 4.049123739064633e-05, "loss": 0.4897, "mean_token_accuracy": 0.8414588928222656, "num_tokens": 380464298.0, "step": 11355 }, { "epoch": 0.6781279847182426, "grad_norm": 0.5334434509277344, "learning_rate": 4.048216944594077e-05, "loss": 0.5479, "mean_token_accuracy": 0.8263449788093566, "num_tokens": 380631978.0, "step": 11360 }, { "epoch": 0.6784264565425024, "grad_norm": 0.530987560749054, "learning_rate": 4.047309833918822e-05, "loss": 0.5911, "mean_token_accuracy": 0.8143027663230896, "num_tokens": 380799658.0, "step": 11365 }, { "epoch": 0.6787249283667621, "grad_norm": 0.5713772177696228, "learning_rate": 4.046402407259812e-05, "loss": 0.5414, "mean_token_accuracy": 0.8291661739349365, "num_tokens": 380967338.0, "step": 11370 }, { "epoch": 0.6790234001910219, "grad_norm": 0.5532450079917908, "learning_rate": 4.045494664838066e-05, "loss": 0.5429, "mean_token_accuracy": 0.8269891500473022, "num_tokens": 381135018.0, "step": 11375 }, { "epoch": 0.6793218720152817, "grad_norm": 0.5362998843193054, "learning_rate": 4.044586606874682e-05, "loss": 0.5108, "mean_token_accuracy": 0.8373195767402649, "num_tokens": 381302698.0, "step": 11380 }, { "epoch": 0.6796203438395415, "grad_norm": 0.4848956763744354, "learning_rate": 4.043678233590835e-05, "loss": 0.5415, "mean_token_accuracy": 0.8279255628585815, "num_tokens": 381470378.0, "step": 11385 }, { "epoch": 0.6799188156638013, "grad_norm": 0.49829065799713135, "learning_rate": 4.042769545207775e-05, "loss": 0.5308, "mean_token_accuracy": 0.8315101981163024, "num_tokens": 381638058.0, "step": 11390 }, { "epoch": 0.6802172874880611, "grad_norm": 0.4904751479625702, "learning_rate": 4.041860541946831e-05, "loss": 0.5361, "mean_token_accuracy": 0.8296194791793823, "num_tokens": 381805738.0, "step": 11395 }, { "epoch": 0.6805157593123209, "grad_norm": 0.5338941812515259, "learning_rate": 4.040951224029407e-05, "loss": 0.5671, "mean_token_accuracy": 0.8181259751319885, "num_tokens": 381973418.0, "step": 11400 }, { "epoch": 0.6808142311365807, "grad_norm": 0.5032311081886292, "learning_rate": 4.040041591676985e-05, "loss": 0.5246, "mean_token_accuracy": 0.831754732131958, "num_tokens": 382141098.0, "step": 11405 }, { "epoch": 0.6811127029608405, "grad_norm": 0.6189849376678467, "learning_rate": 4.0391316451111206e-05, "loss": 0.5464, "mean_token_accuracy": 0.8272933244705201, "num_tokens": 382308778.0, "step": 11410 }, { "epoch": 0.6814111747851003, "grad_norm": 0.5293188095092773, "learning_rate": 4.03822138455345e-05, "loss": 0.5415, "mean_token_accuracy": 0.8288500547409058, "num_tokens": 382476458.0, "step": 11415 }, { "epoch": 0.6817096466093601, "grad_norm": 0.46318134665489197, "learning_rate": 4.037310810225684e-05, "loss": 0.5277, "mean_token_accuracy": 0.8333233952522278, "num_tokens": 382644138.0, "step": 11420 }, { "epoch": 0.6820081184336199, "grad_norm": 0.46766602993011475, "learning_rate": 4.0363999223496086e-05, "loss": 0.5148, "mean_token_accuracy": 0.8358284711837769, "num_tokens": 382811818.0, "step": 11425 }, { "epoch": 0.6823065902578797, "grad_norm": 0.5994117856025696, "learning_rate": 4.035488721147087e-05, "loss": 0.6112, "mean_token_accuracy": 0.8090047240257263, "num_tokens": 382978660.0, "step": 11430 }, { "epoch": 0.6826050620821394, "grad_norm": 0.670329749584198, "learning_rate": 4.0345772068400613e-05, "loss": 0.5322, "mean_token_accuracy": 0.8300548791885376, "num_tokens": 383146340.0, "step": 11435 }, { "epoch": 0.6829035339063992, "grad_norm": 0.45337140560150146, "learning_rate": 4.0336653796505454e-05, "loss": 0.5051, "mean_token_accuracy": 0.839311707019806, "num_tokens": 383314020.0, "step": 11440 }, { "epoch": 0.683202005730659, "grad_norm": 0.6317613124847412, "learning_rate": 4.032753239800633e-05, "loss": 0.5602, "mean_token_accuracy": 0.8210187315940857, "num_tokens": 383481700.0, "step": 11445 }, { "epoch": 0.6835004775549188, "grad_norm": 3.444387912750244, "learning_rate": 4.031840787512492e-05, "loss": 0.5405, "mean_token_accuracy": 0.8279613614082336, "num_tokens": 383649380.0, "step": 11450 }, { "epoch": 0.6837989493791786, "grad_norm": 0.49650055170059204, "learning_rate": 4.0309280230083675e-05, "loss": 0.5693, "mean_token_accuracy": 0.8220326781272889, "num_tokens": 383817060.0, "step": 11455 }, { "epoch": 0.6840974212034384, "grad_norm": 0.49125024676322937, "learning_rate": 4.030014946510578e-05, "loss": 0.5188, "mean_token_accuracy": 0.8342896342277527, "num_tokens": 383984740.0, "step": 11460 }, { "epoch": 0.6843958930276982, "grad_norm": 0.5134462118148804, "learning_rate": 4.029101558241523e-05, "loss": 0.4904, "mean_token_accuracy": 0.842305850982666, "num_tokens": 384152420.0, "step": 11465 }, { "epoch": 0.684694364851958, "grad_norm": 0.5404857397079468, "learning_rate": 4.028187858423675e-05, "loss": 0.5382, "mean_token_accuracy": 0.830752718448639, "num_tokens": 384320100.0, "step": 11470 }, { "epoch": 0.6849928366762178, "grad_norm": 0.5220535397529602, "learning_rate": 4.027273847279581e-05, "loss": 0.551, "mean_token_accuracy": 0.8270487904548645, "num_tokens": 384487780.0, "step": 11475 }, { "epoch": 0.6852913085004776, "grad_norm": 0.5012786984443665, "learning_rate": 4.026359525031865e-05, "loss": 0.5389, "mean_token_accuracy": 0.8275796294212341, "num_tokens": 384655460.0, "step": 11480 }, { "epoch": 0.6855897803247374, "grad_norm": 0.5386480093002319, "learning_rate": 4.025444891903228e-05, "loss": 0.593, "mean_token_accuracy": 0.8125670909881592, "num_tokens": 384823140.0, "step": 11485 }, { "epoch": 0.6858882521489972, "grad_norm": 0.5205801725387573, "learning_rate": 4.024529948116446e-05, "loss": 0.5183, "mean_token_accuracy": 0.8351306200027466, "num_tokens": 384990820.0, "step": 11490 }, { "epoch": 0.686186723973257, "grad_norm": 0.5129269361495972, "learning_rate": 4.02361469389437e-05, "loss": 0.5559, "mean_token_accuracy": 0.8248896479606629, "num_tokens": 385158500.0, "step": 11495 }, { "epoch": 0.6864851957975168, "grad_norm": 0.5198088884353638, "learning_rate": 4.022699129459927e-05, "loss": 0.5297, "mean_token_accuracy": 0.8319456100463867, "num_tokens": 385326180.0, "step": 11500 }, { "epoch": 0.6867836676217765, "grad_norm": 0.5159403681755066, "learning_rate": 4.021783255036119e-05, "loss": 0.5388, "mean_token_accuracy": 0.8272396564483643, "num_tokens": 385493860.0, "step": 11505 }, { "epoch": 0.6870821394460362, "grad_norm": 0.5392984747886658, "learning_rate": 4.020867070846025e-05, "loss": 0.5173, "mean_token_accuracy": 0.8346176862716674, "num_tokens": 385661540.0, "step": 11510 }, { "epoch": 0.687380611270296, "grad_norm": 0.5313904285430908, "learning_rate": 4.0199505771127994e-05, "loss": 0.5205, "mean_token_accuracy": 0.8346057534217834, "num_tokens": 385829220.0, "step": 11515 }, { "epoch": 0.6876790830945558, "grad_norm": 0.5097572207450867, "learning_rate": 4.019033774059669e-05, "loss": 0.5435, "mean_token_accuracy": 0.8272277235984802, "num_tokens": 385996900.0, "step": 11520 }, { "epoch": 0.6879775549188156, "grad_norm": 0.5073562860488892, "learning_rate": 4.01811666190994e-05, "loss": 0.4907, "mean_token_accuracy": 0.843045449256897, "num_tokens": 386164580.0, "step": 11525 }, { "epoch": 0.6882760267430754, "grad_norm": 0.4953153431415558, "learning_rate": 4.017199240886991e-05, "loss": 0.5051, "mean_token_accuracy": 0.8365501642227173, "num_tokens": 386332260.0, "step": 11530 }, { "epoch": 0.6885744985673352, "grad_norm": 0.5245486497879028, "learning_rate": 4.0162815112142767e-05, "loss": 0.511, "mean_token_accuracy": 0.8372300863265991, "num_tokens": 386499940.0, "step": 11535 }, { "epoch": 0.688872970391595, "grad_norm": 0.6272040009498596, "learning_rate": 4.015363473115328e-05, "loss": 0.5482, "mean_token_accuracy": 0.826303231716156, "num_tokens": 386667620.0, "step": 11540 }, { "epoch": 0.6891714422158548, "grad_norm": 0.5627403855323792, "learning_rate": 4.014445126813749e-05, "loss": 0.5539, "mean_token_accuracy": 0.8240188479423523, "num_tokens": 386835300.0, "step": 11545 }, { "epoch": 0.6894699140401146, "grad_norm": 0.5106717348098755, "learning_rate": 4.0135264725332205e-05, "loss": 0.5309, "mean_token_accuracy": 0.8307348132133484, "num_tokens": 387002980.0, "step": 11550 }, { "epoch": 0.6897683858643744, "grad_norm": 0.4849369525909424, "learning_rate": 4.0126075104974975e-05, "loss": 0.5139, "mean_token_accuracy": 0.8371466040611267, "num_tokens": 387170660.0, "step": 11555 }, { "epoch": 0.6900668576886342, "grad_norm": 0.5245375633239746, "learning_rate": 4.011688240930411e-05, "loss": 0.5589, "mean_token_accuracy": 0.8229213953018188, "num_tokens": 387338340.0, "step": 11560 }, { "epoch": 0.690365329512894, "grad_norm": 0.5323561429977417, "learning_rate": 4.010768664055865e-05, "loss": 0.5137, "mean_token_accuracy": 0.8351425528526306, "num_tokens": 387506020.0, "step": 11565 }, { "epoch": 0.6906638013371538, "grad_norm": 0.5747860074043274, "learning_rate": 4.009848780097839e-05, "loss": 0.5165, "mean_token_accuracy": 0.8349517107009887, "num_tokens": 387673700.0, "step": 11570 }, { "epoch": 0.6909622731614136, "grad_norm": 0.5796414613723755, "learning_rate": 4.008928589280389e-05, "loss": 0.5634, "mean_token_accuracy": 0.8215495467185974, "num_tokens": 387841380.0, "step": 11575 }, { "epoch": 0.6912607449856734, "grad_norm": 0.5231610536575317, "learning_rate": 4.008008091827644e-05, "loss": 0.5251, "mean_token_accuracy": 0.8329392671585083, "num_tokens": 388001766.0, "step": 11580 }, { "epoch": 0.6915592168099332, "grad_norm": 0.49011021852493286, "learning_rate": 4.0070872879638084e-05, "loss": 0.5026, "mean_token_accuracy": 0.8392520546913147, "num_tokens": 388169446.0, "step": 11585 }, { "epoch": 0.691857688634193, "grad_norm": 0.5687540173530579, "learning_rate": 4.0061661779131604e-05, "loss": 0.5524, "mean_token_accuracy": 0.8256352066993713, "num_tokens": 388337126.0, "step": 11590 }, { "epoch": 0.6921561604584527, "grad_norm": 0.5706913471221924, "learning_rate": 4.0052447619000536e-05, "loss": 0.5793, "mean_token_accuracy": 0.8191113352775574, "num_tokens": 388504676.0, "step": 11595 }, { "epoch": 0.6924546322827125, "grad_norm": 0.4936061203479767, "learning_rate": 4.004323040148914e-05, "loss": 0.5478, "mean_token_accuracy": 0.8254204988479614, "num_tokens": 388672356.0, "step": 11600 }, { "epoch": 0.6927531041069723, "grad_norm": 0.5241875052452087, "learning_rate": 4.0034010128842484e-05, "loss": 0.5732, "mean_token_accuracy": 0.8186508536338806, "num_tokens": 388840036.0, "step": 11605 }, { "epoch": 0.6930515759312321, "grad_norm": 0.5459261536598206, "learning_rate": 4.00247868033063e-05, "loss": 0.551, "mean_token_accuracy": 0.8260706067085266, "num_tokens": 389007716.0, "step": 11610 }, { "epoch": 0.6933500477554919, "grad_norm": 0.5209773182868958, "learning_rate": 4.001556042712711e-05, "loss": 0.5897, "mean_token_accuracy": 0.8160682201385498, "num_tokens": 389175396.0, "step": 11615 }, { "epoch": 0.6936485195797517, "grad_norm": 0.4841661751270294, "learning_rate": 4.000633100255217e-05, "loss": 0.4998, "mean_token_accuracy": 0.8390492677688599, "num_tokens": 389343076.0, "step": 11620 }, { "epoch": 0.6939469914040115, "grad_norm": 0.5271602272987366, "learning_rate": 3.9997098531829475e-05, "loss": 0.5545, "mean_token_accuracy": 0.8236132621765136, "num_tokens": 389510756.0, "step": 11625 }, { "epoch": 0.6942454632282713, "grad_norm": 0.45626330375671387, "learning_rate": 3.9987863017207755e-05, "loss": 0.5747, "mean_token_accuracy": 0.8168734312057495, "num_tokens": 389678436.0, "step": 11630 }, { "epoch": 0.6945439350525311, "grad_norm": 0.5206072926521301, "learning_rate": 3.99786244609365e-05, "loss": 0.5613, "mean_token_accuracy": 0.8222772240638733, "num_tokens": 389846116.0, "step": 11635 }, { "epoch": 0.6948424068767909, "grad_norm": 0.5700442790985107, "learning_rate": 3.9969382865265936e-05, "loss": 0.5338, "mean_token_accuracy": 0.8291244149208069, "num_tokens": 390013796.0, "step": 11640 }, { "epoch": 0.6951408787010506, "grad_norm": 0.5254628658294678, "learning_rate": 3.9960138232447004e-05, "loss": 0.5192, "mean_token_accuracy": 0.8340987682342529, "num_tokens": 390181476.0, "step": 11645 }, { "epoch": 0.6954393505253104, "grad_norm": 0.5238580107688904, "learning_rate": 3.995089056473143e-05, "loss": 0.5276, "mean_token_accuracy": 0.831742811203003, "num_tokens": 390349156.0, "step": 11650 }, { "epoch": 0.6957378223495702, "grad_norm": 0.5199779272079468, "learning_rate": 3.994163986437163e-05, "loss": 0.5507, "mean_token_accuracy": 0.8270249485969543, "num_tokens": 390516836.0, "step": 11655 }, { "epoch": 0.69603629417383, "grad_norm": 0.7571831941604614, "learning_rate": 3.9932386133620795e-05, "loss": 0.5396, "mean_token_accuracy": 0.8308421850204468, "num_tokens": 390684516.0, "step": 11660 }, { "epoch": 0.6963347659980897, "grad_norm": 0.5050497651100159, "learning_rate": 3.9923129374732854e-05, "loss": 0.5254, "mean_token_accuracy": 0.8310747861862182, "num_tokens": 390852196.0, "step": 11665 }, { "epoch": 0.6966332378223495, "grad_norm": 0.4855859577655792, "learning_rate": 3.9913869589962445e-05, "loss": 0.5368, "mean_token_accuracy": 0.8278778433799744, "num_tokens": 391019876.0, "step": 11670 }, { "epoch": 0.6969317096466093, "grad_norm": 0.5321979522705078, "learning_rate": 3.9904606781564965e-05, "loss": 0.5205, "mean_token_accuracy": 0.832142424583435, "num_tokens": 391187556.0, "step": 11675 }, { "epoch": 0.6972301814708691, "grad_norm": 0.536324143409729, "learning_rate": 3.989534095179654e-05, "loss": 0.5885, "mean_token_accuracy": 0.8154181122779847, "num_tokens": 391355236.0, "step": 11680 }, { "epoch": 0.6975286532951289, "grad_norm": 0.5377629399299622, "learning_rate": 3.9886072102914016e-05, "loss": 0.5614, "mean_token_accuracy": 0.8204342246055603, "num_tokens": 391522916.0, "step": 11685 }, { "epoch": 0.6978271251193887, "grad_norm": 0.4855128228664398, "learning_rate": 3.987680023717503e-05, "loss": 0.5199, "mean_token_accuracy": 0.8315936923027039, "num_tokens": 391690596.0, "step": 11690 }, { "epoch": 0.6981255969436485, "grad_norm": 0.5176255702972412, "learning_rate": 3.98675253568379e-05, "loss": 0.5914, "mean_token_accuracy": 0.8139866471290589, "num_tokens": 391858276.0, "step": 11695 }, { "epoch": 0.6984240687679083, "grad_norm": 0.5208262801170349, "learning_rate": 3.985824746416168e-05, "loss": 0.5236, "mean_token_accuracy": 0.8321722507476806, "num_tokens": 392025956.0, "step": 11700 }, { "epoch": 0.6987225405921681, "grad_norm": 0.6102920770645142, "learning_rate": 3.9848966561406185e-05, "loss": 0.5612, "mean_token_accuracy": 0.8202373743057251, "num_tokens": 392193636.0, "step": 11705 }, { "epoch": 0.6990210124164279, "grad_norm": 0.5241305232048035, "learning_rate": 3.9839682650831956e-05, "loss": 0.559, "mean_token_accuracy": 0.8223368763923645, "num_tokens": 392361316.0, "step": 11710 }, { "epoch": 0.6993194842406877, "grad_norm": 0.44423869252204895, "learning_rate": 3.983039573470024e-05, "loss": 0.5544, "mean_token_accuracy": 0.8246928215026855, "num_tokens": 392528996.0, "step": 11715 }, { "epoch": 0.6996179560649475, "grad_norm": 0.5356616377830505, "learning_rate": 3.982110581527306e-05, "loss": 0.5148, "mean_token_accuracy": 0.8359656572341919, "num_tokens": 392696676.0, "step": 11720 }, { "epoch": 0.6999164278892073, "grad_norm": 0.5257697701454163, "learning_rate": 3.981181289481313e-05, "loss": 0.5379, "mean_token_accuracy": 0.8279494166374206, "num_tokens": 392864356.0, "step": 11725 }, { "epoch": 0.7002148997134671, "grad_norm": 0.5748518705368042, "learning_rate": 3.980251697558392e-05, "loss": 0.5771, "mean_token_accuracy": 0.8179112434387207, "num_tokens": 393032036.0, "step": 11730 }, { "epoch": 0.7005133715377269, "grad_norm": 0.5754997134208679, "learning_rate": 3.9793218059849627e-05, "loss": 0.5829, "mean_token_accuracy": 0.8174519777297974, "num_tokens": 393199716.0, "step": 11735 }, { "epoch": 0.7008118433619867, "grad_norm": 0.6012305617332458, "learning_rate": 3.978391614987517e-05, "loss": 0.5569, "mean_token_accuracy": 0.8231122493743896, "num_tokens": 393367396.0, "step": 11740 }, { "epoch": 0.7011103151862464, "grad_norm": 0.5065628290176392, "learning_rate": 3.97746112479262e-05, "loss": 0.5348, "mean_token_accuracy": 0.8282297492027283, "num_tokens": 393535076.0, "step": 11745 }, { "epoch": 0.7014087870105062, "grad_norm": 0.5431389808654785, "learning_rate": 3.976530335626909e-05, "loss": 0.5798, "mean_token_accuracy": 0.8172134160995483, "num_tokens": 393702756.0, "step": 11750 }, { "epoch": 0.701707258834766, "grad_norm": 0.5676679611206055, "learning_rate": 3.975599247717096e-05, "loss": 0.5718, "mean_token_accuracy": 0.8192114949226379, "num_tokens": 393870436.0, "step": 11755 }, { "epoch": 0.7020057306590258, "grad_norm": 0.5505884885787964, "learning_rate": 3.974667861289964e-05, "loss": 0.5709, "mean_token_accuracy": 0.8195395469665527, "num_tokens": 394038116.0, "step": 11760 }, { "epoch": 0.7023042024832856, "grad_norm": 0.5596762895584106, "learning_rate": 3.9737361765723703e-05, "loss": 0.5106, "mean_token_accuracy": 0.8344387531280517, "num_tokens": 394205796.0, "step": 11765 }, { "epoch": 0.7026026743075454, "grad_norm": 0.48200860619544983, "learning_rate": 3.972804193791242e-05, "loss": 0.5475, "mean_token_accuracy": 0.8268400430679321, "num_tokens": 394373476.0, "step": 11770 }, { "epoch": 0.7029011461318052, "grad_norm": 0.5312851667404175, "learning_rate": 3.9718719131735824e-05, "loss": 0.5471, "mean_token_accuracy": 0.8260825395584106, "num_tokens": 394541156.0, "step": 11775 }, { "epoch": 0.7031996179560649, "grad_norm": 0.49796268343925476, "learning_rate": 3.9709393349464656e-05, "loss": 0.5014, "mean_token_accuracy": 0.8380353093147278, "num_tokens": 394708836.0, "step": 11780 }, { "epoch": 0.7034980897803247, "grad_norm": 0.5282920598983765, "learning_rate": 3.970006459337038e-05, "loss": 0.5596, "mean_token_accuracy": 0.8240128993988037, "num_tokens": 394876516.0, "step": 11785 }, { "epoch": 0.7037965616045845, "grad_norm": 0.5482062697410583, "learning_rate": 3.9690732865725186e-05, "loss": 0.5674, "mean_token_accuracy": 0.8192890405654907, "num_tokens": 395044196.0, "step": 11790 }, { "epoch": 0.7040950334288443, "grad_norm": 0.5222627520561218, "learning_rate": 3.9681398168801974e-05, "loss": 0.5149, "mean_token_accuracy": 0.833997368812561, "num_tokens": 395211876.0, "step": 11795 }, { "epoch": 0.7043935052531041, "grad_norm": 0.5341181755065918, "learning_rate": 3.967206050487441e-05, "loss": 0.5683, "mean_token_accuracy": 0.8223965167999268, "num_tokens": 395379556.0, "step": 11800 }, { "epoch": 0.7046919770773639, "grad_norm": 0.5348356366157532, "learning_rate": 3.966271987621683e-05, "loss": 0.4906, "mean_token_accuracy": 0.8417690515518188, "num_tokens": 395547236.0, "step": 11805 }, { "epoch": 0.7049904489016237, "grad_norm": 0.5016529560089111, "learning_rate": 3.965337628510433e-05, "loss": 0.4846, "mean_token_accuracy": 0.8448347806930542, "num_tokens": 395714916.0, "step": 11810 }, { "epoch": 0.7052889207258835, "grad_norm": 0.5226614475250244, "learning_rate": 3.964402973381271e-05, "loss": 0.5207, "mean_token_accuracy": 0.8325957298278809, "num_tokens": 395882596.0, "step": 11815 }, { "epoch": 0.7055873925501432, "grad_norm": 0.5080669522285461, "learning_rate": 3.9634680224618495e-05, "loss": 0.6092, "mean_token_accuracy": 0.8084575891494751, "num_tokens": 396050276.0, "step": 11820 }, { "epoch": 0.705885864374403, "grad_norm": 0.5736380219459534, "learning_rate": 3.962532775979893e-05, "loss": 0.569, "mean_token_accuracy": 0.8204342126846313, "num_tokens": 396217956.0, "step": 11825 }, { "epoch": 0.7061843361986628, "grad_norm": 0.48929014801979065, "learning_rate": 3.961597234163199e-05, "loss": 0.5596, "mean_token_accuracy": 0.822957181930542, "num_tokens": 396385636.0, "step": 11830 }, { "epoch": 0.7064828080229226, "grad_norm": 0.47653937339782715, "learning_rate": 3.9606613972396334e-05, "loss": 0.5443, "mean_token_accuracy": 0.8255397796630859, "num_tokens": 396553316.0, "step": 11835 }, { "epoch": 0.7067812798471824, "grad_norm": 0.5823026299476624, "learning_rate": 3.959725265437139e-05, "loss": 0.5712, "mean_token_accuracy": 0.8201896786689759, "num_tokens": 396720996.0, "step": 11840 }, { "epoch": 0.7070797516714422, "grad_norm": 0.5395498275756836, "learning_rate": 3.958788838983727e-05, "loss": 0.5117, "mean_token_accuracy": 0.834862220287323, "num_tokens": 396888676.0, "step": 11845 }, { "epoch": 0.707378223495702, "grad_norm": 0.599995493888855, "learning_rate": 3.9578521181074825e-05, "loss": 0.5234, "mean_token_accuracy": 0.8309496760368347, "num_tokens": 397041315.0, "step": 11850 }, { "epoch": 0.7076766953199618, "grad_norm": 0.6038875579833984, "learning_rate": 3.9569151030365596e-05, "loss": 0.5794, "mean_token_accuracy": 0.8185613512992859, "num_tokens": 397208995.0, "step": 11855 }, { "epoch": 0.7079751671442216, "grad_norm": 0.5417625904083252, "learning_rate": 3.955977793999186e-05, "loss": 0.5957, "mean_token_accuracy": 0.8149170875549316, "num_tokens": 397376675.0, "step": 11860 }, { "epoch": 0.7082736389684814, "grad_norm": 0.5411232113838196, "learning_rate": 3.955040191223661e-05, "loss": 0.5645, "mean_token_accuracy": 0.8206071734428406, "num_tokens": 397544355.0, "step": 11865 }, { "epoch": 0.7085721107927412, "grad_norm": 0.5124090909957886, "learning_rate": 3.9541022949383563e-05, "loss": 0.5332, "mean_token_accuracy": 0.8303113579750061, "num_tokens": 397712035.0, "step": 11870 }, { "epoch": 0.708870582617001, "grad_norm": 0.5989068150520325, "learning_rate": 3.9531641053717107e-05, "loss": 0.5753, "mean_token_accuracy": 0.8204282402992249, "num_tokens": 397879715.0, "step": 11875 }, { "epoch": 0.7091690544412608, "grad_norm": 0.5417442321777344, "learning_rate": 3.95222562275224e-05, "loss": 0.543, "mean_token_accuracy": 0.8272575497627258, "num_tokens": 398047395.0, "step": 11880 }, { "epoch": 0.7094675262655206, "grad_norm": 0.5037710666656494, "learning_rate": 3.951286847308528e-05, "loss": 0.5674, "mean_token_accuracy": 0.8210068106651306, "num_tokens": 398215075.0, "step": 11885 }, { "epoch": 0.7097659980897804, "grad_norm": 0.5060092210769653, "learning_rate": 3.950347779269232e-05, "loss": 0.5724, "mean_token_accuracy": 0.8195335865020752, "num_tokens": 398382755.0, "step": 11890 }, { "epoch": 0.7100644699140402, "grad_norm": 0.4879721701145172, "learning_rate": 3.9494084188630776e-05, "loss": 0.5205, "mean_token_accuracy": 0.834009301662445, "num_tokens": 398550435.0, "step": 11895 }, { "epoch": 0.7103629417383, "grad_norm": 0.49785342812538147, "learning_rate": 3.948468766318864e-05, "loss": 0.5832, "mean_token_accuracy": 0.815513551235199, "num_tokens": 398718115.0, "step": 11900 }, { "epoch": 0.7106614135625597, "grad_norm": 0.5198028087615967, "learning_rate": 3.9475288218654615e-05, "loss": 0.5383, "mean_token_accuracy": 0.8280090689659119, "num_tokens": 398885795.0, "step": 11905 }, { "epoch": 0.7109598853868195, "grad_norm": 0.5034233927726746, "learning_rate": 3.946588585731812e-05, "loss": 0.5343, "mean_token_accuracy": 0.8311761975288391, "num_tokens": 399053475.0, "step": 11910 }, { "epoch": 0.7112583572110793, "grad_norm": 0.523741602897644, "learning_rate": 3.945648058146924e-05, "loss": 0.5224, "mean_token_accuracy": 0.8322975039482117, "num_tokens": 399221155.0, "step": 11915 }, { "epoch": 0.711556829035339, "grad_norm": 0.49971213936805725, "learning_rate": 3.9447072393398835e-05, "loss": 0.5481, "mean_token_accuracy": 0.8265239119529724, "num_tokens": 399388835.0, "step": 11920 }, { "epoch": 0.7118553008595988, "grad_norm": 0.5266532897949219, "learning_rate": 3.943766129539843e-05, "loss": 0.5608, "mean_token_accuracy": 0.8214004516601563, "num_tokens": 399556515.0, "step": 11925 }, { "epoch": 0.7121537726838586, "grad_norm": 0.5646588802337646, "learning_rate": 3.9428247289760266e-05, "loss": 0.5514, "mean_token_accuracy": 0.8242395162582398, "num_tokens": 399724195.0, "step": 11930 }, { "epoch": 0.7124522445081184, "grad_norm": 0.5337622761726379, "learning_rate": 3.94188303787773e-05, "loss": 0.5632, "mean_token_accuracy": 0.8218060255050659, "num_tokens": 399891875.0, "step": 11935 }, { "epoch": 0.7127507163323782, "grad_norm": 0.4910246729850769, "learning_rate": 3.9409410564743215e-05, "loss": 0.5686, "mean_token_accuracy": 0.8197960138320923, "num_tokens": 400059555.0, "step": 11940 }, { "epoch": 0.713049188156638, "grad_norm": 0.5019068121910095, "learning_rate": 3.939998784995236e-05, "loss": 0.5313, "mean_token_accuracy": 0.8304067730903626, "num_tokens": 400227235.0, "step": 11945 }, { "epoch": 0.7133476599808978, "grad_norm": 0.5448180437088013, "learning_rate": 3.9390562236699804e-05, "loss": 0.5439, "mean_token_accuracy": 0.8271501898765564, "num_tokens": 400394915.0, "step": 11950 }, { "epoch": 0.7136461318051576, "grad_norm": 0.5013893842697144, "learning_rate": 3.938113372728135e-05, "loss": 0.5295, "mean_token_accuracy": 0.8310032248497009, "num_tokens": 400562595.0, "step": 11955 }, { "epoch": 0.7139446036294174, "grad_norm": 0.5113101601600647, "learning_rate": 3.937170232399348e-05, "loss": 0.4928, "mean_token_accuracy": 0.8408922672271728, "num_tokens": 400730275.0, "step": 11960 }, { "epoch": 0.7142430754536772, "grad_norm": 0.4909002482891083, "learning_rate": 3.936226802913337e-05, "loss": 0.5759, "mean_token_accuracy": 0.8190206289291382, "num_tokens": 400897955.0, "step": 11965 }, { "epoch": 0.714541547277937, "grad_norm": 0.5437666773796082, "learning_rate": 3.935283084499894e-05, "loss": 0.5645, "mean_token_accuracy": 0.8215435981750489, "num_tokens": 401065635.0, "step": 11970 }, { "epoch": 0.7148400191021967, "grad_norm": 0.5017220973968506, "learning_rate": 3.934339077388878e-05, "loss": 0.549, "mean_token_accuracy": 0.8266372323036194, "num_tokens": 401233315.0, "step": 11975 }, { "epoch": 0.7151384909264565, "grad_norm": 0.49572592973709106, "learning_rate": 3.9333947818102186e-05, "loss": 0.5548, "mean_token_accuracy": 0.8233866214752197, "num_tokens": 401400995.0, "step": 11980 }, { "epoch": 0.7154369627507163, "grad_norm": 0.48280057311058044, "learning_rate": 3.932450197993917e-05, "loss": 0.5287, "mean_token_accuracy": 0.8322199702262878, "num_tokens": 401568675.0, "step": 11985 }, { "epoch": 0.7157354345749761, "grad_norm": 0.54119873046875, "learning_rate": 3.931505326170044e-05, "loss": 0.5695, "mean_token_accuracy": 0.8198675870895386, "num_tokens": 401736355.0, "step": 11990 }, { "epoch": 0.7160339063992359, "grad_norm": 0.5073707103729248, "learning_rate": 3.9305601665687416e-05, "loss": 0.5307, "mean_token_accuracy": 0.8299296259880066, "num_tokens": 401904035.0, "step": 11995 }, { "epoch": 0.7163323782234957, "grad_norm": 0.4858524203300476, "learning_rate": 3.92961471942022e-05, "loss": 0.5213, "mean_token_accuracy": 0.8352320075035096, "num_tokens": 402071715.0, "step": 12000 }, { "epoch": 0.7166308500477555, "grad_norm": 0.502540647983551, "learning_rate": 3.92866898495476e-05, "loss": 0.5117, "mean_token_accuracy": 0.8379339218139649, "num_tokens": 402239395.0, "step": 12005 }, { "epoch": 0.7169293218720153, "grad_norm": 0.5087219476699829, "learning_rate": 3.927722963402712e-05, "loss": 0.5334, "mean_token_accuracy": 0.8295299887657166, "num_tokens": 402407075.0, "step": 12010 }, { "epoch": 0.7172277936962751, "grad_norm": 0.5110599398612976, "learning_rate": 3.926776654994499e-05, "loss": 0.5683, "mean_token_accuracy": 0.8199093461036682, "num_tokens": 402574755.0, "step": 12015 }, { "epoch": 0.7175262655205349, "grad_norm": 0.47929057478904724, "learning_rate": 3.925830059960611e-05, "loss": 0.5232, "mean_token_accuracy": 0.8329834103584289, "num_tokens": 402742435.0, "step": 12020 }, { "epoch": 0.7178247373447947, "grad_norm": 0.624451220035553, "learning_rate": 3.924883178531608e-05, "loss": 0.4865, "mean_token_accuracy": 0.8451986193656922, "num_tokens": 402910115.0, "step": 12025 }, { "epoch": 0.7181232091690545, "grad_norm": 0.6565896272659302, "learning_rate": 3.923936010938122e-05, "loss": 0.5543, "mean_token_accuracy": 0.8263302445411682, "num_tokens": 403070069.0, "step": 12030 }, { "epoch": 0.7184216809933143, "grad_norm": 0.5351299047470093, "learning_rate": 3.922988557410852e-05, "loss": 0.558, "mean_token_accuracy": 0.8221161842346192, "num_tokens": 403237749.0, "step": 12035 }, { "epoch": 0.7187201528175741, "grad_norm": 0.5642988681793213, "learning_rate": 3.922040818180567e-05, "loss": 0.5823, "mean_token_accuracy": 0.8155075788497925, "num_tokens": 403405429.0, "step": 12040 }, { "epoch": 0.7190186246418339, "grad_norm": 0.46795371174812317, "learning_rate": 3.921092793478107e-05, "loss": 0.5247, "mean_token_accuracy": 0.8317189455032349, "num_tokens": 403573109.0, "step": 12045 }, { "epoch": 0.7193170964660937, "grad_norm": 0.5519506335258484, "learning_rate": 3.920144483534382e-05, "loss": 0.5669, "mean_token_accuracy": 0.8202493190765381, "num_tokens": 403740789.0, "step": 12050 }, { "epoch": 0.7196155682903533, "grad_norm": 0.6325945854187012, "learning_rate": 3.9191958885803704e-05, "loss": 0.6071, "mean_token_accuracy": 0.8072289228439331, "num_tokens": 403908469.0, "step": 12055 }, { "epoch": 0.7199140401146131, "grad_norm": 0.5446462035179138, "learning_rate": 3.9182470088471186e-05, "loss": 0.5619, "mean_token_accuracy": 0.8210604786872864, "num_tokens": 404076149.0, "step": 12060 }, { "epoch": 0.7202125119388729, "grad_norm": 0.4894942343235016, "learning_rate": 3.9172978445657446e-05, "loss": 0.5261, "mean_token_accuracy": 0.8332219958305359, "num_tokens": 404243829.0, "step": 12065 }, { "epoch": 0.7205109837631327, "grad_norm": 0.5383883714675903, "learning_rate": 3.9163483959674335e-05, "loss": 0.534, "mean_token_accuracy": 0.8312239050865173, "num_tokens": 404411509.0, "step": 12070 }, { "epoch": 0.7208094555873925, "grad_norm": 0.5403720736503601, "learning_rate": 3.9153986632834436e-05, "loss": 0.5839, "mean_token_accuracy": 0.8152391672134399, "num_tokens": 404579189.0, "step": 12075 }, { "epoch": 0.7211079274116523, "grad_norm": 0.5110628008842468, "learning_rate": 3.9144486467450974e-05, "loss": 0.5637, "mean_token_accuracy": 0.8216330647468567, "num_tokens": 404746869.0, "step": 12080 }, { "epoch": 0.7214063992359121, "grad_norm": 0.5106228590011597, "learning_rate": 3.913498346583791e-05, "loss": 0.517, "mean_token_accuracy": 0.8357747673988343, "num_tokens": 404914549.0, "step": 12085 }, { "epoch": 0.7217048710601719, "grad_norm": 0.4880300462245941, "learning_rate": 3.9125477630309865e-05, "loss": 0.5361, "mean_token_accuracy": 0.8293093085289002, "num_tokens": 405082229.0, "step": 12090 }, { "epoch": 0.7220033428844317, "grad_norm": 0.530001163482666, "learning_rate": 3.911596896318216e-05, "loss": 0.5496, "mean_token_accuracy": 0.82604079246521, "num_tokens": 405249909.0, "step": 12095 }, { "epoch": 0.7223018147086915, "grad_norm": 0.5284551978111267, "learning_rate": 3.9106457466770805e-05, "loss": 0.524, "mean_token_accuracy": 0.8315698385238648, "num_tokens": 405417589.0, "step": 12100 }, { "epoch": 0.7226002865329513, "grad_norm": 0.4907339811325073, "learning_rate": 3.9096943143392506e-05, "loss": 0.5637, "mean_token_accuracy": 0.8217523574829102, "num_tokens": 405585269.0, "step": 12105 }, { "epoch": 0.7228987583572111, "grad_norm": 0.571054220199585, "learning_rate": 3.908742599536465e-05, "loss": 0.5781, "mean_token_accuracy": 0.8165990591049195, "num_tokens": 405752949.0, "step": 12110 }, { "epoch": 0.7231972301814709, "grad_norm": 0.516926646232605, "learning_rate": 3.9077906025005304e-05, "loss": 0.5494, "mean_token_accuracy": 0.8263688325881958, "num_tokens": 405920629.0, "step": 12115 }, { "epoch": 0.7234957020057307, "grad_norm": 0.46530216932296753, "learning_rate": 3.9068383234633256e-05, "loss": 0.5173, "mean_token_accuracy": 0.8344626069068909, "num_tokens": 406088309.0, "step": 12120 }, { "epoch": 0.7237941738299905, "grad_norm": 0.5585097074508667, "learning_rate": 3.905885762656793e-05, "loss": 0.5671, "mean_token_accuracy": 0.821340811252594, "num_tokens": 406255989.0, "step": 12125 }, { "epoch": 0.7240926456542502, "grad_norm": 0.534710168838501, "learning_rate": 3.904932920312948e-05, "loss": 0.5233, "mean_token_accuracy": 0.8315579175949097, "num_tokens": 406423669.0, "step": 12130 }, { "epoch": 0.72439111747851, "grad_norm": 0.4949762225151062, "learning_rate": 3.903979796663874e-05, "loss": 0.5604, "mean_token_accuracy": 0.822229516506195, "num_tokens": 406591349.0, "step": 12135 }, { "epoch": 0.7246895893027698, "grad_norm": 0.47784188389778137, "learning_rate": 3.9030263919417195e-05, "loss": 0.5005, "mean_token_accuracy": 0.8394965887069702, "num_tokens": 406759029.0, "step": 12140 }, { "epoch": 0.7249880611270296, "grad_norm": 0.5196817517280579, "learning_rate": 3.902072706378705e-05, "loss": 0.5057, "mean_token_accuracy": 0.8381546020507813, "num_tokens": 406926709.0, "step": 12145 }, { "epoch": 0.7252865329512894, "grad_norm": 0.5363680124282837, "learning_rate": 3.9011187402071186e-05, "loss": 0.5608, "mean_token_accuracy": 0.8227842092514038, "num_tokens": 407094389.0, "step": 12150 }, { "epoch": 0.7255850047755492, "grad_norm": 0.5104947686195374, "learning_rate": 3.900164493659314e-05, "loss": 0.5229, "mean_token_accuracy": 0.8333233952522278, "num_tokens": 407262069.0, "step": 12155 }, { "epoch": 0.725883476599809, "grad_norm": 0.6038317084312439, "learning_rate": 3.899209966967719e-05, "loss": 0.5708, "mean_token_accuracy": 0.8190385341644287, "num_tokens": 407429749.0, "step": 12160 }, { "epoch": 0.7261819484240688, "grad_norm": 0.5249974131584167, "learning_rate": 3.898255160364824e-05, "loss": 0.5286, "mean_token_accuracy": 0.8315817713737488, "num_tokens": 407597429.0, "step": 12165 }, { "epoch": 0.7264804202483286, "grad_norm": 0.5322732329368591, "learning_rate": 3.89730007408319e-05, "loss": 0.5373, "mean_token_accuracy": 0.828307282924652, "num_tokens": 407765109.0, "step": 12170 }, { "epoch": 0.7267788920725884, "grad_norm": 0.6955226063728333, "learning_rate": 3.896344708355447e-05, "loss": 0.5662, "mean_token_accuracy": 0.8209650397300721, "num_tokens": 407932789.0, "step": 12175 }, { "epoch": 0.7270773638968482, "grad_norm": 0.5516892671585083, "learning_rate": 3.8953890634142904e-05, "loss": 0.5452, "mean_token_accuracy": 0.8284146428108216, "num_tokens": 408100469.0, "step": 12180 }, { "epoch": 0.727375835721108, "grad_norm": 0.5327548980712891, "learning_rate": 3.894433139492487e-05, "loss": 0.5303, "mean_token_accuracy": 0.8306632518768311, "num_tokens": 408268149.0, "step": 12185 }, { "epoch": 0.7276743075453678, "grad_norm": 0.5411967039108276, "learning_rate": 3.8934769368228675e-05, "loss": 0.567, "mean_token_accuracy": 0.8178456425666809, "num_tokens": 408435829.0, "step": 12190 }, { "epoch": 0.7279727793696275, "grad_norm": 0.5814877152442932, "learning_rate": 3.8925204556383345e-05, "loss": 0.5931, "mean_token_accuracy": 0.81239413022995, "num_tokens": 408603509.0, "step": 12195 }, { "epoch": 0.7282712511938872, "grad_norm": 0.45790383219718933, "learning_rate": 3.891563696171856e-05, "loss": 0.4856, "mean_token_accuracy": 0.8433973550796509, "num_tokens": 408771189.0, "step": 12200 }, { "epoch": 0.728569723018147, "grad_norm": 0.495868057012558, "learning_rate": 3.8906066586564675e-05, "loss": 0.5405, "mean_token_accuracy": 0.8264404177665711, "num_tokens": 408938869.0, "step": 12205 }, { "epoch": 0.7288681948424068, "grad_norm": 0.6081781387329102, "learning_rate": 3.8896493433252755e-05, "loss": 0.617, "mean_token_accuracy": 0.806328272819519, "num_tokens": 409106549.0, "step": 12210 }, { "epoch": 0.7291666666666666, "grad_norm": 0.580693244934082, "learning_rate": 3.888691750411449e-05, "loss": 0.6039, "mean_token_accuracy": 0.8085828423500061, "num_tokens": 409274229.0, "step": 12215 }, { "epoch": 0.7294651384909264, "grad_norm": 0.5905386805534363, "learning_rate": 3.8877338801482306e-05, "loss": 0.5733, "mean_token_accuracy": 0.8189073085784913, "num_tokens": 409441909.0, "step": 12220 }, { "epoch": 0.7297636103151862, "grad_norm": 0.5744999647140503, "learning_rate": 3.886775732768924e-05, "loss": 0.5723, "mean_token_accuracy": 0.8189311742782592, "num_tokens": 409609589.0, "step": 12225 }, { "epoch": 0.730062082139446, "grad_norm": 0.5736605525016785, "learning_rate": 3.8858173085069063e-05, "loss": 0.5728, "mean_token_accuracy": 0.8173565626144409, "num_tokens": 409777269.0, "step": 12230 }, { "epoch": 0.7303605539637058, "grad_norm": 0.5335548520088196, "learning_rate": 3.884858607595617e-05, "loss": 0.5707, "mean_token_accuracy": 0.8188895583152771, "num_tokens": 409937824.0, "step": 12235 }, { "epoch": 0.7306590257879656, "grad_norm": 0.4800441265106201, "learning_rate": 3.883899630268568e-05, "loss": 0.56, "mean_token_accuracy": 0.8227305293083191, "num_tokens": 410105504.0, "step": 12240 }, { "epoch": 0.7309574976122254, "grad_norm": 0.5362090468406677, "learning_rate": 3.8829403767593345e-05, "loss": 0.5129, "mean_token_accuracy": 0.8355302333831787, "num_tokens": 410273184.0, "step": 12245 }, { "epoch": 0.7312559694364852, "grad_norm": 0.4971427321434021, "learning_rate": 3.88198084730156e-05, "loss": 0.5434, "mean_token_accuracy": 0.8270487904548645, "num_tokens": 410440864.0, "step": 12250 }, { "epoch": 0.731554441260745, "grad_norm": 0.5260369181632996, "learning_rate": 3.881021042128957e-05, "loss": 0.5621, "mean_token_accuracy": 0.8213348388671875, "num_tokens": 410608544.0, "step": 12255 }, { "epoch": 0.7318529130850048, "grad_norm": 0.5524857044219971, "learning_rate": 3.880060961475302e-05, "loss": 0.5543, "mean_token_accuracy": 0.824824047088623, "num_tokens": 410776224.0, "step": 12260 }, { "epoch": 0.7321513849092646, "grad_norm": 0.5446348786354065, "learning_rate": 3.8791006055744415e-05, "loss": 0.5694, "mean_token_accuracy": 0.8190802812576294, "num_tokens": 410943904.0, "step": 12265 }, { "epoch": 0.7324498567335244, "grad_norm": 0.5169851779937744, "learning_rate": 3.878139974660288e-05, "loss": 0.5295, "mean_token_accuracy": 0.831516170501709, "num_tokens": 411111584.0, "step": 12270 }, { "epoch": 0.7327483285577842, "grad_norm": 0.49699828028678894, "learning_rate": 3.8771790689668205e-05, "loss": 0.5374, "mean_token_accuracy": 0.8284862160682678, "num_tokens": 411279264.0, "step": 12275 }, { "epoch": 0.733046800382044, "grad_norm": 0.781423032283783, "learning_rate": 3.8762178887280856e-05, "loss": 0.5362, "mean_token_accuracy": 0.8279136300086976, "num_tokens": 411446944.0, "step": 12280 }, { "epoch": 0.7333452722063037, "grad_norm": 0.6001463532447815, "learning_rate": 3.875256434178196e-05, "loss": 0.5704, "mean_token_accuracy": 0.8198735594749451, "num_tokens": 411614624.0, "step": 12285 }, { "epoch": 0.7336437440305635, "grad_norm": 0.5646892786026001, "learning_rate": 3.874294705551332e-05, "loss": 0.5564, "mean_token_accuracy": 0.8235774874687195, "num_tokens": 411782304.0, "step": 12290 }, { "epoch": 0.7339422158548233, "grad_norm": 0.5249640345573425, "learning_rate": 3.873332703081741e-05, "loss": 0.5287, "mean_token_accuracy": 0.830901837348938, "num_tokens": 411949984.0, "step": 12295 }, { "epoch": 0.7342406876790831, "grad_norm": 0.49104538559913635, "learning_rate": 3.872370427003735e-05, "loss": 0.5079, "mean_token_accuracy": 0.8361565113067627, "num_tokens": 412117664.0, "step": 12300 }, { "epoch": 0.7345391595033429, "grad_norm": 0.583514392375946, "learning_rate": 3.871407877551695e-05, "loss": 0.5637, "mean_token_accuracy": 0.8202493190765381, "num_tokens": 412285344.0, "step": 12305 }, { "epoch": 0.7348376313276027, "grad_norm": 0.4964461922645569, "learning_rate": 3.870445054960068e-05, "loss": 0.5238, "mean_token_accuracy": 0.8318438649177551, "num_tokens": 412451956.0, "step": 12310 }, { "epoch": 0.7351361031518625, "grad_norm": 0.5448853373527527, "learning_rate": 3.869481959463366e-05, "loss": 0.5749, "mean_token_accuracy": 0.8187760949134827, "num_tokens": 412619636.0, "step": 12315 }, { "epoch": 0.7354345749761223, "grad_norm": 0.5224013924598694, "learning_rate": 3.86851859129617e-05, "loss": 0.5762, "mean_token_accuracy": 0.8196290135383606, "num_tokens": 412787316.0, "step": 12320 }, { "epoch": 0.7357330468003821, "grad_norm": 0.5130847692489624, "learning_rate": 3.867554950693126e-05, "loss": 0.5349, "mean_token_accuracy": 0.8297805190086365, "num_tokens": 412954996.0, "step": 12325 }, { "epoch": 0.7360315186246418, "grad_norm": 0.5278207063674927, "learning_rate": 3.866591037888946e-05, "loss": 0.5714, "mean_token_accuracy": 0.8177084445953369, "num_tokens": 413122676.0, "step": 12330 }, { "epoch": 0.7363299904489016, "grad_norm": 0.4937397241592407, "learning_rate": 3.865626853118409e-05, "loss": 0.5633, "mean_token_accuracy": 0.8221996903419495, "num_tokens": 413290356.0, "step": 12335 }, { "epoch": 0.7366284622731614, "grad_norm": 0.46921923756599426, "learning_rate": 3.8646623966163584e-05, "loss": 0.559, "mean_token_accuracy": 0.8241679668426514, "num_tokens": 413458036.0, "step": 12340 }, { "epoch": 0.7369269340974212, "grad_norm": 0.47525590658187866, "learning_rate": 3.863697668617708e-05, "loss": 0.5031, "mean_token_accuracy": 0.8403316259384155, "num_tokens": 413625716.0, "step": 12345 }, { "epoch": 0.737225405921681, "grad_norm": 0.4790109395980835, "learning_rate": 3.862732669357434e-05, "loss": 0.5394, "mean_token_accuracy": 0.8282595753669739, "num_tokens": 413793396.0, "step": 12350 }, { "epoch": 0.7375238777459407, "grad_norm": 0.563318133354187, "learning_rate": 3.8617673990705796e-05, "loss": 0.5453, "mean_token_accuracy": 0.82631516456604, "num_tokens": 413961076.0, "step": 12355 }, { "epoch": 0.7378223495702005, "grad_norm": 0.5196044445037842, "learning_rate": 3.860801857992253e-05, "loss": 0.5297, "mean_token_accuracy": 0.8309077858924866, "num_tokens": 414128756.0, "step": 12360 }, { "epoch": 0.7381208213944603, "grad_norm": 0.5479102730751038, "learning_rate": 3.8598360463576324e-05, "loss": 0.5908, "mean_token_accuracy": 0.8142967939376831, "num_tokens": 414296436.0, "step": 12365 }, { "epoch": 0.7384192932187201, "grad_norm": 0.5005536675453186, "learning_rate": 3.858869964401957e-05, "loss": 0.5202, "mean_token_accuracy": 0.8349457263946534, "num_tokens": 414464116.0, "step": 12370 }, { "epoch": 0.7387177650429799, "grad_norm": 0.5197511315345764, "learning_rate": 3.857903612360535e-05, "loss": 0.5323, "mean_token_accuracy": 0.8296910524368286, "num_tokens": 414631796.0, "step": 12375 }, { "epoch": 0.7390162368672397, "grad_norm": 0.5165748596191406, "learning_rate": 3.8569369904687375e-05, "loss": 0.4909, "mean_token_accuracy": 0.8423535704612732, "num_tokens": 414799476.0, "step": 12380 }, { "epoch": 0.7393147086914995, "grad_norm": 0.5564424395561218, "learning_rate": 3.855970098962005e-05, "loss": 0.5511, "mean_token_accuracy": 0.8253668069839477, "num_tokens": 414967156.0, "step": 12385 }, { "epoch": 0.7396131805157593, "grad_norm": 0.5473902225494385, "learning_rate": 3.855002938075841e-05, "loss": 0.5285, "mean_token_accuracy": 0.8327090382575989, "num_tokens": 415134836.0, "step": 12390 }, { "epoch": 0.7399116523400191, "grad_norm": 0.5365471839904785, "learning_rate": 3.854035508045815e-05, "loss": 0.532, "mean_token_accuracy": 0.8312775731086731, "num_tokens": 415302516.0, "step": 12395 }, { "epoch": 0.7402101241642789, "grad_norm": 0.572266697883606, "learning_rate": 3.8530678091075625e-05, "loss": 0.5337, "mean_token_accuracy": 0.8299952268600463, "num_tokens": 415470196.0, "step": 12400 }, { "epoch": 0.7405085959885387, "grad_norm": 0.4953557252883911, "learning_rate": 3.852099841496786e-05, "loss": 0.5282, "mean_token_accuracy": 0.8318978905677795, "num_tokens": 415637876.0, "step": 12405 }, { "epoch": 0.7408070678127985, "grad_norm": 0.5083139538764954, "learning_rate": 3.85113160544925e-05, "loss": 0.502, "mean_token_accuracy": 0.8381367087364197, "num_tokens": 415805556.0, "step": 12410 }, { "epoch": 0.7411055396370583, "grad_norm": 0.5080996155738831, "learning_rate": 3.8501631012007873e-05, "loss": 0.518, "mean_token_accuracy": 0.8339257955551147, "num_tokens": 415973236.0, "step": 12415 }, { "epoch": 0.7414040114613181, "grad_norm": 0.5270262360572815, "learning_rate": 3.849194328987293e-05, "loss": 0.563, "mean_token_accuracy": 0.8203029990196228, "num_tokens": 416140916.0, "step": 12420 }, { "epoch": 0.7417024832855779, "grad_norm": 0.4829140305519104, "learning_rate": 3.8482252890447334e-05, "loss": 0.5073, "mean_token_accuracy": 0.8386042952537537, "num_tokens": 416306881.0, "step": 12425 }, { "epoch": 0.7420009551098377, "grad_norm": 0.5917607545852661, "learning_rate": 3.8472559816091324e-05, "loss": 0.6197, "mean_token_accuracy": 0.8060181260108947, "num_tokens": 416474561.0, "step": 12430 }, { "epoch": 0.7422994269340975, "grad_norm": 0.5040755271911621, "learning_rate": 3.8462864069165846e-05, "loss": 0.5291, "mean_token_accuracy": 0.8313909053802491, "num_tokens": 416642241.0, "step": 12435 }, { "epoch": 0.7425978987583572, "grad_norm": 0.4962681233882904, "learning_rate": 3.845316565203247e-05, "loss": 0.5377, "mean_token_accuracy": 0.8275140047073364, "num_tokens": 416809921.0, "step": 12440 }, { "epoch": 0.742896370582617, "grad_norm": 0.4777105152606964, "learning_rate": 3.844346456705343e-05, "loss": 0.5151, "mean_token_accuracy": 0.8348204612731933, "num_tokens": 416977601.0, "step": 12445 }, { "epoch": 0.7431948424068768, "grad_norm": 0.5366834998130798, "learning_rate": 3.843376081659159e-05, "loss": 0.5567, "mean_token_accuracy": 0.8225754499435425, "num_tokens": 417145281.0, "step": 12450 }, { "epoch": 0.7434933142311366, "grad_norm": 0.4639711380004883, "learning_rate": 3.84240544030105e-05, "loss": 0.5169, "mean_token_accuracy": 0.8342299818992615, "num_tokens": 417312961.0, "step": 12455 }, { "epoch": 0.7437917860553964, "grad_norm": 0.4997599720954895, "learning_rate": 3.841434532867432e-05, "loss": 0.5319, "mean_token_accuracy": 0.8300071477890014, "num_tokens": 417480641.0, "step": 12460 }, { "epoch": 0.7440902578796562, "grad_norm": 0.5132542848587036, "learning_rate": 3.8404633595947884e-05, "loss": 0.5432, "mean_token_accuracy": 0.8284742951393127, "num_tokens": 417648321.0, "step": 12465 }, { "epoch": 0.7443887297039159, "grad_norm": 0.558612048625946, "learning_rate": 3.839491920719666e-05, "loss": 0.5224, "mean_token_accuracy": 0.8330669164657593, "num_tokens": 417816001.0, "step": 12470 }, { "epoch": 0.7446872015281757, "grad_norm": 0.5155857801437378, "learning_rate": 3.838520216478677e-05, "loss": 0.5297, "mean_token_accuracy": 0.8296671867370605, "num_tokens": 417983681.0, "step": 12475 }, { "epoch": 0.7449856733524355, "grad_norm": 0.591007649898529, "learning_rate": 3.8375482471084966e-05, "loss": 0.5188, "mean_token_accuracy": 0.8344447016716003, "num_tokens": 418151361.0, "step": 12480 }, { "epoch": 0.7452841451766953, "grad_norm": 0.5150274634361267, "learning_rate": 3.836576012845867e-05, "loss": 0.5408, "mean_token_accuracy": 0.828146243095398, "num_tokens": 418319041.0, "step": 12485 }, { "epoch": 0.7455826170009551, "grad_norm": 0.4874996542930603, "learning_rate": 3.835603513927593e-05, "loss": 0.5344, "mean_token_accuracy": 0.8289812803268433, "num_tokens": 418486721.0, "step": 12490 }, { "epoch": 0.7458810888252149, "grad_norm": 0.5001745820045471, "learning_rate": 3.834630750590546e-05, "loss": 0.5695, "mean_token_accuracy": 0.8206668257713318, "num_tokens": 418654401.0, "step": 12495 }, { "epoch": 0.7461795606494747, "grad_norm": 0.5270755887031555, "learning_rate": 3.8336577230716595e-05, "loss": 0.5714, "mean_token_accuracy": 0.818108069896698, "num_tokens": 418822081.0, "step": 12500 }, { "epoch": 0.7464780324737345, "grad_norm": 0.49236175417900085, "learning_rate": 3.832684431607933e-05, "loss": 0.5372, "mean_token_accuracy": 0.8276273250579834, "num_tokens": 418989761.0, "step": 12505 }, { "epoch": 0.7467765042979942, "grad_norm": 0.525929868221283, "learning_rate": 3.831710876436428e-05, "loss": 0.5391, "mean_token_accuracy": 0.827168083190918, "num_tokens": 419157441.0, "step": 12510 }, { "epoch": 0.747074976122254, "grad_norm": 0.5231567621231079, "learning_rate": 3.830737057794274e-05, "loss": 0.5477, "mean_token_accuracy": 0.82493736743927, "num_tokens": 419325121.0, "step": 12515 }, { "epoch": 0.7473734479465138, "grad_norm": 0.49298495054244995, "learning_rate": 3.829762975918661e-05, "loss": 0.565, "mean_token_accuracy": 0.8215435981750489, "num_tokens": 419492801.0, "step": 12520 }, { "epoch": 0.7476719197707736, "grad_norm": 0.46484676003456116, "learning_rate": 3.828788631046845e-05, "loss": 0.5007, "mean_token_accuracy": 0.837743055820465, "num_tokens": 419660481.0, "step": 12525 }, { "epoch": 0.7479703915950334, "grad_norm": 0.47831910848617554, "learning_rate": 3.8278140234161463e-05, "loss": 0.5771, "mean_token_accuracy": 0.8174460291862488, "num_tokens": 419828161.0, "step": 12530 }, { "epoch": 0.7482688634192932, "grad_norm": 0.5279443264007568, "learning_rate": 3.8268391532639474e-05, "loss": 0.4913, "mean_token_accuracy": 0.8427233695983887, "num_tokens": 419995841.0, "step": 12535 }, { "epoch": 0.748567335243553, "grad_norm": 0.4774021804332733, "learning_rate": 3.825864020827697e-05, "loss": 0.521, "mean_token_accuracy": 0.8333472490310669, "num_tokens": 420163521.0, "step": 12540 }, { "epoch": 0.7488658070678128, "grad_norm": 0.513725757598877, "learning_rate": 3.824888626344905e-05, "loss": 0.5341, "mean_token_accuracy": 0.8304843187332154, "num_tokens": 420331201.0, "step": 12545 }, { "epoch": 0.7491642788920726, "grad_norm": 0.4979497790336609, "learning_rate": 3.8239129700531495e-05, "loss": 0.5466, "mean_token_accuracy": 0.825038754940033, "num_tokens": 420498881.0, "step": 12550 }, { "epoch": 0.7494627507163324, "grad_norm": 0.4993870258331299, "learning_rate": 3.822937052190067e-05, "loss": 0.535, "mean_token_accuracy": 0.827645230293274, "num_tokens": 420666561.0, "step": 12555 }, { "epoch": 0.7497612225405922, "grad_norm": 0.4990299344062805, "learning_rate": 3.821960872993362e-05, "loss": 0.5079, "mean_token_accuracy": 0.8360014438629151, "num_tokens": 420834241.0, "step": 12560 }, { "epoch": 0.750059694364852, "grad_norm": 0.5518876314163208, "learning_rate": 3.8209844327008e-05, "loss": 0.4977, "mean_token_accuracy": 0.8402361989021301, "num_tokens": 421001921.0, "step": 12565 }, { "epoch": 0.7503581661891118, "grad_norm": 0.5664923191070557, "learning_rate": 3.820007731550211e-05, "loss": 0.5324, "mean_token_accuracy": 0.8299952387809754, "num_tokens": 421169601.0, "step": 12570 }, { "epoch": 0.7506566380133716, "grad_norm": 0.5897797346115112, "learning_rate": 3.819030769779489e-05, "loss": 0.5176, "mean_token_accuracy": 0.834342873096466, "num_tokens": 421331895.0, "step": 12575 }, { "epoch": 0.7509551098376314, "grad_norm": 0.529281497001648, "learning_rate": 3.818053547626591e-05, "loss": 0.5762, "mean_token_accuracy": 0.8180126428604126, "num_tokens": 421499575.0, "step": 12580 }, { "epoch": 0.7512535816618912, "grad_norm": 0.47769302129745483, "learning_rate": 3.817076065329537e-05, "loss": 0.5774, "mean_token_accuracy": 0.8178396821022034, "num_tokens": 421667255.0, "step": 12585 }, { "epoch": 0.751552053486151, "grad_norm": 0.48184019327163696, "learning_rate": 3.816098323126413e-05, "loss": 0.4995, "mean_token_accuracy": 0.8382977485656739, "num_tokens": 421834935.0, "step": 12590 }, { "epoch": 0.7518505253104107, "grad_norm": 0.49549925327301025, "learning_rate": 3.8151203212553624e-05, "loss": 0.5548, "mean_token_accuracy": 0.8245258212089539, "num_tokens": 422002615.0, "step": 12595 }, { "epoch": 0.7521489971346705, "grad_norm": 0.4643173813819885, "learning_rate": 3.814142059954599e-05, "loss": 0.5512, "mean_token_accuracy": 0.8241739392280578, "num_tokens": 422170295.0, "step": 12600 }, { "epoch": 0.7524474689589303, "grad_norm": 0.536861777305603, "learning_rate": 3.813163539462395e-05, "loss": 0.5535, "mean_token_accuracy": 0.8233031153678894, "num_tokens": 422337975.0, "step": 12605 }, { "epoch": 0.75274594078319, "grad_norm": 0.6138288974761963, "learning_rate": 3.8121847600170867e-05, "loss": 0.532, "mean_token_accuracy": 0.830526065826416, "num_tokens": 422505655.0, "step": 12610 }, { "epoch": 0.7530444126074498, "grad_norm": 0.48329293727874756, "learning_rate": 3.811205721857075e-05, "loss": 0.5018, "mean_token_accuracy": 0.8385071277618408, "num_tokens": 422673218.0, "step": 12615 }, { "epoch": 0.7533428844317096, "grad_norm": 0.48226669430732727, "learning_rate": 3.810226425220821e-05, "loss": 0.5319, "mean_token_accuracy": 0.8300966143608093, "num_tokens": 422840898.0, "step": 12620 }, { "epoch": 0.7536413562559694, "grad_norm": 0.5072008371353149, "learning_rate": 3.809246870346853e-05, "loss": 0.5599, "mean_token_accuracy": 0.8212334632873535, "num_tokens": 423008578.0, "step": 12625 }, { "epoch": 0.7539398280802292, "grad_norm": 0.6549006700515747, "learning_rate": 3.808267057473757e-05, "loss": 0.587, "mean_token_accuracy": 0.8156208992004395, "num_tokens": 423176258.0, "step": 12630 }, { "epoch": 0.754238299904489, "grad_norm": 0.5449286103248596, "learning_rate": 3.807286986840187e-05, "loss": 0.5121, "mean_token_accuracy": 0.8350440382957458, "num_tokens": 423338896.0, "step": 12635 }, { "epoch": 0.7545367717287488, "grad_norm": 0.4926968514919281, "learning_rate": 3.806306658684855e-05, "loss": 0.5247, "mean_token_accuracy": 0.8317487716674805, "num_tokens": 423506576.0, "step": 12640 }, { "epoch": 0.7548352435530086, "grad_norm": 0.5020460486412048, "learning_rate": 3.80532607324654e-05, "loss": 0.5433, "mean_token_accuracy": 0.8282893776893616, "num_tokens": 423674256.0, "step": 12645 }, { "epoch": 0.7551337153772684, "grad_norm": 0.5663554668426514, "learning_rate": 3.80434523076408e-05, "loss": 0.5243, "mean_token_accuracy": 0.8324048757553101, "num_tokens": 423841936.0, "step": 12650 }, { "epoch": 0.7554321872015282, "grad_norm": 0.4894520938396454, "learning_rate": 3.8033641314763786e-05, "loss": 0.5299, "mean_token_accuracy": 0.8319277167320251, "num_tokens": 424009616.0, "step": 12655 }, { "epoch": 0.755730659025788, "grad_norm": 0.5790097713470459, "learning_rate": 3.8023827756223996e-05, "loss": 0.61, "mean_token_accuracy": 0.8063879370689392, "num_tokens": 424177296.0, "step": 12660 }, { "epoch": 0.7560291308500477, "grad_norm": 0.4726533591747284, "learning_rate": 3.801401163441171e-05, "loss": 0.5335, "mean_token_accuracy": 0.8300608396530151, "num_tokens": 424344976.0, "step": 12665 }, { "epoch": 0.7563276026743075, "grad_norm": 0.48812583088874817, "learning_rate": 3.8004192951717826e-05, "loss": 0.4988, "mean_token_accuracy": 0.8403316140174866, "num_tokens": 424512656.0, "step": 12670 }, { "epoch": 0.7566260744985673, "grad_norm": 0.5608876943588257, "learning_rate": 3.799437171053386e-05, "loss": 0.5608, "mean_token_accuracy": 0.8230108499526978, "num_tokens": 424680336.0, "step": 12675 }, { "epoch": 0.7569245463228271, "grad_norm": 0.4958289563655853, "learning_rate": 3.798454791325196e-05, "loss": 0.5745, "mean_token_accuracy": 0.8201896667480468, "num_tokens": 424848016.0, "step": 12680 }, { "epoch": 0.7572230181470869, "grad_norm": 0.5346928834915161, "learning_rate": 3.79747215622649e-05, "loss": 0.4835, "mean_token_accuracy": 0.8438208341598511, "num_tokens": 425015696.0, "step": 12685 }, { "epoch": 0.7575214899713467, "grad_norm": 0.4995134770870209, "learning_rate": 3.796489265996606e-05, "loss": 0.4984, "mean_token_accuracy": 0.8379756689071656, "num_tokens": 425183376.0, "step": 12690 }, { "epoch": 0.7578199617956065, "grad_norm": 0.515295147895813, "learning_rate": 3.795506120874945e-05, "loss": 0.525, "mean_token_accuracy": 0.8330490231513977, "num_tokens": 425351056.0, "step": 12695 }, { "epoch": 0.7581184336198663, "grad_norm": 0.5539008378982544, "learning_rate": 3.79452272110097e-05, "loss": 0.5785, "mean_token_accuracy": 0.8171060442924499, "num_tokens": 425518736.0, "step": 12700 }, { "epoch": 0.7584169054441261, "grad_norm": 0.683463990688324, "learning_rate": 3.793539066914207e-05, "loss": 0.5559, "mean_token_accuracy": 0.8225396633148193, "num_tokens": 425686416.0, "step": 12705 }, { "epoch": 0.7587153772683859, "grad_norm": 0.5496516227722168, "learning_rate": 3.792555158554243e-05, "loss": 0.5912, "mean_token_accuracy": 0.8126148104667663, "num_tokens": 425854096.0, "step": 12710 }, { "epoch": 0.7590138490926457, "grad_norm": 0.6611767411231995, "learning_rate": 3.791570996260727e-05, "loss": 0.576, "mean_token_accuracy": 0.8191399216651917, "num_tokens": 426021776.0, "step": 12715 }, { "epoch": 0.7593123209169055, "grad_norm": 0.524413526058197, "learning_rate": 3.790586580273369e-05, "loss": 0.5289, "mean_token_accuracy": 0.8310390114784241, "num_tokens": 426189456.0, "step": 12720 }, { "epoch": 0.7596107927411653, "grad_norm": 0.5060654282569885, "learning_rate": 3.789601910831944e-05, "loss": 0.5265, "mean_token_accuracy": 0.8316712379455566, "num_tokens": 426357136.0, "step": 12725 }, { "epoch": 0.7599092645654251, "grad_norm": 0.5930113792419434, "learning_rate": 3.788616988176282e-05, "loss": 0.5499, "mean_token_accuracy": 0.824209702014923, "num_tokens": 426524816.0, "step": 12730 }, { "epoch": 0.7602077363896849, "grad_norm": 0.508378803730011, "learning_rate": 3.7876318125462836e-05, "loss": 0.5099, "mean_token_accuracy": 0.8372360706329346, "num_tokens": 426692496.0, "step": 12735 }, { "epoch": 0.7605062082139447, "grad_norm": 0.5116025805473328, "learning_rate": 3.786646384181905e-05, "loss": 0.5252, "mean_token_accuracy": 0.831641411781311, "num_tokens": 426860176.0, "step": 12740 }, { "epoch": 0.7608046800382043, "grad_norm": 0.5730810761451721, "learning_rate": 3.785660703323164e-05, "loss": 0.5468, "mean_token_accuracy": 0.8255099654197693, "num_tokens": 427027856.0, "step": 12745 }, { "epoch": 0.7611031518624641, "grad_norm": 0.49122607707977295, "learning_rate": 3.7846747702101436e-05, "loss": 0.5295, "mean_token_accuracy": 0.8305678367614746, "num_tokens": 427195536.0, "step": 12750 }, { "epoch": 0.7614016236867239, "grad_norm": 0.5074653029441833, "learning_rate": 3.783688585082985e-05, "loss": 0.49, "mean_token_accuracy": 0.8436657547950744, "num_tokens": 427363216.0, "step": 12755 }, { "epoch": 0.7617000955109837, "grad_norm": 0.5781688690185547, "learning_rate": 3.7827021481818917e-05, "loss": 0.5834, "mean_token_accuracy": 0.8172313094139099, "num_tokens": 427530896.0, "step": 12760 }, { "epoch": 0.7619985673352435, "grad_norm": 0.579497754573822, "learning_rate": 3.781715459747128e-05, "loss": 0.5369, "mean_token_accuracy": 0.8302270293235778, "num_tokens": 427693025.0, "step": 12765 }, { "epoch": 0.7622970391595033, "grad_norm": 0.5157675743103027, "learning_rate": 3.780728520019021e-05, "loss": 0.5256, "mean_token_accuracy": 0.8338005661964416, "num_tokens": 427860705.0, "step": 12770 }, { "epoch": 0.7625955109837631, "grad_norm": 0.5437571406364441, "learning_rate": 3.7797413292379585e-05, "loss": 0.571, "mean_token_accuracy": 0.8192413210868835, "num_tokens": 428028385.0, "step": 12775 }, { "epoch": 0.7628939828080229, "grad_norm": 0.5211789608001709, "learning_rate": 3.7787538876443884e-05, "loss": 0.5405, "mean_token_accuracy": 0.8283072948455811, "num_tokens": 428196065.0, "step": 12780 }, { "epoch": 0.7631924546322827, "grad_norm": 0.49580714106559753, "learning_rate": 3.77776619547882e-05, "loss": 0.4809, "mean_token_accuracy": 0.8436836481094361, "num_tokens": 428363745.0, "step": 12785 }, { "epoch": 0.7634909264565425, "grad_norm": 0.5346022844314575, "learning_rate": 3.776778252981824e-05, "loss": 0.5648, "mean_token_accuracy": 0.8198318123817444, "num_tokens": 428531425.0, "step": 12790 }, { "epoch": 0.7637893982808023, "grad_norm": 0.5726235508918762, "learning_rate": 3.7757900603940326e-05, "loss": 0.5419, "mean_token_accuracy": 0.8279017090797425, "num_tokens": 428699105.0, "step": 12795 }, { "epoch": 0.7640878701050621, "grad_norm": 0.520103394985199, "learning_rate": 3.7748016179561385e-05, "loss": 0.543, "mean_token_accuracy": 0.8271382570266723, "num_tokens": 428866785.0, "step": 12800 }, { "epoch": 0.7643863419293219, "grad_norm": 0.49786993861198425, "learning_rate": 3.773812925908894e-05, "loss": 0.5305, "mean_token_accuracy": 0.8287546277046204, "num_tokens": 429034465.0, "step": 12805 }, { "epoch": 0.7646848137535817, "grad_norm": 0.5412278771400452, "learning_rate": 3.772823984493114e-05, "loss": 0.5723, "mean_token_accuracy": 0.817732310295105, "num_tokens": 429202145.0, "step": 12810 }, { "epoch": 0.7649832855778415, "grad_norm": 0.546747088432312, "learning_rate": 3.7718347939496734e-05, "loss": 0.5753, "mean_token_accuracy": 0.8174162030220031, "num_tokens": 429369825.0, "step": 12815 }, { "epoch": 0.7652817574021012, "grad_norm": 0.5095201134681702, "learning_rate": 3.7708453545195085e-05, "loss": 0.5426, "mean_token_accuracy": 0.8274484157562256, "num_tokens": 429537505.0, "step": 12820 }, { "epoch": 0.765580229226361, "grad_norm": 0.5252211093902588, "learning_rate": 3.769855666443614e-05, "loss": 0.5833, "mean_token_accuracy": 0.815638792514801, "num_tokens": 429705185.0, "step": 12825 }, { "epoch": 0.7658787010506208, "grad_norm": 0.4762919843196869, "learning_rate": 3.7688657299630485e-05, "loss": 0.5477, "mean_token_accuracy": 0.8255576729774475, "num_tokens": 429872865.0, "step": 12830 }, { "epoch": 0.7661771728748806, "grad_norm": 0.49446505308151245, "learning_rate": 3.767875545318928e-05, "loss": 0.5466, "mean_token_accuracy": 0.8253489255905151, "num_tokens": 430040545.0, "step": 12835 }, { "epoch": 0.7664756446991404, "grad_norm": 0.5296342372894287, "learning_rate": 3.766885112752431e-05, "loss": 0.5359, "mean_token_accuracy": 0.8284623742103576, "num_tokens": 430208225.0, "step": 12840 }, { "epoch": 0.7667741165234002, "grad_norm": 0.47552910447120667, "learning_rate": 3.765894432504794e-05, "loss": 0.53, "mean_token_accuracy": 0.8313968777656555, "num_tokens": 430375905.0, "step": 12845 }, { "epoch": 0.76707258834766, "grad_norm": 0.5044168829917908, "learning_rate": 3.764903504817318e-05, "loss": 0.5439, "mean_token_accuracy": 0.8256053924560547, "num_tokens": 430543585.0, "step": 12850 }, { "epoch": 0.7673710601719198, "grad_norm": 0.5925907492637634, "learning_rate": 3.7639123299313586e-05, "loss": 0.5393, "mean_token_accuracy": 0.8274126172065734, "num_tokens": 430711265.0, "step": 12855 }, { "epoch": 0.7676695319961796, "grad_norm": 0.49887076020240784, "learning_rate": 3.7629209080883386e-05, "loss": 0.5537, "mean_token_accuracy": 0.823332941532135, "num_tokens": 430878945.0, "step": 12860 }, { "epoch": 0.7679680038204394, "grad_norm": 0.5620576739311218, "learning_rate": 3.761929239529734e-05, "loss": 0.5605, "mean_token_accuracy": 0.8211678385734558, "num_tokens": 431046625.0, "step": 12865 }, { "epoch": 0.7682664756446992, "grad_norm": 0.5315982103347778, "learning_rate": 3.760937324497086e-05, "loss": 0.5377, "mean_token_accuracy": 0.8285160303115845, "num_tokens": 431214305.0, "step": 12870 }, { "epoch": 0.768564947468959, "grad_norm": 0.5332516431808472, "learning_rate": 3.759945163231992e-05, "loss": 0.5112, "mean_token_accuracy": 0.8358039379119873, "num_tokens": 431374141.0, "step": 12875 }, { "epoch": 0.7688634192932188, "grad_norm": 0.4973939061164856, "learning_rate": 3.7589527559761124e-05, "loss": 0.5376, "mean_token_accuracy": 0.8283728957176208, "num_tokens": 431541821.0, "step": 12880 }, { "epoch": 0.7691618911174785, "grad_norm": 0.5521952509880066, "learning_rate": 3.757960102971165e-05, "loss": 0.5557, "mean_token_accuracy": 0.8238220334053039, "num_tokens": 431709501.0, "step": 12885 }, { "epoch": 0.7694603629417383, "grad_norm": 0.505549967288971, "learning_rate": 3.756967204458931e-05, "loss": 0.5522, "mean_token_accuracy": 0.8223487973213196, "num_tokens": 431877181.0, "step": 12890 }, { "epoch": 0.769758834765998, "grad_norm": 0.5413235425949097, "learning_rate": 3.7559740606812474e-05, "loss": 0.5187, "mean_token_accuracy": 0.8329118490219116, "num_tokens": 432044861.0, "step": 12895 }, { "epoch": 0.7700573065902578, "grad_norm": 0.5566238760948181, "learning_rate": 3.754980671880013e-05, "loss": 0.5461, "mean_token_accuracy": 0.8259751796722412, "num_tokens": 432212541.0, "step": 12900 }, { "epoch": 0.7703557784145176, "grad_norm": 0.5344740152359009, "learning_rate": 3.753987038297187e-05, "loss": 0.5774, "mean_token_accuracy": 0.8176667094230652, "num_tokens": 432380221.0, "step": 12905 }, { "epoch": 0.7706542502387774, "grad_norm": 0.5251920819282532, "learning_rate": 3.752993160174785e-05, "loss": 0.56, "mean_token_accuracy": 0.8227603554725647, "num_tokens": 432547901.0, "step": 12910 }, { "epoch": 0.7709527220630372, "grad_norm": 0.6028419137001038, "learning_rate": 3.751999037754886e-05, "loss": 0.5314, "mean_token_accuracy": 0.830537986755371, "num_tokens": 432715581.0, "step": 12915 }, { "epoch": 0.771251193887297, "grad_norm": 0.4760540723800659, "learning_rate": 3.751004671279627e-05, "loss": 0.4909, "mean_token_accuracy": 0.8439043283462524, "num_tokens": 432883261.0, "step": 12920 }, { "epoch": 0.7715496657115568, "grad_norm": 0.49214062094688416, "learning_rate": 3.7500100609912034e-05, "loss": 0.5249, "mean_token_accuracy": 0.8315579056739807, "num_tokens": 433050941.0, "step": 12925 }, { "epoch": 0.7718481375358166, "grad_norm": 0.5240420699119568, "learning_rate": 3.749015207131872e-05, "loss": 0.5502, "mean_token_accuracy": 0.8248479008674622, "num_tokens": 433218621.0, "step": 12930 }, { "epoch": 0.7721466093600764, "grad_norm": 0.5015308260917664, "learning_rate": 3.7480201099439476e-05, "loss": 0.5014, "mean_token_accuracy": 0.8387510418891907, "num_tokens": 433386301.0, "step": 12935 }, { "epoch": 0.7724450811843362, "grad_norm": 0.5115908980369568, "learning_rate": 3.747024769669803e-05, "loss": 0.5665, "mean_token_accuracy": 0.8219849824905395, "num_tokens": 433553981.0, "step": 12940 }, { "epoch": 0.772743553008596, "grad_norm": 0.5259997248649597, "learning_rate": 3.7460291865518736e-05, "loss": 0.5739, "mean_token_accuracy": 0.8163485646247863, "num_tokens": 433721661.0, "step": 12945 }, { "epoch": 0.7730420248328558, "grad_norm": 0.4921368658542633, "learning_rate": 3.7450333608326506e-05, "loss": 0.5285, "mean_token_accuracy": 0.8316235303878784, "num_tokens": 433889341.0, "step": 12950 }, { "epoch": 0.7733404966571156, "grad_norm": 0.649025022983551, "learning_rate": 3.744037292754687e-05, "loss": 0.5689, "mean_token_accuracy": 0.8203984379768372, "num_tokens": 434057021.0, "step": 12955 }, { "epoch": 0.7736389684813754, "grad_norm": 0.505024254322052, "learning_rate": 3.743040982560592e-05, "loss": 0.5115, "mean_token_accuracy": 0.8339735269546509, "num_tokens": 434224701.0, "step": 12960 }, { "epoch": 0.7739374403056352, "grad_norm": 0.5172065496444702, "learning_rate": 3.742044430493037e-05, "loss": 0.5764, "mean_token_accuracy": 0.8179178953170776, "num_tokens": 434386695.0, "step": 12965 }, { "epoch": 0.774235912129895, "grad_norm": 0.5499746203422546, "learning_rate": 3.741047636794749e-05, "loss": 0.5324, "mean_token_accuracy": 0.831015145778656, "num_tokens": 434554375.0, "step": 12970 }, { "epoch": 0.7745343839541547, "grad_norm": 0.5083252191543579, "learning_rate": 3.740050601708518e-05, "loss": 0.5371, "mean_token_accuracy": 0.8301860928535462, "num_tokens": 434722055.0, "step": 12975 }, { "epoch": 0.7748328557784145, "grad_norm": 0.5025472640991211, "learning_rate": 3.739053325477187e-05, "loss": 0.5733, "mean_token_accuracy": 0.8194799065589905, "num_tokens": 434889735.0, "step": 12980 }, { "epoch": 0.7751313276026743, "grad_norm": 0.5539953708648682, "learning_rate": 3.738055808343664e-05, "loss": 0.5595, "mean_token_accuracy": 0.8211618781089782, "num_tokens": 435057415.0, "step": 12985 }, { "epoch": 0.7754297994269341, "grad_norm": 0.5148086547851562, "learning_rate": 3.73705805055091e-05, "loss": 0.5403, "mean_token_accuracy": 0.8274722695350647, "num_tokens": 435225095.0, "step": 12990 }, { "epoch": 0.7757282712511939, "grad_norm": 0.45913150906562805, "learning_rate": 3.7360600523419506e-05, "loss": 0.5023, "mean_token_accuracy": 0.8391089081764221, "num_tokens": 435392775.0, "step": 12995 }, { "epoch": 0.7760267430754537, "grad_norm": 0.5459899306297302, "learning_rate": 3.735061813959864e-05, "loss": 0.5528, "mean_token_accuracy": 0.824460220336914, "num_tokens": 435560455.0, "step": 13000 }, { "epoch": 0.7763252148997135, "grad_norm": 0.4576101005077362, "learning_rate": 3.734063335647791e-05, "loss": 0.5177, "mean_token_accuracy": 0.8360193133354187, "num_tokens": 435728135.0, "step": 13005 }, { "epoch": 0.7766236867239733, "grad_norm": 0.49657967686653137, "learning_rate": 3.733064617648927e-05, "loss": 0.5507, "mean_token_accuracy": 0.825676965713501, "num_tokens": 435895815.0, "step": 13010 }, { "epoch": 0.7769221585482331, "grad_norm": 0.5331529974937439, "learning_rate": 3.732065660206532e-05, "loss": 0.5531, "mean_token_accuracy": 0.8237802624702454, "num_tokens": 436063495.0, "step": 13015 }, { "epoch": 0.7772206303724928, "grad_norm": 0.4832727313041687, "learning_rate": 3.731066463563919e-05, "loss": 0.4851, "mean_token_accuracy": 0.8433019161224365, "num_tokens": 436231175.0, "step": 13020 }, { "epoch": 0.7775191021967526, "grad_norm": 0.5183764100074768, "learning_rate": 3.730067027964459e-05, "loss": 0.5289, "mean_token_accuracy": 0.8313252925872803, "num_tokens": 436398855.0, "step": 13025 }, { "epoch": 0.7778175740210124, "grad_norm": 0.5579322576522827, "learning_rate": 3.729067353651585e-05, "loss": 0.5431, "mean_token_accuracy": 0.8266432166099549, "num_tokens": 436566535.0, "step": 13030 }, { "epoch": 0.7781160458452722, "grad_norm": 0.5010125041007996, "learning_rate": 3.7280674408687847e-05, "loss": 0.6078, "mean_token_accuracy": 0.8086782813072204, "num_tokens": 436734215.0, "step": 13035 }, { "epoch": 0.778414517669532, "grad_norm": 0.542412519454956, "learning_rate": 3.727067289859607e-05, "loss": 0.5562, "mean_token_accuracy": 0.8254025936126709, "num_tokens": 436901895.0, "step": 13040 }, { "epoch": 0.7787129894937918, "grad_norm": 0.538048505783081, "learning_rate": 3.726066900867656e-05, "loss": 0.499, "mean_token_accuracy": 0.838470721244812, "num_tokens": 437069575.0, "step": 13045 }, { "epoch": 0.7790114613180515, "grad_norm": 0.5493173599243164, "learning_rate": 3.7250662741365955e-05, "loss": 0.5265, "mean_token_accuracy": 0.8312716126441956, "num_tokens": 437237255.0, "step": 13050 }, { "epoch": 0.7793099331423113, "grad_norm": 0.4959273636341095, "learning_rate": 3.7240654099101464e-05, "loss": 0.4938, "mean_token_accuracy": 0.8409101843833924, "num_tokens": 437404935.0, "step": 13055 }, { "epoch": 0.7796084049665711, "grad_norm": 0.5687335133552551, "learning_rate": 3.723064308432086e-05, "loss": 0.5307, "mean_token_accuracy": 0.8290289878845215, "num_tokens": 437572615.0, "step": 13060 }, { "epoch": 0.7799068767908309, "grad_norm": 0.5093406438827515, "learning_rate": 3.722062969946254e-05, "loss": 0.5188, "mean_token_accuracy": 0.8340033411979675, "num_tokens": 437740295.0, "step": 13065 }, { "epoch": 0.7802053486150907, "grad_norm": 0.5641523599624634, "learning_rate": 3.7210613946965435e-05, "loss": 0.5554, "mean_token_accuracy": 0.8209879159927368, "num_tokens": 437901833.0, "step": 13070 }, { "epoch": 0.7805038204393505, "grad_norm": 0.4834235608577728, "learning_rate": 3.7200595829269064e-05, "loss": 0.5262, "mean_token_accuracy": 0.8319456100463867, "num_tokens": 438069513.0, "step": 13075 }, { "epoch": 0.7808022922636103, "grad_norm": 0.47315308451652527, "learning_rate": 3.719057534881353e-05, "loss": 0.568, "mean_token_accuracy": 0.8208159327507019, "num_tokens": 438237193.0, "step": 13080 }, { "epoch": 0.7811007640878701, "grad_norm": 0.565412163734436, "learning_rate": 3.718055250803949e-05, "loss": 0.5579, "mean_token_accuracy": 0.8213945031166077, "num_tokens": 438404873.0, "step": 13085 }, { "epoch": 0.7813992359121299, "grad_norm": 0.4810662567615509, "learning_rate": 3.717052730938821e-05, "loss": 0.5718, "mean_token_accuracy": 0.8193546414375306, "num_tokens": 438572553.0, "step": 13090 }, { "epoch": 0.7816977077363897, "grad_norm": 0.5309857726097107, "learning_rate": 3.71604997553015e-05, "loss": 0.5263, "mean_token_accuracy": 0.8315877318382263, "num_tokens": 438740233.0, "step": 13095 }, { "epoch": 0.7819961795606495, "grad_norm": 0.5122263431549072, "learning_rate": 3.715046984822177e-05, "loss": 0.5463, "mean_token_accuracy": 0.8265000581741333, "num_tokens": 438907913.0, "step": 13100 }, { "epoch": 0.7822946513849093, "grad_norm": 0.4956192374229431, "learning_rate": 3.714043759059198e-05, "loss": 0.5092, "mean_token_accuracy": 0.837593936920166, "num_tokens": 439075593.0, "step": 13105 }, { "epoch": 0.7825931232091691, "grad_norm": 0.5619259476661682, "learning_rate": 3.7130402984855675e-05, "loss": 0.5796, "mean_token_accuracy": 0.8169509768486023, "num_tokens": 439243273.0, "step": 13110 }, { "epoch": 0.7828915950334289, "grad_norm": 0.505506157875061, "learning_rate": 3.712036603345697e-05, "loss": 0.5013, "mean_token_accuracy": 0.8392654895782471, "num_tokens": 439404835.0, "step": 13115 }, { "epoch": 0.7831900668576887, "grad_norm": 0.4841116964817047, "learning_rate": 3.711032673884054e-05, "loss": 0.5672, "mean_token_accuracy": 0.8238651037216187, "num_tokens": 439565475.0, "step": 13120 }, { "epoch": 0.7834885386819485, "grad_norm": 0.48040181398391724, "learning_rate": 3.7100285103451644e-05, "loss": 0.528, "mean_token_accuracy": 0.8303113341331482, "num_tokens": 439733155.0, "step": 13125 }, { "epoch": 0.7837870105062082, "grad_norm": 0.530348002910614, "learning_rate": 3.709024112973611e-05, "loss": 0.5638, "mean_token_accuracy": 0.8213169574737549, "num_tokens": 439900835.0, "step": 13130 }, { "epoch": 0.784085482330468, "grad_norm": 0.5174166560173035, "learning_rate": 3.708019482014034e-05, "loss": 0.5284, "mean_token_accuracy": 0.8310509324073792, "num_tokens": 440068515.0, "step": 13135 }, { "epoch": 0.7843839541547278, "grad_norm": 0.5310731530189514, "learning_rate": 3.707014617711129e-05, "loss": 0.5443, "mean_token_accuracy": 0.8263211250305176, "num_tokens": 440236195.0, "step": 13140 }, { "epoch": 0.7846824259789876, "grad_norm": 0.5151842832565308, "learning_rate": 3.706009520309649e-05, "loss": 0.5516, "mean_token_accuracy": 0.8239114880561829, "num_tokens": 440403875.0, "step": 13145 }, { "epoch": 0.7849808978032474, "grad_norm": 0.6412427425384521, "learning_rate": 3.705004190054405e-05, "loss": 0.5855, "mean_token_accuracy": 0.8147441029548645, "num_tokens": 440571555.0, "step": 13150 }, { "epoch": 0.7852793696275072, "grad_norm": 0.4924268424510956, "learning_rate": 3.7039986271902624e-05, "loss": 0.5108, "mean_token_accuracy": 0.8361803650856018, "num_tokens": 440739235.0, "step": 13155 }, { "epoch": 0.7855778414517669, "grad_norm": 0.518210768699646, "learning_rate": 3.702992831962145e-05, "loss": 0.5321, "mean_token_accuracy": 0.8306095719337463, "num_tokens": 440906915.0, "step": 13160 }, { "epoch": 0.7858763132760267, "grad_norm": 0.5177653431892395, "learning_rate": 3.701986804615034e-05, "loss": 0.5405, "mean_token_accuracy": 0.8298103213310242, "num_tokens": 441074595.0, "step": 13165 }, { "epoch": 0.7861747851002865, "grad_norm": 0.5237435102462769, "learning_rate": 3.700980545393965e-05, "loss": 0.5436, "mean_token_accuracy": 0.82692950963974, "num_tokens": 441242275.0, "step": 13170 }, { "epoch": 0.7864732569245463, "grad_norm": 0.4853658974170685, "learning_rate": 3.69997405454403e-05, "loss": 0.5055, "mean_token_accuracy": 0.8395502805709839, "num_tokens": 441409955.0, "step": 13175 }, { "epoch": 0.7867717287488061, "grad_norm": 0.5364872813224792, "learning_rate": 3.698967332310381e-05, "loss": 0.5577, "mean_token_accuracy": 0.8237385392189026, "num_tokens": 441577635.0, "step": 13180 }, { "epoch": 0.7870702005730659, "grad_norm": 0.5034254193305969, "learning_rate": 3.697960378938222e-05, "loss": 0.5074, "mean_token_accuracy": 0.8354348063468933, "num_tokens": 441745315.0, "step": 13185 }, { "epoch": 0.7873686723973257, "grad_norm": 0.5822336673736572, "learning_rate": 3.696953194672815e-05, "loss": 0.574, "mean_token_accuracy": 0.8178873777389526, "num_tokens": 441912995.0, "step": 13190 }, { "epoch": 0.7876671442215855, "grad_norm": 0.5720198154449463, "learning_rate": 3.695945779759479e-05, "loss": 0.5459, "mean_token_accuracy": 0.8244960069656372, "num_tokens": 442080675.0, "step": 13195 }, { "epoch": 0.7879656160458453, "grad_norm": 0.49734410643577576, "learning_rate": 3.694938134443588e-05, "loss": 0.5094, "mean_token_accuracy": 0.8362841367721557, "num_tokens": 442248281.0, "step": 13200 }, { "epoch": 0.788264087870105, "grad_norm": 0.5321495532989502, "learning_rate": 3.693930258970573e-05, "loss": 0.5566, "mean_token_accuracy": 0.822557556629181, "num_tokens": 442415961.0, "step": 13205 }, { "epoch": 0.7885625596943648, "grad_norm": 0.48074623942375183, "learning_rate": 3.69292215358592e-05, "loss": 0.5189, "mean_token_accuracy": 0.8337528347969055, "num_tokens": 442583641.0, "step": 13210 }, { "epoch": 0.7888610315186246, "grad_norm": 0.4866797626018524, "learning_rate": 3.691913818535172e-05, "loss": 0.5478, "mean_token_accuracy": 0.8245735406875611, "num_tokens": 442751321.0, "step": 13215 }, { "epoch": 0.7891595033428844, "grad_norm": 0.541733980178833, "learning_rate": 3.690905254063928e-05, "loss": 0.516, "mean_token_accuracy": 0.834259819984436, "num_tokens": 442919001.0, "step": 13220 }, { "epoch": 0.7894579751671442, "grad_norm": 0.49007222056388855, "learning_rate": 3.6898964604178425e-05, "loss": 0.5126, "mean_token_accuracy": 0.8356495261192322, "num_tokens": 443086681.0, "step": 13225 }, { "epoch": 0.789756446991404, "grad_norm": 0.5745511651039124, "learning_rate": 3.688887437842626e-05, "loss": 0.5352, "mean_token_accuracy": 0.828885841369629, "num_tokens": 443254361.0, "step": 13230 }, { "epoch": 0.7900549188156638, "grad_norm": 0.5187568664550781, "learning_rate": 3.687878186584043e-05, "loss": 0.5708, "mean_token_accuracy": 0.818859589099884, "num_tokens": 443422041.0, "step": 13235 }, { "epoch": 0.7903533906399236, "grad_norm": 0.5131155848503113, "learning_rate": 3.6868687068879174e-05, "loss": 0.5309, "mean_token_accuracy": 0.8292616009712219, "num_tokens": 443589721.0, "step": 13240 }, { "epoch": 0.7906518624641834, "grad_norm": 0.5099996328353882, "learning_rate": 3.6858589990001256e-05, "loss": 0.4849, "mean_token_accuracy": 0.8425265431404114, "num_tokens": 443757401.0, "step": 13245 }, { "epoch": 0.7909503342884432, "grad_norm": 0.6119116544723511, "learning_rate": 3.6848490631666e-05, "loss": 0.517, "mean_token_accuracy": 0.8364189386367797, "num_tokens": 443925081.0, "step": 13250 }, { "epoch": 0.791248806112703, "grad_norm": 0.5421985983848572, "learning_rate": 3.68383889963333e-05, "loss": 0.5515, "mean_token_accuracy": 0.8247405409812927, "num_tokens": 444092761.0, "step": 13255 }, { "epoch": 0.7915472779369628, "grad_norm": 0.522763729095459, "learning_rate": 3.682828508646359e-05, "loss": 0.5229, "mean_token_accuracy": 0.8328343152999877, "num_tokens": 444260441.0, "step": 13260 }, { "epoch": 0.7918457497612226, "grad_norm": 0.5403424501419067, "learning_rate": 3.6818178904517874e-05, "loss": 0.569, "mean_token_accuracy": 0.8193785071372985, "num_tokens": 444428121.0, "step": 13265 }, { "epoch": 0.7921442215854824, "grad_norm": 0.5987299084663391, "learning_rate": 3.680807045295769e-05, "loss": 0.5431, "mean_token_accuracy": 0.8293868660926819, "num_tokens": 444595801.0, "step": 13270 }, { "epoch": 0.7924426934097422, "grad_norm": 0.4669605791568756, "learning_rate": 3.6797959734245134e-05, "loss": 0.5191, "mean_token_accuracy": 0.8352439403533936, "num_tokens": 444763481.0, "step": 13275 }, { "epoch": 0.792741165234002, "grad_norm": 0.48344823718070984, "learning_rate": 3.678784675084287e-05, "loss": 0.5441, "mean_token_accuracy": 0.8264821529388428, "num_tokens": 444931161.0, "step": 13280 }, { "epoch": 0.7930396370582617, "grad_norm": 0.5072746276855469, "learning_rate": 3.6777731505214086e-05, "loss": 0.5407, "mean_token_accuracy": 0.8270130038261414, "num_tokens": 445098841.0, "step": 13285 }, { "epoch": 0.7933381088825215, "grad_norm": 0.4818067252635956, "learning_rate": 3.676761399982255e-05, "loss": 0.5083, "mean_token_accuracy": 0.8379756450653076, "num_tokens": 445266521.0, "step": 13290 }, { "epoch": 0.7936365807067812, "grad_norm": 0.48458853363990784, "learning_rate": 3.675749423713257e-05, "loss": 0.5707, "mean_token_accuracy": 0.8203864932060242, "num_tokens": 445434201.0, "step": 13295 }, { "epoch": 0.793935052531041, "grad_norm": 0.5793253183364868, "learning_rate": 3.6747372219608996e-05, "loss": 0.6197, "mean_token_accuracy": 0.8065430164337158, "num_tokens": 445601881.0, "step": 13300 }, { "epoch": 0.7942335243553008, "grad_norm": 0.47477298974990845, "learning_rate": 3.6737247949717227e-05, "loss": 0.5257, "mean_token_accuracy": 0.8313610911369324, "num_tokens": 445769561.0, "step": 13305 }, { "epoch": 0.7945319961795606, "grad_norm": 0.5024024248123169, "learning_rate": 3.6727121429923214e-05, "loss": 0.5567, "mean_token_accuracy": 0.8228617548942566, "num_tokens": 445937241.0, "step": 13310 }, { "epoch": 0.7948304680038204, "grad_norm": 0.6198261380195618, "learning_rate": 3.671699266269347e-05, "loss": 0.5222, "mean_token_accuracy": 0.8339615821838379, "num_tokens": 446104921.0, "step": 13315 }, { "epoch": 0.7951289398280802, "grad_norm": 0.4861522912979126, "learning_rate": 3.6706861650495045e-05, "loss": 0.5139, "mean_token_accuracy": 0.8368066310882568, "num_tokens": 446272601.0, "step": 13320 }, { "epoch": 0.79542741165234, "grad_norm": 0.6107427477836609, "learning_rate": 3.669672839579552e-05, "loss": 0.5266, "mean_token_accuracy": 0.8310211181640625, "num_tokens": 446440281.0, "step": 13325 }, { "epoch": 0.7957258834765998, "grad_norm": 0.5753244757652283, "learning_rate": 3.668659290106303e-05, "loss": 0.5898, "mean_token_accuracy": 0.8134498476982117, "num_tokens": 446607961.0, "step": 13330 }, { "epoch": 0.7960243553008596, "grad_norm": 0.5145120024681091, "learning_rate": 3.667645516876629e-05, "loss": 0.5518, "mean_token_accuracy": 0.8248956084251404, "num_tokens": 446775641.0, "step": 13335 }, { "epoch": 0.7963228271251194, "grad_norm": 0.499484121799469, "learning_rate": 3.66663152013745e-05, "loss": 0.5194, "mean_token_accuracy": 0.8345461130142212, "num_tokens": 446943321.0, "step": 13340 }, { "epoch": 0.7966212989493792, "grad_norm": 0.4748535752296448, "learning_rate": 3.665617300135747e-05, "loss": 0.54, "mean_token_accuracy": 0.8282595753669739, "num_tokens": 447111001.0, "step": 13345 }, { "epoch": 0.796919770773639, "grad_norm": 0.5834560990333557, "learning_rate": 3.6646028571185485e-05, "loss": 0.5643, "mean_token_accuracy": 0.8213467597961426, "num_tokens": 447278681.0, "step": 13350 }, { "epoch": 0.7972182425978988, "grad_norm": 0.5450048446655273, "learning_rate": 3.663588191332944e-05, "loss": 0.5773, "mean_token_accuracy": 0.8185434818267823, "num_tokens": 447446361.0, "step": 13355 }, { "epoch": 0.7975167144221585, "grad_norm": 0.5127323865890503, "learning_rate": 3.662573303026072e-05, "loss": 0.556, "mean_token_accuracy": 0.8222473978996276, "num_tokens": 447614041.0, "step": 13360 }, { "epoch": 0.7978151862464183, "grad_norm": 0.536323606967926, "learning_rate": 3.6615581924451274e-05, "loss": 0.5421, "mean_token_accuracy": 0.8263986706733704, "num_tokens": 447781721.0, "step": 13365 }, { "epoch": 0.7981136580706781, "grad_norm": 0.5523877143859863, "learning_rate": 3.66054285983736e-05, "loss": 0.5522, "mean_token_accuracy": 0.8237385392189026, "num_tokens": 447949401.0, "step": 13370 }, { "epoch": 0.7984121298949379, "grad_norm": 0.6258491277694702, "learning_rate": 3.659527305450072e-05, "loss": 0.57, "mean_token_accuracy": 0.8175951361656189, "num_tokens": 448117081.0, "step": 13375 }, { "epoch": 0.7987106017191977, "grad_norm": 0.5400011539459229, "learning_rate": 3.658511529530622e-05, "loss": 0.5565, "mean_token_accuracy": 0.8227245688438416, "num_tokens": 448284761.0, "step": 13380 }, { "epoch": 0.7990090735434575, "grad_norm": 0.5358766913414001, "learning_rate": 3.657495532326419e-05, "loss": 0.5342, "mean_token_accuracy": 0.8327269434928894, "num_tokens": 448452441.0, "step": 13385 }, { "epoch": 0.7993075453677173, "grad_norm": 0.5900395512580872, "learning_rate": 3.656479314084928e-05, "loss": 0.5593, "mean_token_accuracy": 0.8233150482177735, "num_tokens": 448620121.0, "step": 13390 }, { "epoch": 0.7996060171919771, "grad_norm": 0.5061820149421692, "learning_rate": 3.655462875053669e-05, "loss": 0.5324, "mean_token_accuracy": 0.8286293625831604, "num_tokens": 448787801.0, "step": 13395 }, { "epoch": 0.7999044890162369, "grad_norm": 0.567453145980835, "learning_rate": 3.654446215480214e-05, "loss": 0.6078, "mean_token_accuracy": 0.8083919763565064, "num_tokens": 448955481.0, "step": 13400 }, { "epoch": 0.8002029608404967, "grad_norm": 0.5236828923225403, "learning_rate": 3.6534293356121885e-05, "loss": 0.5501, "mean_token_accuracy": 0.8239234089851379, "num_tokens": 449123161.0, "step": 13405 }, { "epoch": 0.8005014326647565, "grad_norm": 0.5467981100082397, "learning_rate": 3.652412235697274e-05, "loss": 0.5889, "mean_token_accuracy": 0.8145711541175842, "num_tokens": 449290841.0, "step": 13410 }, { "epoch": 0.8007999044890163, "grad_norm": 0.536647617816925, "learning_rate": 3.6513949159832024e-05, "loss": 0.5302, "mean_token_accuracy": 0.8306453704833985, "num_tokens": 449458521.0, "step": 13415 }, { "epoch": 0.8010983763132761, "grad_norm": 0.49470803141593933, "learning_rate": 3.650377376717761e-05, "loss": 0.5751, "mean_token_accuracy": 0.8175891757011413, "num_tokens": 449626201.0, "step": 13420 }, { "epoch": 0.8013968481375359, "grad_norm": 0.5080121159553528, "learning_rate": 3.6493596181487896e-05, "loss": 0.4775, "mean_token_accuracy": 0.8460634589195252, "num_tokens": 449793881.0, "step": 13425 }, { "epoch": 0.8016953199617957, "grad_norm": 0.5268555879592896, "learning_rate": 3.648341640524184e-05, "loss": 0.5126, "mean_token_accuracy": 0.8359537243843078, "num_tokens": 449961561.0, "step": 13430 }, { "epoch": 0.8019937917860553, "grad_norm": 0.5904242396354675, "learning_rate": 3.64732344409189e-05, "loss": 0.5912, "mean_token_accuracy": 0.8128235816955567, "num_tokens": 450129241.0, "step": 13435 }, { "epoch": 0.8022922636103151, "grad_norm": 0.5389574766159058, "learning_rate": 3.646305029099909e-05, "loss": 0.54, "mean_token_accuracy": 0.8272456169128418, "num_tokens": 450296921.0, "step": 13440 }, { "epoch": 0.8025907354345749, "grad_norm": 0.5236079096794128, "learning_rate": 3.645286395796293e-05, "loss": 0.5491, "mean_token_accuracy": 0.8264761924743652, "num_tokens": 450464601.0, "step": 13445 }, { "epoch": 0.8028892072588347, "grad_norm": 0.47309550642967224, "learning_rate": 3.6442675444291516e-05, "loss": 0.4904, "mean_token_accuracy": 0.8411547183990479, "num_tokens": 450632281.0, "step": 13450 }, { "epoch": 0.8031876790830945, "grad_norm": 0.5088681578636169, "learning_rate": 3.6432484752466425e-05, "loss": 0.5529, "mean_token_accuracy": 0.8232971549034118, "num_tokens": 450799961.0, "step": 13455 }, { "epoch": 0.8034861509073543, "grad_norm": 0.6903104186058044, "learning_rate": 3.64222918849698e-05, "loss": 0.5596, "mean_token_accuracy": 0.8209889173507691, "num_tokens": 450967641.0, "step": 13460 }, { "epoch": 0.8037846227316141, "grad_norm": 0.5452691316604614, "learning_rate": 3.6412096844284304e-05, "loss": 0.4934, "mean_token_accuracy": 0.8408445596694947, "num_tokens": 451135321.0, "step": 13465 }, { "epoch": 0.8040830945558739, "grad_norm": 0.5216230154037476, "learning_rate": 3.640189963289312e-05, "loss": 0.5723, "mean_token_accuracy": 0.8172849655151367, "num_tokens": 451303001.0, "step": 13470 }, { "epoch": 0.8043815663801337, "grad_norm": 0.5238450169563293, "learning_rate": 3.639170025327997e-05, "loss": 0.5428, "mean_token_accuracy": 0.8273768305778504, "num_tokens": 451470681.0, "step": 13475 }, { "epoch": 0.8046800382043935, "grad_norm": 0.5196294188499451, "learning_rate": 3.63814987079291e-05, "loss": 0.5122, "mean_token_accuracy": 0.8351186871528625, "num_tokens": 451638361.0, "step": 13480 }, { "epoch": 0.8049785100286533, "grad_norm": 0.5821755528450012, "learning_rate": 3.637129499932529e-05, "loss": 0.5896, "mean_token_accuracy": 0.8135810613632202, "num_tokens": 451806041.0, "step": 13485 }, { "epoch": 0.8052769818529131, "grad_norm": 0.5047705173492432, "learning_rate": 3.636108912995384e-05, "loss": 0.4996, "mean_token_accuracy": 0.8399200797080993, "num_tokens": 451973721.0, "step": 13490 }, { "epoch": 0.8055754536771729, "grad_norm": 0.5244894623756409, "learning_rate": 3.6350881102300566e-05, "loss": 0.523, "mean_token_accuracy": 0.833120596408844, "num_tokens": 452141401.0, "step": 13495 }, { "epoch": 0.8058739255014327, "grad_norm": 0.4949883222579956, "learning_rate": 3.6340670918851835e-05, "loss": 0.5102, "mean_token_accuracy": 0.8356793522834778, "num_tokens": 452309081.0, "step": 13500 }, { "epoch": 0.8061723973256925, "grad_norm": 0.5216145515441895, "learning_rate": 3.6330458582094525e-05, "loss": 0.5511, "mean_token_accuracy": 0.8228273987770081, "num_tokens": 452469604.0, "step": 13505 }, { "epoch": 0.8064708691499523, "grad_norm": 0.4776259958744049, "learning_rate": 3.6320244094516035e-05, "loss": 0.4624, "mean_token_accuracy": 0.849761426448822, "num_tokens": 452637284.0, "step": 13510 }, { "epoch": 0.806769340974212, "grad_norm": 0.5067180395126343, "learning_rate": 3.631002745860429e-05, "loss": 0.555, "mean_token_accuracy": 0.8254324197769165, "num_tokens": 452804964.0, "step": 13515 }, { "epoch": 0.8070678127984718, "grad_norm": 0.48812413215637207, "learning_rate": 3.629980867684775e-05, "loss": 0.5341, "mean_token_accuracy": 0.8304604530334473, "num_tokens": 452972644.0, "step": 13520 }, { "epoch": 0.8073662846227316, "grad_norm": 0.4747433662414551, "learning_rate": 3.628958775173537e-05, "loss": 0.5353, "mean_token_accuracy": 0.8279315114021302, "num_tokens": 453140324.0, "step": 13525 }, { "epoch": 0.8076647564469914, "grad_norm": 0.559675931930542, "learning_rate": 3.627936468575665e-05, "loss": 0.5505, "mean_token_accuracy": 0.8259036064147949, "num_tokens": 453308004.0, "step": 13530 }, { "epoch": 0.8079632282712512, "grad_norm": 0.4890124499797821, "learning_rate": 3.626913948140163e-05, "loss": 0.5362, "mean_token_accuracy": 0.8287188410758972, "num_tokens": 453475684.0, "step": 13535 }, { "epoch": 0.808261700095511, "grad_norm": 0.45533356070518494, "learning_rate": 3.6258912141160825e-05, "loss": 0.5454, "mean_token_accuracy": 0.8260825514793396, "num_tokens": 453643364.0, "step": 13540 }, { "epoch": 0.8085601719197708, "grad_norm": 0.5417728424072266, "learning_rate": 3.62486826675253e-05, "loss": 0.6006, "mean_token_accuracy": 0.8089944005012513, "num_tokens": 453811044.0, "step": 13545 }, { "epoch": 0.8088586437440306, "grad_norm": 0.454820454120636, "learning_rate": 3.6238451062986625e-05, "loss": 0.5002, "mean_token_accuracy": 0.8369318962097168, "num_tokens": 453978724.0, "step": 13550 }, { "epoch": 0.8091571155682904, "grad_norm": 0.5340718626976013, "learning_rate": 3.6228217330036894e-05, "loss": 0.5442, "mean_token_accuracy": 0.8275259494781494, "num_tokens": 454146404.0, "step": 13555 }, { "epoch": 0.8094555873925502, "grad_norm": 0.5246963500976562, "learning_rate": 3.621798147116873e-05, "loss": 0.533, "mean_token_accuracy": 0.8294703722000122, "num_tokens": 454314084.0, "step": 13560 }, { "epoch": 0.80975405921681, "grad_norm": 0.5007262825965881, "learning_rate": 3.620774348887527e-05, "loss": 0.5325, "mean_token_accuracy": 0.8301622271537781, "num_tokens": 454481764.0, "step": 13565 }, { "epoch": 0.8100525310410697, "grad_norm": 0.461511492729187, "learning_rate": 3.619750338565015e-05, "loss": 0.5062, "mean_token_accuracy": 0.8373911499977111, "num_tokens": 454649444.0, "step": 13570 }, { "epoch": 0.8103510028653295, "grad_norm": 0.5453050136566162, "learning_rate": 3.618726116398755e-05, "loss": 0.5343, "mean_token_accuracy": 0.8277346968650818, "num_tokens": 454817124.0, "step": 13575 }, { "epoch": 0.8106494746895893, "grad_norm": 0.48418128490448, "learning_rate": 3.617701682638214e-05, "loss": 0.5479, "mean_token_accuracy": 0.8239711165428162, "num_tokens": 454984804.0, "step": 13580 }, { "epoch": 0.810947946513849, "grad_norm": 1.3153331279754639, "learning_rate": 3.616677037532912e-05, "loss": 0.5717, "mean_token_accuracy": 0.8217762231826782, "num_tokens": 455152484.0, "step": 13585 }, { "epoch": 0.8112464183381088, "grad_norm": 1.1322860717773438, "learning_rate": 3.615652181332421e-05, "loss": 0.5784, "mean_token_accuracy": 0.8178814291954041, "num_tokens": 455320164.0, "step": 13590 }, { "epoch": 0.8115448901623686, "grad_norm": 0.4932667911052704, "learning_rate": 3.614627114286364e-05, "loss": 0.5411, "mean_token_accuracy": 0.8264702439308167, "num_tokens": 455487844.0, "step": 13595 }, { "epoch": 0.8118433619866284, "grad_norm": 0.561107873916626, "learning_rate": 3.613601836644412e-05, "loss": 0.5329, "mean_token_accuracy": 0.8300489068031311, "num_tokens": 455655524.0, "step": 13600 }, { "epoch": 0.8121418338108882, "grad_norm": 0.534894585609436, "learning_rate": 3.612576348656294e-05, "loss": 0.5242, "mean_token_accuracy": 0.8313491702079773, "num_tokens": 455823204.0, "step": 13605 }, { "epoch": 0.812440305635148, "grad_norm": 0.5064579248428345, "learning_rate": 3.611550650571784e-05, "loss": 0.5028, "mean_token_accuracy": 0.8381247878074646, "num_tokens": 455990884.0, "step": 13610 }, { "epoch": 0.8127387774594078, "grad_norm": 0.6063612103462219, "learning_rate": 3.6105247426407116e-05, "loss": 0.5776, "mean_token_accuracy": 0.8201061725616455, "num_tokens": 456158564.0, "step": 13615 }, { "epoch": 0.8130372492836676, "grad_norm": 0.548568069934845, "learning_rate": 3.609498625112954e-05, "loss": 0.5621, "mean_token_accuracy": 0.8212215185165406, "num_tokens": 456326244.0, "step": 13620 }, { "epoch": 0.8133357211079274, "grad_norm": 0.4991692900657654, "learning_rate": 3.6084722982384425e-05, "loss": 0.5023, "mean_token_accuracy": 0.8415066242218018, "num_tokens": 456493924.0, "step": 13625 }, { "epoch": 0.8136341929321872, "grad_norm": 0.5445214509963989, "learning_rate": 3.607445762267156e-05, "loss": 0.5309, "mean_token_accuracy": 0.8310449838638305, "num_tokens": 456661604.0, "step": 13630 }, { "epoch": 0.813932664756447, "grad_norm": 0.4892699718475342, "learning_rate": 3.6064190174491286e-05, "loss": 0.5339, "mean_token_accuracy": 0.8288082957267762, "num_tokens": 456829284.0, "step": 13635 }, { "epoch": 0.8142311365807068, "grad_norm": 0.4974958598613739, "learning_rate": 3.6053920640344404e-05, "loss": 0.5422, "mean_token_accuracy": 0.8274364829063415, "num_tokens": 456996964.0, "step": 13640 }, { "epoch": 0.8145296084049666, "grad_norm": 0.5441718697547913, "learning_rate": 3.604364902273227e-05, "loss": 0.5087, "mean_token_accuracy": 0.8363354325294494, "num_tokens": 457164644.0, "step": 13645 }, { "epoch": 0.8148280802292264, "grad_norm": 0.5584273338317871, "learning_rate": 3.603337532415672e-05, "loss": 0.5151, "mean_token_accuracy": 0.8359954595565796, "num_tokens": 457332324.0, "step": 13650 }, { "epoch": 0.8151265520534862, "grad_norm": 0.474702924489975, "learning_rate": 3.602309954712011e-05, "loss": 0.5502, "mean_token_accuracy": 0.8242574334144592, "num_tokens": 457500004.0, "step": 13655 }, { "epoch": 0.815425023877746, "grad_norm": 0.49339690804481506, "learning_rate": 3.601282169412528e-05, "loss": 0.5724, "mean_token_accuracy": 0.8200524806976318, "num_tokens": 457667684.0, "step": 13660 }, { "epoch": 0.8157234957020058, "grad_norm": 0.48369187116622925, "learning_rate": 3.6002541767675594e-05, "loss": 0.5581, "mean_token_accuracy": 0.8212871313095093, "num_tokens": 457835364.0, "step": 13665 }, { "epoch": 0.8160219675262655, "grad_norm": 0.5393469333648682, "learning_rate": 3.599225977027492e-05, "loss": 0.5437, "mean_token_accuracy": 0.8268638968467712, "num_tokens": 458003044.0, "step": 13670 }, { "epoch": 0.8163204393505253, "grad_norm": 0.5142853856086731, "learning_rate": 3.598197570442764e-05, "loss": 0.4921, "mean_token_accuracy": 0.8428009152412415, "num_tokens": 458170724.0, "step": 13675 }, { "epoch": 0.8166189111747851, "grad_norm": 0.5485891699790955, "learning_rate": 3.597168957263861e-05, "loss": 0.5405, "mean_token_accuracy": 0.825438380241394, "num_tokens": 458338404.0, "step": 13680 }, { "epoch": 0.8169173829990449, "grad_norm": 0.5144602656364441, "learning_rate": 3.5961401377413216e-05, "loss": 0.5245, "mean_token_accuracy": 0.8322855830192566, "num_tokens": 458506084.0, "step": 13685 }, { "epoch": 0.8172158548233047, "grad_norm": 0.5123664736747742, "learning_rate": 3.595111112125734e-05, "loss": 0.5694, "mean_token_accuracy": 0.8184301495552063, "num_tokens": 458673764.0, "step": 13690 }, { "epoch": 0.8175143266475645, "grad_norm": 0.5427488088607788, "learning_rate": 3.594081880667736e-05, "loss": 0.5424, "mean_token_accuracy": 0.8269533514976501, "num_tokens": 458841444.0, "step": 13695 }, { "epoch": 0.8178127984718243, "grad_norm": 0.5048689246177673, "learning_rate": 3.5930524436180155e-05, "loss": 0.5112, "mean_token_accuracy": 0.8360670447349549, "num_tokens": 459009124.0, "step": 13700 }, { "epoch": 0.8181112702960841, "grad_norm": 0.4972262978553772, "learning_rate": 3.592022801227312e-05, "loss": 0.5031, "mean_token_accuracy": 0.8376953363418579, "num_tokens": 459176804.0, "step": 13705 }, { "epoch": 0.8184097421203438, "grad_norm": 0.5604311227798462, "learning_rate": 3.5909929537464136e-05, "loss": 0.5472, "mean_token_accuracy": 0.8253489255905151, "num_tokens": 459344484.0, "step": 13710 }, { "epoch": 0.8187082139446036, "grad_norm": 0.500683069229126, "learning_rate": 3.589962901426157e-05, "loss": 0.5233, "mean_token_accuracy": 0.8328223824501038, "num_tokens": 459512164.0, "step": 13715 }, { "epoch": 0.8190066857688634, "grad_norm": 0.48557719588279724, "learning_rate": 3.5889326445174333e-05, "loss": 0.5349, "mean_token_accuracy": 0.8281701087951661, "num_tokens": 459679844.0, "step": 13720 }, { "epoch": 0.8193051575931232, "grad_norm": 0.5008373260498047, "learning_rate": 3.587902183271178e-05, "loss": 0.5681, "mean_token_accuracy": 0.81687833070755, "num_tokens": 459839940.0, "step": 13725 }, { "epoch": 0.819603629417383, "grad_norm": 0.50841224193573, "learning_rate": 3.58687151793838e-05, "loss": 0.5291, "mean_token_accuracy": 0.8292795062065125, "num_tokens": 460007620.0, "step": 13730 }, { "epoch": 0.8199021012416428, "grad_norm": 0.5785766839981079, "learning_rate": 3.585840648770078e-05, "loss": 0.5528, "mean_token_accuracy": 0.8259513258934021, "num_tokens": 460175300.0, "step": 13735 }, { "epoch": 0.8202005730659025, "grad_norm": 0.5972203612327576, "learning_rate": 3.584809576017358e-05, "loss": 0.5865, "mean_token_accuracy": 0.8151497125625611, "num_tokens": 460342980.0, "step": 13740 }, { "epoch": 0.8204990448901623, "grad_norm": 0.5197674036026001, "learning_rate": 3.5837782999313556e-05, "loss": 0.5408, "mean_token_accuracy": 0.8268281102180481, "num_tokens": 460510660.0, "step": 13745 }, { "epoch": 0.8207975167144221, "grad_norm": 0.48860955238342285, "learning_rate": 3.5827468207632605e-05, "loss": 0.519, "mean_token_accuracy": 0.8344685673713684, "num_tokens": 460678340.0, "step": 13750 }, { "epoch": 0.8210959885386819, "grad_norm": 0.5184617638587952, "learning_rate": 3.581715138764304e-05, "loss": 0.5321, "mean_token_accuracy": 0.8293570280075073, "num_tokens": 460846020.0, "step": 13755 }, { "epoch": 0.8213944603629417, "grad_norm": 0.4803366959095001, "learning_rate": 3.580683254185776e-05, "loss": 0.5861, "mean_token_accuracy": 0.8131277561187744, "num_tokens": 461013700.0, "step": 13760 }, { "epoch": 0.8216929321872015, "grad_norm": 0.5201559066772461, "learning_rate": 3.5796511672790076e-05, "loss": 0.5589, "mean_token_accuracy": 0.8221519708633422, "num_tokens": 461181380.0, "step": 13765 }, { "epoch": 0.8219914040114613, "grad_norm": 0.5416216850280762, "learning_rate": 3.578618878295384e-05, "loss": 0.5378, "mean_token_accuracy": 0.8290528535842896, "num_tokens": 461349060.0, "step": 13770 }, { "epoch": 0.8222898758357211, "grad_norm": 0.5434958338737488, "learning_rate": 3.577586387486338e-05, "loss": 0.5194, "mean_token_accuracy": 0.8329416632652282, "num_tokens": 461516740.0, "step": 13775 }, { "epoch": 0.8225883476599809, "grad_norm": 0.536403477191925, "learning_rate": 3.576553695103352e-05, "loss": 0.5626, "mean_token_accuracy": 0.8210246920585632, "num_tokens": 461684420.0, "step": 13780 }, { "epoch": 0.8228868194842407, "grad_norm": 0.5481341481208801, "learning_rate": 3.575520801397957e-05, "loss": 0.5795, "mean_token_accuracy": 0.8169628977775574, "num_tokens": 461852100.0, "step": 13785 }, { "epoch": 0.8231852913085005, "grad_norm": 0.5420999526977539, "learning_rate": 3.5744877066217344e-05, "loss": 0.5628, "mean_token_accuracy": 0.8219789981842041, "num_tokens": 462019780.0, "step": 13790 }, { "epoch": 0.8234837631327603, "grad_norm": 0.48876407742500305, "learning_rate": 3.573454411026311e-05, "loss": 0.5514, "mean_token_accuracy": 0.8228021025657654, "num_tokens": 462187460.0, "step": 13795 }, { "epoch": 0.8237822349570201, "grad_norm": 0.5534393191337585, "learning_rate": 3.572420914863368e-05, "loss": 0.579, "mean_token_accuracy": 0.8169867634773255, "num_tokens": 462355140.0, "step": 13800 }, { "epoch": 0.8240807067812799, "grad_norm": 0.5204468369483948, "learning_rate": 3.57138721838463e-05, "loss": 0.5136, "mean_token_accuracy": 0.8350769400596618, "num_tokens": 462522820.0, "step": 13805 }, { "epoch": 0.8243791786055397, "grad_norm": 0.5033581852912903, "learning_rate": 3.570353321841874e-05, "loss": 0.5415, "mean_token_accuracy": 0.8270845770835876, "num_tokens": 462690500.0, "step": 13810 }, { "epoch": 0.8246776504297995, "grad_norm": 0.508622944355011, "learning_rate": 3.569319225486924e-05, "loss": 0.5371, "mean_token_accuracy": 0.8308660387992859, "num_tokens": 462858180.0, "step": 13815 }, { "epoch": 0.8249761222540593, "grad_norm": 0.5545150637626648, "learning_rate": 3.568284929571655e-05, "loss": 0.5524, "mean_token_accuracy": 0.8241142988204956, "num_tokens": 463025860.0, "step": 13820 }, { "epoch": 0.825274594078319, "grad_norm": 0.46474355459213257, "learning_rate": 3.567250434347987e-05, "loss": 0.5105, "mean_token_accuracy": 0.8378623366355896, "num_tokens": 463193540.0, "step": 13825 }, { "epoch": 0.8255730659025788, "grad_norm": 0.5277979373931885, "learning_rate": 3.566215740067891e-05, "loss": 0.5655, "mean_token_accuracy": 0.8230478525161743, "num_tokens": 463359917.0, "step": 13830 }, { "epoch": 0.8258715377268386, "grad_norm": 0.5198206901550293, "learning_rate": 3.5651808469833867e-05, "loss": 0.5281, "mean_token_accuracy": 0.8314923048019409, "num_tokens": 463527597.0, "step": 13835 }, { "epoch": 0.8261700095510984, "grad_norm": 0.5170151591300964, "learning_rate": 3.564145755346539e-05, "loss": 0.5066, "mean_token_accuracy": 0.8356316328048706, "num_tokens": 463695277.0, "step": 13840 }, { "epoch": 0.8264684813753582, "grad_norm": 0.6290221810340881, "learning_rate": 3.563110465409466e-05, "loss": 0.5165, "mean_token_accuracy": 0.8329178094863892, "num_tokens": 463862957.0, "step": 13845 }, { "epoch": 0.8267669531996179, "grad_norm": 0.5610340237617493, "learning_rate": 3.56207497742433e-05, "loss": 0.5262, "mean_token_accuracy": 0.8316175699234009, "num_tokens": 464030637.0, "step": 13850 }, { "epoch": 0.8270654250238777, "grad_norm": 0.5238777995109558, "learning_rate": 3.561039291643345e-05, "loss": 0.5316, "mean_token_accuracy": 0.8308063864707946, "num_tokens": 464198317.0, "step": 13855 }, { "epoch": 0.8273638968481375, "grad_norm": 0.6883141994476318, "learning_rate": 3.560003408318771e-05, "loss": 0.5343, "mean_token_accuracy": 0.829386854171753, "num_tokens": 464365997.0, "step": 13860 }, { "epoch": 0.8276623686723973, "grad_norm": 0.5681681632995605, "learning_rate": 3.5589673277029154e-05, "loss": 0.5254, "mean_token_accuracy": 0.831754744052887, "num_tokens": 464533677.0, "step": 13865 }, { "epoch": 0.8279608404966571, "grad_norm": 0.5041458606719971, "learning_rate": 3.557931050048135e-05, "loss": 0.5383, "mean_token_accuracy": 0.8284206032752991, "num_tokens": 464701357.0, "step": 13870 }, { "epoch": 0.8282593123209169, "grad_norm": 0.5190247893333435, "learning_rate": 3.556894575606835e-05, "loss": 0.559, "mean_token_accuracy": 0.8227007031440735, "num_tokens": 464869037.0, "step": 13875 }, { "epoch": 0.8285577841451767, "grad_norm": 0.5099976062774658, "learning_rate": 3.555857904631467e-05, "loss": 0.5243, "mean_token_accuracy": 0.8317964911460877, "num_tokens": 465036717.0, "step": 13880 }, { "epoch": 0.8288562559694365, "grad_norm": 0.6251339912414551, "learning_rate": 3.554821037374533e-05, "loss": 0.5392, "mean_token_accuracy": 0.8280925631523133, "num_tokens": 465204397.0, "step": 13885 }, { "epoch": 0.8291547277936963, "grad_norm": 0.5084047317504883, "learning_rate": 3.5537839740885784e-05, "loss": 0.5222, "mean_token_accuracy": 0.8342240214347839, "num_tokens": 465372077.0, "step": 13890 }, { "epoch": 0.829453199617956, "grad_norm": 0.4804095923900604, "learning_rate": 3.552746715026202e-05, "loss": 0.5087, "mean_token_accuracy": 0.8358463644981384, "num_tokens": 465539757.0, "step": 13895 }, { "epoch": 0.8297516714422158, "grad_norm": 0.47673165798187256, "learning_rate": 3.5517092604400465e-05, "loss": 0.5533, "mean_token_accuracy": 0.8231122374534607, "num_tokens": 465707437.0, "step": 13900 }, { "epoch": 0.8300501432664756, "grad_norm": 0.5101574659347534, "learning_rate": 3.550671610582802e-05, "loss": 0.4975, "mean_token_accuracy": 0.8393236398696899, "num_tokens": 465875117.0, "step": 13905 }, { "epoch": 0.8303486150907354, "grad_norm": 0.5689011216163635, "learning_rate": 3.549633765707208e-05, "loss": 0.5379, "mean_token_accuracy": 0.8284027099609375, "num_tokens": 466042797.0, "step": 13910 }, { "epoch": 0.8306470869149952, "grad_norm": 0.47667232155799866, "learning_rate": 3.548595726066051e-05, "loss": 0.4812, "mean_token_accuracy": 0.8440057277679444, "num_tokens": 466210477.0, "step": 13915 }, { "epoch": 0.830945558739255, "grad_norm": 0.6022024154663086, "learning_rate": 3.547557491912164e-05, "loss": 0.5397, "mean_token_accuracy": 0.827657151222229, "num_tokens": 466378157.0, "step": 13920 }, { "epoch": 0.8312440305635148, "grad_norm": 0.5344946980476379, "learning_rate": 3.546519063498429e-05, "loss": 0.5409, "mean_token_accuracy": 0.8267982840538025, "num_tokens": 466545837.0, "step": 13925 }, { "epoch": 0.8315425023877746, "grad_norm": 0.5445512533187866, "learning_rate": 3.5454804410777744e-05, "loss": 0.5299, "mean_token_accuracy": 0.8288262009620666, "num_tokens": 466713517.0, "step": 13930 }, { "epoch": 0.8318409742120344, "grad_norm": 0.5051825046539307, "learning_rate": 3.544441624903176e-05, "loss": 0.5101, "mean_token_accuracy": 0.8361326456069946, "num_tokens": 466881197.0, "step": 13935 }, { "epoch": 0.8321394460362942, "grad_norm": 0.49107885360717773, "learning_rate": 3.543402615227655e-05, "loss": 0.5251, "mean_token_accuracy": 0.8305678129196167, "num_tokens": 467048877.0, "step": 13940 }, { "epoch": 0.832437917860554, "grad_norm": 0.526508092880249, "learning_rate": 3.542363412304283e-05, "loss": 0.5097, "mean_token_accuracy": 0.8367768168449402, "num_tokens": 467216557.0, "step": 13945 }, { "epoch": 0.8327363896848138, "grad_norm": 0.535841703414917, "learning_rate": 3.541324016386176e-05, "loss": 0.534, "mean_token_accuracy": 0.8307049989700317, "num_tokens": 467384237.0, "step": 13950 }, { "epoch": 0.8330348615090736, "grad_norm": 0.48743993043899536, "learning_rate": 3.540284427726499e-05, "loss": 0.5097, "mean_token_accuracy": 0.8360491633415222, "num_tokens": 467551917.0, "step": 13955 }, { "epoch": 0.8333333333333334, "grad_norm": 0.4653318524360657, "learning_rate": 3.5392446465784627e-05, "loss": 0.5392, "mean_token_accuracy": 0.8283669233322144, "num_tokens": 467719597.0, "step": 13960 }, { "epoch": 0.8336318051575932, "grad_norm": 0.5721853375434875, "learning_rate": 3.5382046731953256e-05, "loss": 0.5323, "mean_token_accuracy": 0.8286234140396118, "num_tokens": 467887277.0, "step": 13965 }, { "epoch": 0.833930276981853, "grad_norm": 0.514631986618042, "learning_rate": 3.537164507830391e-05, "loss": 0.527, "mean_token_accuracy": 0.8319933176040649, "num_tokens": 468054957.0, "step": 13970 }, { "epoch": 0.8342287488061128, "grad_norm": 0.5097449421882629, "learning_rate": 3.536124150737011e-05, "loss": 0.5118, "mean_token_accuracy": 0.8368066430091858, "num_tokens": 468222637.0, "step": 13975 }, { "epoch": 0.8345272206303725, "grad_norm": 0.6050868034362793, "learning_rate": 3.535083602168584e-05, "loss": 0.5203, "mean_token_accuracy": 0.834599781036377, "num_tokens": 468390317.0, "step": 13980 }, { "epoch": 0.8348256924546322, "grad_norm": 0.49385109543800354, "learning_rate": 3.534042862378556e-05, "loss": 0.5199, "mean_token_accuracy": 0.8334009289741516, "num_tokens": 468557997.0, "step": 13985 }, { "epoch": 0.835124164278892, "grad_norm": 0.4916013777256012, "learning_rate": 3.533001931620415e-05, "loss": 0.512, "mean_token_accuracy": 0.8355600476264954, "num_tokens": 468725677.0, "step": 13990 }, { "epoch": 0.8354226361031518, "grad_norm": 0.5108382105827332, "learning_rate": 3.531960810147702e-05, "loss": 0.5244, "mean_token_accuracy": 0.8322855710983277, "num_tokens": 468893357.0, "step": 13995 }, { "epoch": 0.8357211079274116, "grad_norm": 0.5445250272750854, "learning_rate": 3.530919498214e-05, "loss": 0.5851, "mean_token_accuracy": 0.8149469137191773, "num_tokens": 469061037.0, "step": 14000 }, { "epoch": 0.8360195797516714, "grad_norm": 0.522246241569519, "learning_rate": 3.529877996072941e-05, "loss": 0.5157, "mean_token_accuracy": 0.8337230205535888, "num_tokens": 469228717.0, "step": 14005 }, { "epoch": 0.8363180515759312, "grad_norm": 0.45550069212913513, "learning_rate": 3.5288363039782005e-05, "loss": 0.5238, "mean_token_accuracy": 0.8324943304061889, "num_tokens": 469396397.0, "step": 14010 }, { "epoch": 0.836616523400191, "grad_norm": 0.49933379888534546, "learning_rate": 3.5277944221835025e-05, "loss": 0.5459, "mean_token_accuracy": 0.8270607233047486, "num_tokens": 469564077.0, "step": 14015 }, { "epoch": 0.8369149952244508, "grad_norm": 0.553541362285614, "learning_rate": 3.5267523509426164e-05, "loss": 0.5202, "mean_token_accuracy": 0.8323273301124573, "num_tokens": 469731757.0, "step": 14020 }, { "epoch": 0.8372134670487106, "grad_norm": 0.43102484941482544, "learning_rate": 3.525710090509359e-05, "loss": 0.5183, "mean_token_accuracy": 0.8342777013778686, "num_tokens": 469899437.0, "step": 14025 }, { "epoch": 0.8375119388729704, "grad_norm": 0.49941667914390564, "learning_rate": 3.52466764113759e-05, "loss": 0.5318, "mean_token_accuracy": 0.8313193440437316, "num_tokens": 470067117.0, "step": 14030 }, { "epoch": 0.8378104106972302, "grad_norm": 0.5098081231117249, "learning_rate": 3.523625003081218e-05, "loss": 0.5078, "mean_token_accuracy": 0.8364726305007935, "num_tokens": 470234797.0, "step": 14035 }, { "epoch": 0.83810888252149, "grad_norm": 0.5632203221321106, "learning_rate": 3.5225821765941985e-05, "loss": 0.5511, "mean_token_accuracy": 0.8249612212181091, "num_tokens": 470402477.0, "step": 14040 }, { "epoch": 0.8384073543457498, "grad_norm": 0.5137168765068054, "learning_rate": 3.5215391619305284e-05, "loss": 0.5195, "mean_token_accuracy": 0.8332339286804199, "num_tokens": 470570157.0, "step": 14045 }, { "epoch": 0.8387058261700095, "grad_norm": 0.49517109990119934, "learning_rate": 3.520495959344255e-05, "loss": 0.5564, "mean_token_accuracy": 0.8223786234855652, "num_tokens": 470737837.0, "step": 14050 }, { "epoch": 0.8390042979942693, "grad_norm": 0.4742959141731262, "learning_rate": 3.519452569089468e-05, "loss": 0.523, "mean_token_accuracy": 0.8328789830207824, "num_tokens": 470901313.0, "step": 14055 }, { "epoch": 0.8393027698185291, "grad_norm": 0.5067028999328613, "learning_rate": 3.5184089914203064e-05, "loss": 0.5334, "mean_token_accuracy": 0.8302695870399475, "num_tokens": 471068993.0, "step": 14060 }, { "epoch": 0.8396012416427889, "grad_norm": 0.54022616147995, "learning_rate": 3.5173652265909514e-05, "loss": 0.5589, "mean_token_accuracy": 0.8213945031166077, "num_tokens": 471236673.0, "step": 14065 }, { "epoch": 0.8398997134670487, "grad_norm": 0.5471249222755432, "learning_rate": 3.5163212748556316e-05, "loss": 0.5365, "mean_token_accuracy": 0.8292138695716857, "num_tokens": 471404353.0, "step": 14070 }, { "epoch": 0.8401981852913085, "grad_norm": 0.4616066813468933, "learning_rate": 3.515277136468621e-05, "loss": 0.5391, "mean_token_accuracy": 0.8259334325790405, "num_tokens": 471572033.0, "step": 14075 }, { "epoch": 0.8404966571155683, "grad_norm": 0.5127003788948059, "learning_rate": 3.514232811684238e-05, "loss": 0.5756, "mean_token_accuracy": 0.8191250801086426, "num_tokens": 471737001.0, "step": 14080 }, { "epoch": 0.8407951289398281, "grad_norm": 0.8175386786460876, "learning_rate": 3.513188300756847e-05, "loss": 0.5338, "mean_token_accuracy": 0.8297101616859436, "num_tokens": 471903226.0, "step": 14085 }, { "epoch": 0.8410936007640879, "grad_norm": 0.49775615334510803, "learning_rate": 3.51214360394086e-05, "loss": 0.5201, "mean_token_accuracy": 0.8327925682067872, "num_tokens": 472070906.0, "step": 14090 }, { "epoch": 0.8413920725883477, "grad_norm": 0.5120581984519958, "learning_rate": 3.511098721490729e-05, "loss": 0.551, "mean_token_accuracy": 0.8235417008399963, "num_tokens": 472238586.0, "step": 14095 }, { "epoch": 0.8416905444126075, "grad_norm": 0.4909849762916565, "learning_rate": 3.5100536536609574e-05, "loss": 0.5645, "mean_token_accuracy": 0.8215436100959778, "num_tokens": 472406266.0, "step": 14100 }, { "epoch": 0.8419890162368673, "grad_norm": 0.5036343336105347, "learning_rate": 3.5090084007060895e-05, "loss": 0.5587, "mean_token_accuracy": 0.8215533375740052, "num_tokens": 472565981.0, "step": 14105 }, { "epoch": 0.8422874880611271, "grad_norm": 0.5000784993171692, "learning_rate": 3.507962962880715e-05, "loss": 0.5474, "mean_token_accuracy": 0.8252356052398682, "num_tokens": 472733661.0, "step": 14110 }, { "epoch": 0.8425859598853869, "grad_norm": 0.566957414150238, "learning_rate": 3.5069173404394704e-05, "loss": 0.549, "mean_token_accuracy": 0.8240427017211914, "num_tokens": 472901341.0, "step": 14115 }, { "epoch": 0.8428844317096467, "grad_norm": 0.5268917083740234, "learning_rate": 3.5058715336370374e-05, "loss": 0.54, "mean_token_accuracy": 0.8272157907485962, "num_tokens": 473069021.0, "step": 14120 }, { "epoch": 0.8431829035339063, "grad_norm": 0.5712652206420898, "learning_rate": 3.5048255427281404e-05, "loss": 0.5677, "mean_token_accuracy": 0.8207622647285462, "num_tokens": 473236701.0, "step": 14125 }, { "epoch": 0.8434813753581661, "grad_norm": 0.48082488775253296, "learning_rate": 3.50377936796755e-05, "loss": 0.4917, "mean_token_accuracy": 0.8411368250846862, "num_tokens": 473404381.0, "step": 14130 }, { "epoch": 0.8437798471824259, "grad_norm": 0.5146220326423645, "learning_rate": 3.5027330096100805e-05, "loss": 0.5246, "mean_token_accuracy": 0.8308839201927185, "num_tokens": 473572061.0, "step": 14135 }, { "epoch": 0.8440783190066857, "grad_norm": 0.5012195110321045, "learning_rate": 3.501686467910593e-05, "loss": 0.546, "mean_token_accuracy": 0.8261660456657409, "num_tokens": 473739741.0, "step": 14140 }, { "epoch": 0.8443767908309455, "grad_norm": 0.5551221966743469, "learning_rate": 3.5006397431239926e-05, "loss": 0.5524, "mean_token_accuracy": 0.8249970078468323, "num_tokens": 473907421.0, "step": 14145 }, { "epoch": 0.8446752626552053, "grad_norm": 0.5235398411750793, "learning_rate": 3.499592835505226e-05, "loss": 0.5548, "mean_token_accuracy": 0.8232971668243408, "num_tokens": 474075101.0, "step": 14150 }, { "epoch": 0.8449737344794651, "grad_norm": 0.6222826242446899, "learning_rate": 3.4985457453092894e-05, "loss": 0.5616, "mean_token_accuracy": 0.8232017040252686, "num_tokens": 474242781.0, "step": 14155 }, { "epoch": 0.8452722063037249, "grad_norm": 0.46936047077178955, "learning_rate": 3.497498472791221e-05, "loss": 0.5227, "mean_token_accuracy": 0.8320648908615113, "num_tokens": 474410461.0, "step": 14160 }, { "epoch": 0.8455706781279847, "grad_norm": 0.5509169101715088, "learning_rate": 3.4964510182061005e-05, "loss": 0.6008, "mean_token_accuracy": 0.8102409601211548, "num_tokens": 474578141.0, "step": 14165 }, { "epoch": 0.8458691499522445, "grad_norm": 0.46338951587677, "learning_rate": 3.495403381809057e-05, "loss": 0.4766, "mean_token_accuracy": 0.8434390902519227, "num_tokens": 474745821.0, "step": 14170 }, { "epoch": 0.8461676217765043, "grad_norm": 0.5123574733734131, "learning_rate": 3.4943555638552625e-05, "loss": 0.5569, "mean_token_accuracy": 0.8220147967338562, "num_tokens": 474913501.0, "step": 14175 }, { "epoch": 0.8464660936007641, "grad_norm": 0.47622036933898926, "learning_rate": 3.49330756459993e-05, "loss": 0.5351, "mean_token_accuracy": 0.8301741600036621, "num_tokens": 475081181.0, "step": 14180 }, { "epoch": 0.8467645654250239, "grad_norm": 0.5069224238395691, "learning_rate": 3.4922593842983206e-05, "loss": 0.5079, "mean_token_accuracy": 0.8378183722496033, "num_tokens": 475243223.0, "step": 14185 }, { "epoch": 0.8470630372492837, "grad_norm": 0.5753136277198792, "learning_rate": 3.491211023205737e-05, "loss": 0.5591, "mean_token_accuracy": 0.8227722644805908, "num_tokens": 475410903.0, "step": 14190 }, { "epoch": 0.8473615090735435, "grad_norm": 0.540684163570404, "learning_rate": 3.490162481577529e-05, "loss": 0.5948, "mean_token_accuracy": 0.8118394374847412, "num_tokens": 475578583.0, "step": 14195 }, { "epoch": 0.8476599808978033, "grad_norm": 0.497579962015152, "learning_rate": 3.4891137596690845e-05, "loss": 0.5088, "mean_token_accuracy": 0.8377669095993042, "num_tokens": 475746263.0, "step": 14200 }, { "epoch": 0.847958452722063, "grad_norm": 0.46217402815818787, "learning_rate": 3.488064857735843e-05, "loss": 0.492, "mean_token_accuracy": 0.8410712122917176, "num_tokens": 475913943.0, "step": 14205 }, { "epoch": 0.8482569245463228, "grad_norm": 0.44693678617477417, "learning_rate": 3.487015776033282e-05, "loss": 0.5356, "mean_token_accuracy": 0.828921627998352, "num_tokens": 476081623.0, "step": 14210 }, { "epoch": 0.8485553963705826, "grad_norm": 0.5191335082054138, "learning_rate": 3.485966514816925e-05, "loss": 0.5178, "mean_token_accuracy": 0.8327985167503357, "num_tokens": 476249303.0, "step": 14215 }, { "epoch": 0.8488538681948424, "grad_norm": 0.5962714552879333, "learning_rate": 3.4849170743423385e-05, "loss": 0.5494, "mean_token_accuracy": 0.8267624974250793, "num_tokens": 476416983.0, "step": 14220 }, { "epoch": 0.8491523400191022, "grad_norm": 0.5007015466690063, "learning_rate": 3.4838674548651346e-05, "loss": 0.5434, "mean_token_accuracy": 0.8269772052764892, "num_tokens": 476584663.0, "step": 14225 }, { "epoch": 0.849450811843362, "grad_norm": 0.4992782771587372, "learning_rate": 3.482817656640965e-05, "loss": 0.5596, "mean_token_accuracy": 0.8225515842437744, "num_tokens": 476752343.0, "step": 14230 }, { "epoch": 0.8497492836676218, "grad_norm": 0.5055899620056152, "learning_rate": 3.4817676799255307e-05, "loss": 0.5212, "mean_token_accuracy": 0.8330251693725585, "num_tokens": 476920023.0, "step": 14235 }, { "epoch": 0.8500477554918816, "grad_norm": 0.46591854095458984, "learning_rate": 3.480717524974569e-05, "loss": 0.5133, "mean_token_accuracy": 0.8340271949768067, "num_tokens": 477087703.0, "step": 14240 }, { "epoch": 0.8503462273161414, "grad_norm": 0.49624285101890564, "learning_rate": 3.479667192043868e-05, "loss": 0.5781, "mean_token_accuracy": 0.8159549117088318, "num_tokens": 477255383.0, "step": 14245 }, { "epoch": 0.8506446991404012, "grad_norm": 0.4981527328491211, "learning_rate": 3.4786166813892535e-05, "loss": 0.5494, "mean_token_accuracy": 0.8254622340202331, "num_tokens": 477423063.0, "step": 14250 }, { "epoch": 0.850943170964661, "grad_norm": 0.4836324155330658, "learning_rate": 3.477565993266598e-05, "loss": 0.5056, "mean_token_accuracy": 0.8368722319602966, "num_tokens": 477590743.0, "step": 14255 }, { "epoch": 0.8512416427889207, "grad_norm": 0.49234965443611145, "learning_rate": 3.476515127931816e-05, "loss": 0.5231, "mean_token_accuracy": 0.8327865839004517, "num_tokens": 477758423.0, "step": 14260 }, { "epoch": 0.8515401146131805, "grad_norm": 0.5004165172576904, "learning_rate": 3.475464085640864e-05, "loss": 0.5104, "mean_token_accuracy": 0.8364845514297485, "num_tokens": 477926103.0, "step": 14265 }, { "epoch": 0.8518385864374403, "grad_norm": 0.512992799282074, "learning_rate": 3.474412866649743e-05, "loss": 0.5262, "mean_token_accuracy": 0.8305201053619384, "num_tokens": 478093783.0, "step": 14270 }, { "epoch": 0.8521370582617, "grad_norm": 0.5965930819511414, "learning_rate": 3.473361471214498e-05, "loss": 0.5759, "mean_token_accuracy": 0.8187403082847595, "num_tokens": 478261463.0, "step": 14275 }, { "epoch": 0.8524355300859598, "grad_norm": 0.5190654993057251, "learning_rate": 3.4723098995912135e-05, "loss": 0.5023, "mean_token_accuracy": 0.8395443201065064, "num_tokens": 478429143.0, "step": 14280 }, { "epoch": 0.8527340019102196, "grad_norm": 0.5768404006958008, "learning_rate": 3.471258152036022e-05, "loss": 0.5949, "mean_token_accuracy": 0.8119467973709107, "num_tokens": 478596823.0, "step": 14285 }, { "epoch": 0.8530324737344794, "grad_norm": 0.5373517870903015, "learning_rate": 3.4702062288050945e-05, "loss": 0.4949, "mean_token_accuracy": 0.840838611125946, "num_tokens": 478764503.0, "step": 14290 }, { "epoch": 0.8533309455587392, "grad_norm": 0.5412562489509583, "learning_rate": 3.469154130154647e-05, "loss": 0.5872, "mean_token_accuracy": 0.8135452747344971, "num_tokens": 478932183.0, "step": 14295 }, { "epoch": 0.853629417382999, "grad_norm": 0.5922993421554565, "learning_rate": 3.4681018563409364e-05, "loss": 0.5583, "mean_token_accuracy": 0.8255338191986084, "num_tokens": 479099863.0, "step": 14300 }, { "epoch": 0.8539278892072588, "grad_norm": 0.490708589553833, "learning_rate": 3.467049407620264e-05, "loss": 0.5338, "mean_token_accuracy": 0.8310390114784241, "num_tokens": 479267543.0, "step": 14305 }, { "epoch": 0.8542263610315186, "grad_norm": 0.5206578373908997, "learning_rate": 3.4659967842489733e-05, "loss": 0.5026, "mean_token_accuracy": 0.8386675357818604, "num_tokens": 479435223.0, "step": 14310 }, { "epoch": 0.8545248328557784, "grad_norm": 0.5300891399383545, "learning_rate": 3.46494398648345e-05, "loss": 0.5393, "mean_token_accuracy": 0.8279852151870728, "num_tokens": 479602903.0, "step": 14315 }, { "epoch": 0.8548233046800382, "grad_norm": 0.459747850894928, "learning_rate": 3.463891014580122e-05, "loss": 0.5407, "mean_token_accuracy": 0.8270249485969543, "num_tokens": 479770583.0, "step": 14320 }, { "epoch": 0.855121776504298, "grad_norm": 0.612096905708313, "learning_rate": 3.462837868795461e-05, "loss": 0.5268, "mean_token_accuracy": 0.8332160353660584, "num_tokens": 479938263.0, "step": 14325 }, { "epoch": 0.8554202483285578, "grad_norm": 0.5251293778419495, "learning_rate": 3.461784549385979e-05, "loss": 0.5282, "mean_token_accuracy": 0.8328701019287109, "num_tokens": 480105943.0, "step": 14330 }, { "epoch": 0.8557187201528176, "grad_norm": 0.502863347530365, "learning_rate": 3.460731056608233e-05, "loss": 0.4748, "mean_token_accuracy": 0.8477573752403259, "num_tokens": 480273623.0, "step": 14335 }, { "epoch": 0.8560171919770774, "grad_norm": 0.5029167532920837, "learning_rate": 3.459677390718819e-05, "loss": 0.5348, "mean_token_accuracy": 0.8268519759178161, "num_tokens": 480441303.0, "step": 14340 }, { "epoch": 0.8563156638013372, "grad_norm": 0.5670776963233948, "learning_rate": 3.4586235519743774e-05, "loss": 0.5339, "mean_token_accuracy": 0.829899787902832, "num_tokens": 480608983.0, "step": 14345 }, { "epoch": 0.856614135625597, "grad_norm": 0.4996817111968994, "learning_rate": 3.45756954063159e-05, "loss": 0.5557, "mean_token_accuracy": 0.8222533702850342, "num_tokens": 480776663.0, "step": 14350 }, { "epoch": 0.8569126074498568, "grad_norm": 0.5385510325431824, "learning_rate": 3.456515356947181e-05, "loss": 0.5091, "mean_token_accuracy": 0.8370869398117066, "num_tokens": 480944343.0, "step": 14355 }, { "epoch": 0.8572110792741165, "grad_norm": 0.5175701975822449, "learning_rate": 3.455461001177915e-05, "loss": 0.5766, "mean_token_accuracy": 0.8181796431541443, "num_tokens": 481112023.0, "step": 14360 }, { "epoch": 0.8575095510983763, "grad_norm": 0.5024404525756836, "learning_rate": 3.454406473580602e-05, "loss": 0.5292, "mean_token_accuracy": 0.8305618405342102, "num_tokens": 481279703.0, "step": 14365 }, { "epoch": 0.8578080229226361, "grad_norm": 0.5207492709159851, "learning_rate": 3.4533517744120894e-05, "loss": 0.5142, "mean_token_accuracy": 0.8351246476173401, "num_tokens": 481447383.0, "step": 14370 }, { "epoch": 0.8581064947468959, "grad_norm": 0.5085864663124084, "learning_rate": 3.452296903929271e-05, "loss": 0.5527, "mean_token_accuracy": 0.8236311674118042, "num_tokens": 481615063.0, "step": 14375 }, { "epoch": 0.8584049665711557, "grad_norm": 0.4787059426307678, "learning_rate": 3.451241862389078e-05, "loss": 0.4988, "mean_token_accuracy": 0.8387331366539001, "num_tokens": 481782743.0, "step": 14380 }, { "epoch": 0.8587034383954155, "grad_norm": 0.5232252478599548, "learning_rate": 3.4501866500484856e-05, "loss": 0.5561, "mean_token_accuracy": 0.8238160490989686, "num_tokens": 481950423.0, "step": 14385 }, { "epoch": 0.8590019102196753, "grad_norm": 0.4464224874973297, "learning_rate": 3.449131267164512e-05, "loss": 0.4947, "mean_token_accuracy": 0.8401168942451477, "num_tokens": 482118103.0, "step": 14390 }, { "epoch": 0.8593003820439351, "grad_norm": 0.4599359631538391, "learning_rate": 3.448075713994213e-05, "loss": 0.5387, "mean_token_accuracy": 0.829524040222168, "num_tokens": 482285783.0, "step": 14395 }, { "epoch": 0.8595988538681948, "grad_norm": 0.4855235815048218, "learning_rate": 3.4470199907946886e-05, "loss": 0.5435, "mean_token_accuracy": 0.8270070433616639, "num_tokens": 482453463.0, "step": 14400 }, { "epoch": 0.8598973256924546, "grad_norm": 0.5263638496398926, "learning_rate": 3.44596409782308e-05, "loss": 0.5116, "mean_token_accuracy": 0.835351312160492, "num_tokens": 482621143.0, "step": 14405 }, { "epoch": 0.8601957975167144, "grad_norm": 0.5329366326332092, "learning_rate": 3.44490803533657e-05, "loss": 0.5174, "mean_token_accuracy": 0.8341345667839051, "num_tokens": 482788823.0, "step": 14410 }, { "epoch": 0.8604942693409742, "grad_norm": 0.5646621584892273, "learning_rate": 3.44385180359238e-05, "loss": 0.5704, "mean_token_accuracy": 0.8182392835617065, "num_tokens": 482956503.0, "step": 14415 }, { "epoch": 0.860792741165234, "grad_norm": 0.5521789193153381, "learning_rate": 3.442795402847778e-05, "loss": 0.5225, "mean_token_accuracy": 0.8330311298370361, "num_tokens": 483124183.0, "step": 14420 }, { "epoch": 0.8610912129894938, "grad_norm": 0.5149720907211304, "learning_rate": 3.441738833360067e-05, "loss": 0.5933, "mean_token_accuracy": 0.8144399523735046, "num_tokens": 483291863.0, "step": 14425 }, { "epoch": 0.8613896848137536, "grad_norm": 0.5148646831512451, "learning_rate": 3.440682095386595e-05, "loss": 0.5041, "mean_token_accuracy": 0.8392878413200379, "num_tokens": 483459543.0, "step": 14430 }, { "epoch": 0.8616881566380133, "grad_norm": 0.6794476509094238, "learning_rate": 3.43962518918475e-05, "loss": 0.5928, "mean_token_accuracy": 0.8129965424537658, "num_tokens": 483627223.0, "step": 14435 }, { "epoch": 0.8619866284622731, "grad_norm": 0.4807991683483124, "learning_rate": 3.438568115011962e-05, "loss": 0.5368, "mean_token_accuracy": 0.8297089219093323, "num_tokens": 483794903.0, "step": 14440 }, { "epoch": 0.8622851002865329, "grad_norm": 0.5064502954483032, "learning_rate": 3.437510873125699e-05, "loss": 0.5387, "mean_token_accuracy": 0.8275080561637879, "num_tokens": 483962583.0, "step": 14445 }, { "epoch": 0.8625835721107927, "grad_norm": 0.5123314261436462, "learning_rate": 3.436453463783474e-05, "loss": 0.5533, "mean_token_accuracy": 0.8242335796356202, "num_tokens": 484130263.0, "step": 14450 }, { "epoch": 0.8628820439350525, "grad_norm": 0.49425628781318665, "learning_rate": 3.435395887242837e-05, "loss": 0.5424, "mean_token_accuracy": 0.827317190170288, "num_tokens": 484297943.0, "step": 14455 }, { "epoch": 0.8631805157593123, "grad_norm": 0.5996102690696716, "learning_rate": 3.4343381437613814e-05, "loss": 0.515, "mean_token_accuracy": 0.8351186990737915, "num_tokens": 484465623.0, "step": 14460 }, { "epoch": 0.8634789875835721, "grad_norm": 0.501671314239502, "learning_rate": 3.433280233596738e-05, "loss": 0.5475, "mean_token_accuracy": 0.8252475261688232, "num_tokens": 484633303.0, "step": 14465 }, { "epoch": 0.8637774594078319, "grad_norm": 0.5192108154296875, "learning_rate": 3.4322221570065824e-05, "loss": 0.5415, "mean_token_accuracy": 0.8267028570175171, "num_tokens": 484800983.0, "step": 14470 }, { "epoch": 0.8640759312320917, "grad_norm": 0.4694269895553589, "learning_rate": 3.4311639142486295e-05, "loss": 0.4752, "mean_token_accuracy": 0.8457533240318298, "num_tokens": 484968663.0, "step": 14475 }, { "epoch": 0.8643744030563515, "grad_norm": 0.49048593640327454, "learning_rate": 3.430105505580631e-05, "loss": 0.5337, "mean_token_accuracy": 0.8305499315261841, "num_tokens": 485136343.0, "step": 14480 }, { "epoch": 0.8646728748806113, "grad_norm": 0.4649653434753418, "learning_rate": 3.4290469312603826e-05, "loss": 0.4975, "mean_token_accuracy": 0.8389499068260193, "num_tokens": 485302737.0, "step": 14485 }, { "epoch": 0.8649713467048711, "grad_norm": 0.5668534636497498, "learning_rate": 3.4279881915457216e-05, "loss": 0.533, "mean_token_accuracy": 0.8292675733566284, "num_tokens": 485470417.0, "step": 14490 }, { "epoch": 0.8652698185291309, "grad_norm": 0.55607670545578, "learning_rate": 3.426929286694522e-05, "loss": 0.5253, "mean_token_accuracy": 0.8315280914306641, "num_tokens": 485638097.0, "step": 14495 }, { "epoch": 0.8655682903533907, "grad_norm": 0.5578140616416931, "learning_rate": 3.4258702169646994e-05, "loss": 0.4983, "mean_token_accuracy": 0.8403435707092285, "num_tokens": 485805777.0, "step": 14500 }, { "epoch": 0.8658667621776505, "grad_norm": 0.5597586631774902, "learning_rate": 3.424810982614212e-05, "loss": 0.5686, "mean_token_accuracy": 0.821501863002777, "num_tokens": 485973457.0, "step": 14505 }, { "epoch": 0.8661652340019103, "grad_norm": 0.5656439661979675, "learning_rate": 3.423751583901052e-05, "loss": 0.5552, "mean_token_accuracy": 0.8234760880470275, "num_tokens": 486141137.0, "step": 14510 }, { "epoch": 0.86646370582617, "grad_norm": 0.5375494360923767, "learning_rate": 3.422692021083259e-05, "loss": 0.5136, "mean_token_accuracy": 0.8346415400505066, "num_tokens": 486308817.0, "step": 14515 }, { "epoch": 0.8667621776504298, "grad_norm": 0.44071945548057556, "learning_rate": 3.4216322944189076e-05, "loss": 0.5243, "mean_token_accuracy": 0.8339437007904053, "num_tokens": 486476497.0, "step": 14520 }, { "epoch": 0.8670606494746896, "grad_norm": 0.4848516285419464, "learning_rate": 3.4205724041661135e-05, "loss": 0.5301, "mean_token_accuracy": 0.8339735269546509, "num_tokens": 486644177.0, "step": 14525 }, { "epoch": 0.8673591212989494, "grad_norm": 0.4755540192127228, "learning_rate": 3.4195123505830326e-05, "loss": 0.5056, "mean_token_accuracy": 0.8366217374801636, "num_tokens": 486811857.0, "step": 14530 }, { "epoch": 0.8676575931232091, "grad_norm": 0.5334329009056091, "learning_rate": 3.418452133927862e-05, "loss": 0.5868, "mean_token_accuracy": 0.8151377677917481, "num_tokens": 486979537.0, "step": 14535 }, { "epoch": 0.8679560649474689, "grad_norm": 0.5428524017333984, "learning_rate": 3.417391754458836e-05, "loss": 0.5317, "mean_token_accuracy": 0.8293510675430298, "num_tokens": 487147217.0, "step": 14540 }, { "epoch": 0.8682545367717287, "grad_norm": 0.45626723766326904, "learning_rate": 3.41633121243423e-05, "loss": 0.5152, "mean_token_accuracy": 0.8340331554412842, "num_tokens": 487314897.0, "step": 14545 }, { "epoch": 0.8685530085959885, "grad_norm": 0.6469405293464661, "learning_rate": 3.415270508112357e-05, "loss": 0.5368, "mean_token_accuracy": 0.8293093204498291, "num_tokens": 487482577.0, "step": 14550 }, { "epoch": 0.8688514804202483, "grad_norm": 0.5415879487991333, "learning_rate": 3.414209641751572e-05, "loss": 0.5633, "mean_token_accuracy": 0.8217941045761108, "num_tokens": 487650257.0, "step": 14555 }, { "epoch": 0.8691499522445081, "grad_norm": 0.5342953205108643, "learning_rate": 3.41314861361027e-05, "loss": 0.5571, "mean_token_accuracy": 0.8227722764015197, "num_tokens": 487817937.0, "step": 14560 }, { "epoch": 0.8694484240687679, "grad_norm": 0.47511962056159973, "learning_rate": 3.412087423946883e-05, "loss": 0.5073, "mean_token_accuracy": 0.8363175392150879, "num_tokens": 487985617.0, "step": 14565 }, { "epoch": 0.8697468958930277, "grad_norm": 0.49829038977622986, "learning_rate": 3.411026073019882e-05, "loss": 0.4882, "mean_token_accuracy": 0.8436180472373962, "num_tokens": 488153297.0, "step": 14570 }, { "epoch": 0.8700453677172875, "grad_norm": 0.6791826486587524, "learning_rate": 3.409964561087781e-05, "loss": 0.544, "mean_token_accuracy": 0.8278003096580505, "num_tokens": 488320977.0, "step": 14575 }, { "epoch": 0.8703438395415473, "grad_norm": 0.46776020526885986, "learning_rate": 3.408902888409127e-05, "loss": 0.5373, "mean_token_accuracy": 0.827281391620636, "num_tokens": 488488657.0, "step": 14580 }, { "epoch": 0.870642311365807, "grad_norm": 0.5241900086402893, "learning_rate": 3.4078410552425154e-05, "loss": 0.5243, "mean_token_accuracy": 0.8316950917243957, "num_tokens": 488656337.0, "step": 14585 }, { "epoch": 0.8709407831900668, "grad_norm": 0.47714149951934814, "learning_rate": 3.406779061846571e-05, "loss": 0.5009, "mean_token_accuracy": 0.8374746561050415, "num_tokens": 488824017.0, "step": 14590 }, { "epoch": 0.8712392550143266, "grad_norm": 0.46514490246772766, "learning_rate": 3.4057169084799645e-05, "loss": 0.5412, "mean_token_accuracy": 0.8257604718208313, "num_tokens": 488991697.0, "step": 14595 }, { "epoch": 0.8715377268385864, "grad_norm": 0.5989508628845215, "learning_rate": 3.404654595401401e-05, "loss": 0.5436, "mean_token_accuracy": 0.826792311668396, "num_tokens": 489159377.0, "step": 14600 }, { "epoch": 0.8718361986628462, "grad_norm": 0.47459229826927185, "learning_rate": 3.4035921228696275e-05, "loss": 0.5097, "mean_token_accuracy": 0.8347548723220826, "num_tokens": 489327057.0, "step": 14605 }, { "epoch": 0.872134670487106, "grad_norm": 0.5450000762939453, "learning_rate": 3.402529491143428e-05, "loss": 0.5252, "mean_token_accuracy": 0.8310688376426697, "num_tokens": 489494737.0, "step": 14610 }, { "epoch": 0.8724331423113658, "grad_norm": 0.4706822335720062, "learning_rate": 3.401466700481627e-05, "loss": 0.4926, "mean_token_accuracy": 0.8403316259384155, "num_tokens": 489662417.0, "step": 14615 }, { "epoch": 0.8727316141356256, "grad_norm": 0.4576685130596161, "learning_rate": 3.4004037511430866e-05, "loss": 0.4926, "mean_token_accuracy": 0.8423893690109253, "num_tokens": 489830097.0, "step": 14620 }, { "epoch": 0.8730300859598854, "grad_norm": 0.4945715665817261, "learning_rate": 3.3993406433867076e-05, "loss": 0.4931, "mean_token_accuracy": 0.8403972387313843, "num_tokens": 489997777.0, "step": 14625 }, { "epoch": 0.8733285577841452, "grad_norm": 0.5267667770385742, "learning_rate": 3.398277377471429e-05, "loss": 0.5098, "mean_token_accuracy": 0.8352022051811219, "num_tokens": 490165457.0, "step": 14630 }, { "epoch": 0.873627029608405, "grad_norm": 0.4888465404510498, "learning_rate": 3.397213953656229e-05, "loss": 0.5227, "mean_token_accuracy": 0.8301025986671448, "num_tokens": 490333137.0, "step": 14635 }, { "epoch": 0.8739255014326648, "grad_norm": 0.4893541932106018, "learning_rate": 3.396150372200125e-05, "loss": 0.5171, "mean_token_accuracy": 0.8331981420516967, "num_tokens": 490500817.0, "step": 14640 }, { "epoch": 0.8742239732569246, "grad_norm": 0.494081974029541, "learning_rate": 3.3950866333621715e-05, "loss": 0.5002, "mean_token_accuracy": 0.8382440567016601, "num_tokens": 490668497.0, "step": 14645 }, { "epoch": 0.8745224450811844, "grad_norm": 0.465697318315506, "learning_rate": 3.394022737401462e-05, "loss": 0.5236, "mean_token_accuracy": 0.8325539827346802, "num_tokens": 490836177.0, "step": 14650 }, { "epoch": 0.8748209169054442, "grad_norm": 0.5026145577430725, "learning_rate": 3.392958684577126e-05, "loss": 0.5313, "mean_token_accuracy": 0.8313551306724548, "num_tokens": 491003857.0, "step": 14655 }, { "epoch": 0.875119388729704, "grad_norm": 0.4903579652309418, "learning_rate": 3.3918944751483353e-05, "loss": 0.5932, "mean_token_accuracy": 0.8120780110359191, "num_tokens": 491171537.0, "step": 14660 }, { "epoch": 0.8754178605539638, "grad_norm": 0.4996260106563568, "learning_rate": 3.390830109374297e-05, "loss": 0.5449, "mean_token_accuracy": 0.8251043796539307, "num_tokens": 491339217.0, "step": 14665 }, { "epoch": 0.8757163323782235, "grad_norm": 0.47404783964157104, "learning_rate": 3.389765587514258e-05, "loss": 0.515, "mean_token_accuracy": 0.8338005423545838, "num_tokens": 491506897.0, "step": 14670 }, { "epoch": 0.8760148042024832, "grad_norm": 0.5179705023765564, "learning_rate": 3.3887009098275006e-05, "loss": 0.5422, "mean_token_accuracy": 0.8266849517822266, "num_tokens": 491674577.0, "step": 14675 }, { "epoch": 0.876313276026743, "grad_norm": 0.5372985005378723, "learning_rate": 3.387636076573348e-05, "loss": 0.5455, "mean_token_accuracy": 0.8258857250213623, "num_tokens": 491842257.0, "step": 14680 }, { "epoch": 0.8766117478510028, "grad_norm": 0.5171255469322205, "learning_rate": 3.3865710880111594e-05, "loss": 0.5533, "mean_token_accuracy": 0.8231241703033447, "num_tokens": 492009937.0, "step": 14685 }, { "epoch": 0.8769102196752626, "grad_norm": 0.4769144654273987, "learning_rate": 3.385505944400332e-05, "loss": 0.5268, "mean_token_accuracy": 0.830776572227478, "num_tokens": 492177617.0, "step": 14690 }, { "epoch": 0.8772086914995224, "grad_norm": 0.47414425015449524, "learning_rate": 3.384440646000303e-05, "loss": 0.51, "mean_token_accuracy": 0.8342717409133911, "num_tokens": 492345297.0, "step": 14695 }, { "epoch": 0.8775071633237822, "grad_norm": 0.4481916129589081, "learning_rate": 3.383375193070543e-05, "loss": 0.5256, "mean_token_accuracy": 0.8328343033790588, "num_tokens": 492512977.0, "step": 14700 }, { "epoch": 0.877805635148042, "grad_norm": 0.49395641684532166, "learning_rate": 3.382309585870564e-05, "loss": 0.5421, "mean_token_accuracy": 0.8277048826217651, "num_tokens": 492680657.0, "step": 14705 }, { "epoch": 0.8781041069723018, "grad_norm": 0.45159032940864563, "learning_rate": 3.3812438246599154e-05, "loss": 0.529, "mean_token_accuracy": 0.8305558919906616, "num_tokens": 492848337.0, "step": 14710 }, { "epoch": 0.8784025787965616, "grad_norm": 0.48325857520103455, "learning_rate": 3.380177909698181e-05, "loss": 0.4791, "mean_token_accuracy": 0.8445902466773987, "num_tokens": 493016017.0, "step": 14715 }, { "epoch": 0.8787010506208214, "grad_norm": 0.44052574038505554, "learning_rate": 3.379111841244985e-05, "loss": 0.4907, "mean_token_accuracy": 0.8406119704246521, "num_tokens": 493183697.0, "step": 14720 }, { "epoch": 0.8789995224450812, "grad_norm": 0.5341507792472839, "learning_rate": 3.378045619559988e-05, "loss": 0.5639, "mean_token_accuracy": 0.8211737990379333, "num_tokens": 493351377.0, "step": 14725 }, { "epoch": 0.879297994269341, "grad_norm": 0.5167533159255981, "learning_rate": 3.3769792449028884e-05, "loss": 0.4826, "mean_token_accuracy": 0.8435106754302979, "num_tokens": 493519057.0, "step": 14730 }, { "epoch": 0.8795964660936008, "grad_norm": 0.5037776827812195, "learning_rate": 3.37591271753342e-05, "loss": 0.5282, "mean_token_accuracy": 0.8321126103401184, "num_tokens": 493686737.0, "step": 14735 }, { "epoch": 0.8798949379178606, "grad_norm": 0.4913686513900757, "learning_rate": 3.3748460377113584e-05, "loss": 0.4983, "mean_token_accuracy": 0.8401765346527099, "num_tokens": 493854417.0, "step": 14740 }, { "epoch": 0.8801934097421203, "grad_norm": 0.5046586394309998, "learning_rate": 3.3737792056965086e-05, "loss": 0.5086, "mean_token_accuracy": 0.8373255372047425, "num_tokens": 494022097.0, "step": 14745 }, { "epoch": 0.8804918815663801, "grad_norm": 0.5026734471321106, "learning_rate": 3.372712221748721e-05, "loss": 0.5453, "mean_token_accuracy": 0.8256411671638488, "num_tokens": 494189777.0, "step": 14750 }, { "epoch": 0.8807903533906399, "grad_norm": 0.5048609972000122, "learning_rate": 3.371645086127877e-05, "loss": 0.4984, "mean_token_accuracy": 0.8392878532409668, "num_tokens": 494357457.0, "step": 14755 }, { "epoch": 0.8810888252148997, "grad_norm": 0.5479912757873535, "learning_rate": 3.370577799093899e-05, "loss": 0.5376, "mean_token_accuracy": 0.8284614205360412, "num_tokens": 494518018.0, "step": 14760 }, { "epoch": 0.8813872970391595, "grad_norm": 0.44558560848236084, "learning_rate": 3.3695103609067427e-05, "loss": 0.5714, "mean_token_accuracy": 0.8166587233543396, "num_tokens": 494685698.0, "step": 14765 }, { "epoch": 0.8816857688634193, "grad_norm": 0.4608181118965149, "learning_rate": 3.368442771826403e-05, "loss": 0.459, "mean_token_accuracy": 0.8502147197723389, "num_tokens": 494853378.0, "step": 14770 }, { "epoch": 0.8819842406876791, "grad_norm": 0.5255980491638184, "learning_rate": 3.367375032112911e-05, "loss": 0.5358, "mean_token_accuracy": 0.8271340608596802, "num_tokens": 495021009.0, "step": 14775 }, { "epoch": 0.8822827125119389, "grad_norm": 0.5068820714950562, "learning_rate": 3.366307142026335e-05, "loss": 0.5608, "mean_token_accuracy": 0.8233090877532959, "num_tokens": 495188689.0, "step": 14780 }, { "epoch": 0.8825811843361987, "grad_norm": 0.5439766049385071, "learning_rate": 3.365239101826779e-05, "loss": 0.5311, "mean_token_accuracy": 0.8304664254188537, "num_tokens": 495356369.0, "step": 14785 }, { "epoch": 0.8828796561604585, "grad_norm": 0.5392639636993408, "learning_rate": 3.364170911774384e-05, "loss": 0.51, "mean_token_accuracy": 0.8345580458641052, "num_tokens": 495524049.0, "step": 14790 }, { "epoch": 0.8831781279847183, "grad_norm": 0.4509327709674835, "learning_rate": 3.3631025721293256e-05, "loss": 0.5112, "mean_token_accuracy": 0.8354348063468933, "num_tokens": 495691729.0, "step": 14795 }, { "epoch": 0.8834765998089781, "grad_norm": 0.5160630941390991, "learning_rate": 3.362034083151821e-05, "loss": 0.554, "mean_token_accuracy": 0.8232375025749207, "num_tokens": 495859409.0, "step": 14800 }, { "epoch": 0.8837750716332379, "grad_norm": 0.47062546014785767, "learning_rate": 3.360965445102118e-05, "loss": 0.4815, "mean_token_accuracy": 0.843033516407013, "num_tokens": 496027089.0, "step": 14805 }, { "epoch": 0.8840735434574976, "grad_norm": 0.5308709144592285, "learning_rate": 3.359896658240503e-05, "loss": 0.5249, "mean_token_accuracy": 0.8304843068122864, "num_tokens": 496194769.0, "step": 14810 }, { "epoch": 0.8843720152817574, "grad_norm": 0.5363506078720093, "learning_rate": 3.3588277228273e-05, "loss": 0.5065, "mean_token_accuracy": 0.8359596729278564, "num_tokens": 496362449.0, "step": 14815 }, { "epoch": 0.8846704871060171, "grad_norm": 0.5338091850280762, "learning_rate": 3.357758639122867e-05, "loss": 0.5453, "mean_token_accuracy": 0.8241619825363159, "num_tokens": 496530129.0, "step": 14820 }, { "epoch": 0.8849689589302769, "grad_norm": 0.5140828490257263, "learning_rate": 3.356689407387601e-05, "loss": 0.5149, "mean_token_accuracy": 0.8358046054840088, "num_tokens": 496697809.0, "step": 14825 }, { "epoch": 0.8852674307545367, "grad_norm": 0.5231084823608398, "learning_rate": 3.355620027881929e-05, "loss": 0.5737, "mean_token_accuracy": 0.8169211506843567, "num_tokens": 496865489.0, "step": 14830 }, { "epoch": 0.8855659025787965, "grad_norm": 0.4676034450531006, "learning_rate": 3.354550500866322e-05, "loss": 0.5274, "mean_token_accuracy": 0.8310091853141784, "num_tokens": 497033169.0, "step": 14835 }, { "epoch": 0.8858643744030563, "grad_norm": 0.490128755569458, "learning_rate": 3.353480826601283e-05, "loss": 0.5775, "mean_token_accuracy": 0.8170165896415711, "num_tokens": 497200849.0, "step": 14840 }, { "epoch": 0.8861628462273161, "grad_norm": 0.501987874507904, "learning_rate": 3.352411005347348e-05, "loss": 0.5378, "mean_token_accuracy": 0.8287725210189819, "num_tokens": 497368529.0, "step": 14845 }, { "epoch": 0.8864613180515759, "grad_norm": 0.4809821546077728, "learning_rate": 3.3513410373650943e-05, "loss": 0.5784, "mean_token_accuracy": 0.8153644442558289, "num_tokens": 497536209.0, "step": 14850 }, { "epoch": 0.8867597898758357, "grad_norm": 0.5100000500679016, "learning_rate": 3.350270922915131e-05, "loss": 0.4947, "mean_token_accuracy": 0.8402660012245178, "num_tokens": 497703889.0, "step": 14855 }, { "epoch": 0.8870582617000955, "grad_norm": 0.4943197965621948, "learning_rate": 3.3492006622581055e-05, "loss": 0.5804, "mean_token_accuracy": 0.8168376445770263, "num_tokens": 497871569.0, "step": 14860 }, { "epoch": 0.8873567335243553, "grad_norm": 0.4699857532978058, "learning_rate": 3.348130255654698e-05, "loss": 0.4959, "mean_token_accuracy": 0.8403614521026611, "num_tokens": 498039249.0, "step": 14865 }, { "epoch": 0.8876552053486151, "grad_norm": 0.5275648236274719, "learning_rate": 3.347059703365627e-05, "loss": 0.5614, "mean_token_accuracy": 0.8211976647377014, "num_tokens": 498206929.0, "step": 14870 }, { "epoch": 0.8879536771728749, "grad_norm": 0.44864362478256226, "learning_rate": 3.345989005651645e-05, "loss": 0.4876, "mean_token_accuracy": 0.8426637172698974, "num_tokens": 498374609.0, "step": 14875 }, { "epoch": 0.8882521489971347, "grad_norm": 0.5722591280937195, "learning_rate": 3.34491816277354e-05, "loss": 0.5269, "mean_token_accuracy": 0.830400812625885, "num_tokens": 498542289.0, "step": 14880 }, { "epoch": 0.8885506208213945, "grad_norm": 0.470558762550354, "learning_rate": 3.343847174992135e-05, "loss": 0.5291, "mean_token_accuracy": 0.8319873571395874, "num_tokens": 498709969.0, "step": 14885 }, { "epoch": 0.8888490926456543, "grad_norm": 0.47396644949913025, "learning_rate": 3.342776042568289e-05, "loss": 0.4856, "mean_token_accuracy": 0.8426637291908264, "num_tokens": 498877649.0, "step": 14890 }, { "epoch": 0.889147564469914, "grad_norm": 0.47508078813552856, "learning_rate": 3.341704765762897e-05, "loss": 0.5014, "mean_token_accuracy": 0.838208281993866, "num_tokens": 499045329.0, "step": 14895 }, { "epoch": 0.8894460362941738, "grad_norm": 0.49763280153274536, "learning_rate": 3.340633344836887e-05, "loss": 0.5244, "mean_token_accuracy": 0.832118570804596, "num_tokens": 499213009.0, "step": 14900 }, { "epoch": 0.8897445081184336, "grad_norm": 0.5509679913520813, "learning_rate": 3.3395617800512254e-05, "loss": 0.5132, "mean_token_accuracy": 0.8350829005241394, "num_tokens": 499380689.0, "step": 14905 }, { "epoch": 0.8900429799426934, "grad_norm": 0.5063443183898926, "learning_rate": 3.338490071666908e-05, "loss": 0.5612, "mean_token_accuracy": 0.822456157207489, "num_tokens": 499548369.0, "step": 14910 }, { "epoch": 0.8903414517669532, "grad_norm": 0.5468242168426514, "learning_rate": 3.337418219944974e-05, "loss": 0.5233, "mean_token_accuracy": 0.8337766885757446, "num_tokens": 499716049.0, "step": 14915 }, { "epoch": 0.890639923591213, "grad_norm": 0.6402581930160522, "learning_rate": 3.336346225146488e-05, "loss": 0.5352, "mean_token_accuracy": 0.8274354934692383, "num_tokens": 499875788.0, "step": 14920 }, { "epoch": 0.8909383954154728, "grad_norm": 0.5565267205238342, "learning_rate": 3.3352740875325575e-05, "loss": 0.5557, "mean_token_accuracy": 0.8220744252204895, "num_tokens": 500043468.0, "step": 14925 }, { "epoch": 0.8912368672397326, "grad_norm": 0.55153489112854, "learning_rate": 3.334201807364319e-05, "loss": 0.4965, "mean_token_accuracy": 0.8384945750236511, "num_tokens": 500211148.0, "step": 14930 }, { "epoch": 0.8915353390639924, "grad_norm": 0.44260117411613464, "learning_rate": 3.333129384902948e-05, "loss": 0.4923, "mean_token_accuracy": 0.8412859201431274, "num_tokens": 500378828.0, "step": 14935 }, { "epoch": 0.8918338108882522, "grad_norm": 0.5871852040290833, "learning_rate": 3.332056820409651e-05, "loss": 0.5454, "mean_token_accuracy": 0.8241441011428833, "num_tokens": 500546508.0, "step": 14940 }, { "epoch": 0.892132282712512, "grad_norm": 0.48628076910972595, "learning_rate": 3.330984114145673e-05, "loss": 0.5391, "mean_token_accuracy": 0.8266790032386779, "num_tokens": 500714188.0, "step": 14945 }, { "epoch": 0.8924307545367717, "grad_norm": 0.5118081569671631, "learning_rate": 3.32991126637229e-05, "loss": 0.5234, "mean_token_accuracy": 0.8316950917243957, "num_tokens": 500881868.0, "step": 14950 }, { "epoch": 0.8927292263610315, "grad_norm": 0.4748677611351013, "learning_rate": 3.3288382773508145e-05, "loss": 0.5074, "mean_token_accuracy": 0.8354825258255005, "num_tokens": 501049548.0, "step": 14955 }, { "epoch": 0.8930276981852913, "grad_norm": 0.4990309476852417, "learning_rate": 3.3277651473425905e-05, "loss": 0.5123, "mean_token_accuracy": 0.8350590467453003, "num_tokens": 501217228.0, "step": 14960 }, { "epoch": 0.8933261700095511, "grad_norm": 0.47078853845596313, "learning_rate": 3.326691876609002e-05, "loss": 0.4982, "mean_token_accuracy": 0.8375462174415589, "num_tokens": 501384908.0, "step": 14965 }, { "epoch": 0.8936246418338109, "grad_norm": 0.4765874445438385, "learning_rate": 3.325618465411463e-05, "loss": 0.5112, "mean_token_accuracy": 0.8348443269729614, "num_tokens": 501552588.0, "step": 14970 }, { "epoch": 0.8939231136580706, "grad_norm": 0.6424722075462341, "learning_rate": 3.324544914011421e-05, "loss": 0.5201, "mean_token_accuracy": 0.8334903955459595, "num_tokens": 501720268.0, "step": 14975 }, { "epoch": 0.8942215854823304, "grad_norm": 0.5706070065498352, "learning_rate": 3.323471222670361e-05, "loss": 0.526, "mean_token_accuracy": 0.8302039742469788, "num_tokens": 501887948.0, "step": 14980 }, { "epoch": 0.8945200573065902, "grad_norm": 0.5030493140220642, "learning_rate": 3.322397391649799e-05, "loss": 0.5254, "mean_token_accuracy": 0.8320827841758728, "num_tokens": 502055628.0, "step": 14985 }, { "epoch": 0.89481852913085, "grad_norm": 0.44015440344810486, "learning_rate": 3.3213234212112875e-05, "loss": 0.495, "mean_token_accuracy": 0.8401825070381165, "num_tokens": 502223308.0, "step": 14990 }, { "epoch": 0.8951170009551098, "grad_norm": 0.5025378465652466, "learning_rate": 3.3202493116164104e-05, "loss": 0.5521, "mean_token_accuracy": 0.8257246851921082, "num_tokens": 502390988.0, "step": 14995 }, { "epoch": 0.8954154727793696, "grad_norm": 0.4992840588092804, "learning_rate": 3.319175063126789e-05, "loss": 0.5267, "mean_token_accuracy": 0.8307884931564331, "num_tokens": 502558668.0, "step": 15000 }, { "epoch": 0.8957139446036294, "grad_norm": 0.5053568482398987, "learning_rate": 3.318100676004075e-05, "loss": 0.5199, "mean_token_accuracy": 0.8335261702537536, "num_tokens": 502726348.0, "step": 15005 }, { "epoch": 0.8960124164278892, "grad_norm": 0.5102866291999817, "learning_rate": 3.317026150509955e-05, "loss": 0.5464, "mean_token_accuracy": 0.8258201122283936, "num_tokens": 502894028.0, "step": 15010 }, { "epoch": 0.896310888252149, "grad_norm": 0.569139838218689, "learning_rate": 3.315951486906151e-05, "loss": 0.5155, "mean_token_accuracy": 0.8341941952705383, "num_tokens": 503061708.0, "step": 15015 }, { "epoch": 0.8966093600764088, "grad_norm": 0.5200099349021912, "learning_rate": 3.314876685454415e-05, "loss": 0.527, "mean_token_accuracy": 0.8316772103309631, "num_tokens": 503229388.0, "step": 15020 }, { "epoch": 0.8969078319006686, "grad_norm": 0.5570427179336548, "learning_rate": 3.313801746416536e-05, "loss": 0.5383, "mean_token_accuracy": 0.8284444689750672, "num_tokens": 503397068.0, "step": 15025 }, { "epoch": 0.8972063037249284, "grad_norm": 0.4885665774345398, "learning_rate": 3.312726670054335e-05, "loss": 0.4869, "mean_token_accuracy": 0.8417809844017029, "num_tokens": 503564748.0, "step": 15030 }, { "epoch": 0.8975047755491882, "grad_norm": 0.6092052459716797, "learning_rate": 3.311651456629667e-05, "loss": 0.5227, "mean_token_accuracy": 0.8317249178886413, "num_tokens": 503732428.0, "step": 15035 }, { "epoch": 0.897803247373448, "grad_norm": 0.5439836382865906, "learning_rate": 3.31057610640442e-05, "loss": 0.5753, "mean_token_accuracy": 0.8178217768669128, "num_tokens": 503900108.0, "step": 15040 }, { "epoch": 0.8981017191977078, "grad_norm": 0.4790586531162262, "learning_rate": 3.309500619640515e-05, "loss": 0.5009, "mean_token_accuracy": 0.838846480846405, "num_tokens": 504067788.0, "step": 15045 }, { "epoch": 0.8984001910219676, "grad_norm": 0.49048352241516113, "learning_rate": 3.308424996599906e-05, "loss": 0.5148, "mean_token_accuracy": 0.8354288339614868, "num_tokens": 504235468.0, "step": 15050 }, { "epoch": 0.8986986628462273, "grad_norm": 0.44867396354675293, "learning_rate": 3.3073492375445835e-05, "loss": 0.5035, "mean_token_accuracy": 0.8378921508789062, "num_tokens": 504403148.0, "step": 15055 }, { "epoch": 0.8989971346704871, "grad_norm": 0.4918707609176636, "learning_rate": 3.3062733427365664e-05, "loss": 0.5196, "mean_token_accuracy": 0.8321305036544799, "num_tokens": 504570828.0, "step": 15060 }, { "epoch": 0.8992956064947469, "grad_norm": 0.5182741284370422, "learning_rate": 3.305197312437908e-05, "loss": 0.5309, "mean_token_accuracy": 0.8293630003929138, "num_tokens": 504738508.0, "step": 15065 }, { "epoch": 0.8995940783190067, "grad_norm": 0.4851759076118469, "learning_rate": 3.3041211469106966e-05, "loss": 0.522, "mean_token_accuracy": 0.8325301289558411, "num_tokens": 504906188.0, "step": 15070 }, { "epoch": 0.8998925501432665, "grad_norm": 0.4884530007839203, "learning_rate": 3.3030448464170524e-05, "loss": 0.5307, "mean_token_accuracy": 0.8286472678184509, "num_tokens": 505073868.0, "step": 15075 }, { "epoch": 0.9001910219675263, "grad_norm": 0.5374017357826233, "learning_rate": 3.301968411219128e-05, "loss": 0.5587, "mean_token_accuracy": 0.8218418240547181, "num_tokens": 505241548.0, "step": 15080 }, { "epoch": 0.9004894937917861, "grad_norm": 0.4759073555469513, "learning_rate": 3.3008918415791083e-05, "loss": 0.4903, "mean_token_accuracy": 0.8395144939422607, "num_tokens": 505409228.0, "step": 15085 }, { "epoch": 0.9007879656160458, "grad_norm": 0.4784984290599823, "learning_rate": 3.299815137759213e-05, "loss": 0.4691, "mean_token_accuracy": 0.848395574092865, "num_tokens": 505576908.0, "step": 15090 }, { "epoch": 0.9010864374403056, "grad_norm": 0.48732420802116394, "learning_rate": 3.298738300021692e-05, "loss": 0.5313, "mean_token_accuracy": 0.8292556285858155, "num_tokens": 505744588.0, "step": 15095 }, { "epoch": 0.9013849092645654, "grad_norm": 0.4827894866466522, "learning_rate": 3.29766132862883e-05, "loss": 0.5146, "mean_token_accuracy": 0.8375394463539123, "num_tokens": 505909536.0, "step": 15100 }, { "epoch": 0.9016833810888252, "grad_norm": 0.5672697424888611, "learning_rate": 3.296584223842942e-05, "loss": 0.5543, "mean_token_accuracy": 0.8234224081039428, "num_tokens": 506077216.0, "step": 15105 }, { "epoch": 0.901981852913085, "grad_norm": 0.5556638240814209, "learning_rate": 3.295506985926377e-05, "loss": 0.5398, "mean_token_accuracy": 0.829535973072052, "num_tokens": 506244896.0, "step": 15110 }, { "epoch": 0.9022803247373448, "grad_norm": 0.5258305072784424, "learning_rate": 3.294429615141518e-05, "loss": 0.4894, "mean_token_accuracy": 0.8422641038894654, "num_tokens": 506412576.0, "step": 15115 }, { "epoch": 0.9025787965616046, "grad_norm": 0.5012132525444031, "learning_rate": 3.2933521117507766e-05, "loss": 0.5078, "mean_token_accuracy": 0.836454713344574, "num_tokens": 506580256.0, "step": 15120 }, { "epoch": 0.9028772683858644, "grad_norm": 0.4982709586620331, "learning_rate": 3.2922744760165994e-05, "loss": 0.5279, "mean_token_accuracy": 0.8287367343902587, "num_tokens": 506747936.0, "step": 15125 }, { "epoch": 0.9031757402101241, "grad_norm": 0.4942888915538788, "learning_rate": 3.291196708201464e-05, "loss": 0.5376, "mean_token_accuracy": 0.8272038698196411, "num_tokens": 506915616.0, "step": 15130 }, { "epoch": 0.9034742120343839, "grad_norm": 0.5942230820655823, "learning_rate": 3.290118808567881e-05, "loss": 0.5732, "mean_token_accuracy": 0.8190623998641968, "num_tokens": 507083296.0, "step": 15135 }, { "epoch": 0.9037726838586437, "grad_norm": 0.48611149191856384, "learning_rate": 3.2890407773783935e-05, "loss": 0.5161, "mean_token_accuracy": 0.8341763138771057, "num_tokens": 507250976.0, "step": 15140 }, { "epoch": 0.9040711556829035, "grad_norm": 0.47812941670417786, "learning_rate": 3.2879626148955745e-05, "loss": 0.5482, "mean_token_accuracy": 0.8242932200431824, "num_tokens": 507418656.0, "step": 15145 }, { "epoch": 0.9043696275071633, "grad_norm": 0.4972628355026245, "learning_rate": 3.2868843213820305e-05, "loss": 0.4946, "mean_token_accuracy": 0.8406954526901245, "num_tokens": 507586336.0, "step": 15150 }, { "epoch": 0.9046680993314231, "grad_norm": 0.4947787821292877, "learning_rate": 3.285805897100401e-05, "loss": 0.5503, "mean_token_accuracy": 0.8235118746757507, "num_tokens": 507754016.0, "step": 15155 }, { "epoch": 0.9049665711556829, "grad_norm": 0.5631944537162781, "learning_rate": 3.2847273423133546e-05, "loss": 0.553, "mean_token_accuracy": 0.8239114880561829, "num_tokens": 507921696.0, "step": 15160 }, { "epoch": 0.9052650429799427, "grad_norm": 0.47290360927581787, "learning_rate": 3.283648657283595e-05, "loss": 0.5292, "mean_token_accuracy": 0.829023027420044, "num_tokens": 508089376.0, "step": 15165 }, { "epoch": 0.9055635148042025, "grad_norm": 0.6200077533721924, "learning_rate": 3.2825698422738546e-05, "loss": 0.5008, "mean_token_accuracy": 0.838834547996521, "num_tokens": 508257056.0, "step": 15170 }, { "epoch": 0.9058619866284623, "grad_norm": 0.483733206987381, "learning_rate": 3.281490897546899e-05, "loss": 0.5112, "mean_token_accuracy": 0.8374030828475952, "num_tokens": 508424736.0, "step": 15175 }, { "epoch": 0.9061604584527221, "grad_norm": 0.46810218691825867, "learning_rate": 3.280411823365525e-05, "loss": 0.5169, "mean_token_accuracy": 0.8342836618423461, "num_tokens": 508592416.0, "step": 15180 }, { "epoch": 0.9064589302769819, "grad_norm": 0.565943717956543, "learning_rate": 3.2793326199925615e-05, "loss": 0.5454, "mean_token_accuracy": 0.8266611099243164, "num_tokens": 508760096.0, "step": 15185 }, { "epoch": 0.9067574021012417, "grad_norm": 0.5199659466743469, "learning_rate": 3.2782532876908674e-05, "loss": 0.5387, "mean_token_accuracy": 0.8281760811805725, "num_tokens": 508927776.0, "step": 15190 }, { "epoch": 0.9070558739255015, "grad_norm": 0.5202977061271667, "learning_rate": 3.2771738267233346e-05, "loss": 0.4923, "mean_token_accuracy": 0.8416557312011719, "num_tokens": 509095456.0, "step": 15195 }, { "epoch": 0.9073543457497613, "grad_norm": 0.5077999234199524, "learning_rate": 3.2760942373528867e-05, "loss": 0.537, "mean_token_accuracy": 0.8274662971496582, "num_tokens": 509263136.0, "step": 15200 }, { "epoch": 0.907652817574021, "grad_norm": 0.5351226329803467, "learning_rate": 3.275014519842476e-05, "loss": 0.5569, "mean_token_accuracy": 0.8228319168090821, "num_tokens": 509430816.0, "step": 15205 }, { "epoch": 0.9079512893982808, "grad_norm": 0.50678950548172, "learning_rate": 3.273934674455088e-05, "loss": 0.5075, "mean_token_accuracy": 0.8355600595474243, "num_tokens": 509598496.0, "step": 15210 }, { "epoch": 0.9082497612225406, "grad_norm": 0.556103527545929, "learning_rate": 3.2728547014537395e-05, "loss": 0.5811, "mean_token_accuracy": 0.8161576867103577, "num_tokens": 509766176.0, "step": 15215 }, { "epoch": 0.9085482330468004, "grad_norm": 0.49839338660240173, "learning_rate": 3.271774601101478e-05, "loss": 0.5233, "mean_token_accuracy": 0.8324048638343811, "num_tokens": 509933856.0, "step": 15220 }, { "epoch": 0.9088467048710601, "grad_norm": 0.5003037452697754, "learning_rate": 3.270694373661382e-05, "loss": 0.5239, "mean_token_accuracy": 0.8318203449249267, "num_tokens": 510101536.0, "step": 15225 }, { "epoch": 0.9091451766953199, "grad_norm": 0.5407162308692932, "learning_rate": 3.26961401939656e-05, "loss": 0.5343, "mean_token_accuracy": 0.8290170431137085, "num_tokens": 510269216.0, "step": 15230 }, { "epoch": 0.9094436485195797, "grad_norm": 0.5040706992149353, "learning_rate": 3.268533538570153e-05, "loss": 0.5319, "mean_token_accuracy": 0.830388879776001, "num_tokens": 510436896.0, "step": 15235 }, { "epoch": 0.9097421203438395, "grad_norm": 0.5021921992301941, "learning_rate": 3.2674529314453304e-05, "loss": 0.5282, "mean_token_accuracy": 0.8309733867645264, "num_tokens": 510604576.0, "step": 15240 }, { "epoch": 0.9100405921680993, "grad_norm": 0.4788692593574524, "learning_rate": 3.266372198285296e-05, "loss": 0.5191, "mean_token_accuracy": 0.8329297423362731, "num_tokens": 510772256.0, "step": 15245 }, { "epoch": 0.9103390639923591, "grad_norm": 0.47935912013053894, "learning_rate": 3.265291339353282e-05, "loss": 0.5138, "mean_token_accuracy": 0.8348860740661621, "num_tokens": 510939936.0, "step": 15250 }, { "epoch": 0.9106375358166189, "grad_norm": 0.5384992361068726, "learning_rate": 3.264210354912551e-05, "loss": 0.5721, "mean_token_accuracy": 0.8200584530830384, "num_tokens": 511107616.0, "step": 15255 }, { "epoch": 0.9109360076408787, "grad_norm": 0.5259842872619629, "learning_rate": 3.263129245226396e-05, "loss": 0.4942, "mean_token_accuracy": 0.8408326387405396, "num_tokens": 511275296.0, "step": 15260 }, { "epoch": 0.9112344794651385, "grad_norm": 0.5167257189750671, "learning_rate": 3.262048010558143e-05, "loss": 0.5191, "mean_token_accuracy": 0.8357509255409241, "num_tokens": 511442976.0, "step": 15265 }, { "epoch": 0.9115329512893983, "grad_norm": 0.4853335916996002, "learning_rate": 3.260966651171145e-05, "loss": 0.5279, "mean_token_accuracy": 0.8301801323890686, "num_tokens": 511610656.0, "step": 15270 }, { "epoch": 0.9118314231136581, "grad_norm": 0.5171031951904297, "learning_rate": 3.259885167328788e-05, "loss": 0.5494, "mean_token_accuracy": 0.8258797645568847, "num_tokens": 511778336.0, "step": 15275 }, { "epoch": 0.9121298949379179, "grad_norm": 0.5895015001296997, "learning_rate": 3.258803559294487e-05, "loss": 0.5226, "mean_token_accuracy": 0.832004714012146, "num_tokens": 511943286.0, "step": 15280 }, { "epoch": 0.9124283667621776, "grad_norm": 0.49766111373901367, "learning_rate": 3.257721827331688e-05, "loss": 0.504, "mean_token_accuracy": 0.8365501642227173, "num_tokens": 512110966.0, "step": 15285 }, { "epoch": 0.9127268385864374, "grad_norm": 0.4960426390171051, "learning_rate": 3.256639971703866e-05, "loss": 0.5209, "mean_token_accuracy": 0.8328641295433045, "num_tokens": 512278646.0, "step": 15290 }, { "epoch": 0.9130253104106972, "grad_norm": 0.49136650562286377, "learning_rate": 3.255557992674527e-05, "loss": 0.4948, "mean_token_accuracy": 0.8410950779914856, "num_tokens": 512446326.0, "step": 15295 }, { "epoch": 0.913323782234957, "grad_norm": 0.4935888648033142, "learning_rate": 3.254475890507209e-05, "loss": 0.5305, "mean_token_accuracy": 0.8296314001083374, "num_tokens": 512614006.0, "step": 15300 }, { "epoch": 0.9136222540592168, "grad_norm": 1.0330394506454468, "learning_rate": 3.253393665465475e-05, "loss": 0.532, "mean_token_accuracy": 0.8303053855895997, "num_tokens": 512781686.0, "step": 15305 }, { "epoch": 0.9139207258834766, "grad_norm": 0.4904855787754059, "learning_rate": 3.252311317812923e-05, "loss": 0.5296, "mean_token_accuracy": 0.8292735338211059, "num_tokens": 512949366.0, "step": 15310 }, { "epoch": 0.9142191977077364, "grad_norm": 0.45952892303466797, "learning_rate": 3.251228847813177e-05, "loss": 0.4818, "mean_token_accuracy": 0.8445723414421081, "num_tokens": 513117046.0, "step": 15315 }, { "epoch": 0.9145176695319962, "grad_norm": 0.4795607328414917, "learning_rate": 3.250146255729895e-05, "loss": 0.5051, "mean_token_accuracy": 0.8371108174324036, "num_tokens": 513284726.0, "step": 15320 }, { "epoch": 0.914816141356256, "grad_norm": 0.5202341675758362, "learning_rate": 3.2490635418267586e-05, "loss": 0.5407, "mean_token_accuracy": 0.8274484038352966, "num_tokens": 513452406.0, "step": 15325 }, { "epoch": 0.9151146131805158, "grad_norm": 0.5209221243858337, "learning_rate": 3.247980706367487e-05, "loss": 0.5186, "mean_token_accuracy": 0.8329595565795899, "num_tokens": 513620086.0, "step": 15330 }, { "epoch": 0.9154130850047756, "grad_norm": 0.485456258058548, "learning_rate": 3.246897749615821e-05, "loss": 0.5578, "mean_token_accuracy": 0.8226350903511047, "num_tokens": 513787766.0, "step": 15335 }, { "epoch": 0.9157115568290354, "grad_norm": 0.5021275281906128, "learning_rate": 3.2458146718355374e-05, "loss": 0.5183, "mean_token_accuracy": 0.8338244080543518, "num_tokens": 513955446.0, "step": 15340 }, { "epoch": 0.9160100286532952, "grad_norm": 0.5215170979499817, "learning_rate": 3.244731473290438e-05, "loss": 0.5359, "mean_token_accuracy": 0.8298341870307923, "num_tokens": 514123126.0, "step": 15345 }, { "epoch": 0.916308500477555, "grad_norm": 0.5435329675674438, "learning_rate": 3.2436481542443574e-05, "loss": 0.5249, "mean_token_accuracy": 0.8318740367889405, "num_tokens": 514290806.0, "step": 15350 }, { "epoch": 0.9166069723018148, "grad_norm": 0.5269924998283386, "learning_rate": 3.2425647149611564e-05, "loss": 0.5777, "mean_token_accuracy": 0.81725515127182, "num_tokens": 514458486.0, "step": 15355 }, { "epoch": 0.9169054441260746, "grad_norm": 0.5256832242012024, "learning_rate": 3.2414811557047284e-05, "loss": 0.5071, "mean_token_accuracy": 0.83711678981781, "num_tokens": 514626166.0, "step": 15360 }, { "epoch": 0.9172039159503342, "grad_norm": 0.48518332839012146, "learning_rate": 3.240397476738993e-05, "loss": 0.4918, "mean_token_accuracy": 0.8411547183990479, "num_tokens": 514793846.0, "step": 15365 }, { "epoch": 0.917502387774594, "grad_norm": 0.5374515056610107, "learning_rate": 3.2393136783279e-05, "loss": 0.5411, "mean_token_accuracy": 0.825426459312439, "num_tokens": 514961526.0, "step": 15370 }, { "epoch": 0.9178008595988538, "grad_norm": 0.4761549234390259, "learning_rate": 3.238229760735429e-05, "loss": 0.4947, "mean_token_accuracy": 0.8396695733070374, "num_tokens": 515129206.0, "step": 15375 }, { "epoch": 0.9180993314231136, "grad_norm": 0.465145081281662, "learning_rate": 3.2371457242255884e-05, "loss": 0.4703, "mean_token_accuracy": 0.8463378190994263, "num_tokens": 515296886.0, "step": 15380 }, { "epoch": 0.9183978032473734, "grad_norm": 0.5342923998832703, "learning_rate": 3.2360615690624144e-05, "loss": 0.5286, "mean_token_accuracy": 0.8328283309936524, "num_tokens": 515464566.0, "step": 15385 }, { "epoch": 0.9186962750716332, "grad_norm": 0.526098370552063, "learning_rate": 3.234977295509974e-05, "loss": 0.4981, "mean_token_accuracy": 0.8372599124908447, "num_tokens": 515632246.0, "step": 15390 }, { "epoch": 0.918994746895893, "grad_norm": 0.46639159321784973, "learning_rate": 3.2338929038323615e-05, "loss": 0.5273, "mean_token_accuracy": 0.8287307620048523, "num_tokens": 515799926.0, "step": 15395 }, { "epoch": 0.9192932187201528, "grad_norm": 0.5124784111976624, "learning_rate": 3.232808394293701e-05, "loss": 0.5627, "mean_token_accuracy": 0.8195395469665527, "num_tokens": 515967606.0, "step": 15400 }, { "epoch": 0.9195916905444126, "grad_norm": 0.5939876437187195, "learning_rate": 3.231723767158144e-05, "loss": 0.5363, "mean_token_accuracy": 0.8277108550071717, "num_tokens": 516135286.0, "step": 15405 }, { "epoch": 0.9198901623686724, "grad_norm": 0.4634951055049896, "learning_rate": 3.230639022689871e-05, "loss": 0.5402, "mean_token_accuracy": 0.8277406573295594, "num_tokens": 516302966.0, "step": 15410 }, { "epoch": 0.9201886341929322, "grad_norm": 0.5093323588371277, "learning_rate": 3.229554161153094e-05, "loss": 0.5303, "mean_token_accuracy": 0.8295717358589172, "num_tokens": 516470646.0, "step": 15415 }, { "epoch": 0.920487106017192, "grad_norm": 0.5386112332344055, "learning_rate": 3.228469182812049e-05, "loss": 0.5675, "mean_token_accuracy": 0.8193844676017761, "num_tokens": 516638326.0, "step": 15420 }, { "epoch": 0.9207855778414518, "grad_norm": 0.48846355080604553, "learning_rate": 3.227384087931003e-05, "loss": 0.4985, "mean_token_accuracy": 0.8384408950805664, "num_tokens": 516806006.0, "step": 15425 }, { "epoch": 0.9210840496657116, "grad_norm": 0.569942057132721, "learning_rate": 3.226298876774251e-05, "loss": 0.5326, "mean_token_accuracy": 0.8298818945884705, "num_tokens": 516973686.0, "step": 15430 }, { "epoch": 0.9213825214899714, "grad_norm": 0.5315511226654053, "learning_rate": 3.225213549606116e-05, "loss": 0.5266, "mean_token_accuracy": 0.8307467341423035, "num_tokens": 517141366.0, "step": 15435 }, { "epoch": 0.9216809933142311, "grad_norm": 0.5086844563484192, "learning_rate": 3.224128106690949e-05, "loss": 0.5862, "mean_token_accuracy": 0.8138733267784118, "num_tokens": 517309046.0, "step": 15440 }, { "epoch": 0.9219794651384909, "grad_norm": 0.49994051456451416, "learning_rate": 3.223042548293131e-05, "loss": 0.5465, "mean_token_accuracy": 0.8245019674301147, "num_tokens": 517476726.0, "step": 15445 }, { "epoch": 0.9222779369627507, "grad_norm": 0.5457820296287537, "learning_rate": 3.2219568746770685e-05, "loss": 0.5359, "mean_token_accuracy": 0.826816177368164, "num_tokens": 517644406.0, "step": 15450 }, { "epoch": 0.9225764087870105, "grad_norm": 0.46678459644317627, "learning_rate": 3.2208710861071986e-05, "loss": 0.5475, "mean_token_accuracy": 0.8279732942581177, "num_tokens": 517812086.0, "step": 15455 }, { "epoch": 0.9228748806112703, "grad_norm": 0.5285353660583496, "learning_rate": 3.219785182847983e-05, "loss": 0.5447, "mean_token_accuracy": 0.8274722576141358, "num_tokens": 517979766.0, "step": 15460 }, { "epoch": 0.9231733524355301, "grad_norm": 0.5083525776863098, "learning_rate": 3.218699165163916e-05, "loss": 0.5481, "mean_token_accuracy": 0.8246152877807618, "num_tokens": 518147446.0, "step": 15465 }, { "epoch": 0.9234718242597899, "grad_norm": 0.5395824313163757, "learning_rate": 3.2176130333195155e-05, "loss": 0.5549, "mean_token_accuracy": 0.8230585575103759, "num_tokens": 518315126.0, "step": 15470 }, { "epoch": 0.9237702960840497, "grad_norm": 0.5434145331382751, "learning_rate": 3.2165267875793305e-05, "loss": 0.5257, "mean_token_accuracy": 0.8322080492973327, "num_tokens": 518482806.0, "step": 15475 }, { "epoch": 0.9240687679083095, "grad_norm": 0.5313842296600342, "learning_rate": 3.215440428207934e-05, "loss": 0.5773, "mean_token_accuracy": 0.8145340085029602, "num_tokens": 518650096.0, "step": 15480 }, { "epoch": 0.9243672397325693, "grad_norm": 0.5660425424575806, "learning_rate": 3.2143539554699314e-05, "loss": 0.5672, "mean_token_accuracy": 0.8219372391700744, "num_tokens": 518817776.0, "step": 15485 }, { "epoch": 0.9246657115568291, "grad_norm": 0.49238401651382446, "learning_rate": 3.2132673696299515e-05, "loss": 0.5408, "mean_token_accuracy": 0.8271740436553955, "num_tokens": 518985456.0, "step": 15490 }, { "epoch": 0.9249641833810889, "grad_norm": 0.5256762504577637, "learning_rate": 3.212180670952653e-05, "loss": 0.559, "mean_token_accuracy": 0.8211797595024108, "num_tokens": 519153136.0, "step": 15495 }, { "epoch": 0.9252626552053486, "grad_norm": 0.5064694881439209, "learning_rate": 3.2110938597027204e-05, "loss": 0.5478, "mean_token_accuracy": 0.8261481642723083, "num_tokens": 519320816.0, "step": 15500 }, { "epoch": 0.9255611270296084, "grad_norm": 0.5971306562423706, "learning_rate": 3.210006936144869e-05, "loss": 0.4997, "mean_token_accuracy": 0.83870929479599, "num_tokens": 519488496.0, "step": 15505 }, { "epoch": 0.9258595988538681, "grad_norm": 0.5045725107192993, "learning_rate": 3.208919900543836e-05, "loss": 0.5019, "mean_token_accuracy": 0.8391924142837525, "num_tokens": 519656176.0, "step": 15510 }, { "epoch": 0.9261580706781279, "grad_norm": 0.4632502496242523, "learning_rate": 3.2078327531643903e-05, "loss": 0.5391, "mean_token_accuracy": 0.8304144024848938, "num_tokens": 519822501.0, "step": 15515 }, { "epoch": 0.9264565425023877, "grad_norm": 0.4779451787471771, "learning_rate": 3.206745494271328e-05, "loss": 0.5252, "mean_token_accuracy": 0.8307348132133484, "num_tokens": 519990181.0, "step": 15520 }, { "epoch": 0.9267550143266475, "grad_norm": 0.5667571425437927, "learning_rate": 3.20565812412947e-05, "loss": 0.5341, "mean_token_accuracy": 0.8276976108551025, "num_tokens": 520146945.0, "step": 15525 }, { "epoch": 0.9270534861509073, "grad_norm": 0.5411862730979919, "learning_rate": 3.204570643003665e-05, "loss": 0.5372, "mean_token_accuracy": 0.827317190170288, "num_tokens": 520314625.0, "step": 15530 }, { "epoch": 0.9273519579751671, "grad_norm": 0.5148149132728577, "learning_rate": 3.20348305115879e-05, "loss": 0.4926, "mean_token_accuracy": 0.8397113084793091, "num_tokens": 520482305.0, "step": 15535 }, { "epoch": 0.9276504297994269, "grad_norm": 0.5398635268211365, "learning_rate": 3.2023953488597466e-05, "loss": 0.5338, "mean_token_accuracy": 0.8288023233413696, "num_tokens": 520649985.0, "step": 15540 }, { "epoch": 0.9279489016236867, "grad_norm": 0.4518938362598419, "learning_rate": 3.201307536371466e-05, "loss": 0.5212, "mean_token_accuracy": 0.8311642527580261, "num_tokens": 520817665.0, "step": 15545 }, { "epoch": 0.9282473734479465, "grad_norm": 0.5684276819229126, "learning_rate": 3.200219613958905e-05, "loss": 0.5576, "mean_token_accuracy": 0.8230645418167114, "num_tokens": 520985345.0, "step": 15550 }, { "epoch": 0.9285458452722063, "grad_norm": 0.5751977562904358, "learning_rate": 3.199131581887047e-05, "loss": 0.5656, "mean_token_accuracy": 0.8203149199485779, "num_tokens": 521153025.0, "step": 15555 }, { "epoch": 0.9288443170964661, "grad_norm": 0.569940447807312, "learning_rate": 3.1980434404209024e-05, "loss": 0.5086, "mean_token_accuracy": 0.8360431909561157, "num_tokens": 521320705.0, "step": 15560 }, { "epoch": 0.9291427889207259, "grad_norm": 0.550160825252533, "learning_rate": 3.196955189825508e-05, "loss": 0.5296, "mean_token_accuracy": 0.830013120174408, "num_tokens": 521488385.0, "step": 15565 }, { "epoch": 0.9294412607449857, "grad_norm": 0.48204129934310913, "learning_rate": 3.195866830365927e-05, "loss": 0.5179, "mean_token_accuracy": 0.8323644518852233, "num_tokens": 521652077.0, "step": 15570 }, { "epoch": 0.9297397325692455, "grad_norm": 0.5202948451042175, "learning_rate": 3.194778362307249e-05, "loss": 0.5024, "mean_token_accuracy": 0.8378188848495484, "num_tokens": 521808106.0, "step": 15575 }, { "epoch": 0.9300382043935053, "grad_norm": 0.4583127498626709, "learning_rate": 3.193689785914592e-05, "loss": 0.5315, "mean_token_accuracy": 0.8298461318016053, "num_tokens": 521975786.0, "step": 15580 }, { "epoch": 0.9303366762177651, "grad_norm": 0.5228418707847595, "learning_rate": 3.192601101453098e-05, "loss": 0.5433, "mean_token_accuracy": 0.8251401662826539, "num_tokens": 522143466.0, "step": 15585 }, { "epoch": 0.9306351480420249, "grad_norm": 0.5833495855331421, "learning_rate": 3.191512309187936e-05, "loss": 0.5277, "mean_token_accuracy": 0.830764639377594, "num_tokens": 522311146.0, "step": 15590 }, { "epoch": 0.9309336198662846, "grad_norm": 0.5323729515075684, "learning_rate": 3.190423409384302e-05, "loss": 0.5303, "mean_token_accuracy": 0.8311463713645935, "num_tokens": 522478826.0, "step": 15595 }, { "epoch": 0.9312320916905444, "grad_norm": 0.4731408953666687, "learning_rate": 3.1893344023074176e-05, "loss": 0.538, "mean_token_accuracy": 0.8266505479812623, "num_tokens": 522642120.0, "step": 15600 }, { "epoch": 0.9315305635148042, "grad_norm": 0.5499926209449768, "learning_rate": 3.1882452882225296e-05, "loss": 0.5458, "mean_token_accuracy": 0.8248956203460693, "num_tokens": 522809800.0, "step": 15605 }, { "epoch": 0.931829035339064, "grad_norm": 0.480619877576828, "learning_rate": 3.187156067394913e-05, "loss": 0.5583, "mean_token_accuracy": 0.8228319168090821, "num_tokens": 522977480.0, "step": 15610 }, { "epoch": 0.9321275071633238, "grad_norm": 0.5242154002189636, "learning_rate": 3.1860667400898676e-05, "loss": 0.5567, "mean_token_accuracy": 0.8220445990562439, "num_tokens": 523145160.0, "step": 15615 }, { "epoch": 0.9324259789875836, "grad_norm": 0.47804081439971924, "learning_rate": 3.184977306572718e-05, "loss": 0.5617, "mean_token_accuracy": 0.8214481592178344, "num_tokens": 523312840.0, "step": 15620 }, { "epoch": 0.9327244508118434, "grad_norm": 0.4681434631347656, "learning_rate": 3.1838877671088166e-05, "loss": 0.4854, "mean_token_accuracy": 0.8425921440124512, "num_tokens": 523480520.0, "step": 15625 }, { "epoch": 0.9330229226361032, "grad_norm": 0.495015412569046, "learning_rate": 3.1827981219635415e-05, "loss": 0.5559, "mean_token_accuracy": 0.8237444877624511, "num_tokens": 523648200.0, "step": 15630 }, { "epoch": 0.933321394460363, "grad_norm": 0.5059530735015869, "learning_rate": 3.181708371402295e-05, "loss": 0.5676, "mean_token_accuracy": 0.8229034900665283, "num_tokens": 523815880.0, "step": 15635 }, { "epoch": 0.9336198662846227, "grad_norm": 0.5483002662658691, "learning_rate": 3.180618515690507e-05, "loss": 0.5117, "mean_token_accuracy": 0.8340451002120972, "num_tokens": 523983560.0, "step": 15640 }, { "epoch": 0.9339183381088825, "grad_norm": 0.6235918402671814, "learning_rate": 3.17952855509363e-05, "loss": 0.5436, "mean_token_accuracy": 0.8260825395584106, "num_tokens": 524151240.0, "step": 15645 }, { "epoch": 0.9342168099331423, "grad_norm": 0.4676841199398041, "learning_rate": 3.178438489877145e-05, "loss": 0.5104, "mean_token_accuracy": 0.8357926726341247, "num_tokens": 524318920.0, "step": 15650 }, { "epoch": 0.9345152817574021, "grad_norm": 0.5368183851242065, "learning_rate": 3.177348320306558e-05, "loss": 0.5537, "mean_token_accuracy": 0.8243767142295837, "num_tokens": 524486600.0, "step": 15655 }, { "epoch": 0.9348137535816619, "grad_norm": 0.4797813594341278, "learning_rate": 3.1762580466474005e-05, "loss": 0.4834, "mean_token_accuracy": 0.843295955657959, "num_tokens": 524654280.0, "step": 15660 }, { "epoch": 0.9351122254059216, "grad_norm": 0.4770227372646332, "learning_rate": 3.175167669165228e-05, "loss": 0.501, "mean_token_accuracy": 0.8369855761528016, "num_tokens": 524821960.0, "step": 15665 }, { "epoch": 0.9354106972301814, "grad_norm": 0.49592265486717224, "learning_rate": 3.1740771881256216e-05, "loss": 0.5539, "mean_token_accuracy": 0.8225515842437744, "num_tokens": 524989640.0, "step": 15670 }, { "epoch": 0.9357091690544412, "grad_norm": 0.527769923210144, "learning_rate": 3.172986603794188e-05, "loss": 0.5682, "mean_token_accuracy": 0.8192532420158386, "num_tokens": 525157320.0, "step": 15675 }, { "epoch": 0.936007640878701, "grad_norm": 0.6402641534805298, "learning_rate": 3.17189591643656e-05, "loss": 0.5409, "mean_token_accuracy": 0.8278182029724122, "num_tokens": 525325000.0, "step": 15680 }, { "epoch": 0.9363061127029608, "grad_norm": 0.48500001430511475, "learning_rate": 3.170805126318393e-05, "loss": 0.515, "mean_token_accuracy": 0.8350292205810547, "num_tokens": 525492680.0, "step": 15685 }, { "epoch": 0.9366045845272206, "grad_norm": 0.5422039031982422, "learning_rate": 3.169714233705372e-05, "loss": 0.5078, "mean_token_accuracy": 0.8368125915527344, "num_tokens": 525660360.0, "step": 15690 }, { "epoch": 0.9369030563514804, "grad_norm": 0.48064228892326355, "learning_rate": 3.1686232388632014e-05, "loss": 0.5382, "mean_token_accuracy": 0.8278838276863099, "num_tokens": 525828040.0, "step": 15695 }, { "epoch": 0.9372015281757402, "grad_norm": 0.46511518955230713, "learning_rate": 3.167532142057615e-05, "loss": 0.541, "mean_token_accuracy": 0.8270487785339355, "num_tokens": 525995720.0, "step": 15700 }, { "epoch": 0.9375, "grad_norm": 0.502470850944519, "learning_rate": 3.166440943554368e-05, "loss": 0.4972, "mean_token_accuracy": 0.8385422945022583, "num_tokens": 526163400.0, "step": 15705 }, { "epoch": 0.9377984718242598, "grad_norm": 0.5148174166679382, "learning_rate": 3.1653496436192423e-05, "loss": 0.4927, "mean_token_accuracy": 0.840689480304718, "num_tokens": 526331080.0, "step": 15710 }, { "epoch": 0.9380969436485196, "grad_norm": 0.5600574016571045, "learning_rate": 3.1642582425180444e-05, "loss": 0.5024, "mean_token_accuracy": 0.8386198163032532, "num_tokens": 526498760.0, "step": 15715 }, { "epoch": 0.9383954154727794, "grad_norm": 0.520237147808075, "learning_rate": 3.163166740516607e-05, "loss": 0.5432, "mean_token_accuracy": 0.8268698573112487, "num_tokens": 526666440.0, "step": 15720 }, { "epoch": 0.9386938872970392, "grad_norm": 0.45518752932548523, "learning_rate": 3.162075137880782e-05, "loss": 0.5101, "mean_token_accuracy": 0.8360849261283875, "num_tokens": 526834120.0, "step": 15725 }, { "epoch": 0.938992359121299, "grad_norm": 0.5561493039131165, "learning_rate": 3.160983434876452e-05, "loss": 0.5609, "mean_token_accuracy": 0.8224203705787658, "num_tokens": 527001800.0, "step": 15730 }, { "epoch": 0.9392908309455588, "grad_norm": 0.4951010048389435, "learning_rate": 3.1598916317695194e-05, "loss": 0.5134, "mean_token_accuracy": 0.8320410370826721, "num_tokens": 527169480.0, "step": 15735 }, { "epoch": 0.9395893027698186, "grad_norm": 0.5409601330757141, "learning_rate": 3.1587997288259144e-05, "loss": 0.5259, "mean_token_accuracy": 0.8323511958122254, "num_tokens": 527337160.0, "step": 15740 }, { "epoch": 0.9398877745940784, "grad_norm": 0.5131236910820007, "learning_rate": 3.157707726311591e-05, "loss": 0.4859, "mean_token_accuracy": 0.8410175323486329, "num_tokens": 527504840.0, "step": 15745 }, { "epoch": 0.9401862464183381, "grad_norm": 0.5314719080924988, "learning_rate": 3.156615624492524e-05, "loss": 0.5352, "mean_token_accuracy": 0.8283251762390137, "num_tokens": 527672520.0, "step": 15750 }, { "epoch": 0.9404847182425979, "grad_norm": 0.5749266743659973, "learning_rate": 3.1555234236347175e-05, "loss": 0.567, "mean_token_accuracy": 0.8178217887878418, "num_tokens": 527840200.0, "step": 15755 }, { "epoch": 0.9407831900668577, "grad_norm": 0.5439934134483337, "learning_rate": 3.154431124004196e-05, "loss": 0.5143, "mean_token_accuracy": 0.83399738073349, "num_tokens": 528007880.0, "step": 15760 }, { "epoch": 0.9410816618911175, "grad_norm": 0.5169892907142639, "learning_rate": 3.1533387258670076e-05, "loss": 0.5927, "mean_token_accuracy": 0.8124776363372803, "num_tokens": 528175560.0, "step": 15765 }, { "epoch": 0.9413801337153773, "grad_norm": 0.5228572487831116, "learning_rate": 3.152246229489229e-05, "loss": 0.5532, "mean_token_accuracy": 0.8227364897727967, "num_tokens": 528343240.0, "step": 15770 }, { "epoch": 0.941678605539637, "grad_norm": 0.4965108036994934, "learning_rate": 3.151153635136958e-05, "loss": 0.4895, "mean_token_accuracy": 0.8412859320640564, "num_tokens": 528510920.0, "step": 15775 }, { "epoch": 0.9419770773638968, "grad_norm": 0.517785370349884, "learning_rate": 3.150060943076313e-05, "loss": 0.5819, "mean_token_accuracy": 0.8136526346206665, "num_tokens": 528678600.0, "step": 15780 }, { "epoch": 0.9422755491881566, "grad_norm": 0.47368016839027405, "learning_rate": 3.148968153573441e-05, "loss": 0.5215, "mean_token_accuracy": 0.8339496493339539, "num_tokens": 528846280.0, "step": 15785 }, { "epoch": 0.9425740210124164, "grad_norm": 0.514857828617096, "learning_rate": 3.147875266894511e-05, "loss": 0.5014, "mean_token_accuracy": 0.8382378578186035, "num_tokens": 529012034.0, "step": 15790 }, { "epoch": 0.9428724928366762, "grad_norm": 0.4875916838645935, "learning_rate": 3.146782283305717e-05, "loss": 0.499, "mean_token_accuracy": 0.8389896273612976, "num_tokens": 529179714.0, "step": 15795 }, { "epoch": 0.943170964660936, "grad_norm": 0.49777963757514954, "learning_rate": 3.145689203073273e-05, "loss": 0.5641, "mean_token_accuracy": 0.8193785071372985, "num_tokens": 529347394.0, "step": 15800 }, { "epoch": 0.9434694364851958, "grad_norm": 0.5428093075752258, "learning_rate": 3.144596026463421e-05, "loss": 0.476, "mean_token_accuracy": 0.8452940583229065, "num_tokens": 529515074.0, "step": 15805 }, { "epoch": 0.9437679083094556, "grad_norm": 0.49488624930381775, "learning_rate": 3.1435027537424225e-05, "loss": 0.523, "mean_token_accuracy": 0.8318382382392884, "num_tokens": 529682754.0, "step": 15810 }, { "epoch": 0.9440663801337154, "grad_norm": 0.5103921294212341, "learning_rate": 3.1424093851765656e-05, "loss": 0.5116, "mean_token_accuracy": 0.8359835386276245, "num_tokens": 529850434.0, "step": 15815 }, { "epoch": 0.9443648519579751, "grad_norm": 0.5930900573730469, "learning_rate": 3.141315921032159e-05, "loss": 0.5744, "mean_token_accuracy": 0.8167563557624817, "num_tokens": 530014952.0, "step": 15820 }, { "epoch": 0.9446633237822349, "grad_norm": 0.5441305041313171, "learning_rate": 3.140222361575539e-05, "loss": 0.5182, "mean_token_accuracy": 0.8344387412071228, "num_tokens": 530182632.0, "step": 15825 }, { "epoch": 0.9449617956064947, "grad_norm": 0.5473672747612, "learning_rate": 3.139128707073058e-05, "loss": 0.5651, "mean_token_accuracy": 0.8199809193611145, "num_tokens": 530350312.0, "step": 15830 }, { "epoch": 0.9452602674307545, "grad_norm": 0.5072567462921143, "learning_rate": 3.138034957791099e-05, "loss": 0.5122, "mean_token_accuracy": 0.8345759153366089, "num_tokens": 530517992.0, "step": 15835 }, { "epoch": 0.9455587392550143, "grad_norm": 0.5286093950271606, "learning_rate": 3.1369411139960636e-05, "loss": 0.5592, "mean_token_accuracy": 0.8219193577766418, "num_tokens": 530685672.0, "step": 15840 }, { "epoch": 0.9458572110792741, "grad_norm": 0.529873788356781, "learning_rate": 3.135847175954378e-05, "loss": 0.5487, "mean_token_accuracy": 0.8244184613227844, "num_tokens": 530853352.0, "step": 15845 }, { "epoch": 0.9461556829035339, "grad_norm": 0.5681453943252563, "learning_rate": 3.1347531439324896e-05, "loss": 0.5504, "mean_token_accuracy": 0.8237504601478577, "num_tokens": 531021032.0, "step": 15850 }, { "epoch": 0.9464541547277937, "grad_norm": 0.8035094738006592, "learning_rate": 3.1336590181968725e-05, "loss": 0.5877, "mean_token_accuracy": 0.8152451515197754, "num_tokens": 531188712.0, "step": 15855 }, { "epoch": 0.9467526265520535, "grad_norm": 0.5231764912605286, "learning_rate": 3.132564799014019e-05, "loss": 0.5593, "mean_token_accuracy": 0.8210604786872864, "num_tokens": 531356392.0, "step": 15860 }, { "epoch": 0.9470510983763133, "grad_norm": 0.5790037512779236, "learning_rate": 3.1314704866504484e-05, "loss": 0.5434, "mean_token_accuracy": 0.8264821648597718, "num_tokens": 531524072.0, "step": 15865 }, { "epoch": 0.9473495702005731, "grad_norm": 0.4739135205745697, "learning_rate": 3.1303760813726996e-05, "loss": 0.5517, "mean_token_accuracy": 0.8243409276008606, "num_tokens": 531691752.0, "step": 15870 }, { "epoch": 0.9476480420248329, "grad_norm": 0.46261680126190186, "learning_rate": 3.129281583447335e-05, "loss": 0.5298, "mean_token_accuracy": 0.8301860928535462, "num_tokens": 531859432.0, "step": 15875 }, { "epoch": 0.9479465138490927, "grad_norm": 0.5272606611251831, "learning_rate": 3.1281869931409394e-05, "loss": 0.5429, "mean_token_accuracy": 0.8279673218727112, "num_tokens": 532027112.0, "step": 15880 }, { "epoch": 0.9482449856733525, "grad_norm": 0.4770931899547577, "learning_rate": 3.127092310720121e-05, "loss": 0.5055, "mean_token_accuracy": 0.8383931636810302, "num_tokens": 532194792.0, "step": 15885 }, { "epoch": 0.9485434574976123, "grad_norm": 0.48625510931015015, "learning_rate": 3.125997536451511e-05, "loss": 0.5557, "mean_token_accuracy": 0.8261541247367858, "num_tokens": 532362472.0, "step": 15890 }, { "epoch": 0.9488419293218721, "grad_norm": 0.4478358030319214, "learning_rate": 3.1249026706017604e-05, "loss": 0.4875, "mean_token_accuracy": 0.8413396120071411, "num_tokens": 532530152.0, "step": 15895 }, { "epoch": 0.9491404011461319, "grad_norm": 0.5235236883163452, "learning_rate": 3.123807713437544e-05, "loss": 0.5376, "mean_token_accuracy": 0.8288023352622986, "num_tokens": 532697832.0, "step": 15900 }, { "epoch": 0.9494388729703916, "grad_norm": 0.4963972568511963, "learning_rate": 3.12271266522556e-05, "loss": 0.4765, "mean_token_accuracy": 0.844912326335907, "num_tokens": 532865512.0, "step": 15905 }, { "epoch": 0.9497373447946514, "grad_norm": 0.4907382130622864, "learning_rate": 3.121617526232527e-05, "loss": 0.5233, "mean_token_accuracy": 0.832381010055542, "num_tokens": 533033192.0, "step": 15910 }, { "epoch": 0.9500358166189111, "grad_norm": 0.5912783145904541, "learning_rate": 3.1205222967251857e-05, "loss": 0.5655, "mean_token_accuracy": 0.8202075719833374, "num_tokens": 533200872.0, "step": 15915 }, { "epoch": 0.9503342884431709, "grad_norm": 0.5628465414047241, "learning_rate": 3.1194269769703e-05, "loss": 0.5689, "mean_token_accuracy": 0.8188178420066834, "num_tokens": 533368552.0, "step": 15920 }, { "epoch": 0.9506327602674307, "grad_norm": 0.4964522123336792, "learning_rate": 3.1183315672346555e-05, "loss": 0.5222, "mean_token_accuracy": 0.8315101981163024, "num_tokens": 533536232.0, "step": 15925 }, { "epoch": 0.9509312320916905, "grad_norm": 0.530226469039917, "learning_rate": 3.117236067785059e-05, "loss": 0.5255, "mean_token_accuracy": 0.832369077205658, "num_tokens": 533703912.0, "step": 15930 }, { "epoch": 0.9512297039159503, "grad_norm": 0.45650309324264526, "learning_rate": 3.116140478888339e-05, "loss": 0.5128, "mean_token_accuracy": 0.8332160353660584, "num_tokens": 533871592.0, "step": 15935 }, { "epoch": 0.9515281757402101, "grad_norm": 0.539772093296051, "learning_rate": 3.1150448008113495e-05, "loss": 0.4838, "mean_token_accuracy": 0.8441488742828369, "num_tokens": 534039272.0, "step": 15940 }, { "epoch": 0.9518266475644699, "grad_norm": 0.5195083618164062, "learning_rate": 3.1139490338209585e-05, "loss": 0.5189, "mean_token_accuracy": 0.8337707281112671, "num_tokens": 534206952.0, "step": 15945 }, { "epoch": 0.9521251193887297, "grad_norm": 0.4558088779449463, "learning_rate": 3.112853178184064e-05, "loss": 0.5096, "mean_token_accuracy": 0.8356018185615539, "num_tokens": 534374632.0, "step": 15950 }, { "epoch": 0.9524235912129895, "grad_norm": 0.554297149181366, "learning_rate": 3.1117572341675796e-05, "loss": 0.5529, "mean_token_accuracy": 0.8226768493652343, "num_tokens": 534542312.0, "step": 15955 }, { "epoch": 0.9527220630372493, "grad_norm": 0.5143594145774841, "learning_rate": 3.110661202038443e-05, "loss": 0.5554, "mean_token_accuracy": 0.8240367412567139, "num_tokens": 534709992.0, "step": 15960 }, { "epoch": 0.9530205348615091, "grad_norm": 0.47093838453292847, "learning_rate": 3.1095650820636146e-05, "loss": 0.5316, "mean_token_accuracy": 0.8290767073631287, "num_tokens": 534877672.0, "step": 15965 }, { "epoch": 0.9533190066857689, "grad_norm": 0.4957123398780823, "learning_rate": 3.1084688745100736e-05, "loss": 0.5579, "mean_token_accuracy": 0.8235417008399963, "num_tokens": 535045352.0, "step": 15970 }, { "epoch": 0.9536174785100286, "grad_norm": 0.49072322249412537, "learning_rate": 3.1073725796448216e-05, "loss": 0.4749, "mean_token_accuracy": 0.844286072254181, "num_tokens": 535213032.0, "step": 15975 }, { "epoch": 0.9539159503342884, "grad_norm": 0.5057286620140076, "learning_rate": 3.1062761977348814e-05, "loss": 0.5024, "mean_token_accuracy": 0.8388583898544312, "num_tokens": 535380712.0, "step": 15980 }, { "epoch": 0.9542144221585482, "grad_norm": 0.5004511475563049, "learning_rate": 3.1051797290472966e-05, "loss": 0.5229, "mean_token_accuracy": 0.8323452115058899, "num_tokens": 535548392.0, "step": 15985 }, { "epoch": 0.954512893982808, "grad_norm": 0.5344756841659546, "learning_rate": 3.104083173849134e-05, "loss": 0.5385, "mean_token_accuracy": 0.8270368576049805, "num_tokens": 535716072.0, "step": 15990 }, { "epoch": 0.9548113658070678, "grad_norm": 0.4614121913909912, "learning_rate": 3.102986532407478e-05, "loss": 0.493, "mean_token_accuracy": 0.8404210805892944, "num_tokens": 535883752.0, "step": 15995 }, { "epoch": 0.9551098376313276, "grad_norm": 0.4517372250556946, "learning_rate": 3.101889804989437e-05, "loss": 0.5317, "mean_token_accuracy": 0.8297268271446228, "num_tokens": 536051432.0, "step": 16000 }, { "epoch": 0.9554083094555874, "grad_norm": 0.655108630657196, "learning_rate": 3.1007929918621385e-05, "loss": 0.525, "mean_token_accuracy": 0.832518196105957, "num_tokens": 536219112.0, "step": 16005 }, { "epoch": 0.9557067812798472, "grad_norm": 0.5116455554962158, "learning_rate": 3.099696093292732e-05, "loss": 0.5359, "mean_token_accuracy": 0.8291124939918518, "num_tokens": 536386792.0, "step": 16010 }, { "epoch": 0.956005253104107, "grad_norm": 0.5170709490776062, "learning_rate": 3.0985991095483876e-05, "loss": 0.5723, "mean_token_accuracy": 0.8196230530738831, "num_tokens": 536554472.0, "step": 16015 }, { "epoch": 0.9563037249283668, "grad_norm": 0.4800170660018921, "learning_rate": 3.097502040896296e-05, "loss": 0.5137, "mean_token_accuracy": 0.834235954284668, "num_tokens": 536722152.0, "step": 16020 }, { "epoch": 0.9566021967526266, "grad_norm": 0.49619466066360474, "learning_rate": 3.096404887603667e-05, "loss": 0.5151, "mean_token_accuracy": 0.8323869705200195, "num_tokens": 536889832.0, "step": 16025 }, { "epoch": 0.9569006685768864, "grad_norm": 0.5148215889930725, "learning_rate": 3.095307649937735e-05, "loss": 0.5003, "mean_token_accuracy": 0.8387093067169189, "num_tokens": 537057512.0, "step": 16030 }, { "epoch": 0.9571991404011462, "grad_norm": 0.5083907246589661, "learning_rate": 3.09421032816575e-05, "loss": 0.5173, "mean_token_accuracy": 0.8319217443466187, "num_tokens": 537225192.0, "step": 16035 }, { "epoch": 0.957497612225406, "grad_norm": 0.6620160341262817, "learning_rate": 3.093112922554986e-05, "loss": 0.562, "mean_token_accuracy": 0.8210187196731568, "num_tokens": 537392872.0, "step": 16040 }, { "epoch": 0.9577960840496658, "grad_norm": 0.480284184217453, "learning_rate": 3.0920154333727386e-05, "loss": 0.5597, "mean_token_accuracy": 0.8216748356819152, "num_tokens": 537560552.0, "step": 16045 }, { "epoch": 0.9580945558739254, "grad_norm": 0.4943378269672394, "learning_rate": 3.0909178608863185e-05, "loss": 0.5255, "mean_token_accuracy": 0.831617558002472, "num_tokens": 537728232.0, "step": 16050 }, { "epoch": 0.9583930276981852, "grad_norm": 0.4783206582069397, "learning_rate": 3.08982020536306e-05, "loss": 0.5272, "mean_token_accuracy": 0.831402850151062, "num_tokens": 537895912.0, "step": 16055 }, { "epoch": 0.958691499522445, "grad_norm": 0.5013982653617859, "learning_rate": 3.088722467070319e-05, "loss": 0.4841, "mean_token_accuracy": 0.8433138489723205, "num_tokens": 538063592.0, "step": 16060 }, { "epoch": 0.9589899713467048, "grad_norm": 0.5668599009513855, "learning_rate": 3.0876246462754685e-05, "loss": 0.5421, "mean_token_accuracy": 0.826177966594696, "num_tokens": 538231272.0, "step": 16065 }, { "epoch": 0.9592884431709646, "grad_norm": 0.4846465289592743, "learning_rate": 3.086526743245904e-05, "loss": 0.5476, "mean_token_accuracy": 0.8265358448028565, "num_tokens": 538398952.0, "step": 16070 }, { "epoch": 0.9595869149952244, "grad_norm": 0.5607597231864929, "learning_rate": 3.08542875824904e-05, "loss": 0.5236, "mean_token_accuracy": 0.8326980590820312, "num_tokens": 538561362.0, "step": 16075 }, { "epoch": 0.9598853868194842, "grad_norm": 0.504673182964325, "learning_rate": 3.0843306915523096e-05, "loss": 0.5095, "mean_token_accuracy": 0.8356793522834778, "num_tokens": 538729042.0, "step": 16080 }, { "epoch": 0.960183858643744, "grad_norm": 0.5429574847221375, "learning_rate": 3.083232543423169e-05, "loss": 0.5394, "mean_token_accuracy": 0.8269891262054443, "num_tokens": 538896722.0, "step": 16085 }, { "epoch": 0.9604823304680038, "grad_norm": 0.5217997431755066, "learning_rate": 3.082134314129091e-05, "loss": 0.5212, "mean_token_accuracy": 0.8325420618057251, "num_tokens": 539064402.0, "step": 16090 }, { "epoch": 0.9607808022922636, "grad_norm": 0.5468534231185913, "learning_rate": 3.0810360039375705e-05, "loss": 0.5597, "mean_token_accuracy": 0.8216807842254639, "num_tokens": 539232082.0, "step": 16095 }, { "epoch": 0.9610792741165234, "grad_norm": 0.48495370149612427, "learning_rate": 3.079937613116121e-05, "loss": 0.5468, "mean_token_accuracy": 0.8240844488143921, "num_tokens": 539399762.0, "step": 16100 }, { "epoch": 0.9613777459407832, "grad_norm": 0.4977683424949646, "learning_rate": 3.0788391419322766e-05, "loss": 0.517, "mean_token_accuracy": 0.8354228734970093, "num_tokens": 539567442.0, "step": 16105 }, { "epoch": 0.961676217765043, "grad_norm": 0.5285274386405945, "learning_rate": 3.077740590653588e-05, "loss": 0.543, "mean_token_accuracy": 0.8270607113838195, "num_tokens": 539735122.0, "step": 16110 }, { "epoch": 0.9619746895893028, "grad_norm": 0.5071113705635071, "learning_rate": 3.076641959547631e-05, "loss": 0.5349, "mean_token_accuracy": 0.828659188747406, "num_tokens": 539902802.0, "step": 16115 }, { "epoch": 0.9622731614135626, "grad_norm": 0.4986821115016937, "learning_rate": 3.075543248881995e-05, "loss": 0.5369, "mean_token_accuracy": 0.8289574265480042, "num_tokens": 540070482.0, "step": 16120 }, { "epoch": 0.9625716332378224, "grad_norm": 0.5541632771492004, "learning_rate": 3.074444458924291e-05, "loss": 0.5094, "mean_token_accuracy": 0.83626229763031, "num_tokens": 540236251.0, "step": 16125 }, { "epoch": 0.9628701050620821, "grad_norm": 0.5603629350662231, "learning_rate": 3.073345589942152e-05, "loss": 0.5437, "mean_token_accuracy": 0.8257306456565857, "num_tokens": 540403931.0, "step": 16130 }, { "epoch": 0.9631685768863419, "grad_norm": 0.46965834498405457, "learning_rate": 3.0722466422032245e-05, "loss": 0.5219, "mean_token_accuracy": 0.8326461434364318, "num_tokens": 540571254.0, "step": 16135 }, { "epoch": 0.9634670487106017, "grad_norm": 0.49990078806877136, "learning_rate": 3.07114761597518e-05, "loss": 0.5531, "mean_token_accuracy": 0.8224203824996948, "num_tokens": 540738934.0, "step": 16140 }, { "epoch": 0.9637655205348615, "grad_norm": 0.4671829044818878, "learning_rate": 3.070048511525707e-05, "loss": 0.5112, "mean_token_accuracy": 0.8352439522743225, "num_tokens": 540906614.0, "step": 16145 }, { "epoch": 0.9640639923591213, "grad_norm": 0.5460081100463867, "learning_rate": 3.06894932912251e-05, "loss": 0.542, "mean_token_accuracy": 0.8288560152053833, "num_tokens": 541074294.0, "step": 16150 }, { "epoch": 0.9643624641833811, "grad_norm": 0.4894111752510071, "learning_rate": 3.067850069033318e-05, "loss": 0.5317, "mean_token_accuracy": 0.8283430695533752, "num_tokens": 541241974.0, "step": 16155 }, { "epoch": 0.9646609360076409, "grad_norm": 0.47574856877326965, "learning_rate": 3.0667507315258734e-05, "loss": 0.5031, "mean_token_accuracy": 0.8375402450561523, "num_tokens": 541409654.0, "step": 16160 }, { "epoch": 0.9649594078319007, "grad_norm": 0.49619680643081665, "learning_rate": 3.065651316867944e-05, "loss": 0.5188, "mean_token_accuracy": 0.8350590586662292, "num_tokens": 541577334.0, "step": 16165 }, { "epoch": 0.9652578796561605, "grad_norm": 0.8324124217033386, "learning_rate": 3.0645518253273076e-05, "loss": 0.5059, "mean_token_accuracy": 0.836967658996582, "num_tokens": 541745014.0, "step": 16170 }, { "epoch": 0.9655563514804203, "grad_norm": 0.4928455948829651, "learning_rate": 3.06345225717177e-05, "loss": 0.5547, "mean_token_accuracy": 0.8223602414131165, "num_tokens": 541906331.0, "step": 16175 }, { "epoch": 0.9658548233046801, "grad_norm": 0.5277348160743713, "learning_rate": 3.0623526126691484e-05, "loss": 0.4919, "mean_token_accuracy": 0.8410413861274719, "num_tokens": 542074011.0, "step": 16180 }, { "epoch": 0.9661532951289399, "grad_norm": 0.4890357553958893, "learning_rate": 3.061252892087282e-05, "loss": 0.5479, "mean_token_accuracy": 0.8254324197769165, "num_tokens": 542241691.0, "step": 16185 }, { "epoch": 0.9664517669531996, "grad_norm": 0.4640437364578247, "learning_rate": 3.06015309569403e-05, "loss": 0.5064, "mean_token_accuracy": 0.8366992831230163, "num_tokens": 542409371.0, "step": 16190 }, { "epoch": 0.9667502387774594, "grad_norm": 0.4964997172355652, "learning_rate": 3.059053223757267e-05, "loss": 0.5496, "mean_token_accuracy": 0.8235059022903443, "num_tokens": 542577051.0, "step": 16195 }, { "epoch": 0.9670487106017192, "grad_norm": 0.5225151777267456, "learning_rate": 3.0579532765448857e-05, "loss": 0.5344, "mean_token_accuracy": 0.8288202285766602, "num_tokens": 542744731.0, "step": 16200 }, { "epoch": 0.967347182425979, "grad_norm": 0.49010005593299866, "learning_rate": 3.0568532543248e-05, "loss": 0.5384, "mean_token_accuracy": 0.827156150341034, "num_tokens": 542912411.0, "step": 16205 }, { "epoch": 0.9676456542502387, "grad_norm": 0.5213024616241455, "learning_rate": 3.055753157364941e-05, "loss": 0.5226, "mean_token_accuracy": 0.8323511958122254, "num_tokens": 543080091.0, "step": 16210 }, { "epoch": 0.9679441260744985, "grad_norm": 0.5314414501190186, "learning_rate": 3.0546529859332566e-05, "loss": 0.5607, "mean_token_accuracy": 0.8219611167907714, "num_tokens": 543247771.0, "step": 16215 }, { "epoch": 0.9682425978987583, "grad_norm": 0.5320379734039307, "learning_rate": 3.0535527402977144e-05, "loss": 0.5087, "mean_token_accuracy": 0.8350888729095459, "num_tokens": 543415451.0, "step": 16220 }, { "epoch": 0.9685410697230181, "grad_norm": 0.5203273296356201, "learning_rate": 3.0524524207262994e-05, "loss": 0.5306, "mean_token_accuracy": 0.8277168035507202, "num_tokens": 543583131.0, "step": 16225 }, { "epoch": 0.9688395415472779, "grad_norm": 0.5501710772514343, "learning_rate": 3.0513520274870143e-05, "loss": 0.5195, "mean_token_accuracy": 0.8340152621269226, "num_tokens": 543750811.0, "step": 16230 }, { "epoch": 0.9691380133715377, "grad_norm": 0.5646803379058838, "learning_rate": 3.0502515608478805e-05, "loss": 0.5396, "mean_token_accuracy": 0.8267147779464722, "num_tokens": 543918491.0, "step": 16235 }, { "epoch": 0.9694364851957975, "grad_norm": 0.6729055643081665, "learning_rate": 3.0491510210769375e-05, "loss": 0.4854, "mean_token_accuracy": 0.8442264080047608, "num_tokens": 544086171.0, "step": 16240 }, { "epoch": 0.9697349570200573, "grad_norm": 0.5303055644035339, "learning_rate": 3.0480504084422413e-05, "loss": 0.483, "mean_token_accuracy": 0.8435882091522217, "num_tokens": 544253851.0, "step": 16245 }, { "epoch": 0.9700334288443171, "grad_norm": 0.4985578656196594, "learning_rate": 3.046949723211867e-05, "loss": 0.558, "mean_token_accuracy": 0.8207741856575013, "num_tokens": 544421531.0, "step": 16250 }, { "epoch": 0.9703319006685769, "grad_norm": 0.5357292294502258, "learning_rate": 3.0458489656539062e-05, "loss": 0.5275, "mean_token_accuracy": 0.8285041093826294, "num_tokens": 544589211.0, "step": 16255 }, { "epoch": 0.9706303724928367, "grad_norm": 0.8215190768241882, "learning_rate": 3.0447481360364693e-05, "loss": 0.5378, "mean_token_accuracy": 0.8250268459320068, "num_tokens": 544756891.0, "step": 16260 }, { "epoch": 0.9709288443170965, "grad_norm": 0.5300891995429993, "learning_rate": 3.0436472346276833e-05, "loss": 0.5128, "mean_token_accuracy": 0.8356316447257995, "num_tokens": 544924571.0, "step": 16265 }, { "epoch": 0.9712273161413563, "grad_norm": 0.4493369162082672, "learning_rate": 3.042546261695694e-05, "loss": 0.4879, "mean_token_accuracy": 0.8421567440032959, "num_tokens": 545092251.0, "step": 16270 }, { "epoch": 0.9715257879656161, "grad_norm": 0.5489478707313538, "learning_rate": 3.0414452175086623e-05, "loss": 0.556, "mean_token_accuracy": 0.8219193696975708, "num_tokens": 545259931.0, "step": 16275 }, { "epoch": 0.9718242597898759, "grad_norm": 0.5192468762397766, "learning_rate": 3.0403441023347677e-05, "loss": 0.5085, "mean_token_accuracy": 0.835589873790741, "num_tokens": 545427611.0, "step": 16280 }, { "epoch": 0.9721227316141356, "grad_norm": 0.48073679208755493, "learning_rate": 3.039242916442209e-05, "loss": 0.4838, "mean_token_accuracy": 0.8418764114379883, "num_tokens": 545595291.0, "step": 16285 }, { "epoch": 0.9724212034383954, "grad_norm": 0.512175440788269, "learning_rate": 3.0381416600991984e-05, "loss": 0.5298, "mean_token_accuracy": 0.8305737733840942, "num_tokens": 545762971.0, "step": 16290 }, { "epoch": 0.9727196752626552, "grad_norm": 0.5279102325439453, "learning_rate": 3.0370403335739677e-05, "loss": 0.4732, "mean_token_accuracy": 0.8479243636131286, "num_tokens": 545930651.0, "step": 16295 }, { "epoch": 0.973018147086915, "grad_norm": 0.4833485186100006, "learning_rate": 3.035938937134767e-05, "loss": 0.5479, "mean_token_accuracy": 0.824311101436615, "num_tokens": 546098331.0, "step": 16300 }, { "epoch": 0.9733166189111748, "grad_norm": 0.4734708070755005, "learning_rate": 3.0348374710498596e-05, "loss": 0.4867, "mean_token_accuracy": 0.8419002652168274, "num_tokens": 546266011.0, "step": 16305 }, { "epoch": 0.9736150907354346, "grad_norm": 0.5586357116699219, "learning_rate": 3.0337359355875282e-05, "loss": 0.5214, "mean_token_accuracy": 0.8335301876068115, "num_tokens": 546427743.0, "step": 16310 }, { "epoch": 0.9739135625596944, "grad_norm": 0.5801336765289307, "learning_rate": 3.0326343310160726e-05, "loss": 0.5105, "mean_token_accuracy": 0.8357032060623169, "num_tokens": 546595423.0, "step": 16315 }, { "epoch": 0.9742120343839542, "grad_norm": 0.5442116260528564, "learning_rate": 3.0315326576038094e-05, "loss": 0.5267, "mean_token_accuracy": 0.8301503181457519, "num_tokens": 546763103.0, "step": 16320 }, { "epoch": 0.9745105062082139, "grad_norm": 0.5060010552406311, "learning_rate": 3.0304309156190702e-05, "loss": 0.4704, "mean_token_accuracy": 0.8482882022857666, "num_tokens": 546930783.0, "step": 16325 }, { "epoch": 0.9748089780324737, "grad_norm": 0.4762267768383026, "learning_rate": 3.0293291053302064e-05, "loss": 0.4872, "mean_token_accuracy": 0.8422223567962647, "num_tokens": 547098463.0, "step": 16330 }, { "epoch": 0.9751074498567335, "grad_norm": 0.5427188277244568, "learning_rate": 3.028227227005582e-05, "loss": 0.5458, "mean_token_accuracy": 0.8237921953201294, "num_tokens": 547266143.0, "step": 16335 }, { "epoch": 0.9754059216809933, "grad_norm": 0.5529686808586121, "learning_rate": 3.027125280913582e-05, "loss": 0.5598, "mean_token_accuracy": 0.8193665862083435, "num_tokens": 547433823.0, "step": 16340 }, { "epoch": 0.9757043935052531, "grad_norm": 0.5190616846084595, "learning_rate": 3.026023267322604e-05, "loss": 0.5099, "mean_token_accuracy": 0.8367052435874939, "num_tokens": 547601503.0, "step": 16345 }, { "epoch": 0.9760028653295129, "grad_norm": 0.5212611556053162, "learning_rate": 3.0249211865010647e-05, "loss": 0.5526, "mean_token_accuracy": 0.8234377026557922, "num_tokens": 547764372.0, "step": 16350 }, { "epoch": 0.9763013371537727, "grad_norm": 0.5687365531921387, "learning_rate": 3.023819038717396e-05, "loss": 0.5486, "mean_token_accuracy": 0.8242097139358521, "num_tokens": 547932052.0, "step": 16355 }, { "epoch": 0.9765998089780324, "grad_norm": 0.5005989670753479, "learning_rate": 3.022716824240046e-05, "loss": 0.5555, "mean_token_accuracy": 0.8228519797325134, "num_tokens": 548096057.0, "step": 16360 }, { "epoch": 0.9768982808022922, "grad_norm": 0.5083593130111694, "learning_rate": 3.0216145433374806e-05, "loss": 0.4938, "mean_token_accuracy": 0.8415125846862793, "num_tokens": 548263737.0, "step": 16365 }, { "epoch": 0.977196752626552, "grad_norm": 0.45629918575286865, "learning_rate": 3.020512196278178e-05, "loss": 0.5143, "mean_token_accuracy": 0.8360491394996643, "num_tokens": 548431417.0, "step": 16370 }, { "epoch": 0.9774952244508118, "grad_norm": 0.5211593508720398, "learning_rate": 3.0194097833306383e-05, "loss": 0.5277, "mean_token_accuracy": 0.830239760875702, "num_tokens": 548599097.0, "step": 16375 }, { "epoch": 0.9777936962750716, "grad_norm": 0.4815974831581116, "learning_rate": 3.018307304763373e-05, "loss": 0.5232, "mean_token_accuracy": 0.8328104376792907, "num_tokens": 548766777.0, "step": 16380 }, { "epoch": 0.9780921680993314, "grad_norm": 0.49401426315307617, "learning_rate": 3.017204760844911e-05, "loss": 0.5277, "mean_token_accuracy": 0.8308302521705627, "num_tokens": 548934457.0, "step": 16385 }, { "epoch": 0.9783906399235912, "grad_norm": 0.5188146829605103, "learning_rate": 3.0161021518437976e-05, "loss": 0.5074, "mean_token_accuracy": 0.8375521898269653, "num_tokens": 549102137.0, "step": 16390 }, { "epoch": 0.978689111747851, "grad_norm": 0.5481185913085938, "learning_rate": 3.0149994780285932e-05, "loss": 0.5106, "mean_token_accuracy": 0.8341226220130921, "num_tokens": 549269817.0, "step": 16395 }, { "epoch": 0.9789875835721108, "grad_norm": 0.47941240668296814, "learning_rate": 3.013896739667874e-05, "loss": 0.5481, "mean_token_accuracy": 0.8241202473640442, "num_tokens": 549437497.0, "step": 16400 }, { "epoch": 0.9792860553963706, "grad_norm": 0.497881144285202, "learning_rate": 3.0127939370302344e-05, "loss": 0.5293, "mean_token_accuracy": 0.8290946006774902, "num_tokens": 549605177.0, "step": 16405 }, { "epoch": 0.9795845272206304, "grad_norm": 0.4580265283584595, "learning_rate": 3.0116910703842794e-05, "loss": 0.4825, "mean_token_accuracy": 0.8440057158470153, "num_tokens": 549772857.0, "step": 16410 }, { "epoch": 0.9798829990448902, "grad_norm": 0.5197835564613342, "learning_rate": 3.0105881399986353e-05, "loss": 0.5181, "mean_token_accuracy": 0.8343790888786315, "num_tokens": 549940537.0, "step": 16415 }, { "epoch": 0.98018147086915, "grad_norm": 0.47877657413482666, "learning_rate": 3.0094851461419387e-05, "loss": 0.4963, "mean_token_accuracy": 0.8376416683197021, "num_tokens": 550108217.0, "step": 16420 }, { "epoch": 0.9804799426934098, "grad_norm": 0.48808667063713074, "learning_rate": 3.008382089082845e-05, "loss": 0.4884, "mean_token_accuracy": 0.8425265431404114, "num_tokens": 550275897.0, "step": 16425 }, { "epoch": 0.9807784145176696, "grad_norm": 0.46539559960365295, "learning_rate": 3.007278969090025e-05, "loss": 0.5223, "mean_token_accuracy": 0.8330743074417114, "num_tokens": 550437385.0, "step": 16430 }, { "epoch": 0.9810768863419294, "grad_norm": 0.48131659626960754, "learning_rate": 3.0061757864321638e-05, "loss": 0.4955, "mean_token_accuracy": 0.8383454799652099, "num_tokens": 550605065.0, "step": 16435 }, { "epoch": 0.9813753581661891, "grad_norm": 0.576423704624176, "learning_rate": 3.0050725413779608e-05, "loss": 0.5799, "mean_token_accuracy": 0.8150900602340698, "num_tokens": 550772745.0, "step": 16440 }, { "epoch": 0.9816738299904489, "grad_norm": 0.5067886114120483, "learning_rate": 3.0039692341961317e-05, "loss": 0.5417, "mean_token_accuracy": 0.8267624855041504, "num_tokens": 550940425.0, "step": 16445 }, { "epoch": 0.9819723018147087, "grad_norm": 0.47427457571029663, "learning_rate": 3.0028658651554082e-05, "loss": 0.5234, "mean_token_accuracy": 0.8323452234268188, "num_tokens": 551108105.0, "step": 16450 }, { "epoch": 0.9822707736389685, "grad_norm": 0.4905551075935364, "learning_rate": 3.001762434524536e-05, "loss": 0.5356, "mean_token_accuracy": 0.8273887634277344, "num_tokens": 551275785.0, "step": 16455 }, { "epoch": 0.9825692454632283, "grad_norm": 0.45076194405555725, "learning_rate": 3.000658942572276e-05, "loss": 0.5331, "mean_token_accuracy": 0.8284146428108216, "num_tokens": 551443465.0, "step": 16460 }, { "epoch": 0.982867717287488, "grad_norm": 0.4858933091163635, "learning_rate": 2.999555389567404e-05, "loss": 0.5308, "mean_token_accuracy": 0.8305379986763001, "num_tokens": 551611145.0, "step": 16465 }, { "epoch": 0.9831661891117478, "grad_norm": 0.5855818390846252, "learning_rate": 2.99845177577871e-05, "loss": 0.5697, "mean_token_accuracy": 0.8184301495552063, "num_tokens": 551778825.0, "step": 16470 }, { "epoch": 0.9834646609360076, "grad_norm": 0.517839252948761, "learning_rate": 2.997348101475001e-05, "loss": 0.554, "mean_token_accuracy": 0.8234939813613892, "num_tokens": 551946505.0, "step": 16475 }, { "epoch": 0.9837631327602674, "grad_norm": 0.5414894223213196, "learning_rate": 2.996244366925095e-05, "loss": 0.5432, "mean_token_accuracy": 0.8261660575866699, "num_tokens": 552114185.0, "step": 16480 }, { "epoch": 0.9840616045845272, "grad_norm": 0.4802056550979614, "learning_rate": 2.9951405723978297e-05, "loss": 0.5078, "mean_token_accuracy": 0.834886085987091, "num_tokens": 552281865.0, "step": 16485 }, { "epoch": 0.984360076408787, "grad_norm": 0.5247105360031128, "learning_rate": 2.9940367181620522e-05, "loss": 0.4974, "mean_token_accuracy": 0.8404568672180176, "num_tokens": 552449545.0, "step": 16490 }, { "epoch": 0.9846585482330468, "grad_norm": 0.4956377446651459, "learning_rate": 2.9929328044866283e-05, "loss": 0.5199, "mean_token_accuracy": 0.8321364760398865, "num_tokens": 552617225.0, "step": 16495 }, { "epoch": 0.9849570200573066, "grad_norm": 0.4788982570171356, "learning_rate": 2.9918288316404358e-05, "loss": 0.5007, "mean_token_accuracy": 0.8384528279304504, "num_tokens": 552784905.0, "step": 16500 }, { "epoch": 0.9852554918815664, "grad_norm": 0.5005519390106201, "learning_rate": 2.9907247998923677e-05, "loss": 0.5257, "mean_token_accuracy": 0.8299773335456848, "num_tokens": 552952585.0, "step": 16505 }, { "epoch": 0.9855539637058262, "grad_norm": 0.5599433183670044, "learning_rate": 2.989620709511331e-05, "loss": 0.5219, "mean_token_accuracy": 0.8315817713737488, "num_tokens": 553120265.0, "step": 16510 }, { "epoch": 0.985852435530086, "grad_norm": 0.4988783299922943, "learning_rate": 2.9885165607662474e-05, "loss": 0.5147, "mean_token_accuracy": 0.8326248407363892, "num_tokens": 553281086.0, "step": 16515 }, { "epoch": 0.9861509073543457, "grad_norm": 0.46395114064216614, "learning_rate": 2.9874123539260533e-05, "loss": 0.5206, "mean_token_accuracy": 0.8307228922843933, "num_tokens": 553448766.0, "step": 16520 }, { "epoch": 0.9864493791786055, "grad_norm": 0.5822600722312927, "learning_rate": 2.9863080892596978e-05, "loss": 0.5299, "mean_token_accuracy": 0.8284921765327453, "num_tokens": 553616446.0, "step": 16525 }, { "epoch": 0.9867478510028653, "grad_norm": 0.5490720272064209, "learning_rate": 2.985203767036146e-05, "loss": 0.5645, "mean_token_accuracy": 0.8197602272033692, "num_tokens": 553784126.0, "step": 16530 }, { "epoch": 0.9870463228271251, "grad_norm": 0.5024212598800659, "learning_rate": 2.9840993875243754e-05, "loss": 0.5264, "mean_token_accuracy": 0.8299236416816711, "num_tokens": 553951806.0, "step": 16535 }, { "epoch": 0.9873447946513849, "grad_norm": 0.5087766647338867, "learning_rate": 2.9829949509933775e-05, "loss": 0.5418, "mean_token_accuracy": 0.8258678317070007, "num_tokens": 554119486.0, "step": 16540 }, { "epoch": 0.9876432664756447, "grad_norm": 0.46259379386901855, "learning_rate": 2.9818904577121587e-05, "loss": 0.5019, "mean_token_accuracy": 0.8383991479873657, "num_tokens": 554287166.0, "step": 16545 }, { "epoch": 0.9879417382999045, "grad_norm": 0.4664643704891205, "learning_rate": 2.9807859079497397e-05, "loss": 0.4916, "mean_token_accuracy": 0.8427591562271118, "num_tokens": 554454846.0, "step": 16550 }, { "epoch": 0.9882402101241643, "grad_norm": 0.5438719391822815, "learning_rate": 2.979681301975152e-05, "loss": 0.5444, "mean_token_accuracy": 0.8262555122375488, "num_tokens": 554622526.0, "step": 16555 }, { "epoch": 0.9885386819484241, "grad_norm": 0.5597907900810242, "learning_rate": 2.9785766400574443e-05, "loss": 0.5178, "mean_token_accuracy": 0.8334903836250305, "num_tokens": 554790206.0, "step": 16560 }, { "epoch": 0.9888371537726839, "grad_norm": 0.42497727274894714, "learning_rate": 2.9774719224656756e-05, "loss": 0.5303, "mean_token_accuracy": 0.8307348132133484, "num_tokens": 554957886.0, "step": 16565 }, { "epoch": 0.9891356255969437, "grad_norm": 0.5093929767608643, "learning_rate": 2.9763671494689232e-05, "loss": 0.5246, "mean_token_accuracy": 0.832417094707489, "num_tokens": 555119436.0, "step": 16570 }, { "epoch": 0.9894340974212035, "grad_norm": 0.49292880296707153, "learning_rate": 2.9752623213362724e-05, "loss": 0.5195, "mean_token_accuracy": 0.8331981420516967, "num_tokens": 555287116.0, "step": 16575 }, { "epoch": 0.9897325692454633, "grad_norm": 0.49336308240890503, "learning_rate": 2.974157438336826e-05, "loss": 0.4801, "mean_token_accuracy": 0.8432422757148743, "num_tokens": 555454796.0, "step": 16580 }, { "epoch": 0.9900310410697231, "grad_norm": 0.526658833026886, "learning_rate": 2.9730525007396974e-05, "loss": 0.5465, "mean_token_accuracy": 0.8256948709487915, "num_tokens": 555622476.0, "step": 16585 }, { "epoch": 0.9903295128939829, "grad_norm": 0.5207714438438416, "learning_rate": 2.971947508814016e-05, "loss": 0.5366, "mean_token_accuracy": 0.8277645111083984, "num_tokens": 555790156.0, "step": 16590 }, { "epoch": 0.9906279847182426, "grad_norm": 0.5171641707420349, "learning_rate": 2.9708424628289207e-05, "loss": 0.5159, "mean_token_accuracy": 0.8342598080635071, "num_tokens": 555957836.0, "step": 16595 }, { "epoch": 0.9909264565425024, "grad_norm": 0.487448126077652, "learning_rate": 2.969737363053568e-05, "loss": 0.5029, "mean_token_accuracy": 0.8397351741790772, "num_tokens": 556125516.0, "step": 16600 }, { "epoch": 0.9912249283667621, "grad_norm": 0.49776336550712585, "learning_rate": 2.968632209757125e-05, "loss": 0.4881, "mean_token_accuracy": 0.8399439334869385, "num_tokens": 556293196.0, "step": 16605 }, { "epoch": 0.9915234001910219, "grad_norm": 0.500651478767395, "learning_rate": 2.9675270032087714e-05, "loss": 0.5356, "mean_token_accuracy": 0.8271322965621948, "num_tokens": 556460876.0, "step": 16610 }, { "epoch": 0.9918218720152817, "grad_norm": 0.4809315502643585, "learning_rate": 2.966421743677702e-05, "loss": 0.5184, "mean_token_accuracy": 0.8325181841850281, "num_tokens": 556628556.0, "step": 16615 }, { "epoch": 0.9921203438395415, "grad_norm": 0.5235933661460876, "learning_rate": 2.965316431433121e-05, "loss": 0.5434, "mean_token_accuracy": 0.8260706067085266, "num_tokens": 556796236.0, "step": 16620 }, { "epoch": 0.9924188156638013, "grad_norm": 0.5473774671554565, "learning_rate": 2.964211066744249e-05, "loss": 0.5009, "mean_token_accuracy": 0.8375462174415589, "num_tokens": 556963916.0, "step": 16625 }, { "epoch": 0.9927172874880611, "grad_norm": 0.47868818044662476, "learning_rate": 2.9631056498803182e-05, "loss": 0.4994, "mean_token_accuracy": 0.8415423989295959, "num_tokens": 557131596.0, "step": 16630 }, { "epoch": 0.9930157593123209, "grad_norm": 0.5369992852210999, "learning_rate": 2.962000181110573e-05, "loss": 0.5513, "mean_token_accuracy": 0.8234522342681885, "num_tokens": 557299276.0, "step": 16635 }, { "epoch": 0.9933142311365807, "grad_norm": 0.433292418718338, "learning_rate": 2.960894660704271e-05, "loss": 0.5196, "mean_token_accuracy": 0.8330251693725585, "num_tokens": 557466956.0, "step": 16640 }, { "epoch": 0.9936127029608405, "grad_norm": 0.48996397852897644, "learning_rate": 2.9597890889306813e-05, "loss": 0.5077, "mean_token_accuracy": 0.8355123400688171, "num_tokens": 557634636.0, "step": 16645 }, { "epoch": 0.9939111747851003, "grad_norm": 0.523293137550354, "learning_rate": 2.958683466059086e-05, "loss": 0.5123, "mean_token_accuracy": 0.8356972455978393, "num_tokens": 557802316.0, "step": 16650 }, { "epoch": 0.9942096466093601, "grad_norm": 0.48072439432144165, "learning_rate": 2.9575777923587815e-05, "loss": 0.4863, "mean_token_accuracy": 0.8418108105659485, "num_tokens": 557969996.0, "step": 16655 }, { "epoch": 0.9945081184336199, "grad_norm": 0.4458793103694916, "learning_rate": 2.956472068099074e-05, "loss": 0.4911, "mean_token_accuracy": 0.8398186802864075, "num_tokens": 558137676.0, "step": 16660 }, { "epoch": 0.9948065902578797, "grad_norm": 0.4677237868309021, "learning_rate": 2.955366293549283e-05, "loss": 0.4879, "mean_token_accuracy": 0.8403912663459778, "num_tokens": 558305356.0, "step": 16665 }, { "epoch": 0.9951050620821394, "grad_norm": 0.46648067235946655, "learning_rate": 2.954260468978741e-05, "loss": 0.5127, "mean_token_accuracy": 0.8355540990829468, "num_tokens": 558473036.0, "step": 16670 }, { "epoch": 0.9954035339063992, "grad_norm": 0.47389277815818787, "learning_rate": 2.9531545946567917e-05, "loss": 0.534, "mean_token_accuracy": 0.8278718948364258, "num_tokens": 558640716.0, "step": 16675 }, { "epoch": 0.995702005730659, "grad_norm": 0.5456295013427734, "learning_rate": 2.9520486708527907e-05, "loss": 0.5273, "mean_token_accuracy": 0.8305558800697327, "num_tokens": 558808396.0, "step": 16680 }, { "epoch": 0.9960004775549188, "grad_norm": 0.5450401306152344, "learning_rate": 2.9509426978361073e-05, "loss": 0.5236, "mean_token_accuracy": 0.8299296259880066, "num_tokens": 558976076.0, "step": 16685 }, { "epoch": 0.9962989493791786, "grad_norm": 0.5304602384567261, "learning_rate": 2.9498366758761197e-05, "loss": 0.5253, "mean_token_accuracy": 0.8312477588653564, "num_tokens": 559143756.0, "step": 16690 }, { "epoch": 0.9965974212034384, "grad_norm": 0.4793808162212372, "learning_rate": 2.9487306052422213e-05, "loss": 0.4857, "mean_token_accuracy": 0.8424490213394165, "num_tokens": 559311436.0, "step": 16695 }, { "epoch": 0.9968958930276982, "grad_norm": 0.49623870849609375, "learning_rate": 2.9476244862038168e-05, "loss": 0.4742, "mean_token_accuracy": 0.845276153087616, "num_tokens": 559479116.0, "step": 16700 }, { "epoch": 0.997194364851958, "grad_norm": 0.5237420201301575, "learning_rate": 2.94651831903032e-05, "loss": 0.5282, "mean_token_accuracy": 0.8301801204681396, "num_tokens": 559646796.0, "step": 16705 }, { "epoch": 0.9974928366762178, "grad_norm": 0.510319173336029, "learning_rate": 2.9454121039911597e-05, "loss": 0.5267, "mean_token_accuracy": 0.8292496800422668, "num_tokens": 559814476.0, "step": 16710 }, { "epoch": 0.9977913085004776, "grad_norm": 0.5285435914993286, "learning_rate": 2.9443058413557746e-05, "loss": 0.5276, "mean_token_accuracy": 0.8320768117904663, "num_tokens": 559982156.0, "step": 16715 }, { "epoch": 0.9980897803247374, "grad_norm": 0.49364015460014343, "learning_rate": 2.943199531393615e-05, "loss": 0.4902, "mean_token_accuracy": 0.842454981803894, "num_tokens": 560149836.0, "step": 16720 }, { "epoch": 0.9983882521489972, "grad_norm": 0.4487300217151642, "learning_rate": 2.9420931743741443e-05, "loss": 0.5389, "mean_token_accuracy": 0.826690924167633, "num_tokens": 560317516.0, "step": 16725 }, { "epoch": 0.998686723973257, "grad_norm": 0.5413393378257751, "learning_rate": 2.9409867705668343e-05, "loss": 0.5819, "mean_token_accuracy": 0.8143027544021606, "num_tokens": 560485196.0, "step": 16730 }, { "epoch": 0.9989851957975168, "grad_norm": 0.4478927254676819, "learning_rate": 2.9398803202411718e-05, "loss": 0.5078, "mean_token_accuracy": 0.8353453397750854, "num_tokens": 560652876.0, "step": 16735 }, { "epoch": 0.9992836676217765, "grad_norm": 0.5136594176292419, "learning_rate": 2.9387738236666522e-05, "loss": 0.5292, "mean_token_accuracy": 0.8307467579841614, "num_tokens": 560820556.0, "step": 16740 }, { "epoch": 0.9995821394460362, "grad_norm": 0.604115903377533, "learning_rate": 2.9376672811127837e-05, "loss": 0.5076, "mean_token_accuracy": 0.8377668976783752, "num_tokens": 560988236.0, "step": 16745 }, { "epoch": 0.999880611270296, "grad_norm": 0.498207688331604, "learning_rate": 2.9365606928490847e-05, "loss": 0.4937, "mean_token_accuracy": 0.8410831332206726, "num_tokens": 561155916.0, "step": 16750 }, { "epoch": 1.0001790830945558, "grad_norm": 0.4530643820762634, "learning_rate": 2.9354540591450863e-05, "loss": 0.4816, "mean_token_accuracy": 0.8472503900527955, "num_tokens": 561306828.0, "step": 16755 }, { "epoch": 1.0004775549188156, "grad_norm": 0.5342052578926086, "learning_rate": 2.934347380270327e-05, "loss": 0.4478, "mean_token_accuracy": 0.8516342759132385, "num_tokens": 561474508.0, "step": 16760 }, { "epoch": 1.0007760267430754, "grad_norm": 0.5704651474952698, "learning_rate": 2.933240656494362e-05, "loss": 0.4743, "mean_token_accuracy": 0.8443039655685425, "num_tokens": 561642188.0, "step": 16765 }, { "epoch": 1.0010744985673352, "grad_norm": 0.507685661315918, "learning_rate": 2.9321338880867523e-05, "loss": 0.4416, "mean_token_accuracy": 0.8531253695487976, "num_tokens": 561809868.0, "step": 16770 }, { "epoch": 1.001372970391595, "grad_norm": 0.5275362133979797, "learning_rate": 2.9310270753170727e-05, "loss": 0.45, "mean_token_accuracy": 0.8498031616210937, "num_tokens": 561977548.0, "step": 16775 }, { "epoch": 1.0016714422158548, "grad_norm": 0.6999732851982117, "learning_rate": 2.929920218454907e-05, "loss": 0.4486, "mean_token_accuracy": 0.8510557174682617, "num_tokens": 562145228.0, "step": 16780 }, { "epoch": 1.0019699140401146, "grad_norm": 0.5578259229660034, "learning_rate": 2.9288133177698512e-05, "loss": 0.4745, "mean_token_accuracy": 0.8445007801055908, "num_tokens": 562312908.0, "step": 16785 }, { "epoch": 1.0022683858643744, "grad_norm": 0.4876433312892914, "learning_rate": 2.927706373531512e-05, "loss": 0.4543, "mean_token_accuracy": 0.8488130807876587, "num_tokens": 562480588.0, "step": 16790 }, { "epoch": 1.0025668576886342, "grad_norm": 0.6000683307647705, "learning_rate": 2.9265993860095054e-05, "loss": 0.4734, "mean_token_accuracy": 0.8427889823913575, "num_tokens": 562648268.0, "step": 16795 }, { "epoch": 1.002865329512894, "grad_norm": 0.485141783952713, "learning_rate": 2.9254923554734576e-05, "loss": 0.4666, "mean_token_accuracy": 0.847439420223236, "num_tokens": 562809626.0, "step": 16800 }, { "epoch": 1.0031638013371538, "grad_norm": 0.6160366535186768, "learning_rate": 2.9243852821930084e-05, "loss": 0.5028, "mean_token_accuracy": 0.8355898857116699, "num_tokens": 562977306.0, "step": 16805 }, { "epoch": 1.0034622731614136, "grad_norm": 0.5146437287330627, "learning_rate": 2.9232781664378045e-05, "loss": 0.4767, "mean_token_accuracy": 0.8429142236709595, "num_tokens": 563144986.0, "step": 16810 }, { "epoch": 1.0037607449856734, "grad_norm": 0.4939850866794586, "learning_rate": 2.922171008477505e-05, "loss": 0.4746, "mean_token_accuracy": 0.8421149969100952, "num_tokens": 563312666.0, "step": 16815 }, { "epoch": 1.0040592168099332, "grad_norm": 0.582244873046875, "learning_rate": 2.9210638085817782e-05, "loss": 0.4888, "mean_token_accuracy": 0.8381903767585754, "num_tokens": 563480346.0, "step": 16820 }, { "epoch": 1.004357688634193, "grad_norm": 0.47668758034706116, "learning_rate": 2.9199565670203037e-05, "loss": 0.4268, "mean_token_accuracy": 0.8566980719566345, "num_tokens": 563648026.0, "step": 16825 }, { "epoch": 1.0046561604584527, "grad_norm": 0.45325109362602234, "learning_rate": 2.9188492840627703e-05, "loss": 0.4134, "mean_token_accuracy": 0.8622331023216248, "num_tokens": 563815706.0, "step": 16830 }, { "epoch": 1.0049546322827125, "grad_norm": 0.4884812831878662, "learning_rate": 2.9177419599788775e-05, "loss": 0.4464, "mean_token_accuracy": 0.8531015157699585, "num_tokens": 563983386.0, "step": 16835 }, { "epoch": 1.0052531041069723, "grad_norm": 0.5482917428016663, "learning_rate": 2.9166345950383332e-05, "loss": 0.4632, "mean_token_accuracy": 0.8472265243530274, "num_tokens": 564151066.0, "step": 16840 }, { "epoch": 1.0055515759312321, "grad_norm": 0.5486830472946167, "learning_rate": 2.915527189510858e-05, "loss": 0.4522, "mean_token_accuracy": 0.8506859183311463, "num_tokens": 564318746.0, "step": 16845 }, { "epoch": 1.005850047755492, "grad_norm": 0.4940612316131592, "learning_rate": 2.9144197436661815e-05, "loss": 0.4871, "mean_token_accuracy": 0.8405463457107544, "num_tokens": 564486426.0, "step": 16850 }, { "epoch": 1.0061485195797517, "grad_norm": 0.4869822859764099, "learning_rate": 2.91331225777404e-05, "loss": 0.4357, "mean_token_accuracy": 0.8547298073768616, "num_tokens": 564654106.0, "step": 16855 }, { "epoch": 1.0064469914040115, "grad_norm": 0.4788769781589508, "learning_rate": 2.912204732104185e-05, "loss": 0.4439, "mean_token_accuracy": 0.8526959300041199, "num_tokens": 564821786.0, "step": 16860 }, { "epoch": 1.0067454632282713, "grad_norm": 0.4857526123523712, "learning_rate": 2.9110971669263727e-05, "loss": 0.4698, "mean_token_accuracy": 0.8441488742828369, "num_tokens": 564989466.0, "step": 16865 }, { "epoch": 1.007043935052531, "grad_norm": 0.5107783675193787, "learning_rate": 2.9099895625103724e-05, "loss": 0.4423, "mean_token_accuracy": 0.8519086122512818, "num_tokens": 565157146.0, "step": 16870 }, { "epoch": 1.0073424068767909, "grad_norm": 0.542470395565033, "learning_rate": 2.9088819191259603e-05, "loss": 0.4927, "mean_token_accuracy": 0.8381546139717102, "num_tokens": 565324826.0, "step": 16875 }, { "epoch": 1.0076408787010507, "grad_norm": 0.4769206941127777, "learning_rate": 2.9077742370429252e-05, "loss": 0.4668, "mean_token_accuracy": 0.846540629863739, "num_tokens": 565492506.0, "step": 16880 }, { "epoch": 1.0079393505253105, "grad_norm": 0.5395178198814392, "learning_rate": 2.9066665165310614e-05, "loss": 0.4516, "mean_token_accuracy": 0.8497673988342285, "num_tokens": 565660186.0, "step": 16885 }, { "epoch": 1.0082378223495703, "grad_norm": 0.5594173669815063, "learning_rate": 2.9055587578601767e-05, "loss": 0.4627, "mean_token_accuracy": 0.8471191644668579, "num_tokens": 565827866.0, "step": 16890 }, { "epoch": 1.00853629417383, "grad_norm": 0.5374088883399963, "learning_rate": 2.904450961300084e-05, "loss": 0.4725, "mean_token_accuracy": 0.8446319818496704, "num_tokens": 565995546.0, "step": 16895 }, { "epoch": 1.0088347659980899, "grad_norm": 0.6275510787963867, "learning_rate": 2.9033431271206106e-05, "loss": 0.4645, "mean_token_accuracy": 0.8466241240501404, "num_tokens": 566163226.0, "step": 16900 }, { "epoch": 1.0091332378223496, "grad_norm": 0.5092877745628357, "learning_rate": 2.9022352555915865e-05, "loss": 0.4399, "mean_token_accuracy": 0.852731728553772, "num_tokens": 566330906.0, "step": 16905 }, { "epoch": 1.0094317096466094, "grad_norm": 0.5199767351150513, "learning_rate": 2.9011273469828577e-05, "loss": 0.436, "mean_token_accuracy": 0.8531432628631592, "num_tokens": 566498586.0, "step": 16910 }, { "epoch": 1.0097301814708692, "grad_norm": 0.4958356022834778, "learning_rate": 2.9000194015642728e-05, "loss": 0.4413, "mean_token_accuracy": 0.8539126873016357, "num_tokens": 566666266.0, "step": 16915 }, { "epoch": 1.010028653295129, "grad_norm": 0.49873092770576477, "learning_rate": 2.8989114196056938e-05, "loss": 0.4339, "mean_token_accuracy": 0.8544793128967285, "num_tokens": 566833946.0, "step": 16920 }, { "epoch": 1.0103271251193888, "grad_norm": 0.5273438692092896, "learning_rate": 2.89780340137699e-05, "loss": 0.4748, "mean_token_accuracy": 0.8430037021636962, "num_tokens": 567001626.0, "step": 16925 }, { "epoch": 1.0106255969436486, "grad_norm": 0.4734728932380676, "learning_rate": 2.8966953471480396e-05, "loss": 0.4599, "mean_token_accuracy": 0.8484253764152527, "num_tokens": 567169306.0, "step": 16930 }, { "epoch": 1.0109240687679084, "grad_norm": 0.4726862609386444, "learning_rate": 2.89558725718873e-05, "loss": 0.4581, "mean_token_accuracy": 0.850614321231842, "num_tokens": 567336986.0, "step": 16935 }, { "epoch": 1.0112225405921682, "grad_norm": 0.4735446572303772, "learning_rate": 2.894479131768957e-05, "loss": 0.4558, "mean_token_accuracy": 0.8485148429870606, "num_tokens": 567504666.0, "step": 16940 }, { "epoch": 1.011521012416428, "grad_norm": 0.5605773329734802, "learning_rate": 2.8933709711586245e-05, "loss": 0.4495, "mean_token_accuracy": 0.8506501197814942, "num_tokens": 567672346.0, "step": 16945 }, { "epoch": 1.0118194842406876, "grad_norm": 0.49589309096336365, "learning_rate": 2.892262775627645e-05, "loss": 0.4338, "mean_token_accuracy": 0.8552129387855529, "num_tokens": 567840026.0, "step": 16950 }, { "epoch": 1.0121179560649474, "grad_norm": 0.4771466553211212, "learning_rate": 2.8911545454459406e-05, "loss": 0.4663, "mean_token_accuracy": 0.8449660062789917, "num_tokens": 568007706.0, "step": 16955 }, { "epoch": 1.0124164278892072, "grad_norm": 0.47365638613700867, "learning_rate": 2.8900462808834416e-05, "loss": 0.4221, "mean_token_accuracy": 0.8595013737678527, "num_tokens": 568175386.0, "step": 16960 }, { "epoch": 1.012714899713467, "grad_norm": 0.4710509181022644, "learning_rate": 2.888937982210086e-05, "loss": 0.4494, "mean_token_accuracy": 0.8508409857749939, "num_tokens": 568343066.0, "step": 16965 }, { "epoch": 1.0130133715377267, "grad_norm": 0.47972074151039124, "learning_rate": 2.8878296496958208e-05, "loss": 0.4292, "mean_token_accuracy": 0.8572110176086426, "num_tokens": 568510746.0, "step": 16970 }, { "epoch": 1.0133118433619865, "grad_norm": 0.5294878482818604, "learning_rate": 2.8867212836105995e-05, "loss": 0.4742, "mean_token_accuracy": 0.8439281940460205, "num_tokens": 568678426.0, "step": 16975 }, { "epoch": 1.0136103151862463, "grad_norm": 0.49053069949150085, "learning_rate": 2.8856128842243863e-05, "loss": 0.4593, "mean_token_accuracy": 0.8476857900619507, "num_tokens": 568846106.0, "step": 16980 }, { "epoch": 1.0139087870105061, "grad_norm": 0.9311002492904663, "learning_rate": 2.8845044518071523e-05, "loss": 0.4713, "mean_token_accuracy": 0.845526647567749, "num_tokens": 569013786.0, "step": 16985 }, { "epoch": 1.014207258834766, "grad_norm": 0.4663950502872467, "learning_rate": 2.883395986628876e-05, "loss": 0.4119, "mean_token_accuracy": 0.8621734499931335, "num_tokens": 569181466.0, "step": 16990 }, { "epoch": 1.0145057306590257, "grad_norm": 0.49617063999176025, "learning_rate": 2.8822874889595452e-05, "loss": 0.464, "mean_token_accuracy": 0.8443337678909302, "num_tokens": 569349146.0, "step": 16995 }, { "epoch": 1.0148042024832855, "grad_norm": 0.5455309748649597, "learning_rate": 2.8811789590691545e-05, "loss": 0.4584, "mean_token_accuracy": 0.8497912526130676, "num_tokens": 569516826.0, "step": 17000 }, { "epoch": 1.0151026743075453, "grad_norm": 0.5212569832801819, "learning_rate": 2.8800703972277077e-05, "loss": 0.4531, "mean_token_accuracy": 0.8501013994216919, "num_tokens": 569684506.0, "step": 17005 }, { "epoch": 1.015401146131805, "grad_norm": 0.5574789047241211, "learning_rate": 2.878961803705214e-05, "loss": 0.4558, "mean_token_accuracy": 0.8503101468086243, "num_tokens": 569852186.0, "step": 17010 }, { "epoch": 1.015699617956065, "grad_norm": 0.48953911662101746, "learning_rate": 2.8778531787716935e-05, "loss": 0.4464, "mean_token_accuracy": 0.8512883186340332, "num_tokens": 570019866.0, "step": 17015 }, { "epoch": 1.0159980897803247, "grad_norm": 0.502530038356781, "learning_rate": 2.8767445226971706e-05, "loss": 0.4501, "mean_token_accuracy": 0.8508469581604003, "num_tokens": 570187546.0, "step": 17020 }, { "epoch": 1.0162965616045845, "grad_norm": 0.4684881567955017, "learning_rate": 2.8756358357516805e-05, "loss": 0.4372, "mean_token_accuracy": 0.854825246334076, "num_tokens": 570355226.0, "step": 17025 }, { "epoch": 1.0165950334288443, "grad_norm": 0.49216997623443604, "learning_rate": 2.874527118205263e-05, "loss": 0.4222, "mean_token_accuracy": 0.8588094830513, "num_tokens": 570522906.0, "step": 17030 }, { "epoch": 1.016893505253104, "grad_norm": 0.5277981162071228, "learning_rate": 2.8734183703279672e-05, "loss": 0.4504, "mean_token_accuracy": 0.8510020256042481, "num_tokens": 570690586.0, "step": 17035 }, { "epoch": 1.0171919770773639, "grad_norm": 0.5185527801513672, "learning_rate": 2.8723095923898496e-05, "loss": 0.4586, "mean_token_accuracy": 0.8484373092651367, "num_tokens": 570858266.0, "step": 17040 }, { "epoch": 1.0174904489016237, "grad_norm": 0.4920758903026581, "learning_rate": 2.871200784660973e-05, "loss": 0.4603, "mean_token_accuracy": 0.8466778039932251, "num_tokens": 571025946.0, "step": 17045 }, { "epoch": 1.0177889207258835, "grad_norm": 0.8109400272369385, "learning_rate": 2.870091947411408e-05, "loss": 0.4408, "mean_token_accuracy": 0.8544315934181214, "num_tokens": 571193626.0, "step": 17050 }, { "epoch": 1.0180873925501432, "grad_norm": 0.45363783836364746, "learning_rate": 2.8689830809112332e-05, "loss": 0.4377, "mean_token_accuracy": 0.8521770238876343, "num_tokens": 571361306.0, "step": 17055 }, { "epoch": 1.018385864374403, "grad_norm": 0.5127880573272705, "learning_rate": 2.8678741854305324e-05, "loss": 0.4584, "mean_token_accuracy": 0.8477036714553833, "num_tokens": 571528986.0, "step": 17060 }, { "epoch": 1.0186843361986628, "grad_norm": 0.5394222140312195, "learning_rate": 2.8667652612393984e-05, "loss": 0.489, "mean_token_accuracy": 0.8415179133415223, "num_tokens": 571696632.0, "step": 17065 }, { "epoch": 1.0189828080229226, "grad_norm": 0.5126343965530396, "learning_rate": 2.8656563086079298e-05, "loss": 0.425, "mean_token_accuracy": 0.8594894409179688, "num_tokens": 571864312.0, "step": 17070 }, { "epoch": 1.0192812798471824, "grad_norm": 0.4388852119445801, "learning_rate": 2.8645473278062324e-05, "loss": 0.4298, "mean_token_accuracy": 0.8574376583099366, "num_tokens": 572031992.0, "step": 17075 }, { "epoch": 1.0195797516714422, "grad_norm": 0.4752084016799927, "learning_rate": 2.863438319104419e-05, "loss": 0.4893, "mean_token_accuracy": 0.8369438290596009, "num_tokens": 572199672.0, "step": 17080 }, { "epoch": 1.019878223495702, "grad_norm": 0.4854961335659027, "learning_rate": 2.8623292827726094e-05, "loss": 0.4365, "mean_token_accuracy": 0.8559226989746094, "num_tokens": 572367352.0, "step": 17085 }, { "epoch": 1.0201766953199618, "grad_norm": 0.5119810700416565, "learning_rate": 2.8612202190809285e-05, "loss": 0.4684, "mean_token_accuracy": 0.8451747536659241, "num_tokens": 572535032.0, "step": 17090 }, { "epoch": 1.0204751671442216, "grad_norm": 0.6095016598701477, "learning_rate": 2.8601111282995107e-05, "loss": 0.4793, "mean_token_accuracy": 0.8423833966255188, "num_tokens": 572702712.0, "step": 17095 }, { "epoch": 1.0207736389684814, "grad_norm": 0.49058955907821655, "learning_rate": 2.859002010698496e-05, "loss": 0.4483, "mean_token_accuracy": 0.8513121843338013, "num_tokens": 572870392.0, "step": 17100 }, { "epoch": 1.0210721107927412, "grad_norm": 0.5054989457130432, "learning_rate": 2.8578928665480297e-05, "loss": 0.4787, "mean_token_accuracy": 0.8425086498260498, "num_tokens": 573038072.0, "step": 17105 }, { "epoch": 1.021370582617001, "grad_norm": 0.5695568323135376, "learning_rate": 2.856783696118263e-05, "loss": 0.4453, "mean_token_accuracy": 0.8508708238601684, "num_tokens": 573205752.0, "step": 17110 }, { "epoch": 1.0216690544412608, "grad_norm": 0.505357563495636, "learning_rate": 2.855674499679357e-05, "loss": 0.4176, "mean_token_accuracy": 0.8603320598602295, "num_tokens": 573366307.0, "step": 17115 }, { "epoch": 1.0219675262655206, "grad_norm": 0.5061622858047485, "learning_rate": 2.854565277501476e-05, "loss": 0.4623, "mean_token_accuracy": 0.8473219633102417, "num_tokens": 573533987.0, "step": 17120 }, { "epoch": 1.0222659980897804, "grad_norm": 0.5195006132125854, "learning_rate": 2.853456029854792e-05, "loss": 0.4728, "mean_token_accuracy": 0.8437790751457215, "num_tokens": 573701667.0, "step": 17125 }, { "epoch": 1.0225644699140402, "grad_norm": 0.4995376169681549, "learning_rate": 2.8523467570094816e-05, "loss": 0.4497, "mean_token_accuracy": 0.8511571049690246, "num_tokens": 573869347.0, "step": 17130 }, { "epoch": 1.0228629417383, "grad_norm": 0.5369946360588074, "learning_rate": 2.8512374592357307e-05, "loss": 0.463, "mean_token_accuracy": 0.8471847891807556, "num_tokens": 574037027.0, "step": 17135 }, { "epoch": 1.0231614135625597, "grad_norm": 0.4634990394115448, "learning_rate": 2.8501281368037265e-05, "loss": 0.4673, "mean_token_accuracy": 0.8460455775260926, "num_tokens": 574204707.0, "step": 17140 }, { "epoch": 1.0234598853868195, "grad_norm": 0.5192978978157043, "learning_rate": 2.8490187899836668e-05, "loss": 0.4741, "mean_token_accuracy": 0.8441548466682434, "num_tokens": 574372387.0, "step": 17145 }, { "epoch": 1.0237583572110793, "grad_norm": 0.5141440629959106, "learning_rate": 2.847909419045754e-05, "loss": 0.4441, "mean_token_accuracy": 0.8513002395629883, "num_tokens": 574540067.0, "step": 17150 }, { "epoch": 1.0240568290353391, "grad_norm": 0.4873316287994385, "learning_rate": 2.8468000242601938e-05, "loss": 0.4313, "mean_token_accuracy": 0.8560896992683411, "num_tokens": 574707747.0, "step": 17155 }, { "epoch": 1.024355300859599, "grad_norm": 0.4926494061946869, "learning_rate": 2.8456906058972014e-05, "loss": 0.4723, "mean_token_accuracy": 0.8442025423049927, "num_tokens": 574875427.0, "step": 17160 }, { "epoch": 1.0246537726838587, "grad_norm": 0.5193369388580322, "learning_rate": 2.844581164226996e-05, "loss": 0.4447, "mean_token_accuracy": 0.8519265174865722, "num_tokens": 575043107.0, "step": 17165 }, { "epoch": 1.0249522445081185, "grad_norm": 0.5441789031028748, "learning_rate": 2.8434716995198018e-05, "loss": 0.4491, "mean_token_accuracy": 0.8523917436599732, "num_tokens": 575210787.0, "step": 17170 }, { "epoch": 1.0252507163323783, "grad_norm": 0.4996625483036041, "learning_rate": 2.8423622120458504e-05, "loss": 0.4385, "mean_token_accuracy": 0.8542705535888672, "num_tokens": 575378467.0, "step": 17175 }, { "epoch": 1.025549188156638, "grad_norm": 0.4947277903556824, "learning_rate": 2.8412527020753778e-05, "loss": 0.5025, "mean_token_accuracy": 0.8346295952796936, "num_tokens": 575546147.0, "step": 17180 }, { "epoch": 1.0258476599808979, "grad_norm": 0.475445032119751, "learning_rate": 2.840143169878624e-05, "loss": 0.422, "mean_token_accuracy": 0.8593820810317994, "num_tokens": 575713827.0, "step": 17185 }, { "epoch": 1.0261461318051577, "grad_norm": 0.5255208015441895, "learning_rate": 2.839033615725838e-05, "loss": 0.4577, "mean_token_accuracy": 0.8482762813568115, "num_tokens": 575881507.0, "step": 17190 }, { "epoch": 1.0264446036294175, "grad_norm": 0.4992060661315918, "learning_rate": 2.837924039887272e-05, "loss": 0.4456, "mean_token_accuracy": 0.8530239820480346, "num_tokens": 576049187.0, "step": 17195 }, { "epoch": 1.0267430754536773, "grad_norm": 0.5629310607910156, "learning_rate": 2.8368144426331822e-05, "loss": 0.4599, "mean_token_accuracy": 0.8473577499389648, "num_tokens": 576216867.0, "step": 17200 }, { "epoch": 1.027041547277937, "grad_norm": 0.49983513355255127, "learning_rate": 2.835704824233833e-05, "loss": 0.4679, "mean_token_accuracy": 0.844787061214447, "num_tokens": 576384547.0, "step": 17205 }, { "epoch": 1.0273400191021969, "grad_norm": 0.547196090221405, "learning_rate": 2.834595184959492e-05, "loss": 0.4606, "mean_token_accuracy": 0.8481032967567443, "num_tokens": 576552227.0, "step": 17210 }, { "epoch": 1.0276384909264566, "grad_norm": 1.0569008588790894, "learning_rate": 2.833485525080432e-05, "loss": 0.4602, "mean_token_accuracy": 0.8474352836608887, "num_tokens": 576719907.0, "step": 17215 }, { "epoch": 1.0279369627507164, "grad_norm": 0.46995216608047485, "learning_rate": 2.832375844866932e-05, "loss": 0.4678, "mean_token_accuracy": 0.8440832614898681, "num_tokens": 576887587.0, "step": 17220 }, { "epoch": 1.028235434574976, "grad_norm": 0.519990086555481, "learning_rate": 2.8312661445892737e-05, "loss": 0.4546, "mean_token_accuracy": 0.8497554540634156, "num_tokens": 577055267.0, "step": 17225 }, { "epoch": 1.0285339063992358, "grad_norm": 0.566287100315094, "learning_rate": 2.830156424517746e-05, "loss": 0.4719, "mean_token_accuracy": 0.8484850287437439, "num_tokens": 577222947.0, "step": 17230 }, { "epoch": 1.0288323782234956, "grad_norm": 0.5435327291488647, "learning_rate": 2.829046684922642e-05, "loss": 0.4313, "mean_token_accuracy": 0.8563760042190551, "num_tokens": 577390627.0, "step": 17235 }, { "epoch": 1.0291308500477554, "grad_norm": 0.5225843191146851, "learning_rate": 2.827936926074259e-05, "loss": 0.4533, "mean_token_accuracy": 0.8496421456336976, "num_tokens": 577558307.0, "step": 17240 }, { "epoch": 1.0294293218720152, "grad_norm": 0.5453398823738098, "learning_rate": 2.8268271482428986e-05, "loss": 0.4557, "mean_token_accuracy": 0.8497852802276611, "num_tokens": 577725987.0, "step": 17245 }, { "epoch": 1.029727793696275, "grad_norm": 0.5361250042915344, "learning_rate": 2.8257173516988688e-05, "loss": 0.4621, "mean_token_accuracy": 0.8470177769660949, "num_tokens": 577893667.0, "step": 17250 }, { "epoch": 1.0300262655205348, "grad_norm": 0.52862149477005, "learning_rate": 2.8246075367124797e-05, "loss": 0.4372, "mean_token_accuracy": 0.854598593711853, "num_tokens": 578061347.0, "step": 17255 }, { "epoch": 1.0303247373447946, "grad_norm": 0.5109806656837463, "learning_rate": 2.8234977035540483e-05, "loss": 0.445, "mean_token_accuracy": 0.8528033018112182, "num_tokens": 578229027.0, "step": 17260 }, { "epoch": 1.0306232091690544, "grad_norm": 0.5901560187339783, "learning_rate": 2.822387852493894e-05, "loss": 0.518, "mean_token_accuracy": 0.829023003578186, "num_tokens": 578396707.0, "step": 17265 }, { "epoch": 1.0309216809933142, "grad_norm": 0.5068350434303284, "learning_rate": 2.8212779838023435e-05, "loss": 0.4557, "mean_token_accuracy": 0.8478527903556824, "num_tokens": 578564387.0, "step": 17270 }, { "epoch": 1.031220152817574, "grad_norm": 0.505020260810852, "learning_rate": 2.820168097749723e-05, "loss": 0.4771, "mean_token_accuracy": 0.8409638643264771, "num_tokens": 578732067.0, "step": 17275 }, { "epoch": 1.0315186246418337, "grad_norm": 0.5027931332588196, "learning_rate": 2.8190581946063678e-05, "loss": 0.4371, "mean_token_accuracy": 0.8546224594116211, "num_tokens": 578899747.0, "step": 17280 }, { "epoch": 1.0318170964660935, "grad_norm": 0.5489554405212402, "learning_rate": 2.817948274642614e-05, "loss": 0.4635, "mean_token_accuracy": 0.8468626976013184, "num_tokens": 579067427.0, "step": 17285 }, { "epoch": 1.0321155682903533, "grad_norm": 0.5266851186752319, "learning_rate": 2.8168383381288042e-05, "loss": 0.4685, "mean_token_accuracy": 0.8452701807022095, "num_tokens": 579235107.0, "step": 17290 }, { "epoch": 1.0324140401146131, "grad_norm": 0.6650678515434265, "learning_rate": 2.815728385335284e-05, "loss": 0.4371, "mean_token_accuracy": 0.8537755012512207, "num_tokens": 579402787.0, "step": 17295 }, { "epoch": 1.032712511938873, "grad_norm": 0.5936505198478699, "learning_rate": 2.8146184165324017e-05, "loss": 0.4865, "mean_token_accuracy": 0.8394429206848144, "num_tokens": 579570467.0, "step": 17300 }, { "epoch": 1.0330109837631327, "grad_norm": 0.4832249581813812, "learning_rate": 2.8135084319905114e-05, "loss": 0.4342, "mean_token_accuracy": 0.8561075925827026, "num_tokens": 579738147.0, "step": 17305 }, { "epoch": 1.0333094555873925, "grad_norm": 0.4816303849220276, "learning_rate": 2.81239843197997e-05, "loss": 0.4378, "mean_token_accuracy": 0.8538649678230286, "num_tokens": 579905827.0, "step": 17310 }, { "epoch": 1.0336079274116523, "grad_norm": 0.4698982834815979, "learning_rate": 2.8112884167711395e-05, "loss": 0.4483, "mean_token_accuracy": 0.8508588790893554, "num_tokens": 580073507.0, "step": 17315 }, { "epoch": 1.033906399235912, "grad_norm": 0.5446665287017822, "learning_rate": 2.8101783866343834e-05, "loss": 0.4853, "mean_token_accuracy": 0.8409340500831604, "num_tokens": 580241187.0, "step": 17320 }, { "epoch": 1.034204871060172, "grad_norm": 0.49912068247795105, "learning_rate": 2.8090683418400705e-05, "loss": 0.45, "mean_token_accuracy": 0.850888705253601, "num_tokens": 580408867.0, "step": 17325 }, { "epoch": 1.0345033428844317, "grad_norm": 0.5369765758514404, "learning_rate": 2.8079582826585726e-05, "loss": 0.4346, "mean_token_accuracy": 0.8561493039131165, "num_tokens": 580571963.0, "step": 17330 }, { "epoch": 1.0348018147086915, "grad_norm": 0.527891993522644, "learning_rate": 2.8068482093602648e-05, "loss": 0.4669, "mean_token_accuracy": 0.8471967101097106, "num_tokens": 580739643.0, "step": 17335 }, { "epoch": 1.0351002865329513, "grad_norm": 0.5228444933891296, "learning_rate": 2.8057381222155266e-05, "loss": 0.4325, "mean_token_accuracy": 0.8545926213264465, "num_tokens": 580907323.0, "step": 17340 }, { "epoch": 1.035398758357211, "grad_norm": 0.516074001789093, "learning_rate": 2.8046280214947402e-05, "loss": 0.4469, "mean_token_accuracy": 0.8509722113609314, "num_tokens": 581075003.0, "step": 17345 }, { "epoch": 1.0356972301814709, "grad_norm": 0.5603234171867371, "learning_rate": 2.8035179074682903e-05, "loss": 0.4401, "mean_token_accuracy": 0.8537277817726135, "num_tokens": 581242683.0, "step": 17350 }, { "epoch": 1.0359957020057307, "grad_norm": 0.5408618450164795, "learning_rate": 2.8024077804065675e-05, "loss": 0.4893, "mean_token_accuracy": 0.8401467323303222, "num_tokens": 581410363.0, "step": 17355 }, { "epoch": 1.0362941738299905, "grad_norm": 0.5174014568328857, "learning_rate": 2.8012976405799613e-05, "loss": 0.4447, "mean_token_accuracy": 0.8507694125175476, "num_tokens": 581578043.0, "step": 17360 }, { "epoch": 1.0365926456542502, "grad_norm": 0.5578038692474365, "learning_rate": 2.800187488258868e-05, "loss": 0.4741, "mean_token_accuracy": 0.8429082751274108, "num_tokens": 581745723.0, "step": 17365 }, { "epoch": 1.03689111747851, "grad_norm": 0.4302588999271393, "learning_rate": 2.799077323713685e-05, "loss": 0.4451, "mean_token_accuracy": 0.8539126873016357, "num_tokens": 581913403.0, "step": 17370 }, { "epoch": 1.0371895893027698, "grad_norm": 0.5418205261230469, "learning_rate": 2.7979671472148155e-05, "loss": 0.4419, "mean_token_accuracy": 0.8527615427970886, "num_tokens": 582081083.0, "step": 17375 }, { "epoch": 1.0374880611270296, "grad_norm": 0.4621972441673279, "learning_rate": 2.796856959032661e-05, "loss": 0.4606, "mean_token_accuracy": 0.8478527903556824, "num_tokens": 582248763.0, "step": 17380 }, { "epoch": 1.0377865329512894, "grad_norm": 0.5263122916221619, "learning_rate": 2.7957467594376298e-05, "loss": 0.4812, "mean_token_accuracy": 0.8401347875595093, "num_tokens": 582416443.0, "step": 17385 }, { "epoch": 1.0380850047755492, "grad_norm": 0.7780062556266785, "learning_rate": 2.7946365487001296e-05, "loss": 0.5261, "mean_token_accuracy": 0.8289514422416687, "num_tokens": 582584123.0, "step": 17390 }, { "epoch": 1.038383476599809, "grad_norm": 0.5385981202125549, "learning_rate": 2.7935263270905753e-05, "loss": 0.4359, "mean_token_accuracy": 0.8551890850067139, "num_tokens": 582751803.0, "step": 17395 }, { "epoch": 1.0386819484240688, "grad_norm": 0.5106160044670105, "learning_rate": 2.7924160948793798e-05, "loss": 0.4171, "mean_token_accuracy": 0.8621495962142944, "num_tokens": 582919483.0, "step": 17400 }, { "epoch": 1.0389804202483286, "grad_norm": 0.5020478963851929, "learning_rate": 2.7913058523369624e-05, "loss": 0.4232, "mean_token_accuracy": 0.8596325755119324, "num_tokens": 583087163.0, "step": 17405 }, { "epoch": 1.0392788920725884, "grad_norm": 0.44876614212989807, "learning_rate": 2.790195599733742e-05, "loss": 0.4332, "mean_token_accuracy": 0.855815327167511, "num_tokens": 583254843.0, "step": 17410 }, { "epoch": 1.0395773638968482, "grad_norm": 0.5497639775276184, "learning_rate": 2.7890853373401416e-05, "loss": 0.4538, "mean_token_accuracy": 0.8499463319778442, "num_tokens": 583422523.0, "step": 17415 }, { "epoch": 1.039875835721108, "grad_norm": 0.5056437253952026, "learning_rate": 2.7879750654265847e-05, "loss": 0.452, "mean_token_accuracy": 0.8501790165901184, "num_tokens": 583589980.0, "step": 17420 }, { "epoch": 1.0401743075453678, "grad_norm": 0.5529347062110901, "learning_rate": 2.7868647842635008e-05, "loss": 0.4595, "mean_token_accuracy": 0.8473994970321655, "num_tokens": 583757660.0, "step": 17425 }, { "epoch": 1.0404727793696276, "grad_norm": 0.5109863877296448, "learning_rate": 2.785754494121317e-05, "loss": 0.4854, "mean_token_accuracy": 0.8398544669151307, "num_tokens": 583925340.0, "step": 17430 }, { "epoch": 1.0407712511938874, "grad_norm": 0.6398445963859558, "learning_rate": 2.784644195270468e-05, "loss": 0.4313, "mean_token_accuracy": 0.8573422312736512, "num_tokens": 584093020.0, "step": 17435 }, { "epoch": 1.0410697230181472, "grad_norm": 0.5292453765869141, "learning_rate": 2.7835338879813842e-05, "loss": 0.4518, "mean_token_accuracy": 0.8505725979804992, "num_tokens": 584260700.0, "step": 17440 }, { "epoch": 1.041368194842407, "grad_norm": 0.521647572517395, "learning_rate": 2.7824235725245042e-05, "loss": 0.4691, "mean_token_accuracy": 0.8443993926048279, "num_tokens": 584428380.0, "step": 17445 }, { "epoch": 1.0416666666666667, "grad_norm": 0.4575381875038147, "learning_rate": 2.7813132491702637e-05, "loss": 0.4374, "mean_token_accuracy": 0.8541274070739746, "num_tokens": 584596060.0, "step": 17450 }, { "epoch": 1.0419651384909265, "grad_norm": 0.5322298407554626, "learning_rate": 2.7802029181891044e-05, "loss": 0.4321, "mean_token_accuracy": 0.8572289109230041, "num_tokens": 584763740.0, "step": 17455 }, { "epoch": 1.0422636103151863, "grad_norm": 0.5360782742500305, "learning_rate": 2.7790925798514656e-05, "loss": 0.4734, "mean_token_accuracy": 0.842556357383728, "num_tokens": 584931420.0, "step": 17460 }, { "epoch": 1.0425620821394461, "grad_norm": 0.4834948480129242, "learning_rate": 2.777982234427793e-05, "loss": 0.4472, "mean_token_accuracy": 0.8519145965576171, "num_tokens": 585099100.0, "step": 17465 }, { "epoch": 1.042860553963706, "grad_norm": 0.5424057841300964, "learning_rate": 2.7768718821885305e-05, "loss": 0.423, "mean_token_accuracy": 0.8586365342140198, "num_tokens": 585266780.0, "step": 17470 }, { "epoch": 1.0431590257879657, "grad_norm": 0.5537320971488953, "learning_rate": 2.7757615234041245e-05, "loss": 0.4408, "mean_token_accuracy": 0.8541095018386841, "num_tokens": 585434460.0, "step": 17475 }, { "epoch": 1.0434574976122255, "grad_norm": 0.49758070707321167, "learning_rate": 2.7746511583450246e-05, "loss": 0.4531, "mean_token_accuracy": 0.8515865445137024, "num_tokens": 585602140.0, "step": 17480 }, { "epoch": 1.0437559694364853, "grad_norm": 0.5076404809951782, "learning_rate": 2.7735407872816797e-05, "loss": 0.4522, "mean_token_accuracy": 0.8494453072547913, "num_tokens": 585769820.0, "step": 17485 }, { "epoch": 1.044054441260745, "grad_norm": 0.4706971347332001, "learning_rate": 2.772430410484543e-05, "loss": 0.4553, "mean_token_accuracy": 0.8479541897773742, "num_tokens": 585937500.0, "step": 17490 }, { "epoch": 1.0443529130850049, "grad_norm": 0.517845869064331, "learning_rate": 2.7713200282240635e-05, "loss": 0.4884, "mean_token_accuracy": 0.8387987613677979, "num_tokens": 586105180.0, "step": 17495 }, { "epoch": 1.0446513849092645, "grad_norm": 0.5571268200874329, "learning_rate": 2.7702096407706996e-05, "loss": 0.472, "mean_token_accuracy": 0.8445544481277466, "num_tokens": 586272860.0, "step": 17500 }, { "epoch": 1.0449498567335243, "grad_norm": 0.5131720900535583, "learning_rate": 2.769099248394903e-05, "loss": 0.4354, "mean_token_accuracy": 0.8560419797897338, "num_tokens": 586440540.0, "step": 17505 }, { "epoch": 1.045248328557784, "grad_norm": 0.623945415019989, "learning_rate": 2.7679888513671326e-05, "loss": 0.4667, "mean_token_accuracy": 0.8485029220581055, "num_tokens": 586608220.0, "step": 17510 }, { "epoch": 1.0455468003820438, "grad_norm": 0.49267029762268066, "learning_rate": 2.7668784499578455e-05, "loss": 0.4578, "mean_token_accuracy": 0.8474233627319336, "num_tokens": 586775900.0, "step": 17515 }, { "epoch": 1.0458452722063036, "grad_norm": 0.587166428565979, "learning_rate": 2.7657680444375005e-05, "loss": 0.4916, "mean_token_accuracy": 0.8388584017753601, "num_tokens": 586943580.0, "step": 17520 }, { "epoch": 1.0461437440305634, "grad_norm": 0.5883978009223938, "learning_rate": 2.7646576350765574e-05, "loss": 0.4841, "mean_token_accuracy": 0.8425503969192505, "num_tokens": 587111260.0, "step": 17525 }, { "epoch": 1.0464422158548232, "grad_norm": 0.5067137479782104, "learning_rate": 2.763547222145476e-05, "loss": 0.4581, "mean_token_accuracy": 0.8483001470565796, "num_tokens": 587278940.0, "step": 17530 }, { "epoch": 1.046740687679083, "grad_norm": 0.509401798248291, "learning_rate": 2.7624368059147188e-05, "loss": 0.4315, "mean_token_accuracy": 0.8547178864479065, "num_tokens": 587446620.0, "step": 17535 }, { "epoch": 1.0470391595033428, "grad_norm": 0.5064797401428223, "learning_rate": 2.7613263866547483e-05, "loss": 0.4243, "mean_token_accuracy": 0.8578074693679809, "num_tokens": 587614300.0, "step": 17540 }, { "epoch": 1.0473376313276026, "grad_norm": 0.5510956645011902, "learning_rate": 2.760215964636027e-05, "loss": 0.4865, "mean_token_accuracy": 0.8381546020507813, "num_tokens": 587781980.0, "step": 17545 }, { "epoch": 1.0476361031518624, "grad_norm": 0.45521649718284607, "learning_rate": 2.7591055401290195e-05, "loss": 0.4367, "mean_token_accuracy": 0.8536383271217346, "num_tokens": 587949660.0, "step": 17550 }, { "epoch": 1.0479345749761222, "grad_norm": 0.4933571517467499, "learning_rate": 2.7579951134041893e-05, "loss": 0.4357, "mean_token_accuracy": 0.856328284740448, "num_tokens": 588117340.0, "step": 17555 }, { "epoch": 1.048233046800382, "grad_norm": 0.5497868657112122, "learning_rate": 2.7568846847320012e-05, "loss": 0.4411, "mean_token_accuracy": 0.8524573564529419, "num_tokens": 588285020.0, "step": 17560 }, { "epoch": 1.0485315186246418, "grad_norm": 0.5534296035766602, "learning_rate": 2.7557742543829213e-05, "loss": 0.4775, "mean_token_accuracy": 0.8417750120162963, "num_tokens": 588452700.0, "step": 17565 }, { "epoch": 1.0488299904489016, "grad_norm": 0.5633777976036072, "learning_rate": 2.754663822627415e-05, "loss": 0.463, "mean_token_accuracy": 0.8476798295974731, "num_tokens": 588620380.0, "step": 17570 }, { "epoch": 1.0491284622731614, "grad_norm": 0.515777051448822, "learning_rate": 2.7535533897359496e-05, "loss": 0.4578, "mean_token_accuracy": 0.8480734825134277, "num_tokens": 588788060.0, "step": 17575 }, { "epoch": 1.0494269340974212, "grad_norm": 0.5349860191345215, "learning_rate": 2.75244295597899e-05, "loss": 0.502, "mean_token_accuracy": 0.832756757736206, "num_tokens": 588955740.0, "step": 17580 }, { "epoch": 1.049725405921681, "grad_norm": 0.5449660420417786, "learning_rate": 2.7513325216270024e-05, "loss": 0.4646, "mean_token_accuracy": 0.8450733661651612, "num_tokens": 589123420.0, "step": 17585 }, { "epoch": 1.0500238777459407, "grad_norm": 0.5025606155395508, "learning_rate": 2.750222086950456e-05, "loss": 0.4756, "mean_token_accuracy": 0.8448705673217773, "num_tokens": 589291100.0, "step": 17590 }, { "epoch": 1.0503223495702005, "grad_norm": 0.5384414792060852, "learning_rate": 2.7491116522198155e-05, "loss": 0.4612, "mean_token_accuracy": 0.846302044391632, "num_tokens": 589458780.0, "step": 17595 }, { "epoch": 1.0506208213944603, "grad_norm": 0.4843895435333252, "learning_rate": 2.7480012177055487e-05, "loss": 0.4364, "mean_token_accuracy": 0.8541870474815368, "num_tokens": 589626460.0, "step": 17600 }, { "epoch": 1.0509192932187201, "grad_norm": 0.5766077637672424, "learning_rate": 2.7468907836781222e-05, "loss": 0.4848, "mean_token_accuracy": 0.8388882279396057, "num_tokens": 589794140.0, "step": 17605 }, { "epoch": 1.05121776504298, "grad_norm": 0.5743246674537659, "learning_rate": 2.7457803504080025e-05, "loss": 0.444, "mean_token_accuracy": 0.8532267689704895, "num_tokens": 589961820.0, "step": 17610 }, { "epoch": 1.0515162368672397, "grad_norm": 0.5840647220611572, "learning_rate": 2.7446699181656566e-05, "loss": 0.5097, "mean_token_accuracy": 0.8343552470207214, "num_tokens": 590129500.0, "step": 17615 }, { "epoch": 1.0518147086914995, "grad_norm": 0.5343279838562012, "learning_rate": 2.743559487221551e-05, "loss": 0.4998, "mean_token_accuracy": 0.8360849499702454, "num_tokens": 590297180.0, "step": 17620 }, { "epoch": 1.0521131805157593, "grad_norm": 0.5026448369026184, "learning_rate": 2.7424490578461504e-05, "loss": 0.4602, "mean_token_accuracy": 0.8483538150787353, "num_tokens": 590464860.0, "step": 17625 }, { "epoch": 1.052411652340019, "grad_norm": 0.48948389291763306, "learning_rate": 2.7413386303099203e-05, "loss": 0.4704, "mean_token_accuracy": 0.8451151132583619, "num_tokens": 590632540.0, "step": 17630 }, { "epoch": 1.052710124164279, "grad_norm": 0.5424418449401855, "learning_rate": 2.7402282048833283e-05, "loss": 0.4597, "mean_token_accuracy": 0.8469819903373719, "num_tokens": 590800220.0, "step": 17635 }, { "epoch": 1.0530085959885387, "grad_norm": 0.45166853070259094, "learning_rate": 2.7391177818368363e-05, "loss": 0.4448, "mean_token_accuracy": 0.8528033018112182, "num_tokens": 590967900.0, "step": 17640 }, { "epoch": 1.0533070678127985, "grad_norm": 0.7576605081558228, "learning_rate": 2.7380073614409103e-05, "loss": 0.4892, "mean_token_accuracy": 0.8398209929466247, "num_tokens": 591134125.0, "step": 17645 }, { "epoch": 1.0536055396370583, "grad_norm": 0.5706770420074463, "learning_rate": 2.736896943966012e-05, "loss": 0.4563, "mean_token_accuracy": 0.850481903553009, "num_tokens": 591293542.0, "step": 17650 }, { "epoch": 1.053904011461318, "grad_norm": 0.5847463607788086, "learning_rate": 2.7357865296826053e-05, "loss": 0.496, "mean_token_accuracy": 0.8374627232551575, "num_tokens": 591461222.0, "step": 17655 }, { "epoch": 1.0542024832855779, "grad_norm": 0.5394697189331055, "learning_rate": 2.734676118861152e-05, "loss": 0.4372, "mean_token_accuracy": 0.8544315934181214, "num_tokens": 591628902.0, "step": 17660 }, { "epoch": 1.0545009551098377, "grad_norm": 0.5168535113334656, "learning_rate": 2.733565711772112e-05, "loss": 0.4469, "mean_token_accuracy": 0.8512823581695557, "num_tokens": 591796582.0, "step": 17665 }, { "epoch": 1.0547994269340975, "grad_norm": 0.5262973308563232, "learning_rate": 2.7324553086859465e-05, "loss": 0.4696, "mean_token_accuracy": 0.8451747536659241, "num_tokens": 591964262.0, "step": 17670 }, { "epoch": 1.0550978987583572, "grad_norm": 0.4767979383468628, "learning_rate": 2.7313449098731142e-05, "loss": 0.4614, "mean_token_accuracy": 0.8475486159324646, "num_tokens": 592131942.0, "step": 17675 }, { "epoch": 1.055396370582617, "grad_norm": 0.5223391056060791, "learning_rate": 2.7302345156040736e-05, "loss": 0.4856, "mean_token_accuracy": 0.8395502805709839, "num_tokens": 592299622.0, "step": 17680 }, { "epoch": 1.0556948424068768, "grad_norm": 0.5225414037704468, "learning_rate": 2.7291241261492823e-05, "loss": 0.4737, "mean_token_accuracy": 0.8428128242492676, "num_tokens": 592467302.0, "step": 17685 }, { "epoch": 1.0559933142311366, "grad_norm": 0.5096659064292908, "learning_rate": 2.728013741779194e-05, "loss": 0.4575, "mean_token_accuracy": 0.8494088411331177, "num_tokens": 592633679.0, "step": 17690 }, { "epoch": 1.0562917860553964, "grad_norm": 0.5200692415237427, "learning_rate": 2.7269033627642665e-05, "loss": 0.4871, "mean_token_accuracy": 0.8391387343406678, "num_tokens": 592801359.0, "step": 17695 }, { "epoch": 1.0565902578796562, "grad_norm": 0.5188535451889038, "learning_rate": 2.7257929893749496e-05, "loss": 0.449, "mean_token_accuracy": 0.8516700625419616, "num_tokens": 592969039.0, "step": 17700 }, { "epoch": 1.056888729703916, "grad_norm": 0.503955602645874, "learning_rate": 2.7246826218816985e-05, "loss": 0.4496, "mean_token_accuracy": 0.8505427718162537, "num_tokens": 593136719.0, "step": 17705 }, { "epoch": 1.0571872015281758, "grad_norm": 0.46807822585105896, "learning_rate": 2.72357226055496e-05, "loss": 0.4367, "mean_token_accuracy": 0.8553262591362, "num_tokens": 593304399.0, "step": 17710 }, { "epoch": 1.0574856733524356, "grad_norm": 0.5091184377670288, "learning_rate": 2.7224619056651866e-05, "loss": 0.4357, "mean_token_accuracy": 0.854723846912384, "num_tokens": 593472079.0, "step": 17715 }, { "epoch": 1.0577841451766954, "grad_norm": 0.5885462164878845, "learning_rate": 2.7213515574828248e-05, "loss": 0.4666, "mean_token_accuracy": 0.8463080048561096, "num_tokens": 593639759.0, "step": 17720 }, { "epoch": 1.0580826170009552, "grad_norm": 0.47241973876953125, "learning_rate": 2.7202412162783193e-05, "loss": 0.4412, "mean_token_accuracy": 0.852731716632843, "num_tokens": 593807439.0, "step": 17725 }, { "epoch": 1.058381088825215, "grad_norm": 0.49074244499206543, "learning_rate": 2.7191308823221157e-05, "loss": 0.4384, "mean_token_accuracy": 0.8535846471786499, "num_tokens": 593975119.0, "step": 17730 }, { "epoch": 1.0586795606494748, "grad_norm": 0.5047051310539246, "learning_rate": 2.7180205558846543e-05, "loss": 0.4495, "mean_token_accuracy": 0.849988067150116, "num_tokens": 594142799.0, "step": 17735 }, { "epoch": 1.0589780324737346, "grad_norm": 0.5465596914291382, "learning_rate": 2.716910237236377e-05, "loss": 0.436, "mean_token_accuracy": 0.8557616591453552, "num_tokens": 594310479.0, "step": 17740 }, { "epoch": 1.0592765042979944, "grad_norm": 0.48841941356658936, "learning_rate": 2.715799926647723e-05, "loss": 0.4258, "mean_token_accuracy": 0.8571812033653259, "num_tokens": 594478159.0, "step": 17745 }, { "epoch": 1.0595749761222542, "grad_norm": 0.4936128258705139, "learning_rate": 2.7146896243891267e-05, "loss": 0.4416, "mean_token_accuracy": 0.8527555823326111, "num_tokens": 594645839.0, "step": 17750 }, { "epoch": 1.059873447946514, "grad_norm": 0.5698809623718262, "learning_rate": 2.7135793307310258e-05, "loss": 0.4891, "mean_token_accuracy": 0.8386377096176147, "num_tokens": 594813519.0, "step": 17755 }, { "epoch": 1.0601719197707737, "grad_norm": 0.5826289057731628, "learning_rate": 2.7124690459438494e-05, "loss": 0.4537, "mean_token_accuracy": 0.8490039467811584, "num_tokens": 594981199.0, "step": 17760 }, { "epoch": 1.0604703915950335, "grad_norm": 0.47566941380500793, "learning_rate": 2.711358770298031e-05, "loss": 0.4743, "mean_token_accuracy": 0.8447130441665649, "num_tokens": 595142018.0, "step": 17765 }, { "epoch": 1.0607688634192933, "grad_norm": 0.5589858293533325, "learning_rate": 2.7102485040639976e-05, "loss": 0.4672, "mean_token_accuracy": 0.8450912356376648, "num_tokens": 595309698.0, "step": 17770 }, { "epoch": 1.061067335243553, "grad_norm": 0.4675428569316864, "learning_rate": 2.7091382475121736e-05, "loss": 0.4792, "mean_token_accuracy": 0.8422939300537109, "num_tokens": 595477378.0, "step": 17775 }, { "epoch": 1.0613658070678127, "grad_norm": 0.49825581908226013, "learning_rate": 2.7080280009129844e-05, "loss": 0.4375, "mean_token_accuracy": 0.8545091271400451, "num_tokens": 595645058.0, "step": 17780 }, { "epoch": 1.0616642788920725, "grad_norm": 0.5521115660667419, "learning_rate": 2.70691776453685e-05, "loss": 0.4608, "mean_token_accuracy": 0.8469760179519653, "num_tokens": 595812738.0, "step": 17785 }, { "epoch": 1.0619627507163323, "grad_norm": 0.5025843381881714, "learning_rate": 2.7058075386541886e-05, "loss": 0.4446, "mean_token_accuracy": 0.8522426366806031, "num_tokens": 595980418.0, "step": 17790 }, { "epoch": 1.062261222540592, "grad_norm": 0.5706450343132019, "learning_rate": 2.704697323535418e-05, "loss": 0.4712, "mean_token_accuracy": 0.8440653681755066, "num_tokens": 596148098.0, "step": 17795 }, { "epoch": 1.0625596943648519, "grad_norm": 0.6271229982376099, "learning_rate": 2.70358711945095e-05, "loss": 0.4555, "mean_token_accuracy": 0.8511213183403015, "num_tokens": 596315778.0, "step": 17800 }, { "epoch": 1.0628581661891117, "grad_norm": 0.47752127051353455, "learning_rate": 2.7024769266711957e-05, "loss": 0.4231, "mean_token_accuracy": 0.8593940138816833, "num_tokens": 596483458.0, "step": 17805 }, { "epoch": 1.0631566380133715, "grad_norm": 0.501616895198822, "learning_rate": 2.701366745466563e-05, "loss": 0.4503, "mean_token_accuracy": 0.8496540665626526, "num_tokens": 596651138.0, "step": 17810 }, { "epoch": 1.0634551098376313, "grad_norm": 0.5748869180679321, "learning_rate": 2.700256576107458e-05, "loss": 0.4503, "mean_token_accuracy": 0.8501431465148925, "num_tokens": 596818818.0, "step": 17815 }, { "epoch": 1.063753581661891, "grad_norm": 0.5291325449943542, "learning_rate": 2.6991464188642808e-05, "loss": 0.4477, "mean_token_accuracy": 0.8504711985588074, "num_tokens": 596986498.0, "step": 17820 }, { "epoch": 1.0640520534861508, "grad_norm": 0.4641845226287842, "learning_rate": 2.6980362740074315e-05, "loss": 0.4153, "mean_token_accuracy": 0.8613503575325012, "num_tokens": 597154178.0, "step": 17825 }, { "epoch": 1.0643505253104106, "grad_norm": 0.514773428440094, "learning_rate": 2.6969261418073087e-05, "loss": 0.4575, "mean_token_accuracy": 0.8479005217552185, "num_tokens": 597321858.0, "step": 17830 }, { "epoch": 1.0646489971346704, "grad_norm": 0.48946326971054077, "learning_rate": 2.6958160225343022e-05, "loss": 0.4837, "mean_token_accuracy": 0.839836585521698, "num_tokens": 597489538.0, "step": 17835 }, { "epoch": 1.0649474689589302, "grad_norm": 0.5289530754089355, "learning_rate": 2.6947059164588045e-05, "loss": 0.4155, "mean_token_accuracy": 0.8611475467681885, "num_tokens": 597657218.0, "step": 17840 }, { "epoch": 1.06524594078319, "grad_norm": 0.4798479974269867, "learning_rate": 2.6935958238512006e-05, "loss": 0.4313, "mean_token_accuracy": 0.855827271938324, "num_tokens": 597824898.0, "step": 17845 }, { "epoch": 1.0655444126074498, "grad_norm": 0.4973350465297699, "learning_rate": 2.6924857449818754e-05, "loss": 0.4733, "mean_token_accuracy": 0.8430156230926513, "num_tokens": 597992578.0, "step": 17850 }, { "epoch": 1.0658428844317096, "grad_norm": 0.5921276807785034, "learning_rate": 2.6913756801212086e-05, "loss": 0.4491, "mean_token_accuracy": 0.8509006381034852, "num_tokens": 598160258.0, "step": 17855 }, { "epoch": 1.0661413562559694, "grad_norm": 0.5628989338874817, "learning_rate": 2.6902656295395766e-05, "loss": 0.4509, "mean_token_accuracy": 0.8491053342819214, "num_tokens": 598327938.0, "step": 17860 }, { "epoch": 1.0664398280802292, "grad_norm": 0.545533299446106, "learning_rate": 2.6891555935073537e-05, "loss": 0.4468, "mean_token_accuracy": 0.85284503698349, "num_tokens": 598495618.0, "step": 17865 }, { "epoch": 1.066738299904489, "grad_norm": 0.5277979373931885, "learning_rate": 2.6880455722949083e-05, "loss": 0.4387, "mean_token_accuracy": 0.8540200352668762, "num_tokens": 598663298.0, "step": 17870 }, { "epoch": 1.0670367717287488, "grad_norm": 0.5081989169120789, "learning_rate": 2.686935566172607e-05, "loss": 0.4442, "mean_token_accuracy": 0.8515626788139343, "num_tokens": 598830978.0, "step": 17875 }, { "epoch": 1.0673352435530086, "grad_norm": 0.4833400249481201, "learning_rate": 2.6858255754108148e-05, "loss": 0.4742, "mean_token_accuracy": 0.8425324916839599, "num_tokens": 598998658.0, "step": 17880 }, { "epoch": 1.0676337153772684, "grad_norm": 0.5343855023384094, "learning_rate": 2.6847156002798857e-05, "loss": 0.455, "mean_token_accuracy": 0.8490456938743591, "num_tokens": 599166338.0, "step": 17885 }, { "epoch": 1.0679321872015282, "grad_norm": 0.5258563160896301, "learning_rate": 2.683605641050179e-05, "loss": 0.4827, "mean_token_accuracy": 0.8414052128791809, "num_tokens": 599334018.0, "step": 17890 }, { "epoch": 1.068230659025788, "grad_norm": 0.4840978682041168, "learning_rate": 2.6824956979920424e-05, "loss": 0.4328, "mean_token_accuracy": 0.8547417402267456, "num_tokens": 599501698.0, "step": 17895 }, { "epoch": 1.0685291308500477, "grad_norm": 0.5214992761611938, "learning_rate": 2.681385771375825e-05, "loss": 0.4832, "mean_token_accuracy": 0.8407670259475708, "num_tokens": 599669378.0, "step": 17900 }, { "epoch": 1.0688276026743075, "grad_norm": 0.5052345991134644, "learning_rate": 2.6802758614718703e-05, "loss": 0.491, "mean_token_accuracy": 0.8385064959526062, "num_tokens": 599837058.0, "step": 17905 }, { "epoch": 1.0691260744985673, "grad_norm": 0.46404924988746643, "learning_rate": 2.679165968550516e-05, "loss": 0.432, "mean_token_accuracy": 0.8560240983963012, "num_tokens": 600004738.0, "step": 17910 }, { "epoch": 1.0694245463228271, "grad_norm": 0.5729808807373047, "learning_rate": 2.6780560928820974e-05, "loss": 0.4618, "mean_token_accuracy": 0.8469521641731262, "num_tokens": 600172418.0, "step": 17915 }, { "epoch": 1.069723018147087, "grad_norm": 0.5375544428825378, "learning_rate": 2.6769462347369456e-05, "loss": 0.4788, "mean_token_accuracy": 0.842067289352417, "num_tokens": 600340098.0, "step": 17920 }, { "epoch": 1.0700214899713467, "grad_norm": 0.5067377090454102, "learning_rate": 2.6758363943853865e-05, "loss": 0.4592, "mean_token_accuracy": 0.8473219394683837, "num_tokens": 600507778.0, "step": 17925 }, { "epoch": 1.0703199617956065, "grad_norm": 0.5274377465248108, "learning_rate": 2.674726572097742e-05, "loss": 0.4669, "mean_token_accuracy": 0.8452940464019776, "num_tokens": 600675458.0, "step": 17930 }, { "epoch": 1.0706184336198663, "grad_norm": 0.5200303792953491, "learning_rate": 2.6736167681443308e-05, "loss": 0.5074, "mean_token_accuracy": 0.8351008057594299, "num_tokens": 600843138.0, "step": 17935 }, { "epoch": 1.070916905444126, "grad_norm": 0.5001233220100403, "learning_rate": 2.6725069827954663e-05, "loss": 0.4535, "mean_token_accuracy": 0.8501670241355896, "num_tokens": 601010818.0, "step": 17940 }, { "epoch": 1.071215377268386, "grad_norm": 0.6038535237312317, "learning_rate": 2.671397216321455e-05, "loss": 0.4644, "mean_token_accuracy": 0.8464929103851319, "num_tokens": 601178498.0, "step": 17945 }, { "epoch": 1.0715138490926457, "grad_norm": 0.4926758408546448, "learning_rate": 2.670287468992604e-05, "loss": 0.4407, "mean_token_accuracy": 0.8539067149162293, "num_tokens": 601346178.0, "step": 17950 }, { "epoch": 1.0718123209169055, "grad_norm": 0.48250171542167664, "learning_rate": 2.6691777410792102e-05, "loss": 0.4453, "mean_token_accuracy": 0.8525050640106201, "num_tokens": 601513858.0, "step": 17955 }, { "epoch": 1.0721107927411653, "grad_norm": 0.4897820055484772, "learning_rate": 2.668068032851569e-05, "loss": 0.4679, "mean_token_accuracy": 0.8459441781044006, "num_tokens": 601681538.0, "step": 17960 }, { "epoch": 1.072409264565425, "grad_norm": 0.5335580110549927, "learning_rate": 2.666958344579972e-05, "loss": 0.4112, "mean_token_accuracy": 0.8627162218093872, "num_tokens": 601849218.0, "step": 17965 }, { "epoch": 1.0727077363896849, "grad_norm": 0.5549975633621216, "learning_rate": 2.665848676534702e-05, "loss": 0.4692, "mean_token_accuracy": 0.8447095274925231, "num_tokens": 602016898.0, "step": 17970 }, { "epoch": 1.0730062082139447, "grad_norm": 0.4920300543308258, "learning_rate": 2.66473902898604e-05, "loss": 0.4521, "mean_token_accuracy": 0.8501133322715759, "num_tokens": 602184578.0, "step": 17975 }, { "epoch": 1.0733046800382045, "grad_norm": 0.47080594301223755, "learning_rate": 2.6636294022042618e-05, "loss": 0.4779, "mean_token_accuracy": 0.8427054762840271, "num_tokens": 602352258.0, "step": 17980 }, { "epoch": 1.0736031518624642, "grad_norm": 0.5335698127746582, "learning_rate": 2.6625197964596366e-05, "loss": 0.4543, "mean_token_accuracy": 0.8492305755615235, "num_tokens": 602519938.0, "step": 17985 }, { "epoch": 1.073901623686724, "grad_norm": 0.49977582693099976, "learning_rate": 2.6614102120224298e-05, "loss": 0.4838, "mean_token_accuracy": 0.838834536075592, "num_tokens": 602687618.0, "step": 17990 }, { "epoch": 1.0742000955109838, "grad_norm": 0.5056637525558472, "learning_rate": 2.660300649162901e-05, "loss": 0.4109, "mean_token_accuracy": 0.8633066892623902, "num_tokens": 602855298.0, "step": 17995 }, { "epoch": 1.0744985673352436, "grad_norm": 0.4958396553993225, "learning_rate": 2.659191108151305e-05, "loss": 0.4396, "mean_token_accuracy": 0.8530060887336731, "num_tokens": 603022978.0, "step": 18000 }, { "epoch": 1.0747970391595034, "grad_norm": 0.4855215847492218, "learning_rate": 2.6580815892578904e-05, "loss": 0.4452, "mean_token_accuracy": 0.8541870594024659, "num_tokens": 603190658.0, "step": 18005 }, { "epoch": 1.0750955109837632, "grad_norm": 0.5160524845123291, "learning_rate": 2.6569720927529018e-05, "loss": 0.4328, "mean_token_accuracy": 0.856065857410431, "num_tokens": 603358338.0, "step": 18010 }, { "epoch": 1.075393982808023, "grad_norm": 0.5940030813217163, "learning_rate": 2.6558626189065765e-05, "loss": 0.4333, "mean_token_accuracy": 0.8556662321090698, "num_tokens": 603526018.0, "step": 18015 }, { "epoch": 1.0756924546322828, "grad_norm": 0.48193079233169556, "learning_rate": 2.654753167989148e-05, "loss": 0.4569, "mean_token_accuracy": 0.84797123670578, "num_tokens": 603692972.0, "step": 18020 }, { "epoch": 1.0759909264565426, "grad_norm": 0.5851207375526428, "learning_rate": 2.6536437402708442e-05, "loss": 0.4742, "mean_token_accuracy": 0.8444053530693054, "num_tokens": 603860652.0, "step": 18025 }, { "epoch": 1.0762893982808024, "grad_norm": 0.5075387358665466, "learning_rate": 2.6525343360218853e-05, "loss": 0.4432, "mean_token_accuracy": 0.8525706887245178, "num_tokens": 604028332.0, "step": 18030 }, { "epoch": 1.0765878701050622, "grad_norm": 0.47441422939300537, "learning_rate": 2.6514249555124887e-05, "loss": 0.4495, "mean_token_accuracy": 0.8499045729637146, "num_tokens": 604196012.0, "step": 18035 }, { "epoch": 1.0768863419293218, "grad_norm": 0.46453920006752014, "learning_rate": 2.650315599012862e-05, "loss": 0.427, "mean_token_accuracy": 0.856727910041809, "num_tokens": 604363692.0, "step": 18040 }, { "epoch": 1.0771848137535818, "grad_norm": 0.4999887943267822, "learning_rate": 2.6492062667932123e-05, "loss": 0.4583, "mean_token_accuracy": 0.8486758947372437, "num_tokens": 604531372.0, "step": 18045 }, { "epoch": 1.0774832855778413, "grad_norm": 0.5404815077781677, "learning_rate": 2.648096959123737e-05, "loss": 0.4626, "mean_token_accuracy": 0.8473219513893128, "num_tokens": 604699052.0, "step": 18050 }, { "epoch": 1.0777817574021014, "grad_norm": 0.5161058306694031, "learning_rate": 2.6469876762746277e-05, "loss": 0.4557, "mean_token_accuracy": 0.8495407462120056, "num_tokens": 604866732.0, "step": 18055 }, { "epoch": 1.078080229226361, "grad_norm": 0.48695194721221924, "learning_rate": 2.645878418516072e-05, "loss": 0.5036, "mean_token_accuracy": 0.8346176862716674, "num_tokens": 605034412.0, "step": 18060 }, { "epoch": 1.0783787010506207, "grad_norm": 0.5438864827156067, "learning_rate": 2.6447691861182483e-05, "loss": 0.473, "mean_token_accuracy": 0.8427651166915894, "num_tokens": 605202092.0, "step": 18065 }, { "epoch": 1.0786771728748805, "grad_norm": 0.5088119506835938, "learning_rate": 2.6436599793513317e-05, "loss": 0.4621, "mean_token_accuracy": 0.8465764045715332, "num_tokens": 605369772.0, "step": 18070 }, { "epoch": 1.0789756446991403, "grad_norm": 0.5458279252052307, "learning_rate": 2.6425507984854907e-05, "loss": 0.5051, "mean_token_accuracy": 0.8333353161811828, "num_tokens": 605537452.0, "step": 18075 }, { "epoch": 1.0792741165234, "grad_norm": 0.4567376971244812, "learning_rate": 2.6414416437908857e-05, "loss": 0.4539, "mean_token_accuracy": 0.8499701857566834, "num_tokens": 605705132.0, "step": 18080 }, { "epoch": 1.07957258834766, "grad_norm": 0.5640619993209839, "learning_rate": 2.640332515537672e-05, "loss": 0.4762, "mean_token_accuracy": 0.842544436454773, "num_tokens": 605872812.0, "step": 18085 }, { "epoch": 1.0798710601719197, "grad_norm": 0.48929524421691895, "learning_rate": 2.6392234139959988e-05, "loss": 0.4565, "mean_token_accuracy": 0.8481092691421509, "num_tokens": 606040492.0, "step": 18090 }, { "epoch": 1.0801695319961795, "grad_norm": 0.5491763353347778, "learning_rate": 2.6381143394360063e-05, "loss": 0.4443, "mean_token_accuracy": 0.8519145965576171, "num_tokens": 606208172.0, "step": 18095 }, { "epoch": 1.0804680038204393, "grad_norm": 0.5325464010238647, "learning_rate": 2.637005292127833e-05, "loss": 0.4418, "mean_token_accuracy": 0.8536383032798767, "num_tokens": 606375852.0, "step": 18100 }, { "epoch": 1.080766475644699, "grad_norm": 0.4878290593624115, "learning_rate": 2.6358962723416063e-05, "loss": 0.4244, "mean_token_accuracy": 0.8593820810317994, "num_tokens": 606543532.0, "step": 18105 }, { "epoch": 1.0810649474689589, "grad_norm": 0.46349263191223145, "learning_rate": 2.6347872803474487e-05, "loss": 0.4709, "mean_token_accuracy": 0.8423237442970276, "num_tokens": 606711212.0, "step": 18110 }, { "epoch": 1.0813634192932187, "grad_norm": 0.5191339254379272, "learning_rate": 2.6336783164154753e-05, "loss": 0.4433, "mean_token_accuracy": 0.8524752497673035, "num_tokens": 606878892.0, "step": 18115 }, { "epoch": 1.0816618911174785, "grad_norm": 0.4806431233882904, "learning_rate": 2.6325693808157952e-05, "loss": 0.4457, "mean_token_accuracy": 0.8508051991462707, "num_tokens": 607046572.0, "step": 18120 }, { "epoch": 1.0819603629417383, "grad_norm": 0.498297780752182, "learning_rate": 2.6314604738185095e-05, "loss": 0.443, "mean_token_accuracy": 0.8525647163391114, "num_tokens": 607214252.0, "step": 18125 }, { "epoch": 1.082258834765998, "grad_norm": 0.5350227952003479, "learning_rate": 2.6303515956937137e-05, "loss": 0.4727, "mean_token_accuracy": 0.8453835010528564, "num_tokens": 607381932.0, "step": 18130 }, { "epoch": 1.0825573065902578, "grad_norm": 0.4777717888355255, "learning_rate": 2.6292427467114957e-05, "loss": 0.4435, "mean_token_accuracy": 0.8528271555900574, "num_tokens": 607549612.0, "step": 18135 }, { "epoch": 1.0828557784145176, "grad_norm": 0.5520328879356384, "learning_rate": 2.6281339271419347e-05, "loss": 0.4652, "mean_token_accuracy": 0.8443755269050598, "num_tokens": 607717292.0, "step": 18140 }, { "epoch": 1.0831542502387774, "grad_norm": 0.4736959636211395, "learning_rate": 2.6270251372551058e-05, "loss": 0.4328, "mean_token_accuracy": 0.855189073085785, "num_tokens": 607884972.0, "step": 18145 }, { "epoch": 1.0834527220630372, "grad_norm": 0.6132194995880127, "learning_rate": 2.6259163773210743e-05, "loss": 0.4627, "mean_token_accuracy": 0.8478289365768432, "num_tokens": 608052652.0, "step": 18150 }, { "epoch": 1.083751193887297, "grad_norm": 0.5220558643341064, "learning_rate": 2.6248076476098986e-05, "loss": 0.4661, "mean_token_accuracy": 0.8455803275108338, "num_tokens": 608220332.0, "step": 18155 }, { "epoch": 1.0840496657115568, "grad_norm": 0.5272908210754395, "learning_rate": 2.623698948391633e-05, "loss": 0.4794, "mean_token_accuracy": 0.841291892528534, "num_tokens": 608388012.0, "step": 18160 }, { "epoch": 1.0843481375358166, "grad_norm": 0.629886269569397, "learning_rate": 2.6225902799363178e-05, "loss": 0.498, "mean_token_accuracy": 0.8378742575645447, "num_tokens": 608555692.0, "step": 18165 }, { "epoch": 1.0846466093600764, "grad_norm": 0.49738672375679016, "learning_rate": 2.6214816425139933e-05, "loss": 0.4463, "mean_token_accuracy": 0.8533043026924133, "num_tokens": 608723372.0, "step": 18170 }, { "epoch": 1.0849450811843362, "grad_norm": 0.5750638842582703, "learning_rate": 2.6203730363946855e-05, "loss": 0.4622, "mean_token_accuracy": 0.8458427786827087, "num_tokens": 608891052.0, "step": 18175 }, { "epoch": 1.085243553008596, "grad_norm": 0.5631683468818665, "learning_rate": 2.619264461848418e-05, "loss": 0.5008, "mean_token_accuracy": 0.834385073184967, "num_tokens": 609058732.0, "step": 18180 }, { "epoch": 1.0855420248328558, "grad_norm": 0.5108806490898132, "learning_rate": 2.6181559191452043e-05, "loss": 0.4356, "mean_token_accuracy": 0.854461419582367, "num_tokens": 609226412.0, "step": 18185 }, { "epoch": 1.0858404966571156, "grad_norm": 0.5062009692192078, "learning_rate": 2.6170474085550496e-05, "loss": 0.4404, "mean_token_accuracy": 0.8520398378372193, "num_tokens": 609394092.0, "step": 18190 }, { "epoch": 1.0861389684813754, "grad_norm": 0.4719405770301819, "learning_rate": 2.615938930347953e-05, "loss": 0.4274, "mean_token_accuracy": 0.8566384315490723, "num_tokens": 609561772.0, "step": 18195 }, { "epoch": 1.0864374403056352, "grad_norm": 0.555225133895874, "learning_rate": 2.614830484793904e-05, "loss": 0.4599, "mean_token_accuracy": 0.8467851638793945, "num_tokens": 609729452.0, "step": 18200 }, { "epoch": 1.086735912129895, "grad_norm": 0.5339514017105103, "learning_rate": 2.613722072162885e-05, "loss": 0.4784, "mean_token_accuracy": 0.8430871963500977, "num_tokens": 609897132.0, "step": 18205 }, { "epoch": 1.0870343839541547, "grad_norm": 0.5332335233688354, "learning_rate": 2.6126136927248718e-05, "loss": 0.4669, "mean_token_accuracy": 0.8456220984458923, "num_tokens": 610064812.0, "step": 18210 }, { "epoch": 1.0873328557784145, "grad_norm": 0.5602828860282898, "learning_rate": 2.6115053467498285e-05, "loss": 0.4873, "mean_token_accuracy": 0.8415722250938416, "num_tokens": 610232492.0, "step": 18215 }, { "epoch": 1.0876313276026743, "grad_norm": 0.48973003029823303, "learning_rate": 2.6103970345077154e-05, "loss": 0.4307, "mean_token_accuracy": 0.8565191507339478, "num_tokens": 610400172.0, "step": 18220 }, { "epoch": 1.0879297994269341, "grad_norm": 0.5911539196968079, "learning_rate": 2.6092887562684803e-05, "loss": 0.4706, "mean_token_accuracy": 0.8445567727088928, "num_tokens": 610566566.0, "step": 18225 }, { "epoch": 1.088228271251194, "grad_norm": 0.5489808917045593, "learning_rate": 2.608180512302067e-05, "loss": 0.4705, "mean_token_accuracy": 0.8445007801055908, "num_tokens": 610734246.0, "step": 18230 }, { "epoch": 1.0885267430754537, "grad_norm": 0.5308067798614502, "learning_rate": 2.607072302878406e-05, "loss": 0.4614, "mean_token_accuracy": 0.8454849004745484, "num_tokens": 610901926.0, "step": 18235 }, { "epoch": 1.0888252148997135, "grad_norm": 0.5792397260665894, "learning_rate": 2.6059641282674247e-05, "loss": 0.4713, "mean_token_accuracy": 0.8451688051223755, "num_tokens": 611069606.0, "step": 18240 }, { "epoch": 1.0891236867239733, "grad_norm": 0.47498780488967896, "learning_rate": 2.604855988739039e-05, "loss": 0.4616, "mean_token_accuracy": 0.8469521760940552, "num_tokens": 611237286.0, "step": 18245 }, { "epoch": 1.089422158548233, "grad_norm": 0.5143116116523743, "learning_rate": 2.603747884563155e-05, "loss": 0.4255, "mean_token_accuracy": 0.8578432679176331, "num_tokens": 611404966.0, "step": 18250 }, { "epoch": 1.089720630372493, "grad_norm": 1.8472613096237183, "learning_rate": 2.6026398160096742e-05, "loss": 0.445, "mean_token_accuracy": 0.8542347550392151, "num_tokens": 611572646.0, "step": 18255 }, { "epoch": 1.0900191021967527, "grad_norm": 0.5715947151184082, "learning_rate": 2.6015317833484863e-05, "loss": 0.4933, "mean_token_accuracy": 0.8368841648101807, "num_tokens": 611740326.0, "step": 18260 }, { "epoch": 1.0903175740210125, "grad_norm": 0.5168896317481995, "learning_rate": 2.6004237868494725e-05, "loss": 0.4454, "mean_token_accuracy": 0.8511213183403015, "num_tokens": 611908006.0, "step": 18265 }, { "epoch": 1.0906160458452723, "grad_norm": 0.5258520245552063, "learning_rate": 2.5993158267825068e-05, "loss": 0.4661, "mean_token_accuracy": 0.8458487391471863, "num_tokens": 612075686.0, "step": 18270 }, { "epoch": 1.090914517669532, "grad_norm": 0.5208800435066223, "learning_rate": 2.5982079034174522e-05, "loss": 0.4692, "mean_token_accuracy": 0.8447512745857239, "num_tokens": 612243366.0, "step": 18275 }, { "epoch": 1.0912129894937919, "grad_norm": 0.5926040410995483, "learning_rate": 2.5971000170241648e-05, "loss": 0.4338, "mean_token_accuracy": 0.8565012454986572, "num_tokens": 612411046.0, "step": 18280 }, { "epoch": 1.0915114613180517, "grad_norm": 0.5341820120811462, "learning_rate": 2.59599216787249e-05, "loss": 0.4312, "mean_token_accuracy": 0.8560837507247925, "num_tokens": 612578726.0, "step": 18285 }, { "epoch": 1.0918099331423115, "grad_norm": 0.5410572290420532, "learning_rate": 2.5948843562322655e-05, "loss": 0.429, "mean_token_accuracy": 0.8570141911506652, "num_tokens": 612746406.0, "step": 18290 }, { "epoch": 1.0921084049665712, "grad_norm": 0.5297465920448303, "learning_rate": 2.59377658237332e-05, "loss": 0.4638, "mean_token_accuracy": 0.8459386944770813, "num_tokens": 612908700.0, "step": 18295 }, { "epoch": 1.092406876790831, "grad_norm": 0.5624565482139587, "learning_rate": 2.5926688465654703e-05, "loss": 0.4409, "mean_token_accuracy": 0.8532744884490967, "num_tokens": 613076380.0, "step": 18300 }, { "epoch": 1.0927053486150908, "grad_norm": 0.5259370803833008, "learning_rate": 2.5915611490785286e-05, "loss": 0.4725, "mean_token_accuracy": 0.8425265431404114, "num_tokens": 613244060.0, "step": 18305 }, { "epoch": 1.0930038204393506, "grad_norm": 0.4987262487411499, "learning_rate": 2.590453490182292e-05, "loss": 0.434, "mean_token_accuracy": 0.8563044309616089, "num_tokens": 613411740.0, "step": 18310 }, { "epoch": 1.0933022922636102, "grad_norm": 0.4512346088886261, "learning_rate": 2.5893458701465528e-05, "loss": 0.3879, "mean_token_accuracy": 0.8685136556625366, "num_tokens": 613579420.0, "step": 18315 }, { "epoch": 1.0936007640878702, "grad_norm": 0.5192257761955261, "learning_rate": 2.5882382892410927e-05, "loss": 0.4245, "mean_token_accuracy": 0.8574973344802856, "num_tokens": 613747100.0, "step": 18320 }, { "epoch": 1.0938992359121298, "grad_norm": 0.5196990966796875, "learning_rate": 2.5871307477356832e-05, "loss": 0.4422, "mean_token_accuracy": 0.8525169968605042, "num_tokens": 613914780.0, "step": 18325 }, { "epoch": 1.0941977077363898, "grad_norm": 0.5397512912750244, "learning_rate": 2.586023245900086e-05, "loss": 0.4607, "mean_token_accuracy": 0.8479601621627808, "num_tokens": 614082460.0, "step": 18330 }, { "epoch": 1.0944961795606494, "grad_norm": 0.5651199221611023, "learning_rate": 2.584915784004054e-05, "loss": 0.4565, "mean_token_accuracy": 0.8499284267425538, "num_tokens": 614250140.0, "step": 18335 }, { "epoch": 1.0947946513849092, "grad_norm": 0.514983057975769, "learning_rate": 2.5838083623173303e-05, "loss": 0.4822, "mean_token_accuracy": 0.842204463481903, "num_tokens": 614417820.0, "step": 18340 }, { "epoch": 1.095093123209169, "grad_norm": 0.5464887619018555, "learning_rate": 2.582700981109647e-05, "loss": 0.4337, "mean_token_accuracy": 0.8544435024261474, "num_tokens": 614585500.0, "step": 18345 }, { "epoch": 1.0953915950334288, "grad_norm": 0.5516802668571472, "learning_rate": 2.5815936406507273e-05, "loss": 0.4588, "mean_token_accuracy": 0.8497495055198669, "num_tokens": 614753180.0, "step": 18350 }, { "epoch": 1.0956900668576885, "grad_norm": 0.5500524044036865, "learning_rate": 2.580486341210286e-05, "loss": 0.4294, "mean_token_accuracy": 0.8565907120704651, "num_tokens": 614920860.0, "step": 18355 }, { "epoch": 1.0959885386819483, "grad_norm": 0.5346205234527588, "learning_rate": 2.5793790830580243e-05, "loss": 0.4525, "mean_token_accuracy": 0.8499761462211609, "num_tokens": 615088540.0, "step": 18360 }, { "epoch": 1.0962870105062081, "grad_norm": 0.48874184489250183, "learning_rate": 2.5782718664636375e-05, "loss": 0.4675, "mean_token_accuracy": 0.8460873246192933, "num_tokens": 615256220.0, "step": 18365 }, { "epoch": 1.096585482330468, "grad_norm": 0.5291923880577087, "learning_rate": 2.577164691696805e-05, "loss": 0.4301, "mean_token_accuracy": 0.8561672449111939, "num_tokens": 615423900.0, "step": 18370 }, { "epoch": 1.0968839541547277, "grad_norm": 0.5688678622245789, "learning_rate": 2.5760575590272033e-05, "loss": 0.4526, "mean_token_accuracy": 0.8502266526222229, "num_tokens": 615591580.0, "step": 18375 }, { "epoch": 1.0971824259789875, "grad_norm": 0.5148406624794006, "learning_rate": 2.574950468724494e-05, "loss": 0.4472, "mean_token_accuracy": 0.8511391878128052, "num_tokens": 615759260.0, "step": 18380 }, { "epoch": 1.0974808978032473, "grad_norm": 0.5478710532188416, "learning_rate": 2.573843421058329e-05, "loss": 0.5155, "mean_token_accuracy": 0.8308302521705627, "num_tokens": 615926940.0, "step": 18385 }, { "epoch": 1.097779369627507, "grad_norm": 0.5326007008552551, "learning_rate": 2.5727364162983507e-05, "loss": 0.4357, "mean_token_accuracy": 0.8551711916923523, "num_tokens": 616094620.0, "step": 18390 }, { "epoch": 1.098077841451767, "grad_norm": 0.502690315246582, "learning_rate": 2.5716294547141895e-05, "loss": 0.46, "mean_token_accuracy": 0.8459739923477173, "num_tokens": 616262300.0, "step": 18395 }, { "epoch": 1.0983763132760267, "grad_norm": 0.4984673857688904, "learning_rate": 2.5705225365754665e-05, "loss": 0.4694, "mean_token_accuracy": 0.844924247264862, "num_tokens": 616429980.0, "step": 18400 }, { "epoch": 1.0986747851002865, "grad_norm": 0.5066306591033936, "learning_rate": 2.5694156621517935e-05, "loss": 0.4675, "mean_token_accuracy": 0.8442920327186585, "num_tokens": 616597660.0, "step": 18405 }, { "epoch": 1.0989732569245463, "grad_norm": 0.5151629447937012, "learning_rate": 2.5683088317127684e-05, "loss": 0.4265, "mean_token_accuracy": 0.8567219376564026, "num_tokens": 616765340.0, "step": 18410 }, { "epoch": 1.099271728748806, "grad_norm": 0.5981072187423706, "learning_rate": 2.5672020455279815e-05, "loss": 0.4702, "mean_token_accuracy": 0.8464453339576721, "num_tokens": 616927635.0, "step": 18415 }, { "epoch": 1.0995702005730659, "grad_norm": 0.5237789154052734, "learning_rate": 2.5660953038670105e-05, "loss": 0.4265, "mean_token_accuracy": 0.8574078559875489, "num_tokens": 617095315.0, "step": 18420 }, { "epoch": 1.0998686723973257, "grad_norm": 0.526952862739563, "learning_rate": 2.5649886069994217e-05, "loss": 0.4169, "mean_token_accuracy": 0.860187292098999, "num_tokens": 617262995.0, "step": 18425 }, { "epoch": 1.1001671442215855, "grad_norm": 0.5191293358802795, "learning_rate": 2.563881955194774e-05, "loss": 0.5046, "mean_token_accuracy": 0.8336156487464905, "num_tokens": 617430675.0, "step": 18430 }, { "epoch": 1.1004656160458453, "grad_norm": 0.4855729937553406, "learning_rate": 2.56277534872261e-05, "loss": 0.4672, "mean_token_accuracy": 0.8450912594795227, "num_tokens": 617598355.0, "step": 18435 }, { "epoch": 1.100764087870105, "grad_norm": 0.5127715468406677, "learning_rate": 2.561668787852467e-05, "loss": 0.468, "mean_token_accuracy": 0.8464809656143188, "num_tokens": 617766035.0, "step": 18440 }, { "epoch": 1.1010625596943648, "grad_norm": 0.520465612411499, "learning_rate": 2.560562272853866e-05, "loss": 0.4621, "mean_token_accuracy": 0.8458785653114319, "num_tokens": 617933715.0, "step": 18445 }, { "epoch": 1.1013610315186246, "grad_norm": 0.5159270763397217, "learning_rate": 2.5594558039963207e-05, "loss": 0.4565, "mean_token_accuracy": 0.8489025473594666, "num_tokens": 618101395.0, "step": 18450 }, { "epoch": 1.1016595033428844, "grad_norm": 0.5116859078407288, "learning_rate": 2.558349381549331e-05, "loss": 0.4681, "mean_token_accuracy": 0.8442264199256897, "num_tokens": 618269075.0, "step": 18455 }, { "epoch": 1.1019579751671442, "grad_norm": 0.48104482889175415, "learning_rate": 2.557243005782387e-05, "loss": 0.4372, "mean_token_accuracy": 0.8558511137962341, "num_tokens": 618436755.0, "step": 18460 }, { "epoch": 1.102256446991404, "grad_norm": 0.5392863154411316, "learning_rate": 2.556136676964967e-05, "loss": 0.4455, "mean_token_accuracy": 0.8509841322898865, "num_tokens": 618604435.0, "step": 18465 }, { "epoch": 1.1025549188156638, "grad_norm": 0.49735796451568604, "learning_rate": 2.5550303953665373e-05, "loss": 0.4684, "mean_token_accuracy": 0.8451151132583619, "num_tokens": 618772115.0, "step": 18470 }, { "epoch": 1.1028533906399236, "grad_norm": 0.5180985927581787, "learning_rate": 2.5539241612565533e-05, "loss": 0.4707, "mean_token_accuracy": 0.8442562341690063, "num_tokens": 618939795.0, "step": 18475 }, { "epoch": 1.1031518624641834, "grad_norm": 0.6098250150680542, "learning_rate": 2.5528179749044595e-05, "loss": 0.4807, "mean_token_accuracy": 0.8403674244880677, "num_tokens": 619107475.0, "step": 18480 }, { "epoch": 1.1034503342884432, "grad_norm": 0.576880156993866, "learning_rate": 2.5517118365796866e-05, "loss": 0.4786, "mean_token_accuracy": 0.8427412629127502, "num_tokens": 619275155.0, "step": 18485 }, { "epoch": 1.103748806112703, "grad_norm": 0.5477427244186401, "learning_rate": 2.5506057465516574e-05, "loss": 0.43, "mean_token_accuracy": 0.8560062050819397, "num_tokens": 619442835.0, "step": 18490 }, { "epoch": 1.1040472779369628, "grad_norm": 0.49006766080856323, "learning_rate": 2.549499705089778e-05, "loss": 0.4444, "mean_token_accuracy": 0.8523619294166564, "num_tokens": 619610515.0, "step": 18495 }, { "epoch": 1.1043457497612226, "grad_norm": 0.5077897310256958, "learning_rate": 2.548393712463447e-05, "loss": 0.4556, "mean_token_accuracy": 0.8489443063735962, "num_tokens": 619778195.0, "step": 18500 }, { "epoch": 1.1046442215854824, "grad_norm": 0.5169222950935364, "learning_rate": 2.547287768942047e-05, "loss": 0.5074, "mean_token_accuracy": 0.8335858345031738, "num_tokens": 619945875.0, "step": 18505 }, { "epoch": 1.1049426934097422, "grad_norm": 0.4919980466365814, "learning_rate": 2.5461818747949533e-05, "loss": 0.4366, "mean_token_accuracy": 0.8535548210144043, "num_tokens": 620113555.0, "step": 18510 }, { "epoch": 1.105241165234002, "grad_norm": 0.47955581545829773, "learning_rate": 2.5450760302915273e-05, "loss": 0.4327, "mean_token_accuracy": 0.855451512336731, "num_tokens": 620281235.0, "step": 18515 }, { "epoch": 1.1055396370582617, "grad_norm": 0.5274818539619446, "learning_rate": 2.5439702357011152e-05, "loss": 0.4077, "mean_token_accuracy": 0.8651019930839539, "num_tokens": 620448915.0, "step": 18520 }, { "epoch": 1.1058381088825215, "grad_norm": 0.5645363926887512, "learning_rate": 2.5428644912930555e-05, "loss": 0.4861, "mean_token_accuracy": 0.8398842930793762, "num_tokens": 620616595.0, "step": 18525 }, { "epoch": 1.1061365807067813, "grad_norm": 0.5423452258110046, "learning_rate": 2.541758797336672e-05, "loss": 0.4818, "mean_token_accuracy": 0.8411606907844543, "num_tokens": 620784275.0, "step": 18530 }, { "epoch": 1.1064350525310411, "grad_norm": 0.5464590191841125, "learning_rate": 2.5406531541012767e-05, "loss": 0.4853, "mean_token_accuracy": 0.8405761599540711, "num_tokens": 620951955.0, "step": 18535 }, { "epoch": 1.106733524355301, "grad_norm": 0.5267860889434814, "learning_rate": 2.539547561856171e-05, "loss": 0.4647, "mean_token_accuracy": 0.844661808013916, "num_tokens": 621119635.0, "step": 18540 }, { "epoch": 1.1070319961795607, "grad_norm": 0.46543750166893005, "learning_rate": 2.5384420208706406e-05, "loss": 0.4534, "mean_token_accuracy": 0.8505368113517762, "num_tokens": 621287315.0, "step": 18545 }, { "epoch": 1.1073304680038205, "grad_norm": 0.5024165511131287, "learning_rate": 2.5373365314139608e-05, "loss": 0.4767, "mean_token_accuracy": 0.842544436454773, "num_tokens": 621454995.0, "step": 18550 }, { "epoch": 1.1076289398280803, "grad_norm": 0.5313209891319275, "learning_rate": 2.536231093755394e-05, "loss": 0.453, "mean_token_accuracy": 0.8497912406921386, "num_tokens": 621622675.0, "step": 18555 }, { "epoch": 1.10792741165234, "grad_norm": 0.511211097240448, "learning_rate": 2.5351257081641906e-05, "loss": 0.4414, "mean_token_accuracy": 0.8542053461074829, "num_tokens": 621790225.0, "step": 18560 }, { "epoch": 1.1082258834766, "grad_norm": 0.5000944137573242, "learning_rate": 2.534020374909586e-05, "loss": 0.4634, "mean_token_accuracy": 0.8475009083747864, "num_tokens": 621957905.0, "step": 18565 }, { "epoch": 1.1085243553008597, "grad_norm": 0.507066547870636, "learning_rate": 2.532915094260806e-05, "loss": 0.4326, "mean_token_accuracy": 0.8578014969825745, "num_tokens": 622125585.0, "step": 18570 }, { "epoch": 1.1088228271251195, "grad_norm": 0.5120506882667542, "learning_rate": 2.531809866487062e-05, "loss": 0.4689, "mean_token_accuracy": 0.8474770307540893, "num_tokens": 622293265.0, "step": 18575 }, { "epoch": 1.1091212989493793, "grad_norm": 0.5658789277076721, "learning_rate": 2.530704691857552e-05, "loss": 0.4444, "mean_token_accuracy": 0.8538232207298279, "num_tokens": 622460945.0, "step": 18580 }, { "epoch": 1.109419770773639, "grad_norm": 0.6437373161315918, "learning_rate": 2.529599570641463e-05, "loss": 0.5148, "mean_token_accuracy": 0.8337647676467895, "num_tokens": 622628625.0, "step": 18585 }, { "epoch": 1.1097182425978986, "grad_norm": 0.4807692766189575, "learning_rate": 2.528494503107966e-05, "loss": 0.4625, "mean_token_accuracy": 0.8465406179428101, "num_tokens": 622796305.0, "step": 18590 }, { "epoch": 1.1100167144221587, "grad_norm": 0.46318143606185913, "learning_rate": 2.5273894895262213e-05, "loss": 0.4505, "mean_token_accuracy": 0.8503041863441467, "num_tokens": 622963985.0, "step": 18595 }, { "epoch": 1.1103151862464182, "grad_norm": 0.5829127430915833, "learning_rate": 2.5262845301653753e-05, "loss": 0.473, "mean_token_accuracy": 0.8428963422775269, "num_tokens": 623131665.0, "step": 18600 }, { "epoch": 1.1106136580706782, "grad_norm": 0.5519548058509827, "learning_rate": 2.5251796252945614e-05, "loss": 0.4838, "mean_token_accuracy": 0.840576171875, "num_tokens": 623299345.0, "step": 18605 }, { "epoch": 1.1109121298949378, "grad_norm": 0.5513665080070496, "learning_rate": 2.5240747751828998e-05, "loss": 0.4553, "mean_token_accuracy": 0.8498568534851074, "num_tokens": 623467025.0, "step": 18610 }, { "epoch": 1.1112106017191976, "grad_norm": 0.5571180582046509, "learning_rate": 2.522969980099496e-05, "loss": 0.5205, "mean_token_accuracy": 0.8306632518768311, "num_tokens": 623634705.0, "step": 18615 }, { "epoch": 1.1115090735434574, "grad_norm": 0.4902142286300659, "learning_rate": 2.521865240313444e-05, "loss": 0.452, "mean_token_accuracy": 0.8496480941772461, "num_tokens": 623802385.0, "step": 18620 }, { "epoch": 1.1118075453677172, "grad_norm": 0.4946683645248413, "learning_rate": 2.5207605560938248e-05, "loss": 0.4288, "mean_token_accuracy": 0.8569724440574646, "num_tokens": 623970065.0, "step": 18625 }, { "epoch": 1.112106017191977, "grad_norm": 0.5599443316459656, "learning_rate": 2.519655927709702e-05, "loss": 0.4285, "mean_token_accuracy": 0.8579577803611755, "num_tokens": 624135457.0, "step": 18630 }, { "epoch": 1.1124044890162368, "grad_norm": 0.4648134112358093, "learning_rate": 2.5185513554301315e-05, "loss": 0.4011, "mean_token_accuracy": 0.865650725364685, "num_tokens": 624303137.0, "step": 18635 }, { "epoch": 1.1127029608404966, "grad_norm": 0.45229512453079224, "learning_rate": 2.5174468395241484e-05, "loss": 0.4977, "mean_token_accuracy": 0.8354169011116028, "num_tokens": 624470817.0, "step": 18640 }, { "epoch": 1.1130014326647564, "grad_norm": 0.513867199420929, "learning_rate": 2.516342380260781e-05, "loss": 0.4482, "mean_token_accuracy": 0.851741623878479, "num_tokens": 624638497.0, "step": 18645 }, { "epoch": 1.1132999044890162, "grad_norm": 0.5177357196807861, "learning_rate": 2.5152379779090397e-05, "loss": 0.432, "mean_token_accuracy": 0.8550399661064148, "num_tokens": 624806177.0, "step": 18650 }, { "epoch": 1.113598376313276, "grad_norm": 0.5707488656044006, "learning_rate": 2.5141336327379218e-05, "loss": 0.445, "mean_token_accuracy": 0.8517833709716797, "num_tokens": 624973857.0, "step": 18655 }, { "epoch": 1.1138968481375358, "grad_norm": 0.5320921540260315, "learning_rate": 2.5130293450164123e-05, "loss": 0.4386, "mean_token_accuracy": 0.8533460617065429, "num_tokens": 625141537.0, "step": 18660 }, { "epoch": 1.1141953199617955, "grad_norm": 0.5352445840835571, "learning_rate": 2.511925115013479e-05, "loss": 0.4489, "mean_token_accuracy": 0.8527198076248169, "num_tokens": 625309217.0, "step": 18665 }, { "epoch": 1.1144937917860553, "grad_norm": 0.5579648017883301, "learning_rate": 2.5108209429980784e-05, "loss": 0.453, "mean_token_accuracy": 0.8490635871887207, "num_tokens": 625476897.0, "step": 18670 }, { "epoch": 1.1147922636103151, "grad_norm": 0.5246790051460266, "learning_rate": 2.509716829239151e-05, "loss": 0.4118, "mean_token_accuracy": 0.8606226921081543, "num_tokens": 625644577.0, "step": 18675 }, { "epoch": 1.115090735434575, "grad_norm": 0.5772244334220886, "learning_rate": 2.5086127740056258e-05, "loss": 0.4445, "mean_token_accuracy": 0.8525766491889953, "num_tokens": 625812257.0, "step": 18680 }, { "epoch": 1.1153892072588347, "grad_norm": 0.6147289276123047, "learning_rate": 2.507508777566415e-05, "loss": 0.4243, "mean_token_accuracy": 0.8570499777793884, "num_tokens": 625979937.0, "step": 18685 }, { "epoch": 1.1156876790830945, "grad_norm": 0.4873496890068054, "learning_rate": 2.5064048401904168e-05, "loss": 0.4578, "mean_token_accuracy": 0.8500179052352905, "num_tokens": 626147617.0, "step": 18690 }, { "epoch": 1.1159861509073543, "grad_norm": 0.4933112561702728, "learning_rate": 2.5053009621465167e-05, "loss": 0.4774, "mean_token_accuracy": 0.842908251285553, "num_tokens": 626315297.0, "step": 18695 }, { "epoch": 1.116284622731614, "grad_norm": 0.5187444090843201, "learning_rate": 2.5041971437035833e-05, "loss": 0.5004, "mean_token_accuracy": 0.8352618336677551, "num_tokens": 626482977.0, "step": 18700 }, { "epoch": 1.116583094555874, "grad_norm": 0.4695175290107727, "learning_rate": 2.503093385130473e-05, "loss": 0.4608, "mean_token_accuracy": 0.8457294464111328, "num_tokens": 626650657.0, "step": 18705 }, { "epoch": 1.1168815663801337, "grad_norm": 0.6617633104324341, "learning_rate": 2.5019896866960258e-05, "loss": 0.4471, "mean_token_accuracy": 0.8519742369651795, "num_tokens": 626818337.0, "step": 18710 }, { "epoch": 1.1171800382043935, "grad_norm": 0.5181426405906677, "learning_rate": 2.5008860486690682e-05, "loss": 0.4139, "mean_token_accuracy": 0.8622509837150574, "num_tokens": 626986017.0, "step": 18715 }, { "epoch": 1.1174785100286533, "grad_norm": 0.5520211458206177, "learning_rate": 2.499782471318412e-05, "loss": 0.4767, "mean_token_accuracy": 0.8423714637756348, "num_tokens": 627153697.0, "step": 18720 }, { "epoch": 1.117776981852913, "grad_norm": 0.4633947014808655, "learning_rate": 2.498678954912853e-05, "loss": 0.4499, "mean_token_accuracy": 0.8509954452514649, "num_tokens": 627316377.0, "step": 18725 }, { "epoch": 1.1180754536771729, "grad_norm": 0.566048800945282, "learning_rate": 2.4975754997211733e-05, "loss": 0.4564, "mean_token_accuracy": 0.8484796643257141, "num_tokens": 627478346.0, "step": 18730 }, { "epoch": 1.1183739255014327, "grad_norm": 0.5108193159103394, "learning_rate": 2.4964721060121405e-05, "loss": 0.4202, "mean_token_accuracy": 0.8592508673667908, "num_tokens": 627646026.0, "step": 18735 }, { "epoch": 1.1186723973256925, "grad_norm": 0.4598321318626404, "learning_rate": 2.4953687740545052e-05, "loss": 0.4388, "mean_token_accuracy": 0.85297030210495, "num_tokens": 627813706.0, "step": 18740 }, { "epoch": 1.1189708691499523, "grad_norm": 0.5469609498977661, "learning_rate": 2.4942655041170054e-05, "loss": 0.4781, "mean_token_accuracy": 0.844172728061676, "num_tokens": 627981386.0, "step": 18745 }, { "epoch": 1.119269340974212, "grad_norm": 0.4844552278518677, "learning_rate": 2.4931622964683622e-05, "loss": 0.4642, "mean_token_accuracy": 0.8461052179336548, "num_tokens": 628149066.0, "step": 18750 }, { "epoch": 1.1195678127984718, "grad_norm": 0.5120344758033752, "learning_rate": 2.4920591513772813e-05, "loss": 0.4253, "mean_token_accuracy": 0.8579744815826416, "num_tokens": 628316746.0, "step": 18755 }, { "epoch": 1.1198662846227316, "grad_norm": 0.4837286174297333, "learning_rate": 2.4909560691124573e-05, "loss": 0.4294, "mean_token_accuracy": 0.8561970591545105, "num_tokens": 628484426.0, "step": 18760 }, { "epoch": 1.1201647564469914, "grad_norm": 0.4842304289340973, "learning_rate": 2.489853049942562e-05, "loss": 0.4578, "mean_token_accuracy": 0.8488130688667297, "num_tokens": 628652106.0, "step": 18765 }, { "epoch": 1.1204632282712512, "grad_norm": 0.5748688578605652, "learning_rate": 2.4887500941362597e-05, "loss": 0.4872, "mean_token_accuracy": 0.8410055994987488, "num_tokens": 628819786.0, "step": 18770 }, { "epoch": 1.120761700095511, "grad_norm": 0.525775134563446, "learning_rate": 2.4876472019621926e-05, "loss": 0.4363, "mean_token_accuracy": 0.8562507510185242, "num_tokens": 628987466.0, "step": 18775 }, { "epoch": 1.1210601719197708, "grad_norm": 0.5338101983070374, "learning_rate": 2.486544373688993e-05, "loss": 0.4824, "mean_token_accuracy": 0.8400930523872375, "num_tokens": 629155146.0, "step": 18780 }, { "epoch": 1.1213586437440306, "grad_norm": 0.523887038230896, "learning_rate": 2.485441609585274e-05, "loss": 0.4439, "mean_token_accuracy": 0.8521949172019958, "num_tokens": 629322826.0, "step": 18785 }, { "epoch": 1.1216571155682904, "grad_norm": 0.5007719397544861, "learning_rate": 2.4843389099196333e-05, "loss": 0.4846, "mean_token_accuracy": 0.8414231300354004, "num_tokens": 629490506.0, "step": 18790 }, { "epoch": 1.1219555873925502, "grad_norm": 0.5108416080474854, "learning_rate": 2.4832362749606563e-05, "loss": 0.4365, "mean_token_accuracy": 0.8550638079643249, "num_tokens": 629658186.0, "step": 18795 }, { "epoch": 1.12225405921681, "grad_norm": 0.49002453684806824, "learning_rate": 2.482133704976907e-05, "loss": 0.4524, "mean_token_accuracy": 0.8481234669685364, "num_tokens": 629822897.0, "step": 18800 }, { "epoch": 1.1225525310410698, "grad_norm": 0.5626703500747681, "learning_rate": 2.481031200236939e-05, "loss": 0.4688, "mean_token_accuracy": 0.8455027937889099, "num_tokens": 629990577.0, "step": 18805 }, { "epoch": 1.1228510028653296, "grad_norm": 0.5119451284408569, "learning_rate": 2.4799287610092857e-05, "loss": 0.4761, "mean_token_accuracy": 0.8410875558853149, "num_tokens": 630150033.0, "step": 18810 }, { "epoch": 1.1231494746895894, "grad_norm": 0.48004892468452454, "learning_rate": 2.478826387562468e-05, "loss": 0.5217, "mean_token_accuracy": 0.8309793591499328, "num_tokens": 630317713.0, "step": 18815 }, { "epoch": 1.1234479465138492, "grad_norm": 0.507409393787384, "learning_rate": 2.4777240801649898e-05, "loss": 0.4183, "mean_token_accuracy": 0.8602648258209229, "num_tokens": 630485393.0, "step": 18820 }, { "epoch": 1.123746418338109, "grad_norm": 0.47925466299057007, "learning_rate": 2.476621839085336e-05, "loss": 0.4419, "mean_token_accuracy": 0.8538053154945373, "num_tokens": 630653073.0, "step": 18825 }, { "epoch": 1.1240448901623687, "grad_norm": 0.5088526010513306, "learning_rate": 2.4755196645919816e-05, "loss": 0.4178, "mean_token_accuracy": 0.8601216793060302, "num_tokens": 630820753.0, "step": 18830 }, { "epoch": 1.1243433619866285, "grad_norm": 0.5245963335037231, "learning_rate": 2.4744175569533767e-05, "loss": 0.4336, "mean_token_accuracy": 0.8552546858787536, "num_tokens": 630988433.0, "step": 18835 }, { "epoch": 1.1246418338108883, "grad_norm": 0.48288968205451965, "learning_rate": 2.4733155164379628e-05, "loss": 0.4132, "mean_token_accuracy": 0.8623464107513428, "num_tokens": 631156113.0, "step": 18840 }, { "epoch": 1.1249403056351481, "grad_norm": 0.5197480320930481, "learning_rate": 2.4722135433141628e-05, "loss": 0.4786, "mean_token_accuracy": 0.8421090245246887, "num_tokens": 631323793.0, "step": 18845 }, { "epoch": 1.125238777459408, "grad_norm": 0.8062328100204468, "learning_rate": 2.4711116378503816e-05, "loss": 0.4597, "mean_token_accuracy": 0.8470416307449341, "num_tokens": 631491473.0, "step": 18850 }, { "epoch": 1.1255372492836677, "grad_norm": 0.5386670827865601, "learning_rate": 2.470009800315009e-05, "loss": 0.4444, "mean_token_accuracy": 0.8508708119392395, "num_tokens": 631659153.0, "step": 18855 }, { "epoch": 1.1258357211079275, "grad_norm": 0.48615118861198425, "learning_rate": 2.4689080309764166e-05, "loss": 0.4568, "mean_token_accuracy": 0.8480973362922668, "num_tokens": 631826833.0, "step": 18860 }, { "epoch": 1.126134192932187, "grad_norm": 0.557109534740448, "learning_rate": 2.4678063301029623e-05, "loss": 0.4586, "mean_token_accuracy": 0.8491649746894836, "num_tokens": 631994513.0, "step": 18865 }, { "epoch": 1.126432664756447, "grad_norm": 0.4750424027442932, "learning_rate": 2.466704697962985e-05, "loss": 0.4425, "mean_token_accuracy": 0.8523380756378174, "num_tokens": 632162193.0, "step": 18870 }, { "epoch": 1.1267311365807067, "grad_norm": 0.5284352898597717, "learning_rate": 2.465603134824807e-05, "loss": 0.4698, "mean_token_accuracy": 0.8445007681846619, "num_tokens": 632329873.0, "step": 18875 }, { "epoch": 1.1270296084049667, "grad_norm": 0.5021812915802002, "learning_rate": 2.4645016409567358e-05, "loss": 0.4492, "mean_token_accuracy": 0.8509423971176148, "num_tokens": 632497553.0, "step": 18880 }, { "epoch": 1.1273280802292263, "grad_norm": 0.5287967324256897, "learning_rate": 2.4634002166270586e-05, "loss": 0.4859, "mean_token_accuracy": 0.8388882279396057, "num_tokens": 632665233.0, "step": 18885 }, { "epoch": 1.1276265520534863, "grad_norm": 0.5061269998550415, "learning_rate": 2.46229886210405e-05, "loss": 0.5049, "mean_token_accuracy": 0.8349457263946534, "num_tokens": 632832913.0, "step": 18890 }, { "epoch": 1.1279250238777458, "grad_norm": 0.5035704970359802, "learning_rate": 2.4611975776559626e-05, "loss": 0.472, "mean_token_accuracy": 0.8462602853775024, "num_tokens": 633000593.0, "step": 18895 }, { "epoch": 1.1282234957020056, "grad_norm": 0.6159593462944031, "learning_rate": 2.4600963635510354e-05, "loss": 0.4773, "mean_token_accuracy": 0.8428009033203125, "num_tokens": 633168273.0, "step": 18900 }, { "epoch": 1.1285219675262654, "grad_norm": 0.5488763451576233, "learning_rate": 2.458995220057491e-05, "loss": 0.4367, "mean_token_accuracy": 0.8522724628448486, "num_tokens": 633335953.0, "step": 18905 }, { "epoch": 1.1288204393505252, "grad_norm": 0.5279453992843628, "learning_rate": 2.457894147443532e-05, "loss": 0.435, "mean_token_accuracy": 0.8547178864479065, "num_tokens": 633503633.0, "step": 18910 }, { "epoch": 1.129118911174785, "grad_norm": 0.5483424067497253, "learning_rate": 2.4567931459773452e-05, "loss": 0.4452, "mean_token_accuracy": 0.8512585043907166, "num_tokens": 633671313.0, "step": 18915 }, { "epoch": 1.1294173829990448, "grad_norm": 0.5122362375259399, "learning_rate": 2.455692215927099e-05, "loss": 0.4769, "mean_token_accuracy": 0.8424489974975586, "num_tokens": 633838993.0, "step": 18920 }, { "epoch": 1.1297158548233046, "grad_norm": 0.5410659909248352, "learning_rate": 2.4545913575609463e-05, "loss": 0.4613, "mean_token_accuracy": 0.8473100304603577, "num_tokens": 634006673.0, "step": 18925 }, { "epoch": 1.1300143266475644, "grad_norm": 0.4490204453468323, "learning_rate": 2.4534905711470215e-05, "loss": 0.4404, "mean_token_accuracy": 0.8527018904685975, "num_tokens": 634174353.0, "step": 18930 }, { "epoch": 1.1303127984718242, "grad_norm": 0.5037288069725037, "learning_rate": 2.45238985695344e-05, "loss": 0.4582, "mean_token_accuracy": 0.8473219633102417, "num_tokens": 634342033.0, "step": 18935 }, { "epoch": 1.130611270296084, "grad_norm": 0.5119936466217041, "learning_rate": 2.451289215248303e-05, "loss": 0.454, "mean_token_accuracy": 0.8490933895111084, "num_tokens": 634509713.0, "step": 18940 }, { "epoch": 1.1309097421203438, "grad_norm": 0.4720776379108429, "learning_rate": 2.4501886462996908e-05, "loss": 0.4586, "mean_token_accuracy": 0.8477633237838745, "num_tokens": 634677393.0, "step": 18945 }, { "epoch": 1.1312082139446036, "grad_norm": 0.5458407998085022, "learning_rate": 2.4490881503756668e-05, "loss": 0.4678, "mean_token_accuracy": 0.8445604324340821, "num_tokens": 634845073.0, "step": 18950 }, { "epoch": 1.1315066857688634, "grad_norm": 0.5116104483604431, "learning_rate": 2.44798772774428e-05, "loss": 0.4191, "mean_token_accuracy": 0.8598532795906066, "num_tokens": 635012753.0, "step": 18955 }, { "epoch": 1.1318051575931232, "grad_norm": 0.4883235991001129, "learning_rate": 2.4468873786735553e-05, "loss": 0.4616, "mean_token_accuracy": 0.8462841510772705, "num_tokens": 635180433.0, "step": 18960 }, { "epoch": 1.132103629417383, "grad_norm": 0.5117110013961792, "learning_rate": 2.445787103431505e-05, "loss": 0.4354, "mean_token_accuracy": 0.8550399541854858, "num_tokens": 635348113.0, "step": 18965 }, { "epoch": 1.1324021012416428, "grad_norm": 0.49613380432128906, "learning_rate": 2.4446869022861195e-05, "loss": 0.4398, "mean_token_accuracy": 0.8532565832138062, "num_tokens": 635515793.0, "step": 18970 }, { "epoch": 1.1327005730659025, "grad_norm": 0.4743483066558838, "learning_rate": 2.443586775505376e-05, "loss": 0.4307, "mean_token_accuracy": 0.8560479640960693, "num_tokens": 635683473.0, "step": 18975 }, { "epoch": 1.1329990448901623, "grad_norm": 0.47173023223876953, "learning_rate": 2.442486723357228e-05, "loss": 0.4522, "mean_token_accuracy": 0.8483001232147217, "num_tokens": 635851153.0, "step": 18980 }, { "epoch": 1.1332975167144221, "grad_norm": 0.5263618230819702, "learning_rate": 2.4413867461096147e-05, "loss": 0.4633, "mean_token_accuracy": 0.845174765586853, "num_tokens": 636018833.0, "step": 18985 }, { "epoch": 1.133595988538682, "grad_norm": 0.5367741584777832, "learning_rate": 2.440286844030456e-05, "loss": 0.4773, "mean_token_accuracy": 0.8444530606269837, "num_tokens": 636186513.0, "step": 18990 }, { "epoch": 1.1338944603629417, "grad_norm": 0.5594318509101868, "learning_rate": 2.439187017387653e-05, "loss": 0.4809, "mean_token_accuracy": 0.8401586532592773, "num_tokens": 636354193.0, "step": 18995 }, { "epoch": 1.1341929321872015, "grad_norm": 0.5120664834976196, "learning_rate": 2.438087266449089e-05, "loss": 0.4567, "mean_token_accuracy": 0.8504831314086914, "num_tokens": 636521873.0, "step": 19000 }, { "epoch": 1.1344914040114613, "grad_norm": 0.48474791646003723, "learning_rate": 2.4369875914826283e-05, "loss": 0.4429, "mean_token_accuracy": 0.8525647163391114, "num_tokens": 636689553.0, "step": 19005 }, { "epoch": 1.134789875835721, "grad_norm": 0.43569645285606384, "learning_rate": 2.4358879927561173e-05, "loss": 0.4175, "mean_token_accuracy": 0.8619527578353882, "num_tokens": 636857233.0, "step": 19010 }, { "epoch": 1.135088347659981, "grad_norm": 0.5542641282081604, "learning_rate": 2.4347884705373836e-05, "loss": 0.4563, "mean_token_accuracy": 0.8471847891807556, "num_tokens": 637024913.0, "step": 19015 }, { "epoch": 1.1353868194842407, "grad_norm": 0.45384833216667175, "learning_rate": 2.4336890250942357e-05, "loss": 0.4741, "mean_token_accuracy": 0.843409287929535, "num_tokens": 637192593.0, "step": 19020 }, { "epoch": 1.1356852913085005, "grad_norm": 0.5924787521362305, "learning_rate": 2.432589656694465e-05, "loss": 0.4467, "mean_token_accuracy": 0.8513658523559571, "num_tokens": 637360273.0, "step": 19025 }, { "epoch": 1.1359837631327603, "grad_norm": 0.5013798475265503, "learning_rate": 2.4314903656058407e-05, "loss": 0.4798, "mean_token_accuracy": 0.8411785721778869, "num_tokens": 637527953.0, "step": 19030 }, { "epoch": 1.13628223495702, "grad_norm": 0.5134493708610535, "learning_rate": 2.4303911520961176e-05, "loss": 0.4696, "mean_token_accuracy": 0.8435822606086731, "num_tokens": 637695633.0, "step": 19035 }, { "epoch": 1.1365807067812799, "grad_norm": 0.5029426217079163, "learning_rate": 2.429292016433029e-05, "loss": 0.4388, "mean_token_accuracy": 0.8541810870170593, "num_tokens": 637863313.0, "step": 19040 }, { "epoch": 1.1368791786055397, "grad_norm": 0.48797866702079773, "learning_rate": 2.4281929588842893e-05, "loss": 0.4523, "mean_token_accuracy": 0.8495467066764831, "num_tokens": 638030993.0, "step": 19045 }, { "epoch": 1.1371776504297995, "grad_norm": 0.5125130414962769, "learning_rate": 2.4270939797175946e-05, "loss": 0.4878, "mean_token_accuracy": 0.8382977604866028, "num_tokens": 638198673.0, "step": 19050 }, { "epoch": 1.1374761222540593, "grad_norm": 0.5460643768310547, "learning_rate": 2.425995079200621e-05, "loss": 0.4666, "mean_token_accuracy": 0.8456459522247315, "num_tokens": 638366353.0, "step": 19055 }, { "epoch": 1.137774594078319, "grad_norm": 0.4995202422142029, "learning_rate": 2.424896257601026e-05, "loss": 0.4412, "mean_token_accuracy": 0.8523499965667725, "num_tokens": 638534033.0, "step": 19060 }, { "epoch": 1.1380730659025788, "grad_norm": 0.4863225817680359, "learning_rate": 2.423797515186449e-05, "loss": 0.4184, "mean_token_accuracy": 0.8598890542984009, "num_tokens": 638701713.0, "step": 19065 }, { "epoch": 1.1383715377268386, "grad_norm": 0.597139298915863, "learning_rate": 2.422698852224507e-05, "loss": 0.4652, "mean_token_accuracy": 0.8443874478340149, "num_tokens": 638869393.0, "step": 19070 }, { "epoch": 1.1386700095510984, "grad_norm": 0.5598382949829102, "learning_rate": 2.4216002689828022e-05, "loss": 0.4705, "mean_token_accuracy": 0.8461409926414489, "num_tokens": 639037073.0, "step": 19075 }, { "epoch": 1.1389684813753582, "grad_norm": 0.5371782183647156, "learning_rate": 2.4205017657289127e-05, "loss": 0.4652, "mean_token_accuracy": 0.8455564975738525, "num_tokens": 639204753.0, "step": 19080 }, { "epoch": 1.139266953199618, "grad_norm": 0.5139977931976318, "learning_rate": 2.4194033427304e-05, "loss": 0.4497, "mean_token_accuracy": 0.8494035363197326, "num_tokens": 639372433.0, "step": 19085 }, { "epoch": 1.1395654250238778, "grad_norm": 0.5258270502090454, "learning_rate": 2.4183050002548053e-05, "loss": 0.4652, "mean_token_accuracy": 0.8466984033584595, "num_tokens": 639532795.0, "step": 19090 }, { "epoch": 1.1398638968481376, "grad_norm": 0.5295782089233398, "learning_rate": 2.4172067385696495e-05, "loss": 0.4601, "mean_token_accuracy": 0.8459859251976013, "num_tokens": 639700475.0, "step": 19095 }, { "epoch": 1.1401623686723974, "grad_norm": 0.5212468504905701, "learning_rate": 2.416108557942437e-05, "loss": 0.4926, "mean_token_accuracy": 0.8374627232551575, "num_tokens": 639868155.0, "step": 19100 }, { "epoch": 1.1404608404966572, "grad_norm": 0.5109013915061951, "learning_rate": 2.4150104586406458e-05, "loss": 0.4567, "mean_token_accuracy": 0.8484373092651367, "num_tokens": 640035835.0, "step": 19105 }, { "epoch": 1.140759312320917, "grad_norm": 0.4664410650730133, "learning_rate": 2.4139124409317426e-05, "loss": 0.4414, "mean_token_accuracy": 0.8532983422279358, "num_tokens": 640203515.0, "step": 19110 }, { "epoch": 1.1410577841451768, "grad_norm": 0.49429747462272644, "learning_rate": 2.412814505083167e-05, "loss": 0.4539, "mean_token_accuracy": 0.8493438959121704, "num_tokens": 640371195.0, "step": 19115 }, { "epoch": 1.1413562559694366, "grad_norm": 0.5251138806343079, "learning_rate": 2.4117166513623422e-05, "loss": 0.4464, "mean_token_accuracy": 0.8530895948410034, "num_tokens": 640538875.0, "step": 19120 }, { "epoch": 1.1416547277936964, "grad_norm": 0.6203110218048096, "learning_rate": 2.4106188800366714e-05, "loss": 0.4734, "mean_token_accuracy": 0.8427710771560669, "num_tokens": 640706555.0, "step": 19125 }, { "epoch": 1.1419531996179562, "grad_norm": 0.4993927478790283, "learning_rate": 2.409521191373536e-05, "loss": 0.431, "mean_token_accuracy": 0.8566264986991883, "num_tokens": 640874235.0, "step": 19130 }, { "epoch": 1.142251671442216, "grad_norm": 0.6741517186164856, "learning_rate": 2.4084235856402996e-05, "loss": 0.4531, "mean_token_accuracy": 0.8510616779327392, "num_tokens": 641041915.0, "step": 19135 }, { "epoch": 1.1425501432664755, "grad_norm": 0.5305507779121399, "learning_rate": 2.4073260631043038e-05, "loss": 0.4983, "mean_token_accuracy": 0.8357807517051696, "num_tokens": 641209595.0, "step": 19140 }, { "epoch": 1.1428486150907355, "grad_norm": 0.5178223848342896, "learning_rate": 2.4062286240328702e-05, "loss": 0.4544, "mean_token_accuracy": 0.849487054347992, "num_tokens": 641377275.0, "step": 19145 }, { "epoch": 1.1431470869149951, "grad_norm": 0.4566723704338074, "learning_rate": 2.4051312686933004e-05, "loss": 0.4274, "mean_token_accuracy": 0.8563103914260864, "num_tokens": 641544955.0, "step": 19150 }, { "epoch": 1.1434455587392551, "grad_norm": 0.5075921416282654, "learning_rate": 2.4040339973528754e-05, "loss": 0.4559, "mean_token_accuracy": 0.8491172552108764, "num_tokens": 641712635.0, "step": 19155 }, { "epoch": 1.1437440305635147, "grad_norm": 0.4714864492416382, "learning_rate": 2.402936810278857e-05, "loss": 0.4799, "mean_token_accuracy": 0.8408684253692627, "num_tokens": 641880315.0, "step": 19160 }, { "epoch": 1.1440425023877747, "grad_norm": 0.48537150025367737, "learning_rate": 2.401839707738483e-05, "loss": 0.4327, "mean_token_accuracy": 0.8557616472244263, "num_tokens": 642047995.0, "step": 19165 }, { "epoch": 1.1443409742120343, "grad_norm": 0.5679064989089966, "learning_rate": 2.4007426899989753e-05, "loss": 0.4554, "mean_token_accuracy": 0.850477147102356, "num_tokens": 642215675.0, "step": 19170 }, { "epoch": 1.144639446036294, "grad_norm": 0.4865145981311798, "learning_rate": 2.3996457573275326e-05, "loss": 0.4414, "mean_token_accuracy": 0.8535130500793457, "num_tokens": 642383355.0, "step": 19175 }, { "epoch": 1.1449379178605539, "grad_norm": 0.5719457864761353, "learning_rate": 2.3985489099913322e-05, "loss": 0.4634, "mean_token_accuracy": 0.8466837644577027, "num_tokens": 642551035.0, "step": 19180 }, { "epoch": 1.1452363896848137, "grad_norm": 0.4855192005634308, "learning_rate": 2.3974521482575317e-05, "loss": 0.4571, "mean_token_accuracy": 0.8489920139312744, "num_tokens": 642718715.0, "step": 19185 }, { "epoch": 1.1455348615090735, "grad_norm": 0.4773414433002472, "learning_rate": 2.3963554723932673e-05, "loss": 0.4369, "mean_token_accuracy": 0.8547298073768616, "num_tokens": 642886395.0, "step": 19190 }, { "epoch": 1.1458333333333333, "grad_norm": 0.516990602016449, "learning_rate": 2.3952588826656562e-05, "loss": 0.4389, "mean_token_accuracy": 0.8544315814971923, "num_tokens": 643054075.0, "step": 19195 }, { "epoch": 1.146131805157593, "grad_norm": 0.45904049277305603, "learning_rate": 2.394162379341791e-05, "loss": 0.4616, "mean_token_accuracy": 0.8474591493606567, "num_tokens": 643221755.0, "step": 19200 }, { "epoch": 1.1464302769818528, "grad_norm": 0.5339150428771973, "learning_rate": 2.393065962688747e-05, "loss": 0.4753, "mean_token_accuracy": 0.840399706363678, "num_tokens": 643386233.0, "step": 19205 }, { "epoch": 1.1467287488061126, "grad_norm": 0.5641091465950012, "learning_rate": 2.3919696329735757e-05, "loss": 0.483, "mean_token_accuracy": 0.8402719855308532, "num_tokens": 643553913.0, "step": 19210 }, { "epoch": 1.1470272206303724, "grad_norm": 0.6663140058517456, "learning_rate": 2.3908733904633086e-05, "loss": 0.4482, "mean_token_accuracy": 0.8502909898757934, "num_tokens": 643719878.0, "step": 19215 }, { "epoch": 1.1473256924546322, "grad_norm": 0.538935661315918, "learning_rate": 2.3897772354249565e-05, "loss": 0.4574, "mean_token_accuracy": 0.8498151063919067, "num_tokens": 643887558.0, "step": 19220 }, { "epoch": 1.147624164278892, "grad_norm": 0.6172839403152466, "learning_rate": 2.3886811681255074e-05, "loss": 0.4593, "mean_token_accuracy": 0.8503340005874633, "num_tokens": 644055238.0, "step": 19225 }, { "epoch": 1.1479226361031518, "grad_norm": 0.45632052421569824, "learning_rate": 2.3875851888319277e-05, "loss": 0.4444, "mean_token_accuracy": 0.8506024122238159, "num_tokens": 644222918.0, "step": 19230 }, { "epoch": 1.1482211079274116, "grad_norm": 0.46800461411476135, "learning_rate": 2.386489297811167e-05, "loss": 0.3943, "mean_token_accuracy": 0.8668734431266785, "num_tokens": 644390598.0, "step": 19235 }, { "epoch": 1.1485195797516714, "grad_norm": 0.5249519348144531, "learning_rate": 2.3853934953301455e-05, "loss": 0.4449, "mean_token_accuracy": 0.8518907308578492, "num_tokens": 644558278.0, "step": 19240 }, { "epoch": 1.1488180515759312, "grad_norm": 0.5473239421844482, "learning_rate": 2.3842977816557694e-05, "loss": 0.4078, "mean_token_accuracy": 0.863545274734497, "num_tokens": 644725958.0, "step": 19245 }, { "epoch": 1.149116523400191, "grad_norm": 0.5105882883071899, "learning_rate": 2.3832021570549178e-05, "loss": 0.4436, "mean_token_accuracy": 0.8536741018295289, "num_tokens": 644893638.0, "step": 19250 }, { "epoch": 1.1494149952244508, "grad_norm": 0.5072927474975586, "learning_rate": 2.3821066217944517e-05, "loss": 0.476, "mean_token_accuracy": 0.8418585300445557, "num_tokens": 645061318.0, "step": 19255 }, { "epoch": 1.1497134670487106, "grad_norm": 0.5030297636985779, "learning_rate": 2.3810111761412086e-05, "loss": 0.4575, "mean_token_accuracy": 0.8480973362922668, "num_tokens": 645228998.0, "step": 19260 }, { "epoch": 1.1500119388729704, "grad_norm": 0.46399497985839844, "learning_rate": 2.379915820362003e-05, "loss": 0.4254, "mean_token_accuracy": 0.8578969359397888, "num_tokens": 645396678.0, "step": 19265 }, { "epoch": 1.1503104106972302, "grad_norm": 0.5404046177864075, "learning_rate": 2.378820554723632e-05, "loss": 0.4837, "mean_token_accuracy": 0.8409817457199097, "num_tokens": 645564358.0, "step": 19270 }, { "epoch": 1.15060888252149, "grad_norm": 0.5211895108222961, "learning_rate": 2.377725379492865e-05, "loss": 0.4405, "mean_token_accuracy": 0.8542765140533447, "num_tokens": 645732038.0, "step": 19275 }, { "epoch": 1.1509073543457498, "grad_norm": 0.5210592746734619, "learning_rate": 2.3766302949364528e-05, "loss": 0.4364, "mean_token_accuracy": 0.8568591117858887, "num_tokens": 645899718.0, "step": 19280 }, { "epoch": 1.1512058261700095, "grad_norm": 0.4576818645000458, "learning_rate": 2.3755353013211255e-05, "loss": 0.438, "mean_token_accuracy": 0.8534355163574219, "num_tokens": 646067398.0, "step": 19285 }, { "epoch": 1.1515042979942693, "grad_norm": 0.45660170912742615, "learning_rate": 2.374440398913586e-05, "loss": 0.4487, "mean_token_accuracy": 0.8509364128112793, "num_tokens": 646235078.0, "step": 19290 }, { "epoch": 1.1518027698185291, "grad_norm": 0.5278347134590149, "learning_rate": 2.3733455879805207e-05, "loss": 0.4726, "mean_token_accuracy": 0.8441906213760376, "num_tokens": 646402758.0, "step": 19295 }, { "epoch": 1.152101241642789, "grad_norm": 0.5075232982635498, "learning_rate": 2.372250868788588e-05, "loss": 0.4281, "mean_token_accuracy": 0.8569843769073486, "num_tokens": 646570438.0, "step": 19300 }, { "epoch": 1.1523997134670487, "grad_norm": 0.4600481688976288, "learning_rate": 2.371156241604431e-05, "loss": 0.4298, "mean_token_accuracy": 0.857318389415741, "num_tokens": 646738118.0, "step": 19305 }, { "epoch": 1.1526981852913085, "grad_norm": 0.5240980386734009, "learning_rate": 2.370061706694662e-05, "loss": 0.462, "mean_token_accuracy": 0.8460336327552795, "num_tokens": 646905798.0, "step": 19310 }, { "epoch": 1.1529966571155683, "grad_norm": 0.5291630625724792, "learning_rate": 2.368967264325877e-05, "loss": 0.4288, "mean_token_accuracy": 0.8553501009941101, "num_tokens": 647073478.0, "step": 19315 }, { "epoch": 1.153295128939828, "grad_norm": 0.498884379863739, "learning_rate": 2.367872914764649e-05, "loss": 0.4495, "mean_token_accuracy": 0.8502147197723389, "num_tokens": 647241158.0, "step": 19320 }, { "epoch": 1.153593600764088, "grad_norm": 0.47125518321990967, "learning_rate": 2.366778658277525e-05, "loss": 0.4737, "mean_token_accuracy": 0.844172728061676, "num_tokens": 647408838.0, "step": 19325 }, { "epoch": 1.1538920725883477, "grad_norm": 0.5179007053375244, "learning_rate": 2.365684495131033e-05, "loss": 0.4929, "mean_token_accuracy": 0.8375044584274292, "num_tokens": 647576518.0, "step": 19330 }, { "epoch": 1.1541905444126075, "grad_norm": 0.5301775336265564, "learning_rate": 2.3645904255916756e-05, "loss": 0.4452, "mean_token_accuracy": 0.8518907308578492, "num_tokens": 647744198.0, "step": 19335 }, { "epoch": 1.1544890162368673, "grad_norm": 0.5048215985298157, "learning_rate": 2.3634964499259337e-05, "loss": 0.4425, "mean_token_accuracy": 0.8521770238876343, "num_tokens": 647911878.0, "step": 19340 }, { "epoch": 1.154787488061127, "grad_norm": 0.5531294941902161, "learning_rate": 2.362402568400266e-05, "loss": 0.458, "mean_token_accuracy": 0.8494453072547913, "num_tokens": 648079558.0, "step": 19345 }, { "epoch": 1.1550859598853869, "grad_norm": 0.5473691821098328, "learning_rate": 2.3613087812811065e-05, "loss": 0.4733, "mean_token_accuracy": 0.841691541671753, "num_tokens": 648247238.0, "step": 19350 }, { "epoch": 1.1553844317096467, "grad_norm": 0.4831838607788086, "learning_rate": 2.3602150888348682e-05, "loss": 0.4435, "mean_token_accuracy": 0.8529106616973877, "num_tokens": 648414918.0, "step": 19355 }, { "epoch": 1.1556829035339065, "grad_norm": 0.5275028347969055, "learning_rate": 2.359121491327939e-05, "loss": 0.4662, "mean_token_accuracy": 0.8465465784072876, "num_tokens": 648582598.0, "step": 19360 }, { "epoch": 1.1559813753581663, "grad_norm": 0.47243648767471313, "learning_rate": 2.3580279890266848e-05, "loss": 0.4417, "mean_token_accuracy": 0.8519145846366882, "num_tokens": 648750278.0, "step": 19365 }, { "epoch": 1.156279847182426, "grad_norm": 0.4737529158592224, "learning_rate": 2.3569345821974502e-05, "loss": 0.469, "mean_token_accuracy": 0.8454610586166382, "num_tokens": 648917958.0, "step": 19370 }, { "epoch": 1.1565783190066858, "grad_norm": 0.47167718410491943, "learning_rate": 2.3558412711065525e-05, "loss": 0.4791, "mean_token_accuracy": 0.8408207058906555, "num_tokens": 649085638.0, "step": 19375 }, { "epoch": 1.1568767908309456, "grad_norm": 0.4872191846370697, "learning_rate": 2.3547480560202895e-05, "loss": 0.4546, "mean_token_accuracy": 0.8506620407104493, "num_tokens": 649253318.0, "step": 19380 }, { "epoch": 1.1571752626552054, "grad_norm": 0.49179843068122864, "learning_rate": 2.353654937204932e-05, "loss": 0.4719, "mean_token_accuracy": 0.8441787004470825, "num_tokens": 649420998.0, "step": 19385 }, { "epoch": 1.1574737344794652, "grad_norm": 0.5116528868675232, "learning_rate": 2.3525619149267303e-05, "loss": 0.4881, "mean_token_accuracy": 0.8374865889549256, "num_tokens": 649588678.0, "step": 19390 }, { "epoch": 1.157772206303725, "grad_norm": 0.553030788898468, "learning_rate": 2.3514689894519105e-05, "loss": 0.496, "mean_token_accuracy": 0.8368543386459351, "num_tokens": 649756358.0, "step": 19395 }, { "epoch": 1.1580706781279848, "grad_norm": 0.5112454295158386, "learning_rate": 2.3503761610466742e-05, "loss": 0.4356, "mean_token_accuracy": 0.854849112033844, "num_tokens": 649924038.0, "step": 19400 }, { "epoch": 1.1583691499522446, "grad_norm": 0.5680729746818542, "learning_rate": 2.3492834299772003e-05, "loss": 0.4452, "mean_token_accuracy": 0.8531552076339721, "num_tokens": 650091718.0, "step": 19405 }, { "epoch": 1.1586676217765044, "grad_norm": 0.5384697914123535, "learning_rate": 2.3481907965096427e-05, "loss": 0.4385, "mean_token_accuracy": 0.853483235836029, "num_tokens": 650259398.0, "step": 19410 }, { "epoch": 1.158966093600764, "grad_norm": 0.5490452647209167, "learning_rate": 2.347098260910134e-05, "loss": 0.4511, "mean_token_accuracy": 0.8497077584266662, "num_tokens": 650427078.0, "step": 19415 }, { "epoch": 1.159264565425024, "grad_norm": 0.5354728102684021, "learning_rate": 2.34600582344478e-05, "loss": 0.4774, "mean_token_accuracy": 0.8451926469802856, "num_tokens": 650594758.0, "step": 19420 }, { "epoch": 1.1595630372492836, "grad_norm": 0.571340799331665, "learning_rate": 2.3449134843796643e-05, "loss": 0.4653, "mean_token_accuracy": 0.8448467016220093, "num_tokens": 650762438.0, "step": 19425 }, { "epoch": 1.1598615090735436, "grad_norm": 0.5150979161262512, "learning_rate": 2.3438212439808476e-05, "loss": 0.4673, "mean_token_accuracy": 0.8454371929168701, "num_tokens": 650930118.0, "step": 19430 }, { "epoch": 1.1601599808978031, "grad_norm": 0.5014667510986328, "learning_rate": 2.3427291025143628e-05, "loss": 0.447, "mean_token_accuracy": 0.8523857712745666, "num_tokens": 651097798.0, "step": 19435 }, { "epoch": 1.1604584527220632, "grad_norm": 0.5894905924797058, "learning_rate": 2.341637060246224e-05, "loss": 0.4518, "mean_token_accuracy": 0.8483299493789673, "num_tokens": 651265478.0, "step": 19440 }, { "epoch": 1.1607569245463227, "grad_norm": 0.5215399265289307, "learning_rate": 2.3405451174424152e-05, "loss": 0.4499, "mean_token_accuracy": 0.8494751334190369, "num_tokens": 651433158.0, "step": 19445 }, { "epoch": 1.1610553963705825, "grad_norm": 0.552625834941864, "learning_rate": 2.3394532743689014e-05, "loss": 0.4918, "mean_token_accuracy": 0.8379816293716431, "num_tokens": 651600838.0, "step": 19450 }, { "epoch": 1.1613538681948423, "grad_norm": 0.5501632690429688, "learning_rate": 2.3383615312916206e-05, "loss": 0.439, "mean_token_accuracy": 0.8531134486198425, "num_tokens": 651768518.0, "step": 19455 }, { "epoch": 1.1616523400191021, "grad_norm": 0.5946309566497803, "learning_rate": 2.3372698884764865e-05, "loss": 0.4862, "mean_token_accuracy": 0.8406656384468079, "num_tokens": 651936198.0, "step": 19460 }, { "epoch": 1.161950811843362, "grad_norm": 0.5006893277168274, "learning_rate": 2.3361783461893894e-05, "loss": 0.4093, "mean_token_accuracy": 0.8622271299362183, "num_tokens": 652103878.0, "step": 19465 }, { "epoch": 1.1622492836676217, "grad_norm": 0.49508512020111084, "learning_rate": 2.335086904696194e-05, "loss": 0.5054, "mean_token_accuracy": 0.8347190856933594, "num_tokens": 652271558.0, "step": 19470 }, { "epoch": 1.1625477554918815, "grad_norm": 0.5250930190086365, "learning_rate": 2.3339955642627414e-05, "loss": 0.4809, "mean_token_accuracy": 0.8424788355827332, "num_tokens": 652439238.0, "step": 19475 }, { "epoch": 1.1628462273161413, "grad_norm": 0.5129685401916504, "learning_rate": 2.3329043251548488e-05, "loss": 0.4829, "mean_token_accuracy": 0.8413634657859802, "num_tokens": 652606918.0, "step": 19480 }, { "epoch": 1.163144699140401, "grad_norm": 0.6268259882926941, "learning_rate": 2.3318131876383053e-05, "loss": 0.497, "mean_token_accuracy": 0.8373672842979432, "num_tokens": 652774598.0, "step": 19485 }, { "epoch": 1.1634431709646609, "grad_norm": 0.575529158115387, "learning_rate": 2.3307221519788795e-05, "loss": 0.4404, "mean_token_accuracy": 0.852856981754303, "num_tokens": 652942278.0, "step": 19490 }, { "epoch": 1.1637416427889207, "grad_norm": 0.4819105267524719, "learning_rate": 2.3296312184423113e-05, "loss": 0.4378, "mean_token_accuracy": 0.8558332443237304, "num_tokens": 653109958.0, "step": 19495 }, { "epoch": 1.1640401146131805, "grad_norm": 0.527688205242157, "learning_rate": 2.32854038729432e-05, "loss": 0.4838, "mean_token_accuracy": 0.8400274276733398, "num_tokens": 653277638.0, "step": 19500 }, { "epoch": 1.1643385864374403, "grad_norm": 0.4638756215572357, "learning_rate": 2.3274496588005974e-05, "loss": 0.3933, "mean_token_accuracy": 0.8679529905319214, "num_tokens": 653445318.0, "step": 19505 }, { "epoch": 1.1646370582617, "grad_norm": 0.566583514213562, "learning_rate": 2.3263590332268082e-05, "loss": 0.4832, "mean_token_accuracy": 0.8397829055786132, "num_tokens": 653612998.0, "step": 19510 }, { "epoch": 1.1649355300859598, "grad_norm": 0.5689113140106201, "learning_rate": 2.3252685108385967e-05, "loss": 0.4448, "mean_token_accuracy": 0.8518251180648804, "num_tokens": 653780678.0, "step": 19515 }, { "epoch": 1.1652340019102196, "grad_norm": 0.5511366724967957, "learning_rate": 2.3241780919015793e-05, "loss": 0.4742, "mean_token_accuracy": 0.843671715259552, "num_tokens": 653948358.0, "step": 19520 }, { "epoch": 1.1655324737344794, "grad_norm": 0.5243823528289795, "learning_rate": 2.3230877766813474e-05, "loss": 0.4877, "mean_token_accuracy": 0.8391685366630555, "num_tokens": 654116038.0, "step": 19525 }, { "epoch": 1.1658309455587392, "grad_norm": 0.5512502789497375, "learning_rate": 2.3219975654434666e-05, "loss": 0.4634, "mean_token_accuracy": 0.8468567252159118, "num_tokens": 654283718.0, "step": 19530 }, { "epoch": 1.166129417382999, "grad_norm": 0.499554306268692, "learning_rate": 2.320907458453479e-05, "loss": 0.4884, "mean_token_accuracy": 0.8369438171386718, "num_tokens": 654451398.0, "step": 19535 }, { "epoch": 1.1664278892072588, "grad_norm": 0.46907830238342285, "learning_rate": 2.319817455976901e-05, "loss": 0.4533, "mean_token_accuracy": 0.85076345205307, "num_tokens": 654619078.0, "step": 19540 }, { "epoch": 1.1667263610315186, "grad_norm": 0.5021779537200928, "learning_rate": 2.3187275582792206e-05, "loss": 0.4319, "mean_token_accuracy": 0.8558093786239624, "num_tokens": 654786758.0, "step": 19545 }, { "epoch": 1.1670248328557784, "grad_norm": 0.5622022747993469, "learning_rate": 2.317637765625905e-05, "loss": 0.4756, "mean_token_accuracy": 0.8429380774497985, "num_tokens": 654954438.0, "step": 19550 }, { "epoch": 1.1673233046800382, "grad_norm": 0.5311176180839539, "learning_rate": 2.3165480782823912e-05, "loss": 0.5019, "mean_token_accuracy": 0.8349815130233764, "num_tokens": 655122118.0, "step": 19555 }, { "epoch": 1.167621776504298, "grad_norm": 0.5961161851882935, "learning_rate": 2.315458496514093e-05, "loss": 0.485, "mean_token_accuracy": 0.8389657735824585, "num_tokens": 655289798.0, "step": 19560 }, { "epoch": 1.1679202483285578, "grad_norm": 0.48575928807258606, "learning_rate": 2.3143690205863998e-05, "loss": 0.3969, "mean_token_accuracy": 0.8668615102767945, "num_tokens": 655457478.0, "step": 19565 }, { "epoch": 1.1682187201528176, "grad_norm": 0.541350245475769, "learning_rate": 2.3132796507646716e-05, "loss": 0.4299, "mean_token_accuracy": 0.8556185126304626, "num_tokens": 655625158.0, "step": 19570 }, { "epoch": 1.1685171919770774, "grad_norm": 0.515748143196106, "learning_rate": 2.312190387314247e-05, "loss": 0.4587, "mean_token_accuracy": 0.8487236022949218, "num_tokens": 655792838.0, "step": 19575 }, { "epoch": 1.1688156638013372, "grad_norm": 0.5054634213447571, "learning_rate": 2.3111012305004325e-05, "loss": 0.4397, "mean_token_accuracy": 0.8530717015266418, "num_tokens": 655960518.0, "step": 19580 }, { "epoch": 1.169114135625597, "grad_norm": 0.49361228942871094, "learning_rate": 2.310012180588515e-05, "loss": 0.4531, "mean_token_accuracy": 0.8503817200660706, "num_tokens": 656128198.0, "step": 19585 }, { "epoch": 1.1694126074498568, "grad_norm": 0.5183147192001343, "learning_rate": 2.308923237843753e-05, "loss": 0.4498, "mean_token_accuracy": 0.8505487203598022, "num_tokens": 656295878.0, "step": 19590 }, { "epoch": 1.1697110792741165, "grad_norm": 0.46185898780822754, "learning_rate": 2.3078344025313775e-05, "loss": 0.4447, "mean_token_accuracy": 0.8518907308578492, "num_tokens": 656463558.0, "step": 19595 }, { "epoch": 1.1700095510983763, "grad_norm": 0.5042920112609863, "learning_rate": 2.3067456749165946e-05, "loss": 0.4672, "mean_token_accuracy": 0.8442502617835999, "num_tokens": 656631238.0, "step": 19600 }, { "epoch": 1.1703080229226361, "grad_norm": 0.5162453651428223, "learning_rate": 2.305657055264584e-05, "loss": 0.4553, "mean_token_accuracy": 0.8501014113426208, "num_tokens": 656798918.0, "step": 19605 }, { "epoch": 1.170606494746896, "grad_norm": 0.578216552734375, "learning_rate": 2.304568543840499e-05, "loss": 0.455, "mean_token_accuracy": 0.8495765209197998, "num_tokens": 656966598.0, "step": 19610 }, { "epoch": 1.1709049665711557, "grad_norm": 0.5236524343490601, "learning_rate": 2.3034801409094675e-05, "loss": 0.4887, "mean_token_accuracy": 0.8392257809638977, "num_tokens": 657133177.0, "step": 19615 }, { "epoch": 1.1712034383954155, "grad_norm": 0.5342540144920349, "learning_rate": 2.3023918467365886e-05, "loss": 0.4308, "mean_token_accuracy": 0.8558153510093689, "num_tokens": 657300857.0, "step": 19620 }, { "epoch": 1.1715019102196753, "grad_norm": 0.5280064344406128, "learning_rate": 2.301303661586939e-05, "loss": 0.4572, "mean_token_accuracy": 0.8483418822288513, "num_tokens": 657468537.0, "step": 19625 }, { "epoch": 1.171800382043935, "grad_norm": 0.5262241363525391, "learning_rate": 2.3002155857255624e-05, "loss": 0.4837, "mean_token_accuracy": 0.8410354375839233, "num_tokens": 657636217.0, "step": 19630 }, { "epoch": 1.172098853868195, "grad_norm": 0.4653843343257904, "learning_rate": 2.2991276194174838e-05, "loss": 0.4332, "mean_token_accuracy": 0.8568114042282104, "num_tokens": 657803897.0, "step": 19635 }, { "epoch": 1.1723973256924547, "grad_norm": 0.5430920124053955, "learning_rate": 2.2980397629276944e-05, "loss": 0.4203, "mean_token_accuracy": 0.8596325993537903, "num_tokens": 657971577.0, "step": 19640 }, { "epoch": 1.1726957975167145, "grad_norm": 0.5130705833435059, "learning_rate": 2.2969520165211634e-05, "loss": 0.4327, "mean_token_accuracy": 0.8546701669692993, "num_tokens": 658139257.0, "step": 19645 }, { "epoch": 1.1729942693409743, "grad_norm": 0.547199547290802, "learning_rate": 2.2958643804628315e-05, "loss": 0.4563, "mean_token_accuracy": 0.8479959487915039, "num_tokens": 658306937.0, "step": 19650 }, { "epoch": 1.173292741165234, "grad_norm": 0.49585190415382385, "learning_rate": 2.2947768550176112e-05, "loss": 0.4392, "mean_token_accuracy": 0.8531909823417664, "num_tokens": 658474617.0, "step": 19655 }, { "epoch": 1.1735912129894939, "grad_norm": 0.49096888303756714, "learning_rate": 2.2936894404503912e-05, "loss": 0.4196, "mean_token_accuracy": 0.8608910799026489, "num_tokens": 658642297.0, "step": 19660 }, { "epoch": 1.1738896848137537, "grad_norm": 0.505174994468689, "learning_rate": 2.2926021370260305e-05, "loss": 0.4436, "mean_token_accuracy": 0.8531253814697266, "num_tokens": 658809977.0, "step": 19665 }, { "epoch": 1.1741881566380135, "grad_norm": 0.4876374304294586, "learning_rate": 2.291514945009361e-05, "loss": 0.4292, "mean_token_accuracy": 0.8567875385284424, "num_tokens": 658977657.0, "step": 19670 }, { "epoch": 1.1744866284622733, "grad_norm": 0.5102152228355408, "learning_rate": 2.2904278646651905e-05, "loss": 0.4538, "mean_token_accuracy": 0.8490337610244751, "num_tokens": 659145337.0, "step": 19675 }, { "epoch": 1.174785100286533, "grad_norm": 0.4950414001941681, "learning_rate": 2.2893408962582958e-05, "loss": 0.4423, "mean_token_accuracy": 0.8534116744995117, "num_tokens": 659313017.0, "step": 19680 }, { "epoch": 1.1750835721107928, "grad_norm": 0.4529719054698944, "learning_rate": 2.2882540400534287e-05, "loss": 0.4342, "mean_token_accuracy": 0.8553143262863159, "num_tokens": 659480697.0, "step": 19685 }, { "epoch": 1.1753820439350524, "grad_norm": 0.5090801119804382, "learning_rate": 2.2871672963153124e-05, "loss": 0.4435, "mean_token_accuracy": 0.8520517706871032, "num_tokens": 659648377.0, "step": 19690 }, { "epoch": 1.1756805157593124, "grad_norm": 0.4853980243206024, "learning_rate": 2.2860806653086437e-05, "loss": 0.4248, "mean_token_accuracy": 0.857819402217865, "num_tokens": 659816057.0, "step": 19695 }, { "epoch": 1.175978987583572, "grad_norm": 0.585038959980011, "learning_rate": 2.2849941472980936e-05, "loss": 0.4683, "mean_token_accuracy": 0.8449839115142822, "num_tokens": 659983737.0, "step": 19700 }, { "epoch": 1.176277459407832, "grad_norm": 0.48393943905830383, "learning_rate": 2.2839077425483003e-05, "loss": 0.4538, "mean_token_accuracy": 0.8494393467903137, "num_tokens": 660151417.0, "step": 19705 }, { "epoch": 1.1765759312320916, "grad_norm": 0.49411025643348694, "learning_rate": 2.2828214513238806e-05, "loss": 0.4392, "mean_token_accuracy": 0.8544196486473083, "num_tokens": 660319097.0, "step": 19710 }, { "epoch": 1.1768744030563516, "grad_norm": 0.5259071588516235, "learning_rate": 2.2817352738894187e-05, "loss": 0.4819, "mean_token_accuracy": 0.8411249041557312, "num_tokens": 660486777.0, "step": 19715 }, { "epoch": 1.1771728748806112, "grad_norm": 0.515748143196106, "learning_rate": 2.280649210509474e-05, "loss": 0.4365, "mean_token_accuracy": 0.8544196486473083, "num_tokens": 660654457.0, "step": 19720 }, { "epoch": 1.177471346704871, "grad_norm": 0.5063372254371643, "learning_rate": 2.2795632614485777e-05, "loss": 0.4715, "mean_token_accuracy": 0.842180609703064, "num_tokens": 660822137.0, "step": 19725 }, { "epoch": 1.1777698185291308, "grad_norm": 0.5847283005714417, "learning_rate": 2.2784774269712324e-05, "loss": 0.4647, "mean_token_accuracy": 0.845765233039856, "num_tokens": 660989817.0, "step": 19730 }, { "epoch": 1.1780682903533906, "grad_norm": 0.572058379650116, "learning_rate": 2.277391707341913e-05, "loss": 0.5071, "mean_token_accuracy": 0.8348562598228455, "num_tokens": 661157497.0, "step": 19735 }, { "epoch": 1.1783667621776504, "grad_norm": 0.5968338847160339, "learning_rate": 2.2763061028250665e-05, "loss": 0.4733, "mean_token_accuracy": 0.8430752754211426, "num_tokens": 661325177.0, "step": 19740 }, { "epoch": 1.1786652340019101, "grad_norm": 0.5082035064697266, "learning_rate": 2.2752206136851124e-05, "loss": 0.5075, "mean_token_accuracy": 0.8324883580207825, "num_tokens": 661492857.0, "step": 19745 }, { "epoch": 1.17896370582617, "grad_norm": 0.5289940237998962, "learning_rate": 2.2741352401864408e-05, "loss": 0.4387, "mean_token_accuracy": 0.8542049288749695, "num_tokens": 661660537.0, "step": 19750 }, { "epoch": 1.1792621776504297, "grad_norm": 0.47402361035346985, "learning_rate": 2.273049982593415e-05, "loss": 0.4321, "mean_token_accuracy": 0.8553620457649231, "num_tokens": 661828217.0, "step": 19755 }, { "epoch": 1.1795606494746895, "grad_norm": 0.5124250650405884, "learning_rate": 2.2719648411703698e-05, "loss": 0.4506, "mean_token_accuracy": 0.8496361613273621, "num_tokens": 661995897.0, "step": 19760 }, { "epoch": 1.1798591212989493, "grad_norm": 0.5215466022491455, "learning_rate": 2.2708798161816103e-05, "loss": 0.4312, "mean_token_accuracy": 0.8568650841712951, "num_tokens": 662163577.0, "step": 19765 }, { "epoch": 1.1801575931232091, "grad_norm": 0.518336296081543, "learning_rate": 2.2697949078914166e-05, "loss": 0.4436, "mean_token_accuracy": 0.8518847703933716, "num_tokens": 662331257.0, "step": 19770 }, { "epoch": 1.180456064947469, "grad_norm": 0.8242446184158325, "learning_rate": 2.2687101165640347e-05, "loss": 0.4625, "mean_token_accuracy": 0.8481629490852356, "num_tokens": 662498937.0, "step": 19775 }, { "epoch": 1.1807545367717287, "grad_norm": 0.521611213684082, "learning_rate": 2.2676254424636884e-05, "loss": 0.4297, "mean_token_accuracy": 0.8576404571533203, "num_tokens": 662666617.0, "step": 19780 }, { "epoch": 1.1810530085959885, "grad_norm": 0.5193411707878113, "learning_rate": 2.2665408858545694e-05, "loss": 0.441, "mean_token_accuracy": 0.8525885820388794, "num_tokens": 662834297.0, "step": 19785 }, { "epoch": 1.1813514804202483, "grad_norm": 0.5576213002204895, "learning_rate": 2.265456447000841e-05, "loss": 0.4523, "mean_token_accuracy": 0.8498926401138306, "num_tokens": 663001977.0, "step": 19790 }, { "epoch": 1.181649952244508, "grad_norm": 0.47739002108573914, "learning_rate": 2.264372126166639e-05, "loss": 0.4654, "mean_token_accuracy": 0.8471072316169739, "num_tokens": 663169657.0, "step": 19795 }, { "epoch": 1.1819484240687679, "grad_norm": 0.4830189645290375, "learning_rate": 2.263287923616069e-05, "loss": 0.4629, "mean_token_accuracy": 0.8477036833763123, "num_tokens": 663337337.0, "step": 19800 }, { "epoch": 1.1822468958930277, "grad_norm": 0.5114553570747375, "learning_rate": 2.262203839613209e-05, "loss": 0.4389, "mean_token_accuracy": 0.8536621570587158, "num_tokens": 663505017.0, "step": 19805 }, { "epoch": 1.1825453677172875, "grad_norm": 0.46853530406951904, "learning_rate": 2.261119874422108e-05, "loss": 0.4766, "mean_token_accuracy": 0.8436239957809448, "num_tokens": 663672697.0, "step": 19810 }, { "epoch": 1.1828438395415473, "grad_norm": 0.5454878807067871, "learning_rate": 2.260036028306784e-05, "loss": 0.4783, "mean_token_accuracy": 0.841315758228302, "num_tokens": 663840377.0, "step": 19815 }, { "epoch": 1.183142311365807, "grad_norm": 0.5502786636352539, "learning_rate": 2.2589523015312303e-05, "loss": 0.4755, "mean_token_accuracy": 0.8417929172515869, "num_tokens": 664008057.0, "step": 19820 }, { "epoch": 1.1834407831900668, "grad_norm": 0.5025622844696045, "learning_rate": 2.257868694359407e-05, "loss": 0.4694, "mean_token_accuracy": 0.8456996321678162, "num_tokens": 664175737.0, "step": 19825 }, { "epoch": 1.1837392550143266, "grad_norm": 0.4925752580165863, "learning_rate": 2.2567852070552457e-05, "loss": 0.455, "mean_token_accuracy": 0.8487653613090516, "num_tokens": 664343417.0, "step": 19830 }, { "epoch": 1.1840377268385864, "grad_norm": 0.5200989246368408, "learning_rate": 2.2557018398826517e-05, "loss": 0.4664, "mean_token_accuracy": 0.8442025542259216, "num_tokens": 664511097.0, "step": 19835 }, { "epoch": 1.1843361986628462, "grad_norm": 0.4849730134010315, "learning_rate": 2.2546185931054987e-05, "loss": 0.423, "mean_token_accuracy": 0.8579267621040344, "num_tokens": 664678777.0, "step": 19840 }, { "epoch": 1.184634670487106, "grad_norm": 0.45852211117744446, "learning_rate": 2.2535354669876306e-05, "loss": 0.4164, "mean_token_accuracy": 0.8617380380630493, "num_tokens": 664846457.0, "step": 19845 }, { "epoch": 1.1849331423113658, "grad_norm": 0.5718573927879333, "learning_rate": 2.2524524617928638e-05, "loss": 0.4885, "mean_token_accuracy": 0.8376297235488892, "num_tokens": 665014137.0, "step": 19850 }, { "epoch": 1.1852316141356256, "grad_norm": 0.5277978777885437, "learning_rate": 2.2513695777849834e-05, "loss": 0.4728, "mean_token_accuracy": 0.8458845257759094, "num_tokens": 665181817.0, "step": 19855 }, { "epoch": 1.1855300859598854, "grad_norm": 0.4968494772911072, "learning_rate": 2.2502868152277456e-05, "loss": 0.4092, "mean_token_accuracy": 0.8622211813926697, "num_tokens": 665349497.0, "step": 19860 }, { "epoch": 1.1858285577841452, "grad_norm": 0.5447377562522888, "learning_rate": 2.2492041743848776e-05, "loss": 0.4588, "mean_token_accuracy": 0.8484253764152527, "num_tokens": 665517177.0, "step": 19865 }, { "epoch": 1.186127029608405, "grad_norm": 0.5029953122138977, "learning_rate": 2.2481216555200774e-05, "loss": 0.4831, "mean_token_accuracy": 0.8423774242401123, "num_tokens": 665684857.0, "step": 19870 }, { "epoch": 1.1864255014326648, "grad_norm": 0.5610088109970093, "learning_rate": 2.247039258897011e-05, "loss": 0.4364, "mean_token_accuracy": 0.853972327709198, "num_tokens": 665852537.0, "step": 19875 }, { "epoch": 1.1867239732569246, "grad_norm": 0.5005916953086853, "learning_rate": 2.2459569847793166e-05, "loss": 0.4308, "mean_token_accuracy": 0.8550638198852539, "num_tokens": 666020217.0, "step": 19880 }, { "epoch": 1.1870224450811844, "grad_norm": 0.5791324973106384, "learning_rate": 2.244874833430602e-05, "loss": 0.4541, "mean_token_accuracy": 0.8501073598861695, "num_tokens": 666187897.0, "step": 19885 }, { "epoch": 1.1873209169054442, "grad_norm": 0.4910011291503906, "learning_rate": 2.2437928051144446e-05, "loss": 0.4091, "mean_token_accuracy": 0.8643385529518127, "num_tokens": 666355577.0, "step": 19890 }, { "epoch": 1.187619388729704, "grad_norm": 0.49433910846710205, "learning_rate": 2.2427109000943946e-05, "loss": 0.4485, "mean_token_accuracy": 0.8510557055473328, "num_tokens": 666523257.0, "step": 19895 }, { "epoch": 1.1879178605539638, "grad_norm": 0.500910758972168, "learning_rate": 2.2416291186339672e-05, "loss": 0.4388, "mean_token_accuracy": 0.8530776619911193, "num_tokens": 666690937.0, "step": 19900 }, { "epoch": 1.1882163323782235, "grad_norm": 0.48798131942749023, "learning_rate": 2.2405474609966524e-05, "loss": 0.4265, "mean_token_accuracy": 0.8563342452049255, "num_tokens": 666858617.0, "step": 19905 }, { "epoch": 1.1885148042024833, "grad_norm": 0.46963825821876526, "learning_rate": 2.2394659274459057e-05, "loss": 0.437, "mean_token_accuracy": 0.8557378053665161, "num_tokens": 667026297.0, "step": 19910 }, { "epoch": 1.1888132760267431, "grad_norm": 0.5557814240455627, "learning_rate": 2.238384518245156e-05, "loss": 0.4292, "mean_token_accuracy": 0.8553560853004456, "num_tokens": 667193977.0, "step": 19915 }, { "epoch": 1.189111747851003, "grad_norm": 0.4949130415916443, "learning_rate": 2.237303233657801e-05, "loss": 0.4284, "mean_token_accuracy": 0.8582070946693421, "num_tokens": 667361657.0, "step": 19920 }, { "epoch": 1.1894102196752627, "grad_norm": 0.494113028049469, "learning_rate": 2.236222073947206e-05, "loss": 0.4411, "mean_token_accuracy": 0.8533460617065429, "num_tokens": 667529337.0, "step": 19925 }, { "epoch": 1.1897086914995225, "grad_norm": 0.47995102405548096, "learning_rate": 2.2351410393767092e-05, "loss": 0.4214, "mean_token_accuracy": 0.8601097345352173, "num_tokens": 667697017.0, "step": 19930 }, { "epoch": 1.1900071633237823, "grad_norm": 0.5150399804115295, "learning_rate": 2.2340601302096147e-05, "loss": 0.4693, "mean_token_accuracy": 0.8455803394317627, "num_tokens": 667864697.0, "step": 19935 }, { "epoch": 1.190305635148042, "grad_norm": 0.5533705949783325, "learning_rate": 2.2329793467091987e-05, "loss": 0.4451, "mean_token_accuracy": 0.8515030384063721, "num_tokens": 668032377.0, "step": 19940 }, { "epoch": 1.190604106972302, "grad_norm": 0.5789175033569336, "learning_rate": 2.2318986891387054e-05, "loss": 0.4662, "mean_token_accuracy": 0.8456566810607911, "num_tokens": 668192213.0, "step": 19945 }, { "epoch": 1.1909025787965617, "grad_norm": 0.5290037989616394, "learning_rate": 2.23081815776135e-05, "loss": 0.4766, "mean_token_accuracy": 0.8415423989295959, "num_tokens": 668359893.0, "step": 19950 }, { "epoch": 1.1912010506208215, "grad_norm": 0.500445544719696, "learning_rate": 2.229737752840316e-05, "loss": 0.4528, "mean_token_accuracy": 0.8498270153999329, "num_tokens": 668527573.0, "step": 19955 }, { "epoch": 1.1914995224450813, "grad_norm": 0.5138988494873047, "learning_rate": 2.2286574746387534e-05, "loss": 0.4824, "mean_token_accuracy": 0.8410235047340393, "num_tokens": 668695253.0, "step": 19960 }, { "epoch": 1.1917979942693409, "grad_norm": 0.5639352202415466, "learning_rate": 2.2275773234197878e-05, "loss": 0.4822, "mean_token_accuracy": 0.8398067593574524, "num_tokens": 668862933.0, "step": 19965 }, { "epoch": 1.1920964660936009, "grad_norm": 0.5014837384223938, "learning_rate": 2.226497299446506e-05, "loss": 0.4456, "mean_token_accuracy": 0.8509483456611633, "num_tokens": 669030613.0, "step": 19970 }, { "epoch": 1.1923949379178604, "grad_norm": 0.5052182674407959, "learning_rate": 2.2254174029819703e-05, "loss": 0.4733, "mean_token_accuracy": 0.8446021676063538, "num_tokens": 669198293.0, "step": 19975 }, { "epoch": 1.1926934097421205, "grad_norm": 0.5078866481781006, "learning_rate": 2.22433763428921e-05, "loss": 0.4802, "mean_token_accuracy": 0.8433019280433655, "num_tokens": 669365973.0, "step": 19980 }, { "epoch": 1.19299188156638, "grad_norm": 0.5302884578704834, "learning_rate": 2.22325799363122e-05, "loss": 0.4328, "mean_token_accuracy": 0.8557437539100647, "num_tokens": 669533653.0, "step": 19985 }, { "epoch": 1.19329035339064, "grad_norm": 0.5151789784431458, "learning_rate": 2.222178481270969e-05, "loss": 0.467, "mean_token_accuracy": 0.8452582597732544, "num_tokens": 669701333.0, "step": 19990 }, { "epoch": 1.1935888252148996, "grad_norm": 0.5370619893074036, "learning_rate": 2.2210990974713908e-05, "loss": 0.4785, "mean_token_accuracy": 0.8420016765594482, "num_tokens": 669869013.0, "step": 19995 }, { "epoch": 1.1938872970391594, "grad_norm": 0.4923109710216522, "learning_rate": 2.2200198424953894e-05, "loss": 0.4457, "mean_token_accuracy": 0.8509841322898865, "num_tokens": 670036693.0, "step": 20000 }, { "epoch": 1.1941857688634192, "grad_norm": 0.47648292779922485, "learning_rate": 2.218940716605839e-05, "loss": 0.4245, "mean_token_accuracy": 0.8580043077468872, "num_tokens": 670204373.0, "step": 20005 }, { "epoch": 1.194484240687679, "grad_norm": 0.4782179892063141, "learning_rate": 2.217861720065578e-05, "loss": 0.465, "mean_token_accuracy": 0.8462722182273865, "num_tokens": 670372053.0, "step": 20010 }, { "epoch": 1.1947827125119388, "grad_norm": 0.5209377408027649, "learning_rate": 2.2167828531374184e-05, "loss": 0.4291, "mean_token_accuracy": 0.8558869123458862, "num_tokens": 670539733.0, "step": 20015 }, { "epoch": 1.1950811843361986, "grad_norm": 0.5709136128425598, "learning_rate": 2.2157041160841362e-05, "loss": 0.4818, "mean_token_accuracy": 0.8399618268013, "num_tokens": 670707413.0, "step": 20020 }, { "epoch": 1.1953796561604584, "grad_norm": 0.5084706544876099, "learning_rate": 2.2146255091684786e-05, "loss": 0.4926, "mean_token_accuracy": 0.8385482549667358, "num_tokens": 670875093.0, "step": 20025 }, { "epoch": 1.1956781279847182, "grad_norm": 0.4596470296382904, "learning_rate": 2.213547032653161e-05, "loss": 0.4384, "mean_token_accuracy": 0.8529046893119812, "num_tokens": 671042773.0, "step": 20030 }, { "epoch": 1.195976599808978, "grad_norm": 0.49420779943466187, "learning_rate": 2.2124686868008638e-05, "loss": 0.4143, "mean_token_accuracy": 0.8603900790214538, "num_tokens": 671210453.0, "step": 20035 }, { "epoch": 1.1962750716332378, "grad_norm": 0.5122491121292114, "learning_rate": 2.211390471874241e-05, "loss": 0.4536, "mean_token_accuracy": 0.8490934014320374, "num_tokens": 671378133.0, "step": 20040 }, { "epoch": 1.1965735434574976, "grad_norm": 0.4651281237602234, "learning_rate": 2.2103123881359094e-05, "loss": 0.4694, "mean_token_accuracy": 0.8450912594795227, "num_tokens": 671545813.0, "step": 20045 }, { "epoch": 1.1968720152817574, "grad_norm": 0.5637299418449402, "learning_rate": 2.2092344358484586e-05, "loss": 0.4944, "mean_token_accuracy": 0.8379220008850098, "num_tokens": 671713493.0, "step": 20050 }, { "epoch": 1.1971704871060171, "grad_norm": 0.5713850259780884, "learning_rate": 2.2081566152744404e-05, "loss": 0.5167, "mean_token_accuracy": 0.8296731472015381, "num_tokens": 671881173.0, "step": 20055 }, { "epoch": 1.197468958930277, "grad_norm": 0.5393882989883423, "learning_rate": 2.207078926676381e-05, "loss": 0.5106, "mean_token_accuracy": 0.8318620920181274, "num_tokens": 672048853.0, "step": 20060 }, { "epoch": 1.1977674307545367, "grad_norm": 0.4390849173069, "learning_rate": 2.20600137031677e-05, "loss": 0.4228, "mean_token_accuracy": 0.8580997228622437, "num_tokens": 672216533.0, "step": 20065 }, { "epoch": 1.1980659025787965, "grad_norm": 0.49061521887779236, "learning_rate": 2.2049239464580664e-05, "loss": 0.4336, "mean_token_accuracy": 0.8548789143562316, "num_tokens": 672384213.0, "step": 20070 }, { "epoch": 1.1983643744030563, "grad_norm": 0.525015115737915, "learning_rate": 2.2038466553626965e-05, "loss": 0.4541, "mean_token_accuracy": 0.848759388923645, "num_tokens": 672551893.0, "step": 20075 }, { "epoch": 1.1986628462273161, "grad_norm": 0.5270291566848755, "learning_rate": 2.202769497293054e-05, "loss": 0.4355, "mean_token_accuracy": 0.854109525680542, "num_tokens": 672719573.0, "step": 20080 }, { "epoch": 1.198961318051576, "grad_norm": 0.5062497854232788, "learning_rate": 2.201692472511501e-05, "loss": 0.4594, "mean_token_accuracy": 0.8487057089805603, "num_tokens": 672887253.0, "step": 20085 }, { "epoch": 1.1992597898758357, "grad_norm": 0.6046461462974548, "learning_rate": 2.2006155812803685e-05, "loss": 0.4172, "mean_token_accuracy": 0.8603602647781372, "num_tokens": 673054933.0, "step": 20090 }, { "epoch": 1.1995582617000955, "grad_norm": 0.49185025691986084, "learning_rate": 2.19953882386195e-05, "loss": 0.4564, "mean_token_accuracy": 0.8482404947280884, "num_tokens": 673222613.0, "step": 20095 }, { "epoch": 1.1998567335243553, "grad_norm": 0.576162576675415, "learning_rate": 2.198462200518513e-05, "loss": 0.4483, "mean_token_accuracy": 0.8519980907440186, "num_tokens": 673390293.0, "step": 20100 }, { "epoch": 1.200155205348615, "grad_norm": 0.4832688570022583, "learning_rate": 2.1973857115122852e-05, "loss": 0.4387, "mean_token_accuracy": 0.8535011410713196, "num_tokens": 673557973.0, "step": 20105 }, { "epoch": 1.2004536771728749, "grad_norm": 0.5112908482551575, "learning_rate": 2.196309357105469e-05, "loss": 0.4356, "mean_token_accuracy": 0.8550996065139771, "num_tokens": 673725653.0, "step": 20110 }, { "epoch": 1.2007521489971347, "grad_norm": 0.5340437293052673, "learning_rate": 2.1952331375602288e-05, "loss": 0.4698, "mean_token_accuracy": 0.844399380683899, "num_tokens": 673893333.0, "step": 20115 }, { "epoch": 1.2010506208213945, "grad_norm": 0.4853335916996002, "learning_rate": 2.1941570531386968e-05, "loss": 0.4379, "mean_token_accuracy": 0.8539126873016357, "num_tokens": 674061013.0, "step": 20120 }, { "epoch": 1.2013490926456543, "grad_norm": 0.5637503266334534, "learning_rate": 2.1930811041029754e-05, "loss": 0.4887, "mean_token_accuracy": 0.8378205895423889, "num_tokens": 674228693.0, "step": 20125 }, { "epoch": 1.201647564469914, "grad_norm": 0.500538170337677, "learning_rate": 2.1920052907151296e-05, "loss": 0.4452, "mean_token_accuracy": 0.852493143081665, "num_tokens": 674396373.0, "step": 20130 }, { "epoch": 1.2019460362941738, "grad_norm": 0.48072803020477295, "learning_rate": 2.1909296132371947e-05, "loss": 0.4306, "mean_token_accuracy": 0.8552845120429993, "num_tokens": 674564053.0, "step": 20135 }, { "epoch": 1.2022445081184336, "grad_norm": 0.5291653275489807, "learning_rate": 2.1898540719311722e-05, "loss": 0.4813, "mean_token_accuracy": 0.840450918674469, "num_tokens": 674731733.0, "step": 20140 }, { "epoch": 1.2025429799426934, "grad_norm": 0.5230559706687927, "learning_rate": 2.1887786670590295e-05, "loss": 0.4519, "mean_token_accuracy": 0.8490635871887207, "num_tokens": 674899413.0, "step": 20145 }, { "epoch": 1.2028414517669532, "grad_norm": 0.5714749097824097, "learning_rate": 2.1877033988827016e-05, "loss": 0.4559, "mean_token_accuracy": 0.8475366830825806, "num_tokens": 675067093.0, "step": 20150 }, { "epoch": 1.203139923591213, "grad_norm": 0.566028356552124, "learning_rate": 2.186628267664089e-05, "loss": 0.4773, "mean_token_accuracy": 0.842067277431488, "num_tokens": 675234773.0, "step": 20155 }, { "epoch": 1.2034383954154728, "grad_norm": 0.5234898924827576, "learning_rate": 2.1855532736650617e-05, "loss": 0.4304, "mean_token_accuracy": 0.8562745928764344, "num_tokens": 675402453.0, "step": 20160 }, { "epoch": 1.2037368672397326, "grad_norm": 0.5434269905090332, "learning_rate": 2.1844784171474526e-05, "loss": 0.4243, "mean_token_accuracy": 0.8572945237159729, "num_tokens": 675570133.0, "step": 20165 }, { "epoch": 1.2040353390639924, "grad_norm": 0.5061639547348022, "learning_rate": 2.183403698373062e-05, "loss": 0.4515, "mean_token_accuracy": 0.8479184031486511, "num_tokens": 675737813.0, "step": 20170 }, { "epoch": 1.2043338108882522, "grad_norm": 0.5039093494415283, "learning_rate": 2.1823291176036608e-05, "loss": 0.4717, "mean_token_accuracy": 0.8431229829788208, "num_tokens": 675905493.0, "step": 20175 }, { "epoch": 1.204632282712512, "grad_norm": 0.4952000081539154, "learning_rate": 2.1812546751009793e-05, "loss": 0.4272, "mean_token_accuracy": 0.8578253746032715, "num_tokens": 676073173.0, "step": 20180 }, { "epoch": 1.2049307545367718, "grad_norm": 0.5139133930206299, "learning_rate": 2.1801803711267203e-05, "loss": 0.4293, "mean_token_accuracy": 0.8558630585670471, "num_tokens": 676240853.0, "step": 20185 }, { "epoch": 1.2052292263610316, "grad_norm": 0.4910508692264557, "learning_rate": 2.17910620594255e-05, "loss": 0.4216, "mean_token_accuracy": 0.8602051734924316, "num_tokens": 676408533.0, "step": 20190 }, { "epoch": 1.2055276981852914, "grad_norm": 0.4882889986038208, "learning_rate": 2.1780321798101e-05, "loss": 0.4192, "mean_token_accuracy": 0.8586782813072205, "num_tokens": 676576213.0, "step": 20195 }, { "epoch": 1.2058261700095512, "grad_norm": 0.47544705867767334, "learning_rate": 2.1769582929909703e-05, "loss": 0.4643, "mean_token_accuracy": 0.8458606719970703, "num_tokens": 676743893.0, "step": 20200 }, { "epoch": 1.206124641833811, "grad_norm": 0.5053207874298096, "learning_rate": 2.1758845457467254e-05, "loss": 0.4427, "mean_token_accuracy": 0.8504055857658386, "num_tokens": 676911573.0, "step": 20205 }, { "epoch": 1.2064231136580708, "grad_norm": 0.5221449136734009, "learning_rate": 2.174810938338897e-05, "loss": 0.4339, "mean_token_accuracy": 0.8546045422554016, "num_tokens": 677079253.0, "step": 20210 }, { "epoch": 1.2067215854823305, "grad_norm": 0.4730874300003052, "learning_rate": 2.1737374710289803e-05, "loss": 0.4402, "mean_token_accuracy": 0.8543063282966614, "num_tokens": 677246933.0, "step": 20215 }, { "epoch": 1.2070200573065903, "grad_norm": 0.531874418258667, "learning_rate": 2.172664144078439e-05, "loss": 0.4346, "mean_token_accuracy": 0.8557795524597168, "num_tokens": 677414613.0, "step": 20220 }, { "epoch": 1.2073185291308501, "grad_norm": 0.4817505478858948, "learning_rate": 2.171590957748703e-05, "loss": 0.4351, "mean_token_accuracy": 0.855964457988739, "num_tokens": 677582293.0, "step": 20225 }, { "epoch": 1.20761700095511, "grad_norm": 0.5840501189231873, "learning_rate": 2.1705179123011643e-05, "loss": 0.5032, "mean_token_accuracy": 0.8334307670593262, "num_tokens": 677749973.0, "step": 20230 }, { "epoch": 1.2079154727793697, "grad_norm": 0.6456384658813477, "learning_rate": 2.169445007997185e-05, "loss": 0.4287, "mean_token_accuracy": 0.8555767655372619, "num_tokens": 677917653.0, "step": 20235 }, { "epoch": 1.2082139446036293, "grad_norm": 0.5162643790245056, "learning_rate": 2.168372245098089e-05, "loss": 0.4848, "mean_token_accuracy": 0.8411666512489319, "num_tokens": 678085333.0, "step": 20240 }, { "epoch": 1.2085124164278893, "grad_norm": 0.5398520231246948, "learning_rate": 2.1672996238651682e-05, "loss": 0.4724, "mean_token_accuracy": 0.8448705792427063, "num_tokens": 678253013.0, "step": 20245 }, { "epoch": 1.2088108882521489, "grad_norm": 0.5212766528129578, "learning_rate": 2.1662271445596805e-05, "loss": 0.433, "mean_token_accuracy": 0.8571752548217774, "num_tokens": 678420693.0, "step": 20250 }, { "epoch": 1.209109360076409, "grad_norm": 0.535269558429718, "learning_rate": 2.1651548074428462e-05, "loss": 0.4696, "mean_token_accuracy": 0.8452463388442993, "num_tokens": 678588373.0, "step": 20255 }, { "epoch": 1.2094078319006685, "grad_norm": 0.49153932929039, "learning_rate": 2.1640826127758544e-05, "loss": 0.4459, "mean_token_accuracy": 0.8492484807968139, "num_tokens": 678756053.0, "step": 20260 }, { "epoch": 1.2097063037249285, "grad_norm": 0.5382280349731445, "learning_rate": 2.1630105608198566e-05, "loss": 0.483, "mean_token_accuracy": 0.8415304780006408, "num_tokens": 678923733.0, "step": 20265 }, { "epoch": 1.210004775549188, "grad_norm": 0.5323084592819214, "learning_rate": 2.1619386518359708e-05, "loss": 0.4817, "mean_token_accuracy": 0.8401944398880005, "num_tokens": 679091413.0, "step": 20270 }, { "epoch": 1.2103032473734479, "grad_norm": 0.46458926796913147, "learning_rate": 2.160866886085281e-05, "loss": 0.4369, "mean_token_accuracy": 0.8556722044944763, "num_tokens": 679259093.0, "step": 20275 }, { "epoch": 1.2106017191977076, "grad_norm": 0.5003271698951721, "learning_rate": 2.159795263828835e-05, "loss": 0.4318, "mean_token_accuracy": 0.8547119259834289, "num_tokens": 679426773.0, "step": 20280 }, { "epoch": 1.2109001910219674, "grad_norm": 0.5180540680885315, "learning_rate": 2.158723785327647e-05, "loss": 0.4532, "mean_token_accuracy": 0.8484253883361816, "num_tokens": 679594453.0, "step": 20285 }, { "epoch": 1.2111986628462272, "grad_norm": 0.5006572604179382, "learning_rate": 2.1576524508426944e-05, "loss": 0.4671, "mean_token_accuracy": 0.8442502737045288, "num_tokens": 679762133.0, "step": 20290 }, { "epoch": 1.211497134670487, "grad_norm": 0.4925740361213684, "learning_rate": 2.1565812606349212e-05, "loss": 0.4522, "mean_token_accuracy": 0.8501490950584412, "num_tokens": 679929813.0, "step": 20295 }, { "epoch": 1.2117956064947468, "grad_norm": 0.5071954131126404, "learning_rate": 2.1555102149652346e-05, "loss": 0.4499, "mean_token_accuracy": 0.8497733473777771, "num_tokens": 680097493.0, "step": 20300 }, { "epoch": 1.2120940783190066, "grad_norm": 0.5857788920402527, "learning_rate": 2.154439314094508e-05, "loss": 0.4644, "mean_token_accuracy": 0.8469462037086487, "num_tokens": 680265173.0, "step": 20305 }, { "epoch": 1.2123925501432664, "grad_norm": 0.5620033144950867, "learning_rate": 2.15336855828358e-05, "loss": 0.4646, "mean_token_accuracy": 0.8460574984550476, "num_tokens": 680432853.0, "step": 20310 }, { "epoch": 1.2126910219675262, "grad_norm": 0.4749830961227417, "learning_rate": 2.152297947793252e-05, "loss": 0.4384, "mean_token_accuracy": 0.8538947939872742, "num_tokens": 680600533.0, "step": 20315 }, { "epoch": 1.212989493791786, "grad_norm": 0.4631313383579254, "learning_rate": 2.1512274828842914e-05, "loss": 0.4375, "mean_token_accuracy": 0.8555290341377259, "num_tokens": 680768213.0, "step": 20320 }, { "epoch": 1.2132879656160458, "grad_norm": 0.5602476000785828, "learning_rate": 2.150157163817429e-05, "loss": 0.454, "mean_token_accuracy": 0.8492067337036133, "num_tokens": 680935893.0, "step": 20325 }, { "epoch": 1.2135864374403056, "grad_norm": 0.5524552464485168, "learning_rate": 2.149086990853361e-05, "loss": 0.4615, "mean_token_accuracy": 0.8464928984642028, "num_tokens": 681103573.0, "step": 20330 }, { "epoch": 1.2138849092645654, "grad_norm": 0.5774242281913757, "learning_rate": 2.1480169642527488e-05, "loss": 0.4565, "mean_token_accuracy": 0.8492007613182068, "num_tokens": 681271253.0, "step": 20335 }, { "epoch": 1.2141833810888252, "grad_norm": 0.5189878940582275, "learning_rate": 2.1469470842762153e-05, "loss": 0.4459, "mean_token_accuracy": 0.8516998648643493, "num_tokens": 681438933.0, "step": 20340 }, { "epoch": 1.214481852913085, "grad_norm": 0.48288533091545105, "learning_rate": 2.1458773511843516e-05, "loss": 0.4893, "mean_token_accuracy": 0.8384528279304504, "num_tokens": 681606613.0, "step": 20345 }, { "epoch": 1.2147803247373448, "grad_norm": 0.4914621412754059, "learning_rate": 2.144807765237709e-05, "loss": 0.4533, "mean_token_accuracy": 0.848133134841919, "num_tokens": 681774293.0, "step": 20350 }, { "epoch": 1.2150787965616046, "grad_norm": 0.5602121949195862, "learning_rate": 2.1437383266968054e-05, "loss": 0.4552, "mean_token_accuracy": 0.8499940514564515, "num_tokens": 681941973.0, "step": 20355 }, { "epoch": 1.2153772683858644, "grad_norm": 0.5511289238929749, "learning_rate": 2.1426690358221237e-05, "loss": 0.4357, "mean_token_accuracy": 0.8555827379226685, "num_tokens": 682109653.0, "step": 20360 }, { "epoch": 1.2156757402101241, "grad_norm": 0.5459198951721191, "learning_rate": 2.141599892874107e-05, "loss": 0.4412, "mean_token_accuracy": 0.8522485852241516, "num_tokens": 682277333.0, "step": 20365 }, { "epoch": 1.215974212034384, "grad_norm": 0.5703451633453369, "learning_rate": 2.140530898113168e-05, "loss": 0.4896, "mean_token_accuracy": 0.8384468674659729, "num_tokens": 682445013.0, "step": 20370 }, { "epoch": 1.2162726838586437, "grad_norm": 0.5362686514854431, "learning_rate": 2.1394620517996766e-05, "loss": 0.4403, "mean_token_accuracy": 0.8533162355422974, "num_tokens": 682612693.0, "step": 20375 }, { "epoch": 1.2165711556829035, "grad_norm": 0.5646585822105408, "learning_rate": 2.1383933541939734e-05, "loss": 0.4701, "mean_token_accuracy": 0.8428307294845581, "num_tokens": 682780373.0, "step": 20380 }, { "epoch": 1.2168696275071633, "grad_norm": 0.5130622386932373, "learning_rate": 2.137324805556356e-05, "loss": 0.4715, "mean_token_accuracy": 0.8437194347381591, "num_tokens": 682948053.0, "step": 20385 }, { "epoch": 1.2171680993314231, "grad_norm": 0.49547046422958374, "learning_rate": 2.136256406147092e-05, "loss": 0.4698, "mean_token_accuracy": 0.8436359167098999, "num_tokens": 683115733.0, "step": 20390 }, { "epoch": 1.217466571155683, "grad_norm": 0.5067601799964905, "learning_rate": 2.1351881562264088e-05, "loss": 0.441, "mean_token_accuracy": 0.8517953038215638, "num_tokens": 683283413.0, "step": 20395 }, { "epoch": 1.2177650429799427, "grad_norm": 0.4947945177555084, "learning_rate": 2.1341200560544973e-05, "loss": 0.448, "mean_token_accuracy": 0.8514135599136352, "num_tokens": 683451093.0, "step": 20400 }, { "epoch": 1.2180635148042025, "grad_norm": 0.5381024479866028, "learning_rate": 2.133052105891515e-05, "loss": 0.4819, "mean_token_accuracy": 0.8396755218505859, "num_tokens": 683618773.0, "step": 20405 }, { "epoch": 1.2183619866284623, "grad_norm": 0.5483804941177368, "learning_rate": 2.131984305997579e-05, "loss": 0.4484, "mean_token_accuracy": 0.8510020256042481, "num_tokens": 683786453.0, "step": 20410 }, { "epoch": 1.218660458452722, "grad_norm": 0.506986677646637, "learning_rate": 2.1309166566327726e-05, "loss": 0.4516, "mean_token_accuracy": 0.8493916153907776, "num_tokens": 683954133.0, "step": 20415 }, { "epoch": 1.2189589302769819, "grad_norm": 0.45554739236831665, "learning_rate": 2.1298491580571423e-05, "loss": 0.379, "mean_token_accuracy": 0.8714839458465576, "num_tokens": 684121813.0, "step": 20420 }, { "epoch": 1.2192574021012417, "grad_norm": 0.5481733083724976, "learning_rate": 2.128781810530695e-05, "loss": 0.4582, "mean_token_accuracy": 0.8475664973258972, "num_tokens": 684289493.0, "step": 20425 }, { "epoch": 1.2195558739255015, "grad_norm": 0.49845367670059204, "learning_rate": 2.127714614313406e-05, "loss": 0.4688, "mean_token_accuracy": 0.8436497330665589, "num_tokens": 684456105.0, "step": 20430 }, { "epoch": 1.2198543457497613, "grad_norm": 0.5329987406730652, "learning_rate": 2.1266475696652065e-05, "loss": 0.4691, "mean_token_accuracy": 0.8459978461265564, "num_tokens": 684623785.0, "step": 20435 }, { "epoch": 1.220152817574021, "grad_norm": 0.514126718044281, "learning_rate": 2.125580676845998e-05, "loss": 0.4594, "mean_token_accuracy": 0.8477872014045715, "num_tokens": 684791465.0, "step": 20440 }, { "epoch": 1.2204512893982808, "grad_norm": 0.6443539261817932, "learning_rate": 2.1245139361156418e-05, "loss": 0.458, "mean_token_accuracy": 0.8491709470748902, "num_tokens": 684959145.0, "step": 20445 }, { "epoch": 1.2207497612225406, "grad_norm": 0.48586156964302063, "learning_rate": 2.123447347733961e-05, "loss": 0.4365, "mean_token_accuracy": 0.8554720878601074, "num_tokens": 685119867.0, "step": 20450 }, { "epoch": 1.2210482330468004, "grad_norm": 0.4845660924911499, "learning_rate": 2.1223809119607442e-05, "loss": 0.4949, "mean_token_accuracy": 0.8369378447532654, "num_tokens": 685287547.0, "step": 20455 }, { "epoch": 1.2213467048710602, "grad_norm": 0.49853649735450745, "learning_rate": 2.12131462905574e-05, "loss": 0.4531, "mean_token_accuracy": 0.8490635752677917, "num_tokens": 685455227.0, "step": 20460 }, { "epoch": 1.22164517669532, "grad_norm": 0.5116775631904602, "learning_rate": 2.1202484992786625e-05, "loss": 0.4622, "mean_token_accuracy": 0.8474949359893799, "num_tokens": 685622907.0, "step": 20465 }, { "epoch": 1.2219436485195798, "grad_norm": 0.5472502112388611, "learning_rate": 2.119182522889187e-05, "loss": 0.4004, "mean_token_accuracy": 0.8642333388328552, "num_tokens": 685776657.0, "step": 20470 }, { "epoch": 1.2222421203438396, "grad_norm": 0.4789251983165741, "learning_rate": 2.118116700146951e-05, "loss": 0.4308, "mean_token_accuracy": 0.8540617942810058, "num_tokens": 685944337.0, "step": 20475 }, { "epoch": 1.2225405921680994, "grad_norm": 10.693766593933105, "learning_rate": 2.1170510313115565e-05, "loss": 0.4689, "mean_token_accuracy": 0.8528689026832581, "num_tokens": 686112017.0, "step": 20480 }, { "epoch": 1.2228390639923592, "grad_norm": 0.5329935550689697, "learning_rate": 2.1159855166425664e-05, "loss": 0.4382, "mean_token_accuracy": 0.8534355282783508, "num_tokens": 686279697.0, "step": 20485 }, { "epoch": 1.223137535816619, "grad_norm": 0.49704545736312866, "learning_rate": 2.114920156399506e-05, "loss": 0.4327, "mean_token_accuracy": 0.8557318449020386, "num_tokens": 686447377.0, "step": 20490 }, { "epoch": 1.2234360076408788, "grad_norm": 0.5286566615104675, "learning_rate": 2.113854950841863e-05, "loss": 0.438, "mean_token_accuracy": 0.8551473259925843, "num_tokens": 686615057.0, "step": 20495 }, { "epoch": 1.2237344794651386, "grad_norm": 0.4883231222629547, "learning_rate": 2.112789900229089e-05, "loss": 0.4427, "mean_token_accuracy": 0.8521352767944336, "num_tokens": 686782737.0, "step": 20500 }, { "epoch": 1.2240329512893984, "grad_norm": 0.48276063799858093, "learning_rate": 2.111725004820597e-05, "loss": 0.4198, "mean_token_accuracy": 0.859071922302246, "num_tokens": 686950417.0, "step": 20505 }, { "epoch": 1.2243314231136582, "grad_norm": 0.5543134808540344, "learning_rate": 2.1106602648757606e-05, "loss": 0.4647, "mean_token_accuracy": 0.8451687932014466, "num_tokens": 687118097.0, "step": 20510 }, { "epoch": 1.2246298949379177, "grad_norm": 0.461599200963974, "learning_rate": 2.1095956806539184e-05, "loss": 0.3925, "mean_token_accuracy": 0.8689788818359375, "num_tokens": 687285777.0, "step": 20515 }, { "epoch": 1.2249283667621778, "grad_norm": 0.5091221332550049, "learning_rate": 2.1085312524143675e-05, "loss": 0.4722, "mean_token_accuracy": 0.8436001420021058, "num_tokens": 687453457.0, "step": 20520 }, { "epoch": 1.2252268385864373, "grad_norm": 0.5655896067619324, "learning_rate": 2.107466980416371e-05, "loss": 0.4435, "mean_token_accuracy": 0.852606475353241, "num_tokens": 687621137.0, "step": 20525 }, { "epoch": 1.2255253104106973, "grad_norm": 0.5249313116073608, "learning_rate": 2.106402864919151e-05, "loss": 0.4217, "mean_token_accuracy": 0.8591136813163758, "num_tokens": 687788817.0, "step": 20530 }, { "epoch": 1.225823782234957, "grad_norm": 0.5126509666442871, "learning_rate": 2.1053389061818928e-05, "loss": 0.4578, "mean_token_accuracy": 0.8487355351448059, "num_tokens": 687956497.0, "step": 20535 }, { "epoch": 1.226122254059217, "grad_norm": 0.4747845232486725, "learning_rate": 2.104275104463743e-05, "loss": 0.4363, "mean_token_accuracy": 0.853984260559082, "num_tokens": 688124177.0, "step": 20540 }, { "epoch": 1.2264207258834765, "grad_norm": 0.5217232704162598, "learning_rate": 2.1032114600238107e-05, "loss": 0.4543, "mean_token_accuracy": 0.8476738572120667, "num_tokens": 688291857.0, "step": 20545 }, { "epoch": 1.2267191977077363, "grad_norm": 0.4815486967563629, "learning_rate": 2.1021479731211642e-05, "loss": 0.4398, "mean_token_accuracy": 0.8530478358268738, "num_tokens": 688459537.0, "step": 20550 }, { "epoch": 1.227017669531996, "grad_norm": 0.5050053596496582, "learning_rate": 2.1010846440148384e-05, "loss": 0.4315, "mean_token_accuracy": 0.8557318568229675, "num_tokens": 688627217.0, "step": 20555 }, { "epoch": 1.2273161413562559, "grad_norm": 0.5565704107284546, "learning_rate": 2.100021472963824e-05, "loss": 0.47, "mean_token_accuracy": 0.8439401149749756, "num_tokens": 688794897.0, "step": 20560 }, { "epoch": 1.2276146131805157, "grad_norm": 0.5366640090942383, "learning_rate": 2.098958460227079e-05, "loss": 0.4457, "mean_token_accuracy": 0.8527023315429687, "num_tokens": 688960297.0, "step": 20565 }, { "epoch": 1.2279130850047755, "grad_norm": 0.46611812710762024, "learning_rate": 2.097895606063516e-05, "loss": 0.4257, "mean_token_accuracy": 0.8582309484481812, "num_tokens": 689127977.0, "step": 20570 }, { "epoch": 1.2282115568290353, "grad_norm": 0.49351072311401367, "learning_rate": 2.0968329107320145e-05, "loss": 0.456, "mean_token_accuracy": 0.8475366950035095, "num_tokens": 689295657.0, "step": 20575 }, { "epoch": 1.228510028653295, "grad_norm": 0.5155776143074036, "learning_rate": 2.095770374491415e-05, "loss": 0.4549, "mean_token_accuracy": 0.8492707490921021, "num_tokens": 689461982.0, "step": 20580 }, { "epoch": 1.2288085004775549, "grad_norm": 0.5490033030509949, "learning_rate": 2.0947079976005156e-05, "loss": 0.4518, "mean_token_accuracy": 0.8493021607398987, "num_tokens": 689629662.0, "step": 20585 }, { "epoch": 1.2291069723018146, "grad_norm": 0.5510979890823364, "learning_rate": 2.0936457803180792e-05, "loss": 0.4834, "mean_token_accuracy": 0.8399916291236877, "num_tokens": 689797342.0, "step": 20590 }, { "epoch": 1.2294054441260744, "grad_norm": 0.4988440275192261, "learning_rate": 2.0925837229028273e-05, "loss": 0.4362, "mean_token_accuracy": 0.853835153579712, "num_tokens": 689965022.0, "step": 20595 }, { "epoch": 1.2297039159503342, "grad_norm": 0.6232066750526428, "learning_rate": 2.091521825613445e-05, "loss": 0.5028, "mean_token_accuracy": 0.8367827653884887, "num_tokens": 690132702.0, "step": 20600 }, { "epoch": 1.230002387774594, "grad_norm": 0.5446926355361938, "learning_rate": 2.0904600887085757e-05, "loss": 0.4265, "mean_token_accuracy": 0.8578074693679809, "num_tokens": 690300382.0, "step": 20605 }, { "epoch": 1.2303008595988538, "grad_norm": 0.5652893781661987, "learning_rate": 2.089398512446825e-05, "loss": 0.476, "mean_token_accuracy": 0.8420493721961975, "num_tokens": 690468062.0, "step": 20610 }, { "epoch": 1.2305993314231136, "grad_norm": 0.44775089621543884, "learning_rate": 2.0883370970867604e-05, "loss": 0.4047, "mean_token_accuracy": 0.8656089663505554, "num_tokens": 690635742.0, "step": 20615 }, { "epoch": 1.2308978032473734, "grad_norm": 0.4957120418548584, "learning_rate": 2.0872758428869088e-05, "loss": 0.4462, "mean_token_accuracy": 0.8523738503456115, "num_tokens": 690803422.0, "step": 20620 }, { "epoch": 1.2311962750716332, "grad_norm": 0.522113025188446, "learning_rate": 2.0862147501057583e-05, "loss": 0.4773, "mean_token_accuracy": 0.8456698060035706, "num_tokens": 690971102.0, "step": 20625 }, { "epoch": 1.231494746895893, "grad_norm": 0.5285800695419312, "learning_rate": 2.0851538190017573e-05, "loss": 0.4488, "mean_token_accuracy": 0.8500119209289551, "num_tokens": 691138782.0, "step": 20630 }, { "epoch": 1.2317932187201528, "grad_norm": 0.5293931365013123, "learning_rate": 2.084093049833315e-05, "loss": 0.4796, "mean_token_accuracy": 0.8391029477119446, "num_tokens": 691306462.0, "step": 20635 }, { "epoch": 1.2320916905444126, "grad_norm": 0.4784390926361084, "learning_rate": 2.0830324428588028e-05, "loss": 0.4398, "mean_token_accuracy": 0.854085648059845, "num_tokens": 691474142.0, "step": 20640 }, { "epoch": 1.2323901623686724, "grad_norm": 0.4853532314300537, "learning_rate": 2.0819719983365484e-05, "loss": 0.4583, "mean_token_accuracy": 0.8484850406646729, "num_tokens": 691641822.0, "step": 20645 }, { "epoch": 1.2326886341929322, "grad_norm": 0.5179458260536194, "learning_rate": 2.0809117165248447e-05, "loss": 0.4613, "mean_token_accuracy": 0.8475903511047364, "num_tokens": 691809502.0, "step": 20650 }, { "epoch": 1.232987106017192, "grad_norm": 0.5093544125556946, "learning_rate": 2.0798515976819427e-05, "loss": 0.494, "mean_token_accuracy": 0.8383752822875976, "num_tokens": 691977182.0, "step": 20655 }, { "epoch": 1.2332855778414518, "grad_norm": 0.5244585275650024, "learning_rate": 2.078791642066053e-05, "loss": 0.4677, "mean_token_accuracy": 0.8452880859375, "num_tokens": 692144862.0, "step": 20660 }, { "epoch": 1.2335840496657116, "grad_norm": 0.47669073939323425, "learning_rate": 2.0777318499353486e-05, "loss": 0.3941, "mean_token_accuracy": 0.8674281358718872, "num_tokens": 692312542.0, "step": 20665 }, { "epoch": 1.2338825214899714, "grad_norm": 0.4995233416557312, "learning_rate": 2.0766722215479607e-05, "loss": 0.4621, "mean_token_accuracy": 0.8483538150787353, "num_tokens": 692480222.0, "step": 20670 }, { "epoch": 1.2341809933142311, "grad_norm": 0.5297977924346924, "learning_rate": 2.0756127571619815e-05, "loss": 0.4672, "mean_token_accuracy": 0.8449659943580627, "num_tokens": 692647902.0, "step": 20675 }, { "epoch": 1.234479465138491, "grad_norm": 0.5540143251419067, "learning_rate": 2.074553457035463e-05, "loss": 0.4837, "mean_token_accuracy": 0.839085054397583, "num_tokens": 692815582.0, "step": 20680 }, { "epoch": 1.2347779369627507, "grad_norm": 0.5271779298782349, "learning_rate": 2.0734943214264168e-05, "loss": 0.4724, "mean_token_accuracy": 0.8444709539413452, "num_tokens": 692983262.0, "step": 20685 }, { "epoch": 1.2350764087870105, "grad_norm": 0.4945247173309326, "learning_rate": 2.0724353505928167e-05, "loss": 0.419, "mean_token_accuracy": 0.8598055601119995, "num_tokens": 693150942.0, "step": 20690 }, { "epoch": 1.2353748806112703, "grad_norm": 0.45129886269569397, "learning_rate": 2.071376544792592e-05, "loss": 0.3999, "mean_token_accuracy": 0.8669092297554016, "num_tokens": 693318622.0, "step": 20695 }, { "epoch": 1.2356733524355301, "grad_norm": 0.4860512912273407, "learning_rate": 2.0703179042836375e-05, "loss": 0.4071, "mean_token_accuracy": 0.8625849962234498, "num_tokens": 693486302.0, "step": 20700 }, { "epoch": 1.23597182425979, "grad_norm": 0.5046267509460449, "learning_rate": 2.069259429323801e-05, "loss": 0.4617, "mean_token_accuracy": 0.8464213252067566, "num_tokens": 693653982.0, "step": 20705 }, { "epoch": 1.2362702960840497, "grad_norm": 0.5387168526649475, "learning_rate": 2.0682011201708973e-05, "loss": 0.445, "mean_token_accuracy": 0.8519145846366882, "num_tokens": 693821662.0, "step": 20710 }, { "epoch": 1.2365687679083095, "grad_norm": 0.45135805010795593, "learning_rate": 2.0671429770826937e-05, "loss": 0.4226, "mean_token_accuracy": 0.8580281496047973, "num_tokens": 693989342.0, "step": 20715 }, { "epoch": 1.2368672397325693, "grad_norm": 0.5135748386383057, "learning_rate": 2.0660850003169225e-05, "loss": 0.4464, "mean_token_accuracy": 0.8520040512084961, "num_tokens": 694157022.0, "step": 20720 }, { "epoch": 1.237165711556829, "grad_norm": 0.5206548571586609, "learning_rate": 2.065027190131274e-05, "loss": 0.4733, "mean_token_accuracy": 0.8424489974975586, "num_tokens": 694324702.0, "step": 20725 }, { "epoch": 1.2374641833810889, "grad_norm": 0.5115079879760742, "learning_rate": 2.0639695467833954e-05, "loss": 0.4617, "mean_token_accuracy": 0.8463378310203552, "num_tokens": 694492382.0, "step": 20730 }, { "epoch": 1.2377626552053487, "grad_norm": 0.46827027201652527, "learning_rate": 2.0629120705308974e-05, "loss": 0.412, "mean_token_accuracy": 0.8624477982521057, "num_tokens": 694660062.0, "step": 20735 }, { "epoch": 1.2380611270296085, "grad_norm": 0.5105363130569458, "learning_rate": 2.061854761631346e-05, "loss": 0.4265, "mean_token_accuracy": 0.8570320844650269, "num_tokens": 694827742.0, "step": 20740 }, { "epoch": 1.2383595988538683, "grad_norm": 0.6181185245513916, "learning_rate": 2.060797620342269e-05, "loss": 0.4526, "mean_token_accuracy": 0.848997962474823, "num_tokens": 694995422.0, "step": 20745 }, { "epoch": 1.238658070678128, "grad_norm": 0.535239577293396, "learning_rate": 2.059740646921154e-05, "loss": 0.4583, "mean_token_accuracy": 0.8479243636131286, "num_tokens": 695163102.0, "step": 20750 }, { "epoch": 1.2389565425023878, "grad_norm": 0.4934963583946228, "learning_rate": 2.0586838416254438e-05, "loss": 0.4619, "mean_token_accuracy": 0.8474710822105408, "num_tokens": 695330782.0, "step": 20755 }, { "epoch": 1.2392550143266476, "grad_norm": 0.541788637638092, "learning_rate": 2.057627204712545e-05, "loss": 0.4779, "mean_token_accuracy": 0.842794942855835, "num_tokens": 695498462.0, "step": 20760 }, { "epoch": 1.2395534861509074, "grad_norm": 0.4743037819862366, "learning_rate": 2.05657073643982e-05, "loss": 0.436, "mean_token_accuracy": 0.8543301939964294, "num_tokens": 695666142.0, "step": 20765 }, { "epoch": 1.2398519579751672, "grad_norm": 0.5119059681892395, "learning_rate": 2.0555144370645906e-05, "loss": 0.4212, "mean_token_accuracy": 0.8579625368118287, "num_tokens": 695833822.0, "step": 20770 }, { "epoch": 1.240150429799427, "grad_norm": 0.5168021321296692, "learning_rate": 2.0544583068441397e-05, "loss": 0.4612, "mean_token_accuracy": 0.8471311092376709, "num_tokens": 696001502.0, "step": 20775 }, { "epoch": 1.2404489016236868, "grad_norm": 0.49903252720832825, "learning_rate": 2.0534023460357067e-05, "loss": 0.4371, "mean_token_accuracy": 0.8529404759407043, "num_tokens": 696169182.0, "step": 20780 }, { "epoch": 1.2407473734479466, "grad_norm": 0.5115178823471069, "learning_rate": 2.0523465548964898e-05, "loss": 0.4563, "mean_token_accuracy": 0.8501013994216919, "num_tokens": 696336862.0, "step": 20785 }, { "epoch": 1.2410458452722062, "grad_norm": 0.5184021592140198, "learning_rate": 2.051290933683646e-05, "loss": 0.4402, "mean_token_accuracy": 0.8541095018386841, "num_tokens": 696504542.0, "step": 20790 }, { "epoch": 1.2413443170964662, "grad_norm": 0.48175662755966187, "learning_rate": 2.0502354826542925e-05, "loss": 0.4681, "mean_token_accuracy": 0.84702969789505, "num_tokens": 696672222.0, "step": 20795 }, { "epoch": 1.2416427889207258, "grad_norm": 0.4659521281719208, "learning_rate": 2.049180202065504e-05, "loss": 0.437, "mean_token_accuracy": 0.8540677547454834, "num_tokens": 696839902.0, "step": 20800 }, { "epoch": 1.2419412607449858, "grad_norm": 0.5196808576583862, "learning_rate": 2.0481250921743116e-05, "loss": 0.4808, "mean_token_accuracy": 0.8409877061843872, "num_tokens": 697007582.0, "step": 20805 }, { "epoch": 1.2422397325692454, "grad_norm": 0.5354881882667542, "learning_rate": 2.047070153237709e-05, "loss": 0.4522, "mean_token_accuracy": 0.8516044378280639, "num_tokens": 697175262.0, "step": 20810 }, { "epoch": 1.2425382043935054, "grad_norm": 0.5706625580787659, "learning_rate": 2.0460153855126444e-05, "loss": 0.4707, "mean_token_accuracy": 0.8445604085922241, "num_tokens": 697342942.0, "step": 20815 }, { "epoch": 1.242836676217765, "grad_norm": 0.54524827003479, "learning_rate": 2.044960789256028e-05, "loss": 0.4442, "mean_token_accuracy": 0.8526422500610351, "num_tokens": 697510622.0, "step": 20820 }, { "epoch": 1.2431351480420247, "grad_norm": 0.49509236216545105, "learning_rate": 2.043906364724724e-05, "loss": 0.4434, "mean_token_accuracy": 0.8518549561500549, "num_tokens": 697678302.0, "step": 20825 }, { "epoch": 1.2434336198662845, "grad_norm": 0.5055444836616516, "learning_rate": 2.0428521121755572e-05, "loss": 0.4636, "mean_token_accuracy": 0.8464571237564087, "num_tokens": 697845982.0, "step": 20830 }, { "epoch": 1.2437320916905443, "grad_norm": 0.504357635974884, "learning_rate": 2.041798031865313e-05, "loss": 0.4616, "mean_token_accuracy": 0.846540629863739, "num_tokens": 698013662.0, "step": 20835 }, { "epoch": 1.2440305635148041, "grad_norm": 0.4978924095630646, "learning_rate": 2.040744124050728e-05, "loss": 0.3966, "mean_token_accuracy": 0.8676249504089355, "num_tokens": 698181342.0, "step": 20840 }, { "epoch": 1.244329035339064, "grad_norm": 0.5274893045425415, "learning_rate": 2.0396903889885045e-05, "loss": 0.4734, "mean_token_accuracy": 0.8450196743011474, "num_tokens": 698349022.0, "step": 20845 }, { "epoch": 1.2446275071633237, "grad_norm": 0.5136992931365967, "learning_rate": 2.038636826935296e-05, "loss": 0.4463, "mean_token_accuracy": 0.849977707862854, "num_tokens": 698508905.0, "step": 20850 }, { "epoch": 1.2449259789875835, "grad_norm": 0.5866198539733887, "learning_rate": 2.03758343814772e-05, "loss": 0.4484, "mean_token_accuracy": 0.8517177700996399, "num_tokens": 698676585.0, "step": 20855 }, { "epoch": 1.2452244508118433, "grad_norm": 0.4704689383506775, "learning_rate": 2.036530222882347e-05, "loss": 0.4042, "mean_token_accuracy": 0.8641178607940674, "num_tokens": 698844265.0, "step": 20860 }, { "epoch": 1.245522922636103, "grad_norm": 0.510746955871582, "learning_rate": 2.0354771813957064e-05, "loss": 0.433, "mean_token_accuracy": 0.8544494867324829, "num_tokens": 699011945.0, "step": 20865 }, { "epoch": 1.2458213944603629, "grad_norm": 0.49832162261009216, "learning_rate": 2.0344243139442882e-05, "loss": 0.4382, "mean_token_accuracy": 0.8540260076522828, "num_tokens": 699179625.0, "step": 20870 }, { "epoch": 1.2461198662846227, "grad_norm": 0.49919718503952026, "learning_rate": 2.0333716207845353e-05, "loss": 0.4314, "mean_token_accuracy": 0.8555111408233642, "num_tokens": 699347305.0, "step": 20875 }, { "epoch": 1.2464183381088825, "grad_norm": 0.4491886496543884, "learning_rate": 2.032319102172852e-05, "loss": 0.4263, "mean_token_accuracy": 0.857318389415741, "num_tokens": 699514985.0, "step": 20880 }, { "epoch": 1.2467168099331423, "grad_norm": 0.5185744762420654, "learning_rate": 2.0312667583655987e-05, "loss": 0.3924, "mean_token_accuracy": 0.8676130414009094, "num_tokens": 699682665.0, "step": 20885 }, { "epoch": 1.247015281757402, "grad_norm": 0.5070383548736572, "learning_rate": 2.030214589619092e-05, "loss": 0.4611, "mean_token_accuracy": 0.8475545763969421, "num_tokens": 699850345.0, "step": 20890 }, { "epoch": 1.2473137535816619, "grad_norm": 0.5295190215110779, "learning_rate": 2.0291625961896082e-05, "loss": 0.4478, "mean_token_accuracy": 0.8517117738723755, "num_tokens": 700018025.0, "step": 20895 }, { "epoch": 1.2476122254059216, "grad_norm": 0.5279406309127808, "learning_rate": 2.0281107783333787e-05, "loss": 0.4519, "mean_token_accuracy": 0.8491053342819214, "num_tokens": 700185705.0, "step": 20900 }, { "epoch": 1.2479106972301814, "grad_norm": 0.5204823017120361, "learning_rate": 2.0270591363065945e-05, "loss": 0.4487, "mean_token_accuracy": 0.8485327363014221, "num_tokens": 700353385.0, "step": 20905 }, { "epoch": 1.2482091690544412, "grad_norm": 0.4830276668071747, "learning_rate": 2.0260076703654007e-05, "loss": 0.4311, "mean_token_accuracy": 0.8544077277183533, "num_tokens": 700521065.0, "step": 20910 }, { "epoch": 1.248507640878701, "grad_norm": 0.5601676106452942, "learning_rate": 2.0249563807659023e-05, "loss": 0.4761, "mean_token_accuracy": 0.8422819972038269, "num_tokens": 700688745.0, "step": 20915 }, { "epoch": 1.2488061127029608, "grad_norm": 0.4411035180091858, "learning_rate": 2.0239052677641607e-05, "loss": 0.4246, "mean_token_accuracy": 0.8582548141479492, "num_tokens": 700856425.0, "step": 20920 }, { "epoch": 1.2491045845272206, "grad_norm": 0.5106505751609802, "learning_rate": 2.022854331616193e-05, "loss": 0.4429, "mean_token_accuracy": 0.8539782881736755, "num_tokens": 701024105.0, "step": 20925 }, { "epoch": 1.2494030563514804, "grad_norm": 0.4430442750453949, "learning_rate": 2.0218035725779745e-05, "loss": 0.4174, "mean_token_accuracy": 0.8587796807289123, "num_tokens": 701191785.0, "step": 20930 }, { "epoch": 1.2497015281757402, "grad_norm": 0.5601619482040405, "learning_rate": 2.020752990905437e-05, "loss": 0.476, "mean_token_accuracy": 0.843546462059021, "num_tokens": 701359465.0, "step": 20935 }, { "epoch": 1.25, "grad_norm": 0.46632280945777893, "learning_rate": 2.0197025868544682e-05, "loss": 0.4539, "mean_token_accuracy": 0.8478050827980042, "num_tokens": 701527145.0, "step": 20940 }, { "epoch": 1.2502984718242598, "grad_norm": 0.5367595553398132, "learning_rate": 2.018652360680915e-05, "loss": 0.4387, "mean_token_accuracy": 0.8531731128692627, "num_tokens": 701694825.0, "step": 20945 }, { "epoch": 1.2505969436485196, "grad_norm": 0.499789834022522, "learning_rate": 2.017602312640578e-05, "loss": 0.4521, "mean_token_accuracy": 0.8490934014320374, "num_tokens": 701862505.0, "step": 20950 }, { "epoch": 1.2508954154727794, "grad_norm": 0.5380803942680359, "learning_rate": 2.0165524429892163e-05, "loss": 0.4606, "mean_token_accuracy": 0.8480734825134277, "num_tokens": 702030185.0, "step": 20955 }, { "epoch": 1.2511938872970392, "grad_norm": 0.5221821069717407, "learning_rate": 2.0155027519825448e-05, "loss": 0.4492, "mean_token_accuracy": 0.8515090107917785, "num_tokens": 702197865.0, "step": 20960 }, { "epoch": 1.251492359121299, "grad_norm": 0.508967936038971, "learning_rate": 2.0144532398762357e-05, "loss": 0.4327, "mean_token_accuracy": 0.8550340056419372, "num_tokens": 702365545.0, "step": 20965 }, { "epoch": 1.2517908309455588, "grad_norm": 0.514724850654602, "learning_rate": 2.013403906925917e-05, "loss": 0.4381, "mean_token_accuracy": 0.8526661038398743, "num_tokens": 702533225.0, "step": 20970 }, { "epoch": 1.2520893027698186, "grad_norm": 0.5254999399185181, "learning_rate": 2.0123547533871717e-05, "loss": 0.4195, "mean_token_accuracy": 0.858547055721283, "num_tokens": 702700905.0, "step": 20975 }, { "epoch": 1.2523877745940784, "grad_norm": 0.565106987953186, "learning_rate": 2.0113057795155426e-05, "loss": 0.457, "mean_token_accuracy": 0.8483597755432128, "num_tokens": 702868585.0, "step": 20980 }, { "epoch": 1.2526862464183381, "grad_norm": 0.5399832129478455, "learning_rate": 2.0102569855665248e-05, "loss": 0.4664, "mean_token_accuracy": 0.8447751522064209, "num_tokens": 703036265.0, "step": 20985 }, { "epoch": 1.252984718242598, "grad_norm": 0.5129609704017639, "learning_rate": 2.0092083717955728e-05, "loss": 0.4352, "mean_token_accuracy": 0.8541810750961304, "num_tokens": 703203945.0, "step": 20990 }, { "epoch": 1.2532831900668577, "grad_norm": 0.5159810185432434, "learning_rate": 2.0081599384580952e-05, "loss": 0.4441, "mean_token_accuracy": 0.8502206802368164, "num_tokens": 703371625.0, "step": 20995 }, { "epoch": 1.2535816618911175, "grad_norm": 0.5023723244667053, "learning_rate": 2.0071116858094573e-05, "loss": 0.4267, "mean_token_accuracy": 0.85719313621521, "num_tokens": 703539305.0, "step": 21000 }, { "epoch": 1.2538801337153773, "grad_norm": 0.5119024515151978, "learning_rate": 2.006063614104981e-05, "loss": 0.4386, "mean_token_accuracy": 0.8528092622756958, "num_tokens": 703706985.0, "step": 21005 }, { "epoch": 1.2541786055396371, "grad_norm": 0.4861049950122833, "learning_rate": 2.0050157235999424e-05, "loss": 0.455, "mean_token_accuracy": 0.848872721195221, "num_tokens": 703874665.0, "step": 21010 }, { "epoch": 1.254477077363897, "grad_norm": 0.48757025599479675, "learning_rate": 2.0039680145495753e-05, "loss": 0.4543, "mean_token_accuracy": 0.849385666847229, "num_tokens": 704042345.0, "step": 21015 }, { "epoch": 1.2547755491881567, "grad_norm": 0.513938307762146, "learning_rate": 2.0029204872090685e-05, "loss": 0.4597, "mean_token_accuracy": 0.8479482412338257, "num_tokens": 704210025.0, "step": 21020 }, { "epoch": 1.2550740210124165, "grad_norm": 0.5094524025917053, "learning_rate": 2.0018731418335658e-05, "loss": 0.4601, "mean_token_accuracy": 0.8463258981704712, "num_tokens": 704377705.0, "step": 21025 }, { "epoch": 1.2553724928366763, "grad_norm": 0.5137123465538025, "learning_rate": 2.0008259786781695e-05, "loss": 0.4176, "mean_token_accuracy": 0.8611833572387695, "num_tokens": 704545385.0, "step": 21030 }, { "epoch": 1.255670964660936, "grad_norm": 0.4705581068992615, "learning_rate": 1.9997789979979332e-05, "loss": 0.4448, "mean_token_accuracy": 0.8523977041244507, "num_tokens": 704713065.0, "step": 21035 }, { "epoch": 1.2559694364851959, "grad_norm": 0.5636093020439148, "learning_rate": 1.998732200047871e-05, "loss": 0.4859, "mean_token_accuracy": 0.839198362827301, "num_tokens": 704880745.0, "step": 21040 }, { "epoch": 1.2562679083094554, "grad_norm": 0.6135547161102295, "learning_rate": 1.9976855850829463e-05, "loss": 0.4652, "mean_token_accuracy": 0.8450972080230713, "num_tokens": 705048425.0, "step": 21045 }, { "epoch": 1.2565663801337155, "grad_norm": 0.491524875164032, "learning_rate": 1.996639153358084e-05, "loss": 0.4376, "mean_token_accuracy": 0.8533401012420654, "num_tokens": 705216105.0, "step": 21050 }, { "epoch": 1.256864851957975, "grad_norm": 0.5055975317955017, "learning_rate": 1.995592905128162e-05, "loss": 0.4174, "mean_token_accuracy": 0.859686267375946, "num_tokens": 705383785.0, "step": 21055 }, { "epoch": 1.257163323782235, "grad_norm": 0.47449231147766113, "learning_rate": 1.9945468406480127e-05, "loss": 0.4309, "mean_token_accuracy": 0.8571374177932739, "num_tokens": 705548788.0, "step": 21060 }, { "epoch": 1.2574617956064946, "grad_norm": 0.5250605940818787, "learning_rate": 1.9935009601724243e-05, "loss": 0.4512, "mean_token_accuracy": 0.848520827293396, "num_tokens": 705716468.0, "step": 21065 }, { "epoch": 1.2577602674307546, "grad_norm": 0.5314686298370361, "learning_rate": 1.99245526395614e-05, "loss": 0.4338, "mean_token_accuracy": 0.85520099401474, "num_tokens": 705884148.0, "step": 21070 }, { "epoch": 1.2580587392550142, "grad_norm": 0.5164315104484558, "learning_rate": 1.9914097522538587e-05, "loss": 0.4721, "mean_token_accuracy": 0.8424549698829651, "num_tokens": 706051828.0, "step": 21075 }, { "epoch": 1.2583572110792742, "grad_norm": 0.5837658047676086, "learning_rate": 1.9903644253202353e-05, "loss": 0.4772, "mean_token_accuracy": 0.8415066242218018, "num_tokens": 706219508.0, "step": 21080 }, { "epoch": 1.2586556829035338, "grad_norm": 0.483070969581604, "learning_rate": 1.9893192834098767e-05, "loss": 0.4164, "mean_token_accuracy": 0.8601574659347534, "num_tokens": 706387188.0, "step": 21085 }, { "epoch": 1.2589541547277938, "grad_norm": 0.5082573890686035, "learning_rate": 1.9882743267773474e-05, "loss": 0.4692, "mean_token_accuracy": 0.8421984910964966, "num_tokens": 706554868.0, "step": 21090 }, { "epoch": 1.2592526265520534, "grad_norm": 0.5246371030807495, "learning_rate": 1.9872295556771652e-05, "loss": 0.4613, "mean_token_accuracy": 0.8465238451957703, "num_tokens": 706722474.0, "step": 21095 }, { "epoch": 1.2595510983763134, "grad_norm": 0.48530304431915283, "learning_rate": 1.986184970363803e-05, "loss": 0.432, "mean_token_accuracy": 0.854861032962799, "num_tokens": 706890154.0, "step": 21100 }, { "epoch": 1.259849570200573, "grad_norm": 0.45325836539268494, "learning_rate": 1.985140571091692e-05, "loss": 0.4395, "mean_token_accuracy": 0.8546761393547058, "num_tokens": 707057834.0, "step": 21105 }, { "epoch": 1.260148042024833, "grad_norm": 0.4651221036911011, "learning_rate": 1.984096358115211e-05, "loss": 0.4472, "mean_token_accuracy": 0.8494453191757202, "num_tokens": 707225514.0, "step": 21110 }, { "epoch": 1.2604465138490926, "grad_norm": 0.48300737142562866, "learning_rate": 1.9830523316886995e-05, "loss": 0.4439, "mean_token_accuracy": 0.8510388016700745, "num_tokens": 707386234.0, "step": 21115 }, { "epoch": 1.2607449856733524, "grad_norm": 0.4908396303653717, "learning_rate": 1.9820084920664484e-05, "loss": 0.4403, "mean_token_accuracy": 0.8526840090751648, "num_tokens": 707553914.0, "step": 21120 }, { "epoch": 1.2610434574976122, "grad_norm": 0.47433868050575256, "learning_rate": 1.980964839502705e-05, "loss": 0.4126, "mean_token_accuracy": 0.8613742113113403, "num_tokens": 707721594.0, "step": 21125 }, { "epoch": 1.261341929321872, "grad_norm": 0.5637901425361633, "learning_rate": 1.9799213742516692e-05, "loss": 0.4685, "mean_token_accuracy": 0.845753300189972, "num_tokens": 707889274.0, "step": 21130 }, { "epoch": 1.2616404011461317, "grad_norm": 0.48012587428092957, "learning_rate": 1.9788780965674973e-05, "loss": 0.4756, "mean_token_accuracy": 0.8419658780097962, "num_tokens": 708056954.0, "step": 21135 }, { "epoch": 1.2619388729703915, "grad_norm": 0.47778651118278503, "learning_rate": 1.9778350067042983e-05, "loss": 0.4067, "mean_token_accuracy": 0.8626267433166503, "num_tokens": 708224634.0, "step": 21140 }, { "epoch": 1.2622373447946513, "grad_norm": 0.5273913741111755, "learning_rate": 1.9767921049161354e-05, "loss": 0.4211, "mean_token_accuracy": 0.8585768699645996, "num_tokens": 708392314.0, "step": 21145 }, { "epoch": 1.2625358166189111, "grad_norm": 0.47344717383384705, "learning_rate": 1.9757493914570275e-05, "loss": 0.4309, "mean_token_accuracy": 0.8556125640869141, "num_tokens": 708559994.0, "step": 21150 }, { "epoch": 1.262834288443171, "grad_norm": 0.44999417662620544, "learning_rate": 1.9747068665809464e-05, "loss": 0.4184, "mean_token_accuracy": 0.860151493549347, "num_tokens": 708727674.0, "step": 21155 }, { "epoch": 1.2631327602674307, "grad_norm": 0.5429452657699585, "learning_rate": 1.9736645305418173e-05, "loss": 0.4269, "mean_token_accuracy": 0.8565700888633728, "num_tokens": 708890569.0, "step": 21160 }, { "epoch": 1.2634312320916905, "grad_norm": 0.4813825786113739, "learning_rate": 1.972622383593523e-05, "loss": 0.4531, "mean_token_accuracy": 0.8496003985404968, "num_tokens": 709058249.0, "step": 21165 }, { "epoch": 1.2637297039159503, "grad_norm": 0.5351575613021851, "learning_rate": 1.971580425989894e-05, "loss": 0.4544, "mean_token_accuracy": 0.8501670122146606, "num_tokens": 709225929.0, "step": 21170 }, { "epoch": 1.26402817574021, "grad_norm": 0.49393555521965027, "learning_rate": 1.9705386579847217e-05, "loss": 0.4573, "mean_token_accuracy": 0.8498052358627319, "num_tokens": 709388339.0, "step": 21175 }, { "epoch": 1.2643266475644699, "grad_norm": 0.5100563764572144, "learning_rate": 1.9694970798317458e-05, "loss": 0.4486, "mean_token_accuracy": 0.8513360261917114, "num_tokens": 709556019.0, "step": 21180 }, { "epoch": 1.2646251193887297, "grad_norm": 0.48210608959198, "learning_rate": 1.9684556917846624e-05, "loss": 0.4324, "mean_token_accuracy": 0.8570201635360718, "num_tokens": 709723699.0, "step": 21185 }, { "epoch": 1.2649235912129895, "grad_norm": 0.4726911187171936, "learning_rate": 1.9674144940971217e-05, "loss": 0.439, "mean_token_accuracy": 0.8528653502464294, "num_tokens": 709885029.0, "step": 21190 }, { "epoch": 1.2652220630372493, "grad_norm": 0.4879405200481415, "learning_rate": 1.9663734870227253e-05, "loss": 0.4293, "mean_token_accuracy": 0.8567935228347778, "num_tokens": 710052709.0, "step": 21195 }, { "epoch": 1.265520534861509, "grad_norm": 0.49307912588119507, "learning_rate": 1.96533267081503e-05, "loss": 0.4238, "mean_token_accuracy": 0.8584993600845336, "num_tokens": 710220389.0, "step": 21200 }, { "epoch": 1.2658190066857689, "grad_norm": 0.46078819036483765, "learning_rate": 1.9642920457275466e-05, "loss": 0.382, "mean_token_accuracy": 0.8713944911956787, "num_tokens": 710388069.0, "step": 21205 }, { "epoch": 1.2661174785100286, "grad_norm": 0.5175058841705322, "learning_rate": 1.9632516120137372e-05, "loss": 0.4623, "mean_token_accuracy": 0.8475844025611877, "num_tokens": 710555749.0, "step": 21210 }, { "epoch": 1.2664159503342884, "grad_norm": 0.5143455266952515, "learning_rate": 1.96221136992702e-05, "loss": 0.4696, "mean_token_accuracy": 0.8443755149841309, "num_tokens": 710723429.0, "step": 21215 }, { "epoch": 1.2667144221585482, "grad_norm": 0.5266312956809998, "learning_rate": 1.9611713197207643e-05, "loss": 0.4307, "mean_token_accuracy": 0.8573720693588257, "num_tokens": 710891109.0, "step": 21220 }, { "epoch": 1.267012893982808, "grad_norm": 0.4937094748020172, "learning_rate": 1.9601314616482942e-05, "loss": 0.4584, "mean_token_accuracy": 0.8467195510864258, "num_tokens": 711058789.0, "step": 21225 }, { "epoch": 1.2673113658070678, "grad_norm": 0.562179446220398, "learning_rate": 1.9590917959628856e-05, "loss": 0.4469, "mean_token_accuracy": 0.8507336378097534, "num_tokens": 711226469.0, "step": 21230 }, { "epoch": 1.2676098376313276, "grad_norm": 0.5001862049102783, "learning_rate": 1.958052322917769e-05, "loss": 0.4076, "mean_token_accuracy": 0.8633365273475647, "num_tokens": 711394149.0, "step": 21235 }, { "epoch": 1.2679083094555874, "grad_norm": 0.5662214756011963, "learning_rate": 1.957013042766126e-05, "loss": 0.4538, "mean_token_accuracy": 0.8486818432807922, "num_tokens": 711561829.0, "step": 21240 }, { "epoch": 1.2682067812798472, "grad_norm": 0.4831666648387909, "learning_rate": 1.9559739557610936e-05, "loss": 0.4079, "mean_token_accuracy": 0.863873302936554, "num_tokens": 711729509.0, "step": 21245 }, { "epoch": 1.268505253104107, "grad_norm": 0.5003403425216675, "learning_rate": 1.9549350621557604e-05, "loss": 0.4613, "mean_token_accuracy": 0.8470654964447022, "num_tokens": 711897189.0, "step": 21250 }, { "epoch": 1.2688037249283668, "grad_norm": 0.5404338836669922, "learning_rate": 1.953896362203168e-05, "loss": 0.4633, "mean_token_accuracy": 0.8459620714187622, "num_tokens": 712064869.0, "step": 21255 }, { "epoch": 1.2691021967526266, "grad_norm": 0.4927692711353302, "learning_rate": 1.9528578561563116e-05, "loss": 0.4459, "mean_token_accuracy": 0.8504831314086914, "num_tokens": 712232549.0, "step": 21260 }, { "epoch": 1.2694006685768864, "grad_norm": 0.538818359375, "learning_rate": 1.951819544268137e-05, "loss": 0.4919, "mean_token_accuracy": 0.837242043018341, "num_tokens": 712400229.0, "step": 21265 }, { "epoch": 1.2696991404011462, "grad_norm": 0.47741541266441345, "learning_rate": 1.9507814267915452e-05, "loss": 0.4078, "mean_token_accuracy": 0.8616187453269959, "num_tokens": 712567909.0, "step": 21270 }, { "epoch": 1.269997612225406, "grad_norm": 0.5630694627761841, "learning_rate": 1.9497435039793892e-05, "loss": 0.4403, "mean_token_accuracy": 0.854849112033844, "num_tokens": 712735589.0, "step": 21275 }, { "epoch": 1.2702960840496658, "grad_norm": 0.4834173619747162, "learning_rate": 1.948705776084474e-05, "loss": 0.4271, "mean_token_accuracy": 0.8579267501831055, "num_tokens": 712903269.0, "step": 21280 }, { "epoch": 1.2705945558739256, "grad_norm": 0.7313836216926575, "learning_rate": 1.9476682433595573e-05, "loss": 0.4571, "mean_token_accuracy": 0.8482643604278565, "num_tokens": 713070949.0, "step": 21285 }, { "epoch": 1.2708930276981854, "grad_norm": 0.5565345883369446, "learning_rate": 1.9466309060573486e-05, "loss": 0.4797, "mean_token_accuracy": 0.8417273163795471, "num_tokens": 713238629.0, "step": 21290 }, { "epoch": 1.2711914995224451, "grad_norm": 0.5181086659431458, "learning_rate": 1.945593764430511e-05, "loss": 0.4597, "mean_token_accuracy": 0.8472981095314026, "num_tokens": 713406309.0, "step": 21295 }, { "epoch": 1.271489971346705, "grad_norm": 0.5557349324226379, "learning_rate": 1.9445568187316608e-05, "loss": 0.4643, "mean_token_accuracy": 0.8465346574783326, "num_tokens": 713573989.0, "step": 21300 }, { "epoch": 1.2717884431709647, "grad_norm": 0.5677387714385986, "learning_rate": 1.9435200692133633e-05, "loss": 0.4624, "mean_token_accuracy": 0.8455564856529236, "num_tokens": 713741669.0, "step": 21305 }, { "epoch": 1.2720869149952245, "grad_norm": 0.5147316455841064, "learning_rate": 1.9424835161281395e-05, "loss": 0.4841, "mean_token_accuracy": 0.8406000256538391, "num_tokens": 713909349.0, "step": 21310 }, { "epoch": 1.2723853868194843, "grad_norm": 0.5038878917694092, "learning_rate": 1.94144715972846e-05, "loss": 0.4411, "mean_token_accuracy": 0.85284503698349, "num_tokens": 714077029.0, "step": 21315 }, { "epoch": 1.272683858643744, "grad_norm": 0.5570834875106812, "learning_rate": 1.9404110002667486e-05, "loss": 0.4488, "mean_token_accuracy": 0.8506739735603333, "num_tokens": 714244709.0, "step": 21320 }, { "epoch": 1.272982330468004, "grad_norm": 0.4947250485420227, "learning_rate": 1.9393750379953824e-05, "loss": 0.452, "mean_token_accuracy": 0.8522903442382812, "num_tokens": 714412389.0, "step": 21325 }, { "epoch": 1.2732808022922635, "grad_norm": 0.5436123013496399, "learning_rate": 1.9383392731666873e-05, "loss": 0.482, "mean_token_accuracy": 0.8396278262138367, "num_tokens": 714580069.0, "step": 21330 }, { "epoch": 1.2735792741165235, "grad_norm": 0.49210286140441895, "learning_rate": 1.937303706032944e-05, "loss": 0.4496, "mean_token_accuracy": 0.8511392116546631, "num_tokens": 714747749.0, "step": 21335 }, { "epoch": 1.273877745940783, "grad_norm": 0.4916885197162628, "learning_rate": 1.9362683368463835e-05, "loss": 0.4069, "mean_token_accuracy": 0.8642789006233216, "num_tokens": 714915429.0, "step": 21340 }, { "epoch": 1.274176217765043, "grad_norm": 0.48606356978416443, "learning_rate": 1.93523316585919e-05, "loss": 0.3987, "mean_token_accuracy": 0.865638792514801, "num_tokens": 715083109.0, "step": 21345 }, { "epoch": 1.2744746895893027, "grad_norm": 0.4798738360404968, "learning_rate": 1.9341981933234975e-05, "loss": 0.4908, "mean_token_accuracy": 0.8389955759048462, "num_tokens": 715250789.0, "step": 21350 }, { "epoch": 1.2747731614135627, "grad_norm": 0.47659364342689514, "learning_rate": 1.933163419491393e-05, "loss": 0.4375, "mean_token_accuracy": 0.8537814617156982, "num_tokens": 715418469.0, "step": 21355 }, { "epoch": 1.2750716332378222, "grad_norm": 0.530511200428009, "learning_rate": 1.9321288446149145e-05, "loss": 0.4304, "mean_token_accuracy": 0.8576046705245972, "num_tokens": 715586149.0, "step": 21360 }, { "epoch": 1.2753701050620823, "grad_norm": 0.5185045599937439, "learning_rate": 1.9310944689460515e-05, "loss": 0.4503, "mean_token_accuracy": 0.8511928796768189, "num_tokens": 715753829.0, "step": 21365 }, { "epoch": 1.2756685768863418, "grad_norm": 0.4745495021343231, "learning_rate": 1.930060292736747e-05, "loss": 0.4214, "mean_token_accuracy": 0.8583382964134216, "num_tokens": 715921509.0, "step": 21370 }, { "epoch": 1.2759670487106018, "grad_norm": 0.5337730646133423, "learning_rate": 1.9290263162388917e-05, "loss": 0.4583, "mean_token_accuracy": 0.8472265362739563, "num_tokens": 716089189.0, "step": 21375 }, { "epoch": 1.2762655205348614, "grad_norm": 0.5391858220100403, "learning_rate": 1.927992539704331e-05, "loss": 0.4677, "mean_token_accuracy": 0.8453477263450623, "num_tokens": 716256869.0, "step": 21380 }, { "epoch": 1.2765639923591214, "grad_norm": 0.5120289325714111, "learning_rate": 1.92695896338486e-05, "loss": 0.4166, "mean_token_accuracy": 0.8610879063606263, "num_tokens": 716424549.0, "step": 21385 }, { "epoch": 1.276862464183381, "grad_norm": 0.5688652992248535, "learning_rate": 1.9259255875322246e-05, "loss": 0.4832, "mean_token_accuracy": 0.8419386863708496, "num_tokens": 716589499.0, "step": 21390 }, { "epoch": 1.2771609360076408, "grad_norm": 0.5396602153778076, "learning_rate": 1.9248924123981233e-05, "loss": 0.4598, "mean_token_accuracy": 0.8455326437950135, "num_tokens": 716757179.0, "step": 21395 }, { "epoch": 1.2774594078319006, "grad_norm": 0.5797765851020813, "learning_rate": 1.9238594382342046e-05, "loss": 0.4571, "mean_token_accuracy": 0.8484671473503113, "num_tokens": 716924859.0, "step": 21400 }, { "epoch": 1.2777578796561604, "grad_norm": 0.5455407500267029, "learning_rate": 1.9228266652920684e-05, "loss": 0.449, "mean_token_accuracy": 0.8498537302017212, "num_tokens": 717087609.0, "step": 21405 }, { "epoch": 1.2780563514804202, "grad_norm": 0.522078812122345, "learning_rate": 1.921794093823266e-05, "loss": 0.4482, "mean_token_accuracy": 0.8496779203414917, "num_tokens": 717255289.0, "step": 21410 }, { "epoch": 1.27835482330468, "grad_norm": 0.4904731214046478, "learning_rate": 1.9207617240792987e-05, "loss": 0.4671, "mean_token_accuracy": 0.8468030571937561, "num_tokens": 717422969.0, "step": 21415 }, { "epoch": 1.2786532951289398, "grad_norm": 0.5107303857803345, "learning_rate": 1.9197295563116204e-05, "loss": 0.4187, "mean_token_accuracy": 0.8584695219993591, "num_tokens": 717590649.0, "step": 21420 }, { "epoch": 1.2789517669531996, "grad_norm": 0.5600069761276245, "learning_rate": 1.9186975907716325e-05, "loss": 0.4638, "mean_token_accuracy": 0.8463199257850647, "num_tokens": 717758329.0, "step": 21425 }, { "epoch": 1.2792502387774594, "grad_norm": 0.5263044834136963, "learning_rate": 1.91766582771069e-05, "loss": 0.4256, "mean_token_accuracy": 0.8587987780570984, "num_tokens": 717923781.0, "step": 21430 }, { "epoch": 1.2795487106017192, "grad_norm": 0.5339877605438232, "learning_rate": 1.9166342673801003e-05, "loss": 0.4151, "mean_token_accuracy": 0.859775722026825, "num_tokens": 718091461.0, "step": 21435 }, { "epoch": 1.279847182425979, "grad_norm": 0.48792290687561035, "learning_rate": 1.9156029100311144e-05, "loss": 0.3939, "mean_token_accuracy": 0.8680245757102967, "num_tokens": 718259141.0, "step": 21440 }, { "epoch": 1.2801456542502387, "grad_norm": 0.5102375149726868, "learning_rate": 1.9145717559149435e-05, "loss": 0.5101, "mean_token_accuracy": 0.8314744234085083, "num_tokens": 718426821.0, "step": 21445 }, { "epoch": 1.2804441260744985, "grad_norm": 0.5072603225708008, "learning_rate": 1.9135408052827397e-05, "loss": 0.4219, "mean_token_accuracy": 0.8588930010795593, "num_tokens": 718594501.0, "step": 21450 }, { "epoch": 1.2807425978987583, "grad_norm": 0.4996205270290375, "learning_rate": 1.9125100583856135e-05, "loss": 0.4544, "mean_token_accuracy": 0.8499343991279602, "num_tokens": 718762181.0, "step": 21455 }, { "epoch": 1.2810410697230181, "grad_norm": 0.473214328289032, "learning_rate": 1.9114795154746202e-05, "loss": 0.4312, "mean_token_accuracy": 0.855075740814209, "num_tokens": 718929861.0, "step": 21460 }, { "epoch": 1.281339541547278, "grad_norm": 0.505867600440979, "learning_rate": 1.9104491768007682e-05, "loss": 0.449, "mean_token_accuracy": 0.8504175186157227, "num_tokens": 719097541.0, "step": 21465 }, { "epoch": 1.2816380133715377, "grad_norm": 0.4916333854198456, "learning_rate": 1.9094190426150153e-05, "loss": 0.4545, "mean_token_accuracy": 0.8483060956001282, "num_tokens": 719265221.0, "step": 21470 }, { "epoch": 1.2819364851957975, "grad_norm": 0.5311673879623413, "learning_rate": 1.9083891131682707e-05, "loss": 0.4435, "mean_token_accuracy": 0.853071677684784, "num_tokens": 719432901.0, "step": 21475 }, { "epoch": 1.2822349570200573, "grad_norm": 0.5119551420211792, "learning_rate": 1.9073593887113917e-05, "loss": 0.4425, "mean_token_accuracy": 0.852493143081665, "num_tokens": 719600581.0, "step": 21480 }, { "epoch": 1.282533428844317, "grad_norm": 0.5507540702819824, "learning_rate": 1.9063298694951863e-05, "loss": 0.4505, "mean_token_accuracy": 0.8495824813842774, "num_tokens": 719768261.0, "step": 21485 }, { "epoch": 1.2828319006685769, "grad_norm": 0.47302794456481934, "learning_rate": 1.9053005557704133e-05, "loss": 0.4455, "mean_token_accuracy": 0.8505964517593384, "num_tokens": 719935941.0, "step": 21490 }, { "epoch": 1.2831303724928367, "grad_norm": 0.5300539135932922, "learning_rate": 1.9042714477877827e-05, "loss": 0.4364, "mean_token_accuracy": 0.8546403408050537, "num_tokens": 720103621.0, "step": 21495 }, { "epoch": 1.2834288443170965, "grad_norm": 0.47543445229530334, "learning_rate": 1.9032425457979503e-05, "loss": 0.4324, "mean_token_accuracy": 0.8566324710845947, "num_tokens": 720271301.0, "step": 21500 }, { "epoch": 1.2837273161413563, "grad_norm": 0.5631353855133057, "learning_rate": 1.9022138500515264e-05, "loss": 0.4496, "mean_token_accuracy": 0.8516819715499878, "num_tokens": 720438981.0, "step": 21505 }, { "epoch": 1.284025787965616, "grad_norm": 0.46531927585601807, "learning_rate": 1.901185360799066e-05, "loss": 0.4203, "mean_token_accuracy": 0.859060001373291, "num_tokens": 720606661.0, "step": 21510 }, { "epoch": 1.2843242597898759, "grad_norm": 0.4696912467479706, "learning_rate": 1.9001570782910787e-05, "loss": 0.4612, "mean_token_accuracy": 0.8484373211860656, "num_tokens": 720774341.0, "step": 21515 }, { "epoch": 1.2846227316141356, "grad_norm": 0.4956842362880707, "learning_rate": 1.8991290027780222e-05, "loss": 0.457, "mean_token_accuracy": 0.8481569766998291, "num_tokens": 720942021.0, "step": 21520 }, { "epoch": 1.2849212034383954, "grad_norm": 0.4649125933647156, "learning_rate": 1.8981011345103015e-05, "loss": 0.4805, "mean_token_accuracy": 0.841727864742279, "num_tokens": 721102300.0, "step": 21525 }, { "epoch": 1.2852196752626552, "grad_norm": 0.47681552171707153, "learning_rate": 1.8970734737382744e-05, "loss": 0.4417, "mean_token_accuracy": 0.8522963285446167, "num_tokens": 721269980.0, "step": 21530 }, { "epoch": 1.285518147086915, "grad_norm": 0.4758831858634949, "learning_rate": 1.896046020712246e-05, "loss": 0.4505, "mean_token_accuracy": 0.8505964398384094, "num_tokens": 721437660.0, "step": 21535 }, { "epoch": 1.2858166189111748, "grad_norm": 0.595343828201294, "learning_rate": 1.8950187756824705e-05, "loss": 0.5001, "mean_token_accuracy": 0.8331444501876831, "num_tokens": 721605340.0, "step": 21540 }, { "epoch": 1.2861150907354346, "grad_norm": 0.6689709424972534, "learning_rate": 1.8939917388991547e-05, "loss": 0.4768, "mean_token_accuracy": 0.8417451977729797, "num_tokens": 721773020.0, "step": 21545 }, { "epoch": 1.2864135625596944, "grad_norm": 0.5414208173751831, "learning_rate": 1.8929649106124497e-05, "loss": 0.4289, "mean_token_accuracy": 0.8577716827392579, "num_tokens": 721940700.0, "step": 21550 }, { "epoch": 1.2867120343839542, "grad_norm": 0.48877057433128357, "learning_rate": 1.891938291072461e-05, "loss": 0.4317, "mean_token_accuracy": 0.8560479402542114, "num_tokens": 722108380.0, "step": 21555 }, { "epoch": 1.287010506208214, "grad_norm": 0.49305394291877747, "learning_rate": 1.8909118805292388e-05, "loss": 0.4808, "mean_token_accuracy": 0.8400811076164245, "num_tokens": 722276060.0, "step": 21560 }, { "epoch": 1.2873089780324738, "grad_norm": 0.482348769903183, "learning_rate": 1.8898856792327852e-05, "loss": 0.4667, "mean_token_accuracy": 0.8455624461174012, "num_tokens": 722443740.0, "step": 21565 }, { "epoch": 1.2876074498567336, "grad_norm": 0.5003317594528198, "learning_rate": 1.88885968743305e-05, "loss": 0.3979, "mean_token_accuracy": 0.8663604855537415, "num_tokens": 722611420.0, "step": 21570 }, { "epoch": 1.2879059216809934, "grad_norm": 0.5651480555534363, "learning_rate": 1.887833905379932e-05, "loss": 0.477, "mean_token_accuracy": 0.8425503849983216, "num_tokens": 722779100.0, "step": 21575 }, { "epoch": 1.2882043935052532, "grad_norm": 0.4775930345058441, "learning_rate": 1.8868083333232817e-05, "loss": 0.4061, "mean_token_accuracy": 0.8624179840087891, "num_tokens": 722946780.0, "step": 21580 }, { "epoch": 1.288502865329513, "grad_norm": 0.5129034519195557, "learning_rate": 1.885782971512894e-05, "loss": 0.4584, "mean_token_accuracy": 0.8483120560646057, "num_tokens": 723114460.0, "step": 21585 }, { "epoch": 1.2888013371537728, "grad_norm": 0.5143440961837769, "learning_rate": 1.884757820198515e-05, "loss": 0.4386, "mean_token_accuracy": 0.8527138233184814, "num_tokens": 723282140.0, "step": 21590 }, { "epoch": 1.2890998089780323, "grad_norm": 0.4904709756374359, "learning_rate": 1.883732879629839e-05, "loss": 0.4381, "mean_token_accuracy": 0.8531909823417664, "num_tokens": 723449820.0, "step": 21595 }, { "epoch": 1.2893982808022924, "grad_norm": 0.5331194996833801, "learning_rate": 1.8827081500565104e-05, "loss": 0.4399, "mean_token_accuracy": 0.8531492352485657, "num_tokens": 723617500.0, "step": 21600 }, { "epoch": 1.289696752626552, "grad_norm": 0.5755446553230286, "learning_rate": 1.8816836317281206e-05, "loss": 0.4565, "mean_token_accuracy": 0.8476082563400269, "num_tokens": 723785180.0, "step": 21605 }, { "epoch": 1.289995224450812, "grad_norm": 0.5003317594528198, "learning_rate": 1.8806593248942083e-05, "loss": 0.417, "mean_token_accuracy": 0.8610342383384705, "num_tokens": 723952860.0, "step": 21610 }, { "epoch": 1.2902936962750715, "grad_norm": 0.5023718476295471, "learning_rate": 1.8796352298042648e-05, "loss": 0.4358, "mean_token_accuracy": 0.8536502480506897, "num_tokens": 724120540.0, "step": 21615 }, { "epoch": 1.2905921680993315, "grad_norm": 0.5326288938522339, "learning_rate": 1.8786113467077256e-05, "loss": 0.4512, "mean_token_accuracy": 0.8493319749832153, "num_tokens": 724288220.0, "step": 21620 }, { "epoch": 1.290890639923591, "grad_norm": 0.7071923017501831, "learning_rate": 1.877587675853976e-05, "loss": 0.4514, "mean_token_accuracy": 0.8503578543663025, "num_tokens": 724455900.0, "step": 21625 }, { "epoch": 1.2911891117478511, "grad_norm": 0.7134477496147156, "learning_rate": 1.8765642174923525e-05, "loss": 0.4672, "mean_token_accuracy": 0.8470975160598755, "num_tokens": 724619592.0, "step": 21630 }, { "epoch": 1.2914875835721107, "grad_norm": 0.4384186267852783, "learning_rate": 1.8755409718721338e-05, "loss": 0.4575, "mean_token_accuracy": 0.8496779203414917, "num_tokens": 724787272.0, "step": 21635 }, { "epoch": 1.2917860553963707, "grad_norm": 0.44683679938316345, "learning_rate": 1.8745179392425544e-05, "loss": 0.4262, "mean_token_accuracy": 0.8579386830329895, "num_tokens": 724954952.0, "step": 21640 }, { "epoch": 1.2920845272206303, "grad_norm": 0.5542892217636108, "learning_rate": 1.8734951198527882e-05, "loss": 0.4468, "mean_token_accuracy": 0.8526363015174866, "num_tokens": 725122632.0, "step": 21645 }, { "epoch": 1.2923829990448903, "grad_norm": 0.5385172367095947, "learning_rate": 1.872472513951965e-05, "loss": 0.464, "mean_token_accuracy": 0.846528697013855, "num_tokens": 725290312.0, "step": 21650 }, { "epoch": 1.2926814708691499, "grad_norm": 0.5000436902046204, "learning_rate": 1.871450121789159e-05, "loss": 0.445, "mean_token_accuracy": 0.8502326250076294, "num_tokens": 725457992.0, "step": 21655 }, { "epoch": 1.2929799426934099, "grad_norm": 0.4647788405418396, "learning_rate": 1.870427943613391e-05, "loss": 0.3889, "mean_token_accuracy": 0.8675832033157349, "num_tokens": 725625672.0, "step": 21660 }, { "epoch": 1.2932784145176695, "grad_norm": 0.5435347557067871, "learning_rate": 1.869405979673633e-05, "loss": 0.4466, "mean_token_accuracy": 0.8517237305641174, "num_tokens": 725793352.0, "step": 21665 }, { "epoch": 1.2935768863419292, "grad_norm": 0.5936532020568848, "learning_rate": 1.8683842302188013e-05, "loss": 0.4834, "mean_token_accuracy": 0.8409936785697937, "num_tokens": 725961032.0, "step": 21670 }, { "epoch": 1.293875358166189, "grad_norm": 0.4887048006057739, "learning_rate": 1.8673626954977645e-05, "loss": 0.4423, "mean_token_accuracy": 0.8525050640106201, "num_tokens": 726128712.0, "step": 21675 }, { "epoch": 1.2941738299904488, "grad_norm": 0.5274898409843445, "learning_rate": 1.8663413757593346e-05, "loss": 0.4511, "mean_token_accuracy": 0.8489383339881897, "num_tokens": 726296392.0, "step": 21680 }, { "epoch": 1.2944723018147086, "grad_norm": 0.4889707863330841, "learning_rate": 1.8653202712522728e-05, "loss": 0.4133, "mean_token_accuracy": 0.8612608909606934, "num_tokens": 726464072.0, "step": 21685 }, { "epoch": 1.2947707736389684, "grad_norm": 0.48489663004875183, "learning_rate": 1.864299382225288e-05, "loss": 0.4464, "mean_token_accuracy": 0.852982223033905, "num_tokens": 726631752.0, "step": 21690 }, { "epoch": 1.2950692454632282, "grad_norm": 0.5408115386962891, "learning_rate": 1.8632787089270374e-05, "loss": 0.4572, "mean_token_accuracy": 0.8479840159416199, "num_tokens": 726799432.0, "step": 21695 }, { "epoch": 1.295367717287488, "grad_norm": 0.48788145184516907, "learning_rate": 1.8622582516061245e-05, "loss": 0.4116, "mean_token_accuracy": 0.8613921046257019, "num_tokens": 726967112.0, "step": 21700 }, { "epoch": 1.2956661891117478, "grad_norm": 0.5170052647590637, "learning_rate": 1.861238010511099e-05, "loss": 0.4405, "mean_token_accuracy": 0.8541631817817688, "num_tokens": 727134792.0, "step": 21705 }, { "epoch": 1.2959646609360076, "grad_norm": 0.5319960117340088, "learning_rate": 1.860217985890461e-05, "loss": 0.4656, "mean_token_accuracy": 0.8464809775352478, "num_tokens": 727302472.0, "step": 21710 }, { "epoch": 1.2962631327602674, "grad_norm": 0.6124706268310547, "learning_rate": 1.8591981779926564e-05, "loss": 0.4578, "mean_token_accuracy": 0.8469462156295776, "num_tokens": 727470152.0, "step": 21715 }, { "epoch": 1.2965616045845272, "grad_norm": 0.5003710985183716, "learning_rate": 1.8581785870660783e-05, "loss": 0.4088, "mean_token_accuracy": 0.863145649433136, "num_tokens": 727637832.0, "step": 21720 }, { "epoch": 1.296860076408787, "grad_norm": 0.5208696126937866, "learning_rate": 1.857159213359066e-05, "loss": 0.4482, "mean_token_accuracy": 0.849761426448822, "num_tokens": 727805512.0, "step": 21725 }, { "epoch": 1.2971585482330468, "grad_norm": 0.5592601299285889, "learning_rate": 1.8561400571199072e-05, "loss": 0.4578, "mean_token_accuracy": 0.8479124426841735, "num_tokens": 727973192.0, "step": 21730 }, { "epoch": 1.2974570200573066, "grad_norm": 0.4732402265071869, "learning_rate": 1.8551211185968363e-05, "loss": 0.4341, "mean_token_accuracy": 0.8565788030624389, "num_tokens": 728140872.0, "step": 21735 }, { "epoch": 1.2977554918815664, "grad_norm": 0.5079814195632935, "learning_rate": 1.8541023980380352e-05, "loss": 0.4462, "mean_token_accuracy": 0.8484015226364136, "num_tokens": 728308552.0, "step": 21740 }, { "epoch": 1.2980539637058262, "grad_norm": 0.47837844491004944, "learning_rate": 1.85308389569163e-05, "loss": 0.4492, "mean_token_accuracy": 0.8500894784927369, "num_tokens": 728476232.0, "step": 21745 }, { "epoch": 1.298352435530086, "grad_norm": 0.512665331363678, "learning_rate": 1.852065611805699e-05, "loss": 0.4669, "mean_token_accuracy": 0.843710470199585, "num_tokens": 728642001.0, "step": 21750 }, { "epoch": 1.2986509073543457, "grad_norm": 0.4837343096733093, "learning_rate": 1.851047546628261e-05, "loss": 0.4301, "mean_token_accuracy": 0.8560479521751404, "num_tokens": 728809681.0, "step": 21755 }, { "epoch": 1.2989493791786055, "grad_norm": 0.478515088558197, "learning_rate": 1.8500297004072854e-05, "loss": 0.4292, "mean_token_accuracy": 0.8570082187652588, "num_tokens": 728977361.0, "step": 21760 }, { "epoch": 1.2992478510028653, "grad_norm": 0.5688692927360535, "learning_rate": 1.8490120733906892e-05, "loss": 0.449, "mean_token_accuracy": 0.8524573683738709, "num_tokens": 729145041.0, "step": 21765 }, { "epoch": 1.2995463228271251, "grad_norm": 0.5579441785812378, "learning_rate": 1.8479946658263315e-05, "loss": 0.4667, "mean_token_accuracy": 0.8456101655960083, "num_tokens": 729312721.0, "step": 21770 }, { "epoch": 1.299844794651385, "grad_norm": 0.44270285964012146, "learning_rate": 1.846977477962023e-05, "loss": 0.4389, "mean_token_accuracy": 0.8549266338348389, "num_tokens": 729480401.0, "step": 21775 }, { "epoch": 1.3001432664756447, "grad_norm": 0.5454878211021423, "learning_rate": 1.845960510045517e-05, "loss": 0.4245, "mean_token_accuracy": 0.8591613888740539, "num_tokens": 729648081.0, "step": 21780 }, { "epoch": 1.3004417382999045, "grad_norm": 0.46221116185188293, "learning_rate": 1.8449437623245163e-05, "loss": 0.4448, "mean_token_accuracy": 0.852117383480072, "num_tokens": 729815761.0, "step": 21785 }, { "epoch": 1.3007402101241643, "grad_norm": 0.47009992599487305, "learning_rate": 1.8439272350466663e-05, "loss": 0.4326, "mean_token_accuracy": 0.8562328577041626, "num_tokens": 729983441.0, "step": 21790 }, { "epoch": 1.301038681948424, "grad_norm": 0.6485724449157715, "learning_rate": 1.842910928459563e-05, "loss": 0.5059, "mean_token_accuracy": 0.8340272068977356, "num_tokens": 730151121.0, "step": 21795 }, { "epoch": 1.3013371537726839, "grad_norm": 0.508536159992218, "learning_rate": 1.841894842810747e-05, "loss": 0.4614, "mean_token_accuracy": 0.847005832195282, "num_tokens": 730318801.0, "step": 21800 }, { "epoch": 1.3016356255969437, "grad_norm": 0.5798758268356323, "learning_rate": 1.840878978347702e-05, "loss": 0.4581, "mean_token_accuracy": 0.847757351398468, "num_tokens": 730486481.0, "step": 21805 }, { "epoch": 1.3019340974212035, "grad_norm": 0.48340994119644165, "learning_rate": 1.839863335317864e-05, "loss": 0.403, "mean_token_accuracy": 0.8640403032302857, "num_tokens": 730654161.0, "step": 21810 }, { "epoch": 1.3022325692454633, "grad_norm": 0.5189979672431946, "learning_rate": 1.8388479139686088e-05, "loss": 0.4575, "mean_token_accuracy": 0.8482285499572754, "num_tokens": 730821841.0, "step": 21815 }, { "epoch": 1.302531041069723, "grad_norm": 0.49118292331695557, "learning_rate": 1.8378327145472624e-05, "loss": 0.4336, "mean_token_accuracy": 0.8549564599990844, "num_tokens": 730989521.0, "step": 21820 }, { "epoch": 1.3028295128939829, "grad_norm": 0.5009886026382446, "learning_rate": 1.8368177373010954e-05, "loss": 0.438, "mean_token_accuracy": 0.8536144495010376, "num_tokens": 731157201.0, "step": 21825 }, { "epoch": 1.3031279847182426, "grad_norm": 0.5465167760848999, "learning_rate": 1.8358029824773225e-05, "loss": 0.4667, "mean_token_accuracy": 0.8434390902519227, "num_tokens": 731324881.0, "step": 21830 }, { "epoch": 1.3034264565425024, "grad_norm": 0.5582911372184753, "learning_rate": 1.8347884503231094e-05, "loss": 0.4551, "mean_token_accuracy": 0.848270308971405, "num_tokens": 731492561.0, "step": 21835 }, { "epoch": 1.3037249283667622, "grad_norm": 0.4791560173034668, "learning_rate": 1.8337741410855602e-05, "loss": 0.4653, "mean_token_accuracy": 0.8467970967292786, "num_tokens": 731660241.0, "step": 21840 }, { "epoch": 1.304023400191022, "grad_norm": 0.4999723434448242, "learning_rate": 1.832760055011731e-05, "loss": 0.4595, "mean_token_accuracy": 0.8481748700141907, "num_tokens": 731827921.0, "step": 21845 }, { "epoch": 1.3043218720152818, "grad_norm": 0.5018539428710938, "learning_rate": 1.8317461923486218e-05, "loss": 0.4263, "mean_token_accuracy": 0.8575450301170349, "num_tokens": 731995601.0, "step": 21850 }, { "epoch": 1.3046203438395416, "grad_norm": 0.4244856834411621, "learning_rate": 1.8307325533431757e-05, "loss": 0.4258, "mean_token_accuracy": 0.8578730702400208, "num_tokens": 732163281.0, "step": 21855 }, { "epoch": 1.3049188156638014, "grad_norm": 0.5131832361221313, "learning_rate": 1.8297191382422847e-05, "loss": 0.4405, "mean_token_accuracy": 0.8525169968605042, "num_tokens": 732330961.0, "step": 21860 }, { "epoch": 1.3052172874880612, "grad_norm": 0.4526806175708771, "learning_rate": 1.828705947292783e-05, "loss": 0.4192, "mean_token_accuracy": 0.8599367737770081, "num_tokens": 732498641.0, "step": 21865 }, { "epoch": 1.3055157593123208, "grad_norm": 0.4389190375804901, "learning_rate": 1.827692980741454e-05, "loss": 0.4107, "mean_token_accuracy": 0.8621078372001648, "num_tokens": 732666321.0, "step": 21870 }, { "epoch": 1.3058142311365808, "grad_norm": 0.5271827578544617, "learning_rate": 1.8266802388350225e-05, "loss": 0.4548, "mean_token_accuracy": 0.8494453072547913, "num_tokens": 732834001.0, "step": 21875 }, { "epoch": 1.3061127029608404, "grad_norm": 0.49042731523513794, "learning_rate": 1.825667721820161e-05, "loss": 0.4332, "mean_token_accuracy": 0.8544375658035278, "num_tokens": 733001681.0, "step": 21880 }, { "epoch": 1.3064111747851004, "grad_norm": 0.5099753141403198, "learning_rate": 1.8246554299434888e-05, "loss": 0.4562, "mean_token_accuracy": 0.8488906025886536, "num_tokens": 733169361.0, "step": 21885 }, { "epoch": 1.30670964660936, "grad_norm": 0.5492095351219177, "learning_rate": 1.8236433634515648e-05, "loss": 0.4438, "mean_token_accuracy": 0.8514851570129395, "num_tokens": 733337041.0, "step": 21890 }, { "epoch": 1.30700811843362, "grad_norm": 0.4917539358139038, "learning_rate": 1.8226315225908993e-05, "loss": 0.4481, "mean_token_accuracy": 0.8518966913223267, "num_tokens": 733504721.0, "step": 21895 }, { "epoch": 1.3073065902578795, "grad_norm": 0.6124330759048462, "learning_rate": 1.8216199076079426e-05, "loss": 0.4697, "mean_token_accuracy": 0.8449481129646301, "num_tokens": 733672401.0, "step": 21900 }, { "epoch": 1.3076050620821396, "grad_norm": 0.496549129486084, "learning_rate": 1.820608518749094e-05, "loss": 0.4233, "mean_token_accuracy": 0.8578551769256592, "num_tokens": 733840081.0, "step": 21905 }, { "epoch": 1.3079035339063991, "grad_norm": 0.5200099945068359, "learning_rate": 1.819597356260696e-05, "loss": 0.4489, "mean_token_accuracy": 0.8500178933143616, "num_tokens": 734007761.0, "step": 21910 }, { "epoch": 1.3082020057306591, "grad_norm": 0.48620837926864624, "learning_rate": 1.8185864203890334e-05, "loss": 0.4314, "mean_token_accuracy": 0.8548789262771607, "num_tokens": 734175441.0, "step": 21915 }, { "epoch": 1.3085004775549187, "grad_norm": 0.4823870360851288, "learning_rate": 1.8175757113803415e-05, "loss": 0.4762, "mean_token_accuracy": 0.8423535823822021, "num_tokens": 734343121.0, "step": 21920 }, { "epoch": 1.3087989493791787, "grad_norm": 0.4812258780002594, "learning_rate": 1.816565229480795e-05, "loss": 0.4381, "mean_token_accuracy": 0.8525587439537048, "num_tokens": 734510801.0, "step": 21925 }, { "epoch": 1.3090974212034383, "grad_norm": 0.5481657385826111, "learning_rate": 1.8155549749365163e-05, "loss": 0.4539, "mean_token_accuracy": 0.8486699342727662, "num_tokens": 734678481.0, "step": 21930 }, { "epoch": 1.3093958930276983, "grad_norm": 0.4960908889770508, "learning_rate": 1.8145449479935717e-05, "loss": 0.4391, "mean_token_accuracy": 0.8538112878799439, "num_tokens": 734846161.0, "step": 21935 }, { "epoch": 1.309694364851958, "grad_norm": 0.5120190382003784, "learning_rate": 1.8135351488979708e-05, "loss": 0.4611, "mean_token_accuracy": 0.8461946845054626, "num_tokens": 735013841.0, "step": 21940 }, { "epoch": 1.3099928366762177, "grad_norm": 0.5206858515739441, "learning_rate": 1.8125255778956714e-05, "loss": 0.4319, "mean_token_accuracy": 0.8559882998466491, "num_tokens": 735181521.0, "step": 21945 }, { "epoch": 1.3102913085004775, "grad_norm": 0.5164458155632019, "learning_rate": 1.811516235232568e-05, "loss": 0.4381, "mean_token_accuracy": 0.8545511484146118, "num_tokens": 735344176.0, "step": 21950 }, { "epoch": 1.3105897803247373, "grad_norm": 0.48718783259391785, "learning_rate": 1.81050712115451e-05, "loss": 0.4282, "mean_token_accuracy": 0.8561851382255554, "num_tokens": 735511856.0, "step": 21955 }, { "epoch": 1.310888252148997, "grad_norm": 0.5037433505058289, "learning_rate": 1.8094982359072838e-05, "loss": 0.4649, "mean_token_accuracy": 0.8462185382843017, "num_tokens": 735679536.0, "step": 21960 }, { "epoch": 1.3111867239732569, "grad_norm": 0.5409345030784607, "learning_rate": 1.8084895797366215e-05, "loss": 0.4516, "mean_token_accuracy": 0.8509543061256408, "num_tokens": 735847216.0, "step": 21965 }, { "epoch": 1.3114851957975167, "grad_norm": 0.4389113485813141, "learning_rate": 1.8074811528882007e-05, "loss": 0.4206, "mean_token_accuracy": 0.8593583345413208, "num_tokens": 736010296.0, "step": 21970 }, { "epoch": 1.3117836676217765, "grad_norm": 0.48540887236595154, "learning_rate": 1.8064729556076414e-05, "loss": 0.4617, "mean_token_accuracy": 0.8471907615661621, "num_tokens": 736177976.0, "step": 21975 }, { "epoch": 1.3120821394460362, "grad_norm": 0.5117364525794983, "learning_rate": 1.80546498814051e-05, "loss": 0.458, "mean_token_accuracy": 0.8476500034332275, "num_tokens": 736345656.0, "step": 21980 }, { "epoch": 1.312380611270296, "grad_norm": 0.5079487562179565, "learning_rate": 1.804457250732315e-05, "loss": 0.4714, "mean_token_accuracy": 0.8450912475585938, "num_tokens": 736513336.0, "step": 21985 }, { "epoch": 1.3126790830945558, "grad_norm": 0.47358182072639465, "learning_rate": 1.8034497436285082e-05, "loss": 0.4245, "mean_token_accuracy": 0.8578492164611816, "num_tokens": 736681016.0, "step": 21990 }, { "epoch": 1.3129775549188156, "grad_norm": 0.4516245126724243, "learning_rate": 1.802442467074488e-05, "loss": 0.4244, "mean_token_accuracy": 0.8607658386230469, "num_tokens": 736848696.0, "step": 21995 }, { "epoch": 1.3132760267430754, "grad_norm": 0.4859462082386017, "learning_rate": 1.801435421315595e-05, "loss": 0.4249, "mean_token_accuracy": 0.8593283891677856, "num_tokens": 737016376.0, "step": 22000 }, { "epoch": 1.3135744985673352, "grad_norm": 0.5580401420593262, "learning_rate": 1.8004286065971142e-05, "loss": 0.4215, "mean_token_accuracy": 0.858069896697998, "num_tokens": 737184056.0, "step": 22005 }, { "epoch": 1.313872970391595, "grad_norm": 0.5447243452072144, "learning_rate": 1.799422023164272e-05, "loss": 0.4324, "mean_token_accuracy": 0.8559942722320557, "num_tokens": 737351736.0, "step": 22010 }, { "epoch": 1.3141714422158548, "grad_norm": 0.5117738246917725, "learning_rate": 1.7984156712622425e-05, "loss": 0.4404, "mean_token_accuracy": 0.8534892082214356, "num_tokens": 737519416.0, "step": 22015 }, { "epoch": 1.3144699140401146, "grad_norm": 0.5081394910812378, "learning_rate": 1.7974095511361418e-05, "loss": 0.4452, "mean_token_accuracy": 0.8518430233001709, "num_tokens": 737687096.0, "step": 22020 }, { "epoch": 1.3147683858643744, "grad_norm": 0.537953794002533, "learning_rate": 1.7964036630310262e-05, "loss": 0.4611, "mean_token_accuracy": 0.8462543249130249, "num_tokens": 737854776.0, "step": 22025 }, { "epoch": 1.3150668576886342, "grad_norm": 0.4828946590423584, "learning_rate": 1.7953980071918996e-05, "loss": 0.4757, "mean_token_accuracy": 0.8448586463928223, "num_tokens": 738022456.0, "step": 22030 }, { "epoch": 1.315365329512894, "grad_norm": 0.465561181306839, "learning_rate": 1.7943925838637087e-05, "loss": 0.4255, "mean_token_accuracy": 0.8582488536834717, "num_tokens": 738190136.0, "step": 22035 }, { "epoch": 1.3156638013371538, "grad_norm": 0.5207686424255371, "learning_rate": 1.7933873932913425e-05, "loss": 0.4227, "mean_token_accuracy": 0.8588572025299073, "num_tokens": 738357816.0, "step": 22040 }, { "epoch": 1.3159622731614136, "grad_norm": 0.512942910194397, "learning_rate": 1.7923824357196343e-05, "loss": 0.4669, "mean_token_accuracy": 0.8468805909156799, "num_tokens": 738525496.0, "step": 22045 }, { "epoch": 1.3162607449856734, "grad_norm": 0.5877902507781982, "learning_rate": 1.7913777113933594e-05, "loss": 0.4335, "mean_token_accuracy": 0.8556602597236633, "num_tokens": 738693176.0, "step": 22050 }, { "epoch": 1.3165592168099332, "grad_norm": 0.5309934616088867, "learning_rate": 1.7903732205572366e-05, "loss": 0.4474, "mean_token_accuracy": 0.8513300776481628, "num_tokens": 738860856.0, "step": 22055 }, { "epoch": 1.316857688634193, "grad_norm": 0.5761898159980774, "learning_rate": 1.7893689634559284e-05, "loss": 0.4621, "mean_token_accuracy": 0.8478170156478881, "num_tokens": 739028536.0, "step": 22060 }, { "epoch": 1.3171561604584527, "grad_norm": 0.4726199805736542, "learning_rate": 1.7883649403340403e-05, "loss": 0.4165, "mean_token_accuracy": 0.8606584787368774, "num_tokens": 739196216.0, "step": 22065 }, { "epoch": 1.3174546322827125, "grad_norm": 0.5314990282058716, "learning_rate": 1.7873611514361216e-05, "loss": 0.4361, "mean_token_accuracy": 0.8551055788993835, "num_tokens": 739363896.0, "step": 22070 }, { "epoch": 1.3177531041069723, "grad_norm": 0.5096940398216248, "learning_rate": 1.7863575970066615e-05, "loss": 0.4407, "mean_token_accuracy": 0.854097580909729, "num_tokens": 739531576.0, "step": 22075 }, { "epoch": 1.3180515759312321, "grad_norm": 0.5359295606613159, "learning_rate": 1.7853542772900965e-05, "loss": 0.4504, "mean_token_accuracy": 0.8497316002845764, "num_tokens": 739699256.0, "step": 22080 }, { "epoch": 1.318350047755492, "grad_norm": 0.49495822191238403, "learning_rate": 1.784351192530802e-05, "loss": 0.449, "mean_token_accuracy": 0.8510258793830872, "num_tokens": 739866936.0, "step": 22085 }, { "epoch": 1.3186485195797517, "grad_norm": 0.472333163022995, "learning_rate": 1.7833483429730986e-05, "loss": 0.4204, "mean_token_accuracy": 0.8596803069114685, "num_tokens": 740034616.0, "step": 22090 }, { "epoch": 1.3189469914040115, "grad_norm": 0.4684799015522003, "learning_rate": 1.782345728861249e-05, "loss": 0.4251, "mean_token_accuracy": 0.8584158420562744, "num_tokens": 740202296.0, "step": 22095 }, { "epoch": 1.3192454632282713, "grad_norm": 0.4827628433704376, "learning_rate": 1.7813433504394572e-05, "loss": 0.4281, "mean_token_accuracy": 0.8565489649772644, "num_tokens": 740369976.0, "step": 22100 }, { "epoch": 1.319543935052531, "grad_norm": 0.5313222408294678, "learning_rate": 1.7803412079518727e-05, "loss": 0.4746, "mean_token_accuracy": 0.8430335283279419, "num_tokens": 740537656.0, "step": 22105 }, { "epoch": 1.3198424068767909, "grad_norm": 0.5034235715866089, "learning_rate": 1.779339301642584e-05, "loss": 0.4545, "mean_token_accuracy": 0.8488369345664978, "num_tokens": 740705336.0, "step": 22110 }, { "epoch": 1.3201408787010507, "grad_norm": 0.5767083168029785, "learning_rate": 1.778337631755625e-05, "loss": 0.4408, "mean_token_accuracy": 0.8530180096626282, "num_tokens": 740873016.0, "step": 22115 }, { "epoch": 1.3204393505253105, "grad_norm": 0.5320366621017456, "learning_rate": 1.7773361985349706e-05, "loss": 0.4366, "mean_token_accuracy": 0.8537098884582519, "num_tokens": 741040696.0, "step": 22120 }, { "epoch": 1.3207378223495703, "grad_norm": 0.5269381403923035, "learning_rate": 1.7763350022245386e-05, "loss": 0.4165, "mean_token_accuracy": 0.8591792941093445, "num_tokens": 741208376.0, "step": 22125 }, { "epoch": 1.32103629417383, "grad_norm": 0.54883873462677, "learning_rate": 1.7753340430681892e-05, "loss": 0.4405, "mean_token_accuracy": 0.8532983422279358, "num_tokens": 741376056.0, "step": 22130 }, { "epoch": 1.3213347659980899, "grad_norm": 0.4980525076389313, "learning_rate": 1.774333321309722e-05, "loss": 0.4314, "mean_token_accuracy": 0.8546761274337769, "num_tokens": 741543736.0, "step": 22135 }, { "epoch": 1.3216332378223496, "grad_norm": 0.5273322463035583, "learning_rate": 1.773332837192885e-05, "loss": 0.4467, "mean_token_accuracy": 0.8511451840400696, "num_tokens": 741711416.0, "step": 22140 }, { "epoch": 1.3219317096466094, "grad_norm": 0.4954814612865448, "learning_rate": 1.7723325909613614e-05, "loss": 0.4465, "mean_token_accuracy": 0.8516461968421936, "num_tokens": 741879096.0, "step": 22145 }, { "epoch": 1.3222301814708692, "grad_norm": 0.4901604652404785, "learning_rate": 1.771332582858782e-05, "loss": 0.4548, "mean_token_accuracy": 0.8492007613182068, "num_tokens": 742046776.0, "step": 22150 }, { "epoch": 1.3225286532951288, "grad_norm": 0.5273316502571106, "learning_rate": 1.770332813128715e-05, "loss": 0.4485, "mean_token_accuracy": 0.8497792959213257, "num_tokens": 742214456.0, "step": 22155 }, { "epoch": 1.3228271251193888, "grad_norm": 0.5013324618339539, "learning_rate": 1.7693332820146746e-05, "loss": 0.4285, "mean_token_accuracy": 0.857055950164795, "num_tokens": 742382136.0, "step": 22160 }, { "epoch": 1.3231255969436484, "grad_norm": 0.4954442083835602, "learning_rate": 1.7683339897601154e-05, "loss": 0.4185, "mean_token_accuracy": 0.8604020118713379, "num_tokens": 742549816.0, "step": 22165 }, { "epoch": 1.3234240687679084, "grad_norm": 0.5129901170730591, "learning_rate": 1.7673349366084312e-05, "loss": 0.4497, "mean_token_accuracy": 0.8501550674438476, "num_tokens": 742717496.0, "step": 22170 }, { "epoch": 1.323722540592168, "grad_norm": 0.5030074119567871, "learning_rate": 1.7663361228029612e-05, "loss": 0.4444, "mean_token_accuracy": 0.8514970779418946, "num_tokens": 742885176.0, "step": 22175 }, { "epoch": 1.324021012416428, "grad_norm": 0.5100350379943848, "learning_rate": 1.765337548586985e-05, "loss": 0.4335, "mean_token_accuracy": 0.8553143262863159, "num_tokens": 743052856.0, "step": 22180 }, { "epoch": 1.3243194842406876, "grad_norm": 0.5419145822525024, "learning_rate": 1.7643392142037236e-05, "loss": 0.4553, "mean_token_accuracy": 0.8496600151062011, "num_tokens": 743220536.0, "step": 22185 }, { "epoch": 1.3246179560649476, "grad_norm": 0.4562287926673889, "learning_rate": 1.7633411198963407e-05, "loss": 0.4078, "mean_token_accuracy": 0.8644419193267823, "num_tokens": 743382694.0, "step": 22190 }, { "epoch": 1.3249164278892072, "grad_norm": 0.47411105036735535, "learning_rate": 1.7623432659079388e-05, "loss": 0.4758, "mean_token_accuracy": 0.8425563693046569, "num_tokens": 743550374.0, "step": 22195 }, { "epoch": 1.3252148997134672, "grad_norm": 0.4742583930492401, "learning_rate": 1.761345652481564e-05, "loss": 0.4447, "mean_token_accuracy": 0.8504473328590393, "num_tokens": 743718054.0, "step": 22200 }, { "epoch": 1.3255133715377267, "grad_norm": 0.5340882539749146, "learning_rate": 1.7603482798602045e-05, "loss": 0.4438, "mean_token_accuracy": 0.8522545576095581, "num_tokens": 743885734.0, "step": 22205 }, { "epoch": 1.3258118433619868, "grad_norm": 0.5445700287818909, "learning_rate": 1.7593511482867886e-05, "loss": 0.4315, "mean_token_accuracy": 0.855463445186615, "num_tokens": 744053414.0, "step": 22210 }, { "epoch": 1.3261103151862463, "grad_norm": 0.7758572101593018, "learning_rate": 1.7583542580041857e-05, "loss": 0.4618, "mean_token_accuracy": 0.8486937880516052, "num_tokens": 744221094.0, "step": 22215 }, { "epoch": 1.3264087870105061, "grad_norm": 0.5776500701904297, "learning_rate": 1.7573576092552063e-05, "loss": 0.4541, "mean_token_accuracy": 0.8487951874732971, "num_tokens": 744388774.0, "step": 22220 }, { "epoch": 1.326707258834766, "grad_norm": 0.5077141523361206, "learning_rate": 1.756361202282605e-05, "loss": 0.4377, "mean_token_accuracy": 0.8538888216018676, "num_tokens": 744556454.0, "step": 22225 }, { "epoch": 1.3270057306590257, "grad_norm": 0.474239319562912, "learning_rate": 1.755365037329072e-05, "loss": 0.4394, "mean_token_accuracy": 0.8527615427970886, "num_tokens": 744724134.0, "step": 22230 }, { "epoch": 1.3273042024832855, "grad_norm": 0.5360094308853149, "learning_rate": 1.7543691146372428e-05, "loss": 0.4343, "mean_token_accuracy": 0.8550459265708923, "num_tokens": 744891814.0, "step": 22235 }, { "epoch": 1.3276026743075453, "grad_norm": 0.5197581052780151, "learning_rate": 1.7533734344496933e-05, "loss": 0.427, "mean_token_accuracy": 0.8574973106384277, "num_tokens": 745059494.0, "step": 22240 }, { "epoch": 1.327901146131805, "grad_norm": 0.5478337407112122, "learning_rate": 1.7523779970089395e-05, "loss": 0.4556, "mean_token_accuracy": 0.8478110551834106, "num_tokens": 745227174.0, "step": 22245 }, { "epoch": 1.328199617956065, "grad_norm": 0.5398106575012207, "learning_rate": 1.7513828025574393e-05, "loss": 0.4642, "mean_token_accuracy": 0.8458069920539856, "num_tokens": 745394854.0, "step": 22250 }, { "epoch": 1.3284980897803247, "grad_norm": 0.5628871917724609, "learning_rate": 1.750387851337589e-05, "loss": 0.4775, "mean_token_accuracy": 0.8413515448570251, "num_tokens": 745562534.0, "step": 22255 }, { "epoch": 1.3287965616045845, "grad_norm": 0.5625954270362854, "learning_rate": 1.749393143591727e-05, "loss": 0.4382, "mean_token_accuracy": 0.8533102631568908, "num_tokens": 745730214.0, "step": 22260 }, { "epoch": 1.3290950334288443, "grad_norm": 0.4975132644176483, "learning_rate": 1.7483986795621367e-05, "loss": 0.4541, "mean_token_accuracy": 0.848634135723114, "num_tokens": 745897894.0, "step": 22265 }, { "epoch": 1.329393505253104, "grad_norm": 0.4786888062953949, "learning_rate": 1.7474044594910334e-05, "loss": 0.4066, "mean_token_accuracy": 0.8636228084564209, "num_tokens": 746065574.0, "step": 22270 }, { "epoch": 1.3296919770773639, "grad_norm": 0.5369954109191895, "learning_rate": 1.7464104836205815e-05, "loss": 0.4602, "mean_token_accuracy": 0.8474531769752502, "num_tokens": 746233254.0, "step": 22275 }, { "epoch": 1.3299904489016237, "grad_norm": 0.46432721614837646, "learning_rate": 1.7454167521928783e-05, "loss": 0.484, "mean_token_accuracy": 0.8396337747573852, "num_tokens": 746400934.0, "step": 22280 }, { "epoch": 1.3302889207258835, "grad_norm": 0.4735514223575592, "learning_rate": 1.7444232654499687e-05, "loss": 0.4342, "mean_token_accuracy": 0.8554694056510925, "num_tokens": 746568614.0, "step": 22285 }, { "epoch": 1.3305873925501432, "grad_norm": 0.5196797251701355, "learning_rate": 1.743430023633834e-05, "loss": 0.451, "mean_token_accuracy": 0.8498270273208618, "num_tokens": 746736294.0, "step": 22290 }, { "epoch": 1.330885864374403, "grad_norm": 0.48476192355155945, "learning_rate": 1.742437026986396e-05, "loss": 0.4416, "mean_token_accuracy": 0.8505964398384094, "num_tokens": 746903974.0, "step": 22295 }, { "epoch": 1.3311843361986628, "grad_norm": 0.5303208827972412, "learning_rate": 1.7414442757495173e-05, "loss": 0.4415, "mean_token_accuracy": 0.8522784352302551, "num_tokens": 747071654.0, "step": 22300 }, { "epoch": 1.3314828080229226, "grad_norm": 0.48505687713623047, "learning_rate": 1.7404517701650007e-05, "loss": 0.4322, "mean_token_accuracy": 0.85434809923172, "num_tokens": 747239334.0, "step": 22305 }, { "epoch": 1.3317812798471824, "grad_norm": 0.5132880806922913, "learning_rate": 1.7394595104745908e-05, "loss": 0.4431, "mean_token_accuracy": 0.8535667419433594, "num_tokens": 747407014.0, "step": 22310 }, { "epoch": 1.3320797516714422, "grad_norm": 0.5003713369369507, "learning_rate": 1.7384674969199688e-05, "loss": 0.4414, "mean_token_accuracy": 0.8522605419158935, "num_tokens": 747574694.0, "step": 22315 }, { "epoch": 1.332378223495702, "grad_norm": 0.4654309153556824, "learning_rate": 1.7374757297427584e-05, "loss": 0.4795, "mean_token_accuracy": 0.8427293300628662, "num_tokens": 747742374.0, "step": 22320 }, { "epoch": 1.3326766953199618, "grad_norm": 0.5223366022109985, "learning_rate": 1.7364842091845234e-05, "loss": 0.4046, "mean_token_accuracy": 0.8646665811538696, "num_tokens": 747910054.0, "step": 22325 }, { "epoch": 1.3329751671442216, "grad_norm": 0.47959938645362854, "learning_rate": 1.7354929354867667e-05, "loss": 0.455, "mean_token_accuracy": 0.8486102819442749, "num_tokens": 748077734.0, "step": 22330 }, { "epoch": 1.3332736389684814, "grad_norm": 0.5158442258834839, "learning_rate": 1.7345019088909314e-05, "loss": 0.4638, "mean_token_accuracy": 0.8464021801948547, "num_tokens": 748237116.0, "step": 22335 }, { "epoch": 1.3335721107927412, "grad_norm": 0.5109057426452637, "learning_rate": 1.7335111296383987e-05, "loss": 0.4734, "mean_token_accuracy": 0.8424370765686036, "num_tokens": 748404796.0, "step": 22340 }, { "epoch": 1.333870582617001, "grad_norm": 0.5325325727462769, "learning_rate": 1.7325205979704936e-05, "loss": 0.4558, "mean_token_accuracy": 0.8487951874732971, "num_tokens": 748572476.0, "step": 22345 }, { "epoch": 1.3341690544412608, "grad_norm": 0.5609932541847229, "learning_rate": 1.7315303141284782e-05, "loss": 0.4718, "mean_token_accuracy": 0.8445723533630372, "num_tokens": 748740156.0, "step": 22350 }, { "epoch": 1.3344675262655206, "grad_norm": 0.503879189491272, "learning_rate": 1.730540278353553e-05, "loss": 0.4839, "mean_token_accuracy": 0.8394071459770203, "num_tokens": 748907836.0, "step": 22355 }, { "epoch": 1.3347659980897804, "grad_norm": 0.4853704869747162, "learning_rate": 1.7295504908868597e-05, "loss": 0.4301, "mean_token_accuracy": 0.8537755012512207, "num_tokens": 749075516.0, "step": 22360 }, { "epoch": 1.3350644699140402, "grad_norm": 0.45328521728515625, "learning_rate": 1.72856095196948e-05, "loss": 0.4504, "mean_token_accuracy": 0.8498270273208618, "num_tokens": 749243196.0, "step": 22365 }, { "epoch": 1.3353629417383, "grad_norm": 0.4905661642551422, "learning_rate": 1.7275716618424337e-05, "loss": 0.4571, "mean_token_accuracy": 0.84889657497406, "num_tokens": 749410876.0, "step": 22370 }, { "epoch": 1.3356614135625597, "grad_norm": 0.4668544828891754, "learning_rate": 1.7265826207466823e-05, "loss": 0.4246, "mean_token_accuracy": 0.8590003609657287, "num_tokens": 749578556.0, "step": 22375 }, { "epoch": 1.3359598853868195, "grad_norm": 0.4947129487991333, "learning_rate": 1.7255938289231227e-05, "loss": 0.4175, "mean_token_accuracy": 0.8594894528388977, "num_tokens": 749746236.0, "step": 22380 }, { "epoch": 1.3362583572110793, "grad_norm": 0.49391013383865356, "learning_rate": 1.7246052866125943e-05, "loss": 0.4471, "mean_token_accuracy": 0.8513598799705505, "num_tokens": 749913916.0, "step": 22385 }, { "epoch": 1.3365568290353391, "grad_norm": 0.5421164631843567, "learning_rate": 1.723616994055875e-05, "loss": 0.4498, "mean_token_accuracy": 0.8504354000091553, "num_tokens": 750081596.0, "step": 22390 }, { "epoch": 1.336855300859599, "grad_norm": 0.4698849618434906, "learning_rate": 1.7226289514936815e-05, "loss": 0.4279, "mean_token_accuracy": 0.8579625487327576, "num_tokens": 750249276.0, "step": 22395 }, { "epoch": 1.3371537726838587, "grad_norm": 0.47517839074134827, "learning_rate": 1.7216411591666713e-05, "loss": 0.4185, "mean_token_accuracy": 0.8602886915206909, "num_tokens": 750416956.0, "step": 22400 }, { "epoch": 1.3374522445081185, "grad_norm": 0.5229873061180115, "learning_rate": 1.720653617315436e-05, "loss": 0.4241, "mean_token_accuracy": 0.8571513772010804, "num_tokens": 750584636.0, "step": 22405 }, { "epoch": 1.3377507163323783, "grad_norm": 0.5081853270530701, "learning_rate": 1.7196663261805137e-05, "loss": 0.4492, "mean_token_accuracy": 0.8521591305732727, "num_tokens": 750752316.0, "step": 22410 }, { "epoch": 1.338049188156638, "grad_norm": 0.6048567891120911, "learning_rate": 1.7186792860023743e-05, "loss": 0.445, "mean_token_accuracy": 0.8530359148979187, "num_tokens": 750919996.0, "step": 22415 }, { "epoch": 1.3383476599808979, "grad_norm": 0.500054121017456, "learning_rate": 1.717692497021432e-05, "loss": 0.4715, "mean_token_accuracy": 0.8435524344444275, "num_tokens": 751087676.0, "step": 22420 }, { "epoch": 1.3386461318051577, "grad_norm": 0.5501048564910889, "learning_rate": 1.7167059594780344e-05, "loss": 0.4513, "mean_token_accuracy": 0.8502564787864685, "num_tokens": 751255356.0, "step": 22425 }, { "epoch": 1.3389446036294173, "grad_norm": 0.48309409618377686, "learning_rate": 1.7157196736124735e-05, "loss": 0.4228, "mean_token_accuracy": 0.857455563545227, "num_tokens": 751423036.0, "step": 22430 }, { "epoch": 1.3392430754536773, "grad_norm": 0.47978973388671875, "learning_rate": 1.7147336396649777e-05, "loss": 0.4212, "mean_token_accuracy": 0.8584874153137207, "num_tokens": 751590716.0, "step": 22435 }, { "epoch": 1.3395415472779368, "grad_norm": 0.5024915933609009, "learning_rate": 1.713747857875712e-05, "loss": 0.4249, "mean_token_accuracy": 0.8580553293228149, "num_tokens": 751756682.0, "step": 22440 }, { "epoch": 1.3398400191021969, "grad_norm": 0.5611995458602905, "learning_rate": 1.7127623284847827e-05, "loss": 0.4964, "mean_token_accuracy": 0.8371108174324036, "num_tokens": 751924362.0, "step": 22445 }, { "epoch": 1.3401384909264564, "grad_norm": 0.5121062397956848, "learning_rate": 1.711777051732233e-05, "loss": 0.4299, "mean_token_accuracy": 0.8562328577041626, "num_tokens": 752092042.0, "step": 22450 }, { "epoch": 1.3404369627507164, "grad_norm": 0.6843779683113098, "learning_rate": 1.710792027858046e-05, "loss": 0.437, "mean_token_accuracy": 0.8552904844284057, "num_tokens": 752259722.0, "step": 22455 }, { "epoch": 1.340735434574976, "grad_norm": 0.5093886852264404, "learning_rate": 1.7098072571021432e-05, "loss": 0.473, "mean_token_accuracy": 0.8442622065544129, "num_tokens": 752427402.0, "step": 22460 }, { "epoch": 1.341033906399236, "grad_norm": 0.48257824778556824, "learning_rate": 1.7088227397043805e-05, "loss": 0.4188, "mean_token_accuracy": 0.8596325874328613, "num_tokens": 752595082.0, "step": 22465 }, { "epoch": 1.3413323782234956, "grad_norm": 0.5374806523323059, "learning_rate": 1.70783847590456e-05, "loss": 0.4274, "mean_token_accuracy": 0.8561076045036315, "num_tokens": 752762762.0, "step": 22470 }, { "epoch": 1.3416308500477556, "grad_norm": 0.5240042209625244, "learning_rate": 1.7068544659424133e-05, "loss": 0.4326, "mean_token_accuracy": 0.8562090039253235, "num_tokens": 752930442.0, "step": 22475 }, { "epoch": 1.3419293218720152, "grad_norm": 0.5293153524398804, "learning_rate": 1.705870710057616e-05, "loss": 0.4258, "mean_token_accuracy": 0.8556304454803467, "num_tokens": 753098122.0, "step": 22480 }, { "epoch": 1.3422277936962752, "grad_norm": 0.5044662952423096, "learning_rate": 1.7048872084897793e-05, "loss": 0.4608, "mean_token_accuracy": 0.8467255115509034, "num_tokens": 753265802.0, "step": 22485 }, { "epoch": 1.3425262655205348, "grad_norm": 0.5189728736877441, "learning_rate": 1.7039039614784537e-05, "loss": 0.4435, "mean_token_accuracy": 0.852606475353241, "num_tokens": 753433482.0, "step": 22490 }, { "epoch": 1.3428247373447946, "grad_norm": 0.5683044791221619, "learning_rate": 1.7029209692631283e-05, "loss": 0.4554, "mean_token_accuracy": 0.8494452953338623, "num_tokens": 753601162.0, "step": 22495 }, { "epoch": 1.3431232091690544, "grad_norm": 0.5045459270477295, "learning_rate": 1.701938232083226e-05, "loss": 0.4062, "mean_token_accuracy": 0.8633782625198364, "num_tokens": 753768842.0, "step": 22500 }, { "epoch": 1.3434216809933142, "grad_norm": 0.46433112025260925, "learning_rate": 1.7009557501781132e-05, "loss": 0.453, "mean_token_accuracy": 0.8479422688484192, "num_tokens": 753936522.0, "step": 22505 }, { "epoch": 1.343720152817574, "grad_norm": 0.4946271777153015, "learning_rate": 1.69997352378709e-05, "loss": 0.4574, "mean_token_accuracy": 0.8473040580749511, "num_tokens": 754104202.0, "step": 22510 }, { "epoch": 1.3440186246418337, "grad_norm": 0.5034542083740234, "learning_rate": 1.6989915531493963e-05, "loss": 0.4235, "mean_token_accuracy": 0.8585232019424438, "num_tokens": 754271882.0, "step": 22515 }, { "epoch": 1.3443170964660935, "grad_norm": 0.5567042827606201, "learning_rate": 1.6980098385042108e-05, "loss": 0.4633, "mean_token_accuracy": 0.8463497638702393, "num_tokens": 754439562.0, "step": 22520 }, { "epoch": 1.3446155682903533, "grad_norm": 0.4457235634326935, "learning_rate": 1.6970283800906446e-05, "loss": 0.4504, "mean_token_accuracy": 0.8513956665992737, "num_tokens": 754607242.0, "step": 22525 }, { "epoch": 1.3449140401146131, "grad_norm": 0.5092954039573669, "learning_rate": 1.6960471781477532e-05, "loss": 0.4536, "mean_token_accuracy": 0.8478408694267273, "num_tokens": 754774922.0, "step": 22530 }, { "epoch": 1.345212511938873, "grad_norm": 0.5446640253067017, "learning_rate": 1.6950662329145246e-05, "loss": 0.4523, "mean_token_accuracy": 0.8477275490760803, "num_tokens": 754942602.0, "step": 22535 }, { "epoch": 1.3455109837631327, "grad_norm": 0.5287734270095825, "learning_rate": 1.6940855446298874e-05, "loss": 0.4668, "mean_token_accuracy": 0.8447274208068848, "num_tokens": 755110282.0, "step": 22540 }, { "epoch": 1.3458094555873925, "grad_norm": 0.5278980731964111, "learning_rate": 1.693105113532706e-05, "loss": 0.4604, "mean_token_accuracy": 0.849773359298706, "num_tokens": 755277962.0, "step": 22545 }, { "epoch": 1.3461079274116523, "grad_norm": 0.5189603567123413, "learning_rate": 1.692124939861781e-05, "loss": 0.4353, "mean_token_accuracy": 0.8555469393730164, "num_tokens": 755445642.0, "step": 22550 }, { "epoch": 1.346406399235912, "grad_norm": 0.5070314407348633, "learning_rate": 1.6911450238558544e-05, "loss": 0.4381, "mean_token_accuracy": 0.8529345035552979, "num_tokens": 755613322.0, "step": 22555 }, { "epoch": 1.346704871060172, "grad_norm": 0.4751354455947876, "learning_rate": 1.690165365753601e-05, "loss": 0.3935, "mean_token_accuracy": 0.8694918274879455, "num_tokens": 755781002.0, "step": 22560 }, { "epoch": 1.3470033428844317, "grad_norm": 0.5475949645042419, "learning_rate": 1.6891859657936346e-05, "loss": 0.4479, "mean_token_accuracy": 0.8498329877853393, "num_tokens": 755948682.0, "step": 22565 }, { "epoch": 1.3473018147086915, "grad_norm": 0.48246076703071594, "learning_rate": 1.6882068242145067e-05, "loss": 0.438, "mean_token_accuracy": 0.8531313538551331, "num_tokens": 756116362.0, "step": 22570 }, { "epoch": 1.3476002865329513, "grad_norm": 0.5266431570053101, "learning_rate": 1.6872279412547055e-05, "loss": 0.4304, "mean_token_accuracy": 0.8557318329811097, "num_tokens": 756284042.0, "step": 22575 }, { "epoch": 1.347898758357211, "grad_norm": 0.5577112436294556, "learning_rate": 1.686249317152656e-05, "loss": 0.4453, "mean_token_accuracy": 0.8521591424942017, "num_tokens": 756451722.0, "step": 22580 }, { "epoch": 1.3481972301814709, "grad_norm": 0.50678950548172, "learning_rate": 1.6852709521467198e-05, "loss": 0.4117, "mean_token_accuracy": 0.8616903305053711, "num_tokens": 756619402.0, "step": 22585 }, { "epoch": 1.3484957020057307, "grad_norm": 0.5217883586883545, "learning_rate": 1.6842928464751946e-05, "loss": 0.4564, "mean_token_accuracy": 0.84849693775177, "num_tokens": 756787082.0, "step": 22590 }, { "epoch": 1.3487941738299905, "grad_norm": 0.47241899371147156, "learning_rate": 1.6833150003763195e-05, "loss": 0.4282, "mean_token_accuracy": 0.856179165840149, "num_tokens": 756954762.0, "step": 22595 }, { "epoch": 1.3490926456542502, "grad_norm": 0.4343146085739136, "learning_rate": 1.6823374140882634e-05, "loss": 0.4232, "mean_token_accuracy": 0.8582846283912658, "num_tokens": 757122442.0, "step": 22600 }, { "epoch": 1.34939111747851, "grad_norm": 0.5446040034294128, "learning_rate": 1.6813600878491376e-05, "loss": 0.4534, "mean_token_accuracy": 0.8460023999214172, "num_tokens": 757288541.0, "step": 22605 }, { "epoch": 1.3496895893027698, "grad_norm": 0.5250765085220337, "learning_rate": 1.6803830218969857e-05, "loss": 0.4535, "mean_token_accuracy": 0.8486818552017212, "num_tokens": 757456221.0, "step": 22610 }, { "epoch": 1.3499880611270296, "grad_norm": 0.5114559531211853, "learning_rate": 1.6794062164697928e-05, "loss": 0.4254, "mean_token_accuracy": 0.8572885751724243, "num_tokens": 757623901.0, "step": 22615 }, { "epoch": 1.3502865329512894, "grad_norm": 0.511920690536499, "learning_rate": 1.678429671805478e-05, "loss": 0.455, "mean_token_accuracy": 0.8480973362922668, "num_tokens": 757791581.0, "step": 22620 }, { "epoch": 1.3505850047755492, "grad_norm": 0.5332680940628052, "learning_rate": 1.677453388141894e-05, "loss": 0.3948, "mean_token_accuracy": 0.8664439916610718, "num_tokens": 757959261.0, "step": 22625 }, { "epoch": 1.350883476599809, "grad_norm": 0.5050813555717468, "learning_rate": 1.6764773657168346e-05, "loss": 0.4368, "mean_token_accuracy": 0.8537695288658143, "num_tokens": 758126941.0, "step": 22630 }, { "epoch": 1.3511819484240688, "grad_norm": 0.5128730535507202, "learning_rate": 1.6755016047680276e-05, "loss": 0.4593, "mean_token_accuracy": 0.845890486240387, "num_tokens": 758294621.0, "step": 22635 }, { "epoch": 1.3514804202483286, "grad_norm": 0.5740980505943298, "learning_rate": 1.6745261055331384e-05, "loss": 0.4493, "mean_token_accuracy": 0.8507873177528381, "num_tokens": 758462301.0, "step": 22640 }, { "epoch": 1.3517788920725884, "grad_norm": 0.4692186117172241, "learning_rate": 1.6735508682497662e-05, "loss": 0.3948, "mean_token_accuracy": 0.8667660713195801, "num_tokens": 758629981.0, "step": 22645 }, { "epoch": 1.3520773638968482, "grad_norm": 0.5040562748908997, "learning_rate": 1.6725758931554496e-05, "loss": 0.4582, "mean_token_accuracy": 0.8474650979042053, "num_tokens": 758797661.0, "step": 22650 }, { "epoch": 1.352375835721108, "grad_norm": 0.5797613263130188, "learning_rate": 1.671601180487661e-05, "loss": 0.4563, "mean_token_accuracy": 0.847769296169281, "num_tokens": 758965341.0, "step": 22655 }, { "epoch": 1.3526743075453678, "grad_norm": 0.5466217398643494, "learning_rate": 1.6706267304838097e-05, "loss": 0.4787, "mean_token_accuracy": 0.8430215954780579, "num_tokens": 759133021.0, "step": 22660 }, { "epoch": 1.3529727793696276, "grad_norm": 0.5243255496025085, "learning_rate": 1.669652543381242e-05, "loss": 0.4604, "mean_token_accuracy": 0.8472503900527955, "num_tokens": 759300701.0, "step": 22665 }, { "epoch": 1.3532712511938874, "grad_norm": 0.5923428535461426, "learning_rate": 1.668678619417238e-05, "loss": 0.4488, "mean_token_accuracy": 0.8497972130775452, "num_tokens": 759468381.0, "step": 22670 }, { "epoch": 1.3535697230181472, "grad_norm": 0.5448712706565857, "learning_rate": 1.6677049588290132e-05, "loss": 0.4754, "mean_token_accuracy": 0.8438029527664185, "num_tokens": 759636061.0, "step": 22675 }, { "epoch": 1.353868194842407, "grad_norm": 0.48216933012008667, "learning_rate": 1.6667315618537248e-05, "loss": 0.4344, "mean_token_accuracy": 0.8535548210144043, "num_tokens": 759803741.0, "step": 22680 }, { "epoch": 1.3541666666666667, "grad_norm": 0.47375988960266113, "learning_rate": 1.6657584287284588e-05, "loss": 0.4325, "mean_token_accuracy": 0.8546522736549378, "num_tokens": 759971421.0, "step": 22685 }, { "epoch": 1.3544651384909265, "grad_norm": 0.500453770160675, "learning_rate": 1.66478555969024e-05, "loss": 0.4166, "mean_token_accuracy": 0.8598115086555481, "num_tokens": 760139101.0, "step": 22690 }, { "epoch": 1.3547636103151863, "grad_norm": 0.5385774374008179, "learning_rate": 1.663812954976029e-05, "loss": 0.4667, "mean_token_accuracy": 0.845162832736969, "num_tokens": 760306781.0, "step": 22695 }, { "epoch": 1.3550620821394461, "grad_norm": 0.5080847144126892, "learning_rate": 1.6628406148227216e-05, "loss": 0.4656, "mean_token_accuracy": 0.8456340312957764, "num_tokens": 760474461.0, "step": 22700 }, { "epoch": 1.3553605539637057, "grad_norm": 0.521116316318512, "learning_rate": 1.6618685394671495e-05, "loss": 0.4402, "mean_token_accuracy": 0.8540498614311218, "num_tokens": 760642141.0, "step": 22705 }, { "epoch": 1.3556590257879657, "grad_norm": 0.5268535017967224, "learning_rate": 1.6608967291460787e-05, "loss": 0.4284, "mean_token_accuracy": 0.8576523900032044, "num_tokens": 760809821.0, "step": 22710 }, { "epoch": 1.3559574976122253, "grad_norm": 0.46757519245147705, "learning_rate": 1.6599251840962125e-05, "loss": 0.4173, "mean_token_accuracy": 0.8612072110176087, "num_tokens": 760977501.0, "step": 22715 }, { "epoch": 1.3562559694364853, "grad_norm": 0.5216953158378601, "learning_rate": 1.6589539045541876e-05, "loss": 0.4545, "mean_token_accuracy": 0.8494214415550232, "num_tokens": 761145181.0, "step": 22720 }, { "epoch": 1.3565544412607449, "grad_norm": 0.48437798023223877, "learning_rate": 1.6579828907565774e-05, "loss": 0.4189, "mean_token_accuracy": 0.8591196417808533, "num_tokens": 761312861.0, "step": 22725 }, { "epoch": 1.3568529130850049, "grad_norm": 0.5425718426704407, "learning_rate": 1.657012142939892e-05, "loss": 0.482, "mean_token_accuracy": 0.8402302145957947, "num_tokens": 761480541.0, "step": 22730 }, { "epoch": 1.3571513849092645, "grad_norm": 0.46816858649253845, "learning_rate": 1.6560416613405714e-05, "loss": 0.4567, "mean_token_accuracy": 0.848496961593628, "num_tokens": 761648221.0, "step": 22735 }, { "epoch": 1.3574498567335245, "grad_norm": 0.50034499168396, "learning_rate": 1.6550714461949983e-05, "loss": 0.4856, "mean_token_accuracy": 0.8382380962371826, "num_tokens": 761815901.0, "step": 22740 }, { "epoch": 1.357748328557784, "grad_norm": 0.5078026056289673, "learning_rate": 1.6541014977394833e-05, "loss": 0.437, "mean_token_accuracy": 0.8542407155036926, "num_tokens": 761983581.0, "step": 22745 }, { "epoch": 1.358046800382044, "grad_norm": 0.49565309286117554, "learning_rate": 1.6531318162102778e-05, "loss": 0.4536, "mean_token_accuracy": 0.8489264011383056, "num_tokens": 762151261.0, "step": 22750 }, { "epoch": 1.3583452722063036, "grad_norm": 0.43293872475624084, "learning_rate": 1.652162401843562e-05, "loss": 0.4114, "mean_token_accuracy": 0.8619587302207947, "num_tokens": 762318941.0, "step": 22755 }, { "epoch": 1.3586437440305636, "grad_norm": 0.4872153401374817, "learning_rate": 1.6511932548754585e-05, "loss": 0.4158, "mean_token_accuracy": 0.8595908403396606, "num_tokens": 762486621.0, "step": 22760 }, { "epoch": 1.3589422158548232, "grad_norm": 0.4540598392486572, "learning_rate": 1.6502243755420202e-05, "loss": 0.4115, "mean_token_accuracy": 0.8612668514251709, "num_tokens": 762654301.0, "step": 22765 }, { "epoch": 1.359240687679083, "grad_norm": 0.5416213274002075, "learning_rate": 1.6492557640792335e-05, "loss": 0.4293, "mean_token_accuracy": 0.8563521385192872, "num_tokens": 762821981.0, "step": 22770 }, { "epoch": 1.3595391595033428, "grad_norm": 0.5540740489959717, "learning_rate": 1.6482874207230232e-05, "loss": 0.4339, "mean_token_accuracy": 0.8556304454803467, "num_tokens": 762989661.0, "step": 22775 }, { "epoch": 1.3598376313276026, "grad_norm": 0.5466917753219604, "learning_rate": 1.6473193457092468e-05, "loss": 0.4316, "mean_token_accuracy": 0.8545926213264465, "num_tokens": 763157341.0, "step": 22780 }, { "epoch": 1.3601361031518624, "grad_norm": 0.53810715675354, "learning_rate": 1.6463515392736967e-05, "loss": 0.4579, "mean_token_accuracy": 0.8482643485069274, "num_tokens": 763325021.0, "step": 22785 }, { "epoch": 1.3604345749761222, "grad_norm": 0.4831179976463318, "learning_rate": 1.6453840016521016e-05, "loss": 0.4552, "mean_token_accuracy": 0.8488429069519043, "num_tokens": 763492701.0, "step": 22790 }, { "epoch": 1.360733046800382, "grad_norm": 0.4876408278942108, "learning_rate": 1.6444167330801203e-05, "loss": 0.4372, "mean_token_accuracy": 0.854485273361206, "num_tokens": 763660381.0, "step": 22795 }, { "epoch": 1.3610315186246418, "grad_norm": 0.5168662071228027, "learning_rate": 1.6434497337933506e-05, "loss": 0.4232, "mean_token_accuracy": 0.8589884281158447, "num_tokens": 763828061.0, "step": 22800 }, { "epoch": 1.3613299904489016, "grad_norm": 0.5481839179992676, "learning_rate": 1.642483004027323e-05, "loss": 0.4506, "mean_token_accuracy": 0.8492007493972779, "num_tokens": 763995741.0, "step": 22805 }, { "epoch": 1.3616284622731614, "grad_norm": 0.5525104403495789, "learning_rate": 1.641516544017502e-05, "loss": 0.4724, "mean_token_accuracy": 0.8434033036231995, "num_tokens": 764163421.0, "step": 22810 }, { "epoch": 1.3619269340974212, "grad_norm": 0.5119472146034241, "learning_rate": 1.640550353999287e-05, "loss": 0.4888, "mean_token_accuracy": 0.838446855545044, "num_tokens": 764331101.0, "step": 22815 }, { "epoch": 1.362225405921681, "grad_norm": 0.46866685152053833, "learning_rate": 1.6395844342080114e-05, "loss": 0.4486, "mean_token_accuracy": 0.8503459334373474, "num_tokens": 764498781.0, "step": 22820 }, { "epoch": 1.3625238777459407, "grad_norm": 0.5765346884727478, "learning_rate": 1.6386187848789436e-05, "loss": 0.4822, "mean_token_accuracy": 0.8402532815933228, "num_tokens": 764660775.0, "step": 22825 }, { "epoch": 1.3628223495702005, "grad_norm": 0.6466535329818726, "learning_rate": 1.6376534062472838e-05, "loss": 0.4577, "mean_token_accuracy": 0.8486579895019531, "num_tokens": 764828455.0, "step": 22830 }, { "epoch": 1.3631208213944603, "grad_norm": 0.5960538387298584, "learning_rate": 1.636688298548168e-05, "loss": 0.4787, "mean_token_accuracy": 0.8417690515518188, "num_tokens": 764996135.0, "step": 22835 }, { "epoch": 1.3634192932187201, "grad_norm": 0.5411390662193298, "learning_rate": 1.6357234620166668e-05, "loss": 0.4798, "mean_token_accuracy": 0.8389538407325745, "num_tokens": 765163815.0, "step": 22840 }, { "epoch": 1.36371776504298, "grad_norm": 0.5571218132972717, "learning_rate": 1.6347588968877837e-05, "loss": 0.4251, "mean_token_accuracy": 0.8586663484573365, "num_tokens": 765331495.0, "step": 22845 }, { "epoch": 1.3640162368672397, "grad_norm": 0.510390043258667, "learning_rate": 1.633794603396457e-05, "loss": 0.4003, "mean_token_accuracy": 0.8647376418113708, "num_tokens": 765491234.0, "step": 22850 }, { "epoch": 1.3643147086914995, "grad_norm": 0.46171265840530396, "learning_rate": 1.6328305817775566e-05, "loss": 0.4223, "mean_token_accuracy": 0.8602051854133606, "num_tokens": 765658914.0, "step": 22855 }, { "epoch": 1.3646131805157593, "grad_norm": 0.5314909219741821, "learning_rate": 1.6318668322658888e-05, "loss": 0.464, "mean_token_accuracy": 0.844560420513153, "num_tokens": 765826594.0, "step": 22860 }, { "epoch": 1.364911652340019, "grad_norm": 0.525103747844696, "learning_rate": 1.6309033550961918e-05, "loss": 0.4696, "mean_token_accuracy": 0.8451986193656922, "num_tokens": 765994274.0, "step": 22865 }, { "epoch": 1.365210124164279, "grad_norm": 0.5250957608222961, "learning_rate": 1.6299401505031395e-05, "loss": 0.4322, "mean_token_accuracy": 0.8557735919952393, "num_tokens": 766161954.0, "step": 22870 }, { "epoch": 1.3655085959885387, "grad_norm": 0.48402971029281616, "learning_rate": 1.6289772187213378e-05, "loss": 0.4569, "mean_token_accuracy": 0.8474830031394959, "num_tokens": 766329634.0, "step": 22875 }, { "epoch": 1.3658070678127985, "grad_norm": 0.4996907413005829, "learning_rate": 1.6280145599853246e-05, "loss": 0.4344, "mean_token_accuracy": 0.8563044190406799, "num_tokens": 766497314.0, "step": 22880 }, { "epoch": 1.3661055396370583, "grad_norm": 0.5941933989524841, "learning_rate": 1.627052174529577e-05, "loss": 0.4488, "mean_token_accuracy": 0.8501192927360535, "num_tokens": 766664994.0, "step": 22885 }, { "epoch": 1.366404011461318, "grad_norm": 0.5185617804527283, "learning_rate": 1.626090062588498e-05, "loss": 0.445, "mean_token_accuracy": 0.851240611076355, "num_tokens": 766832674.0, "step": 22890 }, { "epoch": 1.3667024832855779, "grad_norm": 0.5115506052970886, "learning_rate": 1.6251282243964295e-05, "loss": 0.481, "mean_token_accuracy": 0.8410831451416015, "num_tokens": 767000354.0, "step": 22895 }, { "epoch": 1.3670009551098377, "grad_norm": 0.4756830334663391, "learning_rate": 1.624166660187645e-05, "loss": 0.4195, "mean_token_accuracy": 0.8597459197044373, "num_tokens": 767168034.0, "step": 22900 }, { "epoch": 1.3672994269340975, "grad_norm": 0.492611825466156, "learning_rate": 1.6232053701963503e-05, "loss": 0.4567, "mean_token_accuracy": 0.8500656247138977, "num_tokens": 767335714.0, "step": 22905 }, { "epoch": 1.3675978987583572, "grad_norm": 0.5117066502571106, "learning_rate": 1.6222443546566874e-05, "loss": 0.42, "mean_token_accuracy": 0.8589168548583984, "num_tokens": 767503394.0, "step": 22910 }, { "epoch": 1.367896370582617, "grad_norm": 0.5237162113189697, "learning_rate": 1.6212836138027268e-05, "loss": 0.4135, "mean_token_accuracy": 0.8620004773139953, "num_tokens": 767671074.0, "step": 22915 }, { "epoch": 1.3681948424068768, "grad_norm": 0.4933103024959564, "learning_rate": 1.6203231478684755e-05, "loss": 0.4293, "mean_token_accuracy": 0.8567100048065186, "num_tokens": 767838754.0, "step": 22920 }, { "epoch": 1.3684933142311366, "grad_norm": 0.5579466223716736, "learning_rate": 1.6193629570878735e-05, "loss": 0.441, "mean_token_accuracy": 0.8531969547271728, "num_tokens": 768006434.0, "step": 22925 }, { "epoch": 1.3687917860553964, "grad_norm": 0.5770542621612549, "learning_rate": 1.618403041694792e-05, "loss": 0.4476, "mean_token_accuracy": 0.8525587439537048, "num_tokens": 768174114.0, "step": 22930 }, { "epoch": 1.3690902578796562, "grad_norm": 0.5308911800384521, "learning_rate": 1.6174434019230377e-05, "loss": 0.4437, "mean_token_accuracy": 0.8527257561683654, "num_tokens": 768341794.0, "step": 22935 }, { "epoch": 1.369388729703916, "grad_norm": 0.575415313243866, "learning_rate": 1.6164840380063457e-05, "loss": 0.4367, "mean_token_accuracy": 0.8536621809005738, "num_tokens": 768509474.0, "step": 22940 }, { "epoch": 1.3696872015281758, "grad_norm": 0.546991229057312, "learning_rate": 1.6155249501783905e-05, "loss": 0.4934, "mean_token_accuracy": 0.8393296003341675, "num_tokens": 768677154.0, "step": 22945 }, { "epoch": 1.3699856733524356, "grad_norm": 0.5168837904930115, "learning_rate": 1.6145661386727726e-05, "loss": 0.452, "mean_token_accuracy": 0.8490039348602295, "num_tokens": 768844834.0, "step": 22950 }, { "epoch": 1.3702841451766954, "grad_norm": 0.46434494853019714, "learning_rate": 1.6136076037230294e-05, "loss": 0.4432, "mean_token_accuracy": 0.853576409816742, "num_tokens": 769010588.0, "step": 22955 }, { "epoch": 1.3705826170009552, "grad_norm": 0.8401955962181091, "learning_rate": 1.6126493455626296e-05, "loss": 0.4202, "mean_token_accuracy": 0.8608910799026489, "num_tokens": 769178268.0, "step": 22960 }, { "epoch": 1.370881088825215, "grad_norm": 0.5199339985847473, "learning_rate": 1.6116913644249747e-05, "loss": 0.4489, "mean_token_accuracy": 0.8502803325653077, "num_tokens": 769345948.0, "step": 22965 }, { "epoch": 1.3711795606494748, "grad_norm": 0.48886939883232117, "learning_rate": 1.6107336605433996e-05, "loss": 0.4088, "mean_token_accuracy": 0.8638792872428894, "num_tokens": 769513628.0, "step": 22970 }, { "epoch": 1.3714780324737346, "grad_norm": 0.5605626106262207, "learning_rate": 1.6097762341511693e-05, "loss": 0.451, "mean_token_accuracy": 0.8501133322715759, "num_tokens": 769681308.0, "step": 22975 }, { "epoch": 1.3717765042979941, "grad_norm": 0.5340942144393921, "learning_rate": 1.6088190854814834e-05, "loss": 0.4172, "mean_token_accuracy": 0.8595431208610534, "num_tokens": 769848988.0, "step": 22980 }, { "epoch": 1.3720749761222542, "grad_norm": 0.4701322615146637, "learning_rate": 1.607862214767473e-05, "loss": 0.4242, "mean_token_accuracy": 0.8595848798751831, "num_tokens": 770016668.0, "step": 22985 }, { "epoch": 1.3723734479465137, "grad_norm": 0.507288932800293, "learning_rate": 1.6069056222422014e-05, "loss": 0.4393, "mean_token_accuracy": 0.8525514006614685, "num_tokens": 770178269.0, "step": 22990 }, { "epoch": 1.3726719197707737, "grad_norm": 0.4795933961868286, "learning_rate": 1.605949308138665e-05, "loss": 0.4298, "mean_token_accuracy": 0.8552904725074768, "num_tokens": 770345949.0, "step": 22995 }, { "epoch": 1.3729703915950333, "grad_norm": 0.5413479208946228, "learning_rate": 1.6049932726897917e-05, "loss": 0.4281, "mean_token_accuracy": 0.857920789718628, "num_tokens": 770513629.0, "step": 23000 }, { "epoch": 1.3732688634192933, "grad_norm": 0.49817144870758057, "learning_rate": 1.6040375161284392e-05, "loss": 0.4392, "mean_token_accuracy": 0.8535249948501586, "num_tokens": 770681309.0, "step": 23005 }, { "epoch": 1.373567335243553, "grad_norm": 0.5285564661026001, "learning_rate": 1.603082038687404e-05, "loss": 0.4674, "mean_token_accuracy": 0.8443695664405823, "num_tokens": 770848989.0, "step": 23010 }, { "epoch": 1.373865807067813, "grad_norm": 0.5043045282363892, "learning_rate": 1.6021268405994074e-05, "loss": 0.4677, "mean_token_accuracy": 0.843898355960846, "num_tokens": 771016669.0, "step": 23015 }, { "epoch": 1.3741642788920725, "grad_norm": 0.531291127204895, "learning_rate": 1.6011719220971065e-05, "loss": 0.4464, "mean_token_accuracy": 0.8496659994125366, "num_tokens": 771184349.0, "step": 23020 }, { "epoch": 1.3744627507163325, "grad_norm": 0.5122756361961365, "learning_rate": 1.6002172834130868e-05, "loss": 0.4535, "mean_token_accuracy": 0.8468448042869567, "num_tokens": 771352029.0, "step": 23025 }, { "epoch": 1.374761222540592, "grad_norm": 0.515861988067627, "learning_rate": 1.599262924779872e-05, "loss": 0.4242, "mean_token_accuracy": 0.8582488417625427, "num_tokens": 771519709.0, "step": 23030 }, { "epoch": 1.375059694364852, "grad_norm": 0.48412561416625977, "learning_rate": 1.5983088464299125e-05, "loss": 0.3985, "mean_token_accuracy": 0.8648693799972534, "num_tokens": 771687389.0, "step": 23035 }, { "epoch": 1.3753581661891117, "grad_norm": 0.5374919772148132, "learning_rate": 1.5973550485955903e-05, "loss": 0.4561, "mean_token_accuracy": 0.8482285499572754, "num_tokens": 771855069.0, "step": 23040 }, { "epoch": 1.3756566380133715, "grad_norm": 0.5241804718971252, "learning_rate": 1.5964015315092214e-05, "loss": 0.4531, "mean_token_accuracy": 0.8494810938835144, "num_tokens": 772022749.0, "step": 23045 }, { "epoch": 1.3759551098376313, "grad_norm": 0.5024101734161377, "learning_rate": 1.5954482954030524e-05, "loss": 0.4347, "mean_token_accuracy": 0.8542466998100281, "num_tokens": 772190429.0, "step": 23050 }, { "epoch": 1.376253581661891, "grad_norm": 0.5139269232749939, "learning_rate": 1.5944953405092628e-05, "loss": 0.4243, "mean_token_accuracy": 0.8594954013824463, "num_tokens": 772358109.0, "step": 23055 }, { "epoch": 1.3765520534861508, "grad_norm": 0.5611736178398132, "learning_rate": 1.59354266705996e-05, "loss": 0.4837, "mean_token_accuracy": 0.8412263035774231, "num_tokens": 772525789.0, "step": 23060 }, { "epoch": 1.3768505253104106, "grad_norm": 0.5159117579460144, "learning_rate": 1.5925902752871857e-05, "loss": 0.4505, "mean_token_accuracy": 0.8505964398384094, "num_tokens": 772693469.0, "step": 23065 }, { "epoch": 1.3771489971346704, "grad_norm": 0.4799140393733978, "learning_rate": 1.591638165422915e-05, "loss": 0.4281, "mean_token_accuracy": 0.856453537940979, "num_tokens": 772861149.0, "step": 23070 }, { "epoch": 1.3774474689589302, "grad_norm": 0.4879801869392395, "learning_rate": 1.5906863376990493e-05, "loss": 0.4619, "mean_token_accuracy": 0.8494154930114746, "num_tokens": 773028829.0, "step": 23075 }, { "epoch": 1.37774594078319, "grad_norm": 0.5032755732536316, "learning_rate": 1.5897347923474255e-05, "loss": 0.442, "mean_token_accuracy": 0.8528172254562378, "num_tokens": 773189390.0, "step": 23080 }, { "epoch": 1.3780444126074498, "grad_norm": 0.5552733540534973, "learning_rate": 1.588783529599807e-05, "loss": 0.4472, "mean_token_accuracy": 0.8517237305641174, "num_tokens": 773357070.0, "step": 23085 }, { "epoch": 1.3783428844317096, "grad_norm": 0.5376341342926025, "learning_rate": 1.587832549687894e-05, "loss": 0.4707, "mean_token_accuracy": 0.8450368404388428, "num_tokens": 773516785.0, "step": 23090 }, { "epoch": 1.3786413562559694, "grad_norm": 0.5441288948059082, "learning_rate": 1.586881852843316e-05, "loss": 0.4409, "mean_token_accuracy": 0.8532983541488648, "num_tokens": 773684465.0, "step": 23095 }, { "epoch": 1.3789398280802292, "grad_norm": 0.5238974094390869, "learning_rate": 1.5859314392976305e-05, "loss": 0.4509, "mean_token_accuracy": 0.8497284531593323, "num_tokens": 773852062.0, "step": 23100 }, { "epoch": 1.379238299904489, "grad_norm": 0.44513800740242004, "learning_rate": 1.584981309282329e-05, "loss": 0.4177, "mean_token_accuracy": 0.8594119071960449, "num_tokens": 774019742.0, "step": 23105 }, { "epoch": 1.3795367717287488, "grad_norm": 0.545132040977478, "learning_rate": 1.584031463028834e-05, "loss": 0.4639, "mean_token_accuracy": 0.8463318586349488, "num_tokens": 774187422.0, "step": 23110 }, { "epoch": 1.3798352435530086, "grad_norm": 0.44364869594573975, "learning_rate": 1.583081900768497e-05, "loss": 0.4606, "mean_token_accuracy": 0.8487355351448059, "num_tokens": 774355102.0, "step": 23115 }, { "epoch": 1.3801337153772684, "grad_norm": 0.5100110769271851, "learning_rate": 1.5821326227326026e-05, "loss": 0.4262, "mean_token_accuracy": 0.8563760042190551, "num_tokens": 774522782.0, "step": 23120 }, { "epoch": 1.3804321872015282, "grad_norm": 0.4937985837459564, "learning_rate": 1.5811836291523636e-05, "loss": 0.432, "mean_token_accuracy": 0.8567935109138489, "num_tokens": 774690462.0, "step": 23125 }, { "epoch": 1.380730659025788, "grad_norm": 0.5006693005561829, "learning_rate": 1.5802349202589258e-05, "loss": 0.4284, "mean_token_accuracy": 0.856715989112854, "num_tokens": 774858142.0, "step": 23130 }, { "epoch": 1.3810291308500477, "grad_norm": 0.5181460976600647, "learning_rate": 1.579286496283364e-05, "loss": 0.4271, "mean_token_accuracy": 0.8563044190406799, "num_tokens": 775025822.0, "step": 23135 }, { "epoch": 1.3813276026743075, "grad_norm": 0.5043380260467529, "learning_rate": 1.578338357456686e-05, "loss": 0.4076, "mean_token_accuracy": 0.8640761017799378, "num_tokens": 775193502.0, "step": 23140 }, { "epoch": 1.3816260744985673, "grad_norm": 0.5079756379127502, "learning_rate": 1.5773905040098274e-05, "loss": 0.4093, "mean_token_accuracy": 0.8617261052131653, "num_tokens": 775361182.0, "step": 23145 }, { "epoch": 1.3819245463228271, "grad_norm": 0.4899621605873108, "learning_rate": 1.5764429361736537e-05, "loss": 0.392, "mean_token_accuracy": 0.867255175113678, "num_tokens": 775528862.0, "step": 23150 }, { "epoch": 1.382223018147087, "grad_norm": 0.523612916469574, "learning_rate": 1.5754956541789667e-05, "loss": 0.4167, "mean_token_accuracy": 0.8596325874328613, "num_tokens": 775696542.0, "step": 23155 }, { "epoch": 1.3825214899713467, "grad_norm": 0.45955783128738403, "learning_rate": 1.5745486582564906e-05, "loss": 0.4158, "mean_token_accuracy": 0.860676383972168, "num_tokens": 775864222.0, "step": 23160 }, { "epoch": 1.3828199617956065, "grad_norm": 0.5167980194091797, "learning_rate": 1.573601948636885e-05, "loss": 0.4078, "mean_token_accuracy": 0.8635273814201355, "num_tokens": 776031902.0, "step": 23165 }, { "epoch": 1.3831184336198663, "grad_norm": 0.46761518716812134, "learning_rate": 1.572655525550739e-05, "loss": 0.4237, "mean_token_accuracy": 0.8580341100692749, "num_tokens": 776199582.0, "step": 23170 }, { "epoch": 1.383416905444126, "grad_norm": 0.5134435892105103, "learning_rate": 1.5717093892285713e-05, "loss": 0.462, "mean_token_accuracy": 0.8456220865249634, "num_tokens": 776367262.0, "step": 23175 }, { "epoch": 1.383715377268386, "grad_norm": 0.47994717955589294, "learning_rate": 1.5707635399008308e-05, "loss": 0.3875, "mean_token_accuracy": 0.8687761068344116, "num_tokens": 776534942.0, "step": 23180 }, { "epoch": 1.3840138490926457, "grad_norm": 0.50049889087677, "learning_rate": 1.5698179777978956e-05, "loss": 0.4275, "mean_token_accuracy": 0.8575629353523254, "num_tokens": 776702622.0, "step": 23185 }, { "epoch": 1.3843123209169055, "grad_norm": 0.5402145385742188, "learning_rate": 1.568872703150076e-05, "loss": 0.4283, "mean_token_accuracy": 0.855713951587677, "num_tokens": 776870302.0, "step": 23190 }, { "epoch": 1.3846107927411653, "grad_norm": 0.5122275352478027, "learning_rate": 1.5679277161876105e-05, "loss": 0.4302, "mean_token_accuracy": 0.8575152039527894, "num_tokens": 777037982.0, "step": 23195 }, { "epoch": 1.384909264565425, "grad_norm": 0.502173662185669, "learning_rate": 1.566983017140668e-05, "loss": 0.4257, "mean_token_accuracy": 0.8584754824638366, "num_tokens": 777205662.0, "step": 23200 }, { "epoch": 1.3852077363896849, "grad_norm": 0.492482990026474, "learning_rate": 1.5660386062393485e-05, "loss": 0.4323, "mean_token_accuracy": 0.8531730771064758, "num_tokens": 777373342.0, "step": 23205 }, { "epoch": 1.3855062082139447, "grad_norm": 0.4919479489326477, "learning_rate": 1.5650944837136787e-05, "loss": 0.445, "mean_token_accuracy": 0.8522784233093261, "num_tokens": 777541022.0, "step": 23210 }, { "epoch": 1.3858046800382045, "grad_norm": 0.48758816719055176, "learning_rate": 1.5641506497936194e-05, "loss": 0.4312, "mean_token_accuracy": 0.8559107780456543, "num_tokens": 777708702.0, "step": 23215 }, { "epoch": 1.3861031518624642, "grad_norm": 0.5445257425308228, "learning_rate": 1.5632071047090573e-05, "loss": 0.4878, "mean_token_accuracy": 0.8381703734397888, "num_tokens": 777870190.0, "step": 23220 }, { "epoch": 1.386401623686724, "grad_norm": 0.5068190097808838, "learning_rate": 1.56226384868981e-05, "loss": 0.4363, "mean_token_accuracy": 0.8535488486289978, "num_tokens": 778037870.0, "step": 23225 }, { "epoch": 1.3867000955109838, "grad_norm": 0.47730064392089844, "learning_rate": 1.5613208819656256e-05, "loss": 0.4656, "mean_token_accuracy": 0.8452463388442993, "num_tokens": 778205550.0, "step": 23230 }, { "epoch": 1.3869985673352436, "grad_norm": 0.5039456486701965, "learning_rate": 1.560378204766181e-05, "loss": 0.4321, "mean_token_accuracy": 0.8557914853096008, "num_tokens": 778373230.0, "step": 23235 }, { "epoch": 1.3872970391595034, "grad_norm": 0.5148113369941711, "learning_rate": 1.5594358173210834e-05, "loss": 0.4448, "mean_token_accuracy": 0.85208158493042, "num_tokens": 778540910.0, "step": 23240 }, { "epoch": 1.3875955109837632, "grad_norm": 0.5091186761856079, "learning_rate": 1.558493719859867e-05, "loss": 0.4097, "mean_token_accuracy": 0.8638256072998047, "num_tokens": 778708590.0, "step": 23245 }, { "epoch": 1.387893982808023, "grad_norm": 0.49849849939346313, "learning_rate": 1.5575519126119976e-05, "loss": 0.4437, "mean_token_accuracy": 0.8517535328865051, "num_tokens": 778876270.0, "step": 23250 }, { "epoch": 1.3881924546322826, "grad_norm": 0.5224447250366211, "learning_rate": 1.5566103958068702e-05, "loss": 0.4487, "mean_token_accuracy": 0.8500298142433167, "num_tokens": 779043950.0, "step": 23255 }, { "epoch": 1.3884909264565426, "grad_norm": 0.5036640763282776, "learning_rate": 1.555669169673808e-05, "loss": 0.4273, "mean_token_accuracy": 0.8570022702217102, "num_tokens": 779211630.0, "step": 23260 }, { "epoch": 1.3887893982808022, "grad_norm": 0.46421748399734497, "learning_rate": 1.5547282344420656e-05, "loss": 0.4353, "mean_token_accuracy": 0.8537695288658143, "num_tokens": 779379310.0, "step": 23265 }, { "epoch": 1.3890878701050622, "grad_norm": 0.5558269023895264, "learning_rate": 1.553787590340823e-05, "loss": 0.4693, "mean_token_accuracy": 0.8448586463928223, "num_tokens": 779546990.0, "step": 23270 }, { "epoch": 1.3893863419293218, "grad_norm": 0.4932655394077301, "learning_rate": 1.552847237599192e-05, "loss": 0.4388, "mean_token_accuracy": 0.8518549561500549, "num_tokens": 779714670.0, "step": 23275 }, { "epoch": 1.3896848137535818, "grad_norm": 0.5218916535377502, "learning_rate": 1.5519071764462135e-05, "loss": 0.4493, "mean_token_accuracy": 0.8492663741111756, "num_tokens": 779882350.0, "step": 23280 }, { "epoch": 1.3899832855778413, "grad_norm": 0.5360414385795593, "learning_rate": 1.550967407110856e-05, "loss": 0.4125, "mean_token_accuracy": 0.8623464107513428, "num_tokens": 780050030.0, "step": 23285 }, { "epoch": 1.3902817574021014, "grad_norm": 0.4707147181034088, "learning_rate": 1.5500279298220186e-05, "loss": 0.4552, "mean_token_accuracy": 0.8484313607215881, "num_tokens": 780217710.0, "step": 23290 }, { "epoch": 1.390580229226361, "grad_norm": 0.4926894009113312, "learning_rate": 1.5490887448085273e-05, "loss": 0.4322, "mean_token_accuracy": 0.8560718059539795, "num_tokens": 780385390.0, "step": 23295 }, { "epoch": 1.390878701050621, "grad_norm": 0.5696619153022766, "learning_rate": 1.54814985229914e-05, "loss": 0.4376, "mean_token_accuracy": 0.8527303457260131, "num_tokens": 780550807.0, "step": 23300 }, { "epoch": 1.3911771728748805, "grad_norm": 0.49192309379577637, "learning_rate": 1.5472112525225385e-05, "loss": 0.4185, "mean_token_accuracy": 0.8599188804626465, "num_tokens": 780718487.0, "step": 23305 }, { "epoch": 1.3914756446991405, "grad_norm": 0.4632330536842346, "learning_rate": 1.5462729457073374e-05, "loss": 0.404, "mean_token_accuracy": 0.8642908215522767, "num_tokens": 780886167.0, "step": 23310 }, { "epoch": 1.3917741165234, "grad_norm": 0.4984997510910034, "learning_rate": 1.5453349320820787e-05, "loss": 0.4597, "mean_token_accuracy": 0.8462066054344177, "num_tokens": 781053847.0, "step": 23315 }, { "epoch": 1.39207258834766, "grad_norm": 0.5803594589233398, "learning_rate": 1.544397211875233e-05, "loss": 0.434, "mean_token_accuracy": 0.8580520153045654, "num_tokens": 781221527.0, "step": 23320 }, { "epoch": 1.3923710601719197, "grad_norm": 0.5138877034187317, "learning_rate": 1.5434597853152e-05, "loss": 0.4347, "mean_token_accuracy": 0.8547119140625, "num_tokens": 781389207.0, "step": 23325 }, { "epoch": 1.3926695319961795, "grad_norm": 0.47389793395996094, "learning_rate": 1.5425226526303055e-05, "loss": 0.4426, "mean_token_accuracy": 0.8544196724891663, "num_tokens": 781556887.0, "step": 23330 }, { "epoch": 1.3929680038204393, "grad_norm": 0.5190534591674805, "learning_rate": 1.5415858140488056e-05, "loss": 0.4936, "mean_token_accuracy": 0.8371704578399658, "num_tokens": 781724567.0, "step": 23335 }, { "epoch": 1.393266475644699, "grad_norm": 0.4836447536945343, "learning_rate": 1.540649269798887e-05, "loss": 0.439, "mean_token_accuracy": 0.8533937692642212, "num_tokens": 781892247.0, "step": 23340 }, { "epoch": 1.3935649474689589, "grad_norm": 0.5116270780563354, "learning_rate": 1.53971302010866e-05, "loss": 0.4405, "mean_token_accuracy": 0.8543540477752686, "num_tokens": 782059927.0, "step": 23345 }, { "epoch": 1.3938634192932187, "grad_norm": 0.5673056840896606, "learning_rate": 1.5387770652061677e-05, "loss": 0.4507, "mean_token_accuracy": 0.8505785584449768, "num_tokens": 782227607.0, "step": 23350 }, { "epoch": 1.3941618911174785, "grad_norm": 0.5157475471496582, "learning_rate": 1.5378414053193757e-05, "loss": 0.4351, "mean_token_accuracy": 0.8551354050636292, "num_tokens": 782395287.0, "step": 23355 }, { "epoch": 1.3944603629417383, "grad_norm": 0.5795453190803528, "learning_rate": 1.536906040676184e-05, "loss": 0.4537, "mean_token_accuracy": 0.8510717868804931, "num_tokens": 782557019.0, "step": 23360 }, { "epoch": 1.394758834765998, "grad_norm": 0.6179571151733398, "learning_rate": 1.5359709715044183e-05, "loss": 0.4141, "mean_token_accuracy": 0.8607181191444397, "num_tokens": 782724699.0, "step": 23365 }, { "epoch": 1.3950573065902578, "grad_norm": 0.4667505621910095, "learning_rate": 1.53503619803183e-05, "loss": 0.4475, "mean_token_accuracy": 0.8524215698242188, "num_tokens": 782892379.0, "step": 23370 }, { "epoch": 1.3953557784145176, "grad_norm": 0.6107589602470398, "learning_rate": 1.5341017204861014e-05, "loss": 0.4788, "mean_token_accuracy": 0.841703450679779, "num_tokens": 783060059.0, "step": 23375 }, { "epoch": 1.3956542502387774, "grad_norm": 0.4700370728969574, "learning_rate": 1.5331675390948415e-05, "loss": 0.4038, "mean_token_accuracy": 0.8635751008987427, "num_tokens": 783227739.0, "step": 23380 }, { "epoch": 1.3959527220630372, "grad_norm": 0.5494814515113831, "learning_rate": 1.5322336540855886e-05, "loss": 0.4671, "mean_token_accuracy": 0.8456399798393249, "num_tokens": 783395419.0, "step": 23385 }, { "epoch": 1.396251193887297, "grad_norm": 0.5478232502937317, "learning_rate": 1.5313000656858056e-05, "loss": 0.4768, "mean_token_accuracy": 0.8424668908119202, "num_tokens": 783563099.0, "step": 23390 }, { "epoch": 1.3965496657115568, "grad_norm": 0.5142637491226196, "learning_rate": 1.5303667741228866e-05, "loss": 0.4536, "mean_token_accuracy": 0.8494333744049072, "num_tokens": 783730779.0, "step": 23395 }, { "epoch": 1.3968481375358166, "grad_norm": 0.5252295732498169, "learning_rate": 1.5294337796241513e-05, "loss": 0.4666, "mean_token_accuracy": 0.8457652449607849, "num_tokens": 783898459.0, "step": 23400 }, { "epoch": 1.3971466093600764, "grad_norm": 0.5872812271118164, "learning_rate": 1.528501082416848e-05, "loss": 0.4876, "mean_token_accuracy": 0.8384528040885926, "num_tokens": 784066139.0, "step": 23405 }, { "epoch": 1.3974450811843362, "grad_norm": 0.5298374891281128, "learning_rate": 1.5275686827281537e-05, "loss": 0.4451, "mean_token_accuracy": 0.8518012642860413, "num_tokens": 784233819.0, "step": 23410 }, { "epoch": 1.397743553008596, "grad_norm": 0.4962978959083557, "learning_rate": 1.5266365807851677e-05, "loss": 0.4427, "mean_token_accuracy": 0.8532446622848511, "num_tokens": 784401499.0, "step": 23415 }, { "epoch": 1.3980420248328558, "grad_norm": 0.501723051071167, "learning_rate": 1.5257047768149247e-05, "loss": 0.4203, "mean_token_accuracy": 0.8596504926681519, "num_tokens": 784569179.0, "step": 23420 }, { "epoch": 1.3983404966571156, "grad_norm": 0.5250449776649475, "learning_rate": 1.5247732710443818e-05, "loss": 0.4297, "mean_token_accuracy": 0.8572885632514954, "num_tokens": 784736859.0, "step": 23425 }, { "epoch": 1.3986389684813754, "grad_norm": 0.5277840495109558, "learning_rate": 1.523842063700423e-05, "loss": 0.4388, "mean_token_accuracy": 0.8527496099472046, "num_tokens": 784904539.0, "step": 23430 }, { "epoch": 1.3989374403056352, "grad_norm": 0.537818431854248, "learning_rate": 1.5229111550098619e-05, "loss": 0.4736, "mean_token_accuracy": 0.8428009033203125, "num_tokens": 785072219.0, "step": 23435 }, { "epoch": 1.399235912129895, "grad_norm": 0.5120958685874939, "learning_rate": 1.5219805451994385e-05, "loss": 0.4111, "mean_token_accuracy": 0.862406063079834, "num_tokens": 785239899.0, "step": 23440 }, { "epoch": 1.3995343839541547, "grad_norm": 0.5241063237190247, "learning_rate": 1.52105023449582e-05, "loss": 0.4359, "mean_token_accuracy": 0.8544435024261474, "num_tokens": 785407579.0, "step": 23445 }, { "epoch": 1.3998328557784145, "grad_norm": 0.5484216809272766, "learning_rate": 1.5201202231256012e-05, "loss": 0.4457, "mean_token_accuracy": 0.8524036765098572, "num_tokens": 785575259.0, "step": 23450 }, { "epoch": 1.4001313276026743, "grad_norm": 0.4641386866569519, "learning_rate": 1.5191905113153027e-05, "loss": 0.453, "mean_token_accuracy": 0.8485923886299134, "num_tokens": 785742939.0, "step": 23455 }, { "epoch": 1.4004297994269341, "grad_norm": 0.4222840964794159, "learning_rate": 1.5182610992913727e-05, "loss": 0.4243, "mean_token_accuracy": 0.8578671097755433, "num_tokens": 785910619.0, "step": 23460 }, { "epoch": 1.400728271251194, "grad_norm": 0.45363545417785645, "learning_rate": 1.5173319872801878e-05, "loss": 0.459, "mean_token_accuracy": 0.8480078697204589, "num_tokens": 786078299.0, "step": 23465 }, { "epoch": 1.4010267430754537, "grad_norm": 0.49216949939727783, "learning_rate": 1.5164031755080496e-05, "loss": 0.4316, "mean_token_accuracy": 0.8564237117767334, "num_tokens": 786245979.0, "step": 23470 }, { "epoch": 1.4013252148997135, "grad_norm": 0.5190386772155762, "learning_rate": 1.5154746642011885e-05, "loss": 0.4278, "mean_token_accuracy": 0.8559704184532165, "num_tokens": 786413659.0, "step": 23475 }, { "epoch": 1.4016236867239733, "grad_norm": 0.5304282903671265, "learning_rate": 1.5145464535857578e-05, "loss": 0.4528, "mean_token_accuracy": 0.8485566139221191, "num_tokens": 786581339.0, "step": 23480 }, { "epoch": 1.401922158548233, "grad_norm": 0.5135473608970642, "learning_rate": 1.5136185438878443e-05, "loss": 0.4339, "mean_token_accuracy": 0.8545031547546387, "num_tokens": 786749019.0, "step": 23485 }, { "epoch": 1.402220630372493, "grad_norm": 0.5437111258506775, "learning_rate": 1.5126909353334543e-05, "loss": 0.4232, "mean_token_accuracy": 0.8568471908569336, "num_tokens": 786916699.0, "step": 23490 }, { "epoch": 1.4025191021967527, "grad_norm": 0.4763391315937042, "learning_rate": 1.5117636281485251e-05, "loss": 0.4258, "mean_token_accuracy": 0.8564177393913269, "num_tokens": 787084379.0, "step": 23495 }, { "epoch": 1.4028175740210125, "grad_norm": 0.5312849879264832, "learning_rate": 1.5108366225589193e-05, "loss": 0.4682, "mean_token_accuracy": 0.8442920088768006, "num_tokens": 787252059.0, "step": 23500 }, { "epoch": 1.4031160458452723, "grad_norm": 0.5117261409759521, "learning_rate": 1.5099099187904267e-05, "loss": 0.4543, "mean_token_accuracy": 0.8490874290466308, "num_tokens": 787419739.0, "step": 23505 }, { "epoch": 1.403414517669532, "grad_norm": 0.5443363785743713, "learning_rate": 1.5089835170687633e-05, "loss": 0.4474, "mean_token_accuracy": 0.851604425907135, "num_tokens": 787587419.0, "step": 23510 }, { "epoch": 1.4037129894937919, "grad_norm": 0.5582711696624756, "learning_rate": 1.5080574176195706e-05, "loss": 0.4421, "mean_token_accuracy": 0.8532864093780518, "num_tokens": 787755099.0, "step": 23515 }, { "epoch": 1.4040114613180517, "grad_norm": 0.5083257555961609, "learning_rate": 1.5071316206684172e-05, "loss": 0.4379, "mean_token_accuracy": 0.8533639430999755, "num_tokens": 787922779.0, "step": 23520 }, { "epoch": 1.4043099331423115, "grad_norm": 0.6606442928314209, "learning_rate": 1.5062061264407981e-05, "loss": 0.4681, "mean_token_accuracy": 0.8441488742828369, "num_tokens": 788090459.0, "step": 23525 }, { "epoch": 1.404608404966571, "grad_norm": 0.46060633659362793, "learning_rate": 1.505280935162135e-05, "loss": 0.3909, "mean_token_accuracy": 0.8679052948951721, "num_tokens": 788258139.0, "step": 23530 }, { "epoch": 1.404906876790831, "grad_norm": 0.48085835576057434, "learning_rate": 1.5043560470577756e-05, "loss": 0.4889, "mean_token_accuracy": 0.8374567747116088, "num_tokens": 788425819.0, "step": 23535 }, { "epoch": 1.4052053486150906, "grad_norm": 0.4876287877559662, "learning_rate": 1.5034314623529915e-05, "loss": 0.4375, "mean_token_accuracy": 0.8546999931335449, "num_tokens": 788593499.0, "step": 23540 }, { "epoch": 1.4055038204393506, "grad_norm": 0.5198888182640076, "learning_rate": 1.5025071812729851e-05, "loss": 0.4528, "mean_token_accuracy": 0.849361801147461, "num_tokens": 788761179.0, "step": 23545 }, { "epoch": 1.4058022922636102, "grad_norm": 0.48025810718536377, "learning_rate": 1.5015832040428802e-05, "loss": 0.3951, "mean_token_accuracy": 0.8678508639335633, "num_tokens": 788924345.0, "step": 23550 }, { "epoch": 1.4061007640878702, "grad_norm": 0.5333512425422668, "learning_rate": 1.5006595308877292e-05, "loss": 0.4878, "mean_token_accuracy": 0.8393355727195739, "num_tokens": 789092025.0, "step": 23555 }, { "epoch": 1.4063992359121298, "grad_norm": 0.5317134857177734, "learning_rate": 1.4997361620325092e-05, "loss": 0.4753, "mean_token_accuracy": 0.8442848682403564, "num_tokens": 789252200.0, "step": 23560 }, { "epoch": 1.4066977077363898, "grad_norm": 0.4926573932170868, "learning_rate": 1.4988130977021248e-05, "loss": 0.4196, "mean_token_accuracy": 0.8609658360481263, "num_tokens": 789417168.0, "step": 23565 }, { "epoch": 1.4069961795606494, "grad_norm": 0.5432858467102051, "learning_rate": 1.4978903381214052e-05, "loss": 0.47, "mean_token_accuracy": 0.8432363271713257, "num_tokens": 789584848.0, "step": 23570 }, { "epoch": 1.4072946513849094, "grad_norm": 0.5307675004005432, "learning_rate": 1.4969678835151038e-05, "loss": 0.4732, "mean_token_accuracy": 0.8439281940460205, "num_tokens": 789752528.0, "step": 23575 }, { "epoch": 1.407593123209169, "grad_norm": 0.5629969239234924, "learning_rate": 1.4960457341079032e-05, "loss": 0.4906, "mean_token_accuracy": 0.8383394956588746, "num_tokens": 789920208.0, "step": 23580 }, { "epoch": 1.407891595033429, "grad_norm": 0.5573857426643372, "learning_rate": 1.4951238901244093e-05, "loss": 0.4419, "mean_token_accuracy": 0.8512644648551941, "num_tokens": 790087888.0, "step": 23585 }, { "epoch": 1.4081900668576885, "grad_norm": 0.5876045227050781, "learning_rate": 1.4942023517891546e-05, "loss": 0.4536, "mean_token_accuracy": 0.8486162543296814, "num_tokens": 790255568.0, "step": 23590 }, { "epoch": 1.4084885386819483, "grad_norm": 0.5804503560066223, "learning_rate": 1.4932811193265966e-05, "loss": 0.4401, "mean_token_accuracy": 0.8550161004066468, "num_tokens": 790423248.0, "step": 23595 }, { "epoch": 1.4087870105062081, "grad_norm": 0.5259001851081848, "learning_rate": 1.492360192961118e-05, "loss": 0.4566, "mean_token_accuracy": 0.8480496406555176, "num_tokens": 790590928.0, "step": 23600 }, { "epoch": 1.409085482330468, "grad_norm": 0.5038285851478577, "learning_rate": 1.491439572917028e-05, "loss": 0.4463, "mean_token_accuracy": 0.8517774105072021, "num_tokens": 790758608.0, "step": 23605 }, { "epoch": 1.4093839541547277, "grad_norm": 0.5120645761489868, "learning_rate": 1.4905192594185602e-05, "loss": 0.4396, "mean_token_accuracy": 0.8551830649375916, "num_tokens": 790920496.0, "step": 23610 }, { "epoch": 1.4096824259789875, "grad_norm": 0.5468581914901733, "learning_rate": 1.489599252689874e-05, "loss": 0.4888, "mean_token_accuracy": 0.8406358003616333, "num_tokens": 791088176.0, "step": 23615 }, { "epoch": 1.4099808978032473, "grad_norm": 0.47131621837615967, "learning_rate": 1.488679552955054e-05, "loss": 0.4192, "mean_token_accuracy": 0.8598771333694458, "num_tokens": 791255856.0, "step": 23620 }, { "epoch": 1.410279369627507, "grad_norm": 0.49126923084259033, "learning_rate": 1.4877601604381102e-05, "loss": 0.4438, "mean_token_accuracy": 0.8514076113700867, "num_tokens": 791423536.0, "step": 23625 }, { "epoch": 1.410577841451767, "grad_norm": 0.49482589960098267, "learning_rate": 1.4868410753629786e-05, "loss": 0.4109, "mean_token_accuracy": 0.8625730752944947, "num_tokens": 791591216.0, "step": 23630 }, { "epoch": 1.4108763132760267, "grad_norm": 0.5141450762748718, "learning_rate": 1.4859222979535175e-05, "loss": 0.4162, "mean_token_accuracy": 0.8585052967071534, "num_tokens": 791758896.0, "step": 23635 }, { "epoch": 1.4111747851002865, "grad_norm": 0.5500440001487732, "learning_rate": 1.4850038284335132e-05, "loss": 0.461, "mean_token_accuracy": 0.8475366830825806, "num_tokens": 791926576.0, "step": 23640 }, { "epoch": 1.4114732569245463, "grad_norm": 0.5590739846229553, "learning_rate": 1.4840856670266751e-05, "loss": 0.4708, "mean_token_accuracy": 0.8436061024665833, "num_tokens": 792094256.0, "step": 23645 }, { "epoch": 1.411771728748806, "grad_norm": 0.4894876480102539, "learning_rate": 1.4831678139566393e-05, "loss": 0.4106, "mean_token_accuracy": 0.8630752325057983, "num_tokens": 792254732.0, "step": 23650 }, { "epoch": 1.4120702005730659, "grad_norm": 0.49888819456100464, "learning_rate": 1.4822502694469662e-05, "loss": 0.4403, "mean_token_accuracy": 0.8525647163391114, "num_tokens": 792422412.0, "step": 23655 }, { "epoch": 1.4123686723973257, "grad_norm": 0.5480368733406067, "learning_rate": 1.481333033721139e-05, "loss": 0.4754, "mean_token_accuracy": 0.8414350390434265, "num_tokens": 792590092.0, "step": 23660 }, { "epoch": 1.4126671442215855, "grad_norm": 0.5386296510696411, "learning_rate": 1.4804161070025674e-05, "loss": 0.4528, "mean_token_accuracy": 0.8493796944618225, "num_tokens": 792757772.0, "step": 23665 }, { "epoch": 1.4129656160458453, "grad_norm": 0.5429210662841797, "learning_rate": 1.4794994895145886e-05, "loss": 0.4352, "mean_token_accuracy": 0.8546999931335449, "num_tokens": 792925452.0, "step": 23670 }, { "epoch": 1.413264087870105, "grad_norm": 0.49123069643974304, "learning_rate": 1.478583181480459e-05, "loss": 0.4359, "mean_token_accuracy": 0.8555648326873779, "num_tokens": 793093132.0, "step": 23675 }, { "epoch": 1.4135625596943648, "grad_norm": 0.5216329097747803, "learning_rate": 1.4776671831233646e-05, "loss": 0.4558, "mean_token_accuracy": 0.8480555772781372, "num_tokens": 793260812.0, "step": 23680 }, { "epoch": 1.4138610315186246, "grad_norm": 0.5336837768554688, "learning_rate": 1.4767514946664102e-05, "loss": 0.4297, "mean_token_accuracy": 0.8548789143562316, "num_tokens": 793428492.0, "step": 23685 }, { "epoch": 1.4141595033428844, "grad_norm": 0.464883953332901, "learning_rate": 1.4758361163326315e-05, "loss": 0.4197, "mean_token_accuracy": 0.859036135673523, "num_tokens": 793596172.0, "step": 23690 }, { "epoch": 1.4144579751671442, "grad_norm": 0.470636785030365, "learning_rate": 1.4749210483449857e-05, "loss": 0.4199, "mean_token_accuracy": 0.8599009871482849, "num_tokens": 793763852.0, "step": 23695 }, { "epoch": 1.414756446991404, "grad_norm": 0.49245259165763855, "learning_rate": 1.4740062909263535e-05, "loss": 0.4092, "mean_token_accuracy": 0.8628534078598022, "num_tokens": 793931532.0, "step": 23700 }, { "epoch": 1.4150549188156638, "grad_norm": 0.4722106456756592, "learning_rate": 1.4730918442995412e-05, "loss": 0.3979, "mean_token_accuracy": 0.866378390789032, "num_tokens": 794099212.0, "step": 23705 }, { "epoch": 1.4153533906399236, "grad_norm": 0.566887617111206, "learning_rate": 1.472177708687279e-05, "loss": 0.4541, "mean_token_accuracy": 0.8495586276054382, "num_tokens": 794266892.0, "step": 23710 }, { "epoch": 1.4156518624641834, "grad_norm": 0.4816938638687134, "learning_rate": 1.4712638843122223e-05, "loss": 0.4414, "mean_token_accuracy": 0.8511391997337341, "num_tokens": 794434572.0, "step": 23715 }, { "epoch": 1.4159503342884432, "grad_norm": 0.44670966267585754, "learning_rate": 1.4703503713969488e-05, "loss": 0.4172, "mean_token_accuracy": 0.8612549185752869, "num_tokens": 794602252.0, "step": 23720 }, { "epoch": 1.416248806112703, "grad_norm": 0.5175304412841797, "learning_rate": 1.4694371701639623e-05, "loss": 0.426, "mean_token_accuracy": 0.8579744815826416, "num_tokens": 794769932.0, "step": 23725 }, { "epoch": 1.4165472779369628, "grad_norm": 0.48323628306388855, "learning_rate": 1.4685242808356892e-05, "loss": 0.4631, "mean_token_accuracy": 0.8474412322044372, "num_tokens": 794937612.0, "step": 23730 }, { "epoch": 1.4168457497612226, "grad_norm": 0.49873974919319153, "learning_rate": 1.4676117036344805e-05, "loss": 0.4103, "mean_token_accuracy": 0.8622569441795349, "num_tokens": 795105292.0, "step": 23735 }, { "epoch": 1.4171442215854824, "grad_norm": 0.5509675145149231, "learning_rate": 1.4666994387826128e-05, "loss": 0.4399, "mean_token_accuracy": 0.8523977041244507, "num_tokens": 795272972.0, "step": 23740 }, { "epoch": 1.4174426934097422, "grad_norm": 0.49592530727386475, "learning_rate": 1.4657874865022825e-05, "loss": 0.4393, "mean_token_accuracy": 0.8547954201698303, "num_tokens": 795440652.0, "step": 23745 }, { "epoch": 1.417741165234002, "grad_norm": 0.5493110418319702, "learning_rate": 1.4648758470156138e-05, "loss": 0.4459, "mean_token_accuracy": 0.8516700506210327, "num_tokens": 795608332.0, "step": 23750 }, { "epoch": 1.4180396370582617, "grad_norm": 0.47795289754867554, "learning_rate": 1.4639645205446545e-05, "loss": 0.4151, "mean_token_accuracy": 0.8609924912452698, "num_tokens": 795776012.0, "step": 23755 }, { "epoch": 1.4183381088825215, "grad_norm": 0.4751960337162018, "learning_rate": 1.4630535073113726e-05, "loss": 0.437, "mean_token_accuracy": 0.8548073530197143, "num_tokens": 795943692.0, "step": 23760 }, { "epoch": 1.4186365807067813, "grad_norm": 0.4811757802963257, "learning_rate": 1.4621428075376637e-05, "loss": 0.412, "mean_token_accuracy": 0.8597280263900757, "num_tokens": 796111372.0, "step": 23765 }, { "epoch": 1.4189350525310411, "grad_norm": 0.5161427855491638, "learning_rate": 1.461232421445345e-05, "loss": 0.452, "mean_token_accuracy": 0.8475486040115356, "num_tokens": 796279052.0, "step": 23770 }, { "epoch": 1.419233524355301, "grad_norm": 0.5029381513595581, "learning_rate": 1.4603223492561579e-05, "loss": 0.449, "mean_token_accuracy": 0.8512823581695557, "num_tokens": 796446732.0, "step": 23775 }, { "epoch": 1.4195319961795607, "grad_norm": 0.5676060318946838, "learning_rate": 1.4594125911917678e-05, "loss": 0.4636, "mean_token_accuracy": 0.8461529374122619, "num_tokens": 796614412.0, "step": 23780 }, { "epoch": 1.4198304680038205, "grad_norm": 0.5017569065093994, "learning_rate": 1.4585031474737623e-05, "loss": 0.4364, "mean_token_accuracy": 0.8541810750961304, "num_tokens": 796782092.0, "step": 23785 }, { "epoch": 1.4201289398280803, "grad_norm": 0.5884302854537964, "learning_rate": 1.4575940183236533e-05, "loss": 0.4895, "mean_token_accuracy": 0.8387987613677979, "num_tokens": 796949772.0, "step": 23790 }, { "epoch": 1.42042741165234, "grad_norm": 0.5938997268676758, "learning_rate": 1.4566852039628765e-05, "loss": 0.4497, "mean_token_accuracy": 0.8499463200569153, "num_tokens": 797117452.0, "step": 23795 }, { "epoch": 1.4207258834766, "grad_norm": 0.5110284686088562, "learning_rate": 1.4557767046127901e-05, "loss": 0.4375, "mean_token_accuracy": 0.8543958067893982, "num_tokens": 797285132.0, "step": 23800 }, { "epoch": 1.4210243553008595, "grad_norm": 0.5652061104774475, "learning_rate": 1.4548685204946766e-05, "loss": 0.4864, "mean_token_accuracy": 0.8398604154586792, "num_tokens": 797452812.0, "step": 23805 }, { "epoch": 1.4213228271251195, "grad_norm": 0.5034496784210205, "learning_rate": 1.4539606518297389e-05, "loss": 0.4698, "mean_token_accuracy": 0.8438506364822388, "num_tokens": 797620492.0, "step": 23810 }, { "epoch": 1.421621298949379, "grad_norm": 0.491308331489563, "learning_rate": 1.4530530988391075e-05, "loss": 0.4287, "mean_token_accuracy": 0.8562268853187561, "num_tokens": 797788172.0, "step": 23815 }, { "epoch": 1.421919770773639, "grad_norm": 0.4973183274269104, "learning_rate": 1.4521458617438328e-05, "loss": 0.4389, "mean_token_accuracy": 0.8530537962913514, "num_tokens": 797955852.0, "step": 23820 }, { "epoch": 1.4222182425978986, "grad_norm": 0.513624906539917, "learning_rate": 1.4512389407648897e-05, "loss": 0.4674, "mean_token_accuracy": 0.844411301612854, "num_tokens": 798123532.0, "step": 23825 }, { "epoch": 1.4225167144221587, "grad_norm": 0.484712690114975, "learning_rate": 1.450332336123173e-05, "loss": 0.4449, "mean_token_accuracy": 0.8539663553237915, "num_tokens": 798291212.0, "step": 23830 }, { "epoch": 1.4228151862464182, "grad_norm": 0.5327746868133545, "learning_rate": 1.4494260480395061e-05, "loss": 0.4185, "mean_token_accuracy": 0.8599427461624145, "num_tokens": 798458892.0, "step": 23835 }, { "epoch": 1.4231136580706782, "grad_norm": 0.4941696226596832, "learning_rate": 1.4485200767346318e-05, "loss": 0.4065, "mean_token_accuracy": 0.8639627695083618, "num_tokens": 798626572.0, "step": 23840 }, { "epoch": 1.4234121298949378, "grad_norm": 0.5326698422431946, "learning_rate": 1.4476144224292143e-05, "loss": 0.4656, "mean_token_accuracy": 0.8460157513618469, "num_tokens": 798794252.0, "step": 23845 }, { "epoch": 1.4237106017191978, "grad_norm": 0.5434609651565552, "learning_rate": 1.4467090853438436e-05, "loss": 0.4242, "mean_token_accuracy": 0.8582011222839355, "num_tokens": 798961932.0, "step": 23850 }, { "epoch": 1.4240090735434574, "grad_norm": 0.5191842913627625, "learning_rate": 1.4458040656990313e-05, "loss": 0.4477, "mean_token_accuracy": 0.85235595703125, "num_tokens": 799129612.0, "step": 23855 }, { "epoch": 1.4243075453677174, "grad_norm": 0.5037443041801453, "learning_rate": 1.4448993637152118e-05, "loss": 0.4313, "mean_token_accuracy": 0.8549206852912903, "num_tokens": 799297292.0, "step": 23860 }, { "epoch": 1.424606017191977, "grad_norm": 0.5144136548042297, "learning_rate": 1.4439949796127419e-05, "loss": 0.4616, "mean_token_accuracy": 0.846630084514618, "num_tokens": 799464972.0, "step": 23865 }, { "epoch": 1.4249044890162368, "grad_norm": 0.5368221402168274, "learning_rate": 1.4430909136118998e-05, "loss": 0.4251, "mean_token_accuracy": 0.8582726955413819, "num_tokens": 799632652.0, "step": 23870 }, { "epoch": 1.4252029608404966, "grad_norm": 0.5508019924163818, "learning_rate": 1.4421871659328895e-05, "loss": 0.4512, "mean_token_accuracy": 0.8493081092834472, "num_tokens": 799800332.0, "step": 23875 }, { "epoch": 1.4255014326647564, "grad_norm": 0.4998466968536377, "learning_rate": 1.4412837367958337e-05, "loss": 0.465, "mean_token_accuracy": 0.844071340560913, "num_tokens": 799968012.0, "step": 23880 }, { "epoch": 1.4257999044890162, "grad_norm": 0.4987039268016815, "learning_rate": 1.4403806264207803e-05, "loss": 0.4595, "mean_token_accuracy": 0.8475903749465943, "num_tokens": 800135692.0, "step": 23885 }, { "epoch": 1.426098376313276, "grad_norm": 0.5814368724822998, "learning_rate": 1.4394778350276977e-05, "loss": 0.4922, "mean_token_accuracy": 0.8407108426094055, "num_tokens": 800298660.0, "step": 23890 }, { "epoch": 1.4263968481375358, "grad_norm": 0.5098195672035217, "learning_rate": 1.4385753628364778e-05, "loss": 0.477, "mean_token_accuracy": 0.8422462105751037, "num_tokens": 800466340.0, "step": 23895 }, { "epoch": 1.4266953199617955, "grad_norm": 0.46268561482429504, "learning_rate": 1.4376732100669358e-05, "loss": 0.4119, "mean_token_accuracy": 0.8615889191627503, "num_tokens": 800634020.0, "step": 23900 }, { "epoch": 1.4269937917860553, "grad_norm": 0.5035937428474426, "learning_rate": 1.436771376938805e-05, "loss": 0.394, "mean_token_accuracy": 0.8673386573791504, "num_tokens": 800801700.0, "step": 23905 }, { "epoch": 1.4272922636103151, "grad_norm": 0.5187295079231262, "learning_rate": 1.435869863671745e-05, "loss": 0.4426, "mean_token_accuracy": 0.851503050327301, "num_tokens": 800969380.0, "step": 23910 }, { "epoch": 1.427590735434575, "grad_norm": 0.50020432472229, "learning_rate": 1.4349686704853355e-05, "loss": 0.4485, "mean_token_accuracy": 0.8506381869316101, "num_tokens": 801137060.0, "step": 23915 }, { "epoch": 1.4278892072588347, "grad_norm": 0.4603779911994934, "learning_rate": 1.434067797599079e-05, "loss": 0.4087, "mean_token_accuracy": 0.8621853709220886, "num_tokens": 801304740.0, "step": 23920 }, { "epoch": 1.4281876790830945, "grad_norm": 0.48090657591819763, "learning_rate": 1.4331672452324007e-05, "loss": 0.4581, "mean_token_accuracy": 0.8485804677009583, "num_tokens": 801472420.0, "step": 23925 }, { "epoch": 1.4284861509073543, "grad_norm": 0.45343217253685, "learning_rate": 1.4322670136046451e-05, "loss": 0.4555, "mean_token_accuracy": 0.8493558406829834, "num_tokens": 801640100.0, "step": 23930 }, { "epoch": 1.428784622731614, "grad_norm": 0.5139942169189453, "learning_rate": 1.4313671029350805e-05, "loss": 0.4535, "mean_token_accuracy": 0.8499522805213928, "num_tokens": 801807780.0, "step": 23935 }, { "epoch": 1.429083094555874, "grad_norm": 0.4862551689147949, "learning_rate": 1.4304675134428974e-05, "loss": 0.4312, "mean_token_accuracy": 0.8545210480690002, "num_tokens": 801975460.0, "step": 23940 }, { "epoch": 1.4293815663801337, "grad_norm": 0.5076653361320496, "learning_rate": 1.4295682453472073e-05, "loss": 0.4399, "mean_token_accuracy": 0.8531074762344361, "num_tokens": 802143140.0, "step": 23945 }, { "epoch": 1.4296800382043935, "grad_norm": 0.49128422141075134, "learning_rate": 1.4286692988670442e-05, "loss": 0.4283, "mean_token_accuracy": 0.8561255097389221, "num_tokens": 802310820.0, "step": 23950 }, { "epoch": 1.4299785100286533, "grad_norm": 0.6112308502197266, "learning_rate": 1.4277706742213603e-05, "loss": 0.4292, "mean_token_accuracy": 0.8564356327056885, "num_tokens": 802478500.0, "step": 23955 }, { "epoch": 1.430276981852913, "grad_norm": 0.4948352575302124, "learning_rate": 1.4268723716290362e-05, "loss": 0.4196, "mean_token_accuracy": 0.8581653356552124, "num_tokens": 802646180.0, "step": 23960 }, { "epoch": 1.4305754536771729, "grad_norm": 0.5185667872428894, "learning_rate": 1.4259743913088674e-05, "loss": 0.4348, "mean_token_accuracy": 0.8555648446083068, "num_tokens": 802813860.0, "step": 23965 }, { "epoch": 1.4308739255014327, "grad_norm": 0.5161097049713135, "learning_rate": 1.4250767334795743e-05, "loss": 0.4615, "mean_token_accuracy": 0.8464931726455689, "num_tokens": 802974882.0, "step": 23970 }, { "epoch": 1.4311723973256925, "grad_norm": 0.5076270699501038, "learning_rate": 1.4241793983597978e-05, "loss": 0.4539, "mean_token_accuracy": 0.849385678768158, "num_tokens": 803142562.0, "step": 23975 }, { "epoch": 1.4314708691499523, "grad_norm": 0.494201123714447, "learning_rate": 1.423282386168101e-05, "loss": 0.4179, "mean_token_accuracy": 0.8607717871665954, "num_tokens": 803310242.0, "step": 23980 }, { "epoch": 1.431769340974212, "grad_norm": 0.4579995572566986, "learning_rate": 1.4223856971229682e-05, "loss": 0.408, "mean_token_accuracy": 0.8639150619506836, "num_tokens": 803477922.0, "step": 23985 }, { "epoch": 1.4320678127984718, "grad_norm": 0.45281463861465454, "learning_rate": 1.4214893314428026e-05, "loss": 0.4142, "mean_token_accuracy": 0.8614099979400635, "num_tokens": 803645602.0, "step": 23990 }, { "epoch": 1.4323662846227316, "grad_norm": 0.5281286835670471, "learning_rate": 1.420593289345931e-05, "loss": 0.4645, "mean_token_accuracy": 0.8459799647331238, "num_tokens": 803813282.0, "step": 23995 }, { "epoch": 1.4326647564469914, "grad_norm": 0.516741156578064, "learning_rate": 1.4196975710506044e-05, "loss": 0.4532, "mean_token_accuracy": 0.8507038116455078, "num_tokens": 803980962.0, "step": 24000 }, { "epoch": 1.4329632282712512, "grad_norm": 0.4869138300418854, "learning_rate": 1.418802176774987e-05, "loss": 0.4414, "mean_token_accuracy": 0.8525348901748657, "num_tokens": 804148642.0, "step": 24005 }, { "epoch": 1.433261700095511, "grad_norm": 0.48890477418899536, "learning_rate": 1.4179071067371718e-05, "loss": 0.4346, "mean_token_accuracy": 0.854234766960144, "num_tokens": 804316322.0, "step": 24010 }, { "epoch": 1.4335601719197708, "grad_norm": 0.4909629821777344, "learning_rate": 1.4170123611551672e-05, "loss": 0.442, "mean_token_accuracy": 0.8507872939109802, "num_tokens": 804484002.0, "step": 24015 }, { "epoch": 1.4338586437440306, "grad_norm": 0.4792387783527374, "learning_rate": 1.4161179402469072e-05, "loss": 0.4363, "mean_token_accuracy": 0.856441605091095, "num_tokens": 804651682.0, "step": 24020 }, { "epoch": 1.4341571155682904, "grad_norm": 0.5897955298423767, "learning_rate": 1.415223844230243e-05, "loss": 0.4915, "mean_token_accuracy": 0.8374150156974792, "num_tokens": 804819362.0, "step": 24025 }, { "epoch": 1.4344555873925502, "grad_norm": 0.4968867897987366, "learning_rate": 1.4143300733229484e-05, "loss": 0.4306, "mean_token_accuracy": 0.8565012574195862, "num_tokens": 804987042.0, "step": 24030 }, { "epoch": 1.43475405921681, "grad_norm": 0.47743961215019226, "learning_rate": 1.413436627742718e-05, "loss": 0.4195, "mean_token_accuracy": 0.8582786560058594, "num_tokens": 805154722.0, "step": 24035 }, { "epoch": 1.4350525310410698, "grad_norm": 0.4852258861064911, "learning_rate": 1.4125435077071675e-05, "loss": 0.4175, "mean_token_accuracy": 0.8602051734924316, "num_tokens": 805322402.0, "step": 24040 }, { "epoch": 1.4353510028653296, "grad_norm": 0.5231012105941772, "learning_rate": 1.4116507134338328e-05, "loss": 0.4346, "mean_token_accuracy": 0.8538172602653503, "num_tokens": 805490082.0, "step": 24045 }, { "epoch": 1.4356494746895894, "grad_norm": 0.5689031481742859, "learning_rate": 1.410758245140169e-05, "loss": 0.4513, "mean_token_accuracy": 0.8490695357322693, "num_tokens": 805657762.0, "step": 24050 }, { "epoch": 1.4359479465138492, "grad_norm": 0.50330650806427, "learning_rate": 1.4098661030435545e-05, "loss": 0.4139, "mean_token_accuracy": 0.8612727999687195, "num_tokens": 805825442.0, "step": 24055 }, { "epoch": 1.436246418338109, "grad_norm": 0.5357939004898071, "learning_rate": 1.4089742873612866e-05, "loss": 0.4675, "mean_token_accuracy": 0.8457731962203979, "num_tokens": 805990390.0, "step": 24060 }, { "epoch": 1.4365448901623687, "grad_norm": 0.49177396297454834, "learning_rate": 1.4080827983105832e-05, "loss": 0.4519, "mean_token_accuracy": 0.8493379473686218, "num_tokens": 806158070.0, "step": 24065 }, { "epoch": 1.4368433619866285, "grad_norm": 0.4820666015148163, "learning_rate": 1.4071916361085846e-05, "loss": 0.4299, "mean_token_accuracy": 0.8576046705245972, "num_tokens": 806325750.0, "step": 24070 }, { "epoch": 1.4371418338108883, "grad_norm": 0.459162175655365, "learning_rate": 1.4063008009723472e-05, "loss": 0.4389, "mean_token_accuracy": 0.8532625555992126, "num_tokens": 806493430.0, "step": 24075 }, { "epoch": 1.437440305635148, "grad_norm": 0.5223341584205627, "learning_rate": 1.4054102931188507e-05, "loss": 0.4511, "mean_token_accuracy": 0.8505904793739318, "num_tokens": 806661110.0, "step": 24080 }, { "epoch": 1.437738777459408, "grad_norm": 0.4740830957889557, "learning_rate": 1.4045201127649976e-05, "loss": 0.4531, "mean_token_accuracy": 0.8490993618965149, "num_tokens": 806828790.0, "step": 24085 }, { "epoch": 1.4380372492836675, "grad_norm": 0.5166698694229126, "learning_rate": 1.4036302601276047e-05, "loss": 0.4724, "mean_token_accuracy": 0.8437253952026367, "num_tokens": 806996470.0, "step": 24090 }, { "epoch": 1.4383357211079275, "grad_norm": 0.5253591537475586, "learning_rate": 1.4027407354234128e-05, "loss": 0.4581, "mean_token_accuracy": 0.8484790682792663, "num_tokens": 807164150.0, "step": 24095 }, { "epoch": 1.438634192932187, "grad_norm": 0.4748472273349762, "learning_rate": 1.4018515388690828e-05, "loss": 0.4738, "mean_token_accuracy": 0.8433436632156373, "num_tokens": 807331830.0, "step": 24100 }, { "epoch": 1.438932664756447, "grad_norm": 0.4808465242385864, "learning_rate": 1.4009626706811945e-05, "loss": 0.4875, "mean_token_accuracy": 0.839824640750885, "num_tokens": 807499510.0, "step": 24105 }, { "epoch": 1.4392311365807067, "grad_norm": 0.48362985253334045, "learning_rate": 1.4000741310762489e-05, "loss": 0.4401, "mean_token_accuracy": 0.8532029032707215, "num_tokens": 807667190.0, "step": 24110 }, { "epoch": 1.4395296084049667, "grad_norm": 0.585921049118042, "learning_rate": 1.3991859202706641e-05, "loss": 0.4538, "mean_token_accuracy": 0.849123227596283, "num_tokens": 807834870.0, "step": 24115 }, { "epoch": 1.4398280802292263, "grad_norm": 0.5421547293663025, "learning_rate": 1.3982980384807824e-05, "loss": 0.4277, "mean_token_accuracy": 0.8572050571441651, "num_tokens": 808002550.0, "step": 24120 }, { "epoch": 1.4401265520534863, "grad_norm": 0.540352463722229, "learning_rate": 1.3974104859228624e-05, "loss": 0.436, "mean_token_accuracy": 0.8536025285720825, "num_tokens": 808170230.0, "step": 24125 }, { "epoch": 1.4404250238777458, "grad_norm": 0.5015534162521362, "learning_rate": 1.3965232628130854e-05, "loss": 0.4348, "mean_token_accuracy": 0.8538709282875061, "num_tokens": 808337910.0, "step": 24130 }, { "epoch": 1.4407234957020059, "grad_norm": 0.5280135273933411, "learning_rate": 1.3956363693675495e-05, "loss": 0.4403, "mean_token_accuracy": 0.8545210480690002, "num_tokens": 808505590.0, "step": 24135 }, { "epoch": 1.4410219675262654, "grad_norm": 0.5273571610450745, "learning_rate": 1.3947498058022734e-05, "loss": 0.4306, "mean_token_accuracy": 0.8572229385375977, "num_tokens": 808673270.0, "step": 24140 }, { "epoch": 1.4413204393505252, "grad_norm": 0.44331005215644836, "learning_rate": 1.393863572333199e-05, "loss": 0.3998, "mean_token_accuracy": 0.8642669677734375, "num_tokens": 808840950.0, "step": 24145 }, { "epoch": 1.441618911174785, "grad_norm": 0.4782272279262543, "learning_rate": 1.392977669176182e-05, "loss": 0.4611, "mean_token_accuracy": 0.8476082444190979, "num_tokens": 809008630.0, "step": 24150 }, { "epoch": 1.4419173829990448, "grad_norm": 0.542181134223938, "learning_rate": 1.3920920965470024e-05, "loss": 0.431, "mean_token_accuracy": 0.857670271396637, "num_tokens": 809176310.0, "step": 24155 }, { "epoch": 1.4422158548233046, "grad_norm": 0.4955284297466278, "learning_rate": 1.3912068546613549e-05, "loss": 0.4209, "mean_token_accuracy": 0.8584874153137207, "num_tokens": 809343990.0, "step": 24160 }, { "epoch": 1.4425143266475644, "grad_norm": 0.5144757628440857, "learning_rate": 1.3903219437348597e-05, "loss": 0.4345, "mean_token_accuracy": 0.8535727143287659, "num_tokens": 809511670.0, "step": 24165 }, { "epoch": 1.4428127984718242, "grad_norm": 0.5090042948722839, "learning_rate": 1.3894373639830521e-05, "loss": 0.4337, "mean_token_accuracy": 0.8532446622848511, "num_tokens": 809679350.0, "step": 24170 }, { "epoch": 1.443111270296084, "grad_norm": 0.510388970375061, "learning_rate": 1.3885531156213871e-05, "loss": 0.435, "mean_token_accuracy": 0.8543182611465454, "num_tokens": 809847030.0, "step": 24175 }, { "epoch": 1.4434097421203438, "grad_norm": 0.5230668187141418, "learning_rate": 1.3876691988652402e-05, "loss": 0.4288, "mean_token_accuracy": 0.8576643228530884, "num_tokens": 810014710.0, "step": 24180 }, { "epoch": 1.4437082139446036, "grad_norm": 0.5183838605880737, "learning_rate": 1.3867856139299057e-05, "loss": 0.4627, "mean_token_accuracy": 0.8450316309928894, "num_tokens": 810182390.0, "step": 24185 }, { "epoch": 1.4440066857688634, "grad_norm": 0.49484556913375854, "learning_rate": 1.3859023610305972e-05, "loss": 0.4082, "mean_token_accuracy": 0.862143623828888, "num_tokens": 810350070.0, "step": 24190 }, { "epoch": 1.4443051575931232, "grad_norm": 0.5739304423332214, "learning_rate": 1.3850194403824474e-05, "loss": 0.4923, "mean_token_accuracy": 0.8381963491439819, "num_tokens": 810517750.0, "step": 24195 }, { "epoch": 1.444603629417383, "grad_norm": 0.5037735104560852, "learning_rate": 1.3841368522005069e-05, "loss": 0.4144, "mean_token_accuracy": 0.8597638130187988, "num_tokens": 810685430.0, "step": 24200 }, { "epoch": 1.4449021012416428, "grad_norm": 0.4882184863090515, "learning_rate": 1.3832545966997468e-05, "loss": 0.4281, "mean_token_accuracy": 0.8564177513122558, "num_tokens": 810853110.0, "step": 24205 }, { "epoch": 1.4452005730659025, "grad_norm": 0.5717160105705261, "learning_rate": 1.382372674095057e-05, "loss": 0.4266, "mean_token_accuracy": 0.856692111492157, "num_tokens": 811020790.0, "step": 24210 }, { "epoch": 1.4454990448901623, "grad_norm": 0.5074063539505005, "learning_rate": 1.3814910846012463e-05, "loss": 0.4328, "mean_token_accuracy": 0.8543566107749939, "num_tokens": 811180279.0, "step": 24215 }, { "epoch": 1.4457975167144221, "grad_norm": 0.48674798011779785, "learning_rate": 1.3806098284330415e-05, "loss": 0.441, "mean_token_accuracy": 0.8518430113792419, "num_tokens": 811347959.0, "step": 24220 }, { "epoch": 1.446095988538682, "grad_norm": 0.5206977725028992, "learning_rate": 1.379728905805089e-05, "loss": 0.4394, "mean_token_accuracy": 0.8528152227401733, "num_tokens": 811515639.0, "step": 24225 }, { "epoch": 1.4463944603629417, "grad_norm": 0.4975236654281616, "learning_rate": 1.3788483169319544e-05, "loss": 0.4721, "mean_token_accuracy": 0.8434629559516906, "num_tokens": 811683319.0, "step": 24230 }, { "epoch": 1.4466929321872015, "grad_norm": 0.502943217754364, "learning_rate": 1.3779680620281204e-05, "loss": 0.4168, "mean_token_accuracy": 0.8585530281066894, "num_tokens": 811850999.0, "step": 24235 }, { "epoch": 1.4469914040114613, "grad_norm": 0.5049415230751038, "learning_rate": 1.3770881413079895e-05, "loss": 0.4657, "mean_token_accuracy": 0.845765233039856, "num_tokens": 812018679.0, "step": 24240 }, { "epoch": 1.447289875835721, "grad_norm": 0.49742600321769714, "learning_rate": 1.3762085549858828e-05, "loss": 0.4042, "mean_token_accuracy": 0.8638136625289917, "num_tokens": 812186359.0, "step": 24245 }, { "epoch": 1.447588347659981, "grad_norm": 0.45932161808013916, "learning_rate": 1.37532930327604e-05, "loss": 0.4286, "mean_token_accuracy": 0.8566980838775635, "num_tokens": 812354039.0, "step": 24250 }, { "epoch": 1.4478868194842407, "grad_norm": 0.5616112947463989, "learning_rate": 1.3744503863926195e-05, "loss": 0.4321, "mean_token_accuracy": 0.8550817131996155, "num_tokens": 812521719.0, "step": 24255 }, { "epoch": 1.4481852913085005, "grad_norm": 0.5444290637969971, "learning_rate": 1.3735718045496964e-05, "loss": 0.4383, "mean_token_accuracy": 0.852868914604187, "num_tokens": 812689399.0, "step": 24260 }, { "epoch": 1.4484837631327603, "grad_norm": 0.47080856561660767, "learning_rate": 1.3726935579612662e-05, "loss": 0.4324, "mean_token_accuracy": 0.8562447905540467, "num_tokens": 812857079.0, "step": 24265 }, { "epoch": 1.44878223495702, "grad_norm": 0.47317975759506226, "learning_rate": 1.3718156468412418e-05, "loss": 0.446, "mean_token_accuracy": 0.8499165058135987, "num_tokens": 813024759.0, "step": 24270 }, { "epoch": 1.4490807067812799, "grad_norm": 0.4811856746673584, "learning_rate": 1.370938071403456e-05, "loss": 0.4745, "mean_token_accuracy": 0.8426637291908264, "num_tokens": 813192439.0, "step": 24275 }, { "epoch": 1.4493791786055397, "grad_norm": 0.4584912955760956, "learning_rate": 1.370060831861657e-05, "loss": 0.4451, "mean_token_accuracy": 0.8509421825408936, "num_tokens": 813355594.0, "step": 24280 }, { "epoch": 1.4496776504297995, "grad_norm": 0.5099167227745056, "learning_rate": 1.3691839284295121e-05, "loss": 0.4181, "mean_token_accuracy": 0.8598353862762451, "num_tokens": 813523274.0, "step": 24285 }, { "epoch": 1.4499761222540593, "grad_norm": 0.5842281579971313, "learning_rate": 1.3683073613206101e-05, "loss": 0.4558, "mean_token_accuracy": 0.8484492540359497, "num_tokens": 813690954.0, "step": 24290 }, { "epoch": 1.450274594078319, "grad_norm": 0.5133460164070129, "learning_rate": 1.3674311307484528e-05, "loss": 0.4506, "mean_token_accuracy": 0.8483180284500123, "num_tokens": 813858634.0, "step": 24295 }, { "epoch": 1.4505730659025788, "grad_norm": 0.46325740218162537, "learning_rate": 1.3665552369264625e-05, "loss": 0.5106, "mean_token_accuracy": 0.831742811203003, "num_tokens": 814026314.0, "step": 24300 }, { "epoch": 1.4508715377268386, "grad_norm": 0.5090318322181702, "learning_rate": 1.36567968006798e-05, "loss": 0.4332, "mean_token_accuracy": 0.8538888096809387, "num_tokens": 814193994.0, "step": 24305 }, { "epoch": 1.4511700095510984, "grad_norm": 0.5046600103378296, "learning_rate": 1.3648044603862625e-05, "loss": 0.4608, "mean_token_accuracy": 0.8458785653114319, "num_tokens": 814361674.0, "step": 24310 }, { "epoch": 1.4514684813753582, "grad_norm": 0.48981788754463196, "learning_rate": 1.3639295780944877e-05, "loss": 0.4431, "mean_token_accuracy": 0.8519622921943665, "num_tokens": 814529354.0, "step": 24315 }, { "epoch": 1.451766953199618, "grad_norm": 0.5356281995773315, "learning_rate": 1.3630550334057473e-05, "loss": 0.4579, "mean_token_accuracy": 0.847280216217041, "num_tokens": 814697034.0, "step": 24320 }, { "epoch": 1.4520654250238778, "grad_norm": 0.4935963451862335, "learning_rate": 1.3621808265330536e-05, "loss": 0.4256, "mean_token_accuracy": 0.8570320844650269, "num_tokens": 814864714.0, "step": 24325 }, { "epoch": 1.4523638968481376, "grad_norm": 0.4961432218551636, "learning_rate": 1.3613069576893356e-05, "loss": 0.441, "mean_token_accuracy": 0.8521710634231567, "num_tokens": 815032394.0, "step": 24330 }, { "epoch": 1.4526623686723974, "grad_norm": 0.5234033465385437, "learning_rate": 1.3604334270874405e-05, "loss": 0.4401, "mean_token_accuracy": 0.852731716632843, "num_tokens": 815200074.0, "step": 24335 }, { "epoch": 1.4529608404966572, "grad_norm": 0.4845702052116394, "learning_rate": 1.3595602349401333e-05, "loss": 0.4787, "mean_token_accuracy": 0.8402958273887634, "num_tokens": 815367754.0, "step": 24340 }, { "epoch": 1.453259312320917, "grad_norm": 0.6409403085708618, "learning_rate": 1.3586873814600941e-05, "loss": 0.4563, "mean_token_accuracy": 0.8497852921485901, "num_tokens": 815535434.0, "step": 24345 }, { "epoch": 1.4535577841451768, "grad_norm": 0.5014445185661316, "learning_rate": 1.3578148668599256e-05, "loss": 0.4535, "mean_token_accuracy": 0.8493021726608276, "num_tokens": 815703114.0, "step": 24350 }, { "epoch": 1.4538562559694364, "grad_norm": 0.529164731502533, "learning_rate": 1.3569426913521422e-05, "loss": 0.4419, "mean_token_accuracy": 0.851264476776123, "num_tokens": 815870794.0, "step": 24355 }, { "epoch": 1.4541547277936964, "grad_norm": 0.5152989029884338, "learning_rate": 1.3560708551491785e-05, "loss": 0.4573, "mean_token_accuracy": 0.8457413792610169, "num_tokens": 816038474.0, "step": 24360 }, { "epoch": 1.454453199617956, "grad_norm": 0.4938627779483795, "learning_rate": 1.355199358463388e-05, "loss": 0.4583, "mean_token_accuracy": 0.847894549369812, "num_tokens": 816206154.0, "step": 24365 }, { "epoch": 1.454751671442216, "grad_norm": 0.5390340089797974, "learning_rate": 1.354328201507038e-05, "loss": 0.4548, "mean_token_accuracy": 0.8480436563491821, "num_tokens": 816373834.0, "step": 24370 }, { "epoch": 1.4550501432664755, "grad_norm": 0.4630414545536041, "learning_rate": 1.3534573844923171e-05, "loss": 0.381, "mean_token_accuracy": 0.8706250667572022, "num_tokens": 816541514.0, "step": 24375 }, { "epoch": 1.4553486150907355, "grad_norm": 0.5235395431518555, "learning_rate": 1.3525869076313263e-05, "loss": 0.4473, "mean_token_accuracy": 0.8517833590507508, "num_tokens": 816709194.0, "step": 24380 }, { "epoch": 1.4556470869149951, "grad_norm": 0.48764026165008545, "learning_rate": 1.3517167711360873e-05, "loss": 0.4042, "mean_token_accuracy": 0.8617141723632813, "num_tokens": 816876874.0, "step": 24385 }, { "epoch": 1.4559455587392551, "grad_norm": 0.5036116242408752, "learning_rate": 1.3508469752185382e-05, "loss": 0.458, "mean_token_accuracy": 0.8485387086868286, "num_tokens": 817044554.0, "step": 24390 }, { "epoch": 1.4562440305635147, "grad_norm": 0.5442743897438049, "learning_rate": 1.349977520090534e-05, "loss": 0.4369, "mean_token_accuracy": 0.8539186358451843, "num_tokens": 817212234.0, "step": 24395 }, { "epoch": 1.4565425023877747, "grad_norm": 0.5036551356315613, "learning_rate": 1.3491084059638468e-05, "loss": 0.4557, "mean_token_accuracy": 0.8475008845329285, "num_tokens": 817379914.0, "step": 24400 }, { "epoch": 1.4568409742120343, "grad_norm": 0.511569082736969, "learning_rate": 1.348239633050164e-05, "loss": 0.4501, "mean_token_accuracy": 0.8490695476531982, "num_tokens": 817547594.0, "step": 24405 }, { "epoch": 1.4571394460362943, "grad_norm": 0.47128644585609436, "learning_rate": 1.3473712015610913e-05, "loss": 0.4643, "mean_token_accuracy": 0.8441130876541137, "num_tokens": 817715274.0, "step": 24410 }, { "epoch": 1.4574379178605539, "grad_norm": 0.5112484097480774, "learning_rate": 1.3465031117081533e-05, "loss": 0.4727, "mean_token_accuracy": 0.8451151132583619, "num_tokens": 817882954.0, "step": 24415 }, { "epoch": 1.4577363896848137, "grad_norm": 0.523012638092041, "learning_rate": 1.3456353637027879e-05, "loss": 0.4033, "mean_token_accuracy": 0.8637540340423584, "num_tokens": 818050634.0, "step": 24420 }, { "epoch": 1.4580348615090735, "grad_norm": 0.5245733857154846, "learning_rate": 1.344767957756351e-05, "loss": 0.4593, "mean_token_accuracy": 0.8489562273025513, "num_tokens": 818218314.0, "step": 24425 }, { "epoch": 1.4583333333333333, "grad_norm": 0.4996326267719269, "learning_rate": 1.3439008940801156e-05, "loss": 0.427, "mean_token_accuracy": 0.8570380568504333, "num_tokens": 818385994.0, "step": 24430 }, { "epoch": 1.458631805157593, "grad_norm": 0.4786929786205292, "learning_rate": 1.3430341728852708e-05, "loss": 0.4061, "mean_token_accuracy": 0.8649528861045838, "num_tokens": 818553674.0, "step": 24435 }, { "epoch": 1.4589302769818528, "grad_norm": 0.552032470703125, "learning_rate": 1.3421677943829236e-05, "loss": 0.4532, "mean_token_accuracy": 0.8496182799339295, "num_tokens": 818721354.0, "step": 24440 }, { "epoch": 1.4592287488061126, "grad_norm": 0.4969017803668976, "learning_rate": 1.3413017587840953e-05, "loss": 0.4282, "mean_token_accuracy": 0.8564296722412109, "num_tokens": 818889034.0, "step": 24445 }, { "epoch": 1.4595272206303724, "grad_norm": 0.5180125832557678, "learning_rate": 1.340436066299725e-05, "loss": 0.4584, "mean_token_accuracy": 0.8482822299003601, "num_tokens": 819056714.0, "step": 24450 }, { "epoch": 1.4598256924546322, "grad_norm": 0.5134487748146057, "learning_rate": 1.3395707171406685e-05, "loss": 0.4556, "mean_token_accuracy": 0.8473577499389648, "num_tokens": 819224394.0, "step": 24455 }, { "epoch": 1.460124164278892, "grad_norm": 0.490887314081192, "learning_rate": 1.3387057115176977e-05, "loss": 0.4321, "mean_token_accuracy": 0.8564773797988892, "num_tokens": 819392074.0, "step": 24460 }, { "epoch": 1.4604226361031518, "grad_norm": 0.5028799176216125, "learning_rate": 1.3378410496415e-05, "loss": 0.4423, "mean_token_accuracy": 0.8521054506301879, "num_tokens": 819559754.0, "step": 24465 }, { "epoch": 1.4607211079274116, "grad_norm": 0.48976489901542664, "learning_rate": 1.3369767317226795e-05, "loss": 0.4399, "mean_token_accuracy": 0.8515448093414306, "num_tokens": 819727434.0, "step": 24470 }, { "epoch": 1.4610195797516714, "grad_norm": 0.5112046003341675, "learning_rate": 1.3361127579717586e-05, "loss": 0.4033, "mean_token_accuracy": 0.8626385688781738, "num_tokens": 819890728.0, "step": 24475 }, { "epoch": 1.4613180515759312, "grad_norm": 0.47551229596138, "learning_rate": 1.335249128599172e-05, "loss": 0.4397, "mean_token_accuracy": 0.8535846471786499, "num_tokens": 820058408.0, "step": 24480 }, { "epoch": 1.461616523400191, "grad_norm": 0.6014630198478699, "learning_rate": 1.3343858438152745e-05, "loss": 0.4642, "mean_token_accuracy": 0.8459501385688781, "num_tokens": 820226088.0, "step": 24485 }, { "epoch": 1.4619149952244508, "grad_norm": 0.5242108106613159, "learning_rate": 1.3335229038303326e-05, "loss": 0.4341, "mean_token_accuracy": 0.8538053154945373, "num_tokens": 820393768.0, "step": 24490 }, { "epoch": 1.4622134670487106, "grad_norm": 0.5242660045623779, "learning_rate": 1.3326603088545328e-05, "loss": 0.4188, "mean_token_accuracy": 0.8590838551521301, "num_tokens": 820561448.0, "step": 24495 }, { "epoch": 1.4625119388729704, "grad_norm": 0.457776814699173, "learning_rate": 1.3317980590979774e-05, "loss": 0.4054, "mean_token_accuracy": 0.8620660781860352, "num_tokens": 820729128.0, "step": 24500 }, { "epoch": 1.4628104106972302, "grad_norm": 0.5410901308059692, "learning_rate": 1.3309361547706806e-05, "loss": 0.4486, "mean_token_accuracy": 0.8497972011566162, "num_tokens": 820896808.0, "step": 24505 }, { "epoch": 1.46310888252149, "grad_norm": 0.5776258707046509, "learning_rate": 1.3300745960825762e-05, "loss": 0.441, "mean_token_accuracy": 0.8529523968696594, "num_tokens": 821064488.0, "step": 24510 }, { "epoch": 1.4634073543457498, "grad_norm": 0.4791201949119568, "learning_rate": 1.3292133832435122e-05, "loss": 0.3998, "mean_token_accuracy": 0.8648276209831238, "num_tokens": 821232168.0, "step": 24515 }, { "epoch": 1.4637058261700095, "grad_norm": 0.5130822062492371, "learning_rate": 1.3283525164632543e-05, "loss": 0.4258, "mean_token_accuracy": 0.8583919882774353, "num_tokens": 821399848.0, "step": 24520 }, { "epoch": 1.4640042979942693, "grad_norm": 0.5081445574760437, "learning_rate": 1.327491995951482e-05, "loss": 0.4291, "mean_token_accuracy": 0.8570738315582276, "num_tokens": 821567528.0, "step": 24525 }, { "epoch": 1.4643027698185291, "grad_norm": 0.4760996103286743, "learning_rate": 1.3266318219177898e-05, "loss": 0.3947, "mean_token_accuracy": 0.8671120166778564, "num_tokens": 821735208.0, "step": 24530 }, { "epoch": 1.464601241642789, "grad_norm": 0.4987267851829529, "learning_rate": 1.3257719945716896e-05, "loss": 0.4242, "mean_token_accuracy": 0.8592091083526612, "num_tokens": 821902888.0, "step": 24535 }, { "epoch": 1.4648997134670487, "grad_norm": 0.5035992860794067, "learning_rate": 1.3249125141226088e-05, "loss": 0.4473, "mean_token_accuracy": 0.850876784324646, "num_tokens": 822070568.0, "step": 24540 }, { "epoch": 1.4651981852913085, "grad_norm": 0.4848971962928772, "learning_rate": 1.3240533807798888e-05, "loss": 0.4039, "mean_token_accuracy": 0.8631754755973816, "num_tokens": 822238248.0, "step": 24545 }, { "epoch": 1.4654966571155683, "grad_norm": 0.5123375654220581, "learning_rate": 1.323194594752788e-05, "loss": 0.4343, "mean_token_accuracy": 0.8553023934364319, "num_tokens": 822405928.0, "step": 24550 }, { "epoch": 1.465795128939828, "grad_norm": 0.5385374426841736, "learning_rate": 1.3223361562504796e-05, "loss": 0.4049, "mean_token_accuracy": 0.8639747023582458, "num_tokens": 822573608.0, "step": 24555 }, { "epoch": 1.466093600764088, "grad_norm": 0.5345389246940613, "learning_rate": 1.3214780654820527e-05, "loss": 0.4636, "mean_token_accuracy": 0.8473994851112365, "num_tokens": 822741288.0, "step": 24560 }, { "epoch": 1.4663920725883477, "grad_norm": 0.49947449564933777, "learning_rate": 1.3206203226565098e-05, "loss": 0.4104, "mean_token_accuracy": 0.8613324522972107, "num_tokens": 822908968.0, "step": 24565 }, { "epoch": 1.4666905444126075, "grad_norm": 0.5049331188201904, "learning_rate": 1.3197629279827705e-05, "loss": 0.4296, "mean_token_accuracy": 0.8557437658309937, "num_tokens": 823076648.0, "step": 24570 }, { "epoch": 1.4669890162368673, "grad_norm": 0.5047481656074524, "learning_rate": 1.3189058816696692e-05, "loss": 0.4694, "mean_token_accuracy": 0.8450554609298706, "num_tokens": 823244328.0, "step": 24575 }, { "epoch": 1.467287488061127, "grad_norm": 0.52135169506073, "learning_rate": 1.3180491839259557e-05, "loss": 0.4115, "mean_token_accuracy": 0.8619288921356201, "num_tokens": 823412008.0, "step": 24580 }, { "epoch": 1.4675859598853869, "grad_norm": 0.4863163232803345, "learning_rate": 1.3171928349602947e-05, "loss": 0.4352, "mean_token_accuracy": 0.8533818364143372, "num_tokens": 823579688.0, "step": 24585 }, { "epoch": 1.4678844317096467, "grad_norm": 0.5247222185134888, "learning_rate": 1.3163368349812643e-05, "loss": 0.4307, "mean_token_accuracy": 0.8565191507339478, "num_tokens": 823747368.0, "step": 24590 }, { "epoch": 1.4681829035339065, "grad_norm": 0.499421626329422, "learning_rate": 1.3154811841973608e-05, "loss": 0.4428, "mean_token_accuracy": 0.8511213183403015, "num_tokens": 823915048.0, "step": 24595 }, { "epoch": 1.4684813753581663, "grad_norm": 0.5525295734405518, "learning_rate": 1.3146258828169927e-05, "loss": 0.4732, "mean_token_accuracy": 0.8406091332435608, "num_tokens": 824075951.0, "step": 24600 }, { "epoch": 1.468779847182426, "grad_norm": 0.5224230289459229, "learning_rate": 1.3137709310484847e-05, "loss": 0.4708, "mean_token_accuracy": 0.8445663809776306, "num_tokens": 824243631.0, "step": 24605 }, { "epoch": 1.4690783190066858, "grad_norm": 0.5163947343826294, "learning_rate": 1.3129163291000773e-05, "loss": 0.4537, "mean_token_accuracy": 0.8477394700050354, "num_tokens": 824411311.0, "step": 24610 }, { "epoch": 1.4693767908309456, "grad_norm": 0.5577158331871033, "learning_rate": 1.3120620771799216e-05, "loss": 0.4981, "mean_token_accuracy": 0.835864245891571, "num_tokens": 824578991.0, "step": 24615 }, { "epoch": 1.4696752626552054, "grad_norm": 0.4999620020389557, "learning_rate": 1.31120817549609e-05, "loss": 0.4491, "mean_token_accuracy": 0.8503220677375793, "num_tokens": 824746671.0, "step": 24620 }, { "epoch": 1.4699737344794652, "grad_norm": 0.46030867099761963, "learning_rate": 1.3103546242565635e-05, "loss": 0.4297, "mean_token_accuracy": 0.8575867772102356, "num_tokens": 824914351.0, "step": 24625 }, { "epoch": 1.4702722063037248, "grad_norm": 0.5278317928314209, "learning_rate": 1.3095014236692413e-05, "loss": 0.437, "mean_token_accuracy": 0.853095543384552, "num_tokens": 825082031.0, "step": 24630 }, { "epoch": 1.4705706781279848, "grad_norm": 0.5900847911834717, "learning_rate": 1.3086485739419358e-05, "loss": 0.4483, "mean_token_accuracy": 0.8494751334190369, "num_tokens": 825249711.0, "step": 24635 }, { "epoch": 1.4708691499522444, "grad_norm": 0.5356927514076233, "learning_rate": 1.3077960752823742e-05, "loss": 0.4301, "mean_token_accuracy": 0.8565250992774963, "num_tokens": 825417391.0, "step": 24640 }, { "epoch": 1.4711676217765044, "grad_norm": 0.5412192940711975, "learning_rate": 1.3069439278981993e-05, "loss": 0.4273, "mean_token_accuracy": 0.8574078559875489, "num_tokens": 825585071.0, "step": 24645 }, { "epoch": 1.471466093600764, "grad_norm": 0.501054048538208, "learning_rate": 1.306092131996966e-05, "loss": 0.441, "mean_token_accuracy": 0.851968276500702, "num_tokens": 825752751.0, "step": 24650 }, { "epoch": 1.471764565425024, "grad_norm": 0.5443149209022522, "learning_rate": 1.305240687786145e-05, "loss": 0.4348, "mean_token_accuracy": 0.8554694056510925, "num_tokens": 825920431.0, "step": 24655 }, { "epoch": 1.4720630372492836, "grad_norm": 0.48157799243927, "learning_rate": 1.3043895954731222e-05, "loss": 0.4354, "mean_token_accuracy": 0.8546224474906922, "num_tokens": 826088111.0, "step": 24660 }, { "epoch": 1.4723615090735436, "grad_norm": 0.5196056365966797, "learning_rate": 1.3035388552651962e-05, "loss": 0.4246, "mean_token_accuracy": 0.8593096733093262, "num_tokens": 826251237.0, "step": 24665 }, { "epoch": 1.4726599808978031, "grad_norm": 0.48469680547714233, "learning_rate": 1.3026884673695813e-05, "loss": 0.4362, "mean_token_accuracy": 0.8536979556083679, "num_tokens": 826418917.0, "step": 24670 }, { "epoch": 1.4729584527220632, "grad_norm": 0.5830373764038086, "learning_rate": 1.301838431993403e-05, "loss": 0.4656, "mean_token_accuracy": 0.8446558356285095, "num_tokens": 826586597.0, "step": 24675 }, { "epoch": 1.4732569245463227, "grad_norm": 0.45847535133361816, "learning_rate": 1.3009887493437061e-05, "loss": 0.4659, "mean_token_accuracy": 0.8450017929077148, "num_tokens": 826754277.0, "step": 24680 }, { "epoch": 1.4735553963705827, "grad_norm": 0.5602812170982361, "learning_rate": 1.3001394196274441e-05, "loss": 0.4656, "mean_token_accuracy": 0.8452344059944152, "num_tokens": 826921957.0, "step": 24685 }, { "epoch": 1.4738538681948423, "grad_norm": 0.5139380097389221, "learning_rate": 1.299290443051488e-05, "loss": 0.4183, "mean_token_accuracy": 0.8604556798934937, "num_tokens": 827089637.0, "step": 24690 }, { "epoch": 1.4741523400191021, "grad_norm": 0.5584856271743774, "learning_rate": 1.2984418198226211e-05, "loss": 0.4624, "mean_token_accuracy": 0.8476619482040405, "num_tokens": 827257317.0, "step": 24695 }, { "epoch": 1.474450811843362, "grad_norm": 0.47130510210990906, "learning_rate": 1.2975935501475422e-05, "loss": 0.4023, "mean_token_accuracy": 0.864010500907898, "num_tokens": 827424997.0, "step": 24700 }, { "epoch": 1.4747492836676217, "grad_norm": 0.4731478989124298, "learning_rate": 1.2967456342328629e-05, "loss": 0.4778, "mean_token_accuracy": 0.8413694381713868, "num_tokens": 827592677.0, "step": 24705 }, { "epoch": 1.4750477554918815, "grad_norm": 0.5441104769706726, "learning_rate": 1.2958980722851078e-05, "loss": 0.4685, "mean_token_accuracy": 0.8445842742919922, "num_tokens": 827760357.0, "step": 24710 }, { "epoch": 1.4753462273161413, "grad_norm": 0.5230847001075745, "learning_rate": 1.2950508645107168e-05, "loss": 0.4395, "mean_token_accuracy": 0.851980197429657, "num_tokens": 827928037.0, "step": 24715 }, { "epoch": 1.475644699140401, "grad_norm": 0.5109076499938965, "learning_rate": 1.2942040111160433e-05, "loss": 0.4625, "mean_token_accuracy": 0.8463080048561096, "num_tokens": 828095717.0, "step": 24720 }, { "epoch": 1.4759431709646609, "grad_norm": 0.4509248733520508, "learning_rate": 1.2933575123073532e-05, "loss": 0.4547, "mean_token_accuracy": 0.8495705604553223, "num_tokens": 828263397.0, "step": 24725 }, { "epoch": 1.4762416427889207, "grad_norm": 0.4846537709236145, "learning_rate": 1.2925113682908287e-05, "loss": 0.4272, "mean_token_accuracy": 0.8562626719474793, "num_tokens": 828431077.0, "step": 24730 }, { "epoch": 1.4765401146131805, "grad_norm": 0.5724625587463379, "learning_rate": 1.2916655792725621e-05, "loss": 0.4549, "mean_token_accuracy": 0.8486401081085205, "num_tokens": 828598757.0, "step": 24735 }, { "epoch": 1.4768385864374403, "grad_norm": 0.5184370875358582, "learning_rate": 1.2908201454585602e-05, "loss": 0.4462, "mean_token_accuracy": 0.8523917555809021, "num_tokens": 828766437.0, "step": 24740 }, { "epoch": 1.4771370582617, "grad_norm": 0.5459449291229248, "learning_rate": 1.2899750670547473e-05, "loss": 0.4942, "mean_token_accuracy": 0.8369617104530335, "num_tokens": 828934117.0, "step": 24745 }, { "epoch": 1.4774355300859598, "grad_norm": 0.49026742577552795, "learning_rate": 1.2891303442669545e-05, "loss": 0.4311, "mean_token_accuracy": 0.8568054437637329, "num_tokens": 829101797.0, "step": 24750 }, { "epoch": 1.4777340019102196, "grad_norm": 0.4901483654975891, "learning_rate": 1.2882859773009314e-05, "loss": 0.441, "mean_token_accuracy": 0.8519622921943665, "num_tokens": 829269477.0, "step": 24755 }, { "epoch": 1.4780324737344794, "grad_norm": 0.5515833497047424, "learning_rate": 1.2874419663623377e-05, "loss": 0.4712, "mean_token_accuracy": 0.8429500222206116, "num_tokens": 829437157.0, "step": 24760 }, { "epoch": 1.4783309455587392, "grad_norm": 0.5111850500106812, "learning_rate": 1.2865983116567499e-05, "loss": 0.478, "mean_token_accuracy": 0.8412501335144043, "num_tokens": 829604837.0, "step": 24765 }, { "epoch": 1.478629417382999, "grad_norm": 0.5199674963951111, "learning_rate": 1.2857550133896548e-05, "loss": 0.4415, "mean_token_accuracy": 0.8518072366714478, "num_tokens": 829772517.0, "step": 24770 }, { "epoch": 1.4789278892072588, "grad_norm": 0.5170202851295471, "learning_rate": 1.2849120717664526e-05, "loss": 0.4446, "mean_token_accuracy": 0.8533937931060791, "num_tokens": 829940197.0, "step": 24775 }, { "epoch": 1.4792263610315186, "grad_norm": 0.48555731773376465, "learning_rate": 1.2840694869924574e-05, "loss": 0.4068, "mean_token_accuracy": 0.8628414750099183, "num_tokens": 830107877.0, "step": 24780 }, { "epoch": 1.4795248328557784, "grad_norm": 0.5514911413192749, "learning_rate": 1.2832272592728966e-05, "loss": 0.4159, "mean_token_accuracy": 0.8612310647964477, "num_tokens": 830275557.0, "step": 24785 }, { "epoch": 1.4798233046800382, "grad_norm": 0.4827394485473633, "learning_rate": 1.2823853888129116e-05, "loss": 0.4425, "mean_token_accuracy": 0.8533102750778199, "num_tokens": 830443237.0, "step": 24790 }, { "epoch": 1.480121776504298, "grad_norm": 0.44826555252075195, "learning_rate": 1.281543875817553e-05, "loss": 0.4234, "mean_token_accuracy": 0.8591554403305054, "num_tokens": 830610917.0, "step": 24795 }, { "epoch": 1.4804202483285578, "grad_norm": 0.49727168679237366, "learning_rate": 1.2807027204917877e-05, "loss": 0.4273, "mean_token_accuracy": 0.8564177513122558, "num_tokens": 830778597.0, "step": 24800 }, { "epoch": 1.4807187201528176, "grad_norm": 0.5396794080734253, "learning_rate": 1.2798619230404962e-05, "loss": 0.4204, "mean_token_accuracy": 0.8594834923744201, "num_tokens": 830946277.0, "step": 24805 }, { "epoch": 1.4810171919770774, "grad_norm": 0.5203003287315369, "learning_rate": 1.279021483668468e-05, "loss": 0.4344, "mean_token_accuracy": 0.8545449018478394, "num_tokens": 831113957.0, "step": 24810 }, { "epoch": 1.4813156638013372, "grad_norm": 0.5016398429870605, "learning_rate": 1.2781814025804095e-05, "loss": 0.4268, "mean_token_accuracy": 0.8572229504585266, "num_tokens": 831281637.0, "step": 24815 }, { "epoch": 1.481614135625597, "grad_norm": 0.5529849529266357, "learning_rate": 1.2773416799809357e-05, "loss": 0.4443, "mean_token_accuracy": 0.8507992386817932, "num_tokens": 831449317.0, "step": 24820 }, { "epoch": 1.4819126074498568, "grad_norm": 0.8989988565444946, "learning_rate": 1.2765023160745788e-05, "loss": 0.4522, "mean_token_accuracy": 0.8524036765098572, "num_tokens": 831616997.0, "step": 24825 }, { "epoch": 1.4822110792741165, "grad_norm": 0.5164071917533875, "learning_rate": 1.2756633110657807e-05, "loss": 0.4271, "mean_token_accuracy": 0.8566086292266846, "num_tokens": 831784677.0, "step": 24830 }, { "epoch": 1.4825095510983763, "grad_norm": 0.4770441949367523, "learning_rate": 1.2748246651588963e-05, "loss": 0.405, "mean_token_accuracy": 0.8625909566879273, "num_tokens": 831952357.0, "step": 24835 }, { "epoch": 1.4828080229226361, "grad_norm": 0.547262966632843, "learning_rate": 1.2739863785581924e-05, "loss": 0.4602, "mean_token_accuracy": 0.846051549911499, "num_tokens": 832120037.0, "step": 24840 }, { "epoch": 1.483106494746896, "grad_norm": 0.5147694945335388, "learning_rate": 1.2731484514678508e-05, "loss": 0.4019, "mean_token_accuracy": 0.8648813009262085, "num_tokens": 832287717.0, "step": 24845 }, { "epoch": 1.4834049665711557, "grad_norm": 0.47269514203071594, "learning_rate": 1.2723108840919631e-05, "loss": 0.4369, "mean_token_accuracy": 0.853847074508667, "num_tokens": 832455397.0, "step": 24850 }, { "epoch": 1.4837034383954155, "grad_norm": 0.4831273555755615, "learning_rate": 1.2714736766345357e-05, "loss": 0.4261, "mean_token_accuracy": 0.8571155905723572, "num_tokens": 832623077.0, "step": 24855 }, { "epoch": 1.4840019102196753, "grad_norm": 0.5041881799697876, "learning_rate": 1.2706368292994836e-05, "loss": 0.4378, "mean_token_accuracy": 0.8522306919097901, "num_tokens": 832790757.0, "step": 24860 }, { "epoch": 1.484300382043935, "grad_norm": 0.5632654428482056, "learning_rate": 1.2698003422906379e-05, "loss": 0.4302, "mean_token_accuracy": 0.8571871757507324, "num_tokens": 832958437.0, "step": 24865 }, { "epoch": 1.484598853868195, "grad_norm": 0.49671268463134766, "learning_rate": 1.2689642158117403e-05, "loss": 0.4532, "mean_token_accuracy": 0.8494810938835144, "num_tokens": 833126117.0, "step": 24870 }, { "epoch": 1.4848973256924547, "grad_norm": 0.5024557709693909, "learning_rate": 1.2681284500664451e-05, "loss": 0.4761, "mean_token_accuracy": 0.8410294532775879, "num_tokens": 833293797.0, "step": 24875 }, { "epoch": 1.4851957975167145, "grad_norm": 0.4708639681339264, "learning_rate": 1.2672930452583176e-05, "loss": 0.4461, "mean_token_accuracy": 0.851127278804779, "num_tokens": 833461477.0, "step": 24880 }, { "epoch": 1.4854942693409743, "grad_norm": 0.500773549079895, "learning_rate": 1.2664580015908355e-05, "loss": 0.4293, "mean_token_accuracy": 0.8556781649589539, "num_tokens": 833629157.0, "step": 24885 }, { "epoch": 1.485792741165234, "grad_norm": 0.5218284130096436, "learning_rate": 1.2656233192673922e-05, "loss": 0.3952, "mean_token_accuracy": 0.8665692567825317, "num_tokens": 833796837.0, "step": 24890 }, { "epoch": 1.4860912129894939, "grad_norm": 0.515834629535675, "learning_rate": 1.264788998491287e-05, "loss": 0.4582, "mean_token_accuracy": 0.8470476031303406, "num_tokens": 833964517.0, "step": 24895 }, { "epoch": 1.4863896848137537, "grad_norm": 0.5145506858825684, "learning_rate": 1.2639550394657348e-05, "loss": 0.4147, "mean_token_accuracy": 0.8619706630706787, "num_tokens": 834132197.0, "step": 24900 }, { "epoch": 1.4866881566380132, "grad_norm": 0.4744322896003723, "learning_rate": 1.2631214423938623e-05, "loss": 0.4444, "mean_token_accuracy": 0.8522366881370544, "num_tokens": 834299877.0, "step": 24905 }, { "epoch": 1.4869866284622733, "grad_norm": 0.48897868394851685, "learning_rate": 1.2622882074787071e-05, "loss": 0.4539, "mean_token_accuracy": 0.8466599106788635, "num_tokens": 834467557.0, "step": 24910 }, { "epoch": 1.4872851002865328, "grad_norm": 0.503420889377594, "learning_rate": 1.2614553349232197e-05, "loss": 0.4317, "mean_token_accuracy": 0.8563402175903321, "num_tokens": 834635237.0, "step": 24915 }, { "epoch": 1.4875835721107928, "grad_norm": 0.4867260158061981, "learning_rate": 1.2606228249302599e-05, "loss": 0.3895, "mean_token_accuracy": 0.8672491908073425, "num_tokens": 834802917.0, "step": 24920 }, { "epoch": 1.4878820439350524, "grad_norm": 0.49484288692474365, "learning_rate": 1.2597906777026023e-05, "loss": 0.4075, "mean_token_accuracy": 0.8627639412879944, "num_tokens": 834970597.0, "step": 24925 }, { "epoch": 1.4881805157593124, "grad_norm": 0.4858594536781311, "learning_rate": 1.2589588934429308e-05, "loss": 0.4188, "mean_token_accuracy": 0.8597041606903076, "num_tokens": 835138277.0, "step": 24930 }, { "epoch": 1.488478987583572, "grad_norm": 0.5712736248970032, "learning_rate": 1.2581274723538422e-05, "loss": 0.465, "mean_token_accuracy": 0.8465823769569397, "num_tokens": 835305957.0, "step": 24935 }, { "epoch": 1.488777459407832, "grad_norm": 0.52598637342453, "learning_rate": 1.2572964146378457e-05, "loss": 0.4273, "mean_token_accuracy": 0.8549862742424011, "num_tokens": 835473637.0, "step": 24940 }, { "epoch": 1.4890759312320916, "grad_norm": 0.47138312458992004, "learning_rate": 1.2564657204973578e-05, "loss": 0.4562, "mean_token_accuracy": 0.8479541897773742, "num_tokens": 835641317.0, "step": 24945 }, { "epoch": 1.4893744030563516, "grad_norm": 0.49905580282211304, "learning_rate": 1.255635390134713e-05, "loss": 0.4485, "mean_token_accuracy": 0.8495526790618897, "num_tokens": 835808997.0, "step": 24950 }, { "epoch": 1.4896728748806112, "grad_norm": 0.5662775039672852, "learning_rate": 1.2548054237521506e-05, "loss": 0.461, "mean_token_accuracy": 0.8471012711524963, "num_tokens": 835976677.0, "step": 24955 }, { "epoch": 1.4899713467048712, "grad_norm": 0.5363903045654297, "learning_rate": 1.2539758215518255e-05, "loss": 0.4429, "mean_token_accuracy": 0.8501908779144287, "num_tokens": 836144357.0, "step": 24960 }, { "epoch": 1.4902698185291308, "grad_norm": 0.5376128554344177, "learning_rate": 1.2531465837358025e-05, "loss": 0.4473, "mean_token_accuracy": 0.8515567183494568, "num_tokens": 836312037.0, "step": 24965 }, { "epoch": 1.4905682903533906, "grad_norm": 0.4691319167613983, "learning_rate": 1.2523177105060577e-05, "loss": 0.4411, "mean_token_accuracy": 0.8522486090660095, "num_tokens": 836479717.0, "step": 24970 }, { "epoch": 1.4908667621776504, "grad_norm": 0.4661787450313568, "learning_rate": 1.2514892020644798e-05, "loss": 0.4675, "mean_token_accuracy": 0.8460515379905701, "num_tokens": 836647397.0, "step": 24975 }, { "epoch": 1.4911652340019101, "grad_norm": 0.5125504732131958, "learning_rate": 1.2506610586128648e-05, "loss": 0.4149, "mean_token_accuracy": 0.8614278912544251, "num_tokens": 836815077.0, "step": 24980 }, { "epoch": 1.49146370582617, "grad_norm": 0.48773860931396484, "learning_rate": 1.2498332803529236e-05, "loss": 0.4099, "mean_token_accuracy": 0.8640820622444153, "num_tokens": 836982757.0, "step": 24985 }, { "epoch": 1.4917621776504297, "grad_norm": 0.5541853904724121, "learning_rate": 1.249005867486277e-05, "loss": 0.4506, "mean_token_accuracy": 0.8504711866378785, "num_tokens": 837150437.0, "step": 24990 }, { "epoch": 1.4920606494746895, "grad_norm": 0.5276203155517578, "learning_rate": 1.2481788202144565e-05, "loss": 0.4381, "mean_token_accuracy": 0.85422283411026, "num_tokens": 837318117.0, "step": 24995 }, { "epoch": 1.4923591212989493, "grad_norm": 0.4703633785247803, "learning_rate": 1.2473521387389056e-05, "loss": 0.4361, "mean_token_accuracy": 0.8546284198760986, "num_tokens": 837485797.0, "step": 25000 }, { "epoch": 1.4926575931232091, "grad_norm": 0.5033524036407471, "learning_rate": 1.2465258232609761e-05, "loss": 0.4217, "mean_token_accuracy": 0.8586842536926269, "num_tokens": 837653477.0, "step": 25005 }, { "epoch": 1.492956064947469, "grad_norm": 0.5081316828727722, "learning_rate": 1.2456998739819336e-05, "loss": 0.451, "mean_token_accuracy": 0.8490158557891846, "num_tokens": 837821157.0, "step": 25010 }, { "epoch": 1.4932545367717287, "grad_norm": 0.48394495248794556, "learning_rate": 1.2448742911029527e-05, "loss": 0.4494, "mean_token_accuracy": 0.8493916153907776, "num_tokens": 837988837.0, "step": 25015 }, { "epoch": 1.4935530085959885, "grad_norm": 0.49527519941329956, "learning_rate": 1.2440490748251197e-05, "loss": 0.4335, "mean_token_accuracy": 0.8559107661247254, "num_tokens": 838156517.0, "step": 25020 }, { "epoch": 1.4938514804202483, "grad_norm": 0.4975765347480774, "learning_rate": 1.2432242253494314e-05, "loss": 0.4386, "mean_token_accuracy": 0.8532029151916504, "num_tokens": 838324197.0, "step": 25025 }, { "epoch": 1.494149952244508, "grad_norm": 0.48721006512641907, "learning_rate": 1.2423997428767946e-05, "loss": 0.4118, "mean_token_accuracy": 0.8605153322219848, "num_tokens": 838491877.0, "step": 25030 }, { "epoch": 1.4944484240687679, "grad_norm": 0.535171627998352, "learning_rate": 1.2415756276080286e-05, "loss": 0.4664, "mean_token_accuracy": 0.8455982446670532, "num_tokens": 838659557.0, "step": 25035 }, { "epoch": 1.4947468958930277, "grad_norm": 0.47328364849090576, "learning_rate": 1.2407518797438598e-05, "loss": 0.4655, "mean_token_accuracy": 0.84479900598526, "num_tokens": 838827237.0, "step": 25040 }, { "epoch": 1.4950453677172875, "grad_norm": 0.4900859296321869, "learning_rate": 1.2399284994849285e-05, "loss": 0.4264, "mean_token_accuracy": 0.8561135649681091, "num_tokens": 838994917.0, "step": 25045 }, { "epoch": 1.4953438395415473, "grad_norm": 0.4779356122016907, "learning_rate": 1.2391054870317837e-05, "loss": 0.4558, "mean_token_accuracy": 0.8495765328407288, "num_tokens": 839162597.0, "step": 25050 }, { "epoch": 1.495642311365807, "grad_norm": 0.586794912815094, "learning_rate": 1.2382828425848854e-05, "loss": 0.4418, "mean_token_accuracy": 0.851240599155426, "num_tokens": 839330277.0, "step": 25055 }, { "epoch": 1.4959407831900668, "grad_norm": 0.48590776324272156, "learning_rate": 1.2374605663446048e-05, "loss": 0.4488, "mean_token_accuracy": 0.8508290529251099, "num_tokens": 839497957.0, "step": 25060 }, { "epoch": 1.4962392550143266, "grad_norm": 0.4813680052757263, "learning_rate": 1.2366386585112205e-05, "loss": 0.3965, "mean_token_accuracy": 0.8661517381668091, "num_tokens": 839665637.0, "step": 25065 }, { "epoch": 1.4965377268385864, "grad_norm": 0.5295145511627197, "learning_rate": 1.235817119284924e-05, "loss": 0.4074, "mean_token_accuracy": 0.8633424639701843, "num_tokens": 839833317.0, "step": 25070 }, { "epoch": 1.4968361986628462, "grad_norm": 0.5573604702949524, "learning_rate": 1.2349959488658181e-05, "loss": 0.4179, "mean_token_accuracy": 0.8604318141937256, "num_tokens": 840000997.0, "step": 25075 }, { "epoch": 1.497134670487106, "grad_norm": 0.4596938490867615, "learning_rate": 1.2341751474539115e-05, "loss": 0.4127, "mean_token_accuracy": 0.8616605043411255, "num_tokens": 840168677.0, "step": 25080 }, { "epoch": 1.4974331423113658, "grad_norm": 0.4987497627735138, "learning_rate": 1.2333547152491279e-05, "loss": 0.3955, "mean_token_accuracy": 0.8667899370193481, "num_tokens": 840336357.0, "step": 25085 }, { "epoch": 1.4977316141356256, "grad_norm": 0.48805469274520874, "learning_rate": 1.2325346524512957e-05, "loss": 0.4483, "mean_token_accuracy": 0.8518787980079651, "num_tokens": 840504037.0, "step": 25090 }, { "epoch": 1.4980300859598854, "grad_norm": 0.5142251253128052, "learning_rate": 1.2317149592601596e-05, "loss": 0.4289, "mean_token_accuracy": 0.8573064565658569, "num_tokens": 840671717.0, "step": 25095 }, { "epoch": 1.4983285577841452, "grad_norm": 0.4770118296146393, "learning_rate": 1.2308956358753688e-05, "loss": 0.4255, "mean_token_accuracy": 0.8563103914260864, "num_tokens": 840839397.0, "step": 25100 }, { "epoch": 1.498627029608405, "grad_norm": 0.5360010862350464, "learning_rate": 1.2300766824964855e-05, "loss": 0.4202, "mean_token_accuracy": 0.8601276397705078, "num_tokens": 841007077.0, "step": 25105 }, { "epoch": 1.4989255014326648, "grad_norm": 0.49961942434310913, "learning_rate": 1.2292580993229804e-05, "loss": 0.4632, "mean_token_accuracy": 0.845789110660553, "num_tokens": 841174757.0, "step": 25110 }, { "epoch": 1.4992239732569246, "grad_norm": 0.5356742143630981, "learning_rate": 1.2284398865542354e-05, "loss": 0.4531, "mean_token_accuracy": 0.8493523597717285, "num_tokens": 841340286.0, "step": 25115 }, { "epoch": 1.4995224450811844, "grad_norm": 0.5119374990463257, "learning_rate": 1.2276220443895411e-05, "loss": 0.454, "mean_token_accuracy": 0.8494691729545594, "num_tokens": 841507966.0, "step": 25120 }, { "epoch": 1.4998209169054442, "grad_norm": 0.46571317315101624, "learning_rate": 1.2268045730280973e-05, "loss": 0.4276, "mean_token_accuracy": 0.8551771521568299, "num_tokens": 841675646.0, "step": 25125 }, { "epoch": 1.500119388729704, "grad_norm": 0.5265080332756042, "learning_rate": 1.2259874726690146e-05, "loss": 0.4639, "mean_token_accuracy": 0.845685076713562, "num_tokens": 841837651.0, "step": 25130 }, { "epoch": 1.5004178605539638, "grad_norm": 0.5621362924575806, "learning_rate": 1.2251707435113128e-05, "loss": 0.4022, "mean_token_accuracy": 0.8644578337669373, "num_tokens": 842005331.0, "step": 25135 }, { "epoch": 1.5007163323782235, "grad_norm": 0.5322445631027222, "learning_rate": 1.2243543857539219e-05, "loss": 0.455, "mean_token_accuracy": 0.8471847653388977, "num_tokens": 842173011.0, "step": 25140 }, { "epoch": 1.5010148042024833, "grad_norm": 0.5195977091789246, "learning_rate": 1.223538399595681e-05, "loss": 0.4189, "mean_token_accuracy": 0.86031254529953, "num_tokens": 842340691.0, "step": 25145 }, { "epoch": 1.501313276026743, "grad_norm": 0.4984567165374756, "learning_rate": 1.2227227852353367e-05, "loss": 0.4357, "mean_token_accuracy": 0.8533878087997436, "num_tokens": 842508371.0, "step": 25150 }, { "epoch": 1.501611747851003, "grad_norm": 0.48683884739875793, "learning_rate": 1.221907542871549e-05, "loss": 0.4231, "mean_token_accuracy": 0.858057975769043, "num_tokens": 842676051.0, "step": 25155 }, { "epoch": 1.5019102196752625, "grad_norm": 0.5287027359008789, "learning_rate": 1.2210926727028856e-05, "loss": 0.453, "mean_token_accuracy": 0.8500894665718078, "num_tokens": 842843731.0, "step": 25160 }, { "epoch": 1.5022086914995225, "grad_norm": 0.5262054204940796, "learning_rate": 1.2202781749278212e-05, "loss": 0.4446, "mean_token_accuracy": 0.8515984773635864, "num_tokens": 843011411.0, "step": 25165 }, { "epoch": 1.502507163323782, "grad_norm": 0.4648466408252716, "learning_rate": 1.2194640497447428e-05, "loss": 0.4501, "mean_token_accuracy": 0.8499105215072632, "num_tokens": 843179091.0, "step": 25170 }, { "epoch": 1.502805635148042, "grad_norm": 0.4945337772369385, "learning_rate": 1.2186502973519457e-05, "loss": 0.4141, "mean_token_accuracy": 0.8615650773048401, "num_tokens": 843346771.0, "step": 25175 }, { "epoch": 1.5031041069723017, "grad_norm": 0.5120456218719482, "learning_rate": 1.217836917947634e-05, "loss": 0.4622, "mean_token_accuracy": 0.8446141123771668, "num_tokens": 843514451.0, "step": 25180 }, { "epoch": 1.5034025787965617, "grad_norm": 0.5191718339920044, "learning_rate": 1.2170239117299222e-05, "loss": 0.4616, "mean_token_accuracy": 0.8467434048652649, "num_tokens": 843682131.0, "step": 25185 }, { "epoch": 1.5037010506208213, "grad_norm": 0.4831700623035431, "learning_rate": 1.2162112788968313e-05, "loss": 0.4311, "mean_token_accuracy": 0.8565191388130188, "num_tokens": 843849811.0, "step": 25190 }, { "epoch": 1.5039995224450813, "grad_norm": 0.4708995521068573, "learning_rate": 1.2153990196462941e-05, "loss": 0.3956, "mean_token_accuracy": 0.8654121398925781, "num_tokens": 844017491.0, "step": 25195 }, { "epoch": 1.5042979942693409, "grad_norm": 0.4848741590976715, "learning_rate": 1.2145871341761512e-05, "loss": 0.4276, "mean_token_accuracy": 0.8563103914260864, "num_tokens": 844185171.0, "step": 25200 }, { "epoch": 1.5045964660936009, "grad_norm": 0.48994994163513184, "learning_rate": 1.2137756226841526e-05, "loss": 0.445, "mean_token_accuracy": 0.8521054506301879, "num_tokens": 844352851.0, "step": 25205 }, { "epoch": 1.5048949379178604, "grad_norm": 0.4944668412208557, "learning_rate": 1.212964485367956e-05, "loss": 0.425, "mean_token_accuracy": 0.8572945237159729, "num_tokens": 844520531.0, "step": 25210 }, { "epoch": 1.5051934097421205, "grad_norm": 0.45361489057540894, "learning_rate": 1.2121537224251289e-05, "loss": 0.3842, "mean_token_accuracy": 0.8697423338890076, "num_tokens": 844688211.0, "step": 25215 }, { "epoch": 1.50549188156638, "grad_norm": 0.49812015891075134, "learning_rate": 1.2113433340531496e-05, "loss": 0.4352, "mean_token_accuracy": 0.8560658454895019, "num_tokens": 844855891.0, "step": 25220 }, { "epoch": 1.50579035339064, "grad_norm": 0.4734099209308624, "learning_rate": 1.2105333204494005e-05, "loss": 0.3861, "mean_token_accuracy": 0.8691816687583923, "num_tokens": 845023571.0, "step": 25225 }, { "epoch": 1.5060888252148996, "grad_norm": 0.5214412808418274, "learning_rate": 1.2097236818111778e-05, "loss": 0.4605, "mean_token_accuracy": 0.8480973482131958, "num_tokens": 845191251.0, "step": 25230 }, { "epoch": 1.5063872970391596, "grad_norm": 0.5334967970848083, "learning_rate": 1.2089144183356805e-05, "loss": 0.4634, "mean_token_accuracy": 0.8447154879570007, "num_tokens": 845358931.0, "step": 25235 }, { "epoch": 1.5066857688634192, "grad_norm": 0.4756765365600586, "learning_rate": 1.2081055302200231e-05, "loss": 0.4139, "mean_token_accuracy": 0.8588691473007202, "num_tokens": 845526611.0, "step": 25240 }, { "epoch": 1.5069842406876792, "grad_norm": 0.5088101625442505, "learning_rate": 1.2072970176612245e-05, "loss": 0.4477, "mean_token_accuracy": 0.8507813453674317, "num_tokens": 845694291.0, "step": 25245 }, { "epoch": 1.5072827125119388, "grad_norm": 0.4716501832008362, "learning_rate": 1.2064888808562114e-05, "loss": 0.4289, "mean_token_accuracy": 0.8575927495956421, "num_tokens": 845861971.0, "step": 25250 }, { "epoch": 1.5075811843361988, "grad_norm": 0.5015446543693542, "learning_rate": 1.2056811200018214e-05, "loss": 0.4239, "mean_token_accuracy": 0.8587737083435059, "num_tokens": 846029651.0, "step": 25255 }, { "epoch": 1.5078796561604584, "grad_norm": 0.49406954646110535, "learning_rate": 1.2048737352947995e-05, "loss": 0.4001, "mean_token_accuracy": 0.8648753523826599, "num_tokens": 846197331.0, "step": 25260 }, { "epoch": 1.5081781279847184, "grad_norm": 0.499738872051239, "learning_rate": 1.2040667269317991e-05, "loss": 0.4493, "mean_token_accuracy": 0.8507117509841919, "num_tokens": 846360857.0, "step": 25265 }, { "epoch": 1.508476599808978, "grad_norm": 0.5533084869384766, "learning_rate": 1.2032600951093829e-05, "loss": 0.4374, "mean_token_accuracy": 0.8535309433937073, "num_tokens": 846528537.0, "step": 25270 }, { "epoch": 1.508775071633238, "grad_norm": 0.5204278826713562, "learning_rate": 1.2024538400240185e-05, "loss": 0.4318, "mean_token_accuracy": 0.8561195254325866, "num_tokens": 846696217.0, "step": 25275 }, { "epoch": 1.5090735434574976, "grad_norm": 0.5286776423454285, "learning_rate": 1.2016479618720873e-05, "loss": 0.4405, "mean_token_accuracy": 0.8546880483627319, "num_tokens": 846863897.0, "step": 25280 }, { "epoch": 1.5093720152817574, "grad_norm": 0.5254387259483337, "learning_rate": 1.2008424608498733e-05, "loss": 0.4409, "mean_token_accuracy": 0.8507753729820251, "num_tokens": 847031577.0, "step": 25285 }, { "epoch": 1.5096704871060171, "grad_norm": 0.5032444000244141, "learning_rate": 1.2000373371535722e-05, "loss": 0.396, "mean_token_accuracy": 0.8657580852508545, "num_tokens": 847199257.0, "step": 25290 }, { "epoch": 1.509968958930277, "grad_norm": 0.477531373500824, "learning_rate": 1.199232590979287e-05, "loss": 0.4391, "mean_token_accuracy": 0.8542645931243896, "num_tokens": 847366937.0, "step": 25295 }, { "epoch": 1.5102674307545367, "grad_norm": 0.6035160422325134, "learning_rate": 1.198428222523028e-05, "loss": 0.4664, "mean_token_accuracy": 0.8442025542259216, "num_tokens": 847534617.0, "step": 25300 }, { "epoch": 1.5105659025787965, "grad_norm": 0.4623118042945862, "learning_rate": 1.1976242319807146e-05, "loss": 0.4339, "mean_token_accuracy": 0.8555767655372619, "num_tokens": 847702297.0, "step": 25305 }, { "epoch": 1.5108643744030563, "grad_norm": 0.508072018623352, "learning_rate": 1.1968206195481728e-05, "loss": 0.4473, "mean_token_accuracy": 0.8505487203598022, "num_tokens": 847869977.0, "step": 25310 }, { "epoch": 1.5111628462273161, "grad_norm": 0.5197190046310425, "learning_rate": 1.196017385421137e-05, "loss": 0.4276, "mean_token_accuracy": 0.8578790426254272, "num_tokens": 848037657.0, "step": 25315 }, { "epoch": 1.511461318051576, "grad_norm": 0.5564250349998474, "learning_rate": 1.1952145297952506e-05, "loss": 0.4551, "mean_token_accuracy": 0.8482047080993652, "num_tokens": 848205337.0, "step": 25320 }, { "epoch": 1.5117597898758357, "grad_norm": 0.5371652841567993, "learning_rate": 1.1944120528660633e-05, "loss": 0.4495, "mean_token_accuracy": 0.8491649866104126, "num_tokens": 848373017.0, "step": 25325 }, { "epoch": 1.5120582617000955, "grad_norm": 0.5175400972366333, "learning_rate": 1.1936099548290346e-05, "loss": 0.4362, "mean_token_accuracy": 0.8541810870170593, "num_tokens": 848540697.0, "step": 25330 }, { "epoch": 1.5123567335243553, "grad_norm": 0.5128393173217773, "learning_rate": 1.1928082358795278e-05, "loss": 0.4109, "mean_token_accuracy": 0.8617618918418884, "num_tokens": 848708377.0, "step": 25335 }, { "epoch": 1.512655205348615, "grad_norm": 0.5576654076576233, "learning_rate": 1.1920068962128176e-05, "loss": 0.4522, "mean_token_accuracy": 0.849135160446167, "num_tokens": 848876057.0, "step": 25340 }, { "epoch": 1.5129536771728749, "grad_norm": 0.5023189783096313, "learning_rate": 1.1912059360240854e-05, "loss": 0.4043, "mean_token_accuracy": 0.8634021282196045, "num_tokens": 849043737.0, "step": 25345 }, { "epoch": 1.5132521489971347, "grad_norm": 0.5152266621589661, "learning_rate": 1.1904053555084194e-05, "loss": 0.4779, "mean_token_accuracy": 0.8420374512672424, "num_tokens": 849211417.0, "step": 25350 }, { "epoch": 1.5135506208213945, "grad_norm": 0.586997389793396, "learning_rate": 1.1896051548608167e-05, "loss": 0.456, "mean_token_accuracy": 0.8480019092559814, "num_tokens": 849379097.0, "step": 25355 }, { "epoch": 1.5138490926456543, "grad_norm": 0.4987232983112335, "learning_rate": 1.1888053342761784e-05, "loss": 0.4025, "mean_token_accuracy": 0.8653107404708862, "num_tokens": 849546777.0, "step": 25360 }, { "epoch": 1.514147564469914, "grad_norm": 0.5013066530227661, "learning_rate": 1.188005893949319e-05, "loss": 0.4444, "mean_token_accuracy": 0.8521412372589111, "num_tokens": 849714457.0, "step": 25365 }, { "epoch": 1.5144460362941738, "grad_norm": 0.5315134525299072, "learning_rate": 1.1872068340749544e-05, "loss": 0.4248, "mean_token_accuracy": 0.857079803943634, "num_tokens": 849882137.0, "step": 25370 }, { "epoch": 1.5147445081184336, "grad_norm": 0.4681101143360138, "learning_rate": 1.1864081548477113e-05, "loss": 0.4298, "mean_token_accuracy": 0.856214964389801, "num_tokens": 850049817.0, "step": 25375 }, { "epoch": 1.5150429799426934, "grad_norm": 0.4866337180137634, "learning_rate": 1.1856098564621227e-05, "loss": 0.4203, "mean_token_accuracy": 0.858809494972229, "num_tokens": 850217497.0, "step": 25380 }, { "epoch": 1.5153414517669532, "grad_norm": 0.4581342041492462, "learning_rate": 1.1848119391126287e-05, "loss": 0.4043, "mean_token_accuracy": 0.8650721549987793, "num_tokens": 850385177.0, "step": 25385 }, { "epoch": 1.515639923591213, "grad_norm": 0.5076090097427368, "learning_rate": 1.184014402993578e-05, "loss": 0.392, "mean_token_accuracy": 0.8668555498123169, "num_tokens": 850552857.0, "step": 25390 }, { "epoch": 1.5159383954154728, "grad_norm": 0.5384923219680786, "learning_rate": 1.1832172482992234e-05, "loss": 0.4536, "mean_token_accuracy": 0.8473696827888488, "num_tokens": 850720537.0, "step": 25395 }, { "epoch": 1.5162368672397326, "grad_norm": 0.5698250532150269, "learning_rate": 1.1824204752237265e-05, "loss": 0.4474, "mean_token_accuracy": 0.8503936529159546, "num_tokens": 850888217.0, "step": 25400 }, { "epoch": 1.5165353390639924, "grad_norm": 0.5221206545829773, "learning_rate": 1.181624083961159e-05, "loss": 0.4284, "mean_token_accuracy": 0.8569962978363037, "num_tokens": 851055897.0, "step": 25405 }, { "epoch": 1.5168338108882522, "grad_norm": 0.5178723931312561, "learning_rate": 1.1808280747054933e-05, "loss": 0.4252, "mean_token_accuracy": 0.8568114042282104, "num_tokens": 851223577.0, "step": 25410 }, { "epoch": 1.517132282712512, "grad_norm": 0.5310178995132446, "learning_rate": 1.1800324476506149e-05, "loss": 0.449, "mean_token_accuracy": 0.8513897299766541, "num_tokens": 851391257.0, "step": 25415 }, { "epoch": 1.5174307545367718, "grad_norm": 0.5094597935676575, "learning_rate": 1.1792372029903107e-05, "loss": 0.4548, "mean_token_accuracy": 0.848258376121521, "num_tokens": 851558937.0, "step": 25420 }, { "epoch": 1.5177292263610314, "grad_norm": 0.4514775574207306, "learning_rate": 1.17844234091828e-05, "loss": 0.3802, "mean_token_accuracy": 0.8704043984413147, "num_tokens": 851726617.0, "step": 25425 }, { "epoch": 1.5180276981852914, "grad_norm": 0.4997726082801819, "learning_rate": 1.177647861628124e-05, "loss": 0.4231, "mean_token_accuracy": 0.8594954133033752, "num_tokens": 851894297.0, "step": 25430 }, { "epoch": 1.518326170009551, "grad_norm": 0.5539298057556152, "learning_rate": 1.1768537653133537e-05, "loss": 0.4642, "mean_token_accuracy": 0.8446081280708313, "num_tokens": 852061977.0, "step": 25435 }, { "epoch": 1.518624641833811, "grad_norm": 0.5309528112411499, "learning_rate": 1.1760600521673856e-05, "loss": 0.4242, "mean_token_accuracy": 0.8588572144508362, "num_tokens": 852229657.0, "step": 25440 }, { "epoch": 1.5189231136580705, "grad_norm": 0.5359094142913818, "learning_rate": 1.1752667223835437e-05, "loss": 0.4821, "mean_token_accuracy": 0.8404091596603394, "num_tokens": 852397337.0, "step": 25445 }, { "epoch": 1.5192215854823305, "grad_norm": 0.5146046876907349, "learning_rate": 1.1744737761550585e-05, "loss": 0.4429, "mean_token_accuracy": 0.8515865325927734, "num_tokens": 852565017.0, "step": 25450 }, { "epoch": 1.5195200573065901, "grad_norm": 0.4770878553390503, "learning_rate": 1.1736812136750649e-05, "loss": 0.4007, "mean_token_accuracy": 0.8659608602523804, "num_tokens": 852732697.0, "step": 25455 }, { "epoch": 1.5198185291308501, "grad_norm": 0.47861650586128235, "learning_rate": 1.1728890351366077e-05, "loss": 0.4437, "mean_token_accuracy": 0.8520279049873352, "num_tokens": 852900377.0, "step": 25460 }, { "epoch": 1.5201170009551097, "grad_norm": 0.4809304177761078, "learning_rate": 1.1720972407326359e-05, "loss": 0.4218, "mean_token_accuracy": 0.8567756175994873, "num_tokens": 853068057.0, "step": 25465 }, { "epoch": 1.5204154727793697, "grad_norm": 0.5149447321891785, "learning_rate": 1.1713058306560056e-05, "loss": 0.43, "mean_token_accuracy": 0.8563581109046936, "num_tokens": 853235737.0, "step": 25470 }, { "epoch": 1.5207139446036293, "grad_norm": 0.48470592498779297, "learning_rate": 1.170514805099481e-05, "loss": 0.448, "mean_token_accuracy": 0.8501848936080932, "num_tokens": 853403417.0, "step": 25475 }, { "epoch": 1.5210124164278893, "grad_norm": 0.5272351503372192, "learning_rate": 1.1697241642557275e-05, "loss": 0.4727, "mean_token_accuracy": 0.8430275559425354, "num_tokens": 853571097.0, "step": 25480 }, { "epoch": 1.5213108882521489, "grad_norm": 0.5201582908630371, "learning_rate": 1.1689339083173235e-05, "loss": 0.445, "mean_token_accuracy": 0.8512048244476318, "num_tokens": 853738777.0, "step": 25485 }, { "epoch": 1.521609360076409, "grad_norm": 0.5304452180862427, "learning_rate": 1.1681440374767497e-05, "loss": 0.4667, "mean_token_accuracy": 0.8431408882141114, "num_tokens": 853906457.0, "step": 25490 }, { "epoch": 1.5219078319006685, "grad_norm": 0.4628830850124359, "learning_rate": 1.1673545519263927e-05, "loss": 0.4317, "mean_token_accuracy": 0.8550936460494996, "num_tokens": 854074137.0, "step": 25495 }, { "epoch": 1.5222063037249285, "grad_norm": 0.5660292506217957, "learning_rate": 1.1665654518585468e-05, "loss": 0.4622, "mean_token_accuracy": 0.8458248853683472, "num_tokens": 854241817.0, "step": 25500 }, { "epoch": 1.522504775549188, "grad_norm": 0.5355969667434692, "learning_rate": 1.1657767374654118e-05, "loss": 0.4106, "mean_token_accuracy": 0.8627042770385742, "num_tokens": 854409497.0, "step": 25505 }, { "epoch": 1.522803247373448, "grad_norm": 0.5886985659599304, "learning_rate": 1.164988408939094e-05, "loss": 0.4658, "mean_token_accuracy": 0.8455147385597229, "num_tokens": 854577177.0, "step": 25510 }, { "epoch": 1.5231017191977076, "grad_norm": 0.5814493298530579, "learning_rate": 1.164200466471606e-05, "loss": 0.3958, "mean_token_accuracy": 0.8682237386703491, "num_tokens": 854739815.0, "step": 25515 }, { "epoch": 1.5234001910219677, "grad_norm": 0.4609314501285553, "learning_rate": 1.1634129102548641e-05, "loss": 0.4299, "mean_token_accuracy": 0.8565251111984253, "num_tokens": 854907495.0, "step": 25520 }, { "epoch": 1.5236986628462272, "grad_norm": 0.48527318239212036, "learning_rate": 1.1626257404806929e-05, "loss": 0.4054, "mean_token_accuracy": 0.8627758622169495, "num_tokens": 855075175.0, "step": 25525 }, { "epoch": 1.5239971346704873, "grad_norm": 0.5244532227516174, "learning_rate": 1.161838957340822e-05, "loss": 0.4487, "mean_token_accuracy": 0.8498926281929016, "num_tokens": 855242855.0, "step": 25530 }, { "epoch": 1.5242956064947468, "grad_norm": 0.5301193594932556, "learning_rate": 1.1610525610268882e-05, "loss": 0.4333, "mean_token_accuracy": 0.8554813265800476, "num_tokens": 855410535.0, "step": 25535 }, { "epoch": 1.5245940783190068, "grad_norm": 0.5758376717567444, "learning_rate": 1.1602665517304308e-05, "loss": 0.4658, "mean_token_accuracy": 0.8455624461174012, "num_tokens": 855578215.0, "step": 25540 }, { "epoch": 1.5248925501432664, "grad_norm": 0.47314441204071045, "learning_rate": 1.159480929642897e-05, "loss": 0.4238, "mean_token_accuracy": 0.8574615359306336, "num_tokens": 855745895.0, "step": 25545 }, { "epoch": 1.5251910219675264, "grad_norm": 0.5664907693862915, "learning_rate": 1.1586956949556422e-05, "loss": 0.45, "mean_token_accuracy": 0.8488965630531311, "num_tokens": 855913575.0, "step": 25550 }, { "epoch": 1.525489493791786, "grad_norm": 0.4657769799232483, "learning_rate": 1.157910847859922e-05, "loss": 0.4174, "mean_token_accuracy": 0.861177396774292, "num_tokens": 856081255.0, "step": 25555 }, { "epoch": 1.5257879656160458, "grad_norm": 0.5686755180358887, "learning_rate": 1.1571263885469021e-05, "loss": 0.4793, "mean_token_accuracy": 0.8419002890586853, "num_tokens": 856248935.0, "step": 25560 }, { "epoch": 1.5260864374403056, "grad_norm": 0.5609865784645081, "learning_rate": 1.1563423172076501e-05, "loss": 0.46, "mean_token_accuracy": 0.8468090176582337, "num_tokens": 856416615.0, "step": 25565 }, { "epoch": 1.5263849092645654, "grad_norm": 0.5525285601615906, "learning_rate": 1.1555586340331431e-05, "loss": 0.4349, "mean_token_accuracy": 0.8541989684104919, "num_tokens": 856584295.0, "step": 25570 }, { "epoch": 1.5266833810888252, "grad_norm": 0.4844779968261719, "learning_rate": 1.1547753392142613e-05, "loss": 0.4224, "mean_token_accuracy": 0.8589230298995971, "num_tokens": 856748145.0, "step": 25575 }, { "epoch": 1.526981852913085, "grad_norm": 0.45421233773231506, "learning_rate": 1.1539924329417895e-05, "loss": 0.4269, "mean_token_accuracy": 0.8569724440574646, "num_tokens": 856915825.0, "step": 25580 }, { "epoch": 1.5272803247373448, "grad_norm": 0.5185850262641907, "learning_rate": 1.1532099154064197e-05, "loss": 0.4113, "mean_token_accuracy": 0.8594715476036072, "num_tokens": 857083505.0, "step": 25585 }, { "epoch": 1.5275787965616046, "grad_norm": 0.463449627161026, "learning_rate": 1.1524277867987481e-05, "loss": 0.4435, "mean_token_accuracy": 0.8512883305549621, "num_tokens": 857251185.0, "step": 25590 }, { "epoch": 1.5278772683858644, "grad_norm": 0.5653368830680847, "learning_rate": 1.151646047309277e-05, "loss": 0.4203, "mean_token_accuracy": 0.8600679874420166, "num_tokens": 857418865.0, "step": 25595 }, { "epoch": 1.5281757402101241, "grad_norm": 0.5152712464332581, "learning_rate": 1.1508646971284139e-05, "loss": 0.4209, "mean_token_accuracy": 0.858421802520752, "num_tokens": 857586545.0, "step": 25600 }, { "epoch": 1.528474212034384, "grad_norm": 0.5396140813827515, "learning_rate": 1.150083736446469e-05, "loss": 0.4749, "mean_token_accuracy": 0.841703450679779, "num_tokens": 857754225.0, "step": 25605 }, { "epoch": 1.5287726838586437, "grad_norm": 0.5097308158874512, "learning_rate": 1.1493031654536629e-05, "loss": 0.4398, "mean_token_accuracy": 0.853960394859314, "num_tokens": 857921905.0, "step": 25610 }, { "epoch": 1.5290711556829035, "grad_norm": 0.5128664970397949, "learning_rate": 1.1485229843401154e-05, "loss": 0.4348, "mean_token_accuracy": 0.853369927406311, "num_tokens": 858089585.0, "step": 25615 }, { "epoch": 1.5293696275071633, "grad_norm": 0.4968007206916809, "learning_rate": 1.147743193295855e-05, "loss": 0.4493, "mean_token_accuracy": 0.8519444227218628, "num_tokens": 858257265.0, "step": 25620 }, { "epoch": 1.5296680993314231, "grad_norm": 0.4765837788581848, "learning_rate": 1.146963792510814e-05, "loss": 0.4488, "mean_token_accuracy": 0.8500298142433167, "num_tokens": 858424945.0, "step": 25625 }, { "epoch": 1.529966571155683, "grad_norm": 0.49282023310661316, "learning_rate": 1.1461847821748302e-05, "loss": 0.4211, "mean_token_accuracy": 0.857706081867218, "num_tokens": 858592625.0, "step": 25630 }, { "epoch": 1.5302650429799427, "grad_norm": 0.5051887631416321, "learning_rate": 1.1454061624776463e-05, "loss": 0.4413, "mean_token_accuracy": 0.8518907308578492, "num_tokens": 858760305.0, "step": 25635 }, { "epoch": 1.5305635148042025, "grad_norm": 0.5638011693954468, "learning_rate": 1.1446279336089084e-05, "loss": 0.4574, "mean_token_accuracy": 0.8478050827980042, "num_tokens": 858927985.0, "step": 25640 }, { "epoch": 1.5308619866284623, "grad_norm": 0.5325712561607361, "learning_rate": 1.1438500957581688e-05, "loss": 0.4169, "mean_token_accuracy": 0.8607121586799622, "num_tokens": 859095665.0, "step": 25645 }, { "epoch": 1.531160458452722, "grad_norm": 0.5539132356643677, "learning_rate": 1.1430726491148846e-05, "loss": 0.4261, "mean_token_accuracy": 0.8586424946784973, "num_tokens": 859263345.0, "step": 25650 }, { "epoch": 1.5314589302769819, "grad_norm": 0.5448346734046936, "learning_rate": 1.1422955938684173e-05, "loss": 0.478, "mean_token_accuracy": 0.8418287038803101, "num_tokens": 859431025.0, "step": 25655 }, { "epoch": 1.5317574021012417, "grad_norm": 0.48333102464675903, "learning_rate": 1.1415189302080333e-05, "loss": 0.4349, "mean_token_accuracy": 0.8548968076705933, "num_tokens": 859598705.0, "step": 25660 }, { "epoch": 1.5320558739255015, "grad_norm": 0.5613148808479309, "learning_rate": 1.1407426583229026e-05, "loss": 0.4283, "mean_token_accuracy": 0.8573780298233032, "num_tokens": 859766385.0, "step": 25665 }, { "epoch": 1.5323543457497613, "grad_norm": 0.5383117198944092, "learning_rate": 1.1399667784021011e-05, "loss": 0.4193, "mean_token_accuracy": 0.8589884281158447, "num_tokens": 859934065.0, "step": 25670 }, { "epoch": 1.532652817574021, "grad_norm": 0.532345175743103, "learning_rate": 1.1391912906346083e-05, "loss": 0.481, "mean_token_accuracy": 0.8398604273796082, "num_tokens": 860101745.0, "step": 25675 }, { "epoch": 1.5329512893982808, "grad_norm": 0.6099774837493896, "learning_rate": 1.138416195209309e-05, "loss": 0.4321, "mean_token_accuracy": 0.8556900739669799, "num_tokens": 860269425.0, "step": 25680 }, { "epoch": 1.5332497612225406, "grad_norm": 0.5399348735809326, "learning_rate": 1.1376414923149922e-05, "loss": 0.442, "mean_token_accuracy": 0.8528748631477356, "num_tokens": 860437105.0, "step": 25685 }, { "epoch": 1.5335482330468004, "grad_norm": 0.5816062688827515, "learning_rate": 1.1368671821403495e-05, "loss": 0.436, "mean_token_accuracy": 0.8532923817634582, "num_tokens": 860604785.0, "step": 25690 }, { "epoch": 1.5338467048710602, "grad_norm": 0.49152544140815735, "learning_rate": 1.1360932648739808e-05, "loss": 0.4448, "mean_token_accuracy": 0.8501729607582093, "num_tokens": 860772465.0, "step": 25695 }, { "epoch": 1.5341451766953198, "grad_norm": 0.5046047568321228, "learning_rate": 1.1353197407043861e-05, "loss": 0.4551, "mean_token_accuracy": 0.848496961593628, "num_tokens": 860940145.0, "step": 25700 }, { "epoch": 1.5344436485195798, "grad_norm": 0.5169011950492859, "learning_rate": 1.1345466098199722e-05, "loss": 0.4209, "mean_token_accuracy": 0.8584098815917969, "num_tokens": 861107825.0, "step": 25705 }, { "epoch": 1.5347421203438394, "grad_norm": 0.47887977957725525, "learning_rate": 1.1337738724090493e-05, "loss": 0.4161, "mean_token_accuracy": 0.8592329740524292, "num_tokens": 861275505.0, "step": 25710 }, { "epoch": 1.5350405921680994, "grad_norm": 0.5099368095397949, "learning_rate": 1.1330015286598318e-05, "loss": 0.4551, "mean_token_accuracy": 0.8475963115692139, "num_tokens": 861443185.0, "step": 25715 }, { "epoch": 1.535339063992359, "grad_norm": 0.5132337212562561, "learning_rate": 1.1322295787604394e-05, "loss": 0.4428, "mean_token_accuracy": 0.8527734637260437, "num_tokens": 861610865.0, "step": 25720 }, { "epoch": 1.535637535816619, "grad_norm": 0.4553278684616089, "learning_rate": 1.1314580228988925e-05, "loss": 0.4698, "mean_token_accuracy": 0.8450196743011474, "num_tokens": 861778545.0, "step": 25725 }, { "epoch": 1.5359360076408786, "grad_norm": 0.5055566430091858, "learning_rate": 1.1306868612631186e-05, "loss": 0.4158, "mean_token_accuracy": 0.8602409601211548, "num_tokens": 861946225.0, "step": 25730 }, { "epoch": 1.5362344794651386, "grad_norm": 0.521346926689148, "learning_rate": 1.1299160940409503e-05, "loss": 0.463, "mean_token_accuracy": 0.845639967918396, "num_tokens": 862113905.0, "step": 25735 }, { "epoch": 1.5365329512893982, "grad_norm": 0.5294837951660156, "learning_rate": 1.1291457214201194e-05, "loss": 0.4248, "mean_token_accuracy": 0.8577120304107666, "num_tokens": 862281585.0, "step": 25740 }, { "epoch": 1.5368314231136582, "grad_norm": 0.5719627141952515, "learning_rate": 1.1283757435882668e-05, "loss": 0.491, "mean_token_accuracy": 0.8375521898269653, "num_tokens": 862449265.0, "step": 25745 }, { "epoch": 1.5371298949379177, "grad_norm": 0.5353347659111023, "learning_rate": 1.127606160732932e-05, "loss": 0.4777, "mean_token_accuracy": 0.8414589047431946, "num_tokens": 862616945.0, "step": 25750 }, { "epoch": 1.5374283667621778, "grad_norm": 0.5057556629180908, "learning_rate": 1.126836973041564e-05, "loss": 0.4647, "mean_token_accuracy": 0.8440892338752747, "num_tokens": 862784625.0, "step": 25755 }, { "epoch": 1.5377268385864373, "grad_norm": 0.5216675400733948, "learning_rate": 1.1260681807015108e-05, "loss": 0.4529, "mean_token_accuracy": 0.8496063470840454, "num_tokens": 862952305.0, "step": 25760 }, { "epoch": 1.5380253104106973, "grad_norm": 0.5547958016395569, "learning_rate": 1.1252997839000267e-05, "loss": 0.4657, "mean_token_accuracy": 0.845276141166687, "num_tokens": 863119985.0, "step": 25765 }, { "epoch": 1.538323782234957, "grad_norm": 0.4714139997959137, "learning_rate": 1.1245317828242686e-05, "loss": 0.435, "mean_token_accuracy": 0.8532387018203735, "num_tokens": 863287665.0, "step": 25770 }, { "epoch": 1.538622254059217, "grad_norm": 0.5024564266204834, "learning_rate": 1.1237641776612974e-05, "loss": 0.4179, "mean_token_accuracy": 0.859048068523407, "num_tokens": 863455345.0, "step": 25775 }, { "epoch": 1.5389207258834765, "grad_norm": 0.5225319266319275, "learning_rate": 1.1229969685980782e-05, "loss": 0.4494, "mean_token_accuracy": 0.8506441712379456, "num_tokens": 863623025.0, "step": 25780 }, { "epoch": 1.5392191977077365, "grad_norm": 0.5305291414260864, "learning_rate": 1.1222301558214779e-05, "loss": 0.4546, "mean_token_accuracy": 0.8486043095588685, "num_tokens": 863790705.0, "step": 25785 }, { "epoch": 1.539517669531996, "grad_norm": 0.619536280632019, "learning_rate": 1.1214637395182683e-05, "loss": 0.4542, "mean_token_accuracy": 0.8495228528976441, "num_tokens": 863958385.0, "step": 25790 }, { "epoch": 1.539816141356256, "grad_norm": 0.4792643189430237, "learning_rate": 1.1206977198751243e-05, "loss": 0.4079, "mean_token_accuracy": 0.8632649302482605, "num_tokens": 864126065.0, "step": 25795 }, { "epoch": 1.5401146131805157, "grad_norm": 0.5104835629463196, "learning_rate": 1.1199320970786242e-05, "loss": 0.4466, "mean_token_accuracy": 0.8502445340156555, "num_tokens": 864293745.0, "step": 25800 }, { "epoch": 1.5404130850047757, "grad_norm": 0.5209883451461792, "learning_rate": 1.1191668713152498e-05, "loss": 0.4555, "mean_token_accuracy": 0.8478706836700439, "num_tokens": 864461425.0, "step": 25805 }, { "epoch": 1.5407115568290353, "grad_norm": 0.49672821164131165, "learning_rate": 1.118402042771385e-05, "loss": 0.4082, "mean_token_accuracy": 0.8616068124771118, "num_tokens": 864629105.0, "step": 25810 }, { "epoch": 1.5410100286532953, "grad_norm": 0.44503673911094666, "learning_rate": 1.1176376116333182e-05, "loss": 0.4144, "mean_token_accuracy": 0.8597459197044373, "num_tokens": 864796785.0, "step": 25815 }, { "epoch": 1.5413085004775549, "grad_norm": 0.4767151176929474, "learning_rate": 1.1168735780872421e-05, "loss": 0.4142, "mean_token_accuracy": 0.8602946519851684, "num_tokens": 864964465.0, "step": 25820 }, { "epoch": 1.5416069723018149, "grad_norm": 0.48956987261772156, "learning_rate": 1.1161099423192495e-05, "loss": 0.4408, "mean_token_accuracy": 0.852594530582428, "num_tokens": 865132145.0, "step": 25825 }, { "epoch": 1.5419054441260744, "grad_norm": 0.5427263379096985, "learning_rate": 1.1153467045153387e-05, "loss": 0.4302, "mean_token_accuracy": 0.8550697803497315, "num_tokens": 865299825.0, "step": 25830 }, { "epoch": 1.5422039159503342, "grad_norm": 0.4723684787750244, "learning_rate": 1.1145838648614102e-05, "loss": 0.3798, "mean_token_accuracy": 0.8718656778335572, "num_tokens": 865467505.0, "step": 25835 }, { "epoch": 1.542502387774594, "grad_norm": 0.4865725338459015, "learning_rate": 1.1138214235432683e-05, "loss": 0.4272, "mean_token_accuracy": 0.8571573376655579, "num_tokens": 865635185.0, "step": 25840 }, { "epoch": 1.5428008595988538, "grad_norm": 0.5468321442604065, "learning_rate": 1.1130593807466186e-05, "loss": 0.4488, "mean_token_accuracy": 0.849373722076416, "num_tokens": 865802865.0, "step": 25845 }, { "epoch": 1.5430993314231136, "grad_norm": 0.5016831755638123, "learning_rate": 1.1122977366570712e-05, "loss": 0.4036, "mean_token_accuracy": 0.8641238331794738, "num_tokens": 865970545.0, "step": 25850 }, { "epoch": 1.5433978032473734, "grad_norm": 0.5058051943778992, "learning_rate": 1.1115364914601382e-05, "loss": 0.4229, "mean_token_accuracy": 0.8579625368118287, "num_tokens": 866138225.0, "step": 25855 }, { "epoch": 1.5436962750716332, "grad_norm": 0.5117329359054565, "learning_rate": 1.1107756453412354e-05, "loss": 0.4536, "mean_token_accuracy": 0.8492544412612915, "num_tokens": 866305905.0, "step": 25860 }, { "epoch": 1.543994746895893, "grad_norm": 0.5152749419212341, "learning_rate": 1.1100151984856819e-05, "loss": 0.4568, "mean_token_accuracy": 0.847381591796875, "num_tokens": 866473585.0, "step": 25865 }, { "epoch": 1.5442932187201528, "grad_norm": 0.48024821281433105, "learning_rate": 1.1092551510786961e-05, "loss": 0.4051, "mean_token_accuracy": 0.8638315558433532, "num_tokens": 866641265.0, "step": 25870 }, { "epoch": 1.5445916905444126, "grad_norm": 0.5218802690505981, "learning_rate": 1.108495503305403e-05, "loss": 0.4201, "mean_token_accuracy": 0.8588930130004883, "num_tokens": 866808945.0, "step": 25875 }, { "epoch": 1.5448901623686724, "grad_norm": 0.498441219329834, "learning_rate": 1.1077362553508297e-05, "loss": 0.4495, "mean_token_accuracy": 0.8507336378097534, "num_tokens": 866976625.0, "step": 25880 }, { "epoch": 1.5451886341929322, "grad_norm": 0.514673113822937, "learning_rate": 1.1069774073999034e-05, "loss": 0.4682, "mean_token_accuracy": 0.844560420513153, "num_tokens": 867144305.0, "step": 25885 }, { "epoch": 1.545487106017192, "grad_norm": 0.48694169521331787, "learning_rate": 1.106218959637457e-05, "loss": 0.4335, "mean_token_accuracy": 0.8562507390975952, "num_tokens": 867311985.0, "step": 25890 }, { "epoch": 1.5457855778414518, "grad_norm": 0.533424973487854, "learning_rate": 1.1054609122482225e-05, "loss": 0.4597, "mean_token_accuracy": 0.8466599106788635, "num_tokens": 867479665.0, "step": 25895 }, { "epoch": 1.5460840496657116, "grad_norm": 0.5263311266899109, "learning_rate": 1.1047032654168379e-05, "loss": 0.456, "mean_token_accuracy": 0.8474114298820495, "num_tokens": 867647345.0, "step": 25900 }, { "epoch": 1.5463825214899714, "grad_norm": 0.5120063424110413, "learning_rate": 1.1039460193278428e-05, "loss": 0.4173, "mean_token_accuracy": 0.8585888147354126, "num_tokens": 867815025.0, "step": 25905 }, { "epoch": 1.5466809933142311, "grad_norm": 0.5247727632522583, "learning_rate": 1.1031891741656762e-05, "loss": 0.427, "mean_token_accuracy": 0.8561374306678772, "num_tokens": 867982705.0, "step": 25910 }, { "epoch": 1.546979465138491, "grad_norm": 0.484334260225296, "learning_rate": 1.1024327301146833e-05, "loss": 0.4458, "mean_token_accuracy": 0.8502445459365845, "num_tokens": 868150385.0, "step": 25915 }, { "epoch": 1.5472779369627507, "grad_norm": 0.5441709160804749, "learning_rate": 1.1016766873591094e-05, "loss": 0.4253, "mean_token_accuracy": 0.8564714431762696, "num_tokens": 868318065.0, "step": 25920 }, { "epoch": 1.5475764087870105, "grad_norm": 0.6044000387191772, "learning_rate": 1.100921046083103e-05, "loss": 0.4515, "mean_token_accuracy": 0.8506501078605652, "num_tokens": 868485745.0, "step": 25925 }, { "epoch": 1.5478748806112703, "grad_norm": 0.4996013939380646, "learning_rate": 1.1001658064707148e-05, "loss": 0.4639, "mean_token_accuracy": 0.8459322452545166, "num_tokens": 868653425.0, "step": 25930 }, { "epoch": 1.5481733524355301, "grad_norm": 0.5255122780799866, "learning_rate": 1.0994109687058964e-05, "loss": 0.4347, "mean_token_accuracy": 0.8554992198944091, "num_tokens": 868821105.0, "step": 25935 }, { "epoch": 1.54847182425979, "grad_norm": 0.5189975500106812, "learning_rate": 1.0986565329725027e-05, "loss": 0.4452, "mean_token_accuracy": 0.8505308389663696, "num_tokens": 868988785.0, "step": 25940 }, { "epoch": 1.5487702960840497, "grad_norm": 0.47914978861808777, "learning_rate": 1.0979024994542908e-05, "loss": 0.4424, "mean_token_accuracy": 0.8541751265525818, "num_tokens": 869156465.0, "step": 25945 }, { "epoch": 1.5490687679083095, "grad_norm": 0.49758031964302063, "learning_rate": 1.09714886833492e-05, "loss": 0.4362, "mean_token_accuracy": 0.8544256329536438, "num_tokens": 869324145.0, "step": 25950 }, { "epoch": 1.5493672397325693, "grad_norm": 0.5295895338058472, "learning_rate": 1.0963956397979494e-05, "loss": 0.4557, "mean_token_accuracy": 0.8487892270088195, "num_tokens": 869491825.0, "step": 25955 }, { "epoch": 1.549665711556829, "grad_norm": 0.7546069025993347, "learning_rate": 1.0956428140268431e-05, "loss": 0.4829, "mean_token_accuracy": 0.8412846446037292, "num_tokens": 869651921.0, "step": 25960 }, { "epoch": 1.5499641833810889, "grad_norm": 0.4934066832065582, "learning_rate": 1.0948903912049663e-05, "loss": 0.4462, "mean_token_accuracy": 0.8510199189186096, "num_tokens": 869819601.0, "step": 25965 }, { "epoch": 1.5502626552053487, "grad_norm": 0.4560341536998749, "learning_rate": 1.0941383715155837e-05, "loss": 0.4246, "mean_token_accuracy": 0.8564893245697022, "num_tokens": 869987281.0, "step": 25970 }, { "epoch": 1.5505611270296082, "grad_norm": 0.46059396862983704, "learning_rate": 1.0933867551418647e-05, "loss": 0.3978, "mean_token_accuracy": 0.8638375401496887, "num_tokens": 870154961.0, "step": 25975 }, { "epoch": 1.5508595988538683, "grad_norm": 0.49816128611564636, "learning_rate": 1.0926355422668788e-05, "loss": 0.4384, "mean_token_accuracy": 0.8538828611373901, "num_tokens": 870322641.0, "step": 25980 }, { "epoch": 1.5511580706781278, "grad_norm": 0.47712135314941406, "learning_rate": 1.0918847330735981e-05, "loss": 0.4128, "mean_token_accuracy": 0.8611236929893493, "num_tokens": 870490321.0, "step": 25985 }, { "epoch": 1.5514565425023878, "grad_norm": 0.4922550618648529, "learning_rate": 1.0911343277448965e-05, "loss": 0.4849, "mean_token_accuracy": 0.8386854410171509, "num_tokens": 870658001.0, "step": 25990 }, { "epoch": 1.5517550143266474, "grad_norm": 0.5132717490196228, "learning_rate": 1.090384326463548e-05, "loss": 0.4358, "mean_token_accuracy": 0.85394846200943, "num_tokens": 870825681.0, "step": 25995 }, { "epoch": 1.5520534861509074, "grad_norm": 0.5151049494743347, "learning_rate": 1.0896347294122297e-05, "loss": 0.408, "mean_token_accuracy": 0.8625671148300171, "num_tokens": 870993361.0, "step": 26000 }, { "epoch": 1.552351957975167, "grad_norm": 0.6029640436172485, "learning_rate": 1.08888553677352e-05, "loss": 0.4644, "mean_token_accuracy": 0.8446856737136841, "num_tokens": 871161041.0, "step": 26005 }, { "epoch": 1.552650429799427, "grad_norm": 0.5040461421012878, "learning_rate": 1.0881367487298985e-05, "loss": 0.4093, "mean_token_accuracy": 0.863008463382721, "num_tokens": 871328721.0, "step": 26010 }, { "epoch": 1.5529489016236866, "grad_norm": 0.4810337424278259, "learning_rate": 1.087388365463747e-05, "loss": 0.4313, "mean_token_accuracy": 0.8544554471969604, "num_tokens": 871496401.0, "step": 26015 }, { "epoch": 1.5532473734479466, "grad_norm": 0.48912104964256287, "learning_rate": 1.086640387157346e-05, "loss": 0.4274, "mean_token_accuracy": 0.8566324830055236, "num_tokens": 871664081.0, "step": 26020 }, { "epoch": 1.5535458452722062, "grad_norm": 0.5116869211196899, "learning_rate": 1.0858928139928822e-05, "loss": 0.4266, "mean_token_accuracy": 0.8555409789085389, "num_tokens": 871831761.0, "step": 26025 }, { "epoch": 1.5538443170964662, "grad_norm": 0.535221517086029, "learning_rate": 1.0851456461524388e-05, "loss": 0.4186, "mean_token_accuracy": 0.8599510550498962, "num_tokens": 871997278.0, "step": 26030 }, { "epoch": 1.5541427889207258, "grad_norm": 0.6300113201141357, "learning_rate": 1.0843988838180027e-05, "loss": 0.4623, "mean_token_accuracy": 0.8439712285995483, "num_tokens": 872157664.0, "step": 26035 }, { "epoch": 1.5544412607449858, "grad_norm": 0.5374897718429565, "learning_rate": 1.083652527171462e-05, "loss": 0.4253, "mean_token_accuracy": 0.8560479521751404, "num_tokens": 872325344.0, "step": 26040 }, { "epoch": 1.5547397325692454, "grad_norm": 0.46194443106651306, "learning_rate": 1.0829065763946058e-05, "loss": 0.4208, "mean_token_accuracy": 0.8587080955505371, "num_tokens": 872493024.0, "step": 26045 }, { "epoch": 1.5550382043935054, "grad_norm": 0.47020941972732544, "learning_rate": 1.0821610316691244e-05, "loss": 0.4302, "mean_token_accuracy": 0.8555469393730164, "num_tokens": 872660704.0, "step": 26050 }, { "epoch": 1.555336676217765, "grad_norm": 0.5209056735038757, "learning_rate": 1.0814158931766077e-05, "loss": 0.4294, "mean_token_accuracy": 0.8565191507339478, "num_tokens": 872828384.0, "step": 26055 }, { "epoch": 1.555635148042025, "grad_norm": 0.49416953325271606, "learning_rate": 1.080671161098549e-05, "loss": 0.4032, "mean_token_accuracy": 0.8660324573516845, "num_tokens": 872996064.0, "step": 26060 }, { "epoch": 1.5559336198662845, "grad_norm": 0.5182694792747498, "learning_rate": 1.0799268356163413e-05, "loss": 0.4357, "mean_token_accuracy": 0.8550604462623597, "num_tokens": 873163695.0, "step": 26065 }, { "epoch": 1.5562320916905446, "grad_norm": 0.5381470918655396, "learning_rate": 1.0791829169112788e-05, "loss": 0.4592, "mean_token_accuracy": 0.8478170037269592, "num_tokens": 873331375.0, "step": 26070 }, { "epoch": 1.5565305635148041, "grad_norm": 0.5272315144538879, "learning_rate": 1.0784394051645572e-05, "loss": 0.4507, "mean_token_accuracy": 0.8483597636222839, "num_tokens": 873499055.0, "step": 26075 }, { "epoch": 1.5568290353390641, "grad_norm": 0.49999555945396423, "learning_rate": 1.0776963005572705e-05, "loss": 0.419, "mean_token_accuracy": 0.860151493549347, "num_tokens": 873666735.0, "step": 26080 }, { "epoch": 1.5571275071633237, "grad_norm": 0.5012195706367493, "learning_rate": 1.0769536032704185e-05, "loss": 0.433, "mean_token_accuracy": 0.855725884437561, "num_tokens": 873834415.0, "step": 26085 }, { "epoch": 1.5574259789875837, "grad_norm": 0.44986918568611145, "learning_rate": 1.0762113134848969e-05, "loss": 0.4184, "mean_token_accuracy": 0.8602170944213867, "num_tokens": 874002095.0, "step": 26090 }, { "epoch": 1.5577244508118433, "grad_norm": 0.5704871416091919, "learning_rate": 1.0754694313815042e-05, "loss": 0.4574, "mean_token_accuracy": 0.8485983490943909, "num_tokens": 874169775.0, "step": 26095 }, { "epoch": 1.5580229226361033, "grad_norm": 0.4929519593715668, "learning_rate": 1.0747279571409399e-05, "loss": 0.4532, "mean_token_accuracy": 0.8491709470748902, "num_tokens": 874337455.0, "step": 26100 }, { "epoch": 1.5583213944603629, "grad_norm": 0.44656768441200256, "learning_rate": 1.0739868909438033e-05, "loss": 0.4126, "mean_token_accuracy": 0.8603185057640076, "num_tokens": 874505135.0, "step": 26105 }, { "epoch": 1.5586198662846227, "grad_norm": 0.453061044216156, "learning_rate": 1.073246232970596e-05, "loss": 0.3809, "mean_token_accuracy": 0.8709900736808777, "num_tokens": 874670103.0, "step": 26110 }, { "epoch": 1.5589183381088825, "grad_norm": 0.5654840469360352, "learning_rate": 1.0725059834017173e-05, "loss": 0.4487, "mean_token_accuracy": 0.8499582409858704, "num_tokens": 874837783.0, "step": 26115 }, { "epoch": 1.5592168099331423, "grad_norm": 0.5006895661354065, "learning_rate": 1.0717661424174693e-05, "loss": 0.4271, "mean_token_accuracy": 0.8571871519088745, "num_tokens": 875005463.0, "step": 26120 }, { "epoch": 1.559515281757402, "grad_norm": 0.4911402761936188, "learning_rate": 1.0710267101980542e-05, "loss": 0.4585, "mean_token_accuracy": 0.8476500034332275, "num_tokens": 875173143.0, "step": 26125 }, { "epoch": 1.5598137535816619, "grad_norm": 0.5086870193481445, "learning_rate": 1.0702876869235737e-05, "loss": 0.4199, "mean_token_accuracy": 0.8591434955596924, "num_tokens": 875340823.0, "step": 26130 }, { "epoch": 1.5601122254059216, "grad_norm": 0.4928043484687805, "learning_rate": 1.069549072774032e-05, "loss": 0.4482, "mean_token_accuracy": 0.8503280401229858, "num_tokens": 875508503.0, "step": 26135 }, { "epoch": 1.5604106972301814, "grad_norm": 0.4967227578163147, "learning_rate": 1.0688108679293304e-05, "loss": 0.4796, "mean_token_accuracy": 0.8401586532592773, "num_tokens": 875676183.0, "step": 26140 }, { "epoch": 1.5607091690544412, "grad_norm": 0.546454131603241, "learning_rate": 1.0680730725692724e-05, "loss": 0.4132, "mean_token_accuracy": 0.8598711490631104, "num_tokens": 875843863.0, "step": 26145 }, { "epoch": 1.561007640878701, "grad_norm": 0.4873913526535034, "learning_rate": 1.0673356868735637e-05, "loss": 0.432, "mean_token_accuracy": 0.8557199120521546, "num_tokens": 876011543.0, "step": 26150 }, { "epoch": 1.5613061127029608, "grad_norm": 0.5232744216918945, "learning_rate": 1.0665987110218062e-05, "loss": 0.4392, "mean_token_accuracy": 0.8530776500701904, "num_tokens": 876179223.0, "step": 26155 }, { "epoch": 1.5616045845272206, "grad_norm": 0.48951423168182373, "learning_rate": 1.0658621451935055e-05, "loss": 0.4136, "mean_token_accuracy": 0.8604199051856994, "num_tokens": 876346903.0, "step": 26160 }, { "epoch": 1.5619030563514804, "grad_norm": 0.4294096529483795, "learning_rate": 1.0651259895680635e-05, "loss": 0.4132, "mean_token_accuracy": 0.8607956528663635, "num_tokens": 876514583.0, "step": 26165 }, { "epoch": 1.5622015281757402, "grad_norm": 0.5300936102867126, "learning_rate": 1.0643902443247874e-05, "loss": 0.4507, "mean_token_accuracy": 0.8491112947463989, "num_tokens": 876682263.0, "step": 26170 }, { "epoch": 1.5625, "grad_norm": 0.46735334396362305, "learning_rate": 1.0636549096428793e-05, "loss": 0.4139, "mean_token_accuracy": 0.8602469325065613, "num_tokens": 876849943.0, "step": 26175 }, { "epoch": 1.5627984718242598, "grad_norm": 0.4797969162464142, "learning_rate": 1.0629199857014445e-05, "loss": 0.4595, "mean_token_accuracy": 0.8473978757858276, "num_tokens": 877011330.0, "step": 26180 }, { "epoch": 1.5630969436485196, "grad_norm": 0.5361015200614929, "learning_rate": 1.0621854726794869e-05, "loss": 0.4347, "mean_token_accuracy": 0.8547596216201783, "num_tokens": 877179010.0, "step": 26185 }, { "epoch": 1.5633954154727794, "grad_norm": 0.4962694048881531, "learning_rate": 1.061451370755911e-05, "loss": 0.4137, "mean_token_accuracy": 0.8613085985183716, "num_tokens": 877346690.0, "step": 26190 }, { "epoch": 1.5636938872970392, "grad_norm": 0.546158492565155, "learning_rate": 1.0607176801095215e-05, "loss": 0.4871, "mean_token_accuracy": 0.8394667744636536, "num_tokens": 877514370.0, "step": 26195 }, { "epoch": 1.563992359121299, "grad_norm": 0.44295719265937805, "learning_rate": 1.0599844009190208e-05, "loss": 0.4157, "mean_token_accuracy": 0.8599427461624145, "num_tokens": 877682050.0, "step": 26200 }, { "epoch": 1.5642908309455588, "grad_norm": 0.5119081139564514, "learning_rate": 1.0592515333630128e-05, "loss": 0.4415, "mean_token_accuracy": 0.8528867840766907, "num_tokens": 877849730.0, "step": 26205 }, { "epoch": 1.5645893027698186, "grad_norm": 0.5087532997131348, "learning_rate": 1.0585190776200026e-05, "loss": 0.4371, "mean_token_accuracy": 0.8547477126121521, "num_tokens": 878017410.0, "step": 26210 }, { "epoch": 1.5648877745940784, "grad_norm": 0.5864216685295105, "learning_rate": 1.0577870338683917e-05, "loss": 0.4632, "mean_token_accuracy": 0.8481450557708741, "num_tokens": 878185090.0, "step": 26215 }, { "epoch": 1.5651862464183381, "grad_norm": 0.4697224497795105, "learning_rate": 1.0570554022864842e-05, "loss": 0.3905, "mean_token_accuracy": 0.8672372698783875, "num_tokens": 878352770.0, "step": 26220 }, { "epoch": 1.565484718242598, "grad_norm": 0.5592724680900574, "learning_rate": 1.0563241830524803e-05, "loss": 0.4391, "mean_token_accuracy": 0.8536442875862121, "num_tokens": 878520450.0, "step": 26225 }, { "epoch": 1.5657831900668577, "grad_norm": 0.541803240776062, "learning_rate": 1.0555933763444838e-05, "loss": 0.4207, "mean_token_accuracy": 0.8588035345077515, "num_tokens": 878688130.0, "step": 26230 }, { "epoch": 1.5660816618911175, "grad_norm": 0.4904037117958069, "learning_rate": 1.0548629823404962e-05, "loss": 0.4466, "mean_token_accuracy": 0.8503698110580444, "num_tokens": 878855810.0, "step": 26235 }, { "epoch": 1.5663801337153773, "grad_norm": 0.4765397012233734, "learning_rate": 1.0541330012184175e-05, "loss": 0.4132, "mean_token_accuracy": 0.8618573307991028, "num_tokens": 879023490.0, "step": 26240 }, { "epoch": 1.5666786055396371, "grad_norm": 0.5097711682319641, "learning_rate": 1.0534034331560482e-05, "loss": 0.4272, "mean_token_accuracy": 0.8572825908660888, "num_tokens": 879191170.0, "step": 26245 }, { "epoch": 1.5669770773638967, "grad_norm": 0.5480923652648926, "learning_rate": 1.052674278331089e-05, "loss": 0.4478, "mean_token_accuracy": 0.8509662389755249, "num_tokens": 879358850.0, "step": 26250 }, { "epoch": 1.5672755491881567, "grad_norm": 0.5250793099403381, "learning_rate": 1.0519455369211374e-05, "loss": 0.4558, "mean_token_accuracy": 0.8474293231964112, "num_tokens": 879526530.0, "step": 26255 }, { "epoch": 1.5675740210124163, "grad_norm": 0.4893653988838196, "learning_rate": 1.051217209103694e-05, "loss": 0.4227, "mean_token_accuracy": 0.8576881766319275, "num_tokens": 879694210.0, "step": 26260 }, { "epoch": 1.5678724928366763, "grad_norm": 0.5299640893936157, "learning_rate": 1.0504892950561546e-05, "loss": 0.4632, "mean_token_accuracy": 0.8466161608695983, "num_tokens": 879853507.0, "step": 26265 }, { "epoch": 1.5681709646609359, "grad_norm": 0.472123920917511, "learning_rate": 1.0497617949558165e-05, "loss": 0.4587, "mean_token_accuracy": 0.848860788345337, "num_tokens": 880021187.0, "step": 26270 }, { "epoch": 1.5684694364851959, "grad_norm": 0.5101821422576904, "learning_rate": 1.0490347089798762e-05, "loss": 0.4181, "mean_token_accuracy": 0.8592926263809204, "num_tokens": 880188867.0, "step": 26275 }, { "epoch": 1.5687679083094554, "grad_norm": 0.5313236713409424, "learning_rate": 1.0483080373054294e-05, "loss": 0.4323, "mean_token_accuracy": 0.8552606463432312, "num_tokens": 880356547.0, "step": 26280 }, { "epoch": 1.5690663801337155, "grad_norm": 0.5494893193244934, "learning_rate": 1.047581780109469e-05, "loss": 0.4214, "mean_token_accuracy": 0.8601992130279541, "num_tokens": 880524227.0, "step": 26285 }, { "epoch": 1.569364851957975, "grad_norm": 0.5203440189361572, "learning_rate": 1.0468559375688888e-05, "loss": 0.3706, "mean_token_accuracy": 0.8743468999862671, "num_tokens": 880691907.0, "step": 26290 }, { "epoch": 1.569663323782235, "grad_norm": 0.5092349052429199, "learning_rate": 1.0461305098604823e-05, "loss": 0.4253, "mean_token_accuracy": 0.8562567234039307, "num_tokens": 880859587.0, "step": 26295 }, { "epoch": 1.5699617956064946, "grad_norm": 0.557034969329834, "learning_rate": 1.0454054971609398e-05, "loss": 0.4403, "mean_token_accuracy": 0.8532804489135742, "num_tokens": 881027267.0, "step": 26300 }, { "epoch": 1.5702602674307546, "grad_norm": 0.4711591899394989, "learning_rate": 1.0446808996468512e-05, "loss": 0.4055, "mean_token_accuracy": 0.8620268106460571, "num_tokens": 881188675.0, "step": 26305 }, { "epoch": 1.5705587392550142, "grad_norm": 0.49493682384490967, "learning_rate": 1.0439567174947063e-05, "loss": 0.4495, "mean_token_accuracy": 0.850363838672638, "num_tokens": 881356355.0, "step": 26310 }, { "epoch": 1.5708572110792742, "grad_norm": 0.4804880917072296, "learning_rate": 1.0432329508808927e-05, "loss": 0.4157, "mean_token_accuracy": 0.8591256022453309, "num_tokens": 881524035.0, "step": 26315 }, { "epoch": 1.5711556829035338, "grad_norm": 0.5023981332778931, "learning_rate": 1.0425095999816978e-05, "loss": 0.4128, "mean_token_accuracy": 0.8612668514251709, "num_tokens": 881691715.0, "step": 26320 }, { "epoch": 1.5714541547277938, "grad_norm": 0.47381237149238586, "learning_rate": 1.041786664973306e-05, "loss": 0.4258, "mean_token_accuracy": 0.8583860158920288, "num_tokens": 881859395.0, "step": 26325 }, { "epoch": 1.5717526265520534, "grad_norm": 0.450787752866745, "learning_rate": 1.041064146031802e-05, "loss": 0.4303, "mean_token_accuracy": 0.8556125521659851, "num_tokens": 882027075.0, "step": 26330 }, { "epoch": 1.5720510983763134, "grad_norm": 0.5109887719154358, "learning_rate": 1.0403420433331684e-05, "loss": 0.4239, "mean_token_accuracy": 0.8576643228530884, "num_tokens": 882194755.0, "step": 26335 }, { "epoch": 1.572349570200573, "grad_norm": 0.4925119876861572, "learning_rate": 1.0396203570532867e-05, "loss": 0.418, "mean_token_accuracy": 0.8584337353706359, "num_tokens": 882362435.0, "step": 26340 }, { "epoch": 1.572648042024833, "grad_norm": 0.531807541847229, "learning_rate": 1.038899087367938e-05, "loss": 0.4744, "mean_token_accuracy": 0.8419658899307251, "num_tokens": 882530115.0, "step": 26345 }, { "epoch": 1.5729465138490926, "grad_norm": 0.4866146147251129, "learning_rate": 1.0381782344527985e-05, "loss": 0.3919, "mean_token_accuracy": 0.867732310295105, "num_tokens": 882697795.0, "step": 26350 }, { "epoch": 1.5732449856733526, "grad_norm": 0.5182363390922546, "learning_rate": 1.0374577984834479e-05, "loss": 0.4695, "mean_token_accuracy": 0.8426398634910583, "num_tokens": 882865475.0, "step": 26355 }, { "epoch": 1.5735434574976122, "grad_norm": 0.44843345880508423, "learning_rate": 1.0367377796353594e-05, "loss": 0.4538, "mean_token_accuracy": 0.8491769075393677, "num_tokens": 883033155.0, "step": 26360 }, { "epoch": 1.5738419293218722, "grad_norm": 0.49718332290649414, "learning_rate": 1.0360181780839085e-05, "loss": 0.4554, "mean_token_accuracy": 0.8470237255096436, "num_tokens": 883200835.0, "step": 26365 }, { "epoch": 1.5741404011461317, "grad_norm": 0.5355473160743713, "learning_rate": 1.0352989940043664e-05, "loss": 0.4174, "mean_token_accuracy": 0.8623523831367492, "num_tokens": 883368515.0, "step": 26370 }, { "epoch": 1.5744388729703918, "grad_norm": 0.5284214615821838, "learning_rate": 1.034580227571904e-05, "loss": 0.4409, "mean_token_accuracy": 0.8529046893119812, "num_tokens": 883536195.0, "step": 26375 }, { "epoch": 1.5747373447946513, "grad_norm": 0.5034896731376648, "learning_rate": 1.0338618789615911e-05, "loss": 0.3938, "mean_token_accuracy": 0.8655194997787475, "num_tokens": 883703875.0, "step": 26380 }, { "epoch": 1.5750358166189111, "grad_norm": 0.5367690324783325, "learning_rate": 1.0331439483483932e-05, "loss": 0.4435, "mean_token_accuracy": 0.8505845069885254, "num_tokens": 883871555.0, "step": 26385 }, { "epoch": 1.575334288443171, "grad_norm": 0.566448450088501, "learning_rate": 1.0324264359071761e-05, "loss": 0.4054, "mean_token_accuracy": 0.8646904468536377, "num_tokens": 884039235.0, "step": 26390 }, { "epoch": 1.5756327602674307, "grad_norm": 0.5155763626098633, "learning_rate": 1.031709341812703e-05, "loss": 0.4426, "mean_token_accuracy": 0.8529046893119812, "num_tokens": 884206915.0, "step": 26395 }, { "epoch": 1.5759312320916905, "grad_norm": 0.4850751459598541, "learning_rate": 1.030992666239636e-05, "loss": 0.405, "mean_token_accuracy": 0.8638852596282959, "num_tokens": 884374595.0, "step": 26400 }, { "epoch": 1.5762297039159503, "grad_norm": 0.5149714946746826, "learning_rate": 1.0302764093625353e-05, "loss": 0.4114, "mean_token_accuracy": 0.8620481967926026, "num_tokens": 884542275.0, "step": 26405 }, { "epoch": 1.57652817574021, "grad_norm": 0.48544833064079285, "learning_rate": 1.0295605713558565e-05, "loss": 0.4343, "mean_token_accuracy": 0.8533400893211365, "num_tokens": 884709955.0, "step": 26410 }, { "epoch": 1.5768266475644699, "grad_norm": 0.4907308518886566, "learning_rate": 1.0288451523939562e-05, "loss": 0.4361, "mean_token_accuracy": 0.8554574728012085, "num_tokens": 884877635.0, "step": 26415 }, { "epoch": 1.5771251193887297, "grad_norm": 0.48904645442962646, "learning_rate": 1.0281301526510883e-05, "loss": 0.4447, "mean_token_accuracy": 0.8516879439353943, "num_tokens": 885045315.0, "step": 26420 }, { "epoch": 1.5774235912129895, "grad_norm": 0.4829970598220825, "learning_rate": 1.0274155723014036e-05, "loss": 0.4192, "mean_token_accuracy": 0.8591077208518982, "num_tokens": 885212995.0, "step": 26425 }, { "epoch": 1.5777220630372493, "grad_norm": 0.4841747283935547, "learning_rate": 1.0267014115189517e-05, "loss": 0.4152, "mean_token_accuracy": 0.8611833333969117, "num_tokens": 885380675.0, "step": 26430 }, { "epoch": 1.578020534861509, "grad_norm": 0.5143060088157654, "learning_rate": 1.0259876704776792e-05, "loss": 0.4408, "mean_token_accuracy": 0.8531253695487976, "num_tokens": 885548355.0, "step": 26435 }, { "epoch": 1.5783190066857689, "grad_norm": 0.48538410663604736, "learning_rate": 1.0252743493514318e-05, "loss": 0.3942, "mean_token_accuracy": 0.8671418309211731, "num_tokens": 885716035.0, "step": 26440 }, { "epoch": 1.5786174785100286, "grad_norm": 0.4950398802757263, "learning_rate": 1.024561448313951e-05, "loss": 0.4361, "mean_token_accuracy": 0.8537456750869751, "num_tokens": 885883715.0, "step": 26445 }, { "epoch": 1.5789159503342884, "grad_norm": 0.5013229846954346, "learning_rate": 1.0238489675388771e-05, "loss": 0.4297, "mean_token_accuracy": 0.8550280213356019, "num_tokens": 886051395.0, "step": 26450 }, { "epoch": 1.5792144221585482, "grad_norm": 0.4976925551891327, "learning_rate": 1.0231369071997486e-05, "loss": 0.445, "mean_token_accuracy": 0.8521591305732727, "num_tokens": 886219075.0, "step": 26455 }, { "epoch": 1.579512893982808, "grad_norm": 0.5492770075798035, "learning_rate": 1.0224252674700003e-05, "loss": 0.4322, "mean_token_accuracy": 0.8546105146408081, "num_tokens": 886386755.0, "step": 26460 }, { "epoch": 1.5798113658070678, "grad_norm": 0.4879792630672455, "learning_rate": 1.0217140485229661e-05, "loss": 0.4151, "mean_token_accuracy": 0.861815583705902, "num_tokens": 886554435.0, "step": 26465 }, { "epoch": 1.5801098376313276, "grad_norm": 0.579795777797699, "learning_rate": 1.0210032505318756e-05, "loss": 0.4468, "mean_token_accuracy": 0.8492305755615235, "num_tokens": 886722115.0, "step": 26470 }, { "epoch": 1.5804083094555874, "grad_norm": 0.48856765031814575, "learning_rate": 1.0202928736698561e-05, "loss": 0.435, "mean_token_accuracy": 0.8541989684104919, "num_tokens": 886889795.0, "step": 26475 }, { "epoch": 1.5807067812798472, "grad_norm": 0.5250730514526367, "learning_rate": 1.0195829181099353e-05, "loss": 0.4033, "mean_token_accuracy": 0.8631396889686584, "num_tokens": 887057475.0, "step": 26480 }, { "epoch": 1.581005253104107, "grad_norm": 0.5376426577568054, "learning_rate": 1.0188733840250343e-05, "loss": 0.4467, "mean_token_accuracy": 0.849988055229187, "num_tokens": 887225155.0, "step": 26485 }, { "epoch": 1.5813037249283668, "grad_norm": 0.5562255382537842, "learning_rate": 1.0181642715879737e-05, "loss": 0.4596, "mean_token_accuracy": 0.8456101655960083, "num_tokens": 887392835.0, "step": 26490 }, { "epoch": 1.5816021967526266, "grad_norm": 0.5164707899093628, "learning_rate": 1.0174555809714701e-05, "loss": 0.4326, "mean_token_accuracy": 0.8548789143562316, "num_tokens": 887560515.0, "step": 26495 }, { "epoch": 1.5819006685768864, "grad_norm": 0.5418306589126587, "learning_rate": 1.0167473123481395e-05, "loss": 0.4101, "mean_token_accuracy": 0.8625611424446106, "num_tokens": 887728195.0, "step": 26500 }, { "epoch": 1.5821991404011462, "grad_norm": 0.5183385610580444, "learning_rate": 1.016039465890493e-05, "loss": 0.4168, "mean_token_accuracy": 0.8602648258209229, "num_tokens": 887895875.0, "step": 26505 }, { "epoch": 1.582497612225406, "grad_norm": 0.5144839882850647, "learning_rate": 1.0153320417709395e-05, "loss": 0.4518, "mean_token_accuracy": 0.8490874290466308, "num_tokens": 888063555.0, "step": 26510 }, { "epoch": 1.5827960840496658, "grad_norm": 0.49877968430519104, "learning_rate": 1.0146250401617856e-05, "loss": 0.4373, "mean_token_accuracy": 0.8533878087997436, "num_tokens": 888231235.0, "step": 26515 }, { "epoch": 1.5830945558739256, "grad_norm": 0.4924964904785156, "learning_rate": 1.0139184612352346e-05, "loss": 0.4143, "mean_token_accuracy": 0.860312533378601, "num_tokens": 888398915.0, "step": 26520 }, { "epoch": 1.5833930276981851, "grad_norm": 0.8070304989814758, "learning_rate": 1.0132123051633874e-05, "loss": 0.4514, "mean_token_accuracy": 0.8498687744140625, "num_tokens": 888566595.0, "step": 26525 }, { "epoch": 1.5836914995224451, "grad_norm": 0.4972057342529297, "learning_rate": 1.01250657211824e-05, "loss": 0.4262, "mean_token_accuracy": 0.8566444039344787, "num_tokens": 888734275.0, "step": 26530 }, { "epoch": 1.5839899713467047, "grad_norm": 0.5219476819038391, "learning_rate": 1.0118012622716874e-05, "loss": 0.4589, "mean_token_accuracy": 0.8484015226364136, "num_tokens": 888901955.0, "step": 26535 }, { "epoch": 1.5842884431709647, "grad_norm": 0.6271768808364868, "learning_rate": 1.0110963757955208e-05, "loss": 0.4679, "mean_token_accuracy": 0.8445663809776306, "num_tokens": 889069635.0, "step": 26540 }, { "epoch": 1.5845869149952243, "grad_norm": 0.48052552342414856, "learning_rate": 1.0103919128614284e-05, "loss": 0.4077, "mean_token_accuracy": 0.8619467973709106, "num_tokens": 889237315.0, "step": 26545 }, { "epoch": 1.5848853868194843, "grad_norm": 0.506382167339325, "learning_rate": 1.009687873640996e-05, "loss": 0.4318, "mean_token_accuracy": 0.8543302059173584, "num_tokens": 889404995.0, "step": 26550 }, { "epoch": 1.585183858643744, "grad_norm": 0.5315185189247131, "learning_rate": 1.0089842583057028e-05, "loss": 0.4315, "mean_token_accuracy": 0.8549147129058838, "num_tokens": 889572675.0, "step": 26555 }, { "epoch": 1.585482330468004, "grad_norm": 0.5435509085655212, "learning_rate": 1.0082810670269299e-05, "loss": 0.4426, "mean_token_accuracy": 0.853483247756958, "num_tokens": 889740355.0, "step": 26560 }, { "epoch": 1.5857808022922635, "grad_norm": 0.5118299126625061, "learning_rate": 1.007578299975952e-05, "loss": 0.4255, "mean_token_accuracy": 0.8571573376655579, "num_tokens": 889908035.0, "step": 26565 }, { "epoch": 1.5860792741165235, "grad_norm": 0.5012679100036621, "learning_rate": 1.00687595732394e-05, "loss": 0.4049, "mean_token_accuracy": 0.8631456613540649, "num_tokens": 890075715.0, "step": 26570 }, { "epoch": 1.586377745940783, "grad_norm": 0.5095944404602051, "learning_rate": 1.0061740392419631e-05, "loss": 0.4653, "mean_token_accuracy": 0.8450733661651612, "num_tokens": 890243395.0, "step": 26575 }, { "epoch": 1.586676217765043, "grad_norm": 0.5599350333213806, "learning_rate": 1.0054725459009858e-05, "loss": 0.455, "mean_token_accuracy": 0.8507157325744629, "num_tokens": 890411075.0, "step": 26580 }, { "epoch": 1.5869746895893027, "grad_norm": 0.4542834460735321, "learning_rate": 1.0047714774718709e-05, "loss": 0.4067, "mean_token_accuracy": 0.8618334889411926, "num_tokens": 890578755.0, "step": 26585 }, { "epoch": 1.5872731614135627, "grad_norm": 0.5228456258773804, "learning_rate": 1.004070834125376e-05, "loss": 0.4544, "mean_token_accuracy": 0.8470476031303406, "num_tokens": 890746435.0, "step": 26590 }, { "epoch": 1.5875716332378222, "grad_norm": 0.5303345918655396, "learning_rate": 1.0033706160321551e-05, "loss": 0.4286, "mean_token_accuracy": 0.8567577362060547, "num_tokens": 890914115.0, "step": 26595 }, { "epoch": 1.5878701050620823, "grad_norm": 0.5150555968284607, "learning_rate": 1.0026708233627601e-05, "loss": 0.4377, "mean_token_accuracy": 0.8535556197166443, "num_tokens": 891076157.0, "step": 26600 }, { "epoch": 1.5881685768863418, "grad_norm": 0.4887823760509491, "learning_rate": 1.0019714562876377e-05, "loss": 0.4303, "mean_token_accuracy": 0.8553858995437622, "num_tokens": 891243837.0, "step": 26605 }, { "epoch": 1.5884670487106018, "grad_norm": 0.5814841985702515, "learning_rate": 1.0012725149771329e-05, "loss": 0.4376, "mean_token_accuracy": 0.8544375538825989, "num_tokens": 891411517.0, "step": 26610 }, { "epoch": 1.5887655205348614, "grad_norm": 0.5063245296478271, "learning_rate": 1.000573999601484e-05, "loss": 0.4286, "mean_token_accuracy": 0.8563342452049255, "num_tokens": 891579197.0, "step": 26615 }, { "epoch": 1.5890639923591214, "grad_norm": 0.501069188117981, "learning_rate": 9.998759103308278e-06, "loss": 0.4112, "mean_token_accuracy": 0.8617678761482239, "num_tokens": 891746877.0, "step": 26620 }, { "epoch": 1.589362464183381, "grad_norm": 0.5059115290641785, "learning_rate": 9.991782473351985e-06, "loss": 0.4365, "mean_token_accuracy": 0.8556781530380249, "num_tokens": 891914557.0, "step": 26625 }, { "epoch": 1.589660936007641, "grad_norm": 0.5677490234375, "learning_rate": 9.984810107845228e-06, "loss": 0.4737, "mean_token_accuracy": 0.8436061024665833, "num_tokens": 892082237.0, "step": 26630 }, { "epoch": 1.5899594078319006, "grad_norm": 0.49569299817085266, "learning_rate": 9.977842008486263e-06, "loss": 0.4253, "mean_token_accuracy": 0.8565608978271484, "num_tokens": 892249917.0, "step": 26635 }, { "epoch": 1.5902578796561606, "grad_norm": 0.4933372437953949, "learning_rate": 9.970878176972297e-06, "loss": 0.4334, "mean_token_accuracy": 0.8551771402359009, "num_tokens": 892417597.0, "step": 26640 }, { "epoch": 1.5905563514804202, "grad_norm": 0.5534626245498657, "learning_rate": 9.963918614999502e-06, "loss": 0.4443, "mean_token_accuracy": 0.8499701857566834, "num_tokens": 892585277.0, "step": 26645 }, { "epoch": 1.5908548233046802, "grad_norm": 0.5127054452896118, "learning_rate": 9.956963324263013e-06, "loss": 0.4057, "mean_token_accuracy": 0.8625969290733337, "num_tokens": 892752957.0, "step": 26650 }, { "epoch": 1.5911532951289398, "grad_norm": 0.5097017288208008, "learning_rate": 9.95001230645691e-06, "loss": 0.4598, "mean_token_accuracy": 0.8478587508201599, "num_tokens": 892920637.0, "step": 26655 }, { "epoch": 1.5914517669531996, "grad_norm": 0.5153566598892212, "learning_rate": 9.943065563274244e-06, "loss": 0.4863, "mean_token_accuracy": 0.8390228271484375, "num_tokens": 893085449.0, "step": 26660 }, { "epoch": 1.5917502387774594, "grad_norm": 0.5006033182144165, "learning_rate": 9.936123096407024e-06, "loss": 0.409, "mean_token_accuracy": 0.8611833453178406, "num_tokens": 893253129.0, "step": 26665 }, { "epoch": 1.5920487106017192, "grad_norm": 0.5272590517997742, "learning_rate": 9.929184907546217e-06, "loss": 0.4513, "mean_token_accuracy": 0.847882616519928, "num_tokens": 893420809.0, "step": 26670 }, { "epoch": 1.592347182425979, "grad_norm": 0.5295668244361877, "learning_rate": 9.922250998381751e-06, "loss": 0.4247, "mean_token_accuracy": 0.8569724321365356, "num_tokens": 893588489.0, "step": 26675 }, { "epoch": 1.5926456542502387, "grad_norm": 0.5618000030517578, "learning_rate": 9.91532137060249e-06, "loss": 0.4325, "mean_token_accuracy": 0.8550161123275757, "num_tokens": 893756169.0, "step": 26680 }, { "epoch": 1.5929441260744985, "grad_norm": 0.4733685553073883, "learning_rate": 9.908396025896297e-06, "loss": 0.4333, "mean_token_accuracy": 0.8540140748023987, "num_tokens": 893923849.0, "step": 26685 }, { "epoch": 1.5932425978987583, "grad_norm": 0.4882482588291168, "learning_rate": 9.901474965949953e-06, "loss": 0.4005, "mean_token_accuracy": 0.8641655802726745, "num_tokens": 894091529.0, "step": 26690 }, { "epoch": 1.5935410697230181, "grad_norm": 0.4893314242362976, "learning_rate": 9.894558192449208e-06, "loss": 0.4418, "mean_token_accuracy": 0.8521770238876343, "num_tokens": 894259209.0, "step": 26695 }, { "epoch": 1.593839541547278, "grad_norm": 0.5153326392173767, "learning_rate": 9.887645707078776e-06, "loss": 0.4472, "mean_token_accuracy": 0.8510497450828552, "num_tokens": 894426889.0, "step": 26700 }, { "epoch": 1.5941380133715377, "grad_norm": 0.5323935151100159, "learning_rate": 9.88073751152232e-06, "loss": 0.4482, "mean_token_accuracy": 0.8494453072547913, "num_tokens": 894594569.0, "step": 26705 }, { "epoch": 1.5944364851957975, "grad_norm": 0.5779756903648376, "learning_rate": 9.873833607462462e-06, "loss": 0.4937, "mean_token_accuracy": 0.8369080305099488, "num_tokens": 894762249.0, "step": 26710 }, { "epoch": 1.5947349570200573, "grad_norm": 0.5010717511177063, "learning_rate": 9.866933996580763e-06, "loss": 0.4228, "mean_token_accuracy": 0.8586663365364074, "num_tokens": 894929929.0, "step": 26715 }, { "epoch": 1.595033428844317, "grad_norm": 0.4734920561313629, "learning_rate": 9.860038680557762e-06, "loss": 0.4512, "mean_token_accuracy": 0.8497136950492858, "num_tokens": 895097609.0, "step": 26720 }, { "epoch": 1.5953319006685769, "grad_norm": 0.5456564426422119, "learning_rate": 9.853147661072936e-06, "loss": 0.4087, "mean_token_accuracy": 0.863020396232605, "num_tokens": 895265289.0, "step": 26725 }, { "epoch": 1.5956303724928367, "grad_norm": 0.4554482102394104, "learning_rate": 9.846260939804719e-06, "loss": 0.3861, "mean_token_accuracy": 0.8693188548088073, "num_tokens": 895432969.0, "step": 26730 }, { "epoch": 1.5959288443170965, "grad_norm": 0.4803209900856018, "learning_rate": 9.839378518430506e-06, "loss": 0.4703, "mean_token_accuracy": 0.8443278074264526, "num_tokens": 895600649.0, "step": 26735 }, { "epoch": 1.5962273161413563, "grad_norm": 0.4860791265964508, "learning_rate": 9.832500398626628e-06, "loss": 0.4248, "mean_token_accuracy": 0.8565907120704651, "num_tokens": 895768329.0, "step": 26740 }, { "epoch": 1.596525787965616, "grad_norm": 0.49583545327186584, "learning_rate": 9.825626582068384e-06, "loss": 0.4346, "mean_token_accuracy": 0.8548073410987854, "num_tokens": 895936009.0, "step": 26745 }, { "epoch": 1.5968242597898759, "grad_norm": 0.5204347372055054, "learning_rate": 9.818757070430016e-06, "loss": 0.4406, "mean_token_accuracy": 0.8525289297103882, "num_tokens": 896103689.0, "step": 26750 }, { "epoch": 1.5971227316141356, "grad_norm": 0.5683661699295044, "learning_rate": 9.811891865384723e-06, "loss": 0.4721, "mean_token_accuracy": 0.8433377146720886, "num_tokens": 896271369.0, "step": 26755 }, { "epoch": 1.5974212034383954, "grad_norm": 0.47288426756858826, "learning_rate": 9.805030968604651e-06, "loss": 0.415, "mean_token_accuracy": 0.8594954133033752, "num_tokens": 896439049.0, "step": 26760 }, { "epoch": 1.5977196752626552, "grad_norm": 0.5538458824157715, "learning_rate": 9.798174381760896e-06, "loss": 0.4291, "mean_token_accuracy": 0.8562626719474793, "num_tokens": 896606729.0, "step": 26765 }, { "epoch": 1.598018147086915, "grad_norm": 0.5352579355239868, "learning_rate": 9.791322106523516e-06, "loss": 0.4265, "mean_token_accuracy": 0.857079803943634, "num_tokens": 896774409.0, "step": 26770 }, { "epoch": 1.5983166189111748, "grad_norm": 0.5396891236305237, "learning_rate": 9.784474144561498e-06, "loss": 0.4672, "mean_token_accuracy": 0.845902419090271, "num_tokens": 896942089.0, "step": 26775 }, { "epoch": 1.5986150907354346, "grad_norm": 0.446588397026062, "learning_rate": 9.777630497542792e-06, "loss": 0.4496, "mean_token_accuracy": 0.8509543061256408, "num_tokens": 897109769.0, "step": 26780 }, { "epoch": 1.5989135625596944, "grad_norm": 0.5855042338371277, "learning_rate": 9.770791167134293e-06, "loss": 0.4575, "mean_token_accuracy": 0.8473398447036743, "num_tokens": 897277449.0, "step": 26785 }, { "epoch": 1.5992120343839542, "grad_norm": 0.48388636112213135, "learning_rate": 9.763956155001852e-06, "loss": 0.4225, "mean_token_accuracy": 0.85706787109375, "num_tokens": 897445129.0, "step": 26790 }, { "epoch": 1.599510506208214, "grad_norm": 0.4797343611717224, "learning_rate": 9.757125462810263e-06, "loss": 0.3971, "mean_token_accuracy": 0.8664678454399108, "num_tokens": 897612809.0, "step": 26795 }, { "epoch": 1.5998089780324736, "grad_norm": 0.5071563720703125, "learning_rate": 9.75029909222326e-06, "loss": 0.4709, "mean_token_accuracy": 0.843057382106781, "num_tokens": 897780489.0, "step": 26800 }, { "epoch": 1.6001074498567336, "grad_norm": 0.5073161721229553, "learning_rate": 9.743477044903532e-06, "loss": 0.4831, "mean_token_accuracy": 0.8407252907752991, "num_tokens": 897948169.0, "step": 26805 }, { "epoch": 1.6004059216809932, "grad_norm": 0.49718189239501953, "learning_rate": 9.73665932251272e-06, "loss": 0.4357, "mean_token_accuracy": 0.8544196605682373, "num_tokens": 898115849.0, "step": 26810 }, { "epoch": 1.6007043935052532, "grad_norm": 0.4949471354484558, "learning_rate": 9.729845926711403e-06, "loss": 0.4222, "mean_token_accuracy": 0.8589109063148499, "num_tokens": 898283529.0, "step": 26815 }, { "epoch": 1.6010028653295127, "grad_norm": 0.49653738737106323, "learning_rate": 9.723036859159117e-06, "loss": 0.4221, "mean_token_accuracy": 0.8589282393455505, "num_tokens": 898447005.0, "step": 26820 }, { "epoch": 1.6013013371537728, "grad_norm": 0.5347087979316711, "learning_rate": 9.716232121514316e-06, "loss": 0.4693, "mean_token_accuracy": 0.8447035551071167, "num_tokens": 898614685.0, "step": 26825 }, { "epoch": 1.6015998089780323, "grad_norm": 0.5218964219093323, "learning_rate": 9.709431715434445e-06, "loss": 0.409, "mean_token_accuracy": 0.8617738366127015, "num_tokens": 898782365.0, "step": 26830 }, { "epoch": 1.6018982808022924, "grad_norm": 0.516520082950592, "learning_rate": 9.70263564257585e-06, "loss": 0.4309, "mean_token_accuracy": 0.8550817251205445, "num_tokens": 898950045.0, "step": 26835 }, { "epoch": 1.602196752626552, "grad_norm": 0.5449186563491821, "learning_rate": 9.695843904593843e-06, "loss": 0.4246, "mean_token_accuracy": 0.8581295490264893, "num_tokens": 899117725.0, "step": 26840 }, { "epoch": 1.602495224450812, "grad_norm": 0.49110421538352966, "learning_rate": 9.689056503142683e-06, "loss": 0.4495, "mean_token_accuracy": 0.8502743601799011, "num_tokens": 899285405.0, "step": 26845 }, { "epoch": 1.6027936962750715, "grad_norm": 0.5249834656715393, "learning_rate": 9.682273439875565e-06, "loss": 0.4201, "mean_token_accuracy": 0.8587916016578674, "num_tokens": 899453085.0, "step": 26850 }, { "epoch": 1.6030921680993315, "grad_norm": 0.5787405967712402, "learning_rate": 9.675494716444634e-06, "loss": 0.4196, "mean_token_accuracy": 0.85933438539505, "num_tokens": 899620765.0, "step": 26855 }, { "epoch": 1.603390639923591, "grad_norm": 0.5511835217475891, "learning_rate": 9.668720334500959e-06, "loss": 0.4709, "mean_token_accuracy": 0.8446797132492065, "num_tokens": 899788445.0, "step": 26860 }, { "epoch": 1.6036891117478511, "grad_norm": 0.479943186044693, "learning_rate": 9.661950295694575e-06, "loss": 0.4207, "mean_token_accuracy": 0.8601156949996949, "num_tokens": 899956125.0, "step": 26865 }, { "epoch": 1.6039875835721107, "grad_norm": 0.5228937864303589, "learning_rate": 9.65518460167445e-06, "loss": 0.4398, "mean_token_accuracy": 0.8517773985862732, "num_tokens": 900123805.0, "step": 26870 }, { "epoch": 1.6042860553963707, "grad_norm": 0.4709329307079315, "learning_rate": 9.648423254088493e-06, "loss": 0.4375, "mean_token_accuracy": 0.8529285311698913, "num_tokens": 900291485.0, "step": 26875 }, { "epoch": 1.6045845272206303, "grad_norm": 0.5314335823059082, "learning_rate": 9.64166625458356e-06, "loss": 0.4206, "mean_token_accuracy": 0.8573959231376648, "num_tokens": 900459165.0, "step": 26880 }, { "epoch": 1.6048829990448903, "grad_norm": 0.5271437764167786, "learning_rate": 9.634913604805426e-06, "loss": 0.3926, "mean_token_accuracy": 0.8686866283416748, "num_tokens": 900626845.0, "step": 26885 }, { "epoch": 1.6051814708691499, "grad_norm": 0.5638481378555298, "learning_rate": 9.62816530639884e-06, "loss": 0.429, "mean_token_accuracy": 0.8561433792114258, "num_tokens": 900794525.0, "step": 26890 }, { "epoch": 1.6054799426934099, "grad_norm": 0.5208526849746704, "learning_rate": 9.62142136100748e-06, "loss": 0.4191, "mean_token_accuracy": 0.8589824557304382, "num_tokens": 900962205.0, "step": 26895 }, { "epoch": 1.6057784145176695, "grad_norm": 0.4979512393474579, "learning_rate": 9.614681770273938e-06, "loss": 0.4206, "mean_token_accuracy": 0.8592866420745849, "num_tokens": 901129885.0, "step": 26900 }, { "epoch": 1.6060768863419295, "grad_norm": 0.49903905391693115, "learning_rate": 9.607946535839779e-06, "loss": 0.4445, "mean_token_accuracy": 0.8520637035369873, "num_tokens": 901297565.0, "step": 26905 }, { "epoch": 1.606375358166189, "grad_norm": 0.4726596176624298, "learning_rate": 9.60121565934549e-06, "loss": 0.417, "mean_token_accuracy": 0.859799587726593, "num_tokens": 901465245.0, "step": 26910 }, { "epoch": 1.606673829990449, "grad_norm": 0.5235370993614197, "learning_rate": 9.594489142430504e-06, "loss": 0.4423, "mean_token_accuracy": 0.852004063129425, "num_tokens": 901632925.0, "step": 26915 }, { "epoch": 1.6069723018147086, "grad_norm": 0.4798019528388977, "learning_rate": 9.587766986733182e-06, "loss": 0.4396, "mean_token_accuracy": 0.8529047012329102, "num_tokens": 901800605.0, "step": 26920 }, { "epoch": 1.6072707736389686, "grad_norm": 0.5040823221206665, "learning_rate": 9.581049193890832e-06, "loss": 0.4237, "mean_token_accuracy": 0.8580758810043335, "num_tokens": 901968285.0, "step": 26925 }, { "epoch": 1.6075692454632282, "grad_norm": 0.49416518211364746, "learning_rate": 9.574335765539697e-06, "loss": 0.4506, "mean_token_accuracy": 0.8500179052352905, "num_tokens": 902135965.0, "step": 26930 }, { "epoch": 1.607867717287488, "grad_norm": 0.5431501865386963, "learning_rate": 9.567626703314955e-06, "loss": 0.4664, "mean_token_accuracy": 0.8454908847808837, "num_tokens": 902303645.0, "step": 26935 }, { "epoch": 1.6081661891117478, "grad_norm": 0.4754214882850647, "learning_rate": 9.560922008850729e-06, "loss": 0.4107, "mean_token_accuracy": 0.8612907290458679, "num_tokens": 902471325.0, "step": 26940 }, { "epoch": 1.6084646609360076, "grad_norm": 0.4518228769302368, "learning_rate": 9.554221683780059e-06, "loss": 0.4558, "mean_token_accuracy": 0.8502803206443786, "num_tokens": 902639005.0, "step": 26945 }, { "epoch": 1.6087631327602674, "grad_norm": 0.4833535850048065, "learning_rate": 9.547525729734933e-06, "loss": 0.3867, "mean_token_accuracy": 0.869873559474945, "num_tokens": 902806685.0, "step": 26950 }, { "epoch": 1.6090616045845272, "grad_norm": 0.4745807647705078, "learning_rate": 9.540834148346294e-06, "loss": 0.436, "mean_token_accuracy": 0.8546074509620667, "num_tokens": 902972867.0, "step": 26955 }, { "epoch": 1.609360076408787, "grad_norm": 0.4893713891506195, "learning_rate": 9.53414694124398e-06, "loss": 0.4579, "mean_token_accuracy": 0.8480615496635437, "num_tokens": 903140547.0, "step": 26960 }, { "epoch": 1.6096585482330468, "grad_norm": 0.5143224596977234, "learning_rate": 9.527464110056795e-06, "loss": 0.4142, "mean_token_accuracy": 0.86092689037323, "num_tokens": 903308227.0, "step": 26965 }, { "epoch": 1.6099570200573066, "grad_norm": 0.5242037177085876, "learning_rate": 9.520785656412451e-06, "loss": 0.4504, "mean_token_accuracy": 0.8502564668655396, "num_tokens": 903475907.0, "step": 26970 }, { "epoch": 1.6102554918815664, "grad_norm": 0.49313151836395264, "learning_rate": 9.514111581937624e-06, "loss": 0.4383, "mean_token_accuracy": 0.8542943954467773, "num_tokens": 903643587.0, "step": 26975 }, { "epoch": 1.6105539637058262, "grad_norm": 0.5468960404396057, "learning_rate": 9.507441888257909e-06, "loss": 0.4768, "mean_token_accuracy": 0.8423893690109253, "num_tokens": 903811267.0, "step": 26980 }, { "epoch": 1.610852435530086, "grad_norm": 0.4903563857078552, "learning_rate": 9.500776576997817e-06, "loss": 0.4368, "mean_token_accuracy": 0.8536144614219665, "num_tokens": 903978947.0, "step": 26985 }, { "epoch": 1.6111509073543457, "grad_norm": 0.4625544846057892, "learning_rate": 9.49411564978082e-06, "loss": 0.3955, "mean_token_accuracy": 0.8665871500968934, "num_tokens": 904146627.0, "step": 26990 }, { "epoch": 1.6114493791786055, "grad_norm": 0.5248073935508728, "learning_rate": 9.487459108229308e-06, "loss": 0.4636, "mean_token_accuracy": 0.8441965937614441, "num_tokens": 904314307.0, "step": 26995 }, { "epoch": 1.6117478510028653, "grad_norm": 0.5223250985145569, "learning_rate": 9.480806953964603e-06, "loss": 0.4274, "mean_token_accuracy": 0.8551348924636841, "num_tokens": 904475803.0, "step": 27000 }, { "epoch": 1.6120463228271251, "grad_norm": 0.5437977910041809, "learning_rate": 9.474159188606966e-06, "loss": 0.4713, "mean_token_accuracy": 0.8438267946243286, "num_tokens": 904643483.0, "step": 27005 }, { "epoch": 1.612344794651385, "grad_norm": 0.5392799377441406, "learning_rate": 9.467515813775565e-06, "loss": 0.4689, "mean_token_accuracy": 0.8446916460990905, "num_tokens": 904811163.0, "step": 27010 }, { "epoch": 1.6126432664756447, "grad_norm": 0.5598256587982178, "learning_rate": 9.460876831088542e-06, "loss": 0.4328, "mean_token_accuracy": 0.8536442875862121, "num_tokens": 904978843.0, "step": 27015 }, { "epoch": 1.6129417382999045, "grad_norm": 0.5238668322563171, "learning_rate": 9.454242242162925e-06, "loss": 0.4847, "mean_token_accuracy": 0.8389538288116455, "num_tokens": 905146523.0, "step": 27020 }, { "epoch": 1.6132402101241643, "grad_norm": 0.5520257353782654, "learning_rate": 9.447612048614699e-06, "loss": 0.4667, "mean_token_accuracy": 0.8442264199256897, "num_tokens": 905314203.0, "step": 27025 }, { "epoch": 1.613538681948424, "grad_norm": 0.5324056148529053, "learning_rate": 9.44098625205876e-06, "loss": 0.4379, "mean_token_accuracy": 0.8533102750778199, "num_tokens": 905481883.0, "step": 27030 }, { "epoch": 1.6138371537726839, "grad_norm": 0.4806233048439026, "learning_rate": 9.434364854108954e-06, "loss": 0.4011, "mean_token_accuracy": 0.8637361288070678, "num_tokens": 905649563.0, "step": 27035 }, { "epoch": 1.6141356255969437, "grad_norm": 0.48348575830459595, "learning_rate": 9.42774785637805e-06, "loss": 0.428, "mean_token_accuracy": 0.8563103914260864, "num_tokens": 905817243.0, "step": 27040 }, { "epoch": 1.6144340974212035, "grad_norm": 0.47034746408462524, "learning_rate": 9.421135260477723e-06, "loss": 0.3855, "mean_token_accuracy": 0.8697304010391236, "num_tokens": 905984923.0, "step": 27045 }, { "epoch": 1.6147325692454633, "grad_norm": 0.5088317394256592, "learning_rate": 9.4145270680186e-06, "loss": 0.4467, "mean_token_accuracy": 0.8514016389846801, "num_tokens": 906152603.0, "step": 27050 }, { "epoch": 1.615031041069723, "grad_norm": 0.576236367225647, "learning_rate": 9.407923280610231e-06, "loss": 0.4463, "mean_token_accuracy": 0.8515030384063721, "num_tokens": 906320283.0, "step": 27055 }, { "epoch": 1.6153295128939829, "grad_norm": 0.5178862810134888, "learning_rate": 9.401323899861088e-06, "loss": 0.4218, "mean_token_accuracy": 0.8584814429283142, "num_tokens": 906487963.0, "step": 27060 }, { "epoch": 1.6156279847182426, "grad_norm": 0.48196008801460266, "learning_rate": 9.394728927378576e-06, "loss": 0.4362, "mean_token_accuracy": 0.8545568346977234, "num_tokens": 906655643.0, "step": 27065 }, { "epoch": 1.6159264565425024, "grad_norm": 0.4702968895435333, "learning_rate": 9.388138364769018e-06, "loss": 0.4063, "mean_token_accuracy": 0.8634379148483277, "num_tokens": 906823323.0, "step": 27070 }, { "epoch": 1.616224928366762, "grad_norm": 0.5784925818443298, "learning_rate": 9.381552213637664e-06, "loss": 0.4497, "mean_token_accuracy": 0.8507634401321411, "num_tokens": 906991003.0, "step": 27075 }, { "epoch": 1.616523400191022, "grad_norm": 0.5316598415374756, "learning_rate": 9.374970475588697e-06, "loss": 0.4606, "mean_token_accuracy": 0.847667396068573, "num_tokens": 907155008.0, "step": 27080 }, { "epoch": 1.6168218720152816, "grad_norm": 0.47320249676704407, "learning_rate": 9.368393152225222e-06, "loss": 0.4331, "mean_token_accuracy": 0.8547477006912232, "num_tokens": 907322688.0, "step": 27085 }, { "epoch": 1.6171203438395416, "grad_norm": 0.45936790108680725, "learning_rate": 9.36182024514927e-06, "loss": 0.4249, "mean_token_accuracy": 0.8595848679542542, "num_tokens": 907490368.0, "step": 27090 }, { "epoch": 1.6174188156638012, "grad_norm": 0.589867353439331, "learning_rate": 9.35525175596178e-06, "loss": 0.4496, "mean_token_accuracy": 0.8518787980079651, "num_tokens": 907658048.0, "step": 27095 }, { "epoch": 1.6177172874880612, "grad_norm": 0.5166635513305664, "learning_rate": 9.348687686262647e-06, "loss": 0.4358, "mean_token_accuracy": 0.8537576079368592, "num_tokens": 907825728.0, "step": 27100 }, { "epoch": 1.6180157593123208, "grad_norm": 0.5179538130760193, "learning_rate": 9.34212803765066e-06, "loss": 0.4534, "mean_token_accuracy": 0.849862813949585, "num_tokens": 907993408.0, "step": 27105 }, { "epoch": 1.6183142311365808, "grad_norm": 0.5585004091262817, "learning_rate": 9.335572811723545e-06, "loss": 0.4367, "mean_token_accuracy": 0.8537874341011047, "num_tokens": 908161088.0, "step": 27110 }, { "epoch": 1.6186127029608404, "grad_norm": 0.5620519518852234, "learning_rate": 9.329022010077947e-06, "loss": 0.4987, "mean_token_accuracy": 0.83711678981781, "num_tokens": 908328768.0, "step": 27115 }, { "epoch": 1.6189111747851004, "grad_norm": 0.47741344571113586, "learning_rate": 9.322475634309436e-06, "loss": 0.3985, "mean_token_accuracy": 0.864881956577301, "num_tokens": 908491032.0, "step": 27120 }, { "epoch": 1.61920964660936, "grad_norm": 0.524983823299408, "learning_rate": 9.31593368601251e-06, "loss": 0.44, "mean_token_accuracy": 0.8528390645980835, "num_tokens": 908658712.0, "step": 27125 }, { "epoch": 1.61950811843362, "grad_norm": 0.5282403230667114, "learning_rate": 9.309396166780567e-06, "loss": 0.4693, "mean_token_accuracy": 0.8461290717124939, "num_tokens": 908826392.0, "step": 27130 }, { "epoch": 1.6198065902578795, "grad_norm": 0.5588261485099792, "learning_rate": 9.302863078205951e-06, "loss": 0.451, "mean_token_accuracy": 0.8486937880516052, "num_tokens": 908994072.0, "step": 27135 }, { "epoch": 1.6201050620821396, "grad_norm": 0.4812007546424866, "learning_rate": 9.29633442187991e-06, "loss": 0.4118, "mean_token_accuracy": 0.8601813197135926, "num_tokens": 909161752.0, "step": 27140 }, { "epoch": 1.6204035339063991, "grad_norm": 0.50464928150177, "learning_rate": 9.289810199392622e-06, "loss": 0.4374, "mean_token_accuracy": 0.853847086429596, "num_tokens": 909329432.0, "step": 27145 }, { "epoch": 1.6207020057306591, "grad_norm": 0.5393331050872803, "learning_rate": 9.283290412333188e-06, "loss": 0.4707, "mean_token_accuracy": 0.8434748768806457, "num_tokens": 909497112.0, "step": 27150 }, { "epoch": 1.6210004775549187, "grad_norm": 0.4586549401283264, "learning_rate": 9.276775062289606e-06, "loss": 0.4158, "mean_token_accuracy": 0.861266839504242, "num_tokens": 909664792.0, "step": 27155 }, { "epoch": 1.6212989493791787, "grad_norm": 0.5473030209541321, "learning_rate": 9.27026415084883e-06, "loss": 0.4615, "mean_token_accuracy": 0.8463318586349488, "num_tokens": 909832472.0, "step": 27160 }, { "epoch": 1.6215974212034383, "grad_norm": 0.5064117312431335, "learning_rate": 9.263757679596694e-06, "loss": 0.4409, "mean_token_accuracy": 0.8542049288749695, "num_tokens": 910000152.0, "step": 27165 }, { "epoch": 1.6218958930276983, "grad_norm": 0.498205304145813, "learning_rate": 9.257255650117983e-06, "loss": 0.4049, "mean_token_accuracy": 0.8642908215522767, "num_tokens": 910167832.0, "step": 27170 }, { "epoch": 1.622194364851958, "grad_norm": 0.5261074900627136, "learning_rate": 9.250758063996376e-06, "loss": 0.4502, "mean_token_accuracy": 0.8519205570220947, "num_tokens": 910335512.0, "step": 27175 }, { "epoch": 1.622492836676218, "grad_norm": 0.5457990765571594, "learning_rate": 9.244264922814487e-06, "loss": 0.4497, "mean_token_accuracy": 0.8492544293403625, "num_tokens": 910503192.0, "step": 27180 }, { "epoch": 1.6227913085004775, "grad_norm": 0.5503063201904297, "learning_rate": 9.237776228153845e-06, "loss": 0.4403, "mean_token_accuracy": 0.8519623041152954, "num_tokens": 910670872.0, "step": 27185 }, { "epoch": 1.6230897803247375, "grad_norm": 0.5159018635749817, "learning_rate": 9.231291981594874e-06, "loss": 0.4118, "mean_token_accuracy": 0.8600083589553833, "num_tokens": 910838552.0, "step": 27190 }, { "epoch": 1.623388252148997, "grad_norm": 0.4987584948539734, "learning_rate": 9.22481218471695e-06, "loss": 0.4033, "mean_token_accuracy": 0.863873302936554, "num_tokens": 911006232.0, "step": 27195 }, { "epoch": 1.623686723973257, "grad_norm": 0.4894299805164337, "learning_rate": 9.218336839098335e-06, "loss": 0.4106, "mean_token_accuracy": 0.8630144357681274, "num_tokens": 911173912.0, "step": 27200 }, { "epoch": 1.6239851957975167, "grad_norm": 0.4829419255256653, "learning_rate": 9.211865946316223e-06, "loss": 0.3953, "mean_token_accuracy": 0.8659012198448182, "num_tokens": 911341592.0, "step": 27205 }, { "epoch": 1.6242836676217765, "grad_norm": 0.6027539372444153, "learning_rate": 9.205399507946726e-06, "loss": 0.4705, "mean_token_accuracy": 0.8444888472557068, "num_tokens": 911509272.0, "step": 27210 }, { "epoch": 1.6245821394460362, "grad_norm": 0.6368187069892883, "learning_rate": 9.198937525564856e-06, "loss": 0.4501, "mean_token_accuracy": 0.849874746799469, "num_tokens": 911676952.0, "step": 27215 }, { "epoch": 1.624880611270296, "grad_norm": 0.5282248258590698, "learning_rate": 9.192480000744544e-06, "loss": 0.4278, "mean_token_accuracy": 0.8571215748786927, "num_tokens": 911844632.0, "step": 27220 }, { "epoch": 1.6251790830945558, "grad_norm": 0.4703761041164398, "learning_rate": 9.186026935058657e-06, "loss": 0.4315, "mean_token_accuracy": 0.8558928728103637, "num_tokens": 912012312.0, "step": 27225 }, { "epoch": 1.6254775549188156, "grad_norm": 0.47730252146720886, "learning_rate": 9.179578330078939e-06, "loss": 0.4194, "mean_token_accuracy": 0.859060001373291, "num_tokens": 912179992.0, "step": 27230 }, { "epoch": 1.6257760267430754, "grad_norm": 0.4958620071411133, "learning_rate": 9.173134187376078e-06, "loss": 0.4372, "mean_token_accuracy": 0.8554276466369629, "num_tokens": 912347672.0, "step": 27235 }, { "epoch": 1.6260744985673352, "grad_norm": 0.47102028131484985, "learning_rate": 9.166694508519659e-06, "loss": 0.4278, "mean_token_accuracy": 0.856316351890564, "num_tokens": 912515352.0, "step": 27240 }, { "epoch": 1.626372970391595, "grad_norm": 0.4946134388446808, "learning_rate": 9.160259295078193e-06, "loss": 0.4504, "mean_token_accuracy": 0.849749481678009, "num_tokens": 912683032.0, "step": 27245 }, { "epoch": 1.6266714422158548, "grad_norm": 0.5380204319953918, "learning_rate": 9.153828548619085e-06, "loss": 0.4511, "mean_token_accuracy": 0.8505845189094543, "num_tokens": 912850712.0, "step": 27250 }, { "epoch": 1.6269699140401146, "grad_norm": 0.4733681082725525, "learning_rate": 9.147402270708662e-06, "loss": 0.4287, "mean_token_accuracy": 0.8559346318244934, "num_tokens": 913018392.0, "step": 27255 }, { "epoch": 1.6272683858643744, "grad_norm": 0.4521208107471466, "learning_rate": 9.140980462912165e-06, "loss": 0.3925, "mean_token_accuracy": 0.867141842842102, "num_tokens": 913186072.0, "step": 27260 }, { "epoch": 1.6275668576886342, "grad_norm": 0.5736082196235657, "learning_rate": 9.134563126793749e-06, "loss": 0.4255, "mean_token_accuracy": 0.8585112690925598, "num_tokens": 913353752.0, "step": 27265 }, { "epoch": 1.627865329512894, "grad_norm": 0.49521011114120483, "learning_rate": 9.128150263916474e-06, "loss": 0.4516, "mean_token_accuracy": 0.8474114298820495, "num_tokens": 913521432.0, "step": 27270 }, { "epoch": 1.6281638013371538, "grad_norm": 0.5299897789955139, "learning_rate": 9.121741875842303e-06, "loss": 0.424, "mean_token_accuracy": 0.8581056952476501, "num_tokens": 913689112.0, "step": 27275 }, { "epoch": 1.6284622731614136, "grad_norm": 0.560864269733429, "learning_rate": 9.115337964132115e-06, "loss": 0.4318, "mean_token_accuracy": 0.8541274070739746, "num_tokens": 913856792.0, "step": 27280 }, { "epoch": 1.6287607449856734, "grad_norm": 0.5350287556648254, "learning_rate": 9.10893853034572e-06, "loss": 0.4502, "mean_token_accuracy": 0.8483597755432128, "num_tokens": 914024472.0, "step": 27285 }, { "epoch": 1.6290592168099332, "grad_norm": 0.5666098594665527, "learning_rate": 9.1025435760418e-06, "loss": 0.4473, "mean_token_accuracy": 0.8494654893875122, "num_tokens": 914191795.0, "step": 27290 }, { "epoch": 1.629357688634193, "grad_norm": 0.51736980676651, "learning_rate": 9.096153102777977e-06, "loss": 0.4498, "mean_token_accuracy": 0.850727653503418, "num_tokens": 914359475.0, "step": 27295 }, { "epoch": 1.6296561604584527, "grad_norm": 0.46848005056381226, "learning_rate": 9.089767112110747e-06, "loss": 0.448, "mean_token_accuracy": 0.8509125709533691, "num_tokens": 914527155.0, "step": 27300 }, { "epoch": 1.6299546322827125, "grad_norm": 0.49154558777809143, "learning_rate": 9.08338560559556e-06, "loss": 0.4371, "mean_token_accuracy": 0.8552069783210754, "num_tokens": 914694835.0, "step": 27305 }, { "epoch": 1.6302531041069723, "grad_norm": 0.5288469195365906, "learning_rate": 9.077008584786744e-06, "loss": 0.4607, "mean_token_accuracy": 0.8457175254821777, "num_tokens": 914862515.0, "step": 27310 }, { "epoch": 1.6305515759312321, "grad_norm": 0.4710169732570648, "learning_rate": 9.070636051237536e-06, "loss": 0.4597, "mean_token_accuracy": 0.8464094042778015, "num_tokens": 915030195.0, "step": 27315 }, { "epoch": 1.630850047755492, "grad_norm": 0.5431186556816101, "learning_rate": 9.06426800650008e-06, "loss": 0.4478, "mean_token_accuracy": 0.851491117477417, "num_tokens": 915197875.0, "step": 27320 }, { "epoch": 1.6311485195797517, "grad_norm": 0.5576286315917969, "learning_rate": 9.057904452125438e-06, "loss": 0.4797, "mean_token_accuracy": 0.8407789587974548, "num_tokens": 915365555.0, "step": 27325 }, { "epoch": 1.6314469914040115, "grad_norm": 0.5282570123672485, "learning_rate": 9.051545389663566e-06, "loss": 0.4246, "mean_token_accuracy": 0.8584039211273193, "num_tokens": 915533235.0, "step": 27330 }, { "epoch": 1.6317454632282713, "grad_norm": 0.4832616448402405, "learning_rate": 9.045190820663342e-06, "loss": 0.4055, "mean_token_accuracy": 0.86303231716156, "num_tokens": 915700915.0, "step": 27335 }, { "epoch": 1.632043935052531, "grad_norm": 0.46541497111320496, "learning_rate": 9.038840746672523e-06, "loss": 0.4409, "mean_token_accuracy": 0.8526482224464417, "num_tokens": 915868595.0, "step": 27340 }, { "epoch": 1.6323424068767909, "grad_norm": 0.4830757677555084, "learning_rate": 9.032495169237794e-06, "loss": 0.4126, "mean_token_accuracy": 0.8616068243980408, "num_tokens": 916036275.0, "step": 27345 }, { "epoch": 1.6326408787010505, "grad_norm": 0.48255184292793274, "learning_rate": 9.026154089904737e-06, "loss": 0.4275, "mean_token_accuracy": 0.855815327167511, "num_tokens": 916203955.0, "step": 27350 }, { "epoch": 1.6329393505253105, "grad_norm": 0.5121918320655823, "learning_rate": 9.019817510217849e-06, "loss": 0.4352, "mean_token_accuracy": 0.8541751265525818, "num_tokens": 916371635.0, "step": 27355 }, { "epoch": 1.63323782234957, "grad_norm": 0.5057121515274048, "learning_rate": 9.013485431720498e-06, "loss": 0.4517, "mean_token_accuracy": 0.8485446572303772, "num_tokens": 916539315.0, "step": 27360 }, { "epoch": 1.63353629417383, "grad_norm": 0.5022083520889282, "learning_rate": 9.007157855954997e-06, "loss": 0.4193, "mean_token_accuracy": 0.8576404571533203, "num_tokens": 916706995.0, "step": 27365 }, { "epoch": 1.6338347659980896, "grad_norm": 0.5485715270042419, "learning_rate": 9.000834784462544e-06, "loss": 0.4334, "mean_token_accuracy": 0.8532386898994446, "num_tokens": 916874675.0, "step": 27370 }, { "epoch": 1.6341332378223496, "grad_norm": 0.5028972625732422, "learning_rate": 8.994516218783232e-06, "loss": 0.4565, "mean_token_accuracy": 0.8468030571937561, "num_tokens": 917042355.0, "step": 27375 }, { "epoch": 1.6344317096466092, "grad_norm": 0.45500344038009644, "learning_rate": 8.988202160456064e-06, "loss": 0.4429, "mean_token_accuracy": 0.8534891963005066, "num_tokens": 917210035.0, "step": 27380 }, { "epoch": 1.6347301814708692, "grad_norm": 0.4935471713542938, "learning_rate": 8.981892611018949e-06, "loss": 0.4293, "mean_token_accuracy": 0.8555827260017395, "num_tokens": 917377715.0, "step": 27385 }, { "epoch": 1.6350286532951288, "grad_norm": 0.4806463420391083, "learning_rate": 8.975587572008695e-06, "loss": 0.4145, "mean_token_accuracy": 0.8592150807380676, "num_tokens": 917545395.0, "step": 27390 }, { "epoch": 1.6353271251193888, "grad_norm": 0.5478640794754028, "learning_rate": 8.969287044961014e-06, "loss": 0.4508, "mean_token_accuracy": 0.8494751214981079, "num_tokens": 917713075.0, "step": 27395 }, { "epoch": 1.6356255969436484, "grad_norm": 0.49986085295677185, "learning_rate": 8.962991031410507e-06, "loss": 0.4525, "mean_token_accuracy": 0.8484313488006592, "num_tokens": 917880755.0, "step": 27400 }, { "epoch": 1.6359240687679084, "grad_norm": 0.5525385737419128, "learning_rate": 8.956699532890683e-06, "loss": 0.4286, "mean_token_accuracy": 0.856226897239685, "num_tokens": 918048435.0, "step": 27405 }, { "epoch": 1.636222540592168, "grad_norm": 0.4970659911632538, "learning_rate": 8.950412550933964e-06, "loss": 0.4323, "mean_token_accuracy": 0.8539961814880371, "num_tokens": 918216115.0, "step": 27410 }, { "epoch": 1.636521012416428, "grad_norm": 0.5140109658241272, "learning_rate": 8.944130087071651e-06, "loss": 0.4462, "mean_token_accuracy": 0.8535548210144043, "num_tokens": 918383795.0, "step": 27415 }, { "epoch": 1.6368194842406876, "grad_norm": 0.5214282274246216, "learning_rate": 8.937852142833963e-06, "loss": 0.4135, "mean_token_accuracy": 0.8604795575141907, "num_tokens": 918551475.0, "step": 27420 }, { "epoch": 1.6371179560649476, "grad_norm": 0.4619886875152588, "learning_rate": 8.931578719749997e-06, "loss": 0.4105, "mean_token_accuracy": 0.8627997159957885, "num_tokens": 918719155.0, "step": 27425 }, { "epoch": 1.6374164278892072, "grad_norm": 0.5684396624565125, "learning_rate": 8.925309819347774e-06, "loss": 0.4309, "mean_token_accuracy": 0.8536502480506897, "num_tokens": 918886835.0, "step": 27430 }, { "epoch": 1.6377148997134672, "grad_norm": 0.5223966836929321, "learning_rate": 8.919045443154189e-06, "loss": 0.4161, "mean_token_accuracy": 0.859298586845398, "num_tokens": 919054515.0, "step": 27435 }, { "epoch": 1.6380133715377267, "grad_norm": 0.5374763011932373, "learning_rate": 8.912785592695053e-06, "loss": 0.46, "mean_token_accuracy": 0.8458368182182312, "num_tokens": 919222195.0, "step": 27440 }, { "epoch": 1.6383118433619868, "grad_norm": 0.49368685483932495, "learning_rate": 8.906530269495064e-06, "loss": 0.4249, "mean_token_accuracy": 0.8565191507339478, "num_tokens": 919389875.0, "step": 27445 }, { "epoch": 1.6386103151862463, "grad_norm": 0.49564072489738464, "learning_rate": 8.900279475077827e-06, "loss": 0.4105, "mean_token_accuracy": 0.8620123982429504, "num_tokens": 919557555.0, "step": 27450 }, { "epoch": 1.6389087870105064, "grad_norm": 0.48721274733543396, "learning_rate": 8.894033210965838e-06, "loss": 0.4196, "mean_token_accuracy": 0.8584396958351135, "num_tokens": 919725235.0, "step": 27455 }, { "epoch": 1.639207258834766, "grad_norm": 0.6268237829208374, "learning_rate": 8.887791478680484e-06, "loss": 0.4416, "mean_token_accuracy": 0.8544912338256836, "num_tokens": 919892915.0, "step": 27460 }, { "epoch": 1.639505730659026, "grad_norm": 0.555050790309906, "learning_rate": 8.881554279742056e-06, "loss": 0.5002, "mean_token_accuracy": 0.8354646444320679, "num_tokens": 920060595.0, "step": 27465 }, { "epoch": 1.6398042024832855, "grad_norm": 0.4681052267551422, "learning_rate": 8.87532161566974e-06, "loss": 0.4174, "mean_token_accuracy": 0.8597817063331604, "num_tokens": 920228275.0, "step": 27470 }, { "epoch": 1.6401026743075455, "grad_norm": 0.473096638917923, "learning_rate": 8.869093487981619e-06, "loss": 0.4194, "mean_token_accuracy": 0.8590063214302063, "num_tokens": 920395955.0, "step": 27475 }, { "epoch": 1.640401146131805, "grad_norm": 0.506682813167572, "learning_rate": 8.86286989819467e-06, "loss": 0.4315, "mean_token_accuracy": 0.855839192867279, "num_tokens": 920563635.0, "step": 27480 }, { "epoch": 1.640699617956065, "grad_norm": 0.5237351655960083, "learning_rate": 8.85665084782475e-06, "loss": 0.4367, "mean_token_accuracy": 0.8536741137504578, "num_tokens": 920731315.0, "step": 27485 }, { "epoch": 1.6409980897803247, "grad_norm": 0.473579466342926, "learning_rate": 8.850436338386641e-06, "loss": 0.4115, "mean_token_accuracy": 0.8609507441520691, "num_tokens": 920898995.0, "step": 27490 }, { "epoch": 1.6412965616045845, "grad_norm": 0.4849102795124054, "learning_rate": 8.844226371393988e-06, "loss": 0.386, "mean_token_accuracy": 0.8686150550842285, "num_tokens": 921066675.0, "step": 27495 }, { "epoch": 1.6415950334288443, "grad_norm": 0.5197857022285461, "learning_rate": 8.838020948359352e-06, "loss": 0.4492, "mean_token_accuracy": 0.8496838927268981, "num_tokens": 921234355.0, "step": 27500 }, { "epoch": 1.641893505253104, "grad_norm": 0.5411177277565002, "learning_rate": 8.831820070794169e-06, "loss": 0.3957, "mean_token_accuracy": 0.8665096044540406, "num_tokens": 921402035.0, "step": 27505 }, { "epoch": 1.6421919770773639, "grad_norm": 0.5621386170387268, "learning_rate": 8.825623740208788e-06, "loss": 0.4498, "mean_token_accuracy": 0.8501968264579773, "num_tokens": 921569715.0, "step": 27510 }, { "epoch": 1.6424904489016237, "grad_norm": 0.493028461933136, "learning_rate": 8.81943195811244e-06, "loss": 0.4198, "mean_token_accuracy": 0.858570909500122, "num_tokens": 921737395.0, "step": 27515 }, { "epoch": 1.6427889207258835, "grad_norm": 0.4988255500793457, "learning_rate": 8.813244726013233e-06, "loss": 0.409, "mean_token_accuracy": 0.861916983127594, "num_tokens": 921905075.0, "step": 27520 }, { "epoch": 1.6430873925501432, "grad_norm": 0.5710336565971375, "learning_rate": 8.807062045418191e-06, "loss": 0.4685, "mean_token_accuracy": 0.8433794617652893, "num_tokens": 922072755.0, "step": 27525 }, { "epoch": 1.643385864374403, "grad_norm": 0.4894435405731201, "learning_rate": 8.800883917833222e-06, "loss": 0.4221, "mean_token_accuracy": 0.8586842298507691, "num_tokens": 922240435.0, "step": 27530 }, { "epoch": 1.6436843361986628, "grad_norm": 0.5521994829177856, "learning_rate": 8.794710344763121e-06, "loss": 0.4585, "mean_token_accuracy": 0.8486639618873596, "num_tokens": 922408115.0, "step": 27535 }, { "epoch": 1.6439828080229226, "grad_norm": 0.49032309651374817, "learning_rate": 8.788541327711583e-06, "loss": 0.4351, "mean_token_accuracy": 0.8541930079460144, "num_tokens": 922575795.0, "step": 27540 }, { "epoch": 1.6442812798471824, "grad_norm": 0.545734167098999, "learning_rate": 8.78237686818117e-06, "loss": 0.4354, "mean_token_accuracy": 0.8546164751052856, "num_tokens": 922743475.0, "step": 27545 }, { "epoch": 1.6445797516714422, "grad_norm": 0.498403936624527, "learning_rate": 8.776216967673356e-06, "loss": 0.4252, "mean_token_accuracy": 0.858558988571167, "num_tokens": 922911155.0, "step": 27550 }, { "epoch": 1.644878223495702, "grad_norm": 0.4923866391181946, "learning_rate": 8.770061627688508e-06, "loss": 0.4668, "mean_token_accuracy": 0.8452105402946473, "num_tokens": 923078835.0, "step": 27555 }, { "epoch": 1.6451766953199618, "grad_norm": 0.4732860028743744, "learning_rate": 8.763910849725865e-06, "loss": 0.427, "mean_token_accuracy": 0.8553560853004456, "num_tokens": 923246515.0, "step": 27560 }, { "epoch": 1.6454751671442216, "grad_norm": 0.5003758668899536, "learning_rate": 8.757764635283558e-06, "loss": 0.4522, "mean_token_accuracy": 0.8487236022949218, "num_tokens": 923414195.0, "step": 27565 }, { "epoch": 1.6457736389684814, "grad_norm": 0.4619987905025482, "learning_rate": 8.75162298585862e-06, "loss": 0.3876, "mean_token_accuracy": 0.8686269879341125, "num_tokens": 923581875.0, "step": 27570 }, { "epoch": 1.6460721107927412, "grad_norm": 0.5236973762512207, "learning_rate": 8.745485902946961e-06, "loss": 0.4234, "mean_token_accuracy": 0.8577179908752441, "num_tokens": 923749555.0, "step": 27575 }, { "epoch": 1.646370582617001, "grad_norm": 0.46308356523513794, "learning_rate": 8.739353388043375e-06, "loss": 0.4631, "mean_token_accuracy": 0.8457115650177002, "num_tokens": 923917235.0, "step": 27580 }, { "epoch": 1.6466690544412608, "grad_norm": 0.4999522268772125, "learning_rate": 8.733225442641554e-06, "loss": 0.3951, "mean_token_accuracy": 0.8672670841217041, "num_tokens": 924084915.0, "step": 27585 }, { "epoch": 1.6469675262655206, "grad_norm": 0.5205977559089661, "learning_rate": 8.727102068234073e-06, "loss": 0.4175, "mean_token_accuracy": 0.8607002258300781, "num_tokens": 924252595.0, "step": 27590 }, { "epoch": 1.6472659980897804, "grad_norm": 0.501131534576416, "learning_rate": 8.720983266312387e-06, "loss": 0.4438, "mean_token_accuracy": 0.8526839971542358, "num_tokens": 924420275.0, "step": 27595 }, { "epoch": 1.6475644699140402, "grad_norm": 0.5764347910881042, "learning_rate": 8.714869038366857e-06, "loss": 0.4745, "mean_token_accuracy": 0.8423476219177246, "num_tokens": 924587955.0, "step": 27600 }, { "epoch": 1.6478629417383, "grad_norm": 0.4761529266834259, "learning_rate": 8.708759385886698e-06, "loss": 0.4294, "mean_token_accuracy": 0.8558034062385559, "num_tokens": 924755635.0, "step": 27605 }, { "epoch": 1.6481614135625597, "grad_norm": 0.48771607875823975, "learning_rate": 8.702654310360034e-06, "loss": 0.4554, "mean_token_accuracy": 0.8486460685729981, "num_tokens": 924923315.0, "step": 27610 }, { "epoch": 1.6484598853868195, "grad_norm": 0.558755099773407, "learning_rate": 8.69655381327388e-06, "loss": 0.4507, "mean_token_accuracy": 0.8506441593170166, "num_tokens": 925090995.0, "step": 27615 }, { "epoch": 1.6487583572110793, "grad_norm": 0.48419713973999023, "learning_rate": 8.690457896114115e-06, "loss": 0.4397, "mean_token_accuracy": 0.852982223033905, "num_tokens": 925258675.0, "step": 27620 }, { "epoch": 1.649056829035339, "grad_norm": 0.5028923153877258, "learning_rate": 8.684366560365515e-06, "loss": 0.4479, "mean_token_accuracy": 0.8506203174591065, "num_tokens": 925426355.0, "step": 27625 }, { "epoch": 1.649355300859599, "grad_norm": 0.5347084999084473, "learning_rate": 8.678279807511729e-06, "loss": 0.3942, "mean_token_accuracy": 0.8680066943168641, "num_tokens": 925594035.0, "step": 27630 }, { "epoch": 1.6496537726838585, "grad_norm": 0.5266854763031006, "learning_rate": 8.67219763903531e-06, "loss": 0.4157, "mean_token_accuracy": 0.8587916135787964, "num_tokens": 925761715.0, "step": 27635 }, { "epoch": 1.6499522445081185, "grad_norm": 0.4946543276309967, "learning_rate": 8.666120056417681e-06, "loss": 0.4575, "mean_token_accuracy": 0.8478885889053345, "num_tokens": 925929395.0, "step": 27640 }, { "epoch": 1.650250716332378, "grad_norm": 0.46847429871559143, "learning_rate": 8.66004706113914e-06, "loss": 0.424, "mean_token_accuracy": 0.8566444039344787, "num_tokens": 926097075.0, "step": 27645 }, { "epoch": 1.650549188156638, "grad_norm": 0.506936252117157, "learning_rate": 8.653978654678885e-06, "loss": 0.4663, "mean_token_accuracy": 0.8446975946426392, "num_tokens": 926264755.0, "step": 27650 }, { "epoch": 1.6508476599808977, "grad_norm": 0.49346089363098145, "learning_rate": 8.647914838514985e-06, "loss": 0.4361, "mean_token_accuracy": 0.8550280451774597, "num_tokens": 926432435.0, "step": 27655 }, { "epoch": 1.6511461318051577, "grad_norm": 0.4977369010448456, "learning_rate": 8.641855614124397e-06, "loss": 0.5022, "mean_token_accuracy": 0.8350590467453003, "num_tokens": 926600115.0, "step": 27660 }, { "epoch": 1.6514446036294173, "grad_norm": 0.48802804946899414, "learning_rate": 8.635800982982958e-06, "loss": 0.4294, "mean_token_accuracy": 0.8562388181686401, "num_tokens": 926767795.0, "step": 27665 }, { "epoch": 1.6517430754536773, "grad_norm": 0.48658645153045654, "learning_rate": 8.62975094656538e-06, "loss": 0.444, "mean_token_accuracy": 0.8521412372589111, "num_tokens": 926935475.0, "step": 27670 }, { "epoch": 1.6520415472779368, "grad_norm": 0.5168094635009766, "learning_rate": 8.623705506345265e-06, "loss": 0.3963, "mean_token_accuracy": 0.8662173509597778, "num_tokens": 927103155.0, "step": 27675 }, { "epoch": 1.6523400191021969, "grad_norm": 0.502001941204071, "learning_rate": 8.61766466379509e-06, "loss": 0.3981, "mean_token_accuracy": 0.8663068056106568, "num_tokens": 927270835.0, "step": 27680 }, { "epoch": 1.6526384909264564, "grad_norm": 0.47451192140579224, "learning_rate": 8.611628420386217e-06, "loss": 0.4107, "mean_token_accuracy": 0.8618871569633484, "num_tokens": 927438515.0, "step": 27685 }, { "epoch": 1.6529369627507164, "grad_norm": 0.4913615584373474, "learning_rate": 8.60559677758887e-06, "loss": 0.4416, "mean_token_accuracy": 0.8519324779510498, "num_tokens": 927606195.0, "step": 27690 }, { "epoch": 1.653235434574976, "grad_norm": 0.5603580474853516, "learning_rate": 8.599569736872187e-06, "loss": 0.4343, "mean_token_accuracy": 0.8542526602745056, "num_tokens": 927773875.0, "step": 27695 }, { "epoch": 1.653533906399236, "grad_norm": 0.47387024760246277, "learning_rate": 8.593547299704164e-06, "loss": 0.405, "mean_token_accuracy": 0.8633007287979126, "num_tokens": 927941555.0, "step": 27700 }, { "epoch": 1.6538323782234956, "grad_norm": 0.45370379090309143, "learning_rate": 8.587529467551658e-06, "loss": 0.4143, "mean_token_accuracy": 0.859948706626892, "num_tokens": 928109235.0, "step": 27705 }, { "epoch": 1.6541308500477556, "grad_norm": 0.5351717472076416, "learning_rate": 8.581516241880439e-06, "loss": 0.4458, "mean_token_accuracy": 0.8508529186248779, "num_tokens": 928276915.0, "step": 27710 }, { "epoch": 1.6544293218720152, "grad_norm": 0.49247390031814575, "learning_rate": 8.57550762415513e-06, "loss": 0.4148, "mean_token_accuracy": 0.8607300400733948, "num_tokens": 928444595.0, "step": 27715 }, { "epoch": 1.6547277936962752, "grad_norm": 0.5350109338760376, "learning_rate": 8.56950361583925e-06, "loss": 0.4478, "mean_token_accuracy": 0.8499940276145935, "num_tokens": 928612275.0, "step": 27720 }, { "epoch": 1.6550262655205348, "grad_norm": 0.5484545230865479, "learning_rate": 8.563504218395182e-06, "loss": 0.4396, "mean_token_accuracy": 0.8516998529434204, "num_tokens": 928779955.0, "step": 27725 }, { "epoch": 1.6553247373447948, "grad_norm": 0.5197243690490723, "learning_rate": 8.557509433284188e-06, "loss": 0.4114, "mean_token_accuracy": 0.8616783857345581, "num_tokens": 928947635.0, "step": 27730 }, { "epoch": 1.6556232091690544, "grad_norm": 0.5030907988548279, "learning_rate": 8.551519261966408e-06, "loss": 0.4433, "mean_token_accuracy": 0.8522963166236878, "num_tokens": 929115315.0, "step": 27735 }, { "epoch": 1.6559216809933144, "grad_norm": 0.5298928618431091, "learning_rate": 8.545533705900863e-06, "loss": 0.4654, "mean_token_accuracy": 0.8443397402763366, "num_tokens": 929282995.0, "step": 27740 }, { "epoch": 1.656220152817574, "grad_norm": 0.5053024291992188, "learning_rate": 8.53955276654544e-06, "loss": 0.445, "mean_token_accuracy": 0.8511749982833863, "num_tokens": 929450675.0, "step": 27745 }, { "epoch": 1.656518624641834, "grad_norm": 0.4837656319141388, "learning_rate": 8.533576445356917e-06, "loss": 0.4117, "mean_token_accuracy": 0.8607598781585694, "num_tokens": 929618355.0, "step": 27750 }, { "epoch": 1.6568170964660935, "grad_norm": 0.5115906000137329, "learning_rate": 8.527604743790919e-06, "loss": 0.4672, "mean_token_accuracy": 0.8451926589012146, "num_tokens": 929786035.0, "step": 27755 }, { "epoch": 1.6571155682903533, "grad_norm": 0.6006373763084412, "learning_rate": 8.52163766330199e-06, "loss": 0.4402, "mean_token_accuracy": 0.8521770358085632, "num_tokens": 929953715.0, "step": 27760 }, { "epoch": 1.6574140401146131, "grad_norm": 0.4715985357761383, "learning_rate": 8.515675205343501e-06, "loss": 0.4159, "mean_token_accuracy": 0.8614815711975098, "num_tokens": 930121395.0, "step": 27765 }, { "epoch": 1.657712511938873, "grad_norm": 0.5042600035667419, "learning_rate": 8.509717371367732e-06, "loss": 0.4435, "mean_token_accuracy": 0.8507097721099853, "num_tokens": 930289075.0, "step": 27770 }, { "epoch": 1.6580109837631327, "grad_norm": 0.4789464771747589, "learning_rate": 8.503764162825805e-06, "loss": 0.4566, "mean_token_accuracy": 0.847518789768219, "num_tokens": 930456755.0, "step": 27775 }, { "epoch": 1.6583094555873925, "grad_norm": 0.4957612454891205, "learning_rate": 8.497815581167755e-06, "loss": 0.4177, "mean_token_accuracy": 0.8605153203010559, "num_tokens": 930624435.0, "step": 27780 }, { "epoch": 1.6586079274116523, "grad_norm": 0.5574946999549866, "learning_rate": 8.491871627842465e-06, "loss": 0.4383, "mean_token_accuracy": 0.8546403408050537, "num_tokens": 930792115.0, "step": 27785 }, { "epoch": 1.658906399235912, "grad_norm": 0.49140918254852295, "learning_rate": 8.485932304297683e-06, "loss": 0.4454, "mean_token_accuracy": 0.8529762506484986, "num_tokens": 930959795.0, "step": 27790 }, { "epoch": 1.659204871060172, "grad_norm": 0.5171493887901306, "learning_rate": 8.479997611980048e-06, "loss": 0.4541, "mean_token_accuracy": 0.8486520290374756, "num_tokens": 931127475.0, "step": 27795 }, { "epoch": 1.6595033428844317, "grad_norm": 0.49158546328544617, "learning_rate": 8.474067552335062e-06, "loss": 0.3906, "mean_token_accuracy": 0.8663068175315857, "num_tokens": 931295155.0, "step": 27800 }, { "epoch": 1.6598018147086915, "grad_norm": 0.48990312218666077, "learning_rate": 8.468142126807105e-06, "loss": 0.4321, "mean_token_accuracy": 0.8553020596504212, "num_tokens": 931449666.0, "step": 27805 }, { "epoch": 1.6601002865329513, "grad_norm": 0.5473856329917908, "learning_rate": 8.462221336839427e-06, "loss": 0.4447, "mean_token_accuracy": 0.8515865325927734, "num_tokens": 931617346.0, "step": 27810 }, { "epoch": 1.660398758357211, "grad_norm": 0.4636194407939911, "learning_rate": 8.456305183874128e-06, "loss": 0.4393, "mean_token_accuracy": 0.8540737271308899, "num_tokens": 931785026.0, "step": 27815 }, { "epoch": 1.6606972301814709, "grad_norm": 0.5502971410751343, "learning_rate": 8.450393669352215e-06, "loss": 0.4251, "mean_token_accuracy": 0.8557437658309937, "num_tokens": 931952706.0, "step": 27820 }, { "epoch": 1.6609957020057307, "grad_norm": 0.4992155432701111, "learning_rate": 8.444486794713538e-06, "loss": 0.4104, "mean_token_accuracy": 0.8617140412330627, "num_tokens": 932119688.0, "step": 27825 }, { "epoch": 1.6612941738299905, "grad_norm": 0.5216702818870544, "learning_rate": 8.438584561396826e-06, "loss": 0.4672, "mean_token_accuracy": 0.8459441781044006, "num_tokens": 932287368.0, "step": 27830 }, { "epoch": 1.6615926456542502, "grad_norm": 0.5434436798095703, "learning_rate": 8.432686970839674e-06, "loss": 0.4404, "mean_token_accuracy": 0.8518191695213317, "num_tokens": 932455048.0, "step": 27835 }, { "epoch": 1.66189111747851, "grad_norm": 0.47585082054138184, "learning_rate": 8.426794024478555e-06, "loss": 0.4079, "mean_token_accuracy": 0.8623523831367492, "num_tokens": 932622728.0, "step": 27840 }, { "epoch": 1.6621895893027698, "grad_norm": 0.456572026014328, "learning_rate": 8.420905723748809e-06, "loss": 0.421, "mean_token_accuracy": 0.8603841185569763, "num_tokens": 932790408.0, "step": 27845 }, { "epoch": 1.6624880611270296, "grad_norm": 0.48674291372299194, "learning_rate": 8.415022070084626e-06, "loss": 0.4261, "mean_token_accuracy": 0.8582906007766724, "num_tokens": 932958088.0, "step": 27850 }, { "epoch": 1.6627865329512894, "grad_norm": 0.4801638722419739, "learning_rate": 8.409143064919088e-06, "loss": 0.4466, "mean_token_accuracy": 0.8507037878036499, "num_tokens": 933125768.0, "step": 27855 }, { "epoch": 1.6630850047755492, "grad_norm": 0.48785632848739624, "learning_rate": 8.403268709684135e-06, "loss": 0.4633, "mean_token_accuracy": 0.8466360449790955, "num_tokens": 933293448.0, "step": 27860 }, { "epoch": 1.663383476599809, "grad_norm": 0.4722157418727875, "learning_rate": 8.397399005810569e-06, "loss": 0.4201, "mean_token_accuracy": 0.8584218144416809, "num_tokens": 933461128.0, "step": 27865 }, { "epoch": 1.6636819484240688, "grad_norm": 0.4849564731121063, "learning_rate": 8.391533954728075e-06, "loss": 0.4372, "mean_token_accuracy": 0.85422283411026, "num_tokens": 933628808.0, "step": 27870 }, { "epoch": 1.6639804202483286, "grad_norm": 0.4901125133037567, "learning_rate": 8.385673557865184e-06, "loss": 0.4394, "mean_token_accuracy": 0.8514575719833374, "num_tokens": 933790061.0, "step": 27875 }, { "epoch": 1.6642788920725884, "grad_norm": 0.4593963623046875, "learning_rate": 8.379817816649308e-06, "loss": 0.4223, "mean_token_accuracy": 0.8571036577224731, "num_tokens": 933957741.0, "step": 27880 }, { "epoch": 1.6645773638968482, "grad_norm": 0.48224207758903503, "learning_rate": 8.373966732506722e-06, "loss": 0.467, "mean_token_accuracy": 0.8445186734199523, "num_tokens": 934125421.0, "step": 27885 }, { "epoch": 1.664875835721108, "grad_norm": 0.4888492524623871, "learning_rate": 8.368120306862563e-06, "loss": 0.4487, "mean_token_accuracy": 0.8504950404167175, "num_tokens": 934293101.0, "step": 27890 }, { "epoch": 1.6651743075453678, "grad_norm": 0.5116983652114868, "learning_rate": 8.362278541140845e-06, "loss": 0.3966, "mean_token_accuracy": 0.8655374050140381, "num_tokens": 934460781.0, "step": 27895 }, { "epoch": 1.6654727793696273, "grad_norm": 0.5528643727302551, "learning_rate": 8.356441436764415e-06, "loss": 0.4638, "mean_token_accuracy": 0.8443516492843628, "num_tokens": 934628461.0, "step": 27900 }, { "epoch": 1.6657712511938874, "grad_norm": 0.5356369614601135, "learning_rate": 8.350608995155035e-06, "loss": 0.4265, "mean_token_accuracy": 0.8570857644081116, "num_tokens": 934796141.0, "step": 27905 }, { "epoch": 1.666069723018147, "grad_norm": 0.472017377614975, "learning_rate": 8.344781217733285e-06, "loss": 0.4183, "mean_token_accuracy": 0.8590242266654968, "num_tokens": 934963821.0, "step": 27910 }, { "epoch": 1.666368194842407, "grad_norm": 0.5102182626724243, "learning_rate": 8.338958105918639e-06, "loss": 0.4345, "mean_token_accuracy": 0.8555946588516236, "num_tokens": 935131501.0, "step": 27915 }, { "epoch": 1.6666666666666665, "grad_norm": 0.4919207692146301, "learning_rate": 8.333139661129415e-06, "loss": 0.4509, "mean_token_accuracy": 0.8502326130867004, "num_tokens": 935299181.0, "step": 27920 }, { "epoch": 1.6669651384909265, "grad_norm": 0.6054666042327881, "learning_rate": 8.327325884782804e-06, "loss": 0.4499, "mean_token_accuracy": 0.8494810819625854, "num_tokens": 935466861.0, "step": 27925 }, { "epoch": 1.667263610315186, "grad_norm": 0.5438222885131836, "learning_rate": 8.321516778294872e-06, "loss": 0.4534, "mean_token_accuracy": 0.8481868028640747, "num_tokens": 935634541.0, "step": 27930 }, { "epoch": 1.6675620821394461, "grad_norm": 0.5048179626464844, "learning_rate": 8.315712343080512e-06, "loss": 0.4254, "mean_token_accuracy": 0.85670405626297, "num_tokens": 935802221.0, "step": 27935 }, { "epoch": 1.6678605539637057, "grad_norm": 0.540630578994751, "learning_rate": 8.309912580553512e-06, "loss": 0.4135, "mean_token_accuracy": 0.8610700249671936, "num_tokens": 935969901.0, "step": 27940 }, { "epoch": 1.6681590257879657, "grad_norm": 0.44495153427124023, "learning_rate": 8.304117492126519e-06, "loss": 0.4127, "mean_token_accuracy": 0.8618275046348571, "num_tokens": 936137581.0, "step": 27945 }, { "epoch": 1.6684574976122253, "grad_norm": 0.5471038222312927, "learning_rate": 8.298327079211022e-06, "loss": 0.4465, "mean_token_accuracy": 0.8510318517684936, "num_tokens": 936305261.0, "step": 27950 }, { "epoch": 1.6687559694364853, "grad_norm": 0.562255859375, "learning_rate": 8.29254134321739e-06, "loss": 0.4418, "mean_token_accuracy": 0.85333411693573, "num_tokens": 936472941.0, "step": 27955 }, { "epoch": 1.6690544412607449, "grad_norm": 0.5070235133171082, "learning_rate": 8.286760285554836e-06, "loss": 0.4117, "mean_token_accuracy": 0.8602230787277222, "num_tokens": 936640621.0, "step": 27960 }, { "epoch": 1.6693529130850049, "grad_norm": 0.47316086292266846, "learning_rate": 8.280983907631452e-06, "loss": 0.4227, "mean_token_accuracy": 0.8590838670730591, "num_tokens": 936808301.0, "step": 27965 }, { "epoch": 1.6696513849092645, "grad_norm": 0.5139144062995911, "learning_rate": 8.275212210854188e-06, "loss": 0.4366, "mean_token_accuracy": 0.8539424896240234, "num_tokens": 936975981.0, "step": 27970 }, { "epoch": 1.6699498567335245, "grad_norm": 0.524645984172821, "learning_rate": 8.269445196628831e-06, "loss": 0.4456, "mean_token_accuracy": 0.8517237186431885, "num_tokens": 937143661.0, "step": 27975 }, { "epoch": 1.670248328557784, "grad_norm": 0.5532966256141663, "learning_rate": 8.263682866360051e-06, "loss": 0.424, "mean_token_accuracy": 0.8589824676513672, "num_tokens": 937311341.0, "step": 27980 }, { "epoch": 1.670546800382044, "grad_norm": 0.4807977080345154, "learning_rate": 8.257925221451373e-06, "loss": 0.4266, "mean_token_accuracy": 0.8570380449295044, "num_tokens": 937479021.0, "step": 27985 }, { "epoch": 1.6708452722063036, "grad_norm": 0.547822117805481, "learning_rate": 8.252172263305183e-06, "loss": 0.4259, "mean_token_accuracy": 0.8576702833175659, "num_tokens": 937646701.0, "step": 27990 }, { "epoch": 1.6711437440305636, "grad_norm": 0.5031469464302063, "learning_rate": 8.246423993322706e-06, "loss": 0.4524, "mean_token_accuracy": 0.8491291999816895, "num_tokens": 937814381.0, "step": 27995 }, { "epoch": 1.6714422158548232, "grad_norm": 0.5416647791862488, "learning_rate": 8.240680412904041e-06, "loss": 0.4221, "mean_token_accuracy": 0.858183228969574, "num_tokens": 937982061.0, "step": 28000 }, { "epoch": 1.6717406876790832, "grad_norm": 0.483253538608551, "learning_rate": 8.23494152344815e-06, "loss": 0.4238, "mean_token_accuracy": 0.8573720693588257, "num_tokens": 938149741.0, "step": 28005 }, { "epoch": 1.6720391595033428, "grad_norm": 0.5158764123916626, "learning_rate": 8.229207326352845e-06, "loss": 0.4256, "mean_token_accuracy": 0.8569068312644958, "num_tokens": 938317421.0, "step": 28010 }, { "epoch": 1.6723376313276028, "grad_norm": 0.5189338326454163, "learning_rate": 8.223477823014797e-06, "loss": 0.4145, "mean_token_accuracy": 0.8611952662467957, "num_tokens": 938485101.0, "step": 28015 }, { "epoch": 1.6726361031518624, "grad_norm": 0.4930899143218994, "learning_rate": 8.217753014829527e-06, "loss": 0.4107, "mean_token_accuracy": 0.8626685023307801, "num_tokens": 938652781.0, "step": 28020 }, { "epoch": 1.6729345749761224, "grad_norm": 0.4693681001663208, "learning_rate": 8.21203290319141e-06, "loss": 0.4444, "mean_token_accuracy": 0.8507097721099853, "num_tokens": 938820461.0, "step": 28025 }, { "epoch": 1.673233046800382, "grad_norm": 0.5533576607704163, "learning_rate": 8.20631748949371e-06, "loss": 0.442, "mean_token_accuracy": 0.8511093974113464, "num_tokens": 938988141.0, "step": 28030 }, { "epoch": 1.6735315186246418, "grad_norm": 0.4884834885597229, "learning_rate": 8.200606775128497e-06, "loss": 0.4369, "mean_token_accuracy": 0.8536621689796448, "num_tokens": 939155821.0, "step": 28035 }, { "epoch": 1.6738299904489016, "grad_norm": 0.46458420157432556, "learning_rate": 8.194900761486731e-06, "loss": 0.4325, "mean_token_accuracy": 0.8541333794593811, "num_tokens": 939323501.0, "step": 28040 }, { "epoch": 1.6741284622731614, "grad_norm": 0.5253254175186157, "learning_rate": 8.189199449958215e-06, "loss": 0.4428, "mean_token_accuracy": 0.8522843837738037, "num_tokens": 939491181.0, "step": 28045 }, { "epoch": 1.6744269340974212, "grad_norm": 0.5520123243331909, "learning_rate": 8.183502841931611e-06, "loss": 0.4209, "mean_token_accuracy": 0.857819390296936, "num_tokens": 939658861.0, "step": 28050 }, { "epoch": 1.674725405921681, "grad_norm": 0.45845434069633484, "learning_rate": 8.177810938794436e-06, "loss": 0.409, "mean_token_accuracy": 0.8619587302207947, "num_tokens": 939826541.0, "step": 28055 }, { "epoch": 1.6750238777459407, "grad_norm": 0.49971920251846313, "learning_rate": 8.172123741933047e-06, "loss": 0.399, "mean_token_accuracy": 0.8653644323348999, "num_tokens": 939994221.0, "step": 28060 }, { "epoch": 1.6753223495702005, "grad_norm": 0.5177424550056458, "learning_rate": 8.166441252732675e-06, "loss": 0.4149, "mean_token_accuracy": 0.859686267375946, "num_tokens": 940161901.0, "step": 28065 }, { "epoch": 1.6756208213944603, "grad_norm": 0.49712035059928894, "learning_rate": 8.16076347257739e-06, "loss": 0.4011, "mean_token_accuracy": 0.8641894340515137, "num_tokens": 940329581.0, "step": 28070 }, { "epoch": 1.6759192932187201, "grad_norm": 0.5571178197860718, "learning_rate": 8.155090402850124e-06, "loss": 0.4376, "mean_token_accuracy": 0.8536383032798767, "num_tokens": 940497261.0, "step": 28075 }, { "epoch": 1.67621776504298, "grad_norm": 0.5189345479011536, "learning_rate": 8.149422044932664e-06, "loss": 0.4335, "mean_token_accuracy": 0.854461395740509, "num_tokens": 940664941.0, "step": 28080 }, { "epoch": 1.6765162368672397, "grad_norm": 0.44967299699783325, "learning_rate": 8.143758400205622e-06, "loss": 0.4055, "mean_token_accuracy": 0.8618453979492188, "num_tokens": 940832621.0, "step": 28085 }, { "epoch": 1.6768147086914995, "grad_norm": 0.5450178384780884, "learning_rate": 8.138099470048508e-06, "loss": 0.4617, "mean_token_accuracy": 0.8467613220214844, "num_tokens": 941000301.0, "step": 28090 }, { "epoch": 1.6771131805157593, "grad_norm": 0.5511917471885681, "learning_rate": 8.132445255839644e-06, "loss": 0.4214, "mean_token_accuracy": 0.8596087336540222, "num_tokens": 941167981.0, "step": 28095 }, { "epoch": 1.677411652340019, "grad_norm": 0.4799546003341675, "learning_rate": 8.126795758956225e-06, "loss": 0.4539, "mean_token_accuracy": 0.8484134674072266, "num_tokens": 941335661.0, "step": 28100 }, { "epoch": 1.677710124164279, "grad_norm": 0.4885207414627075, "learning_rate": 8.121150980774279e-06, "loss": 0.4232, "mean_token_accuracy": 0.8574794292449951, "num_tokens": 941503341.0, "step": 28105 }, { "epoch": 1.6780085959885387, "grad_norm": 0.4217756688594818, "learning_rate": 8.115510922668706e-06, "loss": 0.3685, "mean_token_accuracy": 0.8754443645477294, "num_tokens": 941671021.0, "step": 28110 }, { "epoch": 1.6783070678127985, "grad_norm": 0.5287328958511353, "learning_rate": 8.109875586013253e-06, "loss": 0.4501, "mean_token_accuracy": 0.8497495055198669, "num_tokens": 941838701.0, "step": 28115 }, { "epoch": 1.6786055396370583, "grad_norm": 0.522629976272583, "learning_rate": 8.104244972180496e-06, "loss": 0.4474, "mean_token_accuracy": 0.8518430233001709, "num_tokens": 942006381.0, "step": 28120 }, { "epoch": 1.678904011461318, "grad_norm": 0.6132490038871765, "learning_rate": 8.098619082541881e-06, "loss": 0.4715, "mean_token_accuracy": 0.8445246338844299, "num_tokens": 942174061.0, "step": 28125 }, { "epoch": 1.6792024832855779, "grad_norm": 0.4500133693218231, "learning_rate": 8.092997918467697e-06, "loss": 0.4139, "mean_token_accuracy": 0.8597578406333923, "num_tokens": 942341741.0, "step": 28130 }, { "epoch": 1.6795009551098377, "grad_norm": 0.41929033398628235, "learning_rate": 8.08738148132708e-06, "loss": 0.4064, "mean_token_accuracy": 0.8644816875457764, "num_tokens": 942509421.0, "step": 28135 }, { "epoch": 1.6797994269340975, "grad_norm": 0.5036787986755371, "learning_rate": 8.081769772488025e-06, "loss": 0.4311, "mean_token_accuracy": 0.8543898344039917, "num_tokens": 942677101.0, "step": 28140 }, { "epoch": 1.6800978987583572, "grad_norm": 0.4599270820617676, "learning_rate": 8.076162793317357e-06, "loss": 0.4101, "mean_token_accuracy": 0.8629428505897522, "num_tokens": 942844781.0, "step": 28145 }, { "epoch": 1.680396370582617, "grad_norm": 0.48597437143325806, "learning_rate": 8.070560545180763e-06, "loss": 0.4391, "mean_token_accuracy": 0.8520935416221619, "num_tokens": 943012461.0, "step": 28150 }, { "epoch": 1.6806948424068768, "grad_norm": 0.54108726978302, "learning_rate": 8.064963029442776e-06, "loss": 0.4854, "mean_token_accuracy": 0.842067277431488, "num_tokens": 943180141.0, "step": 28155 }, { "epoch": 1.6809933142311366, "grad_norm": 0.4786975681781769, "learning_rate": 8.059370247466773e-06, "loss": 0.4354, "mean_token_accuracy": 0.8550280213356019, "num_tokens": 943347821.0, "step": 28160 }, { "epoch": 1.6812917860553964, "grad_norm": 0.4980897903442383, "learning_rate": 8.053782200614978e-06, "loss": 0.4173, "mean_token_accuracy": 0.8611356258392334, "num_tokens": 943515501.0, "step": 28165 }, { "epoch": 1.6815902578796562, "grad_norm": 0.47392842173576355, "learning_rate": 8.048198890248463e-06, "loss": 0.4102, "mean_token_accuracy": 0.860640573501587, "num_tokens": 943683181.0, "step": 28170 }, { "epoch": 1.6818887297039158, "grad_norm": 0.49055197834968567, "learning_rate": 8.042620317727152e-06, "loss": 0.4206, "mean_token_accuracy": 0.8582130551338196, "num_tokens": 943850861.0, "step": 28175 }, { "epoch": 1.6821872015281758, "grad_norm": 0.5143933296203613, "learning_rate": 8.037046484409801e-06, "loss": 0.425, "mean_token_accuracy": 0.8564356565475464, "num_tokens": 944018541.0, "step": 28180 }, { "epoch": 1.6824856733524354, "grad_norm": 0.5638523697853088, "learning_rate": 8.031477391654025e-06, "loss": 0.4715, "mean_token_accuracy": 0.8440176606178283, "num_tokens": 944186221.0, "step": 28185 }, { "epoch": 1.6827841451766954, "grad_norm": 0.6349928379058838, "learning_rate": 8.025913040816278e-06, "loss": 0.4433, "mean_token_accuracy": 0.8538411140441895, "num_tokens": 944353901.0, "step": 28190 }, { "epoch": 1.683082617000955, "grad_norm": 0.4978715181350708, "learning_rate": 8.020353433251859e-06, "loss": 0.3895, "mean_token_accuracy": 0.8682273626327515, "num_tokens": 944521581.0, "step": 28195 }, { "epoch": 1.683381088825215, "grad_norm": 0.5366897583007812, "learning_rate": 8.014798570314918e-06, "loss": 0.4763, "mean_token_accuracy": 0.8423953294754029, "num_tokens": 944689261.0, "step": 28200 }, { "epoch": 1.6836795606494745, "grad_norm": 0.5215397477149963, "learning_rate": 8.009248453358438e-06, "loss": 0.4289, "mean_token_accuracy": 0.855851125717163, "num_tokens": 944856941.0, "step": 28205 }, { "epoch": 1.6839780324737346, "grad_norm": 0.5060926675796509, "learning_rate": 8.003703083734254e-06, "loss": 0.4283, "mean_token_accuracy": 0.8561254739761353, "num_tokens": 945024621.0, "step": 28210 }, { "epoch": 1.6842765042979941, "grad_norm": 0.6309110522270203, "learning_rate": 7.998162462793046e-06, "loss": 0.4059, "mean_token_accuracy": 0.862018370628357, "num_tokens": 945192301.0, "step": 28215 }, { "epoch": 1.6845749761222542, "grad_norm": 0.46858254075050354, "learning_rate": 7.992626591884331e-06, "loss": 0.4039, "mean_token_accuracy": 0.8643146634101868, "num_tokens": 945359981.0, "step": 28220 }, { "epoch": 1.6848734479465137, "grad_norm": 0.5501070022583008, "learning_rate": 7.98709547235648e-06, "loss": 0.4729, "mean_token_accuracy": 0.8422581434249878, "num_tokens": 945527661.0, "step": 28225 }, { "epoch": 1.6851719197707737, "grad_norm": 0.4850921034812927, "learning_rate": 7.981569105556685e-06, "loss": 0.3941, "mean_token_accuracy": 0.8667780041694642, "num_tokens": 945695341.0, "step": 28230 }, { "epoch": 1.6854703915950333, "grad_norm": 0.4399625360965729, "learning_rate": 7.976047492831012e-06, "loss": 0.4256, "mean_token_accuracy": 0.8575629234313965, "num_tokens": 945863021.0, "step": 28235 }, { "epoch": 1.6857688634192933, "grad_norm": 0.5460418462753296, "learning_rate": 7.97053063552434e-06, "loss": 0.4751, "mean_token_accuracy": 0.843546462059021, "num_tokens": 946030701.0, "step": 28240 }, { "epoch": 1.686067335243553, "grad_norm": 0.48071375489234924, "learning_rate": 7.9650185349804e-06, "loss": 0.4543, "mean_token_accuracy": 0.8498270273208618, "num_tokens": 946198381.0, "step": 28245 }, { "epoch": 1.686365807067813, "grad_norm": 0.503944993019104, "learning_rate": 7.959511192541772e-06, "loss": 0.4395, "mean_token_accuracy": 0.8524394512176514, "num_tokens": 946366061.0, "step": 28250 }, { "epoch": 1.6866642788920725, "grad_norm": 0.4849456548690796, "learning_rate": 7.954008609549872e-06, "loss": 0.4476, "mean_token_accuracy": 0.8522247314453125, "num_tokens": 946533741.0, "step": 28255 }, { "epoch": 1.6869627507163325, "grad_norm": 0.5044650435447693, "learning_rate": 7.948510787344956e-06, "loss": 0.428, "mean_token_accuracy": 0.8562388062477112, "num_tokens": 946701421.0, "step": 28260 }, { "epoch": 1.687261222540592, "grad_norm": 0.47206827998161316, "learning_rate": 7.94301772726611e-06, "loss": 0.4515, "mean_token_accuracy": 0.8497972130775452, "num_tokens": 946869101.0, "step": 28265 }, { "epoch": 1.687559694364852, "grad_norm": 0.4965664744377136, "learning_rate": 7.937529430651282e-06, "loss": 0.4011, "mean_token_accuracy": 0.8655254721641541, "num_tokens": 947036781.0, "step": 28270 }, { "epoch": 1.6878581661891117, "grad_norm": 0.4931280016899109, "learning_rate": 7.932045898837238e-06, "loss": 0.4639, "mean_token_accuracy": 0.8454550862312317, "num_tokens": 947204461.0, "step": 28275 }, { "epoch": 1.6881566380133717, "grad_norm": 0.49421343207359314, "learning_rate": 7.9265671331596e-06, "loss": 0.4502, "mean_token_accuracy": 0.8504592537879944, "num_tokens": 947372141.0, "step": 28280 }, { "epoch": 1.6884551098376313, "grad_norm": 0.5166566371917725, "learning_rate": 7.921093134952827e-06, "loss": 0.409, "mean_token_accuracy": 0.8620002031326294, "num_tokens": 947532664.0, "step": 28285 }, { "epoch": 1.6887535816618913, "grad_norm": 0.48326945304870605, "learning_rate": 7.915623905550194e-06, "loss": 0.4539, "mean_token_accuracy": 0.8495586276054382, "num_tokens": 947700344.0, "step": 28290 }, { "epoch": 1.6890520534861508, "grad_norm": 0.5510462522506714, "learning_rate": 7.910159446283852e-06, "loss": 0.4258, "mean_token_accuracy": 0.8554395794868469, "num_tokens": 947868024.0, "step": 28295 }, { "epoch": 1.6893505253104109, "grad_norm": 0.4928211271762848, "learning_rate": 7.904699758484768e-06, "loss": 0.4386, "mean_token_accuracy": 0.8537516236305237, "num_tokens": 948035704.0, "step": 28300 }, { "epoch": 1.6896489971346704, "grad_norm": 0.4887271225452423, "learning_rate": 7.899244843482742e-06, "loss": 0.4257, "mean_token_accuracy": 0.8570857524871827, "num_tokens": 948203384.0, "step": 28305 }, { "epoch": 1.6899474689589304, "grad_norm": 0.5313078165054321, "learning_rate": 7.89379470260642e-06, "loss": 0.4325, "mean_token_accuracy": 0.8548013806343079, "num_tokens": 948371064.0, "step": 28310 }, { "epoch": 1.69024594078319, "grad_norm": 0.5151989459991455, "learning_rate": 7.888349337183289e-06, "loss": 0.4271, "mean_token_accuracy": 0.8573422312736512, "num_tokens": 948538744.0, "step": 28315 }, { "epoch": 1.6905444126074498, "grad_norm": 0.5010346174240112, "learning_rate": 7.882908748539672e-06, "loss": 0.4466, "mean_token_accuracy": 0.8504592776298523, "num_tokens": 948706424.0, "step": 28320 }, { "epoch": 1.6908428844317096, "grad_norm": 0.4591065049171448, "learning_rate": 7.877472938000713e-06, "loss": 0.4088, "mean_token_accuracy": 0.8633484244346619, "num_tokens": 948874104.0, "step": 28325 }, { "epoch": 1.6911413562559694, "grad_norm": 0.5350884795188904, "learning_rate": 7.872041906890412e-06, "loss": 0.4453, "mean_token_accuracy": 0.8519742369651795, "num_tokens": 949041784.0, "step": 28330 }, { "epoch": 1.6914398280802292, "grad_norm": 0.5192052721977234, "learning_rate": 7.866615656531593e-06, "loss": 0.4101, "mean_token_accuracy": 0.8620004773139953, "num_tokens": 949209464.0, "step": 28335 }, { "epoch": 1.691738299904489, "grad_norm": 0.4882083833217621, "learning_rate": 7.861194188245922e-06, "loss": 0.3833, "mean_token_accuracy": 0.8693725347518921, "num_tokens": 949377144.0, "step": 28340 }, { "epoch": 1.6920367717287488, "grad_norm": 0.5025365352630615, "learning_rate": 7.855777503353904e-06, "loss": 0.4753, "mean_token_accuracy": 0.8442443013191223, "num_tokens": 949544824.0, "step": 28345 }, { "epoch": 1.6923352435530086, "grad_norm": 0.5009467005729675, "learning_rate": 7.850365603174858e-06, "loss": 0.4136, "mean_token_accuracy": 0.8606942653656006, "num_tokens": 949712504.0, "step": 28350 }, { "epoch": 1.6926337153772684, "grad_norm": 0.520879328250885, "learning_rate": 7.84495848902696e-06, "loss": 0.4373, "mean_token_accuracy": 0.8529941439628601, "num_tokens": 949880184.0, "step": 28355 }, { "epoch": 1.6929321872015282, "grad_norm": 0.4755707383155823, "learning_rate": 7.839556162227218e-06, "loss": 0.4562, "mean_token_accuracy": 0.8494751214981079, "num_tokens": 950047864.0, "step": 28360 }, { "epoch": 1.693230659025788, "grad_norm": 0.5176428556442261, "learning_rate": 7.834158624091459e-06, "loss": 0.4074, "mean_token_accuracy": 0.8618990778923035, "num_tokens": 950215544.0, "step": 28365 }, { "epoch": 1.6935291308500477, "grad_norm": 0.4631257951259613, "learning_rate": 7.828765875934362e-06, "loss": 0.403, "mean_token_accuracy": 0.8632470369338989, "num_tokens": 950383224.0, "step": 28370 }, { "epoch": 1.6938276026743075, "grad_norm": 0.5002923011779785, "learning_rate": 7.823377919069416e-06, "loss": 0.3853, "mean_token_accuracy": 0.8705057859420776, "num_tokens": 950550904.0, "step": 28375 }, { "epoch": 1.6941260744985673, "grad_norm": 0.5075061321258545, "learning_rate": 7.81799475480897e-06, "loss": 0.4351, "mean_token_accuracy": 0.8541930079460144, "num_tokens": 950718584.0, "step": 28380 }, { "epoch": 1.6944245463228271, "grad_norm": 0.4791085422039032, "learning_rate": 7.812616384464197e-06, "loss": 0.44, "mean_token_accuracy": 0.8523440361022949, "num_tokens": 950886264.0, "step": 28385 }, { "epoch": 1.694723018147087, "grad_norm": 0.5743136405944824, "learning_rate": 7.807242809345084e-06, "loss": 0.4266, "mean_token_accuracy": 0.8563044190406799, "num_tokens": 951053944.0, "step": 28390 }, { "epoch": 1.6950214899713467, "grad_norm": 0.5197674632072449, "learning_rate": 7.801874030760472e-06, "loss": 0.4496, "mean_token_accuracy": 0.8507097840309144, "num_tokens": 951221624.0, "step": 28395 }, { "epoch": 1.6953199617956065, "grad_norm": 0.4603707790374756, "learning_rate": 7.796510050018025e-06, "loss": 0.447, "mean_token_accuracy": 0.8499463200569153, "num_tokens": 951389304.0, "step": 28400 }, { "epoch": 1.6956184336198663, "grad_norm": 0.47777071595191956, "learning_rate": 7.791150868424243e-06, "loss": 0.4135, "mean_token_accuracy": 0.8609805583953858, "num_tokens": 951556984.0, "step": 28405 }, { "epoch": 1.695916905444126, "grad_norm": 0.43646612763404846, "learning_rate": 7.785796487284453e-06, "loss": 0.4107, "mean_token_accuracy": 0.862155556678772, "num_tokens": 951724664.0, "step": 28410 }, { "epoch": 1.696215377268386, "grad_norm": 0.5291107296943665, "learning_rate": 7.780446907902802e-06, "loss": 0.432, "mean_token_accuracy": 0.85520099401474, "num_tokens": 951892344.0, "step": 28415 }, { "epoch": 1.6965138490926457, "grad_norm": 0.53330397605896, "learning_rate": 7.775102131582296e-06, "loss": 0.4486, "mean_token_accuracy": 0.8504354119300842, "num_tokens": 952060024.0, "step": 28420 }, { "epoch": 1.6968123209169055, "grad_norm": 0.5128780603408813, "learning_rate": 7.769762159624742e-06, "loss": 0.4181, "mean_token_accuracy": 0.8613086104393005, "num_tokens": 952227704.0, "step": 28425 }, { "epoch": 1.6971107927411653, "grad_norm": 0.5765726566314697, "learning_rate": 7.764426993330798e-06, "loss": 0.4509, "mean_token_accuracy": 0.8495944023132325, "num_tokens": 952395384.0, "step": 28430 }, { "epoch": 1.697409264565425, "grad_norm": 0.5101821422576904, "learning_rate": 7.759096633999927e-06, "loss": 0.409, "mean_token_accuracy": 0.8609089851379395, "num_tokens": 952563064.0, "step": 28435 }, { "epoch": 1.6977077363896849, "grad_norm": 0.6067535877227783, "learning_rate": 7.753771082930452e-06, "loss": 0.4613, "mean_token_accuracy": 0.8458964586257934, "num_tokens": 952730744.0, "step": 28440 }, { "epoch": 1.6980062082139447, "grad_norm": 0.4924018979072571, "learning_rate": 7.748450341419505e-06, "loss": 0.4066, "mean_token_accuracy": 0.8639150738716126, "num_tokens": 952898424.0, "step": 28445 }, { "epoch": 1.6983046800382042, "grad_norm": 0.5162074565887451, "learning_rate": 7.743134410763048e-06, "loss": 0.4175, "mean_token_accuracy": 0.859435772895813, "num_tokens": 953066104.0, "step": 28450 }, { "epoch": 1.6986031518624642, "grad_norm": 0.5504987835884094, "learning_rate": 7.737823292255873e-06, "loss": 0.44, "mean_token_accuracy": 0.8522247433662414, "num_tokens": 953233784.0, "step": 28455 }, { "epoch": 1.6989016236867238, "grad_norm": 0.48993080854415894, "learning_rate": 7.732516987191602e-06, "loss": 0.4297, "mean_token_accuracy": 0.8549683928489685, "num_tokens": 953401464.0, "step": 28460 }, { "epoch": 1.6992000955109838, "grad_norm": 0.5028998255729675, "learning_rate": 7.727215496862687e-06, "loss": 0.4329, "mean_token_accuracy": 0.8567756175994873, "num_tokens": 953569144.0, "step": 28465 }, { "epoch": 1.6994985673352434, "grad_norm": 0.5005002617835999, "learning_rate": 7.721918822560404e-06, "loss": 0.4003, "mean_token_accuracy": 0.8648267984390259, "num_tokens": 953732227.0, "step": 28470 }, { "epoch": 1.6997970391595034, "grad_norm": 0.49301546812057495, "learning_rate": 7.71662696557485e-06, "loss": 0.4218, "mean_token_accuracy": 0.8585291624069213, "num_tokens": 953899907.0, "step": 28475 }, { "epoch": 1.700095510983763, "grad_norm": 0.5593149065971375, "learning_rate": 7.711339927194958e-06, "loss": 0.4376, "mean_token_accuracy": 0.853083610534668, "num_tokens": 954067587.0, "step": 28480 }, { "epoch": 1.700393982808023, "grad_norm": 0.5250505805015564, "learning_rate": 7.706057708708486e-06, "loss": 0.4142, "mean_token_accuracy": 0.8604497194290162, "num_tokens": 954235267.0, "step": 28485 }, { "epoch": 1.7006924546322826, "grad_norm": 0.5128530859947205, "learning_rate": 7.700780311402008e-06, "loss": 0.4175, "mean_token_accuracy": 0.8612728118896484, "num_tokens": 954402947.0, "step": 28490 }, { "epoch": 1.7009909264565426, "grad_norm": 0.5120176672935486, "learning_rate": 7.695507736560945e-06, "loss": 0.4246, "mean_token_accuracy": 0.8568531632423401, "num_tokens": 954570627.0, "step": 28495 }, { "epoch": 1.7012893982808022, "grad_norm": 0.4883759617805481, "learning_rate": 7.690239985469511e-06, "loss": 0.4116, "mean_token_accuracy": 0.8602588534355163, "num_tokens": 954738307.0, "step": 28500 }, { "epoch": 1.7015878701050622, "grad_norm": 0.5529108643531799, "learning_rate": 7.684977059410783e-06, "loss": 0.4784, "mean_token_accuracy": 0.8417511582374573, "num_tokens": 954905987.0, "step": 28505 }, { "epoch": 1.7018863419293218, "grad_norm": 0.5632097125053406, "learning_rate": 7.679718959666635e-06, "loss": 0.4082, "mean_token_accuracy": 0.8618931174278259, "num_tokens": 955073667.0, "step": 28510 }, { "epoch": 1.7021848137535818, "grad_norm": 0.7264323234558105, "learning_rate": 7.674465687517773e-06, "loss": 0.4837, "mean_token_accuracy": 0.8404151201248169, "num_tokens": 955241347.0, "step": 28515 }, { "epoch": 1.7024832855778413, "grad_norm": 0.5806310772895813, "learning_rate": 7.669217244243728e-06, "loss": 0.4629, "mean_token_accuracy": 0.8464153647422791, "num_tokens": 955409027.0, "step": 28520 }, { "epoch": 1.7027817574021014, "grad_norm": 0.5168312191963196, "learning_rate": 7.66397363112286e-06, "loss": 0.4383, "mean_token_accuracy": 0.8536860466003418, "num_tokens": 955576707.0, "step": 28525 }, { "epoch": 1.703080229226361, "grad_norm": 0.5000343322753906, "learning_rate": 7.65873484943235e-06, "loss": 0.4408, "mean_token_accuracy": 0.8526422500610351, "num_tokens": 955744387.0, "step": 28530 }, { "epoch": 1.703378701050621, "grad_norm": 0.5200855731964111, "learning_rate": 7.653500900448189e-06, "loss": 0.4255, "mean_token_accuracy": 0.8565907120704651, "num_tokens": 955912067.0, "step": 28535 }, { "epoch": 1.7036771728748805, "grad_norm": 0.5474066734313965, "learning_rate": 7.648271785445209e-06, "loss": 0.4445, "mean_token_accuracy": 0.8520517587661743, "num_tokens": 956079747.0, "step": 28540 }, { "epoch": 1.7039756446991405, "grad_norm": 0.5423592329025269, "learning_rate": 7.643047505697055e-06, "loss": 0.4958, "mean_token_accuracy": 0.833472490310669, "num_tokens": 956247427.0, "step": 28545 }, { "epoch": 1.7042741165234, "grad_norm": 0.4779508113861084, "learning_rate": 7.6378280624762e-06, "loss": 0.4067, "mean_token_accuracy": 0.8627579569816589, "num_tokens": 956415107.0, "step": 28550 }, { "epoch": 1.7045725883476601, "grad_norm": 0.5067100524902344, "learning_rate": 7.63261345705394e-06, "loss": 0.4401, "mean_token_accuracy": 0.850626266002655, "num_tokens": 956582787.0, "step": 28555 }, { "epoch": 1.7048710601719197, "grad_norm": 0.5187618136405945, "learning_rate": 7.6274036907003705e-06, "loss": 0.4082, "mean_token_accuracy": 0.8630621552467346, "num_tokens": 956750467.0, "step": 28560 }, { "epoch": 1.7051695319961797, "grad_norm": 0.5184881687164307, "learning_rate": 7.622198764684449e-06, "loss": 0.4116, "mean_token_accuracy": 0.8616008639335633, "num_tokens": 956918147.0, "step": 28565 }, { "epoch": 1.7054680038204393, "grad_norm": 0.450073778629303, "learning_rate": 7.616998680273916e-06, "loss": 0.402, "mean_token_accuracy": 0.8652153253555298, "num_tokens": 957085827.0, "step": 28570 }, { "epoch": 1.7057664756446993, "grad_norm": 0.48244667053222656, "learning_rate": 7.611803438735351e-06, "loss": 0.399, "mean_token_accuracy": 0.865000593662262, "num_tokens": 957253507.0, "step": 28575 }, { "epoch": 1.7060649474689589, "grad_norm": 0.46460282802581787, "learning_rate": 7.606613041334154e-06, "loss": 0.4208, "mean_token_accuracy": 0.8585828542709351, "num_tokens": 957421187.0, "step": 28580 }, { "epoch": 1.7063634192932189, "grad_norm": 0.5835782885551453, "learning_rate": 7.601427489334541e-06, "loss": 0.4493, "mean_token_accuracy": 0.8491709470748902, "num_tokens": 957588867.0, "step": 28585 }, { "epoch": 1.7066618911174785, "grad_norm": 0.48928967118263245, "learning_rate": 7.5962467839995524e-06, "loss": 0.4152, "mean_token_accuracy": 0.8602827072143555, "num_tokens": 957756547.0, "step": 28590 }, { "epoch": 1.7069603629417383, "grad_norm": 0.5073524713516235, "learning_rate": 7.591070926591036e-06, "loss": 0.4228, "mean_token_accuracy": 0.8580221891403198, "num_tokens": 957924227.0, "step": 28595 }, { "epoch": 1.707258834765998, "grad_norm": 0.504292368888855, "learning_rate": 7.585899918369673e-06, "loss": 0.439, "mean_token_accuracy": 0.8524633169174194, "num_tokens": 958091907.0, "step": 28600 }, { "epoch": 1.7075573065902578, "grad_norm": 0.5035752058029175, "learning_rate": 7.580733760594956e-06, "loss": 0.4266, "mean_token_accuracy": 0.8580639362335205, "num_tokens": 958259587.0, "step": 28605 }, { "epoch": 1.7078557784145176, "grad_norm": 0.5165441036224365, "learning_rate": 7.575572454525198e-06, "loss": 0.4323, "mean_token_accuracy": 0.854485273361206, "num_tokens": 958427267.0, "step": 28610 }, { "epoch": 1.7081542502387774, "grad_norm": 0.5164929032325745, "learning_rate": 7.570416001417537e-06, "loss": 0.4346, "mean_token_accuracy": 0.852117371559143, "num_tokens": 958594947.0, "step": 28615 }, { "epoch": 1.7084527220630372, "grad_norm": 0.5457221269607544, "learning_rate": 7.565264402527912e-06, "loss": 0.458, "mean_token_accuracy": 0.8475068569183349, "num_tokens": 958762627.0, "step": 28620 }, { "epoch": 1.708751193887297, "grad_norm": 0.6608583331108093, "learning_rate": 7.560117659111092e-06, "loss": 0.4455, "mean_token_accuracy": 0.851389718055725, "num_tokens": 958930307.0, "step": 28625 }, { "epoch": 1.7090496657115568, "grad_norm": 0.4773944318294525, "learning_rate": 7.5549757724206724e-06, "loss": 0.4187, "mean_token_accuracy": 0.8601634263992309, "num_tokens": 959097987.0, "step": 28630 }, { "epoch": 1.7093481375358166, "grad_norm": 0.5381609797477722, "learning_rate": 7.5498387437090425e-06, "loss": 0.4349, "mean_token_accuracy": 0.8539424896240234, "num_tokens": 959265667.0, "step": 28635 }, { "epoch": 1.7096466093600764, "grad_norm": 0.5077849626541138, "learning_rate": 7.544706574227423e-06, "loss": 0.4035, "mean_token_accuracy": 0.8643146634101868, "num_tokens": 959433347.0, "step": 28640 }, { "epoch": 1.7099450811843362, "grad_norm": 0.5975263118743896, "learning_rate": 7.5395792652258514e-06, "loss": 0.4528, "mean_token_accuracy": 0.8489264011383056, "num_tokens": 959601027.0, "step": 28645 }, { "epoch": 1.710243553008596, "grad_norm": 0.5666967630386353, "learning_rate": 7.53445681795318e-06, "loss": 0.4485, "mean_token_accuracy": 0.8485506534576416, "num_tokens": 959768707.0, "step": 28650 }, { "epoch": 1.7105420248328558, "grad_norm": 0.4827417731285095, "learning_rate": 7.5293392336570676e-06, "loss": 0.4247, "mean_token_accuracy": 0.8574674963951111, "num_tokens": 959936387.0, "step": 28655 }, { "epoch": 1.7108404966571156, "grad_norm": 0.48321419954299927, "learning_rate": 7.524226513584001e-06, "loss": 0.4498, "mean_token_accuracy": 0.8502242684364318, "num_tokens": 960101809.0, "step": 28660 }, { "epoch": 1.7111389684813754, "grad_norm": 0.5237590670585632, "learning_rate": 7.51911865897928e-06, "loss": 0.4706, "mean_token_accuracy": 0.845162832736969, "num_tokens": 960269489.0, "step": 28665 }, { "epoch": 1.7114374403056352, "grad_norm": 0.5307829976081848, "learning_rate": 7.514015671087012e-06, "loss": 0.4312, "mean_token_accuracy": 0.8544971942901611, "num_tokens": 960437169.0, "step": 28670 }, { "epoch": 1.711735912129895, "grad_norm": 0.5036895871162415, "learning_rate": 7.508917551150131e-06, "loss": 0.414, "mean_token_accuracy": 0.860914945602417, "num_tokens": 960604849.0, "step": 28675 }, { "epoch": 1.7120343839541547, "grad_norm": 0.5041924118995667, "learning_rate": 7.503824300410371e-06, "loss": 0.4439, "mean_token_accuracy": 0.8513181567192077, "num_tokens": 960772529.0, "step": 28680 }, { "epoch": 1.7123328557784145, "grad_norm": 0.5621993541717529, "learning_rate": 7.498735920108284e-06, "loss": 0.4432, "mean_token_accuracy": 0.8518370389938354, "num_tokens": 960940209.0, "step": 28685 }, { "epoch": 1.7126313276026743, "grad_norm": 0.5141382813453674, "learning_rate": 7.493652411483251e-06, "loss": 0.4087, "mean_token_accuracy": 0.8628951430320739, "num_tokens": 961107889.0, "step": 28690 }, { "epoch": 1.7129297994269341, "grad_norm": 0.5095019936561584, "learning_rate": 7.488573775773447e-06, "loss": 0.4097, "mean_token_accuracy": 0.8610103845596313, "num_tokens": 961275569.0, "step": 28695 }, { "epoch": 1.713228271251194, "grad_norm": 0.5133291482925415, "learning_rate": 7.483500014215872e-06, "loss": 0.4543, "mean_token_accuracy": 0.8493200540542603, "num_tokens": 961443249.0, "step": 28700 }, { "epoch": 1.7135267430754537, "grad_norm": 0.48671406507492065, "learning_rate": 7.4784311280463205e-06, "loss": 0.444, "mean_token_accuracy": 0.8509841322898865, "num_tokens": 961610929.0, "step": 28705 }, { "epoch": 1.7138252148997135, "grad_norm": 0.4508340060710907, "learning_rate": 7.47336711849943e-06, "loss": 0.4076, "mean_token_accuracy": 0.8617439985275268, "num_tokens": 961778609.0, "step": 28710 }, { "epoch": 1.7141236867239733, "grad_norm": 0.4830595850944519, "learning_rate": 7.468307986808631e-06, "loss": 0.4253, "mean_token_accuracy": 0.8575331091880798, "num_tokens": 961946289.0, "step": 28715 }, { "epoch": 1.714422158548233, "grad_norm": 0.4971616268157959, "learning_rate": 7.4632537342061605e-06, "loss": 0.4057, "mean_token_accuracy": 0.8620004773139953, "num_tokens": 962113969.0, "step": 28720 }, { "epoch": 1.7147206303724927, "grad_norm": 0.5192204117774963, "learning_rate": 7.458204361923078e-06, "loss": 0.4302, "mean_token_accuracy": 0.854570722579956, "num_tokens": 962281259.0, "step": 28725 }, { "epoch": 1.7150191021967527, "grad_norm": 0.5822002291679382, "learning_rate": 7.4531598711892524e-06, "loss": 0.4206, "mean_token_accuracy": 0.8583144426345826, "num_tokens": 962448939.0, "step": 28730 }, { "epoch": 1.7153175740210123, "grad_norm": 0.5741744041442871, "learning_rate": 7.448120263233369e-06, "loss": 0.4437, "mean_token_accuracy": 0.852266502380371, "num_tokens": 962616619.0, "step": 28735 }, { "epoch": 1.7156160458452723, "grad_norm": 0.5148447155952454, "learning_rate": 7.443085539282902e-06, "loss": 0.436, "mean_token_accuracy": 0.8529404759407043, "num_tokens": 962784299.0, "step": 28740 }, { "epoch": 1.7159145176695318, "grad_norm": 0.5515783429145813, "learning_rate": 7.438055700564164e-06, "loss": 0.46, "mean_token_accuracy": 0.8464213252067566, "num_tokens": 962951979.0, "step": 28745 }, { "epoch": 1.7162129894937919, "grad_norm": 0.5184022784233093, "learning_rate": 7.4330307483022594e-06, "loss": 0.3965, "mean_token_accuracy": 0.8646069407463074, "num_tokens": 963119659.0, "step": 28750 }, { "epoch": 1.7165114613180514, "grad_norm": 0.5345397591590881, "learning_rate": 7.428010683721109e-06, "loss": 0.4516, "mean_token_accuracy": 0.8486043095588685, "num_tokens": 963287339.0, "step": 28755 }, { "epoch": 1.7168099331423115, "grad_norm": 0.5140678882598877, "learning_rate": 7.422995508043451e-06, "loss": 0.3867, "mean_token_accuracy": 0.8687820553779602, "num_tokens": 963455019.0, "step": 28760 }, { "epoch": 1.717108404966571, "grad_norm": 0.5007964968681335, "learning_rate": 7.417985222490803e-06, "loss": 0.4303, "mean_token_accuracy": 0.8557855248451233, "num_tokens": 963622699.0, "step": 28765 }, { "epoch": 1.717406876790831, "grad_norm": 0.514384388923645, "learning_rate": 7.412979828283531e-06, "loss": 0.4698, "mean_token_accuracy": 0.8426458239555359, "num_tokens": 963790379.0, "step": 28770 }, { "epoch": 1.7177053486150906, "grad_norm": 0.5809042453765869, "learning_rate": 7.407979326640795e-06, "loss": 0.4444, "mean_token_accuracy": 0.853095555305481, "num_tokens": 963958059.0, "step": 28775 }, { "epoch": 1.7180038204393506, "grad_norm": 0.46283888816833496, "learning_rate": 7.40298371878054e-06, "loss": 0.4431, "mean_token_accuracy": 0.8503161191940307, "num_tokens": 964125739.0, "step": 28780 }, { "epoch": 1.7183022922636102, "grad_norm": 0.5773378610610962, "learning_rate": 7.397993005919551e-06, "loss": 0.4293, "mean_token_accuracy": 0.8577418327331543, "num_tokens": 964293419.0, "step": 28785 }, { "epoch": 1.7186007640878702, "grad_norm": 0.5771684050559998, "learning_rate": 7.393007189273404e-06, "loss": 0.4561, "mean_token_accuracy": 0.8474770426750183, "num_tokens": 964461099.0, "step": 28790 }, { "epoch": 1.7188992359121298, "grad_norm": 0.527180016040802, "learning_rate": 7.38802627005649e-06, "loss": 0.4289, "mean_token_accuracy": 0.856316351890564, "num_tokens": 964628779.0, "step": 28795 }, { "epoch": 1.7191977077363898, "grad_norm": 0.5195962190628052, "learning_rate": 7.3830502494820066e-06, "loss": 0.4409, "mean_token_accuracy": 0.8521770238876343, "num_tokens": 964796459.0, "step": 28800 }, { "epoch": 1.7194961795606494, "grad_norm": 0.5164055228233337, "learning_rate": 7.3780791287619496e-06, "loss": 0.4091, "mean_token_accuracy": 0.8626327157020569, "num_tokens": 964964139.0, "step": 28805 }, { "epoch": 1.7197946513849094, "grad_norm": 0.5179728865623474, "learning_rate": 7.373112909107126e-06, "loss": 0.4101, "mean_token_accuracy": 0.8619348645210266, "num_tokens": 965131819.0, "step": 28810 }, { "epoch": 1.720093123209169, "grad_norm": 0.5365899801254272, "learning_rate": 7.368151591727153e-06, "loss": 0.419, "mean_token_accuracy": 0.857455563545227, "num_tokens": 965299499.0, "step": 28815 }, { "epoch": 1.720391595033429, "grad_norm": 0.5447962284088135, "learning_rate": 7.363195177830451e-06, "loss": 0.431, "mean_token_accuracy": 0.8541273832321167, "num_tokens": 965467179.0, "step": 28820 }, { "epoch": 1.7206900668576885, "grad_norm": 0.48601073026657104, "learning_rate": 7.358243668624254e-06, "loss": 0.4281, "mean_token_accuracy": 0.8566503643989563, "num_tokens": 965634859.0, "step": 28825 }, { "epoch": 1.7209885386819486, "grad_norm": 0.5295308828353882, "learning_rate": 7.353297065314572e-06, "loss": 0.4307, "mean_token_accuracy": 0.8569008708000183, "num_tokens": 965802539.0, "step": 28830 }, { "epoch": 1.7212870105062081, "grad_norm": 0.5698486566543579, "learning_rate": 7.348355369106268e-06, "loss": 0.4398, "mean_token_accuracy": 0.8530418872833252, "num_tokens": 965970219.0, "step": 28835 }, { "epoch": 1.7215854823304682, "grad_norm": 0.4656085968017578, "learning_rate": 7.343418581202964e-06, "loss": 0.3996, "mean_token_accuracy": 0.8655970335006714, "num_tokens": 966137899.0, "step": 28840 }, { "epoch": 1.7218839541547277, "grad_norm": 0.5593969225883484, "learning_rate": 7.3384867028071135e-06, "loss": 0.4967, "mean_token_accuracy": 0.8361266851425171, "num_tokens": 966305579.0, "step": 28845 }, { "epoch": 1.7221824259789877, "grad_norm": 0.4897625148296356, "learning_rate": 7.333559735119966e-06, "loss": 0.4224, "mean_token_accuracy": 0.8582011222839355, "num_tokens": 966473259.0, "step": 28850 }, { "epoch": 1.7224808978032473, "grad_norm": 0.5123838186264038, "learning_rate": 7.328637679341572e-06, "loss": 0.3894, "mean_token_accuracy": 0.8688894152641297, "num_tokens": 966640939.0, "step": 28855 }, { "epoch": 1.7227793696275073, "grad_norm": 0.5210784077644348, "learning_rate": 7.3237205366708e-06, "loss": 0.4433, "mean_token_accuracy": 0.8524513959884643, "num_tokens": 966808619.0, "step": 28860 }, { "epoch": 1.723077841451767, "grad_norm": 0.4530964195728302, "learning_rate": 7.318808308305295e-06, "loss": 0.4, "mean_token_accuracy": 0.8650363922119141, "num_tokens": 966976299.0, "step": 28865 }, { "epoch": 1.7233763132760267, "grad_norm": 0.5902482271194458, "learning_rate": 7.31390099544153e-06, "loss": 0.4581, "mean_token_accuracy": 0.8467791914939881, "num_tokens": 967143979.0, "step": 28870 }, { "epoch": 1.7236747851002865, "grad_norm": 0.5239464640617371, "learning_rate": 7.308998599274772e-06, "loss": 0.4526, "mean_token_accuracy": 0.8492007613182068, "num_tokens": 967311659.0, "step": 28875 }, { "epoch": 1.7239732569245463, "grad_norm": 0.4668653905391693, "learning_rate": 7.304101120999086e-06, "loss": 0.396, "mean_token_accuracy": 0.86689133644104, "num_tokens": 967479339.0, "step": 28880 }, { "epoch": 1.724271728748806, "grad_norm": 0.4923475682735443, "learning_rate": 7.299208561807351e-06, "loss": 0.4622, "mean_token_accuracy": 0.8459918856620788, "num_tokens": 967647019.0, "step": 28885 }, { "epoch": 1.7245702005730659, "grad_norm": 0.5477373003959656, "learning_rate": 7.294320922891225e-06, "loss": 0.4121, "mean_token_accuracy": 0.861916983127594, "num_tokens": 967814699.0, "step": 28890 }, { "epoch": 1.7248686723973257, "grad_norm": 0.4551163613796234, "learning_rate": 7.289438205441201e-06, "loss": 0.4339, "mean_token_accuracy": 0.8527881026268005, "num_tokens": 967978525.0, "step": 28895 }, { "epoch": 1.7251671442215855, "grad_norm": 0.5083862543106079, "learning_rate": 7.284560410646544e-06, "loss": 0.4332, "mean_token_accuracy": 0.8555469393730164, "num_tokens": 968146205.0, "step": 28900 }, { "epoch": 1.7254656160458453, "grad_norm": 0.46898481249809265, "learning_rate": 7.279687539695332e-06, "loss": 0.4007, "mean_token_accuracy": 0.8623463988304139, "num_tokens": 968313885.0, "step": 28905 }, { "epoch": 1.725764087870105, "grad_norm": 0.49112948775291443, "learning_rate": 7.274819593774443e-06, "loss": 0.4341, "mean_token_accuracy": 0.8541274070739746, "num_tokens": 968481565.0, "step": 28910 }, { "epoch": 1.7260625596943648, "grad_norm": 0.5248199701309204, "learning_rate": 7.269956574069557e-06, "loss": 0.4801, "mean_token_accuracy": 0.8411189317703247, "num_tokens": 968649245.0, "step": 28915 }, { "epoch": 1.7263610315186246, "grad_norm": 0.49649155139923096, "learning_rate": 7.265098481765156e-06, "loss": 0.4202, "mean_token_accuracy": 0.8583621621131897, "num_tokens": 968816925.0, "step": 28920 }, { "epoch": 1.7266595033428844, "grad_norm": 0.510306715965271, "learning_rate": 7.260245318044506e-06, "loss": 0.3827, "mean_token_accuracy": 0.8696409344673157, "num_tokens": 968984605.0, "step": 28925 }, { "epoch": 1.7269579751671442, "grad_norm": 0.4668762683868408, "learning_rate": 7.255397084089695e-06, "loss": 0.4285, "mean_token_accuracy": 0.8577836155891418, "num_tokens": 969152285.0, "step": 28930 }, { "epoch": 1.727256446991404, "grad_norm": 0.4967258870601654, "learning_rate": 7.250553781081597e-06, "loss": 0.4556, "mean_token_accuracy": 0.8465525388717652, "num_tokens": 969319965.0, "step": 28935 }, { "epoch": 1.7275549188156638, "grad_norm": 0.4535907804965973, "learning_rate": 7.245715410199885e-06, "loss": 0.4207, "mean_token_accuracy": 0.8593045592308044, "num_tokens": 969487645.0, "step": 28940 }, { "epoch": 1.7278533906399236, "grad_norm": 0.5136157274246216, "learning_rate": 7.2408819726230425e-06, "loss": 0.3981, "mean_token_accuracy": 0.8662173509597778, "num_tokens": 969655325.0, "step": 28945 }, { "epoch": 1.7281518624641834, "grad_norm": 0.5177064538002014, "learning_rate": 7.2360534695283314e-06, "loss": 0.4551, "mean_token_accuracy": 0.8487236142158509, "num_tokens": 969823005.0, "step": 28950 }, { "epoch": 1.7284503342884432, "grad_norm": 0.4764386713504791, "learning_rate": 7.231229902091827e-06, "loss": 0.4167, "mean_token_accuracy": 0.8607300400733948, "num_tokens": 969990685.0, "step": 28955 }, { "epoch": 1.728748806112703, "grad_norm": 0.5229875445365906, "learning_rate": 7.226411271488398e-06, "loss": 0.5133, "mean_token_accuracy": 0.8314565300941468, "num_tokens": 970158365.0, "step": 28960 }, { "epoch": 1.7290472779369628, "grad_norm": 0.5183318257331848, "learning_rate": 7.221597578891713e-06, "loss": 0.4834, "mean_token_accuracy": 0.8390671610832214, "num_tokens": 970326045.0, "step": 28965 }, { "epoch": 1.7293457497612226, "grad_norm": 0.48091086745262146, "learning_rate": 7.216788825474233e-06, "loss": 0.5109, "mean_token_accuracy": 0.8319217443466187, "num_tokens": 970493725.0, "step": 28970 }, { "epoch": 1.7296442215854824, "grad_norm": 0.5449036359786987, "learning_rate": 7.211985012407219e-06, "loss": 0.4715, "mean_token_accuracy": 0.8427252888679504, "num_tokens": 970654365.0, "step": 28975 }, { "epoch": 1.7299426934097422, "grad_norm": 0.5691753625869751, "learning_rate": 7.207186140860733e-06, "loss": 0.4263, "mean_token_accuracy": 0.857568895816803, "num_tokens": 970822045.0, "step": 28980 }, { "epoch": 1.730241165234002, "grad_norm": 0.5598579049110413, "learning_rate": 7.202392212003623e-06, "loss": 0.4246, "mean_token_accuracy": 0.8589526534080505, "num_tokens": 970989725.0, "step": 28985 }, { "epoch": 1.7305396370582617, "grad_norm": 0.5520233511924744, "learning_rate": 7.197603227003538e-06, "loss": 0.4255, "mean_token_accuracy": 0.8578790307044983, "num_tokens": 971157405.0, "step": 28990 }, { "epoch": 1.7308381088825215, "grad_norm": 0.4802533686161041, "learning_rate": 7.192819187026928e-06, "loss": 0.4604, "mean_token_accuracy": 0.8450733661651612, "num_tokens": 971325085.0, "step": 28995 }, { "epoch": 1.7311365807067811, "grad_norm": 0.5109880566596985, "learning_rate": 7.188040093239032e-06, "loss": 0.4186, "mean_token_accuracy": 0.8593582153320313, "num_tokens": 971492765.0, "step": 29000 }, { "epoch": 1.7314350525310411, "grad_norm": 0.4747738540172577, "learning_rate": 7.183265946803892e-06, "loss": 0.4051, "mean_token_accuracy": 0.8643803000450134, "num_tokens": 971660445.0, "step": 29005 }, { "epoch": 1.7317335243553007, "grad_norm": 0.5008866190910339, "learning_rate": 7.178496748884329e-06, "loss": 0.4108, "mean_token_accuracy": 0.8623046636581421, "num_tokens": 971828125.0, "step": 29010 }, { "epoch": 1.7320319961795607, "grad_norm": 0.5050458312034607, "learning_rate": 7.173732500641972e-06, "loss": 0.4519, "mean_token_accuracy": 0.8488190412521363, "num_tokens": 971995805.0, "step": 29015 }, { "epoch": 1.7323304680038203, "grad_norm": 0.5497936606407166, "learning_rate": 7.168973203237253e-06, "loss": 0.4582, "mean_token_accuracy": 0.8469521641731262, "num_tokens": 972163485.0, "step": 29020 }, { "epoch": 1.7326289398280803, "grad_norm": 0.5496740341186523, "learning_rate": 7.164218857829373e-06, "loss": 0.4554, "mean_token_accuracy": 0.8479840159416199, "num_tokens": 972331165.0, "step": 29025 }, { "epoch": 1.7329274116523399, "grad_norm": 0.5461985468864441, "learning_rate": 7.15946946557635e-06, "loss": 0.4564, "mean_token_accuracy": 0.8468149900436401, "num_tokens": 972498845.0, "step": 29030 }, { "epoch": 1.7332258834766, "grad_norm": 0.5176361203193665, "learning_rate": 7.1547250276349746e-06, "loss": 0.4007, "mean_token_accuracy": 0.8657998323440552, "num_tokens": 972666525.0, "step": 29035 }, { "epoch": 1.7335243553008595, "grad_norm": 0.5617021322250366, "learning_rate": 7.149985545160852e-06, "loss": 0.4602, "mean_token_accuracy": 0.8467255115509034, "num_tokens": 972834205.0, "step": 29040 }, { "epoch": 1.7338228271251195, "grad_norm": 0.5165064334869385, "learning_rate": 7.1452510193083745e-06, "loss": 0.459, "mean_token_accuracy": 0.8488667726516723, "num_tokens": 973001885.0, "step": 29045 }, { "epoch": 1.734121298949379, "grad_norm": 0.5426809191703796, "learning_rate": 7.1405214512307085e-06, "loss": 0.4268, "mean_token_accuracy": 0.8566265106201172, "num_tokens": 973169565.0, "step": 29050 }, { "epoch": 1.734419770773639, "grad_norm": 0.5158417820930481, "learning_rate": 7.135796842079841e-06, "loss": 0.4816, "mean_token_accuracy": 0.8405284523963928, "num_tokens": 973337245.0, "step": 29055 }, { "epoch": 1.7347182425978986, "grad_norm": 0.5401809811592102, "learning_rate": 7.13107719300653e-06, "loss": 0.4697, "mean_token_accuracy": 0.8447095274925231, "num_tokens": 973504925.0, "step": 29060 }, { "epoch": 1.7350167144221587, "grad_norm": 0.5143449902534485, "learning_rate": 7.126362505160339e-06, "loss": 0.4358, "mean_token_accuracy": 0.8531253695487976, "num_tokens": 973672605.0, "step": 29065 }, { "epoch": 1.7353151862464182, "grad_norm": 0.47009438276290894, "learning_rate": 7.1216527796896126e-06, "loss": 0.4591, "mean_token_accuracy": 0.8464630842208862, "num_tokens": 973840285.0, "step": 29070 }, { "epoch": 1.7356136580706782, "grad_norm": 0.5764166116714478, "learning_rate": 7.11694801774149e-06, "loss": 0.4459, "mean_token_accuracy": 0.8515089988708496, "num_tokens": 974007965.0, "step": 29075 }, { "epoch": 1.7359121298949378, "grad_norm": 0.5124257802963257, "learning_rate": 7.112248220461908e-06, "loss": 0.453, "mean_token_accuracy": 0.8497912406921386, "num_tokens": 974175645.0, "step": 29080 }, { "epoch": 1.7362106017191978, "grad_norm": 0.4700307548046112, "learning_rate": 7.107553388995586e-06, "loss": 0.4199, "mean_token_accuracy": 0.8593761086463928, "num_tokens": 974343325.0, "step": 29085 }, { "epoch": 1.7365090735434574, "grad_norm": 0.45867449045181274, "learning_rate": 7.102863524486042e-06, "loss": 0.4006, "mean_token_accuracy": 0.8648157000541687, "num_tokens": 974511005.0, "step": 29090 }, { "epoch": 1.7368075453677174, "grad_norm": 0.5127758979797363, "learning_rate": 7.098178628075568e-06, "loss": 0.4498, "mean_token_accuracy": 0.8496123075485229, "num_tokens": 974678685.0, "step": 29095 }, { "epoch": 1.737106017191977, "grad_norm": 0.5111568570137024, "learning_rate": 7.0934987009052655e-06, "loss": 0.4442, "mean_token_accuracy": 0.8514195442199707, "num_tokens": 974846365.0, "step": 29100 }, { "epoch": 1.737404489016237, "grad_norm": 0.5325424671173096, "learning_rate": 7.0888237441150164e-06, "loss": 0.4416, "mean_token_accuracy": 0.8515328764915466, "num_tokens": 975014045.0, "step": 29105 }, { "epoch": 1.7377029608404966, "grad_norm": 0.520170271396637, "learning_rate": 7.084153758843492e-06, "loss": 0.4009, "mean_token_accuracy": 0.8634319543838501, "num_tokens": 975181725.0, "step": 29110 }, { "epoch": 1.7380014326647566, "grad_norm": 0.5719833970069885, "learning_rate": 7.079488746228152e-06, "loss": 0.4187, "mean_token_accuracy": 0.8595729470252991, "num_tokens": 975349405.0, "step": 29115 }, { "epoch": 1.7382999044890162, "grad_norm": 0.48435789346694946, "learning_rate": 7.074828707405247e-06, "loss": 0.4097, "mean_token_accuracy": 0.8607479453086853, "num_tokens": 975517085.0, "step": 29120 }, { "epoch": 1.7385983763132762, "grad_norm": 0.45030462741851807, "learning_rate": 7.0701736435098155e-06, "loss": 0.3969, "mean_token_accuracy": 0.866628885269165, "num_tokens": 975684765.0, "step": 29125 }, { "epoch": 1.7388968481375358, "grad_norm": 0.47363996505737305, "learning_rate": 7.065523555675688e-06, "loss": 0.4299, "mean_token_accuracy": 0.8558690190315247, "num_tokens": 975852445.0, "step": 29130 }, { "epoch": 1.7391953199617958, "grad_norm": 0.5388246774673462, "learning_rate": 7.060878445035475e-06, "loss": 0.4663, "mean_token_accuracy": 0.8441667675971984, "num_tokens": 976020125.0, "step": 29135 }, { "epoch": 1.7394937917860553, "grad_norm": 0.5351729393005371, "learning_rate": 7.0562383127205755e-06, "loss": 0.4287, "mean_token_accuracy": 0.8562984466552734, "num_tokens": 976187805.0, "step": 29140 }, { "epoch": 1.7397922636103151, "grad_norm": 0.5127094984054565, "learning_rate": 7.051603159861185e-06, "loss": 0.4433, "mean_token_accuracy": 0.8518012642860413, "num_tokens": 976355485.0, "step": 29145 }, { "epoch": 1.740090735434575, "grad_norm": 0.47143083810806274, "learning_rate": 7.046972987586278e-06, "loss": 0.4619, "mean_token_accuracy": 0.8455684065818787, "num_tokens": 976523165.0, "step": 29150 }, { "epoch": 1.7403892072588347, "grad_norm": 0.5048629641532898, "learning_rate": 7.042347797023627e-06, "loss": 0.4374, "mean_token_accuracy": 0.8541870474815368, "num_tokens": 976690845.0, "step": 29155 }, { "epoch": 1.7406876790830945, "grad_norm": 0.5515156388282776, "learning_rate": 7.037727589299763e-06, "loss": 0.4351, "mean_token_accuracy": 0.8531372904777527, "num_tokens": 976858525.0, "step": 29160 }, { "epoch": 1.7409861509073543, "grad_norm": 0.5150281190872192, "learning_rate": 7.033112365540046e-06, "loss": 0.4113, "mean_token_accuracy": 0.8639532208442688, "num_tokens": 977020654.0, "step": 29165 }, { "epoch": 1.741284622731614, "grad_norm": 0.4680708348751068, "learning_rate": 7.0285021268685815e-06, "loss": 0.3796, "mean_token_accuracy": 0.8723666787147522, "num_tokens": 977188334.0, "step": 29170 }, { "epoch": 1.741583094555874, "grad_norm": 0.5442808866500854, "learning_rate": 7.0238968744082895e-06, "loss": 0.4461, "mean_token_accuracy": 0.8513360381126404, "num_tokens": 977356014.0, "step": 29175 }, { "epoch": 1.7418815663801337, "grad_norm": 0.5612311959266663, "learning_rate": 7.019296609280852e-06, "loss": 0.469, "mean_token_accuracy": 0.8423177838325501, "num_tokens": 977523694.0, "step": 29180 }, { "epoch": 1.7421800382043935, "grad_norm": 0.5339342355728149, "learning_rate": 7.01470133260676e-06, "loss": 0.4202, "mean_token_accuracy": 0.8588035225868225, "num_tokens": 977691374.0, "step": 29185 }, { "epoch": 1.7424785100286533, "grad_norm": 0.47362253069877625, "learning_rate": 7.010111045505276e-06, "loss": 0.4004, "mean_token_accuracy": 0.8640641808509827, "num_tokens": 977859054.0, "step": 29190 }, { "epoch": 1.742776981852913, "grad_norm": 0.49893632531166077, "learning_rate": 7.0055257490944435e-06, "loss": 0.4327, "mean_token_accuracy": 0.8548312067985535, "num_tokens": 978026734.0, "step": 29195 }, { "epoch": 1.7430754536771729, "grad_norm": 0.498502254486084, "learning_rate": 7.0009454444911e-06, "loss": 0.4054, "mean_token_accuracy": 0.8631158351898194, "num_tokens": 978194414.0, "step": 29200 }, { "epoch": 1.7433739255014327, "grad_norm": 0.47744449973106384, "learning_rate": 6.996370132810863e-06, "loss": 0.4117, "mean_token_accuracy": 0.8615889191627503, "num_tokens": 978362094.0, "step": 29205 }, { "epoch": 1.7436723973256925, "grad_norm": 0.5168974995613098, "learning_rate": 6.991799815168132e-06, "loss": 0.4116, "mean_token_accuracy": 0.861678397655487, "num_tokens": 978529774.0, "step": 29210 }, { "epoch": 1.7439708691499523, "grad_norm": 0.5573127269744873, "learning_rate": 6.987234492676101e-06, "loss": 0.4266, "mean_token_accuracy": 0.8568233370780944, "num_tokens": 978697454.0, "step": 29215 }, { "epoch": 1.744269340974212, "grad_norm": 0.5329184532165527, "learning_rate": 6.982674166446721e-06, "loss": 0.4745, "mean_token_accuracy": 0.8439878225326538, "num_tokens": 978865134.0, "step": 29220 }, { "epoch": 1.7445678127984718, "grad_norm": 0.5054529309272766, "learning_rate": 6.978118837590762e-06, "loss": 0.4428, "mean_token_accuracy": 0.852266502380371, "num_tokens": 979032814.0, "step": 29225 }, { "epoch": 1.7448662846227316, "grad_norm": 0.5723335146903992, "learning_rate": 6.973568507217748e-06, "loss": 0.414, "mean_token_accuracy": 0.8620839715003967, "num_tokens": 979200494.0, "step": 29230 }, { "epoch": 1.7451647564469914, "grad_norm": 0.5810419917106628, "learning_rate": 6.969023176435998e-06, "loss": 0.4487, "mean_token_accuracy": 0.8496063590049744, "num_tokens": 979368174.0, "step": 29235 }, { "epoch": 1.7454632282712512, "grad_norm": 0.48232221603393555, "learning_rate": 6.964482846352613e-06, "loss": 0.4197, "mean_token_accuracy": 0.8599069476127624, "num_tokens": 979535854.0, "step": 29240 }, { "epoch": 1.745761700095511, "grad_norm": 0.5374194383621216, "learning_rate": 6.9599475180734715e-06, "loss": 0.4154, "mean_token_accuracy": 0.8601992130279541, "num_tokens": 979703534.0, "step": 29245 }, { "epoch": 1.7460601719197708, "grad_norm": 0.5285108089447021, "learning_rate": 6.955417192703242e-06, "loss": 0.4761, "mean_token_accuracy": 0.8415483593940735, "num_tokens": 979871214.0, "step": 29250 }, { "epoch": 1.7463586437440306, "grad_norm": 0.4871658980846405, "learning_rate": 6.950891871345362e-06, "loss": 0.4199, "mean_token_accuracy": 0.8586902022361755, "num_tokens": 980038894.0, "step": 29255 }, { "epoch": 1.7466571155682904, "grad_norm": 0.4737515449523926, "learning_rate": 6.946371555102058e-06, "loss": 0.4323, "mean_token_accuracy": 0.8547477006912232, "num_tokens": 980206574.0, "step": 29260 }, { "epoch": 1.7469555873925502, "grad_norm": 0.5292275547981262, "learning_rate": 6.941856245074341e-06, "loss": 0.4226, "mean_token_accuracy": 0.8580042958259583, "num_tokens": 980374254.0, "step": 29265 }, { "epoch": 1.74725405921681, "grad_norm": 0.4743780791759491, "learning_rate": 6.937345942361992e-06, "loss": 0.4303, "mean_token_accuracy": 0.8553918600082397, "num_tokens": 980541934.0, "step": 29270 }, { "epoch": 1.7475525310410696, "grad_norm": 0.5489332675933838, "learning_rate": 6.932840648063588e-06, "loss": 0.4259, "mean_token_accuracy": 0.8564177393913269, "num_tokens": 980709614.0, "step": 29275 }, { "epoch": 1.7478510028653296, "grad_norm": 0.5025535225868225, "learning_rate": 6.928340363276464e-06, "loss": 0.4545, "mean_token_accuracy": 0.847667908668518, "num_tokens": 980877294.0, "step": 29280 }, { "epoch": 1.7481494746895891, "grad_norm": 0.5074628591537476, "learning_rate": 6.9238450890967575e-06, "loss": 0.4301, "mean_token_accuracy": 0.8545985817909241, "num_tokens": 981044974.0, "step": 29285 }, { "epoch": 1.7484479465138492, "grad_norm": 0.5280393362045288, "learning_rate": 6.919354826619369e-06, "loss": 0.4493, "mean_token_accuracy": 0.8487355351448059, "num_tokens": 981212654.0, "step": 29290 }, { "epoch": 1.7487464183381087, "grad_norm": 0.4847422242164612, "learning_rate": 6.914869576937988e-06, "loss": 0.4602, "mean_token_accuracy": 0.8480973482131958, "num_tokens": 981380334.0, "step": 29295 }, { "epoch": 1.7490448901623687, "grad_norm": 0.48247066140174866, "learning_rate": 6.910389341145085e-06, "loss": 0.4243, "mean_token_accuracy": 0.8567696452140808, "num_tokens": 981548014.0, "step": 29300 }, { "epoch": 1.7493433619866283, "grad_norm": 0.5005117654800415, "learning_rate": 6.905914120331891e-06, "loss": 0.4526, "mean_token_accuracy": 0.8484253883361816, "num_tokens": 981715694.0, "step": 29305 }, { "epoch": 1.7496418338108883, "grad_norm": 0.5716114640235901, "learning_rate": 6.901443915588443e-06, "loss": 0.4315, "mean_token_accuracy": 0.8556662440299988, "num_tokens": 981883374.0, "step": 29310 }, { "epoch": 1.749940305635148, "grad_norm": 0.6131590008735657, "learning_rate": 6.8969787280035306e-06, "loss": 0.4333, "mean_token_accuracy": 0.8557735919952393, "num_tokens": 982051054.0, "step": 29315 }, { "epoch": 1.750238777459408, "grad_norm": 0.5128840804100037, "learning_rate": 6.892518558664739e-06, "loss": 0.4395, "mean_token_accuracy": 0.8533997297286987, "num_tokens": 982218734.0, "step": 29320 }, { "epoch": 1.7505372492836675, "grad_norm": 0.49456241726875305, "learning_rate": 6.888063408658421e-06, "loss": 0.3973, "mean_token_accuracy": 0.8649588465690613, "num_tokens": 982386414.0, "step": 29325 }, { "epoch": 1.7508357211079275, "grad_norm": 0.49524593353271484, "learning_rate": 6.883613279069712e-06, "loss": 0.398, "mean_token_accuracy": 0.8655433654785156, "num_tokens": 982554094.0, "step": 29330 }, { "epoch": 1.751134192932187, "grad_norm": 0.4886665940284729, "learning_rate": 6.879168170982531e-06, "loss": 0.4201, "mean_token_accuracy": 0.8586782693862915, "num_tokens": 982721774.0, "step": 29335 }, { "epoch": 1.751432664756447, "grad_norm": 0.49355655908584595, "learning_rate": 6.874728085479551e-06, "loss": 0.4182, "mean_token_accuracy": 0.8605272531509399, "num_tokens": 982889454.0, "step": 29340 }, { "epoch": 1.7517311365807067, "grad_norm": 0.507889449596405, "learning_rate": 6.870293023642244e-06, "loss": 0.4583, "mean_token_accuracy": 0.8461708307266236, "num_tokens": 983057134.0, "step": 29345 }, { "epoch": 1.7520296084049667, "grad_norm": 0.4929352104663849, "learning_rate": 6.8658629865508585e-06, "loss": 0.4703, "mean_token_accuracy": 0.8423476099967957, "num_tokens": 983224814.0, "step": 29350 }, { "epoch": 1.7523280802292263, "grad_norm": 0.5128406882286072, "learning_rate": 6.861437975284399e-06, "loss": 0.4205, "mean_token_accuracy": 0.8580937623977661, "num_tokens": 983392494.0, "step": 29355 }, { "epoch": 1.7526265520534863, "grad_norm": 0.5166106224060059, "learning_rate": 6.857017990920672e-06, "loss": 0.4722, "mean_token_accuracy": 0.8444411396980286, "num_tokens": 983560174.0, "step": 29360 }, { "epoch": 1.7529250238777458, "grad_norm": 0.5328486561775208, "learning_rate": 6.852603034536231e-06, "loss": 0.4444, "mean_token_accuracy": 0.8511332392692565, "num_tokens": 983727854.0, "step": 29365 }, { "epoch": 1.7532234957020059, "grad_norm": 0.48859071731567383, "learning_rate": 6.84819310720643e-06, "loss": 0.4608, "mean_token_accuracy": 0.8448586344718934, "num_tokens": 983895534.0, "step": 29370 }, { "epoch": 1.7535219675262654, "grad_norm": 0.5125442743301392, "learning_rate": 6.8437882100053946e-06, "loss": 0.4601, "mean_token_accuracy": 0.8469879508018494, "num_tokens": 984063214.0, "step": 29375 }, { "epoch": 1.7538204393505255, "grad_norm": 0.5006015300750732, "learning_rate": 6.839388344006004e-06, "loss": 0.4298, "mean_token_accuracy": 0.8558272600173951, "num_tokens": 984230894.0, "step": 29380 }, { "epoch": 1.754118911174785, "grad_norm": 0.5291780233383179, "learning_rate": 6.83499351027993e-06, "loss": 0.4062, "mean_token_accuracy": 0.862919008731842, "num_tokens": 984398574.0, "step": 29385 }, { "epoch": 1.754417382999045, "grad_norm": 0.5332061052322388, "learning_rate": 6.830603709897625e-06, "loss": 0.4418, "mean_token_accuracy": 0.8513121843338013, "num_tokens": 984566254.0, "step": 29390 }, { "epoch": 1.7547158548233046, "grad_norm": 0.5164974927902222, "learning_rate": 6.8262189439283004e-06, "loss": 0.4455, "mean_token_accuracy": 0.8509960651397706, "num_tokens": 984733934.0, "step": 29395 }, { "epoch": 1.7550143266475646, "grad_norm": 0.47510188817977905, "learning_rate": 6.82183921343994e-06, "loss": 0.4042, "mean_token_accuracy": 0.8634796619415284, "num_tokens": 984901614.0, "step": 29400 }, { "epoch": 1.7553127984718242, "grad_norm": 0.5421191453933716, "learning_rate": 6.81746451949932e-06, "loss": 0.4726, "mean_token_accuracy": 0.842317771911621, "num_tokens": 985069294.0, "step": 29405 }, { "epoch": 1.7556112702960842, "grad_norm": 0.48657265305519104, "learning_rate": 6.813094863171968e-06, "loss": 0.3967, "mean_token_accuracy": 0.8663724184036254, "num_tokens": 985236974.0, "step": 29410 }, { "epoch": 1.7559097421203438, "grad_norm": 0.4703061282634735, "learning_rate": 6.808730245522199e-06, "loss": 0.4188, "mean_token_accuracy": 0.8590003728866578, "num_tokens": 985404654.0, "step": 29415 }, { "epoch": 1.7562082139446036, "grad_norm": 0.5072619915008545, "learning_rate": 6.804370667613103e-06, "loss": 0.4632, "mean_token_accuracy": 0.8465823769569397, "num_tokens": 985572334.0, "step": 29420 }, { "epoch": 1.7565066857688634, "grad_norm": 0.5081788897514343, "learning_rate": 6.800016130506521e-06, "loss": 0.4306, "mean_token_accuracy": 0.8566324710845947, "num_tokens": 985740014.0, "step": 29425 }, { "epoch": 1.7568051575931232, "grad_norm": 0.4417765140533447, "learning_rate": 6.795666635263083e-06, "loss": 0.4129, "mean_token_accuracy": 0.8616784095764161, "num_tokens": 985907694.0, "step": 29430 }, { "epoch": 1.757103629417383, "grad_norm": 0.5713768601417542, "learning_rate": 6.7913221829422055e-06, "loss": 0.4495, "mean_token_accuracy": 0.8488071084022522, "num_tokens": 986075374.0, "step": 29435 }, { "epoch": 1.7574021012416428, "grad_norm": 0.5122180581092834, "learning_rate": 6.786982774602044e-06, "loss": 0.4508, "mean_token_accuracy": 0.8512942910194397, "num_tokens": 986243054.0, "step": 29440 }, { "epoch": 1.7577005730659025, "grad_norm": 0.4664669632911682, "learning_rate": 6.782648411299544e-06, "loss": 0.4132, "mean_token_accuracy": 0.8611415982246399, "num_tokens": 986410734.0, "step": 29445 }, { "epoch": 1.7579990448901623, "grad_norm": 0.5467097163200378, "learning_rate": 6.778319094090427e-06, "loss": 0.4528, "mean_token_accuracy": 0.8484194159507752, "num_tokens": 986578414.0, "step": 29450 }, { "epoch": 1.7582975167144221, "grad_norm": 0.5622749924659729, "learning_rate": 6.77399482402917e-06, "loss": 0.4422, "mean_token_accuracy": 0.8527436494827271, "num_tokens": 986746094.0, "step": 29455 }, { "epoch": 1.758595988538682, "grad_norm": 0.5136642456054688, "learning_rate": 6.7696756021690356e-06, "loss": 0.421, "mean_token_accuracy": 0.8591971874237061, "num_tokens": 986913774.0, "step": 29460 }, { "epoch": 1.7588944603629417, "grad_norm": 0.5292744636535645, "learning_rate": 6.765361429562042e-06, "loss": 0.4342, "mean_token_accuracy": 0.8541631937026978, "num_tokens": 987081454.0, "step": 29465 }, { "epoch": 1.7591929321872015, "grad_norm": 0.5027296543121338, "learning_rate": 6.761052307258992e-06, "loss": 0.4121, "mean_token_accuracy": 0.8607479453086853, "num_tokens": 987249134.0, "step": 29470 }, { "epoch": 1.7594914040114613, "grad_norm": 0.5739250779151917, "learning_rate": 6.756748236309446e-06, "loss": 0.4361, "mean_token_accuracy": 0.8539365410804749, "num_tokens": 987416814.0, "step": 29475 }, { "epoch": 1.759789875835721, "grad_norm": 0.48803815245628357, "learning_rate": 6.752449217761744e-06, "loss": 0.4259, "mean_token_accuracy": 0.8570976853370667, "num_tokens": 987584494.0, "step": 29480 }, { "epoch": 1.760088347659981, "grad_norm": 0.4690641760826111, "learning_rate": 6.748155252662996e-06, "loss": 0.4255, "mean_token_accuracy": 0.8577955484390258, "num_tokens": 987752174.0, "step": 29485 }, { "epoch": 1.7603868194842407, "grad_norm": 0.4950862228870392, "learning_rate": 6.743866342059066e-06, "loss": 0.4608, "mean_token_accuracy": 0.845288074016571, "num_tokens": 987919854.0, "step": 29490 }, { "epoch": 1.7606852913085005, "grad_norm": 0.5685482025146484, "learning_rate": 6.739582486994608e-06, "loss": 0.4432, "mean_token_accuracy": 0.8523320913314819, "num_tokens": 988087534.0, "step": 29495 }, { "epoch": 1.7609837631327603, "grad_norm": 0.48449113965034485, "learning_rate": 6.735303688513025e-06, "loss": 0.4256, "mean_token_accuracy": 0.859173321723938, "num_tokens": 988255214.0, "step": 29500 }, { "epoch": 1.76128223495702, "grad_norm": 0.4949661195278168, "learning_rate": 6.731029947656505e-06, "loss": 0.4143, "mean_token_accuracy": 0.8599248647689819, "num_tokens": 988422894.0, "step": 29505 }, { "epoch": 1.7615807067812799, "grad_norm": 0.46910420060157776, "learning_rate": 6.726761265465985e-06, "loss": 0.4161, "mean_token_accuracy": 0.8598831176757813, "num_tokens": 988590574.0, "step": 29510 }, { "epoch": 1.7618791786055397, "grad_norm": 0.548302173614502, "learning_rate": 6.722497642981192e-06, "loss": 0.4388, "mean_token_accuracy": 0.8521591305732727, "num_tokens": 988758254.0, "step": 29515 }, { "epoch": 1.7621776504297995, "grad_norm": 0.5400492548942566, "learning_rate": 6.7182390812406125e-06, "loss": 0.4209, "mean_token_accuracy": 0.8587379217147827, "num_tokens": 988925934.0, "step": 29520 }, { "epoch": 1.7624761222540593, "grad_norm": 0.531758189201355, "learning_rate": 6.713985581281487e-06, "loss": 0.4372, "mean_token_accuracy": 0.853996193408966, "num_tokens": 989093614.0, "step": 29525 }, { "epoch": 1.762774594078319, "grad_norm": 0.48997655510902405, "learning_rate": 6.70973714413984e-06, "loss": 0.4511, "mean_token_accuracy": 0.8480317234992981, "num_tokens": 989261294.0, "step": 29530 }, { "epoch": 1.7630730659025788, "grad_norm": 0.4956251084804535, "learning_rate": 6.705493770850455e-06, "loss": 0.4228, "mean_token_accuracy": 0.8595908403396606, "num_tokens": 989428974.0, "step": 29535 }, { "epoch": 1.7633715377268386, "grad_norm": 0.5295732617378235, "learning_rate": 6.701255462446881e-06, "loss": 0.4159, "mean_token_accuracy": 0.860926878452301, "num_tokens": 989596654.0, "step": 29540 }, { "epoch": 1.7636700095510984, "grad_norm": 0.5092737078666687, "learning_rate": 6.6970222199614474e-06, "loss": 0.4296, "mean_token_accuracy": 0.8560718059539795, "num_tokens": 989764334.0, "step": 29545 }, { "epoch": 1.7639684813753582, "grad_norm": 0.5308535695075989, "learning_rate": 6.692794044425228e-06, "loss": 0.4366, "mean_token_accuracy": 0.8535667419433594, "num_tokens": 989932014.0, "step": 29550 }, { "epoch": 1.764266953199618, "grad_norm": 0.48232805728912354, "learning_rate": 6.688570936868076e-06, "loss": 0.4279, "mean_token_accuracy": 0.8573243379592895, "num_tokens": 990099694.0, "step": 29555 }, { "epoch": 1.7645654250238776, "grad_norm": 0.5460061430931091, "learning_rate": 6.684352898318606e-06, "loss": 0.4795, "mean_token_accuracy": 0.839448893070221, "num_tokens": 990267374.0, "step": 29560 }, { "epoch": 1.7648638968481376, "grad_norm": 0.5096883177757263, "learning_rate": 6.680139929804202e-06, "loss": 0.4549, "mean_token_accuracy": 0.8480734825134277, "num_tokens": 990435054.0, "step": 29565 }, { "epoch": 1.7651623686723972, "grad_norm": 0.5248125195503235, "learning_rate": 6.675932032351008e-06, "loss": 0.4159, "mean_token_accuracy": 0.8602469205856323, "num_tokens": 990602734.0, "step": 29570 }, { "epoch": 1.7654608404966572, "grad_norm": 0.4921119511127472, "learning_rate": 6.671729206983934e-06, "loss": 0.4502, "mean_token_accuracy": 0.8517058253288269, "num_tokens": 990770414.0, "step": 29575 }, { "epoch": 1.7657593123209168, "grad_norm": 0.5222781896591187, "learning_rate": 6.667531454726664e-06, "loss": 0.381, "mean_token_accuracy": 0.8724323034286499, "num_tokens": 990938094.0, "step": 29580 }, { "epoch": 1.7660577841451768, "grad_norm": 0.5048891305923462, "learning_rate": 6.663338776601626e-06, "loss": 0.4473, "mean_token_accuracy": 0.850751531124115, "num_tokens": 991105774.0, "step": 29585 }, { "epoch": 1.7663562559694364, "grad_norm": 0.49026796221733093, "learning_rate": 6.659151173630029e-06, "loss": 0.4047, "mean_token_accuracy": 0.864398181438446, "num_tokens": 991273454.0, "step": 29590 }, { "epoch": 1.7666547277936964, "grad_norm": 0.555770218372345, "learning_rate": 6.6549686468318454e-06, "loss": 0.4178, "mean_token_accuracy": 0.8583025217056275, "num_tokens": 991441134.0, "step": 29595 }, { "epoch": 1.766953199617956, "grad_norm": 0.4972131848335266, "learning_rate": 6.6507911972258e-06, "loss": 0.4104, "mean_token_accuracy": 0.8608612656593323, "num_tokens": 991608814.0, "step": 29600 }, { "epoch": 1.767251671442216, "grad_norm": 0.48660045862197876, "learning_rate": 6.646618825829394e-06, "loss": 0.4345, "mean_token_accuracy": 0.8554919004440308, "num_tokens": 991771670.0, "step": 29605 }, { "epoch": 1.7675501432664755, "grad_norm": 0.5368556380271912, "learning_rate": 6.642451533658879e-06, "loss": 0.4054, "mean_token_accuracy": 0.8636287689208985, "num_tokens": 991939350.0, "step": 29610 }, { "epoch": 1.7678486150907355, "grad_norm": 0.46552684903144836, "learning_rate": 6.638289321729283e-06, "loss": 0.4149, "mean_token_accuracy": 0.8622509837150574, "num_tokens": 992107030.0, "step": 29615 }, { "epoch": 1.7681470869149951, "grad_norm": 0.5402354598045349, "learning_rate": 6.634132191054379e-06, "loss": 0.4233, "mean_token_accuracy": 0.8583919763565063, "num_tokens": 992274710.0, "step": 29620 }, { "epoch": 1.7684455587392551, "grad_norm": 0.5115271210670471, "learning_rate": 6.6299801426467234e-06, "loss": 0.4332, "mean_token_accuracy": 0.8553441524505615, "num_tokens": 992442390.0, "step": 29625 }, { "epoch": 1.7687440305635147, "grad_norm": 0.49103888869285583, "learning_rate": 6.625833177517627e-06, "loss": 0.3992, "mean_token_accuracy": 0.8636526226997375, "num_tokens": 992610070.0, "step": 29630 }, { "epoch": 1.7690425023877747, "grad_norm": 0.4704672694206238, "learning_rate": 6.62169129667714e-06, "loss": 0.4018, "mean_token_accuracy": 0.8645353674888611, "num_tokens": 992777750.0, "step": 29635 }, { "epoch": 1.7693409742120343, "grad_norm": 0.571395754814148, "learning_rate": 6.617554501134118e-06, "loss": 0.4396, "mean_token_accuracy": 0.8529345035552979, "num_tokens": 992945430.0, "step": 29640 }, { "epoch": 1.7696394460362943, "grad_norm": 0.5068790316581726, "learning_rate": 6.61342279189614e-06, "loss": 0.412, "mean_token_accuracy": 0.8606405854225159, "num_tokens": 993113110.0, "step": 29645 }, { "epoch": 1.7699379178605539, "grad_norm": 0.5302010774612427, "learning_rate": 6.609296169969559e-06, "loss": 0.4267, "mean_token_accuracy": 0.8559704184532165, "num_tokens": 993280790.0, "step": 29650 }, { "epoch": 1.770236389684814, "grad_norm": 0.4594547748565674, "learning_rate": 6.6051746363594975e-06, "loss": 0.4412, "mean_token_accuracy": 0.8518251180648804, "num_tokens": 993448470.0, "step": 29655 }, { "epoch": 1.7705348615090735, "grad_norm": 0.5209713578224182, "learning_rate": 6.601058192069829e-06, "loss": 0.472, "mean_token_accuracy": 0.8433198094367981, "num_tokens": 993616150.0, "step": 29660 }, { "epoch": 1.7708333333333335, "grad_norm": 0.4633740782737732, "learning_rate": 6.596946838103189e-06, "loss": 0.379, "mean_token_accuracy": 0.8718776106834412, "num_tokens": 993783830.0, "step": 29665 }, { "epoch": 1.771131805157593, "grad_norm": 0.5139845013618469, "learning_rate": 6.592840575460968e-06, "loss": 0.4091, "mean_token_accuracy": 0.8618036508560181, "num_tokens": 993951510.0, "step": 29670 }, { "epoch": 1.771430276981853, "grad_norm": 0.4802762269973755, "learning_rate": 6.588739405143328e-06, "loss": 0.4141, "mean_token_accuracy": 0.8604795455932617, "num_tokens": 994119190.0, "step": 29675 }, { "epoch": 1.7717287488061126, "grad_norm": 0.5031047463417053, "learning_rate": 6.584643328149185e-06, "loss": 0.4525, "mean_token_accuracy": 0.8474412441253663, "num_tokens": 994286870.0, "step": 29680 }, { "epoch": 1.7720272206303727, "grad_norm": 0.5848226547241211, "learning_rate": 6.5805523454762125e-06, "loss": 0.4522, "mean_token_accuracy": 0.8492186546325684, "num_tokens": 994454550.0, "step": 29685 }, { "epoch": 1.7723256924546322, "grad_norm": 0.5669364929199219, "learning_rate": 6.576466458120848e-06, "loss": 0.4503, "mean_token_accuracy": 0.8499164938926697, "num_tokens": 994622230.0, "step": 29690 }, { "epoch": 1.772624164278892, "grad_norm": 0.5281694531440735, "learning_rate": 6.572385667078273e-06, "loss": 0.442, "mean_token_accuracy": 0.8521710515022278, "num_tokens": 994789910.0, "step": 29695 }, { "epoch": 1.7729226361031518, "grad_norm": 0.5648983716964722, "learning_rate": 6.568309973342457e-06, "loss": 0.4608, "mean_token_accuracy": 0.8459382057189941, "num_tokens": 994957590.0, "step": 29700 }, { "epoch": 1.7732211079274116, "grad_norm": 0.5047057867050171, "learning_rate": 6.564239377906098e-06, "loss": 0.4217, "mean_token_accuracy": 0.8572646856307984, "num_tokens": 995125270.0, "step": 29705 }, { "epoch": 1.7735195797516714, "grad_norm": 0.48366889357566833, "learning_rate": 6.5601738817606685e-06, "loss": 0.421, "mean_token_accuracy": 0.8602588653564454, "num_tokens": 995292950.0, "step": 29710 }, { "epoch": 1.7738180515759312, "grad_norm": 0.4254077970981598, "learning_rate": 6.556113485896395e-06, "loss": 0.3889, "mean_token_accuracy": 0.8687104821205139, "num_tokens": 995460630.0, "step": 29715 }, { "epoch": 1.774116523400191, "grad_norm": 0.48525968194007874, "learning_rate": 6.55205819130226e-06, "loss": 0.4345, "mean_token_accuracy": 0.8535667538642884, "num_tokens": 995628310.0, "step": 29720 }, { "epoch": 1.7744149952244508, "grad_norm": 0.5002026557922363, "learning_rate": 6.548007998966012e-06, "loss": 0.4077, "mean_token_accuracy": 0.8615650534629822, "num_tokens": 995795990.0, "step": 29725 }, { "epoch": 1.7747134670487106, "grad_norm": 0.4627630114555359, "learning_rate": 6.5439629098741435e-06, "loss": 0.4213, "mean_token_accuracy": 0.8598294258117676, "num_tokens": 995963670.0, "step": 29730 }, { "epoch": 1.7750119388729704, "grad_norm": 0.495576947927475, "learning_rate": 6.539922925011911e-06, "loss": 0.4279, "mean_token_accuracy": 0.856340229511261, "num_tokens": 996131350.0, "step": 29735 }, { "epoch": 1.7753104106972302, "grad_norm": 0.5531829595565796, "learning_rate": 6.535888045363328e-06, "loss": 0.4266, "mean_token_accuracy": 0.8580281496047973, "num_tokens": 996299030.0, "step": 29740 }, { "epoch": 1.77560888252149, "grad_norm": 0.4722798466682434, "learning_rate": 6.531858271911165e-06, "loss": 0.4172, "mean_token_accuracy": 0.8583681344985962, "num_tokens": 996466710.0, "step": 29745 }, { "epoch": 1.7759073543457498, "grad_norm": 0.410942405462265, "learning_rate": 6.527833605636953e-06, "loss": 0.3753, "mean_token_accuracy": 0.8734045147895813, "num_tokens": 996634390.0, "step": 29750 }, { "epoch": 1.7762058261700095, "grad_norm": 0.46298593282699585, "learning_rate": 6.5238140475209625e-06, "loss": 0.4172, "mean_token_accuracy": 0.8595013737678527, "num_tokens": 996802070.0, "step": 29755 }, { "epoch": 1.7765042979942693, "grad_norm": 0.5453138947486877, "learning_rate": 6.519799598542235e-06, "loss": 0.4358, "mean_token_accuracy": 0.8545150876045227, "num_tokens": 996969750.0, "step": 29760 }, { "epoch": 1.7768027698185291, "grad_norm": 0.5334903597831726, "learning_rate": 6.515790259678572e-06, "loss": 0.4614, "mean_token_accuracy": 0.8477156281471252, "num_tokens": 997137430.0, "step": 29765 }, { "epoch": 1.777101241642789, "grad_norm": 0.5010488629341125, "learning_rate": 6.511786031906511e-06, "loss": 0.3825, "mean_token_accuracy": 0.8697840809822083, "num_tokens": 997305110.0, "step": 29770 }, { "epoch": 1.7773997134670487, "grad_norm": 0.5318519473075867, "learning_rate": 6.507786916201359e-06, "loss": 0.3842, "mean_token_accuracy": 0.8707205057144165, "num_tokens": 997472790.0, "step": 29775 }, { "epoch": 1.7776981852913085, "grad_norm": 0.5182591676712036, "learning_rate": 6.503792913537177e-06, "loss": 0.4347, "mean_token_accuracy": 0.8557556986808776, "num_tokens": 997640470.0, "step": 29780 }, { "epoch": 1.7779966571155683, "grad_norm": 0.49705302715301514, "learning_rate": 6.499804024886776e-06, "loss": 0.4526, "mean_token_accuracy": 0.8486997485160828, "num_tokens": 997808150.0, "step": 29785 }, { "epoch": 1.778295128939828, "grad_norm": 0.5338529944419861, "learning_rate": 6.495820251221728e-06, "loss": 0.4119, "mean_token_accuracy": 0.8605806112289429, "num_tokens": 997969428.0, "step": 29790 }, { "epoch": 1.778593600764088, "grad_norm": 0.5711192488670349, "learning_rate": 6.491841593512346e-06, "loss": 0.4334, "mean_token_accuracy": 0.8553143262863159, "num_tokens": 998137108.0, "step": 29795 }, { "epoch": 1.7788920725883477, "grad_norm": 0.5651690363883972, "learning_rate": 6.4878680527277105e-06, "loss": 0.4488, "mean_token_accuracy": 0.8490635871887207, "num_tokens": 998304788.0, "step": 29800 }, { "epoch": 1.7791905444126075, "grad_norm": 0.515377938747406, "learning_rate": 6.483899629835651e-06, "loss": 0.4077, "mean_token_accuracy": 0.86315758228302, "num_tokens": 998472468.0, "step": 29805 }, { "epoch": 1.7794890162368673, "grad_norm": 0.5820438265800476, "learning_rate": 6.479936325802752e-06, "loss": 0.4448, "mean_token_accuracy": 0.8510855317115784, "num_tokens": 998640148.0, "step": 29810 }, { "epoch": 1.779787488061127, "grad_norm": 0.5081514716148376, "learning_rate": 6.475978141594345e-06, "loss": 0.4274, "mean_token_accuracy": 0.8555827260017395, "num_tokens": 998807828.0, "step": 29815 }, { "epoch": 1.7800859598853869, "grad_norm": 0.5771610736846924, "learning_rate": 6.472025078174514e-06, "loss": 0.4841, "mean_token_accuracy": 0.8405165195465087, "num_tokens": 998975508.0, "step": 29820 }, { "epoch": 1.7803844317096467, "grad_norm": 0.5426960587501526, "learning_rate": 6.4680771365061175e-06, "loss": 0.4183, "mean_token_accuracy": 0.8575092315673828, "num_tokens": 999143188.0, "step": 29825 }, { "epoch": 1.7806829035339065, "grad_norm": 0.4830979108810425, "learning_rate": 6.4641343175507336e-06, "loss": 0.4017, "mean_token_accuracy": 0.8635810613632202, "num_tokens": 999310868.0, "step": 29830 }, { "epoch": 1.780981375358166, "grad_norm": 0.5529038310050964, "learning_rate": 6.460196622268717e-06, "loss": 0.4335, "mean_token_accuracy": 0.854461407661438, "num_tokens": 999478548.0, "step": 29835 }, { "epoch": 1.781279847182426, "grad_norm": 0.5055702924728394, "learning_rate": 6.4562640516191595e-06, "loss": 0.4004, "mean_token_accuracy": 0.8650065660476685, "num_tokens": 999646228.0, "step": 29840 }, { "epoch": 1.7815783190066856, "grad_norm": 0.530823290348053, "learning_rate": 6.452336606559915e-06, "loss": 0.4214, "mean_token_accuracy": 0.8566145777702332, "num_tokens": 999813908.0, "step": 29845 }, { "epoch": 1.7818767908309456, "grad_norm": 0.5236109495162964, "learning_rate": 6.448414288047592e-06, "loss": 0.4201, "mean_token_accuracy": 0.8586305618286133, "num_tokens": 999981588.0, "step": 29850 }, { "epoch": 1.7821752626552052, "grad_norm": 0.4977428913116455, "learning_rate": 6.444497097037532e-06, "loss": 0.4557, "mean_token_accuracy": 0.8490516543388367, "num_tokens": 1000149268.0, "step": 29855 }, { "epoch": 1.7824737344794652, "grad_norm": 0.5108789205551147, "learning_rate": 6.4405850344838425e-06, "loss": 0.4315, "mean_token_accuracy": 0.8559883236885071, "num_tokens": 1000316948.0, "step": 29860 }, { "epoch": 1.7827722063037248, "grad_norm": 0.4991239905357361, "learning_rate": 6.436678101339383e-06, "loss": 0.3895, "mean_token_accuracy": 0.8672253370285035, "num_tokens": 1000484628.0, "step": 29865 }, { "epoch": 1.7830706781279848, "grad_norm": 0.5139074325561523, "learning_rate": 6.432776298555755e-06, "loss": 0.4312, "mean_token_accuracy": 0.8549803376197815, "num_tokens": 1000652308.0, "step": 29870 }, { "epoch": 1.7833691499522444, "grad_norm": 0.5655705332756042, "learning_rate": 6.428879627083317e-06, "loss": 0.4463, "mean_token_accuracy": 0.8506799459457397, "num_tokens": 1000819988.0, "step": 29875 }, { "epoch": 1.7836676217765044, "grad_norm": 0.507956862449646, "learning_rate": 6.424988087871175e-06, "loss": 0.4131, "mean_token_accuracy": 0.8603363990783691, "num_tokens": 1000987668.0, "step": 29880 }, { "epoch": 1.783966093600764, "grad_norm": 0.5413893461227417, "learning_rate": 6.421101681867181e-06, "loss": 0.4321, "mean_token_accuracy": 0.8563103914260864, "num_tokens": 1001155348.0, "step": 29885 }, { "epoch": 1.784264565425024, "grad_norm": 0.49874091148376465, "learning_rate": 6.417220410017946e-06, "loss": 0.4314, "mean_token_accuracy": 0.855851125717163, "num_tokens": 1001323028.0, "step": 29890 }, { "epoch": 1.7845630372492836, "grad_norm": 0.6097025871276855, "learning_rate": 6.413344273268823e-06, "loss": 0.4512, "mean_token_accuracy": 0.8496063470840454, "num_tokens": 1001490708.0, "step": 29895 }, { "epoch": 1.7848615090735436, "grad_norm": 0.5162869691848755, "learning_rate": 6.409473272563918e-06, "loss": 0.4323, "mean_token_accuracy": 0.8546641945838929, "num_tokens": 1001658388.0, "step": 29900 }, { "epoch": 1.7851599808978031, "grad_norm": 0.5996779799461365, "learning_rate": 6.405607408846083e-06, "loss": 0.4293, "mean_token_accuracy": 0.856942617893219, "num_tokens": 1001826068.0, "step": 29905 }, { "epoch": 1.7854584527220632, "grad_norm": 0.5370012521743774, "learning_rate": 6.401746683056927e-06, "loss": 0.4577, "mean_token_accuracy": 0.8476500153541565, "num_tokens": 1001993748.0, "step": 29910 }, { "epoch": 1.7857569245463227, "grad_norm": 0.5540712475776672, "learning_rate": 6.397891096136789e-06, "loss": 0.4736, "mean_token_accuracy": 0.8423714637756348, "num_tokens": 1002161428.0, "step": 29915 }, { "epoch": 1.7860553963705827, "grad_norm": 0.4775422513484955, "learning_rate": 6.394040649024777e-06, "loss": 0.4254, "mean_token_accuracy": 0.8566086292266846, "num_tokens": 1002329108.0, "step": 29920 }, { "epoch": 1.7863538681948423, "grad_norm": 0.45298928022384644, "learning_rate": 6.390195342658732e-06, "loss": 0.4006, "mean_token_accuracy": 0.8650840878486633, "num_tokens": 1002496788.0, "step": 29925 }, { "epoch": 1.7866523400191023, "grad_norm": 0.457699716091156, "learning_rate": 6.386355177975259e-06, "loss": 0.4396, "mean_token_accuracy": 0.8517356634140014, "num_tokens": 1002664468.0, "step": 29930 }, { "epoch": 1.786950811843362, "grad_norm": 0.49803197383880615, "learning_rate": 6.382520155909696e-06, "loss": 0.4217, "mean_token_accuracy": 0.8583979368209839, "num_tokens": 1002832148.0, "step": 29935 }, { "epoch": 1.787249283667622, "grad_norm": 0.5499160885810852, "learning_rate": 6.378690277396133e-06, "loss": 0.4345, "mean_token_accuracy": 0.8553560853004456, "num_tokens": 1002999828.0, "step": 29940 }, { "epoch": 1.7875477554918815, "grad_norm": 0.5389583706855774, "learning_rate": 6.374865543367406e-06, "loss": 0.4742, "mean_token_accuracy": 0.8422223448753356, "num_tokens": 1003167508.0, "step": 29945 }, { "epoch": 1.7878462273161415, "grad_norm": 0.5455766916275024, "learning_rate": 6.371045954755098e-06, "loss": 0.4444, "mean_token_accuracy": 0.8525766491889953, "num_tokens": 1003335188.0, "step": 29950 }, { "epoch": 1.788144699140401, "grad_norm": 0.4734271764755249, "learning_rate": 6.3672315124895475e-06, "loss": 0.4064, "mean_token_accuracy": 0.8629249691963196, "num_tokens": 1003502868.0, "step": 29955 }, { "epoch": 1.788443170964661, "grad_norm": 0.5003581643104553, "learning_rate": 6.36342221749983e-06, "loss": 0.4504, "mean_token_accuracy": 0.8496063590049744, "num_tokens": 1003670548.0, "step": 29960 }, { "epoch": 1.7887416427889207, "grad_norm": 0.53315269947052, "learning_rate": 6.359618070713763e-06, "loss": 0.4222, "mean_token_accuracy": 0.8570141911506652, "num_tokens": 1003838228.0, "step": 29965 }, { "epoch": 1.7890401146131805, "grad_norm": 0.5205037593841553, "learning_rate": 6.3558190730579254e-06, "loss": 0.4318, "mean_token_accuracy": 0.8561970710754394, "num_tokens": 1004005908.0, "step": 29970 }, { "epoch": 1.7893385864374403, "grad_norm": 0.49853748083114624, "learning_rate": 6.352025225457628e-06, "loss": 0.4156, "mean_token_accuracy": 0.859310507774353, "num_tokens": 1004173588.0, "step": 29975 }, { "epoch": 1.7896370582617, "grad_norm": 0.4901517629623413, "learning_rate": 6.348236528836933e-06, "loss": 0.426, "mean_token_accuracy": 0.8556977391242981, "num_tokens": 1004336302.0, "step": 29980 }, { "epoch": 1.7899355300859598, "grad_norm": 0.4826200604438782, "learning_rate": 6.3444529841186495e-06, "loss": 0.431, "mean_token_accuracy": 0.8551771402359009, "num_tokens": 1004503982.0, "step": 29985 }, { "epoch": 1.7902340019102196, "grad_norm": 0.4971453845500946, "learning_rate": 6.340674592224327e-06, "loss": 0.4506, "mean_token_accuracy": 0.8485446691513061, "num_tokens": 1004671662.0, "step": 29990 }, { "epoch": 1.7905324737344794, "grad_norm": 0.4662124216556549, "learning_rate": 6.3369013540742694e-06, "loss": 0.3942, "mean_token_accuracy": 0.868233323097229, "num_tokens": 1004839342.0, "step": 29995 }, { "epoch": 1.7908309455587392, "grad_norm": 0.5221835970878601, "learning_rate": 6.333133270587506e-06, "loss": 0.4243, "mean_token_accuracy": 0.8579923629760742, "num_tokens": 1005007022.0, "step": 30000 }, { "epoch": 1.791129417382999, "grad_norm": 0.5196115970611572, "learning_rate": 6.329370342681833e-06, "loss": 0.4223, "mean_token_accuracy": 0.8587856411933898, "num_tokens": 1005174702.0, "step": 30005 }, { "epoch": 1.7914278892072588, "grad_norm": 0.4869367480278015, "learning_rate": 6.3256125712737755e-06, "loss": 0.4207, "mean_token_accuracy": 0.8585052967071534, "num_tokens": 1005342382.0, "step": 30010 }, { "epoch": 1.7917263610315186, "grad_norm": 0.4613809287548065, "learning_rate": 6.3218599572786144e-06, "loss": 0.4149, "mean_token_accuracy": 0.8595669746398926, "num_tokens": 1005510062.0, "step": 30015 }, { "epoch": 1.7920248328557784, "grad_norm": 0.54756098985672, "learning_rate": 6.318112501610367e-06, "loss": 0.4387, "mean_token_accuracy": 0.8521114230155945, "num_tokens": 1005677742.0, "step": 30020 }, { "epoch": 1.7923233046800382, "grad_norm": 0.5355969071388245, "learning_rate": 6.314370205181783e-06, "loss": 0.4551, "mean_token_accuracy": 0.8488011479377746, "num_tokens": 1005845422.0, "step": 30025 }, { "epoch": 1.792621776504298, "grad_norm": 0.45079246163368225, "learning_rate": 6.310633068904386e-06, "loss": 0.4153, "mean_token_accuracy": 0.8608195066452027, "num_tokens": 1006013102.0, "step": 30030 }, { "epoch": 1.7929202483285578, "grad_norm": 0.5127429366111755, "learning_rate": 6.3069010936884135e-06, "loss": 0.4152, "mean_token_accuracy": 0.8595431089401245, "num_tokens": 1006180782.0, "step": 30035 }, { "epoch": 1.7932187201528176, "grad_norm": 0.5833861231803894, "learning_rate": 6.303174280442858e-06, "loss": 0.452, "mean_token_accuracy": 0.8508409857749939, "num_tokens": 1006348462.0, "step": 30040 }, { "epoch": 1.7935171919770774, "grad_norm": 0.502337634563446, "learning_rate": 6.299452630075456e-06, "loss": 0.4288, "mean_token_accuracy": 0.8551592588424682, "num_tokens": 1006516142.0, "step": 30045 }, { "epoch": 1.7938156638013372, "grad_norm": 0.5145968794822693, "learning_rate": 6.295736143492683e-06, "loss": 0.4381, "mean_token_accuracy": 0.8540617823600769, "num_tokens": 1006683822.0, "step": 30050 }, { "epoch": 1.794114135625597, "grad_norm": 0.48628824949264526, "learning_rate": 6.292024821599762e-06, "loss": 0.4722, "mean_token_accuracy": 0.8416378259658813, "num_tokens": 1006851502.0, "step": 30055 }, { "epoch": 1.7944126074498568, "grad_norm": 0.5729851126670837, "learning_rate": 6.288318665300645e-06, "loss": 0.4725, "mean_token_accuracy": 0.843922221660614, "num_tokens": 1007019182.0, "step": 30060 }, { "epoch": 1.7947110792741165, "grad_norm": 0.46259039640426636, "learning_rate": 6.284617675498041e-06, "loss": 0.3711, "mean_token_accuracy": 0.8740904331207275, "num_tokens": 1007186862.0, "step": 30065 }, { "epoch": 1.7950095510983763, "grad_norm": 0.4978337585926056, "learning_rate": 6.280921853093394e-06, "loss": 0.3941, "mean_token_accuracy": 0.8675533771514893, "num_tokens": 1007354542.0, "step": 30070 }, { "epoch": 1.7953080229226361, "grad_norm": 0.5172361135482788, "learning_rate": 6.277231198986885e-06, "loss": 0.4487, "mean_token_accuracy": 0.8510974526405335, "num_tokens": 1007522222.0, "step": 30075 }, { "epoch": 1.795606494746896, "grad_norm": 0.4804052412509918, "learning_rate": 6.273545714077454e-06, "loss": 0.424, "mean_token_accuracy": 0.8576524019241333, "num_tokens": 1007689902.0, "step": 30080 }, { "epoch": 1.7959049665711557, "grad_norm": 0.4806613624095917, "learning_rate": 6.269865399262753e-06, "loss": 0.3956, "mean_token_accuracy": 0.8665453910827636, "num_tokens": 1007857582.0, "step": 30085 }, { "epoch": 1.7962034383954155, "grad_norm": 0.5243979692459106, "learning_rate": 6.266190255439194e-06, "loss": 0.4265, "mean_token_accuracy": 0.8571633100509644, "num_tokens": 1008025262.0, "step": 30090 }, { "epoch": 1.7965019102196753, "grad_norm": 0.5323292016983032, "learning_rate": 6.262520283501938e-06, "loss": 0.4229, "mean_token_accuracy": 0.8592687606811523, "num_tokens": 1008192942.0, "step": 30095 }, { "epoch": 1.796800382043935, "grad_norm": 0.5139323472976685, "learning_rate": 6.258855484344862e-06, "loss": 0.4318, "mean_token_accuracy": 0.8544256210327148, "num_tokens": 1008360622.0, "step": 30100 }, { "epoch": 1.797098853868195, "grad_norm": 0.4848935008049011, "learning_rate": 6.255195858860602e-06, "loss": 0.397, "mean_token_accuracy": 0.865752112865448, "num_tokens": 1008528302.0, "step": 30105 }, { "epoch": 1.7973973256924545, "grad_norm": 0.5255609750747681, "learning_rate": 6.251541407940517e-06, "loss": 0.4215, "mean_token_accuracy": 0.8580639481544494, "num_tokens": 1008695982.0, "step": 30110 }, { "epoch": 1.7976957975167145, "grad_norm": 0.49564024806022644, "learning_rate": 6.247892132474729e-06, "loss": 0.4461, "mean_token_accuracy": 0.8502326130867004, "num_tokens": 1008863662.0, "step": 30115 }, { "epoch": 1.797994269340974, "grad_norm": 0.5161488652229309, "learning_rate": 6.244248033352084e-06, "loss": 0.4301, "mean_token_accuracy": 0.8570976972579956, "num_tokens": 1009031342.0, "step": 30120 }, { "epoch": 1.798292741165234, "grad_norm": 0.4784206748008728, "learning_rate": 6.240609111460163e-06, "loss": 0.4134, "mean_token_accuracy": 0.858821427822113, "num_tokens": 1009199022.0, "step": 30125 }, { "epoch": 1.7985912129894936, "grad_norm": 0.5764876008033752, "learning_rate": 6.236975367685295e-06, "loss": 0.3997, "mean_token_accuracy": 0.8661755919456482, "num_tokens": 1009366702.0, "step": 30130 }, { "epoch": 1.7988896848137537, "grad_norm": 0.5112322568893433, "learning_rate": 6.23334680291255e-06, "loss": 0.4148, "mean_token_accuracy": 0.8599308013916016, "num_tokens": 1009534382.0, "step": 30135 }, { "epoch": 1.7991881566380132, "grad_norm": 0.47579818964004517, "learning_rate": 6.229723418025729e-06, "loss": 0.3801, "mean_token_accuracy": 0.8712871193885803, "num_tokens": 1009702062.0, "step": 30140 }, { "epoch": 1.7994866284622733, "grad_norm": 0.4868124723434448, "learning_rate": 6.226105213907373e-06, "loss": 0.4179, "mean_token_accuracy": 0.8603721737861634, "num_tokens": 1009869742.0, "step": 30145 }, { "epoch": 1.7997851002865328, "grad_norm": 0.5073914527893066, "learning_rate": 6.222492191438758e-06, "loss": 0.4627, "mean_token_accuracy": 0.8444172739982605, "num_tokens": 1010037422.0, "step": 30150 }, { "epoch": 1.8000835721107928, "grad_norm": 0.5191833972930908, "learning_rate": 6.218884351499913e-06, "loss": 0.4261, "mean_token_accuracy": 0.8585053086280823, "num_tokens": 1010205102.0, "step": 30155 }, { "epoch": 1.8003820439350524, "grad_norm": 0.4738912880420685, "learning_rate": 6.215281694969587e-06, "loss": 0.3829, "mean_token_accuracy": 0.8716271042823791, "num_tokens": 1010372782.0, "step": 30160 }, { "epoch": 1.8006805157593124, "grad_norm": 0.5197004079818726, "learning_rate": 6.211684222725278e-06, "loss": 0.401, "mean_token_accuracy": 0.8648276329040527, "num_tokens": 1010540462.0, "step": 30165 }, { "epoch": 1.800978987583572, "grad_norm": 0.5202313661575317, "learning_rate": 6.20809193564321e-06, "loss": 0.4036, "mean_token_accuracy": 0.8631754636764526, "num_tokens": 1010708142.0, "step": 30170 }, { "epoch": 1.801277459407832, "grad_norm": 0.5416523814201355, "learning_rate": 6.204504834598354e-06, "loss": 0.462, "mean_token_accuracy": 0.8463437795639038, "num_tokens": 1010875822.0, "step": 30175 }, { "epoch": 1.8015759312320916, "grad_norm": 0.4549430012702942, "learning_rate": 6.200922920464421e-06, "loss": 0.4142, "mean_token_accuracy": 0.8605928778648376, "num_tokens": 1011043502.0, "step": 30180 }, { "epoch": 1.8018744030563516, "grad_norm": 0.5418521165847778, "learning_rate": 6.197346194113842e-06, "loss": 0.4349, "mean_token_accuracy": 0.8541631817817688, "num_tokens": 1011211182.0, "step": 30185 }, { "epoch": 1.8021728748806112, "grad_norm": 0.5701766610145569, "learning_rate": 6.1937746564178e-06, "loss": 0.4103, "mean_token_accuracy": 0.8625790357589722, "num_tokens": 1011378862.0, "step": 30190 }, { "epoch": 1.8024713467048712, "grad_norm": 0.52357017993927, "learning_rate": 6.190208308246208e-06, "loss": 0.4213, "mean_token_accuracy": 0.8581474423408508, "num_tokens": 1011546542.0, "step": 30195 }, { "epoch": 1.8027698185291308, "grad_norm": 0.5457358360290527, "learning_rate": 6.186647150467716e-06, "loss": 0.4454, "mean_token_accuracy": 0.8509125471115112, "num_tokens": 1011714222.0, "step": 30200 }, { "epoch": 1.8030682903533908, "grad_norm": 0.539545476436615, "learning_rate": 6.183091183949715e-06, "loss": 0.4601, "mean_token_accuracy": 0.8465406179428101, "num_tokens": 1011881902.0, "step": 30205 }, { "epoch": 1.8033667621776504, "grad_norm": 0.49626877903938293, "learning_rate": 6.179540409558316e-06, "loss": 0.4137, "mean_token_accuracy": 0.8608970522880555, "num_tokens": 1012049582.0, "step": 30210 }, { "epoch": 1.8036652340019104, "grad_norm": 0.49910977482795715, "learning_rate": 6.175994828158381e-06, "loss": 0.4283, "mean_token_accuracy": 0.8575629353523254, "num_tokens": 1012217262.0, "step": 30215 }, { "epoch": 1.80396370582617, "grad_norm": 0.5258999466896057, "learning_rate": 6.172454440613502e-06, "loss": 0.4367, "mean_token_accuracy": 0.8520577430725098, "num_tokens": 1012384942.0, "step": 30220 }, { "epoch": 1.80426217765043, "grad_norm": 0.4913251996040344, "learning_rate": 6.168919247786005e-06, "loss": 0.4112, "mean_token_accuracy": 0.8620839834213256, "num_tokens": 1012552622.0, "step": 30225 }, { "epoch": 1.8045606494746895, "grad_norm": 0.4836638569831848, "learning_rate": 6.165389250536957e-06, "loss": 0.4311, "mean_token_accuracy": 0.8542824864387513, "num_tokens": 1012720302.0, "step": 30230 }, { "epoch": 1.8048591212989495, "grad_norm": 0.5360139012336731, "learning_rate": 6.161864449726139e-06, "loss": 0.4506, "mean_token_accuracy": 0.8507157325744629, "num_tokens": 1012887982.0, "step": 30235 }, { "epoch": 1.8051575931232091, "grad_norm": 0.5072572231292725, "learning_rate": 6.158344846212099e-06, "loss": 0.4152, "mean_token_accuracy": 0.8602051615715027, "num_tokens": 1013055662.0, "step": 30240 }, { "epoch": 1.805456064947469, "grad_norm": 0.5142276287078857, "learning_rate": 6.154830440852092e-06, "loss": 0.3919, "mean_token_accuracy": 0.8670165777206421, "num_tokens": 1013223342.0, "step": 30245 }, { "epoch": 1.8057545367717287, "grad_norm": 0.5045875906944275, "learning_rate": 6.151321234502116e-06, "loss": 0.418, "mean_token_accuracy": 0.8589705348014831, "num_tokens": 1013391022.0, "step": 30250 }, { "epoch": 1.8060530085959885, "grad_norm": 0.5202556848526001, "learning_rate": 6.147817228016904e-06, "loss": 0.4196, "mean_token_accuracy": 0.8584038972854614, "num_tokens": 1013558702.0, "step": 30255 }, { "epoch": 1.8063514804202483, "grad_norm": 0.500163197517395, "learning_rate": 6.144318422249924e-06, "loss": 0.4447, "mean_token_accuracy": 0.852105462551117, "num_tokens": 1013726382.0, "step": 30260 }, { "epoch": 1.806649952244508, "grad_norm": 0.5260593891143799, "learning_rate": 6.140824818053378e-06, "loss": 0.4485, "mean_token_accuracy": 0.8506381869316101, "num_tokens": 1013894062.0, "step": 30265 }, { "epoch": 1.8069484240687679, "grad_norm": 0.4753541350364685, "learning_rate": 6.137336416278186e-06, "loss": 0.4088, "mean_token_accuracy": 0.8640641808509827, "num_tokens": 1014061742.0, "step": 30270 }, { "epoch": 1.8072468958930277, "grad_norm": 0.47642311453819275, "learning_rate": 6.133853217774019e-06, "loss": 0.3893, "mean_token_accuracy": 0.8676488041877747, "num_tokens": 1014229422.0, "step": 30275 }, { "epoch": 1.8075453677172875, "grad_norm": 0.523440420627594, "learning_rate": 6.130375223389276e-06, "loss": 0.3957, "mean_token_accuracy": 0.86489919424057, "num_tokens": 1014397102.0, "step": 30280 }, { "epoch": 1.8078438395415473, "grad_norm": 0.5024499893188477, "learning_rate": 6.126902433971086e-06, "loss": 0.4415, "mean_token_accuracy": 0.8528868079185485, "num_tokens": 1014564782.0, "step": 30285 }, { "epoch": 1.808142311365807, "grad_norm": 0.4526674747467041, "learning_rate": 6.123434850365314e-06, "loss": 0.3982, "mean_token_accuracy": 0.8654956459999085, "num_tokens": 1014732462.0, "step": 30290 }, { "epoch": 1.8084407831900668, "grad_norm": 0.510155200958252, "learning_rate": 6.119972473416544e-06, "loss": 0.4131, "mean_token_accuracy": 0.8604735612869263, "num_tokens": 1014900142.0, "step": 30295 }, { "epoch": 1.8087392550143266, "grad_norm": 0.47911566495895386, "learning_rate": 6.116515303968114e-06, "loss": 0.4057, "mean_token_accuracy": 0.8639985799789429, "num_tokens": 1015067822.0, "step": 30300 }, { "epoch": 1.8090377268385864, "grad_norm": 0.5345131158828735, "learning_rate": 6.1130633428620705e-06, "loss": 0.4371, "mean_token_accuracy": 0.8538768887519836, "num_tokens": 1015235502.0, "step": 30305 }, { "epoch": 1.8093361986628462, "grad_norm": 0.5456638336181641, "learning_rate": 6.109616590939211e-06, "loss": 0.4267, "mean_token_accuracy": 0.8576643228530884, "num_tokens": 1015403182.0, "step": 30310 }, { "epoch": 1.809634670487106, "grad_norm": 0.5286844968795776, "learning_rate": 6.1061750490390504e-06, "loss": 0.4282, "mean_token_accuracy": 0.8555171132087708, "num_tokens": 1015570862.0, "step": 30315 }, { "epoch": 1.8099331423113658, "grad_norm": 0.5617997050285339, "learning_rate": 6.102738717999842e-06, "loss": 0.4752, "mean_token_accuracy": 0.8416020512580872, "num_tokens": 1015738542.0, "step": 30320 }, { "epoch": 1.8102316141356256, "grad_norm": 0.4980163872241974, "learning_rate": 6.099307598658568e-06, "loss": 0.4194, "mean_token_accuracy": 0.8586007237434388, "num_tokens": 1015906222.0, "step": 30325 }, { "epoch": 1.8105300859598854, "grad_norm": 0.4981996715068817, "learning_rate": 6.095881691850939e-06, "loss": 0.427, "mean_token_accuracy": 0.8558928847312928, "num_tokens": 1016073902.0, "step": 30330 }, { "epoch": 1.8108285577841452, "grad_norm": 0.5117992758750916, "learning_rate": 6.092460998411397e-06, "loss": 0.4266, "mean_token_accuracy": 0.8573660969734191, "num_tokens": 1016241582.0, "step": 30335 }, { "epoch": 1.811127029608405, "grad_norm": 0.5172623991966248, "learning_rate": 6.089045519173118e-06, "loss": 0.4211, "mean_token_accuracy": 0.8579685091972351, "num_tokens": 1016409262.0, "step": 30340 }, { "epoch": 1.8114255014326648, "grad_norm": 0.5884139537811279, "learning_rate": 6.085635254968003e-06, "loss": 0.4614, "mean_token_accuracy": 0.8454789519309998, "num_tokens": 1016576942.0, "step": 30345 }, { "epoch": 1.8117239732569246, "grad_norm": 0.532335638999939, "learning_rate": 6.082230206626689e-06, "loss": 0.4447, "mean_token_accuracy": 0.8511392116546631, "num_tokens": 1016744622.0, "step": 30350 }, { "epoch": 1.8120224450811844, "grad_norm": 0.48214107751846313, "learning_rate": 6.078830374978531e-06, "loss": 0.4089, "mean_token_accuracy": 0.8626148223876953, "num_tokens": 1016912302.0, "step": 30355 }, { "epoch": 1.8123209169054442, "grad_norm": 0.5337048768997192, "learning_rate": 6.075435760851626e-06, "loss": 0.4478, "mean_token_accuracy": 0.8519205570220947, "num_tokens": 1017079982.0, "step": 30360 }, { "epoch": 1.812619388729704, "grad_norm": 0.5389066338539124, "learning_rate": 6.072046365072793e-06, "loss": 0.4507, "mean_token_accuracy": 0.8505367994308471, "num_tokens": 1017247662.0, "step": 30365 }, { "epoch": 1.8129178605539638, "grad_norm": 0.515815258026123, "learning_rate": 6.068662188467582e-06, "loss": 0.4133, "mean_token_accuracy": 0.8625551581382751, "num_tokens": 1017415342.0, "step": 30370 }, { "epoch": 1.8132163323782235, "grad_norm": 0.4780847728252411, "learning_rate": 6.065283231860274e-06, "loss": 0.4383, "mean_token_accuracy": 0.8542824745178222, "num_tokens": 1017583022.0, "step": 30375 }, { "epoch": 1.8135148042024833, "grad_norm": 0.5041197538375854, "learning_rate": 6.061909496073872e-06, "loss": 0.4329, "mean_token_accuracy": 0.8557795524597168, "num_tokens": 1017750702.0, "step": 30380 }, { "epoch": 1.813813276026743, "grad_norm": 0.5082278847694397, "learning_rate": 6.05854098193012e-06, "loss": 0.4529, "mean_token_accuracy": 0.849021828174591, "num_tokens": 1017918382.0, "step": 30385 }, { "epoch": 1.814111747851003, "grad_norm": 0.5243868827819824, "learning_rate": 6.055177690249469e-06, "loss": 0.4374, "mean_token_accuracy": 0.8529166221618653, "num_tokens": 1018086062.0, "step": 30390 }, { "epoch": 1.8144102196752625, "grad_norm": 0.5364031195640564, "learning_rate": 6.051819621851121e-06, "loss": 0.3963, "mean_token_accuracy": 0.8664738178253174, "num_tokens": 1018253742.0, "step": 30395 }, { "epoch": 1.8147086914995225, "grad_norm": 0.46478471159935, "learning_rate": 6.0484667775529895e-06, "loss": 0.404, "mean_token_accuracy": 0.8640164613723755, "num_tokens": 1018421422.0, "step": 30400 }, { "epoch": 1.815007163323782, "grad_norm": 0.48922744393348694, "learning_rate": 6.045119158171727e-06, "loss": 0.3969, "mean_token_accuracy": 0.866628897190094, "num_tokens": 1018589102.0, "step": 30405 }, { "epoch": 1.815305635148042, "grad_norm": 0.583345353603363, "learning_rate": 6.041776764522707e-06, "loss": 0.457, "mean_token_accuracy": 0.8483120560646057, "num_tokens": 1018756782.0, "step": 30410 }, { "epoch": 1.8156041069723017, "grad_norm": 0.5307941436767578, "learning_rate": 6.038439597420024e-06, "loss": 0.4051, "mean_token_accuracy": 0.8624537825584412, "num_tokens": 1018924462.0, "step": 30415 }, { "epoch": 1.8159025787965617, "grad_norm": 0.5566131472587585, "learning_rate": 6.035107657676513e-06, "loss": 0.4351, "mean_token_accuracy": 0.8534355163574219, "num_tokens": 1019092142.0, "step": 30420 }, { "epoch": 1.8162010506208213, "grad_norm": 0.4648083448410034, "learning_rate": 6.0317809461037324e-06, "loss": 0.392, "mean_token_accuracy": 0.8670404314994812, "num_tokens": 1019259822.0, "step": 30425 }, { "epoch": 1.8164995224450813, "grad_norm": 0.4756131172180176, "learning_rate": 6.028459463511958e-06, "loss": 0.4008, "mean_token_accuracy": 0.8640701293945312, "num_tokens": 1019427502.0, "step": 30430 }, { "epoch": 1.8167979942693409, "grad_norm": 0.5145769715309143, "learning_rate": 6.0251432107102015e-06, "loss": 0.4107, "mean_token_accuracy": 0.8606525182723999, "num_tokens": 1019595182.0, "step": 30435 }, { "epoch": 1.8170964660936009, "grad_norm": 0.5484784841537476, "learning_rate": 6.021832188506193e-06, "loss": 0.4058, "mean_token_accuracy": 0.8620064496994019, "num_tokens": 1019762862.0, "step": 30440 }, { "epoch": 1.8173949379178604, "grad_norm": 0.5580423474311829, "learning_rate": 6.018526397706399e-06, "loss": 0.4609, "mean_token_accuracy": 0.8464332580566406, "num_tokens": 1019930542.0, "step": 30445 }, { "epoch": 1.8176934097421205, "grad_norm": 0.6324069499969482, "learning_rate": 6.015225839116006e-06, "loss": 0.4179, "mean_token_accuracy": 0.8596206545829773, "num_tokens": 1020098222.0, "step": 30450 }, { "epoch": 1.81799188156638, "grad_norm": 0.45643118023872375, "learning_rate": 6.0119305135389205e-06, "loss": 0.413, "mean_token_accuracy": 0.8611237049102783, "num_tokens": 1020265902.0, "step": 30455 }, { "epoch": 1.81829035339064, "grad_norm": 0.5177029967308044, "learning_rate": 6.008640421777781e-06, "loss": 0.403, "mean_token_accuracy": 0.8635035157203674, "num_tokens": 1020433582.0, "step": 30460 }, { "epoch": 1.8185888252148996, "grad_norm": 0.5298128128051758, "learning_rate": 6.005355564633954e-06, "loss": 0.4297, "mean_token_accuracy": 0.8553620457649231, "num_tokens": 1020601262.0, "step": 30465 }, { "epoch": 1.8188872970391596, "grad_norm": 0.5516754984855652, "learning_rate": 6.002075942907528e-06, "loss": 0.4449, "mean_token_accuracy": 0.8510676264762879, "num_tokens": 1020768942.0, "step": 30470 }, { "epoch": 1.8191857688634192, "grad_norm": 0.5264891386032104, "learning_rate": 5.998801557397308e-06, "loss": 0.4343, "mean_token_accuracy": 0.8535369157791137, "num_tokens": 1020936622.0, "step": 30475 }, { "epoch": 1.8194842406876792, "grad_norm": 0.5354034900665283, "learning_rate": 5.995532408900836e-06, "loss": 0.444, "mean_token_accuracy": 0.8525706768035889, "num_tokens": 1021104302.0, "step": 30480 }, { "epoch": 1.8197827125119388, "grad_norm": 0.5818821787834167, "learning_rate": 5.992268498214374e-06, "loss": 0.4433, "mean_token_accuracy": 0.8516939043998718, "num_tokens": 1021271982.0, "step": 30485 }, { "epoch": 1.8200811843361988, "grad_norm": 0.5167210698127747, "learning_rate": 5.989009826132905e-06, "loss": 0.4558, "mean_token_accuracy": 0.8479064702987671, "num_tokens": 1021439662.0, "step": 30490 }, { "epoch": 1.8203796561604584, "grad_norm": 0.50235915184021, "learning_rate": 5.985756393450143e-06, "loss": 0.4145, "mean_token_accuracy": 0.8593641877174377, "num_tokens": 1021607342.0, "step": 30495 }, { "epoch": 1.8206781279847184, "grad_norm": 0.4829601049423218, "learning_rate": 5.982508200958512e-06, "loss": 0.4185, "mean_token_accuracy": 0.858057975769043, "num_tokens": 1021775022.0, "step": 30500 }, { "epoch": 1.820976599808978, "grad_norm": 0.5765755772590637, "learning_rate": 5.979265249449182e-06, "loss": 0.4443, "mean_token_accuracy": 0.85235595703125, "num_tokens": 1021942702.0, "step": 30505 }, { "epoch": 1.821275071633238, "grad_norm": 0.6675260663032532, "learning_rate": 5.976027539712028e-06, "loss": 0.4657, "mean_token_accuracy": 0.8453059792518616, "num_tokens": 1022110382.0, "step": 30510 }, { "epoch": 1.8215735434574976, "grad_norm": 0.6030436754226685, "learning_rate": 5.972795072535652e-06, "loss": 0.4436, "mean_token_accuracy": 0.8529643177986145, "num_tokens": 1022278062.0, "step": 30515 }, { "epoch": 1.8218720152817574, "grad_norm": 0.5055321455001831, "learning_rate": 5.969567848707382e-06, "loss": 0.407, "mean_token_accuracy": 0.8628235697746277, "num_tokens": 1022445742.0, "step": 30520 }, { "epoch": 1.8221704871060171, "grad_norm": 0.5094475150108337, "learning_rate": 5.9663458690132715e-06, "loss": 0.4299, "mean_token_accuracy": 0.8551950216293335, "num_tokens": 1022613422.0, "step": 30525 }, { "epoch": 1.822468958930277, "grad_norm": 0.5414401292800903, "learning_rate": 5.963129134238087e-06, "loss": 0.4438, "mean_token_accuracy": 0.8517654657363891, "num_tokens": 1022781102.0, "step": 30530 }, { "epoch": 1.8227674307545367, "grad_norm": 0.5134124755859375, "learning_rate": 5.9599176451653305e-06, "loss": 0.4275, "mean_token_accuracy": 0.8573004961013794, "num_tokens": 1022948782.0, "step": 30535 }, { "epoch": 1.8230659025787965, "grad_norm": 0.5222249627113342, "learning_rate": 5.956711402577213e-06, "loss": 0.4075, "mean_token_accuracy": 0.8628056764602661, "num_tokens": 1023116462.0, "step": 30540 }, { "epoch": 1.8233643744030563, "grad_norm": 0.5483086705207825, "learning_rate": 5.953510407254677e-06, "loss": 0.4206, "mean_token_accuracy": 0.8589884281158447, "num_tokens": 1023284142.0, "step": 30545 }, { "epoch": 1.8236628462273161, "grad_norm": 0.5105926394462585, "learning_rate": 5.950314659977386e-06, "loss": 0.4368, "mean_token_accuracy": 0.855338191986084, "num_tokens": 1023451822.0, "step": 30550 }, { "epoch": 1.823961318051576, "grad_norm": 0.48213180899620056, "learning_rate": 5.947124161523719e-06, "loss": 0.4238, "mean_token_accuracy": 0.8586245775222778, "num_tokens": 1023619502.0, "step": 30555 }, { "epoch": 1.8242597898758357, "grad_norm": 0.48848965764045715, "learning_rate": 5.943938912670785e-06, "loss": 0.4308, "mean_token_accuracy": 0.8550578713417053, "num_tokens": 1023787182.0, "step": 30560 }, { "epoch": 1.8245582617000955, "grad_norm": 0.4824397563934326, "learning_rate": 5.940758914194404e-06, "loss": 0.431, "mean_token_accuracy": 0.8564058184623718, "num_tokens": 1023954862.0, "step": 30565 }, { "epoch": 1.8248567335243553, "grad_norm": 0.49501392245292664, "learning_rate": 5.937584166869134e-06, "loss": 0.4153, "mean_token_accuracy": 0.8611058115959167, "num_tokens": 1024122542.0, "step": 30570 }, { "epoch": 1.825155205348615, "grad_norm": 0.4652409255504608, "learning_rate": 5.934414671468232e-06, "loss": 0.4041, "mean_token_accuracy": 0.8645293951034546, "num_tokens": 1024290222.0, "step": 30575 }, { "epoch": 1.8254536771728749, "grad_norm": 0.48045143485069275, "learning_rate": 5.931250428763696e-06, "loss": 0.378, "mean_token_accuracy": 0.8716092228889465, "num_tokens": 1024457902.0, "step": 30580 }, { "epoch": 1.8257521489971347, "grad_norm": 0.5033023357391357, "learning_rate": 5.928091439526226e-06, "loss": 0.3789, "mean_token_accuracy": 0.8707622766494751, "num_tokens": 1024625582.0, "step": 30585 }, { "epoch": 1.8260506208213945, "grad_norm": 0.5048587322235107, "learning_rate": 5.924937704525259e-06, "loss": 0.416, "mean_token_accuracy": 0.8606584668159485, "num_tokens": 1024793262.0, "step": 30590 }, { "epoch": 1.8263490926456543, "grad_norm": 0.5610945820808411, "learning_rate": 5.921789224528949e-06, "loss": 0.4605, "mean_token_accuracy": 0.8468030452728271, "num_tokens": 1024960942.0, "step": 30595 }, { "epoch": 1.826647564469914, "grad_norm": 0.5296248197555542, "learning_rate": 5.918646000304156e-06, "loss": 0.4155, "mean_token_accuracy": 0.8622748494148255, "num_tokens": 1025128622.0, "step": 30600 }, { "epoch": 1.8269460362941738, "grad_norm": 0.5809566974639893, "learning_rate": 5.915508032616478e-06, "loss": 0.4531, "mean_token_accuracy": 0.8490635871887207, "num_tokens": 1025296302.0, "step": 30605 }, { "epoch": 1.8272445081184336, "grad_norm": 0.5218061804771423, "learning_rate": 5.912375322230222e-06, "loss": 0.4565, "mean_token_accuracy": 0.848884642124176, "num_tokens": 1025463982.0, "step": 30610 }, { "epoch": 1.8275429799426934, "grad_norm": 0.49793973565101624, "learning_rate": 5.909247869908417e-06, "loss": 0.4328, "mean_token_accuracy": 0.854825246334076, "num_tokens": 1025631662.0, "step": 30615 }, { "epoch": 1.8278414517669532, "grad_norm": 0.4951666593551636, "learning_rate": 5.906125676412817e-06, "loss": 0.407, "mean_token_accuracy": 0.8635810613632202, "num_tokens": 1025799342.0, "step": 30620 }, { "epoch": 1.828139923591213, "grad_norm": 0.5134680271148682, "learning_rate": 5.903008742503879e-06, "loss": 0.4169, "mean_token_accuracy": 0.8579685091972351, "num_tokens": 1025967022.0, "step": 30625 }, { "epoch": 1.8284383954154728, "grad_norm": 0.5246886610984802, "learning_rate": 5.899897068940805e-06, "loss": 0.4037, "mean_token_accuracy": 0.8634021162986756, "num_tokens": 1026134702.0, "step": 30630 }, { "epoch": 1.8287368672397326, "grad_norm": 0.5466708540916443, "learning_rate": 5.8967906564814866e-06, "loss": 0.4756, "mean_token_accuracy": 0.8419599175453186, "num_tokens": 1026302382.0, "step": 30635 }, { "epoch": 1.8290353390639924, "grad_norm": 0.4633078873157501, "learning_rate": 5.8936895058825545e-06, "loss": 0.4183, "mean_token_accuracy": 0.8600679874420166, "num_tokens": 1026470062.0, "step": 30640 }, { "epoch": 1.8293338108882522, "grad_norm": 0.4778256118297577, "learning_rate": 5.89059361789935e-06, "loss": 0.4117, "mean_token_accuracy": 0.8613741993904114, "num_tokens": 1026637742.0, "step": 30645 }, { "epoch": 1.829632282712512, "grad_norm": 0.5174081921577454, "learning_rate": 5.887502993285934e-06, "loss": 0.4258, "mean_token_accuracy": 0.8562209129333496, "num_tokens": 1026805422.0, "step": 30650 }, { "epoch": 1.8299307545367718, "grad_norm": 0.46859246492385864, "learning_rate": 5.884417632795086e-06, "loss": 0.4118, "mean_token_accuracy": 0.8600739598274231, "num_tokens": 1026973102.0, "step": 30655 }, { "epoch": 1.8302292263610314, "grad_norm": 0.45793935656547546, "learning_rate": 5.8813375371783e-06, "loss": 0.4128, "mean_token_accuracy": 0.861165452003479, "num_tokens": 1027140782.0, "step": 30660 }, { "epoch": 1.8305276981852914, "grad_norm": 0.5306112766265869, "learning_rate": 5.878262707185792e-06, "loss": 0.4176, "mean_token_accuracy": 0.8616724252700806, "num_tokens": 1027308462.0, "step": 30665 }, { "epoch": 1.830826170009551, "grad_norm": 0.5088845491409302, "learning_rate": 5.875193143566493e-06, "loss": 0.4141, "mean_token_accuracy": 0.8621555447578431, "num_tokens": 1027476142.0, "step": 30670 }, { "epoch": 1.831124641833811, "grad_norm": 0.4992761015892029, "learning_rate": 5.8721288470680525e-06, "loss": 0.4296, "mean_token_accuracy": 0.8566980838775635, "num_tokens": 1027643822.0, "step": 30675 }, { "epoch": 1.8314231136580705, "grad_norm": 0.5080541372299194, "learning_rate": 5.869069818436838e-06, "loss": 0.3732, "mean_token_accuracy": 0.8748896598815918, "num_tokens": 1027811502.0, "step": 30680 }, { "epoch": 1.8317215854823305, "grad_norm": 0.5191560983657837, "learning_rate": 5.866016058417927e-06, "loss": 0.4143, "mean_token_accuracy": 0.8610700130462646, "num_tokens": 1027979182.0, "step": 30685 }, { "epoch": 1.8320200573065901, "grad_norm": 0.495977520942688, "learning_rate": 5.862967567755124e-06, "loss": 0.417, "mean_token_accuracy": 0.8614875316619873, "num_tokens": 1028146862.0, "step": 30690 }, { "epoch": 1.8323185291308501, "grad_norm": 0.48497673869132996, "learning_rate": 5.859924347190942e-06, "loss": 0.389, "mean_token_accuracy": 0.8695345759391785, "num_tokens": 1028314302.0, "step": 30695 }, { "epoch": 1.8326170009551097, "grad_norm": 0.4743047058582306, "learning_rate": 5.856886397466617e-06, "loss": 0.4492, "mean_token_accuracy": 0.8495467066764831, "num_tokens": 1028481982.0, "step": 30700 }, { "epoch": 1.8329154727793697, "grad_norm": 0.5589334964752197, "learning_rate": 5.853853719322099e-06, "loss": 0.4554, "mean_token_accuracy": 0.8488071084022522, "num_tokens": 1028649662.0, "step": 30705 }, { "epoch": 1.8332139446036293, "grad_norm": 0.5527417063713074, "learning_rate": 5.850826313496043e-06, "loss": 0.4206, "mean_token_accuracy": 0.8598830938339234, "num_tokens": 1028817342.0, "step": 30710 }, { "epoch": 1.8335124164278893, "grad_norm": 0.5253627300262451, "learning_rate": 5.8478041807258425e-06, "loss": 0.4416, "mean_token_accuracy": 0.8509185314178467, "num_tokens": 1028985022.0, "step": 30715 }, { "epoch": 1.8338108882521489, "grad_norm": 0.5398898124694824, "learning_rate": 5.844787321747584e-06, "loss": 0.4285, "mean_token_accuracy": 0.855952525138855, "num_tokens": 1029152702.0, "step": 30720 }, { "epoch": 1.834109360076409, "grad_norm": 0.575414776802063, "learning_rate": 5.841775737296084e-06, "loss": 0.4511, "mean_token_accuracy": 0.8481927633285522, "num_tokens": 1029320382.0, "step": 30725 }, { "epoch": 1.8344078319006685, "grad_norm": 0.5164406895637512, "learning_rate": 5.838769428104867e-06, "loss": 0.4312, "mean_token_accuracy": 0.8546045660972595, "num_tokens": 1029488062.0, "step": 30730 }, { "epoch": 1.8347063037249285, "grad_norm": 0.5350559949874878, "learning_rate": 5.835768394906176e-06, "loss": 0.4204, "mean_token_accuracy": 0.8575271368026733, "num_tokens": 1029655742.0, "step": 30735 }, { "epoch": 1.835004775549188, "grad_norm": 0.46778568625450134, "learning_rate": 5.832772638430973e-06, "loss": 0.4207, "mean_token_accuracy": 0.8587438821792602, "num_tokens": 1029823422.0, "step": 30740 }, { "epoch": 1.835303247373448, "grad_norm": 0.52099609375, "learning_rate": 5.829782159408919e-06, "loss": 0.4327, "mean_token_accuracy": 0.8545697569847107, "num_tokens": 1029990264.0, "step": 30745 }, { "epoch": 1.8356017191977076, "grad_norm": 0.49678269028663635, "learning_rate": 5.826796958568407e-06, "loss": 0.4147, "mean_token_accuracy": 0.8594536542892456, "num_tokens": 1030157944.0, "step": 30750 }, { "epoch": 1.8359001910219677, "grad_norm": 0.5183316469192505, "learning_rate": 5.823817036636541e-06, "loss": 0.4461, "mean_token_accuracy": 0.8510139584541321, "num_tokens": 1030325624.0, "step": 30755 }, { "epoch": 1.8361986628462272, "grad_norm": 0.48756346106529236, "learning_rate": 5.820842394339129e-06, "loss": 0.4095, "mean_token_accuracy": 0.8618014693260193, "num_tokens": 1030491801.0, "step": 30760 }, { "epoch": 1.8364971346704873, "grad_norm": 0.5023455619812012, "learning_rate": 5.817873032400707e-06, "loss": 0.3823, "mean_token_accuracy": 0.8708338379859925, "num_tokens": 1030659481.0, "step": 30765 }, { "epoch": 1.8367956064947468, "grad_norm": 0.5409082174301147, "learning_rate": 5.814908951544509e-06, "loss": 0.464, "mean_token_accuracy": 0.8456936597824096, "num_tokens": 1030827161.0, "step": 30770 }, { "epoch": 1.8370940783190068, "grad_norm": 0.48703309893608093, "learning_rate": 5.811950152492503e-06, "loss": 0.422, "mean_token_accuracy": 0.8569008827209472, "num_tokens": 1030994841.0, "step": 30775 }, { "epoch": 1.8373925501432664, "grad_norm": 0.5385554432868958, "learning_rate": 5.8089966359653494e-06, "loss": 0.4417, "mean_token_accuracy": 0.8538112759590148, "num_tokens": 1031162521.0, "step": 30780 }, { "epoch": 1.8376910219675264, "grad_norm": 0.5271511673927307, "learning_rate": 5.806048402682435e-06, "loss": 0.45, "mean_token_accuracy": 0.8497017741203308, "num_tokens": 1031330201.0, "step": 30785 }, { "epoch": 1.837989493791786, "grad_norm": 0.50701504945755, "learning_rate": 5.803105453361856e-06, "loss": 0.3984, "mean_token_accuracy": 0.866115951538086, "num_tokens": 1031497881.0, "step": 30790 }, { "epoch": 1.8382879656160458, "grad_norm": 0.4824557602405548, "learning_rate": 5.800167788720423e-06, "loss": 0.3946, "mean_token_accuracy": 0.8666587233543396, "num_tokens": 1031665561.0, "step": 30795 }, { "epoch": 1.8385864374403056, "grad_norm": 0.5363814234733582, "learning_rate": 5.797235409473661e-06, "loss": 0.4524, "mean_token_accuracy": 0.8493081331253052, "num_tokens": 1031833241.0, "step": 30800 }, { "epoch": 1.8388849092645654, "grad_norm": 0.5110020637512207, "learning_rate": 5.794308316335798e-06, "loss": 0.4058, "mean_token_accuracy": 0.8607777714729309, "num_tokens": 1032000921.0, "step": 30805 }, { "epoch": 1.8391833810888252, "grad_norm": 0.5159292221069336, "learning_rate": 5.791386510019786e-06, "loss": 0.4365, "mean_token_accuracy": 0.851741623878479, "num_tokens": 1032168601.0, "step": 30810 }, { "epoch": 1.839481852913085, "grad_norm": 0.5154623985290527, "learning_rate": 5.788469991237288e-06, "loss": 0.4223, "mean_token_accuracy": 0.8582667231559753, "num_tokens": 1032336281.0, "step": 30815 }, { "epoch": 1.8397803247373448, "grad_norm": 0.5138291716575623, "learning_rate": 5.78555876069867e-06, "loss": 0.4061, "mean_token_accuracy": 0.8626983165740967, "num_tokens": 1032503961.0, "step": 30820 }, { "epoch": 1.8400787965616046, "grad_norm": 0.5058844685554504, "learning_rate": 5.78265281911302e-06, "loss": 0.4252, "mean_token_accuracy": 0.856214952468872, "num_tokens": 1032671641.0, "step": 30825 }, { "epoch": 1.8403772683858644, "grad_norm": 0.5139539837837219, "learning_rate": 5.779752167188129e-06, "loss": 0.391, "mean_token_accuracy": 0.867881429195404, "num_tokens": 1032839321.0, "step": 30830 }, { "epoch": 1.8406757402101241, "grad_norm": 0.5310842394828796, "learning_rate": 5.776856805630508e-06, "loss": 0.457, "mean_token_accuracy": 0.8469521760940552, "num_tokens": 1033007001.0, "step": 30835 }, { "epoch": 1.840974212034384, "grad_norm": 0.5083591341972351, "learning_rate": 5.773966735145379e-06, "loss": 0.4439, "mean_token_accuracy": 0.8496719598770142, "num_tokens": 1033174681.0, "step": 30840 }, { "epoch": 1.8412726838586437, "grad_norm": 0.5695140957832336, "learning_rate": 5.771081956436667e-06, "loss": 0.4333, "mean_token_accuracy": 0.8546045541763305, "num_tokens": 1033342361.0, "step": 30845 }, { "epoch": 1.8415711556829035, "grad_norm": 0.557837963104248, "learning_rate": 5.768202470207014e-06, "loss": 0.4544, "mean_token_accuracy": 0.8481629490852356, "num_tokens": 1033510041.0, "step": 30850 }, { "epoch": 1.8418696275071633, "grad_norm": 0.5029990077018738, "learning_rate": 5.7653282771577705e-06, "loss": 0.4244, "mean_token_accuracy": 0.8581653237342834, "num_tokens": 1033677721.0, "step": 30855 }, { "epoch": 1.8421680993314231, "grad_norm": 0.5400208830833435, "learning_rate": 5.762459377989002e-06, "loss": 0.4578, "mean_token_accuracy": 0.8476798295974731, "num_tokens": 1033845401.0, "step": 30860 }, { "epoch": 1.842466571155683, "grad_norm": 0.44514694809913635, "learning_rate": 5.759595773399483e-06, "loss": 0.4171, "mean_token_accuracy": 0.8604079723358155, "num_tokens": 1034013081.0, "step": 30865 }, { "epoch": 1.8427650429799427, "grad_norm": 0.49066242575645447, "learning_rate": 5.756737464086695e-06, "loss": 0.4167, "mean_token_accuracy": 0.8607479333877563, "num_tokens": 1034180761.0, "step": 30870 }, { "epoch": 1.8430635148042025, "grad_norm": 0.47519221901893616, "learning_rate": 5.753884450746831e-06, "loss": 0.3917, "mean_token_accuracy": 0.8672492027282714, "num_tokens": 1034348441.0, "step": 30875 }, { "epoch": 1.8433619866284623, "grad_norm": 0.4528384804725647, "learning_rate": 5.7510367340747965e-06, "loss": 0.3736, "mean_token_accuracy": 0.8736132621765137, "num_tokens": 1034516121.0, "step": 30880 }, { "epoch": 1.843660458452722, "grad_norm": 0.512131929397583, "learning_rate": 5.7481943147642065e-06, "loss": 0.4181, "mean_token_accuracy": 0.8582190155982972, "num_tokens": 1034683801.0, "step": 30885 }, { "epoch": 1.8439589302769819, "grad_norm": 0.451454222202301, "learning_rate": 5.745357193507382e-06, "loss": 0.4176, "mean_token_accuracy": 0.8610879063606263, "num_tokens": 1034851481.0, "step": 30890 }, { "epoch": 1.8442574021012417, "grad_norm": 0.507235050201416, "learning_rate": 5.742525370995356e-06, "loss": 0.3913, "mean_token_accuracy": 0.867517602443695, "num_tokens": 1035019161.0, "step": 30895 }, { "epoch": 1.8445558739255015, "grad_norm": 0.5010097026824951, "learning_rate": 5.739698847917875e-06, "loss": 0.4036, "mean_token_accuracy": 0.8648056149482727, "num_tokens": 1035182536.0, "step": 30900 }, { "epoch": 1.8448543457497613, "grad_norm": 0.49267691373825073, "learning_rate": 5.736877624963391e-06, "loss": 0.4026, "mean_token_accuracy": 0.8634002923965454, "num_tokens": 1035345405.0, "step": 30905 }, { "epoch": 1.845152817574021, "grad_norm": 0.5309245586395264, "learning_rate": 5.734061702819062e-06, "loss": 0.4389, "mean_token_accuracy": 0.8526780486106873, "num_tokens": 1035513085.0, "step": 30910 }, { "epoch": 1.8454512893982808, "grad_norm": 0.5029569864273071, "learning_rate": 5.731251082170753e-06, "loss": 0.4231, "mean_token_accuracy": 0.8572885513305664, "num_tokens": 1035680765.0, "step": 30915 }, { "epoch": 1.8457497612225406, "grad_norm": 0.4928448498249054, "learning_rate": 5.728445763703051e-06, "loss": 0.4143, "mean_token_accuracy": 0.8612489461898803, "num_tokens": 1035848445.0, "step": 30920 }, { "epoch": 1.8460482330468004, "grad_norm": 0.49496352672576904, "learning_rate": 5.725645748099244e-06, "loss": 0.4179, "mean_token_accuracy": 0.8594178676605224, "num_tokens": 1036016125.0, "step": 30925 }, { "epoch": 1.8463467048710602, "grad_norm": 0.5211257934570312, "learning_rate": 5.72285103604132e-06, "loss": 0.4233, "mean_token_accuracy": 0.8571275115013123, "num_tokens": 1036183805.0, "step": 30930 }, { "epoch": 1.8466451766953198, "grad_norm": 0.5315496325492859, "learning_rate": 5.720061628209986e-06, "loss": 0.4357, "mean_token_accuracy": 0.8540737390518188, "num_tokens": 1036351485.0, "step": 30935 }, { "epoch": 1.8469436485195798, "grad_norm": 0.4949491024017334, "learning_rate": 5.717277525284655e-06, "loss": 0.4245, "mean_token_accuracy": 0.8582309484481812, "num_tokens": 1036519165.0, "step": 30940 }, { "epoch": 1.8472421203438394, "grad_norm": 0.5251035094261169, "learning_rate": 5.714498727943445e-06, "loss": 0.4002, "mean_token_accuracy": 0.8639925837516784, "num_tokens": 1036686845.0, "step": 30945 }, { "epoch": 1.8475405921680994, "grad_norm": 0.5131856203079224, "learning_rate": 5.711725236863185e-06, "loss": 0.4447, "mean_token_accuracy": 0.8512644648551941, "num_tokens": 1036854525.0, "step": 30950 }, { "epoch": 1.847839063992359, "grad_norm": 0.5783919095993042, "learning_rate": 5.7089570527194075e-06, "loss": 0.4403, "mean_token_accuracy": 0.8528092622756958, "num_tokens": 1037022205.0, "step": 30955 }, { "epoch": 1.848137535816619, "grad_norm": 0.5095967650413513, "learning_rate": 5.70619417618636e-06, "loss": 0.4258, "mean_token_accuracy": 0.8565430045127869, "num_tokens": 1037189885.0, "step": 30960 }, { "epoch": 1.8484360076408786, "grad_norm": 0.45795413851737976, "learning_rate": 5.703436607936984e-06, "loss": 0.3981, "mean_token_accuracy": 0.8656566858291626, "num_tokens": 1037357565.0, "step": 30965 }, { "epoch": 1.8487344794651386, "grad_norm": 0.47025933861732483, "learning_rate": 5.700684348642941e-06, "loss": 0.3886, "mean_token_accuracy": 0.8674400568008422, "num_tokens": 1037525245.0, "step": 30970 }, { "epoch": 1.8490329512893982, "grad_norm": 0.4744865894317627, "learning_rate": 5.697937398974596e-06, "loss": 0.4174, "mean_token_accuracy": 0.8595848679542542, "num_tokens": 1037692925.0, "step": 30975 }, { "epoch": 1.8493314231136582, "grad_norm": 0.47892388701438904, "learning_rate": 5.695195759601016e-06, "loss": 0.3958, "mean_token_accuracy": 0.8661994576454163, "num_tokens": 1037860605.0, "step": 30980 }, { "epoch": 1.8496298949379177, "grad_norm": 0.4974650740623474, "learning_rate": 5.6924594311899844e-06, "loss": 0.4008, "mean_token_accuracy": 0.865263032913208, "num_tokens": 1038028285.0, "step": 30985 }, { "epoch": 1.8499283667621778, "grad_norm": 0.4725826680660248, "learning_rate": 5.689728414407974e-06, "loss": 0.4194, "mean_token_accuracy": 0.8601992130279541, "num_tokens": 1038195965.0, "step": 30990 }, { "epoch": 1.8502268385864373, "grad_norm": 0.5021836161613464, "learning_rate": 5.6870027099201805e-06, "loss": 0.4397, "mean_token_accuracy": 0.8525169849395752, "num_tokens": 1038363645.0, "step": 30995 }, { "epoch": 1.8505253104106973, "grad_norm": 0.5390616059303284, "learning_rate": 5.6842823183905e-06, "loss": 0.405, "mean_token_accuracy": 0.8640283823013306, "num_tokens": 1038531325.0, "step": 31000 }, { "epoch": 1.850823782234957, "grad_norm": 0.5269567370414734, "learning_rate": 5.681567240481531e-06, "loss": 0.4363, "mean_token_accuracy": 0.8528390884399414, "num_tokens": 1038699005.0, "step": 31005 }, { "epoch": 1.851122254059217, "grad_norm": 0.4809347987174988, "learning_rate": 5.678857476854587e-06, "loss": 0.4282, "mean_token_accuracy": 0.8572050571441651, "num_tokens": 1038866685.0, "step": 31010 }, { "epoch": 1.8514207258834765, "grad_norm": 0.5040122866630554, "learning_rate": 5.676153028169674e-06, "loss": 0.4321, "mean_token_accuracy": 0.854360020160675, "num_tokens": 1039034365.0, "step": 31015 }, { "epoch": 1.8517191977077365, "grad_norm": 0.5039914846420288, "learning_rate": 5.67345389508551e-06, "loss": 0.4338, "mean_token_accuracy": 0.8536263942718506, "num_tokens": 1039202045.0, "step": 31020 }, { "epoch": 1.852017669531996, "grad_norm": 0.49863916635513306, "learning_rate": 5.670760078259525e-06, "loss": 0.4026, "mean_token_accuracy": 0.865263032913208, "num_tokens": 1039369725.0, "step": 31025 }, { "epoch": 1.852316141356256, "grad_norm": 0.493740975856781, "learning_rate": 5.668071578347844e-06, "loss": 0.4279, "mean_token_accuracy": 0.857055950164795, "num_tokens": 1039537405.0, "step": 31030 }, { "epoch": 1.8526146131805157, "grad_norm": 0.5113048553466797, "learning_rate": 5.665388396005303e-06, "loss": 0.3987, "mean_token_accuracy": 0.8637361288070678, "num_tokens": 1039705085.0, "step": 31035 }, { "epoch": 1.8529130850047757, "grad_norm": 0.5170607566833496, "learning_rate": 5.662710531885434e-06, "loss": 0.4453, "mean_token_accuracy": 0.8512048244476318, "num_tokens": 1039872765.0, "step": 31040 }, { "epoch": 1.8532115568290353, "grad_norm": 0.5332815647125244, "learning_rate": 5.660037986640491e-06, "loss": 0.438, "mean_token_accuracy": 0.8533162355422974, "num_tokens": 1040040445.0, "step": 31045 }, { "epoch": 1.8535100286532953, "grad_norm": 0.5392531752586365, "learning_rate": 5.657370760921413e-06, "loss": 0.4272, "mean_token_accuracy": 0.8567696452140808, "num_tokens": 1040208125.0, "step": 31050 }, { "epoch": 1.8538085004775549, "grad_norm": 0.4770124554634094, "learning_rate": 5.6547088553778545e-06, "loss": 0.4211, "mean_token_accuracy": 0.8585530281066894, "num_tokens": 1040375805.0, "step": 31055 }, { "epoch": 1.8541069723018149, "grad_norm": 0.5287703275680542, "learning_rate": 5.652052270658172e-06, "loss": 0.4705, "mean_token_accuracy": 0.8429619550704956, "num_tokens": 1040543485.0, "step": 31060 }, { "epoch": 1.8544054441260744, "grad_norm": 0.49176666140556335, "learning_rate": 5.6494010074094265e-06, "loss": 0.4423, "mean_token_accuracy": 0.8530597686767578, "num_tokens": 1040711165.0, "step": 31065 }, { "epoch": 1.8547039159503342, "grad_norm": 0.5171224474906921, "learning_rate": 5.646755066277384e-06, "loss": 0.4387, "mean_token_accuracy": 0.8530299425125122, "num_tokens": 1040878845.0, "step": 31070 }, { "epoch": 1.855002387774594, "grad_norm": 0.4887007772922516, "learning_rate": 5.644114447906508e-06, "loss": 0.4481, "mean_token_accuracy": 0.8498687744140625, "num_tokens": 1041046525.0, "step": 31075 }, { "epoch": 1.8553008595988538, "grad_norm": 0.49851658940315247, "learning_rate": 5.6414791529399684e-06, "loss": 0.4355, "mean_token_accuracy": 0.8541274070739746, "num_tokens": 1041214205.0, "step": 31080 }, { "epoch": 1.8555993314231136, "grad_norm": 0.4828285872936249, "learning_rate": 5.638849182019645e-06, "loss": 0.427, "mean_token_accuracy": 0.8561732053756714, "num_tokens": 1041381885.0, "step": 31085 }, { "epoch": 1.8558978032473734, "grad_norm": 0.5649977922439575, "learning_rate": 5.636224535786117e-06, "loss": 0.4407, "mean_token_accuracy": 0.8528686165809631, "num_tokens": 1041543202.0, "step": 31090 }, { "epoch": 1.8561962750716332, "grad_norm": 0.5959926247596741, "learning_rate": 5.6336052148786605e-06, "loss": 0.4319, "mean_token_accuracy": 0.8543063282966614, "num_tokens": 1041710882.0, "step": 31095 }, { "epoch": 1.856494746895893, "grad_norm": 0.4736967086791992, "learning_rate": 5.6309912199352595e-06, "loss": 0.4475, "mean_token_accuracy": 0.8506322264671325, "num_tokens": 1041878562.0, "step": 31100 }, { "epoch": 1.8567932187201528, "grad_norm": 0.5529529452323914, "learning_rate": 5.628382551592605e-06, "loss": 0.4591, "mean_token_accuracy": 0.8482762813568115, "num_tokens": 1042046242.0, "step": 31105 }, { "epoch": 1.8570916905444126, "grad_norm": 0.4477444291114807, "learning_rate": 5.625779210486084e-06, "loss": 0.4064, "mean_token_accuracy": 0.8632768869400025, "num_tokens": 1042213922.0, "step": 31110 }, { "epoch": 1.8573901623686724, "grad_norm": 0.512795090675354, "learning_rate": 5.623181197249785e-06, "loss": 0.4731, "mean_token_accuracy": 0.8422879695892334, "num_tokens": 1042381602.0, "step": 31115 }, { "epoch": 1.8576886341929322, "grad_norm": 0.5364097952842712, "learning_rate": 5.620588512516507e-06, "loss": 0.4087, "mean_token_accuracy": 0.8622569441795349, "num_tokens": 1042549282.0, "step": 31120 }, { "epoch": 1.857987106017192, "grad_norm": 0.5044190287590027, "learning_rate": 5.618001156917744e-06, "loss": 0.4242, "mean_token_accuracy": 0.8563938975334168, "num_tokens": 1042716962.0, "step": 31125 }, { "epoch": 1.8582855778414518, "grad_norm": 0.4930665194988251, "learning_rate": 5.6154191310836975e-06, "loss": 0.4285, "mean_token_accuracy": 0.8569485902786255, "num_tokens": 1042884642.0, "step": 31130 }, { "epoch": 1.8585840496657116, "grad_norm": 0.43919238448143005, "learning_rate": 5.612842435643263e-06, "loss": 0.4016, "mean_token_accuracy": 0.8655254602432251, "num_tokens": 1043052322.0, "step": 31135 }, { "epoch": 1.8588825214899714, "grad_norm": 0.5250810980796814, "learning_rate": 5.610271071224042e-06, "loss": 0.4134, "mean_token_accuracy": 0.8605869054794312, "num_tokens": 1043220002.0, "step": 31140 }, { "epoch": 1.8591809933142311, "grad_norm": 0.4844275712966919, "learning_rate": 5.6077050384523405e-06, "loss": 0.4349, "mean_token_accuracy": 0.8537755131721496, "num_tokens": 1043387682.0, "step": 31145 }, { "epoch": 1.859479465138491, "grad_norm": 0.573715090751648, "learning_rate": 5.605144337953162e-06, "loss": 0.4765, "mean_token_accuracy": 0.8424907445907592, "num_tokens": 1043555362.0, "step": 31150 }, { "epoch": 1.8597779369627507, "grad_norm": 0.47320133447647095, "learning_rate": 5.602588970350216e-06, "loss": 0.4385, "mean_token_accuracy": 0.8532029151916504, "num_tokens": 1043723042.0, "step": 31155 }, { "epoch": 1.8600764087870105, "grad_norm": 0.6306964755058289, "learning_rate": 5.6000389362659035e-06, "loss": 0.4136, "mean_token_accuracy": 0.86192946434021, "num_tokens": 1043883614.0, "step": 31160 }, { "epoch": 1.8603748806112703, "grad_norm": 0.477536678314209, "learning_rate": 5.597494236321334e-06, "loss": 0.4103, "mean_token_accuracy": 0.8599069476127624, "num_tokens": 1044051294.0, "step": 31165 }, { "epoch": 1.8606733524355301, "grad_norm": 0.5094892382621765, "learning_rate": 5.594954871136321e-06, "loss": 0.4436, "mean_token_accuracy": 0.8513777852058411, "num_tokens": 1044218974.0, "step": 31170 }, { "epoch": 1.86097182425979, "grad_norm": 0.4471518397331238, "learning_rate": 5.592420841329364e-06, "loss": 0.4208, "mean_token_accuracy": 0.858320415019989, "num_tokens": 1044386654.0, "step": 31175 }, { "epoch": 1.8612702960840497, "grad_norm": 0.5190898776054382, "learning_rate": 5.589892147517684e-06, "loss": 0.4323, "mean_token_accuracy": 0.8539067149162293, "num_tokens": 1044554334.0, "step": 31180 }, { "epoch": 1.8615687679083095, "grad_norm": 0.5199024677276611, "learning_rate": 5.587368790317184e-06, "loss": 0.4238, "mean_token_accuracy": 0.8567040443420411, "num_tokens": 1044722014.0, "step": 31185 }, { "epoch": 1.8618672397325693, "grad_norm": 0.5011622905731201, "learning_rate": 5.584850770342475e-06, "loss": 0.4493, "mean_token_accuracy": 0.8490337610244751, "num_tokens": 1044889694.0, "step": 31190 }, { "epoch": 1.862165711556829, "grad_norm": 0.5112608075141907, "learning_rate": 5.582338088206869e-06, "loss": 0.4384, "mean_token_accuracy": 0.8545508623123169, "num_tokens": 1045057374.0, "step": 31195 }, { "epoch": 1.8624641833810889, "grad_norm": 0.5377089381217957, "learning_rate": 5.579830744522375e-06, "loss": 0.4339, "mean_token_accuracy": 0.8547954201698303, "num_tokens": 1045225054.0, "step": 31200 }, { "epoch": 1.8627626552053487, "grad_norm": 0.5588016510009766, "learning_rate": 5.577328739899703e-06, "loss": 0.4548, "mean_token_accuracy": 0.8471967101097106, "num_tokens": 1045392734.0, "step": 31205 }, { "epoch": 1.8630611270296082, "grad_norm": 0.6160690784454346, "learning_rate": 5.574832074948264e-06, "loss": 0.4379, "mean_token_accuracy": 0.8541095018386841, "num_tokens": 1045560414.0, "step": 31210 }, { "epoch": 1.8633595988538683, "grad_norm": 0.5562750101089478, "learning_rate": 5.572340750276166e-06, "loss": 0.4241, "mean_token_accuracy": 0.8574674844741821, "num_tokens": 1045728094.0, "step": 31215 }, { "epoch": 1.8636580706781278, "grad_norm": 0.5025776624679565, "learning_rate": 5.569854766490215e-06, "loss": 0.4739, "mean_token_accuracy": 0.8429500102996826, "num_tokens": 1045895774.0, "step": 31220 }, { "epoch": 1.8639565425023878, "grad_norm": 0.5171743035316467, "learning_rate": 5.567374124195916e-06, "loss": 0.4428, "mean_token_accuracy": 0.8515209436416626, "num_tokens": 1046063454.0, "step": 31225 }, { "epoch": 1.8642550143266474, "grad_norm": 0.49668410420417786, "learning_rate": 5.564898823997484e-06, "loss": 0.4135, "mean_token_accuracy": 0.8606405854225159, "num_tokens": 1046231134.0, "step": 31230 }, { "epoch": 1.8645534861509074, "grad_norm": 0.5170344114303589, "learning_rate": 5.562428866497817e-06, "loss": 0.409, "mean_token_accuracy": 0.8612370371818543, "num_tokens": 1046398814.0, "step": 31235 }, { "epoch": 1.864851957975167, "grad_norm": 0.4819069802761078, "learning_rate": 5.559964252298523e-06, "loss": 0.3992, "mean_token_accuracy": 0.8660205125808715, "num_tokens": 1046566494.0, "step": 31240 }, { "epoch": 1.865150429799427, "grad_norm": 0.5160818099975586, "learning_rate": 5.557504981999899e-06, "loss": 0.4216, "mean_token_accuracy": 0.857932710647583, "num_tokens": 1046734174.0, "step": 31245 }, { "epoch": 1.8654489016236866, "grad_norm": 0.46007150411605835, "learning_rate": 5.555051056200949e-06, "loss": 0.3957, "mean_token_accuracy": 0.8672014832496643, "num_tokens": 1046901854.0, "step": 31250 }, { "epoch": 1.8657473734479466, "grad_norm": 0.4852820336818695, "learning_rate": 5.552602475499375e-06, "loss": 0.412, "mean_token_accuracy": 0.8617917180061341, "num_tokens": 1047069534.0, "step": 31255 }, { "epoch": 1.8660458452722062, "grad_norm": 0.5058529376983643, "learning_rate": 5.5501592404915664e-06, "loss": 0.3849, "mean_token_accuracy": 0.8685196161270141, "num_tokens": 1047237214.0, "step": 31260 }, { "epoch": 1.8663443170964662, "grad_norm": 0.526887834072113, "learning_rate": 5.547721351772624e-06, "loss": 0.4406, "mean_token_accuracy": 0.8532088875770569, "num_tokens": 1047404894.0, "step": 31265 }, { "epoch": 1.8666427889207258, "grad_norm": 0.5444082021713257, "learning_rate": 5.54528880993634e-06, "loss": 0.4298, "mean_token_accuracy": 0.854324221611023, "num_tokens": 1047572574.0, "step": 31270 }, { "epoch": 1.8669412607449858, "grad_norm": 0.4989696741104126, "learning_rate": 5.542861615575201e-06, "loss": 0.4205, "mean_token_accuracy": 0.8583621621131897, "num_tokens": 1047740254.0, "step": 31275 }, { "epoch": 1.8672397325692454, "grad_norm": 0.5281450748443604, "learning_rate": 5.540439769280403e-06, "loss": 0.4368, "mean_token_accuracy": 0.8532088756561279, "num_tokens": 1047907934.0, "step": 31280 }, { "epoch": 1.8675382043935054, "grad_norm": 0.4540364444255829, "learning_rate": 5.538023271641821e-06, "loss": 0.4293, "mean_token_accuracy": 0.8563342452049255, "num_tokens": 1048075614.0, "step": 31285 }, { "epoch": 1.867836676217765, "grad_norm": 0.4731082022190094, "learning_rate": 5.535612123248045e-06, "loss": 0.4109, "mean_token_accuracy": 0.8609893798828125, "num_tokens": 1048238899.0, "step": 31290 }, { "epoch": 1.868135148042025, "grad_norm": 0.6066111922264099, "learning_rate": 5.533206324686354e-06, "loss": 0.4409, "mean_token_accuracy": 0.853483247756958, "num_tokens": 1048406579.0, "step": 31295 }, { "epoch": 1.8684336198662845, "grad_norm": 0.5273709893226624, "learning_rate": 5.5308058765427205e-06, "loss": 0.4191, "mean_token_accuracy": 0.8587259888648987, "num_tokens": 1048574259.0, "step": 31300 }, { "epoch": 1.8687320916905446, "grad_norm": 0.49483057856559753, "learning_rate": 5.528410779401821e-06, "loss": 0.4204, "mean_token_accuracy": 0.8597041726112366, "num_tokens": 1048741939.0, "step": 31305 }, { "epoch": 1.8690305635148041, "grad_norm": 0.5066244602203369, "learning_rate": 5.526021033847026e-06, "loss": 0.4282, "mean_token_accuracy": 0.8569843769073486, "num_tokens": 1048909619.0, "step": 31310 }, { "epoch": 1.8693290353390641, "grad_norm": 0.48427721858024597, "learning_rate": 5.523636640460405e-06, "loss": 0.4212, "mean_token_accuracy": 0.8600918531417847, "num_tokens": 1049077299.0, "step": 31315 }, { "epoch": 1.8696275071633237, "grad_norm": 0.46243947744369507, "learning_rate": 5.521257599822711e-06, "loss": 0.4414, "mean_token_accuracy": 0.8525647044181823, "num_tokens": 1049244979.0, "step": 31320 }, { "epoch": 1.8699259789875837, "grad_norm": 0.5038584470748901, "learning_rate": 5.518883912513413e-06, "loss": 0.4388, "mean_token_accuracy": 0.8539246201515198, "num_tokens": 1049412659.0, "step": 31325 }, { "epoch": 1.8702244508118433, "grad_norm": 0.48492860794067383, "learning_rate": 5.5165155791106635e-06, "loss": 0.4196, "mean_token_accuracy": 0.8589764952659606, "num_tokens": 1049580339.0, "step": 31330 }, { "epoch": 1.8705229226361033, "grad_norm": 0.5504201054573059, "learning_rate": 5.514152600191311e-06, "loss": 0.4257, "mean_token_accuracy": 0.8546463131904602, "num_tokens": 1049748019.0, "step": 31335 }, { "epoch": 1.8708213944603629, "grad_norm": 0.5138734579086304, "learning_rate": 5.511794976330908e-06, "loss": 0.44, "mean_token_accuracy": 0.8532685160636901, "num_tokens": 1049915699.0, "step": 31340 }, { "epoch": 1.8711198662846227, "grad_norm": 0.5428465604782104, "learning_rate": 5.509442708103692e-06, "loss": 0.4199, "mean_token_accuracy": 0.859071922302246, "num_tokens": 1050083379.0, "step": 31345 }, { "epoch": 1.8714183381088825, "grad_norm": 0.554255485534668, "learning_rate": 5.507095796082603e-06, "loss": 0.4291, "mean_token_accuracy": 0.8551055669784546, "num_tokens": 1050251059.0, "step": 31350 }, { "epoch": 1.8717168099331423, "grad_norm": 0.4803463816642761, "learning_rate": 5.504754240839277e-06, "loss": 0.3983, "mean_token_accuracy": 0.8641178727149963, "num_tokens": 1050418739.0, "step": 31355 }, { "epoch": 1.872015281757402, "grad_norm": 0.5302906632423401, "learning_rate": 5.502418042944041e-06, "loss": 0.4397, "mean_token_accuracy": 0.8524573564529419, "num_tokens": 1050586419.0, "step": 31360 }, { "epoch": 1.8723137535816619, "grad_norm": 0.5589760541915894, "learning_rate": 5.500087202965919e-06, "loss": 0.4184, "mean_token_accuracy": 0.8608970522880555, "num_tokens": 1050754099.0, "step": 31365 }, { "epoch": 1.8726122254059216, "grad_norm": 0.5340816378593445, "learning_rate": 5.497761721472629e-06, "loss": 0.4462, "mean_token_accuracy": 0.8521531581878662, "num_tokens": 1050921779.0, "step": 31370 }, { "epoch": 1.8729106972301814, "grad_norm": 0.556317150592804, "learning_rate": 5.495441599030591e-06, "loss": 0.4465, "mean_token_accuracy": 0.8506620526313782, "num_tokens": 1051089459.0, "step": 31375 }, { "epoch": 1.8732091690544412, "grad_norm": 0.5510767102241516, "learning_rate": 5.493126836204907e-06, "loss": 0.4374, "mean_token_accuracy": 0.8553322315216064, "num_tokens": 1051257139.0, "step": 31380 }, { "epoch": 1.873507640878701, "grad_norm": 0.4932164251804352, "learning_rate": 5.490817433559382e-06, "loss": 0.4202, "mean_token_accuracy": 0.8583919882774353, "num_tokens": 1051424819.0, "step": 31385 }, { "epoch": 1.8738061127029608, "grad_norm": 0.4719121754169464, "learning_rate": 5.488513391656512e-06, "loss": 0.4132, "mean_token_accuracy": 0.8617320656776428, "num_tokens": 1051592499.0, "step": 31390 }, { "epoch": 1.8741045845272206, "grad_norm": 0.46552151441574097, "learning_rate": 5.486214711057493e-06, "loss": 0.364, "mean_token_accuracy": 0.8763569116592407, "num_tokens": 1051760179.0, "step": 31395 }, { "epoch": 1.8744030563514804, "grad_norm": 0.514227569103241, "learning_rate": 5.483921392322212e-06, "loss": 0.4231, "mean_token_accuracy": 0.8592568278312683, "num_tokens": 1051927859.0, "step": 31400 }, { "epoch": 1.8747015281757402, "grad_norm": 0.5355833768844604, "learning_rate": 5.481633436009242e-06, "loss": 0.4254, "mean_token_accuracy": 0.8574853897094726, "num_tokens": 1052095539.0, "step": 31405 }, { "epoch": 1.875, "grad_norm": 0.5531699061393738, "learning_rate": 5.479350842675863e-06, "loss": 0.4084, "mean_token_accuracy": 0.8619646906852723, "num_tokens": 1052263219.0, "step": 31410 }, { "epoch": 1.8752984718242598, "grad_norm": 0.5218827128410339, "learning_rate": 5.477073612878039e-06, "loss": 0.47, "mean_token_accuracy": 0.8432780504226685, "num_tokens": 1052430899.0, "step": 31415 }, { "epoch": 1.8755969436485196, "grad_norm": 0.5016190409660339, "learning_rate": 5.474801747170434e-06, "loss": 0.4211, "mean_token_accuracy": 0.8582011222839355, "num_tokens": 1052598579.0, "step": 31420 }, { "epoch": 1.8758954154727794, "grad_norm": 0.5406301617622375, "learning_rate": 5.472535246106404e-06, "loss": 0.445, "mean_token_accuracy": 0.851604449748993, "num_tokens": 1052766259.0, "step": 31425 }, { "epoch": 1.8761938872970392, "grad_norm": 0.4535290002822876, "learning_rate": 5.470274110237993e-06, "loss": 0.3857, "mean_token_accuracy": 0.8698556661605835, "num_tokens": 1052933939.0, "step": 31430 }, { "epoch": 1.876492359121299, "grad_norm": 0.5444434285163879, "learning_rate": 5.468018340115945e-06, "loss": 0.4305, "mean_token_accuracy": 0.855839216709137, "num_tokens": 1053101619.0, "step": 31435 }, { "epoch": 1.8767908309455588, "grad_norm": 0.4442603588104248, "learning_rate": 5.465767936289695e-06, "loss": 0.4033, "mean_token_accuracy": 0.863169503211975, "num_tokens": 1053269299.0, "step": 31440 }, { "epoch": 1.8770893027698186, "grad_norm": 0.5314321517944336, "learning_rate": 5.4635228993073675e-06, "loss": 0.4498, "mean_token_accuracy": 0.8506203055381775, "num_tokens": 1053436979.0, "step": 31445 }, { "epoch": 1.8773877745940784, "grad_norm": 0.5004502534866333, "learning_rate": 5.461283229715787e-06, "loss": 0.4576, "mean_token_accuracy": 0.8473816037178039, "num_tokens": 1053604659.0, "step": 31450 }, { "epoch": 1.8776862464183381, "grad_norm": 0.48730525374412537, "learning_rate": 5.4590489280604606e-06, "loss": 0.4277, "mean_token_accuracy": 0.8568591237068176, "num_tokens": 1053772339.0, "step": 31455 }, { "epoch": 1.877984718242598, "grad_norm": 0.5040608644485474, "learning_rate": 5.456819994885603e-06, "loss": 0.3913, "mean_token_accuracy": 0.8682571887969971, "num_tokens": 1053940019.0, "step": 31460 }, { "epoch": 1.8782831900668577, "grad_norm": 0.45529037714004517, "learning_rate": 5.454596430734101e-06, "loss": 0.4069, "mean_token_accuracy": 0.8635810613632202, "num_tokens": 1054107699.0, "step": 31465 }, { "epoch": 1.8785816618911175, "grad_norm": 0.5126983523368835, "learning_rate": 5.452378236147551e-06, "loss": 0.4399, "mean_token_accuracy": 0.8538411140441895, "num_tokens": 1054275379.0, "step": 31470 }, { "epoch": 1.8788801337153773, "grad_norm": 0.4560665786266327, "learning_rate": 5.450165411666235e-06, "loss": 0.45, "mean_token_accuracy": 0.8493021726608276, "num_tokens": 1054443059.0, "step": 31475 }, { "epoch": 1.8791786055396371, "grad_norm": 0.460692822933197, "learning_rate": 5.447957957829126e-06, "loss": 0.4081, "mean_token_accuracy": 0.8611177444458008, "num_tokens": 1054610739.0, "step": 31480 }, { "epoch": 1.8794770773638967, "grad_norm": 0.5564628839492798, "learning_rate": 5.4457558751738905e-06, "loss": 0.4584, "mean_token_accuracy": 0.8453656315803528, "num_tokens": 1054778419.0, "step": 31485 }, { "epoch": 1.8797755491881567, "grad_norm": 0.5538517832756042, "learning_rate": 5.443559164236885e-06, "loss": 0.4225, "mean_token_accuracy": 0.8589824557304382, "num_tokens": 1054946099.0, "step": 31490 }, { "epoch": 1.8800740210124163, "grad_norm": 0.5153970122337341, "learning_rate": 5.4413678255531606e-06, "loss": 0.4518, "mean_token_accuracy": 0.8511034250259399, "num_tokens": 1055113779.0, "step": 31495 }, { "epoch": 1.8803724928366763, "grad_norm": 0.5142152905464172, "learning_rate": 5.43918185965646e-06, "loss": 0.4147, "mean_token_accuracy": 0.8613205313682556, "num_tokens": 1055281459.0, "step": 31500 }, { "epoch": 1.8806709646609359, "grad_norm": 0.4856979250907898, "learning_rate": 5.437001267079212e-06, "loss": 0.4205, "mean_token_accuracy": 0.8587975621223449, "num_tokens": 1055449139.0, "step": 31505 }, { "epoch": 1.8809694364851959, "grad_norm": 0.5209598541259766, "learning_rate": 5.434826048352544e-06, "loss": 0.4531, "mean_token_accuracy": 0.8500059723854065, "num_tokens": 1055616819.0, "step": 31510 }, { "epoch": 1.8812679083094554, "grad_norm": 0.5124226808547974, "learning_rate": 5.4326562040062634e-06, "loss": 0.3999, "mean_token_accuracy": 0.8635631561279297, "num_tokens": 1055784499.0, "step": 31515 }, { "epoch": 1.8815663801337155, "grad_norm": 0.519205629825592, "learning_rate": 5.430491734568883e-06, "loss": 0.3998, "mean_token_accuracy": 0.8645413398742676, "num_tokens": 1055952179.0, "step": 31520 }, { "epoch": 1.881864851957975, "grad_norm": 0.5213624238967896, "learning_rate": 5.428332640567598e-06, "loss": 0.4081, "mean_token_accuracy": 0.8631814360618592, "num_tokens": 1056119859.0, "step": 31525 }, { "epoch": 1.882163323782235, "grad_norm": 0.48710131645202637, "learning_rate": 5.426178922528295e-06, "loss": 0.4177, "mean_token_accuracy": 0.8594834804534912, "num_tokens": 1056287539.0, "step": 31530 }, { "epoch": 1.8824617956064946, "grad_norm": 0.46749168634414673, "learning_rate": 5.4240305809755495e-06, "loss": 0.4369, "mean_token_accuracy": 0.8547298192977906, "num_tokens": 1056455219.0, "step": 31535 }, { "epoch": 1.8827602674307546, "grad_norm": 0.5485461950302124, "learning_rate": 5.421887616432633e-06, "loss": 0.4318, "mean_token_accuracy": 0.8540021419525147, "num_tokens": 1056622899.0, "step": 31540 }, { "epoch": 1.8830587392550142, "grad_norm": 0.4804892838001251, "learning_rate": 5.419750029421506e-06, "loss": 0.426, "mean_token_accuracy": 0.8559584975242615, "num_tokens": 1056790579.0, "step": 31545 }, { "epoch": 1.8833572110792742, "grad_norm": 0.5282833576202393, "learning_rate": 5.417617820462809e-06, "loss": 0.4234, "mean_token_accuracy": 0.8564952969551086, "num_tokens": 1056958259.0, "step": 31550 }, { "epoch": 1.8836556829035338, "grad_norm": 0.4980647563934326, "learning_rate": 5.4154909900758855e-06, "loss": 0.4247, "mean_token_accuracy": 0.8575629234313965, "num_tokens": 1057125939.0, "step": 31555 }, { "epoch": 1.8839541547277938, "grad_norm": 0.48746436834335327, "learning_rate": 5.413369538778768e-06, "loss": 0.4367, "mean_token_accuracy": 0.8544136881828308, "num_tokens": 1057293619.0, "step": 31560 }, { "epoch": 1.8842526265520534, "grad_norm": 0.5036244988441467, "learning_rate": 5.411253467088171e-06, "loss": 0.4309, "mean_token_accuracy": 0.8553441524505615, "num_tokens": 1057461299.0, "step": 31565 }, { "epoch": 1.8845510983763134, "grad_norm": 0.6265799403190613, "learning_rate": 5.409142775519507e-06, "loss": 0.4513, "mean_token_accuracy": 0.848646080493927, "num_tokens": 1057628979.0, "step": 31570 }, { "epoch": 1.884849570200573, "grad_norm": 0.4896012842655182, "learning_rate": 5.407037464586865e-06, "loss": 0.3926, "mean_token_accuracy": 0.8680305361747742, "num_tokens": 1057796659.0, "step": 31575 }, { "epoch": 1.885148042024833, "grad_norm": 0.5121934413909912, "learning_rate": 5.404937534803038e-06, "loss": 0.4555, "mean_token_accuracy": 0.8478885769844056, "num_tokens": 1057964339.0, "step": 31580 }, { "epoch": 1.8854465138490926, "grad_norm": 0.46774473786354065, "learning_rate": 5.402842986679505e-06, "loss": 0.4186, "mean_token_accuracy": 0.8576344966888427, "num_tokens": 1058132019.0, "step": 31585 }, { "epoch": 1.8857449856733526, "grad_norm": 0.5030744671821594, "learning_rate": 5.400753820726429e-06, "loss": 0.3922, "mean_token_accuracy": 0.8677024960517883, "num_tokens": 1058299699.0, "step": 31590 }, { "epoch": 1.8860434574976122, "grad_norm": 0.4890047609806061, "learning_rate": 5.398670037452664e-06, "loss": 0.3895, "mean_token_accuracy": 0.8691935896873474, "num_tokens": 1058467379.0, "step": 31595 }, { "epoch": 1.8863419293218722, "grad_norm": 0.5055318474769592, "learning_rate": 5.396591637365756e-06, "loss": 0.4318, "mean_token_accuracy": 0.8578642725944519, "num_tokens": 1058628606.0, "step": 31600 }, { "epoch": 1.8866404011461317, "grad_norm": 0.49970972537994385, "learning_rate": 5.394518620971936e-06, "loss": 0.3923, "mean_token_accuracy": 0.8688059210777282, "num_tokens": 1058796286.0, "step": 31605 }, { "epoch": 1.8869388729703918, "grad_norm": 0.6755126714706421, "learning_rate": 5.39245098877613e-06, "loss": 0.4388, "mean_token_accuracy": 0.8520517587661743, "num_tokens": 1058963966.0, "step": 31610 }, { "epoch": 1.8872373447946513, "grad_norm": 0.5788027048110962, "learning_rate": 5.390388741281942e-06, "loss": 0.4076, "mean_token_accuracy": 0.8621138095855713, "num_tokens": 1059131646.0, "step": 31615 }, { "epoch": 1.8875358166189111, "grad_norm": 0.46830010414123535, "learning_rate": 5.3883318789916735e-06, "loss": 0.3929, "mean_token_accuracy": 0.8661636590957642, "num_tokens": 1059299326.0, "step": 31620 }, { "epoch": 1.887834288443171, "grad_norm": 0.49278074502944946, "learning_rate": 5.386280402406308e-06, "loss": 0.4455, "mean_token_accuracy": 0.8518072247505188, "num_tokens": 1059467006.0, "step": 31625 }, { "epoch": 1.8881327602674307, "grad_norm": 0.4522901475429535, "learning_rate": 5.384234312025525e-06, "loss": 0.4092, "mean_token_accuracy": 0.8618513584136963, "num_tokens": 1059634686.0, "step": 31630 }, { "epoch": 1.8884312320916905, "grad_norm": 0.5867912173271179, "learning_rate": 5.382193608347684e-06, "loss": 0.4387, "mean_token_accuracy": 0.8537039279937744, "num_tokens": 1059802366.0, "step": 31635 }, { "epoch": 1.8887297039159503, "grad_norm": 0.5196858644485474, "learning_rate": 5.380158291869836e-06, "loss": 0.4183, "mean_token_accuracy": 0.8600083470344544, "num_tokens": 1059970046.0, "step": 31640 }, { "epoch": 1.88902817574021, "grad_norm": 0.5334433317184448, "learning_rate": 5.378128363087728e-06, "loss": 0.4235, "mean_token_accuracy": 0.8578790426254272, "num_tokens": 1060137726.0, "step": 31645 }, { "epoch": 1.8893266475644699, "grad_norm": 0.48552897572517395, "learning_rate": 5.376103822495774e-06, "loss": 0.4468, "mean_token_accuracy": 0.851503050327301, "num_tokens": 1060305406.0, "step": 31650 }, { "epoch": 1.8896251193887297, "grad_norm": 0.5461321473121643, "learning_rate": 5.374084670587097e-06, "loss": 0.39, "mean_token_accuracy": 0.8679887771606445, "num_tokens": 1060473086.0, "step": 31655 }, { "epoch": 1.8899235912129895, "grad_norm": 0.523984968662262, "learning_rate": 5.372070907853496e-06, "loss": 0.4169, "mean_token_accuracy": 0.8611892938613892, "num_tokens": 1060640766.0, "step": 31660 }, { "epoch": 1.8902220630372493, "grad_norm": 0.4599439799785614, "learning_rate": 5.370062534785459e-06, "loss": 0.3978, "mean_token_accuracy": 0.8666467905044556, "num_tokens": 1060808446.0, "step": 31665 }, { "epoch": 1.890520534861509, "grad_norm": 0.46077027916908264, "learning_rate": 5.368059551872164e-06, "loss": 0.4009, "mean_token_accuracy": 0.8674877762794495, "num_tokens": 1060976126.0, "step": 31670 }, { "epoch": 1.8908190066857689, "grad_norm": 0.6345248818397522, "learning_rate": 5.366061959601475e-06, "loss": 0.4813, "mean_token_accuracy": 0.8408385992050171, "num_tokens": 1061143806.0, "step": 31675 }, { "epoch": 1.8911174785100286, "grad_norm": 0.5159919261932373, "learning_rate": 5.36406975845994e-06, "loss": 0.4338, "mean_token_accuracy": 0.8544196605682373, "num_tokens": 1061311486.0, "step": 31680 }, { "epoch": 1.8914159503342884, "grad_norm": 0.5581896305084229, "learning_rate": 5.3620829489327995e-06, "loss": 0.4708, "mean_token_accuracy": 0.8435345411300659, "num_tokens": 1061479166.0, "step": 31685 }, { "epoch": 1.8917144221585482, "grad_norm": 0.5385413765907288, "learning_rate": 5.360101531503973e-06, "loss": 0.4283, "mean_token_accuracy": 0.8542287945747375, "num_tokens": 1061646846.0, "step": 31690 }, { "epoch": 1.892012893982808, "grad_norm": 0.5238235592842102, "learning_rate": 5.358125506656078e-06, "loss": 0.4502, "mean_token_accuracy": 0.8497596025466919, "num_tokens": 1061809829.0, "step": 31695 }, { "epoch": 1.8923113658070678, "grad_norm": 0.4854484796524048, "learning_rate": 5.356154874870405e-06, "loss": 0.4288, "mean_token_accuracy": 0.8591136693954468, "num_tokens": 1061977509.0, "step": 31700 }, { "epoch": 1.8926098376313276, "grad_norm": 0.48504146933555603, "learning_rate": 5.354189636626943e-06, "loss": 0.4302, "mean_token_accuracy": 0.8553322196006775, "num_tokens": 1062145189.0, "step": 31705 }, { "epoch": 1.8929083094555874, "grad_norm": 0.47814470529556274, "learning_rate": 5.352229792404358e-06, "loss": 0.4002, "mean_token_accuracy": 0.8656745910644531, "num_tokens": 1062312869.0, "step": 31710 }, { "epoch": 1.8932067812798472, "grad_norm": 0.449599951505661, "learning_rate": 5.350275342680009e-06, "loss": 0.42, "mean_token_accuracy": 0.8595669865608215, "num_tokens": 1062480549.0, "step": 31715 }, { "epoch": 1.893505253104107, "grad_norm": 0.54246586561203, "learning_rate": 5.3483262879299366e-06, "loss": 0.4323, "mean_token_accuracy": 0.8552845001220704, "num_tokens": 1062648229.0, "step": 31720 }, { "epoch": 1.8938037249283668, "grad_norm": 0.5128061175346375, "learning_rate": 5.3463826286288706e-06, "loss": 0.4036, "mean_token_accuracy": 0.863897168636322, "num_tokens": 1062815909.0, "step": 31725 }, { "epoch": 1.8941021967526266, "grad_norm": 0.4644211232662201, "learning_rate": 5.3444443652502245e-06, "loss": 0.4294, "mean_token_accuracy": 0.8552725672721863, "num_tokens": 1062983589.0, "step": 31730 }, { "epoch": 1.8944006685768864, "grad_norm": 0.5072385668754578, "learning_rate": 5.342511498266098e-06, "loss": 0.4259, "mean_token_accuracy": 0.8572289228439331, "num_tokens": 1063151269.0, "step": 31735 }, { "epoch": 1.8946991404011462, "grad_norm": 0.47007593512535095, "learning_rate": 5.340584028147276e-06, "loss": 0.4058, "mean_token_accuracy": 0.8622569441795349, "num_tokens": 1063318949.0, "step": 31740 }, { "epoch": 1.894997612225406, "grad_norm": 0.5132487416267395, "learning_rate": 5.338661955363228e-06, "loss": 0.4432, "mean_token_accuracy": 0.8513360261917114, "num_tokens": 1063486629.0, "step": 31745 }, { "epoch": 1.8952960840496658, "grad_norm": 0.564643383026123, "learning_rate": 5.336745280382114e-06, "loss": 0.4611, "mean_token_accuracy": 0.8468448162078858, "num_tokens": 1063654309.0, "step": 31750 }, { "epoch": 1.8955945558739256, "grad_norm": 0.5347476005554199, "learning_rate": 5.334834003670775e-06, "loss": 0.4208, "mean_token_accuracy": 0.8596743464469909, "num_tokens": 1063821989.0, "step": 31755 }, { "epoch": 1.8958930276981851, "grad_norm": 0.5305520296096802, "learning_rate": 5.332928125694736e-06, "loss": 0.4532, "mean_token_accuracy": 0.8477991104125977, "num_tokens": 1063989669.0, "step": 31760 }, { "epoch": 1.8961914995224451, "grad_norm": 0.47084951400756836, "learning_rate": 5.331027646918208e-06, "loss": 0.3921, "mean_token_accuracy": 0.8669390320777893, "num_tokens": 1064157349.0, "step": 31765 }, { "epoch": 1.8964899713467047, "grad_norm": 0.5179524421691895, "learning_rate": 5.329132567804089e-06, "loss": 0.4047, "mean_token_accuracy": 0.8634856224060059, "num_tokens": 1064325029.0, "step": 31770 }, { "epoch": 1.8967884431709647, "grad_norm": 0.5258401036262512, "learning_rate": 5.327242888813961e-06, "loss": 0.4379, "mean_token_accuracy": 0.8546284198760986, "num_tokens": 1064492709.0, "step": 31775 }, { "epoch": 1.8970869149952243, "grad_norm": 0.49087873101234436, "learning_rate": 5.325358610408088e-06, "loss": 0.4253, "mean_token_accuracy": 0.858195161819458, "num_tokens": 1064660389.0, "step": 31780 }, { "epoch": 1.8973853868194843, "grad_norm": 0.4951891303062439, "learning_rate": 5.323479733045425e-06, "loss": 0.4595, "mean_token_accuracy": 0.8468448162078858, "num_tokens": 1064828069.0, "step": 31785 }, { "epoch": 1.897683858643744, "grad_norm": 0.6353661417961121, "learning_rate": 5.3216062571836056e-06, "loss": 0.4642, "mean_token_accuracy": 0.8454431533813477, "num_tokens": 1064995749.0, "step": 31790 }, { "epoch": 1.897982330468004, "grad_norm": 0.4828473925590515, "learning_rate": 5.319738183278945e-06, "loss": 0.4168, "mean_token_accuracy": 0.8611058115959167, "num_tokens": 1065163429.0, "step": 31795 }, { "epoch": 1.8982808022922635, "grad_norm": 0.4772716164588928, "learning_rate": 5.3178755117864525e-06, "loss": 0.4037, "mean_token_accuracy": 0.8639210224151611, "num_tokens": 1065331109.0, "step": 31800 }, { "epoch": 1.8985792741165235, "grad_norm": 0.4815249741077423, "learning_rate": 5.316018243159813e-06, "loss": 0.4175, "mean_token_accuracy": 0.858547055721283, "num_tokens": 1065498789.0, "step": 31805 }, { "epoch": 1.898877745940783, "grad_norm": 0.6053141951560974, "learning_rate": 5.3141663778514004e-06, "loss": 0.4133, "mean_token_accuracy": 0.8614815831184387, "num_tokens": 1065666469.0, "step": 31810 }, { "epoch": 1.899176217765043, "grad_norm": 0.5123937129974365, "learning_rate": 5.31231991631227e-06, "loss": 0.4425, "mean_token_accuracy": 0.8516640901565552, "num_tokens": 1065834149.0, "step": 31815 }, { "epoch": 1.8994746895893027, "grad_norm": 0.5880845785140991, "learning_rate": 5.310478858992162e-06, "loss": 0.4058, "mean_token_accuracy": 0.8639866471290588, "num_tokens": 1066001829.0, "step": 31820 }, { "epoch": 1.8997731614135627, "grad_norm": 0.5158835053443909, "learning_rate": 5.308643206339497e-06, "loss": 0.4392, "mean_token_accuracy": 0.8542884349822998, "num_tokens": 1066169509.0, "step": 31825 }, { "epoch": 1.9000716332378222, "grad_norm": 0.5261027216911316, "learning_rate": 5.306812958801389e-06, "loss": 0.4494, "mean_token_accuracy": 0.8503936648368835, "num_tokens": 1066337189.0, "step": 31830 }, { "epoch": 1.9003701050620823, "grad_norm": 0.5400513410568237, "learning_rate": 5.304988116823622e-06, "loss": 0.408, "mean_token_accuracy": 0.8631993293762207, "num_tokens": 1066504869.0, "step": 31835 }, { "epoch": 1.9006685768863418, "grad_norm": 0.4870010018348694, "learning_rate": 5.3031686808506745e-06, "loss": 0.4145, "mean_token_accuracy": 0.8592210531234741, "num_tokens": 1066672549.0, "step": 31840 }, { "epoch": 1.9009670487106018, "grad_norm": 0.4953862428665161, "learning_rate": 5.3013546513256966e-06, "loss": 0.4341, "mean_token_accuracy": 0.85431067943573, "num_tokens": 1066834087.0, "step": 31845 }, { "epoch": 1.9012655205348614, "grad_norm": 0.4556969702243805, "learning_rate": 5.299546028690539e-06, "loss": 0.3759, "mean_token_accuracy": 0.8721579313278198, "num_tokens": 1067001767.0, "step": 31850 }, { "epoch": 1.9015639923591214, "grad_norm": 0.4724929630756378, "learning_rate": 5.2977428133857145e-06, "loss": 0.4175, "mean_token_accuracy": 0.859799599647522, "num_tokens": 1067169447.0, "step": 31855 }, { "epoch": 1.901862464183381, "grad_norm": 0.6419077515602112, "learning_rate": 5.295945005850437e-06, "loss": 0.423, "mean_token_accuracy": 0.8584874153137207, "num_tokens": 1067337127.0, "step": 31860 }, { "epoch": 1.902160936007641, "grad_norm": 0.4826868772506714, "learning_rate": 5.294152606522592e-06, "loss": 0.4126, "mean_token_accuracy": 0.8613205313682556, "num_tokens": 1067504807.0, "step": 31865 }, { "epoch": 1.9024594078319006, "grad_norm": 0.5089836716651917, "learning_rate": 5.292365615838752e-06, "loss": 0.4231, "mean_token_accuracy": 0.8571513652801513, "num_tokens": 1067672487.0, "step": 31870 }, { "epoch": 1.9027578796561606, "grad_norm": 0.5602514743804932, "learning_rate": 5.290584034234177e-06, "loss": 0.4452, "mean_token_accuracy": 0.850852906703949, "num_tokens": 1067840167.0, "step": 31875 }, { "epoch": 1.9030563514804202, "grad_norm": 0.5133985877037048, "learning_rate": 5.288807862142793e-06, "loss": 0.442, "mean_token_accuracy": 0.8537635684013367, "num_tokens": 1068007847.0, "step": 31880 }, { "epoch": 1.9033548233046802, "grad_norm": 0.5091225504875183, "learning_rate": 5.287037099997229e-06, "loss": 0.4267, "mean_token_accuracy": 0.8579506158828736, "num_tokens": 1068175527.0, "step": 31885 }, { "epoch": 1.9036532951289398, "grad_norm": 0.4815044105052948, "learning_rate": 5.285271748228784e-06, "loss": 0.4212, "mean_token_accuracy": 0.8574734568595886, "num_tokens": 1068343207.0, "step": 31890 }, { "epoch": 1.9039517669531996, "grad_norm": 0.5389668345451355, "learning_rate": 5.28351180726744e-06, "loss": 0.4211, "mean_token_accuracy": 0.8600381731986999, "num_tokens": 1068510887.0, "step": 31895 }, { "epoch": 1.9042502387774594, "grad_norm": 0.5164958834648132, "learning_rate": 5.281757277541867e-06, "loss": 0.4123, "mean_token_accuracy": 0.8624955177307129, "num_tokens": 1068678567.0, "step": 31900 }, { "epoch": 1.9045487106017192, "grad_norm": 0.5868775248527527, "learning_rate": 5.280008159479409e-06, "loss": 0.4498, "mean_token_accuracy": 0.8485267758369446, "num_tokens": 1068846247.0, "step": 31905 }, { "epoch": 1.904847182425979, "grad_norm": 0.546893298625946, "learning_rate": 5.278264453506097e-06, "loss": 0.4205, "mean_token_accuracy": 0.859435760974884, "num_tokens": 1069013927.0, "step": 31910 }, { "epoch": 1.9051456542502387, "grad_norm": 0.5343213677406311, "learning_rate": 5.276526160046645e-06, "loss": 0.4358, "mean_token_accuracy": 0.8547059535980225, "num_tokens": 1069181607.0, "step": 31915 }, { "epoch": 1.9054441260744985, "grad_norm": 0.5360780954360962, "learning_rate": 5.274793279524446e-06, "loss": 0.4455, "mean_token_accuracy": 0.8515746235847473, "num_tokens": 1069349287.0, "step": 31920 }, { "epoch": 1.9057425978987583, "grad_norm": 0.5584276914596558, "learning_rate": 5.273065812361573e-06, "loss": 0.4289, "mean_token_accuracy": 0.8572766423225403, "num_tokens": 1069516967.0, "step": 31925 }, { "epoch": 1.9060410697230181, "grad_norm": 0.4997557997703552, "learning_rate": 5.271343758978782e-06, "loss": 0.4302, "mean_token_accuracy": 0.855350112915039, "num_tokens": 1069684647.0, "step": 31930 }, { "epoch": 1.906339541547278, "grad_norm": 0.4914553761482239, "learning_rate": 5.269627119795515e-06, "loss": 0.4389, "mean_token_accuracy": 0.8529166221618653, "num_tokens": 1069852327.0, "step": 31935 }, { "epoch": 1.9066380133715377, "grad_norm": 0.5483037829399109, "learning_rate": 5.2679158952298896e-06, "loss": 0.4457, "mean_token_accuracy": 0.851753544807434, "num_tokens": 1070020007.0, "step": 31940 }, { "epoch": 1.9069364851957975, "grad_norm": 0.5162957310676575, "learning_rate": 5.266210085698702e-06, "loss": 0.4217, "mean_token_accuracy": 0.8590838551521301, "num_tokens": 1070187687.0, "step": 31945 }, { "epoch": 1.9072349570200573, "grad_norm": 0.4837557375431061, "learning_rate": 5.264509691617439e-06, "loss": 0.4312, "mean_token_accuracy": 0.8554097533226013, "num_tokens": 1070355367.0, "step": 31950 }, { "epoch": 1.907533428844317, "grad_norm": 0.4790009558200836, "learning_rate": 5.262814713400257e-06, "loss": 0.4101, "mean_token_accuracy": 0.8619229316711425, "num_tokens": 1070523047.0, "step": 31955 }, { "epoch": 1.9078319006685769, "grad_norm": 0.4639718532562256, "learning_rate": 5.261125151460007e-06, "loss": 0.4425, "mean_token_accuracy": 0.8534056901931762, "num_tokens": 1070690727.0, "step": 31960 }, { "epoch": 1.9081303724928367, "grad_norm": 0.510834813117981, "learning_rate": 5.259441006208208e-06, "loss": 0.433, "mean_token_accuracy": 0.85520099401474, "num_tokens": 1070858407.0, "step": 31965 }, { "epoch": 1.9084288443170965, "grad_norm": 0.5318284630775452, "learning_rate": 5.257762278055063e-06, "loss": 0.4317, "mean_token_accuracy": 0.8547596454620361, "num_tokens": 1071026087.0, "step": 31970 }, { "epoch": 1.9087273161413563, "grad_norm": 0.47995248436927795, "learning_rate": 5.256088967409464e-06, "loss": 0.4152, "mean_token_accuracy": 0.8603781461715698, "num_tokens": 1071193767.0, "step": 31975 }, { "epoch": 1.909025787965616, "grad_norm": 0.47716400027275085, "learning_rate": 5.254421074678971e-06, "loss": 0.4011, "mean_token_accuracy": 0.8657461524009704, "num_tokens": 1071361447.0, "step": 31980 }, { "epoch": 1.9093242597898759, "grad_norm": 0.5465705394744873, "learning_rate": 5.252758600269831e-06, "loss": 0.4401, "mean_token_accuracy": 0.8544375658035278, "num_tokens": 1071529127.0, "step": 31985 }, { "epoch": 1.9096227316141356, "grad_norm": 0.5060298442840576, "learning_rate": 5.25110154458697e-06, "loss": 0.4327, "mean_token_accuracy": 0.8546045541763305, "num_tokens": 1071696807.0, "step": 31990 }, { "epoch": 1.9099212034383954, "grad_norm": 0.5056952834129333, "learning_rate": 5.249449908033996e-06, "loss": 0.4499, "mean_token_accuracy": 0.8500894665718078, "num_tokens": 1071864487.0, "step": 31995 }, { "epoch": 1.9102196752626552, "grad_norm": 0.5316416025161743, "learning_rate": 5.2478036910131975e-06, "loss": 0.4394, "mean_token_accuracy": 0.8522068500518799, "num_tokens": 1072032167.0, "step": 32000 }, { "epoch": 1.910518147086915, "grad_norm": 0.47567927837371826, "learning_rate": 5.246162893925537e-06, "loss": 0.4376, "mean_token_accuracy": 0.8528152227401733, "num_tokens": 1072199847.0, "step": 32005 }, { "epoch": 1.9108166189111748, "grad_norm": 0.4773764908313751, "learning_rate": 5.244527517170661e-06, "loss": 0.3972, "mean_token_accuracy": 0.8659668326377868, "num_tokens": 1072367527.0, "step": 32010 }, { "epoch": 1.9111150907354346, "grad_norm": 0.5136639475822449, "learning_rate": 5.2428975611468986e-06, "loss": 0.4329, "mean_token_accuracy": 0.8539246082305908, "num_tokens": 1072535207.0, "step": 32015 }, { "epoch": 1.9114135625596944, "grad_norm": 0.5270606875419617, "learning_rate": 5.241273026251253e-06, "loss": 0.4688, "mean_token_accuracy": 0.8458964586257934, "num_tokens": 1072702887.0, "step": 32020 }, { "epoch": 1.9117120343839542, "grad_norm": 0.48470228910446167, "learning_rate": 5.239653912879412e-06, "loss": 0.3988, "mean_token_accuracy": 0.8664738059043884, "num_tokens": 1072870567.0, "step": 32025 }, { "epoch": 1.912010506208214, "grad_norm": 0.5084724426269531, "learning_rate": 5.238040221425739e-06, "loss": 0.4433, "mean_token_accuracy": 0.8526601433753968, "num_tokens": 1073038247.0, "step": 32030 }, { "epoch": 1.9123089780324736, "grad_norm": 0.4969509541988373, "learning_rate": 5.23643195228328e-06, "loss": 0.4188, "mean_token_accuracy": 0.8598532795906066, "num_tokens": 1073205927.0, "step": 32035 }, { "epoch": 1.9126074498567336, "grad_norm": 0.46751248836517334, "learning_rate": 5.234829105843755e-06, "loss": 0.4131, "mean_token_accuracy": 0.8601634263992309, "num_tokens": 1073373607.0, "step": 32040 }, { "epoch": 1.9129059216809932, "grad_norm": 0.5296719670295715, "learning_rate": 5.233231682497572e-06, "loss": 0.4247, "mean_token_accuracy": 0.8574018955230713, "num_tokens": 1073541287.0, "step": 32045 }, { "epoch": 1.9132043935052532, "grad_norm": 0.5356879234313965, "learning_rate": 5.231639682633807e-06, "loss": 0.4232, "mean_token_accuracy": 0.8574496030807495, "num_tokens": 1073708967.0, "step": 32050 }, { "epoch": 1.9135028653295127, "grad_norm": 0.570656418800354, "learning_rate": 5.2300531066402265e-06, "loss": 0.4588, "mean_token_accuracy": 0.845890486240387, "num_tokens": 1073876647.0, "step": 32055 }, { "epoch": 1.9138013371537728, "grad_norm": 0.5327288508415222, "learning_rate": 5.228471954903269e-06, "loss": 0.4463, "mean_token_accuracy": 0.8516640782356262, "num_tokens": 1074044327.0, "step": 32060 }, { "epoch": 1.9140998089780323, "grad_norm": 0.4700440466403961, "learning_rate": 5.22689622780805e-06, "loss": 0.4564, "mean_token_accuracy": 0.8459441781044006, "num_tokens": 1074212007.0, "step": 32065 }, { "epoch": 1.9143982808022924, "grad_norm": 0.4614141583442688, "learning_rate": 5.225325925738369e-06, "loss": 0.4181, "mean_token_accuracy": 0.8588274002075196, "num_tokens": 1074379687.0, "step": 32070 }, { "epoch": 1.914696752626552, "grad_norm": 0.48530611395835876, "learning_rate": 5.223761049076703e-06, "loss": 0.4126, "mean_token_accuracy": 0.8612131834030151, "num_tokens": 1074547367.0, "step": 32075 }, { "epoch": 1.914995224450812, "grad_norm": 0.5136609077453613, "learning_rate": 5.222201598204203e-06, "loss": 0.4595, "mean_token_accuracy": 0.846767270565033, "num_tokens": 1074715047.0, "step": 32080 }, { "epoch": 1.9152936962750715, "grad_norm": 0.5924073457717896, "learning_rate": 5.220647573500711e-06, "loss": 0.4637, "mean_token_accuracy": 0.8435643553733826, "num_tokens": 1074882727.0, "step": 32085 }, { "epoch": 1.9155921680993315, "grad_norm": 0.4726608991622925, "learning_rate": 5.219098975344729e-06, "loss": 0.4426, "mean_token_accuracy": 0.8532506346702575, "num_tokens": 1075050407.0, "step": 32090 }, { "epoch": 1.915890639923591, "grad_norm": 0.500882625579834, "learning_rate": 5.217555804113447e-06, "loss": 0.425, "mean_token_accuracy": 0.858570909500122, "num_tokens": 1075218087.0, "step": 32095 }, { "epoch": 1.9161891117478511, "grad_norm": 0.47746843099594116, "learning_rate": 5.2160180601827385e-06, "loss": 0.4118, "mean_token_accuracy": 0.8609924793243409, "num_tokens": 1075385767.0, "step": 32100 }, { "epoch": 1.9164875835721107, "grad_norm": 0.5892366766929626, "learning_rate": 5.214485743927147e-06, "loss": 0.4262, "mean_token_accuracy": 0.8567994832992554, "num_tokens": 1075553447.0, "step": 32105 }, { "epoch": 1.9167860553963707, "grad_norm": 0.4851658046245575, "learning_rate": 5.212958855719894e-06, "loss": 0.399, "mean_token_accuracy": 0.866139817237854, "num_tokens": 1075721127.0, "step": 32110 }, { "epoch": 1.9170845272206303, "grad_norm": 0.507503092288971, "learning_rate": 5.211437395932885e-06, "loss": 0.4205, "mean_token_accuracy": 0.8584158420562744, "num_tokens": 1075888807.0, "step": 32115 }, { "epoch": 1.9173829990448903, "grad_norm": 0.5060835480690002, "learning_rate": 5.209921364936699e-06, "loss": 0.4262, "mean_token_accuracy": 0.8586245894432067, "num_tokens": 1076056487.0, "step": 32120 }, { "epoch": 1.9176814708691499, "grad_norm": 0.4905909597873688, "learning_rate": 5.20841076310059e-06, "loss": 0.4297, "mean_token_accuracy": 0.8556363940238952, "num_tokens": 1076224167.0, "step": 32125 }, { "epoch": 1.9179799426934099, "grad_norm": 0.4943788945674896, "learning_rate": 5.206905590792496e-06, "loss": 0.3928, "mean_token_accuracy": 0.8674698948860169, "num_tokens": 1076391847.0, "step": 32130 }, { "epoch": 1.9182784145176695, "grad_norm": 0.5155355334281921, "learning_rate": 5.205405848379028e-06, "loss": 0.4182, "mean_token_accuracy": 0.8585709214210511, "num_tokens": 1076559527.0, "step": 32135 }, { "epoch": 1.9185768863419295, "grad_norm": 0.49372220039367676, "learning_rate": 5.203911536225477e-06, "loss": 0.3931, "mean_token_accuracy": 0.8676786422729492, "num_tokens": 1076727207.0, "step": 32140 }, { "epoch": 1.918875358166189, "grad_norm": 0.47030818462371826, "learning_rate": 5.202422654695812e-06, "loss": 0.3998, "mean_token_accuracy": 0.863998556137085, "num_tokens": 1076894887.0, "step": 32145 }, { "epoch": 1.919173829990449, "grad_norm": 0.474126398563385, "learning_rate": 5.200939204152674e-06, "loss": 0.3981, "mean_token_accuracy": 0.8657998442649841, "num_tokens": 1077062567.0, "step": 32150 }, { "epoch": 1.9194723018147086, "grad_norm": 0.5253416895866394, "learning_rate": 5.199461184957383e-06, "loss": 0.4365, "mean_token_accuracy": 0.8545150876045227, "num_tokens": 1077230247.0, "step": 32155 }, { "epoch": 1.9197707736389686, "grad_norm": 0.4943583309650421, "learning_rate": 5.197988597469946e-06, "loss": 0.4486, "mean_token_accuracy": 0.8502147197723389, "num_tokens": 1077397927.0, "step": 32160 }, { "epoch": 1.9200692454632282, "grad_norm": 0.49374493956565857, "learning_rate": 5.196521442049033e-06, "loss": 0.4134, "mean_token_accuracy": 0.8596027612686157, "num_tokens": 1077565607.0, "step": 32165 }, { "epoch": 1.920367717287488, "grad_norm": 0.5343686938285828, "learning_rate": 5.195059719052e-06, "loss": 0.413, "mean_token_accuracy": 0.859811520576477, "num_tokens": 1077733287.0, "step": 32170 }, { "epoch": 1.9206661891117478, "grad_norm": 0.506992518901825, "learning_rate": 5.1936034288348686e-06, "loss": 0.4513, "mean_token_accuracy": 0.8493021607398987, "num_tokens": 1077900967.0, "step": 32175 }, { "epoch": 1.9209646609360076, "grad_norm": 0.455807089805603, "learning_rate": 5.192152571752357e-06, "loss": 0.4073, "mean_token_accuracy": 0.8615710377693176, "num_tokens": 1078068647.0, "step": 32180 }, { "epoch": 1.9212631327602674, "grad_norm": 0.5381808280944824, "learning_rate": 5.190707148157839e-06, "loss": 0.4379, "mean_token_accuracy": 0.8528032898902893, "num_tokens": 1078236327.0, "step": 32185 }, { "epoch": 1.9215616045845272, "grad_norm": 0.4487593472003937, "learning_rate": 5.189267158403377e-06, "loss": 0.3877, "mean_token_accuracy": 0.8674579501152039, "num_tokens": 1078404007.0, "step": 32190 }, { "epoch": 1.921860076408787, "grad_norm": 0.49918490648269653, "learning_rate": 5.187832602839708e-06, "loss": 0.3994, "mean_token_accuracy": 0.8661048412322998, "num_tokens": 1078568782.0, "step": 32195 }, { "epoch": 1.9221585482330468, "grad_norm": 0.4842585027217865, "learning_rate": 5.186403481816245e-06, "loss": 0.4226, "mean_token_accuracy": 0.8582070827484131, "num_tokens": 1078736462.0, "step": 32200 }, { "epoch": 1.9224570200573066, "grad_norm": 0.5461395978927612, "learning_rate": 5.1849797956810755e-06, "loss": 0.4263, "mean_token_accuracy": 0.8561254858970642, "num_tokens": 1078904142.0, "step": 32205 }, { "epoch": 1.9227554918815664, "grad_norm": 0.46837109327316284, "learning_rate": 5.183561544780965e-06, "loss": 0.4019, "mean_token_accuracy": 0.8645532608032227, "num_tokens": 1079071822.0, "step": 32210 }, { "epoch": 1.9230539637058262, "grad_norm": 0.4826766550540924, "learning_rate": 5.182148729461353e-06, "loss": 0.4207, "mean_token_accuracy": 0.8577597498893738, "num_tokens": 1079239502.0, "step": 32215 }, { "epoch": 1.923352435530086, "grad_norm": 0.5636767148971558, "learning_rate": 5.180741350066359e-06, "loss": 0.4307, "mean_token_accuracy": 0.8561314582824707, "num_tokens": 1079407182.0, "step": 32220 }, { "epoch": 1.9236509073543457, "grad_norm": 0.551056981086731, "learning_rate": 5.179339406938776e-06, "loss": 0.4393, "mean_token_accuracy": 0.8547715663909912, "num_tokens": 1079574862.0, "step": 32225 }, { "epoch": 1.9239493791786055, "grad_norm": 0.4722616672515869, "learning_rate": 5.1779429004200735e-06, "loss": 0.4177, "mean_token_accuracy": 0.8593761205673218, "num_tokens": 1079742542.0, "step": 32230 }, { "epoch": 1.9242478510028653, "grad_norm": 0.5256747603416443, "learning_rate": 5.1765518308503945e-06, "loss": 0.442, "mean_token_accuracy": 0.851228678226471, "num_tokens": 1079910222.0, "step": 32235 }, { "epoch": 1.9245463228271251, "grad_norm": 0.47784456610679626, "learning_rate": 5.17516619856856e-06, "loss": 0.4031, "mean_token_accuracy": 0.8624074697494507, "num_tokens": 1080074740.0, "step": 32240 }, { "epoch": 1.924844794651385, "grad_norm": 0.5293857455253601, "learning_rate": 5.1737860039120695e-06, "loss": 0.4571, "mean_token_accuracy": 0.8470476031303406, "num_tokens": 1080242420.0, "step": 32245 }, { "epoch": 1.9251432664756447, "grad_norm": 0.5418432950973511, "learning_rate": 5.172411247217088e-06, "loss": 0.4235, "mean_token_accuracy": 0.8572467923164367, "num_tokens": 1080410100.0, "step": 32250 }, { "epoch": 1.9254417382999045, "grad_norm": 0.593043863773346, "learning_rate": 5.171041928818472e-06, "loss": 0.4409, "mean_token_accuracy": 0.8518787980079651, "num_tokens": 1080577780.0, "step": 32255 }, { "epoch": 1.9257402101241643, "grad_norm": 0.5288111567497253, "learning_rate": 5.169678049049736e-06, "loss": 0.4429, "mean_token_accuracy": 0.8513718366622924, "num_tokens": 1080745460.0, "step": 32260 }, { "epoch": 1.926038681948424, "grad_norm": 0.5180296897888184, "learning_rate": 5.168319608243083e-06, "loss": 0.3802, "mean_token_accuracy": 0.8718835830688476, "num_tokens": 1080913140.0, "step": 32265 }, { "epoch": 1.9263371537726839, "grad_norm": 0.49369266629219055, "learning_rate": 5.166966606729387e-06, "loss": 0.4178, "mean_token_accuracy": 0.8581056714057922, "num_tokens": 1081080820.0, "step": 32270 }, { "epoch": 1.9266356255969437, "grad_norm": 0.5329632759094238, "learning_rate": 5.165619044838193e-06, "loss": 0.4047, "mean_token_accuracy": 0.8635154604911804, "num_tokens": 1081248500.0, "step": 32275 }, { "epoch": 1.9269340974212035, "grad_norm": 0.4576363265514374, "learning_rate": 5.164276922897726e-06, "loss": 0.3823, "mean_token_accuracy": 0.8699689865112304, "num_tokens": 1081416180.0, "step": 32280 }, { "epoch": 1.9272325692454633, "grad_norm": 0.5308344960212708, "learning_rate": 5.162940241234883e-06, "loss": 0.3867, "mean_token_accuracy": 0.8699570536613465, "num_tokens": 1081583860.0, "step": 32285 }, { "epoch": 1.927531041069723, "grad_norm": 0.47972264885902405, "learning_rate": 5.161609000175242e-06, "loss": 0.4052, "mean_token_accuracy": 0.862656569480896, "num_tokens": 1081751540.0, "step": 32290 }, { "epoch": 1.9278295128939829, "grad_norm": 0.5196604132652283, "learning_rate": 5.160283200043046e-06, "loss": 0.4096, "mean_token_accuracy": 0.8632172226905823, "num_tokens": 1081919220.0, "step": 32295 }, { "epoch": 1.9281279847182426, "grad_norm": 0.517647385597229, "learning_rate": 5.158962841161222e-06, "loss": 0.4347, "mean_token_accuracy": 0.8538410902023316, "num_tokens": 1082086900.0, "step": 32300 }, { "epoch": 1.9284264565425024, "grad_norm": 0.49499738216400146, "learning_rate": 5.157647923851366e-06, "loss": 0.4092, "mean_token_accuracy": 0.8620660901069641, "num_tokens": 1082254580.0, "step": 32305 }, { "epoch": 1.928724928366762, "grad_norm": 0.48807811737060547, "learning_rate": 5.156338448433749e-06, "loss": 0.4236, "mean_token_accuracy": 0.8564773917198181, "num_tokens": 1082422260.0, "step": 32310 }, { "epoch": 1.929023400191022, "grad_norm": 0.4935206174850464, "learning_rate": 5.155034415227325e-06, "loss": 0.4324, "mean_token_accuracy": 0.8550996065139771, "num_tokens": 1082589940.0, "step": 32315 }, { "epoch": 1.9293218720152816, "grad_norm": 0.5093145370483398, "learning_rate": 5.153735824549705e-06, "loss": 0.4024, "mean_token_accuracy": 0.8654777407646179, "num_tokens": 1082757620.0, "step": 32320 }, { "epoch": 1.9296203438395416, "grad_norm": 0.5231690406799316, "learning_rate": 5.15244267671719e-06, "loss": 0.4831, "mean_token_accuracy": 0.8399558663368225, "num_tokens": 1082925300.0, "step": 32325 }, { "epoch": 1.9299188156638012, "grad_norm": 0.6406030058860779, "learning_rate": 5.151154972044753e-06, "loss": 0.4368, "mean_token_accuracy": 0.8529205799102784, "num_tokens": 1083084912.0, "step": 32330 }, { "epoch": 1.9302172874880612, "grad_norm": 0.5277084112167358, "learning_rate": 5.149872710846032e-06, "loss": 0.3843, "mean_token_accuracy": 0.8710843324661255, "num_tokens": 1083252592.0, "step": 32335 }, { "epoch": 1.9305157593123208, "grad_norm": 0.5303910970687866, "learning_rate": 5.148595893433348e-06, "loss": 0.3972, "mean_token_accuracy": 0.8653465390205384, "num_tokens": 1083420272.0, "step": 32340 }, { "epoch": 1.9308142311365808, "grad_norm": 0.5570942759513855, "learning_rate": 5.147324520117695e-06, "loss": 0.4194, "mean_token_accuracy": 0.8592687487602234, "num_tokens": 1083587952.0, "step": 32345 }, { "epoch": 1.9311127029608404, "grad_norm": 0.4771854877471924, "learning_rate": 5.146058591208738e-06, "loss": 0.4309, "mean_token_accuracy": 0.8555171132087708, "num_tokens": 1083755632.0, "step": 32350 }, { "epoch": 1.9314111747851004, "grad_norm": 0.5093052387237549, "learning_rate": 5.144798107014818e-06, "loss": 0.4331, "mean_token_accuracy": 0.8540677547454834, "num_tokens": 1083923312.0, "step": 32355 }, { "epoch": 1.93170964660936, "grad_norm": 0.5296303629875183, "learning_rate": 5.143543067842946e-06, "loss": 0.4293, "mean_token_accuracy": 0.8556423664093018, "num_tokens": 1084090992.0, "step": 32360 }, { "epoch": 1.93200811843362, "grad_norm": 0.5807662606239319, "learning_rate": 5.142293473998816e-06, "loss": 0.4529, "mean_token_accuracy": 0.8488369345664978, "num_tokens": 1084258672.0, "step": 32365 }, { "epoch": 1.9323065902578795, "grad_norm": 0.5080680847167969, "learning_rate": 5.141049325786785e-06, "loss": 0.4131, "mean_token_accuracy": 0.8619766116142273, "num_tokens": 1084426352.0, "step": 32370 }, { "epoch": 1.9326050620821396, "grad_norm": 0.5295805335044861, "learning_rate": 5.139810623509891e-06, "loss": 0.4124, "mean_token_accuracy": 0.8604735612869263, "num_tokens": 1084594032.0, "step": 32375 }, { "epoch": 1.9329035339063991, "grad_norm": 0.4883458614349365, "learning_rate": 5.138577367469842e-06, "loss": 0.4146, "mean_token_accuracy": 0.8606703996658325, "num_tokens": 1084761712.0, "step": 32380 }, { "epoch": 1.9332020057306591, "grad_norm": 0.5473781824111938, "learning_rate": 5.137349557967016e-06, "loss": 0.4292, "mean_token_accuracy": 0.8561433792114258, "num_tokens": 1084929392.0, "step": 32385 }, { "epoch": 1.9335004775549187, "grad_norm": 0.50545734167099, "learning_rate": 5.136127195300477e-06, "loss": 0.399, "mean_token_accuracy": 0.8654419779777527, "num_tokens": 1085097072.0, "step": 32390 }, { "epoch": 1.9337989493791787, "grad_norm": 0.5239991545677185, "learning_rate": 5.134910279767947e-06, "loss": 0.4294, "mean_token_accuracy": 0.8564117908477783, "num_tokens": 1085264752.0, "step": 32395 }, { "epoch": 1.9340974212034383, "grad_norm": 0.5179317593574524, "learning_rate": 5.133698811665831e-06, "loss": 0.4254, "mean_token_accuracy": 0.8578372955322265, "num_tokens": 1085432432.0, "step": 32400 }, { "epoch": 1.9343958930276983, "grad_norm": 0.47603145241737366, "learning_rate": 5.1324927912892035e-06, "loss": 0.4208, "mean_token_accuracy": 0.8594954133033752, "num_tokens": 1085600112.0, "step": 32405 }, { "epoch": 1.934694364851958, "grad_norm": 0.4627433717250824, "learning_rate": 5.1312922189318115e-06, "loss": 0.4162, "mean_token_accuracy": 0.8601634263992309, "num_tokens": 1085767792.0, "step": 32410 }, { "epoch": 1.934992836676218, "grad_norm": 0.5203522443771362, "learning_rate": 5.130097094886084e-06, "loss": 0.4203, "mean_token_accuracy": 0.8578671097755433, "num_tokens": 1085935472.0, "step": 32415 }, { "epoch": 1.9352913085004775, "grad_norm": 0.5076498985290527, "learning_rate": 5.128907419443105e-06, "loss": 0.4633, "mean_token_accuracy": 0.8469998836517334, "num_tokens": 1086103152.0, "step": 32420 }, { "epoch": 1.9355897803247375, "grad_norm": 0.5263394713401794, "learning_rate": 5.127723192892649e-06, "loss": 0.421, "mean_token_accuracy": 0.8593641757965088, "num_tokens": 1086270832.0, "step": 32425 }, { "epoch": 1.935888252148997, "grad_norm": 0.4940546751022339, "learning_rate": 5.1265444155231545e-06, "loss": 0.4358, "mean_token_accuracy": 0.853984260559082, "num_tokens": 1086438512.0, "step": 32430 }, { "epoch": 1.936186723973257, "grad_norm": 0.5298051238059998, "learning_rate": 5.125371087621734e-06, "loss": 0.4175, "mean_token_accuracy": 0.8595968008041381, "num_tokens": 1086606192.0, "step": 32435 }, { "epoch": 1.9364851957975167, "grad_norm": 0.5094213485717773, "learning_rate": 5.124203209474177e-06, "loss": 0.4104, "mean_token_accuracy": 0.8609089851379395, "num_tokens": 1086773872.0, "step": 32440 }, { "epoch": 1.9367836676217765, "grad_norm": 0.5221966505050659, "learning_rate": 5.123040781364933e-06, "loss": 0.3862, "mean_token_accuracy": 0.8694321751594544, "num_tokens": 1086941552.0, "step": 32445 }, { "epoch": 1.9370821394460362, "grad_norm": 0.6008448004722595, "learning_rate": 5.121883803577139e-06, "loss": 0.393, "mean_token_accuracy": 0.8662648558616638, "num_tokens": 1087102373.0, "step": 32450 }, { "epoch": 1.937380611270296, "grad_norm": 0.5770822167396545, "learning_rate": 5.120732276392598e-06, "loss": 0.4483, "mean_token_accuracy": 0.8495765328407288, "num_tokens": 1087270053.0, "step": 32455 }, { "epoch": 1.9376790830945558, "grad_norm": 0.49772778153419495, "learning_rate": 5.119586200091786e-06, "loss": 0.4313, "mean_token_accuracy": 0.8565728187561035, "num_tokens": 1087437733.0, "step": 32460 }, { "epoch": 1.9379775549188156, "grad_norm": 0.4981571435928345, "learning_rate": 5.1184455749538475e-06, "loss": 0.4155, "mean_token_accuracy": 0.8609507322311402, "num_tokens": 1087605413.0, "step": 32465 }, { "epoch": 1.9382760267430754, "grad_norm": 0.49466049671173096, "learning_rate": 5.117310401256608e-06, "loss": 0.3992, "mean_token_accuracy": 0.865877366065979, "num_tokens": 1087773093.0, "step": 32470 }, { "epoch": 1.9385744985673352, "grad_norm": 0.5097780823707581, "learning_rate": 5.116180679276559e-06, "loss": 0.4184, "mean_token_accuracy": 0.8581295371055603, "num_tokens": 1087940773.0, "step": 32475 }, { "epoch": 1.938872970391595, "grad_norm": 0.5160642266273499, "learning_rate": 5.1150564092888595e-06, "loss": 0.4102, "mean_token_accuracy": 0.8621078252792358, "num_tokens": 1088108453.0, "step": 32480 }, { "epoch": 1.9391714422158548, "grad_norm": 0.4997353255748749, "learning_rate": 5.113937591567351e-06, "loss": 0.4079, "mean_token_accuracy": 0.8624478101730346, "num_tokens": 1088276133.0, "step": 32485 }, { "epoch": 1.9394699140401146, "grad_norm": 0.5170464515686035, "learning_rate": 5.112824226384544e-06, "loss": 0.3919, "mean_token_accuracy": 0.8661696314811707, "num_tokens": 1088443813.0, "step": 32490 }, { "epoch": 1.9397683858643744, "grad_norm": 0.4755837619304657, "learning_rate": 5.111716314011615e-06, "loss": 0.4015, "mean_token_accuracy": 0.8652153134346008, "num_tokens": 1088611493.0, "step": 32495 }, { "epoch": 1.9400668576886342, "grad_norm": 0.5550814867019653, "learning_rate": 5.110613854718418e-06, "loss": 0.4629, "mean_token_accuracy": 0.845914363861084, "num_tokens": 1088779173.0, "step": 32500 }, { "epoch": 1.940365329512894, "grad_norm": 0.5384535789489746, "learning_rate": 5.109516848773475e-06, "loss": 0.4052, "mean_token_accuracy": 0.864147686958313, "num_tokens": 1088946853.0, "step": 32505 }, { "epoch": 1.9406638013371538, "grad_norm": 0.509691596031189, "learning_rate": 5.108425296443988e-06, "loss": 0.4459, "mean_token_accuracy": 0.8503161191940307, "num_tokens": 1089114533.0, "step": 32510 }, { "epoch": 1.9409622731614136, "grad_norm": 0.513257622718811, "learning_rate": 5.107339197995821e-06, "loss": 0.4403, "mean_token_accuracy": 0.8517058372497559, "num_tokens": 1089282213.0, "step": 32515 }, { "epoch": 1.9412607449856734, "grad_norm": 0.5266803503036499, "learning_rate": 5.10625855369351e-06, "loss": 0.4619, "mean_token_accuracy": 0.8452702045440674, "num_tokens": 1089449893.0, "step": 32520 }, { "epoch": 1.9415592168099332, "grad_norm": 0.4870196282863617, "learning_rate": 5.105183363800273e-06, "loss": 0.376, "mean_token_accuracy": 0.872969114780426, "num_tokens": 1089617573.0, "step": 32525 }, { "epoch": 1.941857688634193, "grad_norm": 0.5021434426307678, "learning_rate": 5.104113628577987e-06, "loss": 0.4144, "mean_token_accuracy": 0.8604377985000611, "num_tokens": 1089785253.0, "step": 32530 }, { "epoch": 1.9421561604584527, "grad_norm": 0.4860079288482666, "learning_rate": 5.10304934828721e-06, "loss": 0.4194, "mean_token_accuracy": 0.8605749845504761, "num_tokens": 1089952933.0, "step": 32535 }, { "epoch": 1.9424546322827125, "grad_norm": 0.5217337012290955, "learning_rate": 5.10199052318716e-06, "loss": 0.4469, "mean_token_accuracy": 0.851616358757019, "num_tokens": 1090120613.0, "step": 32540 }, { "epoch": 1.9427531041069723, "grad_norm": 0.5213215947151184, "learning_rate": 5.1009371535357426e-06, "loss": 0.4518, "mean_token_accuracy": 0.8478766441345215, "num_tokens": 1090288293.0, "step": 32545 }, { "epoch": 1.9430515759312321, "grad_norm": 0.4800375699996948, "learning_rate": 5.099889239589518e-06, "loss": 0.4028, "mean_token_accuracy": 0.8645174741744995, "num_tokens": 1090455973.0, "step": 32550 }, { "epoch": 1.943350047755492, "grad_norm": 0.5107091665267944, "learning_rate": 5.0988467816037265e-06, "loss": 0.4545, "mean_token_accuracy": 0.8474352836608887, "num_tokens": 1090623653.0, "step": 32555 }, { "epoch": 1.9436485195797517, "grad_norm": 0.5225998163223267, "learning_rate": 5.097809779832283e-06, "loss": 0.4317, "mean_token_accuracy": 0.856715977191925, "num_tokens": 1090791333.0, "step": 32560 }, { "epoch": 1.9439469914040115, "grad_norm": 0.5601538419723511, "learning_rate": 5.096778234527763e-06, "loss": 0.4564, "mean_token_accuracy": 0.8473517775535584, "num_tokens": 1090959013.0, "step": 32565 }, { "epoch": 1.9442454632282713, "grad_norm": 0.5448104739189148, "learning_rate": 5.095752145941418e-06, "loss": 0.4373, "mean_token_accuracy": 0.8525348901748657, "num_tokens": 1091126693.0, "step": 32570 }, { "epoch": 1.944543935052531, "grad_norm": 0.5191338658332825, "learning_rate": 5.094731514323175e-06, "loss": 0.4194, "mean_token_accuracy": 0.8580042958259583, "num_tokens": 1091294373.0, "step": 32575 }, { "epoch": 1.9448424068767909, "grad_norm": 0.5559952855110168, "learning_rate": 5.093716339921625e-06, "loss": 0.4302, "mean_token_accuracy": 0.8550101399421692, "num_tokens": 1091462053.0, "step": 32580 }, { "epoch": 1.9451408787010505, "grad_norm": 0.5127229690551758, "learning_rate": 5.092706622984035e-06, "loss": 0.4206, "mean_token_accuracy": 0.8593582391738892, "num_tokens": 1091629733.0, "step": 32585 }, { "epoch": 1.9454393505253105, "grad_norm": 0.5035390257835388, "learning_rate": 5.091702363756335e-06, "loss": 0.4309, "mean_token_accuracy": 0.8548431396484375, "num_tokens": 1091797413.0, "step": 32590 }, { "epoch": 1.94573782234957, "grad_norm": 0.4593779742717743, "learning_rate": 5.090703562483136e-06, "loss": 0.4675, "mean_token_accuracy": 0.8449481129646301, "num_tokens": 1091965093.0, "step": 32595 }, { "epoch": 1.94603629417383, "grad_norm": 0.5089210867881775, "learning_rate": 5.089710219407714e-06, "loss": 0.4299, "mean_token_accuracy": 0.8553322196006775, "num_tokens": 1092132773.0, "step": 32600 }, { "epoch": 1.9463347659980896, "grad_norm": 0.4984305500984192, "learning_rate": 5.088722334772011e-06, "loss": 0.4529, "mean_token_accuracy": 0.8480973362922668, "num_tokens": 1092300453.0, "step": 32605 }, { "epoch": 1.9466332378223496, "grad_norm": 0.4983685314655304, "learning_rate": 5.08773990881665e-06, "loss": 0.4046, "mean_token_accuracy": 0.8634021162986756, "num_tokens": 1092468133.0, "step": 32610 }, { "epoch": 1.9469317096466092, "grad_norm": 0.5268809795379639, "learning_rate": 5.086762941780916e-06, "loss": 0.4, "mean_token_accuracy": 0.8654538989067078, "num_tokens": 1092635813.0, "step": 32615 }, { "epoch": 1.9472301814708692, "grad_norm": 0.5293128490447998, "learning_rate": 5.085791433902771e-06, "loss": 0.422, "mean_token_accuracy": 0.8590182542800904, "num_tokens": 1092803493.0, "step": 32620 }, { "epoch": 1.9475286532951288, "grad_norm": 0.5257720947265625, "learning_rate": 5.084825385418841e-06, "loss": 0.4396, "mean_token_accuracy": 0.8520100235939025, "num_tokens": 1092971173.0, "step": 32625 }, { "epoch": 1.9478271251193888, "grad_norm": 0.5668218731880188, "learning_rate": 5.083864796564422e-06, "loss": 0.4321, "mean_token_accuracy": 0.8553143382072449, "num_tokens": 1093138853.0, "step": 32630 }, { "epoch": 1.9481255969436484, "grad_norm": 0.5278427004814148, "learning_rate": 5.082909667573488e-06, "loss": 0.4194, "mean_token_accuracy": 0.8592031598091125, "num_tokens": 1093306533.0, "step": 32635 }, { "epoch": 1.9484240687679084, "grad_norm": 0.47221750020980835, "learning_rate": 5.081959998678678e-06, "loss": 0.4394, "mean_token_accuracy": 0.85449720621109, "num_tokens": 1093474213.0, "step": 32640 }, { "epoch": 1.948722540592168, "grad_norm": 0.518292248249054, "learning_rate": 5.081015790111298e-06, "loss": 0.4305, "mean_token_accuracy": 0.855463445186615, "num_tokens": 1093641893.0, "step": 32645 }, { "epoch": 1.949021012416428, "grad_norm": 0.4561622440814972, "learning_rate": 5.080077042101328e-06, "loss": 0.4478, "mean_token_accuracy": 0.8493558287620544, "num_tokens": 1093809573.0, "step": 32650 }, { "epoch": 1.9493194842406876, "grad_norm": 0.48174574971199036, "learning_rate": 5.079143754877423e-06, "loss": 0.4469, "mean_token_accuracy": 0.8514851450920105, "num_tokens": 1093977253.0, "step": 32655 }, { "epoch": 1.9496179560649476, "grad_norm": 0.4775007963180542, "learning_rate": 5.078215928666896e-06, "loss": 0.4152, "mean_token_accuracy": 0.8596803069114685, "num_tokens": 1094144933.0, "step": 32660 }, { "epoch": 1.9499164278892072, "grad_norm": 0.5278202891349792, "learning_rate": 5.077293563695738e-06, "loss": 0.4289, "mean_token_accuracy": 0.8539902091026306, "num_tokens": 1094312613.0, "step": 32665 }, { "epoch": 1.9502148997134672, "grad_norm": 0.5718221068382263, "learning_rate": 5.076376660188608e-06, "loss": 0.4381, "mean_token_accuracy": 0.8539961695671081, "num_tokens": 1094480293.0, "step": 32670 }, { "epoch": 1.9505133715377267, "grad_norm": 0.4924035370349884, "learning_rate": 5.075465218368835e-06, "loss": 0.3835, "mean_token_accuracy": 0.8699451208114624, "num_tokens": 1094647973.0, "step": 32675 }, { "epoch": 1.9508118433619868, "grad_norm": 0.5240120887756348, "learning_rate": 5.074559238458418e-06, "loss": 0.4284, "mean_token_accuracy": 0.8556304335594177, "num_tokens": 1094815653.0, "step": 32680 }, { "epoch": 1.9511103151862463, "grad_norm": 0.45406651496887207, "learning_rate": 5.073658720678026e-06, "loss": 0.4211, "mean_token_accuracy": 0.8576762676239014, "num_tokens": 1094983333.0, "step": 32685 }, { "epoch": 1.9514087870105064, "grad_norm": 0.47419631481170654, "learning_rate": 5.072763665246994e-06, "loss": 0.3863, "mean_token_accuracy": 0.8681617498397827, "num_tokens": 1095151013.0, "step": 32690 }, { "epoch": 1.951707258834766, "grad_norm": 0.4790462851524353, "learning_rate": 5.071874072383329e-06, "loss": 0.3921, "mean_token_accuracy": 0.8667780041694642, "num_tokens": 1095318693.0, "step": 32695 }, { "epoch": 1.952005730659026, "grad_norm": 0.5091140270233154, "learning_rate": 5.07098994230371e-06, "loss": 0.4055, "mean_token_accuracy": 0.8613980770111084, "num_tokens": 1095486373.0, "step": 32700 }, { "epoch": 1.9523042024832855, "grad_norm": 0.5264834761619568, "learning_rate": 5.070111275223485e-06, "loss": 0.4116, "mean_token_accuracy": 0.8620064377784729, "num_tokens": 1095654053.0, "step": 32705 }, { "epoch": 1.9526026743075455, "grad_norm": 0.5464398264884949, "learning_rate": 5.069238071356666e-06, "loss": 0.3965, "mean_token_accuracy": 0.8665632843971253, "num_tokens": 1095821733.0, "step": 32710 }, { "epoch": 1.952901146131805, "grad_norm": 0.5134994387626648, "learning_rate": 5.068370330915939e-06, "loss": 0.445, "mean_token_accuracy": 0.8497137069702149, "num_tokens": 1095989413.0, "step": 32715 }, { "epoch": 1.953199617956065, "grad_norm": 0.4360191822052002, "learning_rate": 5.0675080541126604e-06, "loss": 0.3706, "mean_token_accuracy": 0.8747405409812927, "num_tokens": 1096157093.0, "step": 32720 }, { "epoch": 1.9534980897803247, "grad_norm": 0.5409044623374939, "learning_rate": 5.066651241156847e-06, "loss": 0.4359, "mean_token_accuracy": 0.8537755012512207, "num_tokens": 1096324773.0, "step": 32725 }, { "epoch": 1.9537965616045845, "grad_norm": 0.47892406582832336, "learning_rate": 5.0657998922572e-06, "loss": 0.4388, "mean_token_accuracy": 0.8529106497764587, "num_tokens": 1096492453.0, "step": 32730 }, { "epoch": 1.9540950334288443, "grad_norm": 0.49769166111946106, "learning_rate": 5.064954007621076e-06, "loss": 0.3899, "mean_token_accuracy": 0.8674698829650879, "num_tokens": 1096660133.0, "step": 32735 }, { "epoch": 1.954393505253104, "grad_norm": 0.5040245652198792, "learning_rate": 5.064113587454505e-06, "loss": 0.4406, "mean_token_accuracy": 0.8539365410804749, "num_tokens": 1096827813.0, "step": 32740 }, { "epoch": 1.9546919770773639, "grad_norm": 0.4996117651462555, "learning_rate": 5.063278631962192e-06, "loss": 0.3836, "mean_token_accuracy": 0.8699451327323914, "num_tokens": 1096995493.0, "step": 32745 }, { "epoch": 1.9549904489016237, "grad_norm": 0.4961124062538147, "learning_rate": 5.062449141347501e-06, "loss": 0.4251, "mean_token_accuracy": 0.8567100167274475, "num_tokens": 1097163173.0, "step": 32750 }, { "epoch": 1.9552889207258835, "grad_norm": 0.4479682743549347, "learning_rate": 5.061625115812472e-06, "loss": 0.4003, "mean_token_accuracy": 0.8652451276779175, "num_tokens": 1097330853.0, "step": 32755 }, { "epoch": 1.9555873925501432, "grad_norm": 0.548488199710846, "learning_rate": 5.060806555557811e-06, "loss": 0.4109, "mean_token_accuracy": 0.8621853828430176, "num_tokens": 1097498533.0, "step": 32760 }, { "epoch": 1.955885864374403, "grad_norm": 0.5609689354896545, "learning_rate": 5.059993460782892e-06, "loss": 0.4446, "mean_token_accuracy": 0.8500775337219239, "num_tokens": 1097666213.0, "step": 32765 }, { "epoch": 1.9561843361986628, "grad_norm": 0.47095489501953125, "learning_rate": 5.059185831685762e-06, "loss": 0.4309, "mean_token_accuracy": 0.8558749794960022, "num_tokens": 1097833893.0, "step": 32770 }, { "epoch": 1.9564828080229226, "grad_norm": 0.5144312977790833, "learning_rate": 5.058383668463131e-06, "loss": 0.4152, "mean_token_accuracy": 0.8596982002258301, "num_tokens": 1098001573.0, "step": 32775 }, { "epoch": 1.9567812798471824, "grad_norm": 0.4622931182384491, "learning_rate": 5.057586971310384e-06, "loss": 0.368, "mean_token_accuracy": 0.8749791264533997, "num_tokens": 1098169253.0, "step": 32780 }, { "epoch": 1.9570797516714422, "grad_norm": 0.5186492204666138, "learning_rate": 5.05679574042157e-06, "loss": 0.4393, "mean_token_accuracy": 0.8546463131904602, "num_tokens": 1098336933.0, "step": 32785 }, { "epoch": 1.957378223495702, "grad_norm": 0.47548261284828186, "learning_rate": 5.056009975989405e-06, "loss": 0.4, "mean_token_accuracy": 0.863748061656952, "num_tokens": 1098504613.0, "step": 32790 }, { "epoch": 1.9576766953199618, "grad_norm": 0.5142004489898682, "learning_rate": 5.055229678205281e-06, "loss": 0.4526, "mean_token_accuracy": 0.8496302127838135, "num_tokens": 1098672293.0, "step": 32795 }, { "epoch": 1.9579751671442216, "grad_norm": 0.4857657253742218, "learning_rate": 5.054454847259252e-06, "loss": 0.4216, "mean_token_accuracy": 0.8572408437728882, "num_tokens": 1098839973.0, "step": 32800 }, { "epoch": 1.9582736389684814, "grad_norm": 0.5145917534828186, "learning_rate": 5.053685483340041e-06, "loss": 0.4209, "mean_token_accuracy": 0.8593462944030762, "num_tokens": 1099007653.0, "step": 32805 }, { "epoch": 1.9585721107927412, "grad_norm": 0.4593253433704376, "learning_rate": 5.052921586635044e-06, "loss": 0.3843, "mean_token_accuracy": 0.8702910661697387, "num_tokens": 1099175333.0, "step": 32810 }, { "epoch": 1.958870582617001, "grad_norm": 0.543697714805603, "learning_rate": 5.052163157330317e-06, "loss": 0.4267, "mean_token_accuracy": 0.8570320844650269, "num_tokens": 1099343013.0, "step": 32815 }, { "epoch": 1.9591690544412608, "grad_norm": 0.46874719858169556, "learning_rate": 5.051410195610596e-06, "loss": 0.3923, "mean_token_accuracy": 0.8681200027465821, "num_tokens": 1099510693.0, "step": 32820 }, { "epoch": 1.9594675262655206, "grad_norm": 0.5890623331069946, "learning_rate": 5.0506627016592744e-06, "loss": 0.4119, "mean_token_accuracy": 0.8600143194198608, "num_tokens": 1099678373.0, "step": 32825 }, { "epoch": 1.9597659980897804, "grad_norm": 0.5843247175216675, "learning_rate": 5.049920675658418e-06, "loss": 0.4121, "mean_token_accuracy": 0.8618871450424195, "num_tokens": 1099846053.0, "step": 32830 }, { "epoch": 1.9600644699140402, "grad_norm": 0.4814905524253845, "learning_rate": 5.049184117788762e-06, "loss": 0.4166, "mean_token_accuracy": 0.860914945602417, "num_tokens": 1100013733.0, "step": 32835 }, { "epoch": 1.9603629417383, "grad_norm": 0.5177224278450012, "learning_rate": 5.0484530282297106e-06, "loss": 0.4146, "mean_token_accuracy": 0.8597220659255982, "num_tokens": 1100181413.0, "step": 32840 }, { "epoch": 1.9606614135625597, "grad_norm": 0.5069063901901245, "learning_rate": 5.04772740715933e-06, "loss": 0.4041, "mean_token_accuracy": 0.8631635546684265, "num_tokens": 1100349093.0, "step": 32845 }, { "epoch": 1.9609598853868195, "grad_norm": 0.5217315554618835, "learning_rate": 5.047007254754361e-06, "loss": 0.4234, "mean_token_accuracy": 0.8576046824455261, "num_tokens": 1100516773.0, "step": 32850 }, { "epoch": 1.9612583572110793, "grad_norm": 0.46630823612213135, "learning_rate": 5.046292571190211e-06, "loss": 0.3977, "mean_token_accuracy": 0.8659966588020325, "num_tokens": 1100684453.0, "step": 32855 }, { "epoch": 1.961556829035339, "grad_norm": 0.5049127340316772, "learning_rate": 5.045583356640954e-06, "loss": 0.4402, "mean_token_accuracy": 0.8517120480537415, "num_tokens": 1100850815.0, "step": 32860 }, { "epoch": 1.961855300859599, "grad_norm": 0.5101924538612366, "learning_rate": 5.044879611279331e-06, "loss": 0.4311, "mean_token_accuracy": 0.8554395914077759, "num_tokens": 1101018495.0, "step": 32865 }, { "epoch": 1.9621537726838585, "grad_norm": 0.5113304257392883, "learning_rate": 5.044181335276751e-06, "loss": 0.4564, "mean_token_accuracy": 0.8484909772872925, "num_tokens": 1101186175.0, "step": 32870 }, { "epoch": 1.9624522445081185, "grad_norm": 0.5390164852142334, "learning_rate": 5.043488528803296e-06, "loss": 0.4416, "mean_token_accuracy": 0.8523141980171204, "num_tokens": 1101353855.0, "step": 32875 }, { "epoch": 1.962750716332378, "grad_norm": 0.5232124328613281, "learning_rate": 5.042801192027708e-06, "loss": 0.3973, "mean_token_accuracy": 0.8659191131591797, "num_tokens": 1101521535.0, "step": 32880 }, { "epoch": 1.963049188156638, "grad_norm": 0.5050330758094788, "learning_rate": 5.0421193251174005e-06, "loss": 0.4283, "mean_token_accuracy": 0.8562626719474793, "num_tokens": 1101689215.0, "step": 32885 }, { "epoch": 1.9633476599808977, "grad_norm": 0.5037065744400024, "learning_rate": 5.041442928238459e-06, "loss": 0.4104, "mean_token_accuracy": 0.8612787842750549, "num_tokens": 1101856895.0, "step": 32890 }, { "epoch": 1.9636461318051577, "grad_norm": 0.5262560844421387, "learning_rate": 5.0407720015556295e-06, "loss": 0.4331, "mean_token_accuracy": 0.8548431277275086, "num_tokens": 1102024575.0, "step": 32895 }, { "epoch": 1.9639446036294173, "grad_norm": 0.5172950625419617, "learning_rate": 5.040106545232327e-06, "loss": 0.4096, "mean_token_accuracy": 0.8620899438858032, "num_tokens": 1102192255.0, "step": 32900 }, { "epoch": 1.9642430754536773, "grad_norm": 0.6064838767051697, "learning_rate": 5.03944655943064e-06, "loss": 0.4078, "mean_token_accuracy": 0.8617559313774109, "num_tokens": 1102359935.0, "step": 32905 }, { "epoch": 1.9645415472779368, "grad_norm": 0.4952913522720337, "learning_rate": 5.038792044311317e-06, "loss": 0.4189, "mean_token_accuracy": 0.8586424946784973, "num_tokens": 1102527615.0, "step": 32910 }, { "epoch": 1.9648400191021969, "grad_norm": 0.4856053292751312, "learning_rate": 5.0381430000337765e-06, "loss": 0.3827, "mean_token_accuracy": 0.8710127711296082, "num_tokens": 1102695295.0, "step": 32915 }, { "epoch": 1.9651384909264564, "grad_norm": 0.5069352984428406, "learning_rate": 5.037499426756109e-06, "loss": 0.4577, "mean_token_accuracy": 0.8478229761123657, "num_tokens": 1102862975.0, "step": 32920 }, { "epoch": 1.9654369627507164, "grad_norm": 0.5154653787612915, "learning_rate": 5.0368613246350645e-06, "loss": 0.4513, "mean_token_accuracy": 0.8479184031486511, "num_tokens": 1103030655.0, "step": 32925 }, { "epoch": 1.965735434574976, "grad_norm": 0.5819794535636902, "learning_rate": 5.036228693826068e-06, "loss": 0.421, "mean_token_accuracy": 0.8591673612594605, "num_tokens": 1103198335.0, "step": 32930 }, { "epoch": 1.966033906399236, "grad_norm": 0.5436822175979614, "learning_rate": 5.035601534483207e-06, "loss": 0.407, "mean_token_accuracy": 0.8614935040473938, "num_tokens": 1103366015.0, "step": 32935 }, { "epoch": 1.9663323782234956, "grad_norm": 0.48451313376426697, "learning_rate": 5.034979846759236e-06, "loss": 0.4602, "mean_token_accuracy": 0.8455326199531555, "num_tokens": 1103533695.0, "step": 32940 }, { "epoch": 1.9666308500477556, "grad_norm": 0.5387483239173889, "learning_rate": 5.03436363080558e-06, "loss": 0.4106, "mean_token_accuracy": 0.8616485714912414, "num_tokens": 1103701375.0, "step": 32945 }, { "epoch": 1.9669293218720152, "grad_norm": 0.4694807529449463, "learning_rate": 5.03375288677233e-06, "loss": 0.4009, "mean_token_accuracy": 0.8634259700775146, "num_tokens": 1103869055.0, "step": 32950 }, { "epoch": 1.9672277936962752, "grad_norm": 0.49442437291145325, "learning_rate": 5.033147614808247e-06, "loss": 0.4256, "mean_token_accuracy": 0.8560479640960693, "num_tokens": 1104036735.0, "step": 32955 }, { "epoch": 1.9675262655205348, "grad_norm": 0.4747960865497589, "learning_rate": 5.032547815060747e-06, "loss": 0.419, "mean_token_accuracy": 0.858958613872528, "num_tokens": 1104204415.0, "step": 32960 }, { "epoch": 1.9678247373447948, "grad_norm": 0.48176291584968567, "learning_rate": 5.031953487675933e-06, "loss": 0.4481, "mean_token_accuracy": 0.8515388369560242, "num_tokens": 1104372095.0, "step": 32965 }, { "epoch": 1.9681232091690544, "grad_norm": 0.4866582155227661, "learning_rate": 5.031364632798559e-06, "loss": 0.4034, "mean_token_accuracy": 0.8639150619506836, "num_tokens": 1104539775.0, "step": 32970 }, { "epoch": 1.9684216809933144, "grad_norm": 0.48427248001098633, "learning_rate": 5.030781250572053e-06, "loss": 0.3808, "mean_token_accuracy": 0.8713348507881165, "num_tokens": 1104707455.0, "step": 32975 }, { "epoch": 1.968720152817574, "grad_norm": 0.5667198300361633, "learning_rate": 5.030203341138504e-06, "loss": 0.4192, "mean_token_accuracy": 0.8588691353797913, "num_tokens": 1104875135.0, "step": 32980 }, { "epoch": 1.969018624641834, "grad_norm": 0.5106105208396912, "learning_rate": 5.029630904638678e-06, "loss": 0.4168, "mean_token_accuracy": 0.8606465458869934, "num_tokens": 1105042815.0, "step": 32985 }, { "epoch": 1.9693170964660935, "grad_norm": 0.4783756136894226, "learning_rate": 5.029063941212001e-06, "loss": 0.4382, "mean_token_accuracy": 0.8531193971633911, "num_tokens": 1105210495.0, "step": 32990 }, { "epoch": 1.9696155682903533, "grad_norm": 0.5211960673332214, "learning_rate": 5.028502450996568e-06, "loss": 0.3857, "mean_token_accuracy": 0.8693964123725891, "num_tokens": 1105378175.0, "step": 32995 }, { "epoch": 1.9699140401146131, "grad_norm": 0.5239377617835999, "learning_rate": 5.027946434129137e-06, "loss": 0.4134, "mean_token_accuracy": 0.8608791470527649, "num_tokens": 1105545855.0, "step": 33000 }, { "epoch": 1.970212511938873, "grad_norm": 0.4839612543582916, "learning_rate": 5.027395890745141e-06, "loss": 0.4018, "mean_token_accuracy": 0.8651616454124451, "num_tokens": 1105713535.0, "step": 33005 }, { "epoch": 1.9705109837631327, "grad_norm": 0.5277972221374512, "learning_rate": 5.026850820978671e-06, "loss": 0.4283, "mean_token_accuracy": 0.8563783884048461, "num_tokens": 1105876527.0, "step": 33010 }, { "epoch": 1.9708094555873925, "grad_norm": 0.4896654486656189, "learning_rate": 5.0263112249624925e-06, "loss": 0.4208, "mean_token_accuracy": 0.8595192670822144, "num_tokens": 1106044207.0, "step": 33015 }, { "epoch": 1.9711079274116523, "grad_norm": 0.5849815607070923, "learning_rate": 5.025777102828029e-06, "loss": 0.4675, "mean_token_accuracy": 0.8446141004562377, "num_tokens": 1106211887.0, "step": 33020 }, { "epoch": 1.971406399235912, "grad_norm": 0.5098403692245483, "learning_rate": 5.0252484547053815e-06, "loss": 0.4147, "mean_token_accuracy": 0.8612155199050904, "num_tokens": 1106373449.0, "step": 33025 }, { "epoch": 1.971704871060172, "grad_norm": 0.5544062852859497, "learning_rate": 5.0247252807233074e-06, "loss": 0.431, "mean_token_accuracy": 0.8559704184532165, "num_tokens": 1106541129.0, "step": 33030 }, { "epoch": 1.9720033428844317, "grad_norm": 0.4739290177822113, "learning_rate": 5.0242075810092364e-06, "loss": 0.3962, "mean_token_accuracy": 0.8666467785835266, "num_tokens": 1106708809.0, "step": 33035 }, { "epoch": 1.9723018147086915, "grad_norm": 0.48968541622161865, "learning_rate": 5.023695355689267e-06, "loss": 0.4206, "mean_token_accuracy": 0.858057975769043, "num_tokens": 1106876489.0, "step": 33040 }, { "epoch": 1.9726002865329513, "grad_norm": 0.546718955039978, "learning_rate": 5.023188604888156e-06, "loss": 0.479, "mean_token_accuracy": 0.8403674006462097, "num_tokens": 1107044169.0, "step": 33045 }, { "epoch": 1.972898758357211, "grad_norm": 0.4721791744232178, "learning_rate": 5.022687328729336e-06, "loss": 0.419, "mean_token_accuracy": 0.859185254573822, "num_tokens": 1107211849.0, "step": 33050 }, { "epoch": 1.9731972301814709, "grad_norm": 0.5139625072479248, "learning_rate": 5.022191527334902e-06, "loss": 0.4434, "mean_token_accuracy": 0.8497316002845764, "num_tokens": 1107379529.0, "step": 33055 }, { "epoch": 1.9734957020057307, "grad_norm": 0.5600574612617493, "learning_rate": 5.021701200825614e-06, "loss": 0.4695, "mean_token_accuracy": 0.843546462059021, "num_tokens": 1107547209.0, "step": 33060 }, { "epoch": 1.9737941738299905, "grad_norm": 0.5157943964004517, "learning_rate": 5.0212163493208995e-06, "loss": 0.4167, "mean_token_accuracy": 0.8595789074897766, "num_tokens": 1107714889.0, "step": 33065 }, { "epoch": 1.9740926456542502, "grad_norm": 0.5059912204742432, "learning_rate": 5.020736972938854e-06, "loss": 0.4472, "mean_token_accuracy": 0.848872709274292, "num_tokens": 1107882569.0, "step": 33070 }, { "epoch": 1.97439111747851, "grad_norm": 0.5495331287384033, "learning_rate": 5.0202630717962385e-06, "loss": 0.4162, "mean_token_accuracy": 0.8586782932281494, "num_tokens": 1108050249.0, "step": 33075 }, { "epoch": 1.9746895893027698, "grad_norm": 0.5167054533958435, "learning_rate": 5.01979464600848e-06, "loss": 0.4349, "mean_token_accuracy": 0.8533699154853821, "num_tokens": 1108217929.0, "step": 33080 }, { "epoch": 1.9749880611270296, "grad_norm": 0.4671654999256134, "learning_rate": 5.019331695689673e-06, "loss": 0.4025, "mean_token_accuracy": 0.8629547834396363, "num_tokens": 1108385609.0, "step": 33085 }, { "epoch": 1.9752865329512894, "grad_norm": 0.5264710783958435, "learning_rate": 5.018874220952578e-06, "loss": 0.4018, "mean_token_accuracy": 0.8641238331794738, "num_tokens": 1108553289.0, "step": 33090 }, { "epoch": 1.9755850047755492, "grad_norm": 0.4915428161621094, "learning_rate": 5.01842222190862e-06, "loss": 0.4233, "mean_token_accuracy": 0.8572289109230041, "num_tokens": 1108720969.0, "step": 33095 }, { "epoch": 1.975883476599809, "grad_norm": 0.580143392086029, "learning_rate": 5.017975698667892e-06, "loss": 0.4386, "mean_token_accuracy": 0.8535727143287659, "num_tokens": 1108888649.0, "step": 33100 }, { "epoch": 1.9761819484240688, "grad_norm": 0.5151272416114807, "learning_rate": 5.0175346513391506e-06, "loss": 0.423, "mean_token_accuracy": 0.8572527766227722, "num_tokens": 1109056329.0, "step": 33105 }, { "epoch": 1.9764804202483286, "grad_norm": 0.5739167332649231, "learning_rate": 5.017099080029827e-06, "loss": 0.4289, "mean_token_accuracy": 0.8560061931610108, "num_tokens": 1109224009.0, "step": 33110 }, { "epoch": 1.9767788920725884, "grad_norm": 0.5494800209999084, "learning_rate": 5.016668984846008e-06, "loss": 0.3928, "mean_token_accuracy": 0.8678158164024353, "num_tokens": 1109391689.0, "step": 33115 }, { "epoch": 1.9770773638968482, "grad_norm": 0.4673916697502136, "learning_rate": 5.016244365892452e-06, "loss": 0.4051, "mean_token_accuracy": 0.8637182235717773, "num_tokens": 1109559369.0, "step": 33120 }, { "epoch": 1.977375835721108, "grad_norm": 0.44602450728416443, "learning_rate": 5.0158252232725845e-06, "loss": 0.4042, "mean_token_accuracy": 0.8626327037811279, "num_tokens": 1109727049.0, "step": 33125 }, { "epoch": 1.9776743075453678, "grad_norm": 0.5123845338821411, "learning_rate": 5.015411557088493e-06, "loss": 0.4368, "mean_token_accuracy": 0.8515925168991089, "num_tokens": 1109894729.0, "step": 33130 }, { "epoch": 1.9779727793696273, "grad_norm": 0.5120893716812134, "learning_rate": 5.015003367440934e-06, "loss": 0.4247, "mean_token_accuracy": 0.8575330972671509, "num_tokens": 1110062409.0, "step": 33135 }, { "epoch": 1.9782712511938874, "grad_norm": 0.47194841504096985, "learning_rate": 5.014600654429334e-06, "loss": 0.3922, "mean_token_accuracy": 0.8669569253921509, "num_tokens": 1110230089.0, "step": 33140 }, { "epoch": 1.978569723018147, "grad_norm": 0.5535193085670471, "learning_rate": 5.014203418151775e-06, "loss": 0.4346, "mean_token_accuracy": 0.8549027800559997, "num_tokens": 1110397769.0, "step": 33145 }, { "epoch": 1.978868194842407, "grad_norm": 0.5228692293167114, "learning_rate": 5.013811658705015e-06, "loss": 0.4278, "mean_token_accuracy": 0.8584695219993591, "num_tokens": 1110565449.0, "step": 33150 }, { "epoch": 1.9791666666666665, "grad_norm": 0.553502082824707, "learning_rate": 5.01342537618447e-06, "loss": 0.4309, "mean_token_accuracy": 0.8556781649589539, "num_tokens": 1110733129.0, "step": 33155 }, { "epoch": 1.9794651384909265, "grad_norm": 0.5194050669670105, "learning_rate": 5.013044570684232e-06, "loss": 0.4332, "mean_token_accuracy": 0.8560121774673461, "num_tokens": 1110900809.0, "step": 33160 }, { "epoch": 1.979763610315186, "grad_norm": 0.5111545324325562, "learning_rate": 5.01266924229705e-06, "loss": 0.4112, "mean_token_accuracy": 0.860199224948883, "num_tokens": 1111068489.0, "step": 33165 }, { "epoch": 1.9800620821394461, "grad_norm": 0.4741707146167755, "learning_rate": 5.012299391114342e-06, "loss": 0.4181, "mean_token_accuracy": 0.8597339868545533, "num_tokens": 1111236169.0, "step": 33170 }, { "epoch": 1.9803605539637057, "grad_norm": 0.5976281762123108, "learning_rate": 5.011935017226192e-06, "loss": 0.4043, "mean_token_accuracy": 0.8634558081626892, "num_tokens": 1111403849.0, "step": 33175 }, { "epoch": 1.9806590257879657, "grad_norm": 0.5199026465415955, "learning_rate": 5.011576120721352e-06, "loss": 0.4067, "mean_token_accuracy": 0.8623583555221558, "num_tokens": 1111571529.0, "step": 33180 }, { "epoch": 1.9809574976122253, "grad_norm": 0.4533449113368988, "learning_rate": 5.0112227016872364e-06, "loss": 0.4223, "mean_token_accuracy": 0.8586782932281494, "num_tokens": 1111739209.0, "step": 33185 }, { "epoch": 1.9812559694364853, "grad_norm": 0.5162702798843384, "learning_rate": 5.010874760209927e-06, "loss": 0.4301, "mean_token_accuracy": 0.8562746047973633, "num_tokens": 1111906889.0, "step": 33190 }, { "epoch": 1.9815544412607449, "grad_norm": 0.5097695589065552, "learning_rate": 5.0105322963741725e-06, "loss": 0.4564, "mean_token_accuracy": 0.8489800810813903, "num_tokens": 1112074569.0, "step": 33195 }, { "epoch": 1.9818529130850049, "grad_norm": 0.49297916889190674, "learning_rate": 5.010195310263383e-06, "loss": 0.3998, "mean_token_accuracy": 0.8655552983283996, "num_tokens": 1112242249.0, "step": 33200 }, { "epoch": 1.9821513849092645, "grad_norm": 0.5222697854042053, "learning_rate": 5.0098638019596404e-06, "loss": 0.4177, "mean_token_accuracy": 0.8594477057456971, "num_tokens": 1112409929.0, "step": 33205 }, { "epoch": 1.9824498567335245, "grad_norm": 0.5047992467880249, "learning_rate": 5.00953777154369e-06, "loss": 0.418, "mean_token_accuracy": 0.8590838551521301, "num_tokens": 1112577609.0, "step": 33210 }, { "epoch": 1.982748328557784, "grad_norm": 0.5256698131561279, "learning_rate": 5.009217219094941e-06, "loss": 0.4375, "mean_token_accuracy": 0.8532148361206054, "num_tokens": 1112745289.0, "step": 33215 }, { "epoch": 1.983046800382044, "grad_norm": 0.5669993162155151, "learning_rate": 5.008902144691471e-06, "loss": 0.4578, "mean_token_accuracy": 0.8474114298820495, "num_tokens": 1112912969.0, "step": 33220 }, { "epoch": 1.9833452722063036, "grad_norm": 0.49203798174858093, "learning_rate": 5.008592548410022e-06, "loss": 0.4388, "mean_token_accuracy": 0.852105462551117, "num_tokens": 1113080649.0, "step": 33225 }, { "epoch": 1.9836437440305636, "grad_norm": 0.511444628238678, "learning_rate": 5.008288430326001e-06, "loss": 0.4418, "mean_token_accuracy": 0.8524275422096252, "num_tokens": 1113248329.0, "step": 33230 }, { "epoch": 1.9839422158548232, "grad_norm": 0.4787483513355255, "learning_rate": 5.007989790513481e-06, "loss": 0.4039, "mean_token_accuracy": 0.8646009922027588, "num_tokens": 1113416009.0, "step": 33235 }, { "epoch": 1.9842406876790832, "grad_norm": 0.49464672803878784, "learning_rate": 5.007696629045205e-06, "loss": 0.4124, "mean_token_accuracy": 0.8605809450149536, "num_tokens": 1113583689.0, "step": 33240 }, { "epoch": 1.9845391595033428, "grad_norm": 0.4983403980731964, "learning_rate": 5.0074089459925755e-06, "loss": 0.4435, "mean_token_accuracy": 0.8521472096443177, "num_tokens": 1113751369.0, "step": 33245 }, { "epoch": 1.9848376313276028, "grad_norm": 0.553785502910614, "learning_rate": 5.0071267414256605e-06, "loss": 0.4299, "mean_token_accuracy": 0.8551592469215393, "num_tokens": 1113919049.0, "step": 33250 }, { "epoch": 1.9851361031518624, "grad_norm": 0.503053605556488, "learning_rate": 5.006850015413201e-06, "loss": 0.4165, "mean_token_accuracy": 0.8602290272712707, "num_tokens": 1114086729.0, "step": 33255 }, { "epoch": 1.9854345749761224, "grad_norm": 0.6152521371841431, "learning_rate": 5.006578768022595e-06, "loss": 0.4511, "mean_token_accuracy": 0.850715720653534, "num_tokens": 1114254409.0, "step": 33260 }, { "epoch": 1.985733046800382, "grad_norm": 0.47155895829200745, "learning_rate": 5.00631299931991e-06, "loss": 0.4256, "mean_token_accuracy": 0.8568531632423401, "num_tokens": 1114422089.0, "step": 33265 }, { "epoch": 1.9860315186246418, "grad_norm": 0.45770949125289917, "learning_rate": 5.006052709369881e-06, "loss": 0.4136, "mean_token_accuracy": 0.8623821973800659, "num_tokens": 1114589769.0, "step": 33270 }, { "epoch": 1.9863299904489016, "grad_norm": 0.5494801998138428, "learning_rate": 5.005797898235906e-06, "loss": 0.4718, "mean_token_accuracy": 0.8433913826942444, "num_tokens": 1114757449.0, "step": 33275 }, { "epoch": 1.9866284622731614, "grad_norm": 0.5019585490226746, "learning_rate": 5.005548565980046e-06, "loss": 0.4449, "mean_token_accuracy": 0.850501012802124, "num_tokens": 1114925129.0, "step": 33280 }, { "epoch": 1.9869269340974212, "grad_norm": 0.49741071462631226, "learning_rate": 5.005304712663035e-06, "loss": 0.4449, "mean_token_accuracy": 0.8513837575912475, "num_tokens": 1115092809.0, "step": 33285 }, { "epoch": 1.987225405921681, "grad_norm": 0.4714791476726532, "learning_rate": 5.005066338344263e-06, "loss": 0.4112, "mean_token_accuracy": 0.8611952781677246, "num_tokens": 1115260489.0, "step": 33290 }, { "epoch": 1.9875238777459407, "grad_norm": 0.50583416223526, "learning_rate": 5.004833443081796e-06, "loss": 0.4276, "mean_token_accuracy": 0.8571155905723572, "num_tokens": 1115428169.0, "step": 33295 }, { "epoch": 1.9878223495702005, "grad_norm": 0.5397236943244934, "learning_rate": 5.004606026932356e-06, "loss": 0.4563, "mean_token_accuracy": 0.8470356583595275, "num_tokens": 1115595849.0, "step": 33300 }, { "epoch": 1.9881208213944603, "grad_norm": 0.5348944067955017, "learning_rate": 5.004384089951338e-06, "loss": 0.4351, "mean_token_accuracy": 0.8554216980934143, "num_tokens": 1115763529.0, "step": 33305 }, { "epoch": 1.9884192932187201, "grad_norm": 0.4901181757450104, "learning_rate": 5.004167632192794e-06, "loss": 0.4269, "mean_token_accuracy": 0.856817364692688, "num_tokens": 1115931209.0, "step": 33310 }, { "epoch": 1.98871776504298, "grad_norm": 0.48394614458084106, "learning_rate": 5.003956653709448e-06, "loss": 0.4253, "mean_token_accuracy": 0.8571633100509644, "num_tokens": 1116098889.0, "step": 33315 }, { "epoch": 1.9890162368672397, "grad_norm": 0.5433171391487122, "learning_rate": 5.003751154552688e-06, "loss": 0.4372, "mean_token_accuracy": 0.8538411021232605, "num_tokens": 1116266569.0, "step": 33320 }, { "epoch": 1.9893147086914995, "grad_norm": 0.5413157939910889, "learning_rate": 5.003551134772569e-06, "loss": 0.4102, "mean_token_accuracy": 0.8618573307991028, "num_tokens": 1116434249.0, "step": 33325 }, { "epoch": 1.9896131805157593, "grad_norm": 0.5299882888793945, "learning_rate": 5.003356594417809e-06, "loss": 0.4193, "mean_token_accuracy": 0.8580997228622437, "num_tokens": 1116601929.0, "step": 33330 }, { "epoch": 1.989911652340019, "grad_norm": 0.4970306158065796, "learning_rate": 5.0031675335357895e-06, "loss": 0.4112, "mean_token_accuracy": 0.8617618918418884, "num_tokens": 1116769609.0, "step": 33335 }, { "epoch": 1.990210124164279, "grad_norm": 0.47278285026550293, "learning_rate": 5.0029839521725605e-06, "loss": 0.426, "mean_token_accuracy": 0.858421802520752, "num_tokens": 1116937289.0, "step": 33340 }, { "epoch": 1.9905085959885387, "grad_norm": 0.4842239320278168, "learning_rate": 5.002805850372837e-06, "loss": 0.4031, "mean_token_accuracy": 0.8642132878303528, "num_tokens": 1117104969.0, "step": 33345 }, { "epoch": 1.9908070678127985, "grad_norm": 0.5508589744567871, "learning_rate": 5.002633228180002e-06, "loss": 0.4413, "mean_token_accuracy": 0.8531373023986817, "num_tokens": 1117272649.0, "step": 33350 }, { "epoch": 1.9911055396370583, "grad_norm": 0.49138855934143066, "learning_rate": 5.002466085636097e-06, "loss": 0.4588, "mean_token_accuracy": 0.8479780554771423, "num_tokens": 1117440329.0, "step": 33355 }, { "epoch": 1.991404011461318, "grad_norm": 0.5040826797485352, "learning_rate": 5.002304422781831e-06, "loss": 0.441, "mean_token_accuracy": 0.8517058372497559, "num_tokens": 1117608009.0, "step": 33360 }, { "epoch": 1.9917024832855779, "grad_norm": 0.4997211694717407, "learning_rate": 5.002148239656586e-06, "loss": 0.4086, "mean_token_accuracy": 0.8632410883903503, "num_tokens": 1117775689.0, "step": 33365 }, { "epoch": 1.9920009551098377, "grad_norm": 0.6780403852462769, "learning_rate": 5.001997536298399e-06, "loss": 0.4413, "mean_token_accuracy": 0.8516461968421936, "num_tokens": 1117943369.0, "step": 33370 }, { "epoch": 1.9922994269340975, "grad_norm": 0.5391092300415039, "learning_rate": 5.001852312743976e-06, "loss": 0.4375, "mean_token_accuracy": 0.8525408625602722, "num_tokens": 1118111049.0, "step": 33375 }, { "epoch": 1.9925978987583572, "grad_norm": 0.4946630001068115, "learning_rate": 5.001712569028694e-06, "loss": 0.4145, "mean_token_accuracy": 0.8602588534355163, "num_tokens": 1118278729.0, "step": 33380 }, { "epoch": 1.992896370582617, "grad_norm": 0.5029065608978271, "learning_rate": 5.001578305186583e-06, "loss": 0.439, "mean_token_accuracy": 0.8532923817634582, "num_tokens": 1118446409.0, "step": 33385 }, { "epoch": 1.9931948424068768, "grad_norm": 0.5339170694351196, "learning_rate": 5.00144952125035e-06, "loss": 0.4476, "mean_token_accuracy": 0.849886679649353, "num_tokens": 1118614089.0, "step": 33390 }, { "epoch": 1.9934933142311366, "grad_norm": 0.5761162638664246, "learning_rate": 5.001326217251364e-06, "loss": 0.4348, "mean_token_accuracy": 0.8539246082305908, "num_tokens": 1118781769.0, "step": 33395 }, { "epoch": 1.9937917860553964, "grad_norm": 0.4588549733161926, "learning_rate": 5.001208393219653e-06, "loss": 0.4167, "mean_token_accuracy": 0.8589824676513672, "num_tokens": 1118949449.0, "step": 33400 }, { "epoch": 1.9940902578796562, "grad_norm": 0.5122301578521729, "learning_rate": 5.001096049183919e-06, "loss": 0.4492, "mean_token_accuracy": 0.8490909337997437, "num_tokens": 1119115055.0, "step": 33405 }, { "epoch": 1.9943887297039158, "grad_norm": 0.5342676043510437, "learning_rate": 5.000989185171524e-06, "loss": 0.4385, "mean_token_accuracy": 0.8538709282875061, "num_tokens": 1119282735.0, "step": 33410 }, { "epoch": 1.9946872015281758, "grad_norm": 0.4799768328666687, "learning_rate": 5.000887801208497e-06, "loss": 0.4652, "mean_token_accuracy": 0.8442085146903991, "num_tokens": 1119450415.0, "step": 33415 }, { "epoch": 1.9949856733524354, "grad_norm": 0.5027874708175659, "learning_rate": 5.000791897319534e-06, "loss": 0.406, "mean_token_accuracy": 0.8625551700592041, "num_tokens": 1119618095.0, "step": 33420 }, { "epoch": 1.9952841451766954, "grad_norm": 0.5278739929199219, "learning_rate": 5.0007014735279895e-06, "loss": 0.4203, "mean_token_accuracy": 0.8578969359397888, "num_tokens": 1119785775.0, "step": 33425 }, { "epoch": 1.995582617000955, "grad_norm": 0.5141813158988953, "learning_rate": 5.000616529855894e-06, "loss": 0.4572, "mean_token_accuracy": 0.8492305874824524, "num_tokens": 1119953455.0, "step": 33430 }, { "epoch": 1.995881088825215, "grad_norm": 0.6113985776901245, "learning_rate": 5.000537066323929e-06, "loss": 0.4255, "mean_token_accuracy": 0.8578372836112976, "num_tokens": 1120121135.0, "step": 33435 }, { "epoch": 1.9961795606494745, "grad_norm": 0.5158660411834717, "learning_rate": 5.000463082951458e-06, "loss": 0.4125, "mean_token_accuracy": 0.8619825959205627, "num_tokens": 1120288815.0, "step": 33440 }, { "epoch": 1.9964780324737346, "grad_norm": 0.49674826860427856, "learning_rate": 5.000394579756494e-06, "loss": 0.4249, "mean_token_accuracy": 0.8567219376564026, "num_tokens": 1120456495.0, "step": 33445 }, { "epoch": 1.9967765042979941, "grad_norm": 0.6172782778739929, "learning_rate": 5.000331556755729e-06, "loss": 0.4158, "mean_token_accuracy": 0.8608806848526, "num_tokens": 1120618264.0, "step": 33450 }, { "epoch": 1.9970749761222542, "grad_norm": 0.5504373908042908, "learning_rate": 5.000274013964507e-06, "loss": 0.4341, "mean_token_accuracy": 0.8539544343948364, "num_tokens": 1120785944.0, "step": 33455 }, { "epoch": 1.9973734479465137, "grad_norm": 0.4685095548629761, "learning_rate": 5.000221951396845e-06, "loss": 0.4103, "mean_token_accuracy": 0.8622092366218567, "num_tokens": 1120953624.0, "step": 33460 }, { "epoch": 1.9976719197707737, "grad_norm": 0.4722028076648712, "learning_rate": 5.000175369065427e-06, "loss": 0.42, "mean_token_accuracy": 0.8578611493110657, "num_tokens": 1121121304.0, "step": 33465 }, { "epoch": 1.9979703915950333, "grad_norm": 0.4826979339122772, "learning_rate": 5.000134266981596e-06, "loss": 0.3938, "mean_token_accuracy": 0.8672253370285035, "num_tokens": 1121288984.0, "step": 33470 }, { "epoch": 1.9982688634192933, "grad_norm": 0.4782162308692932, "learning_rate": 5.000098645155365e-06, "loss": 0.4515, "mean_token_accuracy": 0.8497673869132996, "num_tokens": 1121456664.0, "step": 33475 }, { "epoch": 1.998567335243553, "grad_norm": 0.48977309465408325, "learning_rate": 5.0000685035954095e-06, "loss": 0.4259, "mean_token_accuracy": 0.8563461780548096, "num_tokens": 1121624344.0, "step": 33480 }, { "epoch": 1.998865807067813, "grad_norm": 0.44192495942115784, "learning_rate": 5.000043842309071e-06, "loss": 0.4209, "mean_token_accuracy": 0.8587379336357117, "num_tokens": 1121792024.0, "step": 33485 }, { "epoch": 1.9991642788920725, "grad_norm": 0.4938274621963501, "learning_rate": 5.000024661302356e-06, "loss": 0.4278, "mean_token_accuracy": 0.8574078559875489, "num_tokens": 1121959704.0, "step": 33490 }, { "epoch": 1.9994627507163325, "grad_norm": 0.4794931411743164, "learning_rate": 5.000010960579939e-06, "loss": 0.4223, "mean_token_accuracy": 0.8573720693588257, "num_tokens": 1122127384.0, "step": 33495 }, { "epoch": 1.999761222540592, "grad_norm": 0.4582333564758301, "learning_rate": 5.0000027401451516e-06, "loss": 0.4157, "mean_token_accuracy": 0.8597220659255982, "num_tokens": 1122295064.0, "step": 33500 }, { "epoch": 2.0, "step": 33504, "total_flos": 8.833714643953254e+18, "train_loss": 0.0, "train_runtime": 2.2868, "train_samples_per_second": 58602.159, "train_steps_per_second": 14651.196 } ], "logging_steps": 5, "max_steps": 33504, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 2000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 8.833714643953254e+18, "train_batch_size": 2, "trial_name": null, "trial_params": null }