{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 500, "global_step": 486, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.030959752321981424, "grad_norm": 37.88094364827914, "learning_rate": 1.6326530612244896e-07, "loss": 2.012, "mean_token_accuracy": 0.6721480131149292, "num_tokens": 356347.0, "step": 5 }, { "epoch": 0.06191950464396285, "grad_norm": 33.96281709708476, "learning_rate": 3.673469387755102e-07, "loss": 1.9687, "mean_token_accuracy": 0.6716048419475555, "num_tokens": 710036.0, "step": 10 }, { "epoch": 0.09287925696594428, "grad_norm": 22.65005612491039, "learning_rate": 5.714285714285714e-07, "loss": 1.7907, "mean_token_accuracy": 0.68106742699941, "num_tokens": 1066084.0, "step": 15 }, { "epoch": 0.1238390092879257, "grad_norm": 12.825394787025013, "learning_rate": 7.755102040816326e-07, "loss": 1.3465, "mean_token_accuracy": 0.7263136367003123, "num_tokens": 1418691.0, "step": 20 }, { "epoch": 0.15479876160990713, "grad_norm": 4.82390687349429, "learning_rate": 9.795918367346939e-07, "loss": 1.105, "mean_token_accuracy": 0.7517302592595418, "num_tokens": 1773596.0, "step": 25 }, { "epoch": 0.18575851393188855, "grad_norm": 2.138743079455233, "learning_rate": 1.183673469387755e-06, "loss": 0.9184, "mean_token_accuracy": 0.7795046945412953, "num_tokens": 2129005.0, "step": 30 }, { "epoch": 0.21671826625386997, "grad_norm": 1.56843353818363, "learning_rate": 1.3877551020408162e-06, "loss": 0.8507, "mean_token_accuracy": 0.7924608170986176, "num_tokens": 2484075.0, "step": 35 }, { "epoch": 0.2476780185758514, "grad_norm": 1.3036043605324696, "learning_rate": 1.5918367346938775e-06, "loss": 0.7988, "mean_token_accuracy": 0.802043096224467, "num_tokens": 2838384.0, "step": 40 }, { "epoch": 0.2786377708978328, "grad_norm": 1.1700385198630212, "learning_rate": 1.7959183673469386e-06, "loss": 0.7936, "mean_token_accuracy": 0.7999271392822266, "num_tokens": 3192492.0, "step": 45 }, { "epoch": 0.30959752321981426, "grad_norm": 1.1487890193406949, "learning_rate": 2e-06, "loss": 0.7533, "mean_token_accuracy": 0.8103362222512563, "num_tokens": 3545097.0, "step": 50 }, { "epoch": 0.34055727554179566, "grad_norm": 1.162010071560409, "learning_rate": 1.9993540481842407e-06, "loss": 0.8066, "mean_token_accuracy": 0.7923434535662334, "num_tokens": 3903788.0, "step": 55 }, { "epoch": 0.3715170278637771, "grad_norm": 1.2349108637093285, "learning_rate": 1.9974170272444602e-06, "loss": 0.7724, "mean_token_accuracy": 0.8026398738225301, "num_tokens": 4259188.0, "step": 60 }, { "epoch": 0.4024767801857585, "grad_norm": 1.1955598435745196, "learning_rate": 1.9941914396250445e-06, "loss": 0.7695, "mean_token_accuracy": 0.8047589619954427, "num_tokens": 4613362.0, "step": 65 }, { "epoch": 0.43343653250773995, "grad_norm": 1.0813062584413962, "learning_rate": 1.9896814524743527e-06, "loss": 0.7418, "mean_token_accuracy": 0.8074904640515645, "num_tokens": 4968926.0, "step": 70 }, { "epoch": 0.46439628482972134, "grad_norm": 1.2490538133285465, "learning_rate": 1.983892892261163e-06, "loss": 0.782, "mean_token_accuracy": 0.7980295320351919, "num_tokens": 5326662.0, "step": 75 }, { "epoch": 0.4953560371517028, "grad_norm": 1.1648094821501256, "learning_rate": 1.9768332372474366e-06, "loss": 0.781, "mean_token_accuracy": 0.7981564621130626, "num_tokens": 5683326.0, "step": 80 }, { "epoch": 0.5263157894736842, "grad_norm": 1.153658574148309, "learning_rate": 1.9685116078271223e-06, "loss": 0.7208, "mean_token_accuracy": 0.8101318061351777, "num_tokens": 6037360.0, "step": 85 }, { "epoch": 0.5572755417956656, "grad_norm": 1.0499709465738478, "learning_rate": 1.958938754743489e-06, "loss": 0.7171, "mean_token_accuracy": 0.8120081921418508, "num_tokens": 6391493.0, "step": 90 }, { "epoch": 0.5882352941176471, "grad_norm": 1.1336114333093723, "learning_rate": 1.9481270452001986e-06, "loss": 0.7145, "mean_token_accuracy": 0.8118303279081981, "num_tokens": 6744880.0, "step": 95 }, { "epoch": 0.6191950464396285, "grad_norm": 1.1623292661065792, "learning_rate": 1.9360904468840735e-06, "loss": 0.7779, "mean_token_accuracy": 0.7967695931593577, "num_tokens": 7104675.0, "step": 100 }, { "epoch": 0.6501547987616099, "grad_norm": 1.1258828367181777, "learning_rate": 1.92284450992019e-06, "loss": 0.6596, "mean_token_accuracy": 0.8252765933672587, "num_tokens": 7456185.0, "step": 105 }, { "epoch": 0.6811145510835913, "grad_norm": 1.0529210280076258, "learning_rate": 1.9084063467826133e-06, "loss": 0.7169, "mean_token_accuracy": 0.807880413532257, "num_tokens": 7810532.0, "step": 110 }, { "epoch": 0.7120743034055728, "grad_norm": 0.9981313212919131, "learning_rate": 1.8927946101867344e-06, "loss": 0.7207, "mean_token_accuracy": 0.8063897867997487, "num_tokens": 8166048.0, "step": 115 }, { "epoch": 0.7430340557275542, "grad_norm": 1.1009837013665933, "learning_rate": 1.8760294689917554e-06, "loss": 0.7037, "mean_token_accuracy": 0.8074031293392181, "num_tokens": 8523864.0, "step": 120 }, { "epoch": 0.7739938080495357, "grad_norm": 1.0150983601738315, "learning_rate": 1.858132582144469e-06, "loss": 0.6768, "mean_token_accuracy": 0.8155300041039785, "num_tokens": 8875663.0, "step": 125 }, { "epoch": 0.804953560371517, "grad_norm": 0.9926457956250347, "learning_rate": 1.8391270706979861e-06, "loss": 0.6913, "mean_token_accuracy": 0.8103034933408101, "num_tokens": 9230905.0, "step": 130 }, { "epoch": 0.8359133126934984, "grad_norm": 1.0928972897431848, "learning_rate": 1.819037487941563e-06, "loss": 0.7202, "mean_token_accuracy": 0.8045949776967366, "num_tokens": 9587132.0, "step": 135 }, { "epoch": 0.8668730650154799, "grad_norm": 1.0669887753572231, "learning_rate": 1.7978897876801188e-06, "loss": 0.7363, "mean_token_accuracy": 0.8004952649275462, "num_tokens": 9943019.0, "step": 140 }, { "epoch": 0.8978328173374613, "grad_norm": 0.9764132937473873, "learning_rate": 1.7757112907044198e-06, "loss": 0.7099, "mean_token_accuracy": 0.8056580940882365, "num_tokens": 10299363.0, "step": 145 }, { "epoch": 0.9287925696594427, "grad_norm": 0.9521500960876352, "learning_rate": 1.7525306494952496e-06, "loss": 0.6723, "mean_token_accuracy": 0.8147999107837677, "num_tokens": 10653205.0, "step": 150 }, { "epoch": 0.9597523219814241, "grad_norm": 0.9014288610770413, "learning_rate": 1.728377811207168e-06, "loss": 0.6682, "mean_token_accuracy": 0.8167306999365489, "num_tokens": 11006084.0, "step": 155 }, { "epoch": 0.9907120743034056, "grad_norm": 0.9887395976242098, "learning_rate": 1.7032839789796709e-06, "loss": 0.6729, "mean_token_accuracy": 0.812424510717392, "num_tokens": 11362461.0, "step": 160 }, { "epoch": 1.0185758513931888, "grad_norm": 1.0123568191837427, "learning_rate": 1.6772815716257411e-06, "loss": 0.6889, "mean_token_accuracy": 0.812437218648416, "num_tokens": 11684418.0, "step": 165 }, { "epoch": 1.0495356037151702, "grad_norm": 1.2247818299294482, "learning_rate": 1.6504041817498676e-06, "loss": 0.6003, "mean_token_accuracy": 0.8325801193714142, "num_tokens": 12038916.0, "step": 170 }, { "epoch": 1.0804953560371517, "grad_norm": 1.1309870189119007, "learning_rate": 1.622686532349637e-06, "loss": 0.595, "mean_token_accuracy": 0.8306204895178477, "num_tokens": 12392491.0, "step": 175 }, { "epoch": 1.111455108359133, "grad_norm": 1.0261193684060383, "learning_rate": 1.5941644319569663e-06, "loss": 0.5745, "mean_token_accuracy": 0.8364119688669841, "num_tokens": 12746211.0, "step": 180 }, { "epoch": 1.1424148606811146, "grad_norm": 1.065208049531987, "learning_rate": 1.5648747283769316e-06, "loss": 0.6384, "mean_token_accuracy": 0.821829471985499, "num_tokens": 13105142.0, "step": 185 }, { "epoch": 1.173374613003096, "grad_norm": 1.049060212665513, "learning_rate": 1.5348552610839538e-06, "loss": 0.583, "mean_token_accuracy": 0.8339940627415975, "num_tokens": 13459616.0, "step": 190 }, { "epoch": 1.2043343653250773, "grad_norm": 1.0646992389936152, "learning_rate": 1.5041448123368452e-06, "loss": 0.585, "mean_token_accuracy": 0.8339759588241578, "num_tokens": 13813912.0, "step": 195 }, { "epoch": 1.2352941176470589, "grad_norm": 1.0502983258770635, "learning_rate": 1.4727830570758676e-06, "loss": 0.6229, "mean_token_accuracy": 0.8257229665915171, "num_tokens": 14170920.0, "step": 200 }, { "epoch": 1.2662538699690402, "grad_norm": 1.0418470874061867, "learning_rate": 1.4408105116665333e-06, "loss": 0.5806, "mean_token_accuracy": 0.836287780602773, "num_tokens": 14524500.0, "step": 205 }, { "epoch": 1.2972136222910216, "grad_norm": 1.087294144738772, "learning_rate": 1.4082684815563658e-06, "loss": 0.5956, "mean_token_accuracy": 0.8319136381149292, "num_tokens": 14880512.0, "step": 210 }, { "epoch": 1.328173374613003, "grad_norm": 1.101276572209567, "learning_rate": 1.375199007912241e-06, "loss": 0.6054, "mean_token_accuracy": 0.8299936970074971, "num_tokens": 15237315.0, "step": 215 }, { "epoch": 1.3591331269349844, "grad_norm": 1.010589916838959, "learning_rate": 1.3416448133072523e-06, "loss": 0.5881, "mean_token_accuracy": 0.8364007751146952, "num_tokens": 15589797.0, "step": 220 }, { "epoch": 1.390092879256966, "grad_norm": 1.0622699241655902, "learning_rate": 1.307649246527263e-06, "loss": 0.5995, "mean_token_accuracy": 0.8305212179819743, "num_tokens": 15946015.0, "step": 225 }, { "epoch": 1.4210526315789473, "grad_norm": 1.1594145716237165, "learning_rate": 1.273256226568451e-06, "loss": 0.6167, "mean_token_accuracy": 0.8282975077629089, "num_tokens": 16300916.0, "step": 230 }, { "epoch": 1.4520123839009287, "grad_norm": 1.051260396120812, "learning_rate": 1.2385101858982004e-06, "loss": 0.6137, "mean_token_accuracy": 0.8268493433793386, "num_tokens": 16658826.0, "step": 235 }, { "epoch": 1.4829721362229102, "grad_norm": 1.1511715900211696, "learning_rate": 1.203456013052634e-06, "loss": 0.6359, "mean_token_accuracy": 0.8252548217773438, "num_tokens": 17016876.0, "step": 240 }, { "epoch": 1.5139318885448918, "grad_norm": 1.0811656373656746, "learning_rate": 1.1681389946449502e-06, "loss": 0.5956, "mean_token_accuracy": 0.8341775079568227, "num_tokens": 17371949.0, "step": 245 }, { "epoch": 1.5448916408668731, "grad_norm": 1.0456087268969014, "learning_rate": 1.132604756859485e-06, "loss": 0.5821, "mean_token_accuracy": 0.836975779136022, "num_tokens": 17724910.0, "step": 250 }, { "epoch": 1.5758513931888545, "grad_norm": 1.1464877768864348, "learning_rate": 1.0968992065070768e-06, "loss": 0.6304, "mean_token_accuracy": 0.8253893832365672, "num_tokens": 18082483.0, "step": 255 }, { "epoch": 1.6068111455108358, "grad_norm": 1.0740762703373092, "learning_rate": 1.0610684717178905e-06, "loss": 0.6069, "mean_token_accuracy": 0.8298774818579356, "num_tokens": 18438484.0, "step": 260 }, { "epoch": 1.6377708978328174, "grad_norm": 1.058918476787192, "learning_rate": 1.0251588423483204e-06, "loss": 0.5919, "mean_token_accuracy": 0.831935566663742, "num_tokens": 18794196.0, "step": 265 }, { "epoch": 1.6687306501547987, "grad_norm": 1.0594158390719453, "learning_rate": 9.892167101789563e-07, "loss": 0.5799, "mean_token_accuracy": 0.8371186554431915, "num_tokens": 19147172.0, "step": 270 }, { "epoch": 1.6996904024767803, "grad_norm": 1.0350217488731246, "learning_rate": 9.532885089808712e-07, "loss": 0.5707, "mean_token_accuracy": 0.8371800223986308, "num_tokens": 19500395.0, "step": 275 }, { "epoch": 1.7306501547987616, "grad_norm": 1.1043252167652924, "learning_rate": 9.174206545276677e-07, "loss": 0.5919, "mean_token_accuracy": 0.833638709783554, "num_tokens": 19854642.0, "step": 280 }, { "epoch": 1.761609907120743, "grad_norm": 1.0594957855862892, "learning_rate": 8.81659484630768e-07, "loss": 0.6144, "mean_token_accuracy": 0.8295779307683309, "num_tokens": 20210201.0, "step": 285 }, { "epoch": 1.7925696594427245, "grad_norm": 1.0655593034762212, "learning_rate": 8.460511992754299e-07, "loss": 0.6008, "mean_token_accuracy": 0.8292633573214213, "num_tokens": 20564970.0, "step": 290 }, { "epoch": 1.8235294117647058, "grad_norm": 1.0925951219291423, "learning_rate": 8.106418009348156e-07, "loss": 0.5495, "mean_token_accuracy": 0.8422843952973683, "num_tokens": 20916569.0, "step": 295 }, { "epoch": 1.8544891640866874, "grad_norm": 1.066475477830169, "learning_rate": 7.75477035139231e-07, "loss": 0.592, "mean_token_accuracy": 0.8322900295257568, "num_tokens": 21270517.0, "step": 300 }, { "epoch": 1.8854489164086687, "grad_norm": 1.0726398714625474, "learning_rate": 7.406023313773097e-07, "loss": 0.5846, "mean_token_accuracy": 0.833445531129837, "num_tokens": 21626435.0, "step": 305 }, { "epoch": 1.91640866873065, "grad_norm": 1.0081168980943895, "learning_rate": 7.060627444054893e-07, "loss": 0.599, "mean_token_accuracy": 0.8309976756572723, "num_tokens": 21981684.0, "step": 310 }, { "epoch": 1.9473684210526314, "grad_norm": 1.1440591625457015, "learning_rate": 6.719028960416098e-07, "loss": 0.5952, "mean_token_accuracy": 0.8320066591103872, "num_tokens": 22335994.0, "step": 315 }, { "epoch": 1.978328173374613, "grad_norm": 1.0647370303589478, "learning_rate": 6.381669175178248e-07, "loss": 0.5769, "mean_token_accuracy": 0.8349888563156128, "num_tokens": 22690758.0, "step": 320 }, { "epoch": 2.0061919504643964, "grad_norm": 1.4685059022343072, "learning_rate": 6.048983924673022e-07, "loss": 0.6154, "mean_token_accuracy": 0.8296286706571225, "num_tokens": 23012996.0, "step": 325 }, { "epoch": 2.0371517027863777, "grad_norm": 1.0756981234282057, "learning_rate": 5.72140300618369e-07, "loss": 0.548, "mean_token_accuracy": 0.845786041021347, "num_tokens": 23369489.0, "step": 330 }, { "epoch": 2.068111455108359, "grad_norm": 1.22987487372924, "learning_rate": 5.399349622688478e-07, "loss": 0.5536, "mean_token_accuracy": 0.8429702619711558, "num_tokens": 23727840.0, "step": 335 }, { "epoch": 2.0990712074303404, "grad_norm": 1.0621757442951574, "learning_rate": 5.083239836123059e-07, "loss": 0.5258, "mean_token_accuracy": 0.8519696414470672, "num_tokens": 24082355.0, "step": 340 }, { "epoch": 2.130030959752322, "grad_norm": 1.1425718017062751, "learning_rate": 4.773482029868656e-07, "loss": 0.5293, "mean_token_accuracy": 0.8492769340674082, "num_tokens": 24438493.0, "step": 345 }, { "epoch": 2.1609907120743035, "grad_norm": 1.1312756633475332, "learning_rate": 4.4704763811600643e-07, "loss": 0.5308, "mean_token_accuracy": 0.8488172392050425, "num_tokens": 24792322.0, "step": 350 }, { "epoch": 2.191950464396285, "grad_norm": 1.2842356289182306, "learning_rate": 4.174614344095213e-07, "loss": 0.5655, "mean_token_accuracy": 0.8417594293753307, "num_tokens": 25151009.0, "step": 355 }, { "epoch": 2.222910216718266, "grad_norm": 1.1702297440144753, "learning_rate": 3.886278143914219e-07, "loss": 0.5364, "mean_token_accuracy": 0.8470952173074087, "num_tokens": 25507520.0, "step": 360 }, { "epoch": 2.2538699690402475, "grad_norm": 1.3759095012811366, "learning_rate": 3.605840283201195e-07, "loss": 0.5599, "mean_token_accuracy": 0.8408537685871125, "num_tokens": 25864215.0, "step": 365 }, { "epoch": 2.2848297213622293, "grad_norm": 1.1193461931741189, "learning_rate": 3.333663060646813e-07, "loss": 0.49, "mean_token_accuracy": 0.859304424126943, "num_tokens": 26216877.0, "step": 370 }, { "epoch": 2.3157894736842106, "grad_norm": 1.1648036361258658, "learning_rate": 3.0700981029933016e-07, "loss": 0.4931, "mean_token_accuracy": 0.8578304747740427, "num_tokens": 26573016.0, "step": 375 }, { "epoch": 2.346749226006192, "grad_norm": 1.1405575574665878, "learning_rate": 2.8154859107665987e-07, "loss": 0.4917, "mean_token_accuracy": 0.8590823928515117, "num_tokens": 26925170.0, "step": 380 }, { "epoch": 2.3777089783281733, "grad_norm": 1.1724816632075952, "learning_rate": 2.5701554183824724e-07, "loss": 0.5237, "mean_token_accuracy": 0.8520345091819763, "num_tokens": 27279422.0, "step": 385 }, { "epoch": 2.4086687306501546, "grad_norm": 1.1357266729474, "learning_rate": 2.3344235691949476e-07, "loss": 0.4672, "mean_token_accuracy": 0.8642761449019114, "num_tokens": 27630575.0, "step": 390 }, { "epoch": 2.4396284829721364, "grad_norm": 1.2409266497871647, "learning_rate": 2.1085949060360653e-07, "loss": 0.5369, "mean_token_accuracy": 0.849452143907547, "num_tokens": 27984763.0, "step": 395 }, { "epoch": 2.4705882352941178, "grad_norm": 1.188481006905719, "learning_rate": 1.8929611777758525e-07, "loss": 0.5212, "mean_token_accuracy": 0.8504625717798869, "num_tokens": 28339838.0, "step": 400 }, { "epoch": 2.501547987616099, "grad_norm": 1.1055493889153472, "learning_rate": 1.6878009624109312e-07, "loss": 0.5062, "mean_token_accuracy": 0.8539404590924581, "num_tokens": 28694134.0, "step": 405 }, { "epoch": 2.5325077399380804, "grad_norm": 1.0702929024634484, "learning_rate": 1.493379307168573e-07, "loss": 0.5392, "mean_token_accuracy": 0.8472303132216136, "num_tokens": 29049115.0, "step": 410 }, { "epoch": 2.5634674922600618, "grad_norm": 1.2498891293489098, "learning_rate": 1.3099473860912325e-07, "loss": 0.5218, "mean_token_accuracy": 0.8514606674512227, "num_tokens": 29404233.0, "step": 415 }, { "epoch": 2.594427244582043, "grad_norm": 1.135112532376098, "learning_rate": 1.1377421755438832e-07, "loss": 0.5072, "mean_token_accuracy": 0.8529640992482503, "num_tokens": 29759031.0, "step": 420 }, { "epoch": 2.625386996904025, "grad_norm": 1.228152423413386, "learning_rate": 9.769861480633979e-08, "loss": 0.5377, "mean_token_accuracy": 0.8478512247403462, "num_tokens": 30116017.0, "step": 425 }, { "epoch": 2.656346749226006, "grad_norm": 1.1448683622452198, "learning_rate": 8.278869849454717e-08, "loss": 0.5106, "mean_token_accuracy": 0.8535682797431946, "num_tokens": 30469452.0, "step": 430 }, { "epoch": 2.6873065015479876, "grad_norm": 1.1448594360250504, "learning_rate": 6.906373079403849e-08, "loss": 0.4842, "mean_token_accuracy": 0.8605853617191315, "num_tokens": 30822195.0, "step": 435 }, { "epoch": 2.718266253869969, "grad_norm": 1.1485381285009257, "learning_rate": 5.6541443040429295e-08, "loss": 0.5117, "mean_token_accuracy": 0.8541167537371318, "num_tokens": 31177700.0, "step": 440 }, { "epoch": 2.7492260061919502, "grad_norm": 1.1004929101776235, "learning_rate": 4.523801282274286e-08, "loss": 0.5198, "mean_token_accuracy": 0.8509711424509684, "num_tokens": 31534499.0, "step": 445 }, { "epoch": 2.780185758513932, "grad_norm": 1.235343825874549, "learning_rate": 3.5168043083526274e-08, "loss": 0.5151, "mean_token_accuracy": 0.8518358329931895, "num_tokens": 31890705.0, "step": 450 }, { "epoch": 2.8111455108359134, "grad_norm": 1.2048990404018094, "learning_rate": 2.634454325325497e-08, "loss": 0.5085, "mean_token_accuracy": 0.8524168650309245, "num_tokens": 32246784.0, "step": 455 }, { "epoch": 2.8421052631578947, "grad_norm": 1.1330314224549871, "learning_rate": 1.877891244340224e-08, "loss": 0.5141, "mean_token_accuracy": 0.853009025255839, "num_tokens": 32601413.0, "step": 460 }, { "epoch": 2.873065015479876, "grad_norm": 1.1251032096069928, "learning_rate": 1.2480924719885932e-08, "loss": 0.4948, "mean_token_accuracy": 0.8585106293360393, "num_tokens": 32955057.0, "step": 465 }, { "epoch": 2.9040247678018574, "grad_norm": 1.3402807034322533, "learning_rate": 7.45871647591756e-09, "loss": 0.5186, "mean_token_accuracy": 0.8510903239250183, "num_tokens": 33310606.0, "step": 470 }, { "epoch": 2.934984520123839, "grad_norm": 1.1610151530941533, "learning_rate": 3.7187759205656864e-09, "loss": 0.5312, "mean_token_accuracy": 0.8494451999664306, "num_tokens": 33664584.0, "step": 475 }, { "epoch": 2.9659442724458205, "grad_norm": 1.2159889934118617, "learning_rate": 1.2659346966152895e-09, "loss": 0.5202, "mean_token_accuracy": 0.8501146256923675, "num_tokens": 34019473.0, "step": 480 }, { "epoch": 2.996904024767802, "grad_norm": 1.1268577396776651, "learning_rate": 1.0336163855129143e-10, "loss": 0.524, "mean_token_accuracy": 0.8500737905502319, "num_tokens": 34375925.0, "step": 485 }, { "epoch": 3.0, "mean_token_accuracy": 0.8452582756678263, "num_tokens": 34411845.0, "step": 486, "total_flos": 131252961812480.0, "train_loss": 0.6672706213998206, "train_runtime": 11583.057, "train_samples_per_second": 4.014, "train_steps_per_second": 0.042 } ], "logging_steps": 5, "max_steps": 486, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 131252961812480.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }