{ "best_metric": 1.84632552, "best_model_checkpoint": "/home/anubhab-pg/sm745052/swift/exp_output/v0-20250508-111158/checkpoint-4000", "epoch": 3.0, "eval_steps": 500, "global_step": 4944, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0006067961165048543, "grad_norm": 11.652853965759277, "learning_rate": 9.999998990554643e-05, "loss": 4.739078998565674, "memory(GiB)": 15.0, "step": 1, "token_acc": 0.29411764705882354, "train_speed(iter/s)": 0.121659 }, { "epoch": 0.003033980582524272, "grad_norm": 5.216057300567627, "learning_rate": 9.999974763886429e-05, "loss": 3.816542148590088, "memory(GiB)": 38.29, "step": 5, "token_acc": 0.3445692883895131, "train_speed(iter/s)": 0.246604 }, { "epoch": 0.006067961165048544, "grad_norm": 6.8135881423950195, "learning_rate": 9.999899055800455e-05, "loss": 2.9166372299194334, "memory(GiB)": 45.79, "step": 10, "token_acc": 0.3940298507462687, "train_speed(iter/s)": 0.273194 }, { "epoch": 0.009101941747572815, "grad_norm": 7.265444278717041, "learning_rate": 9.99977287650631e-05, "loss": 2.9143123626708984, "memory(GiB)": 45.79, "step": 15, "token_acc": 0.42483660130718953, "train_speed(iter/s)": 0.300436 }, { "epoch": 0.012135922330097087, "grad_norm": 4.8003153800964355, "learning_rate": 9.999596227277707e-05, "loss": 2.5010822296142576, "memory(GiB)": 46.14, "step": 20, "token_acc": 0.4642857142857143, "train_speed(iter/s)": 0.312584 }, { "epoch": 0.01516990291262136, "grad_norm": 7.243627548217773, "learning_rate": 9.999369109897819e-05, "loss": 2.8699390411376955, "memory(GiB)": 46.51, "step": 25, "token_acc": 0.4406779661016949, "train_speed(iter/s)": 0.329126 }, { "epoch": 0.01820388349514563, "grad_norm": 4.523545265197754, "learning_rate": 9.999091526659272e-05, "loss": 2.710918998718262, "memory(GiB)": 46.51, "step": 30, "token_acc": 0.45722713864306785, "train_speed(iter/s)": 0.330957 }, { "epoch": 0.021237864077669904, "grad_norm": 4.85239315032959, "learning_rate": 9.998763480364113e-05, "loss": 2.425708770751953, "memory(GiB)": 46.51, "step": 35, "token_acc": 0.4533333333333333, "train_speed(iter/s)": 0.326065 }, { "epoch": 0.024271844660194174, "grad_norm": 4.216326713562012, "learning_rate": 9.99838497432379e-05, "loss": 2.6875373840332033, "memory(GiB)": 46.51, "step": 40, "token_acc": 0.43343653250773995, "train_speed(iter/s)": 0.326551 }, { "epoch": 0.027305825242718445, "grad_norm": 7.5979533195495605, "learning_rate": 9.997956012359109e-05, "loss": 2.381887435913086, "memory(GiB)": 46.51, "step": 45, "token_acc": 0.5, "train_speed(iter/s)": 0.326383 }, { "epoch": 0.03033980582524272, "grad_norm": 3.4132895469665527, "learning_rate": 9.997476598800203e-05, "loss": 2.228038024902344, "memory(GiB)": 46.51, "step": 50, "token_acc": 0.4838709677419355, "train_speed(iter/s)": 0.332111 }, { "epoch": 0.03337378640776699, "grad_norm": 5.495943069458008, "learning_rate": 9.99694673848649e-05, "loss": 2.1937400817871096, "memory(GiB)": 46.94, "step": 55, "token_acc": 0.5054545454545455, "train_speed(iter/s)": 0.33154 }, { "epoch": 0.03640776699029126, "grad_norm": 6.0272088050842285, "learning_rate": 9.996366436766611e-05, "loss": 2.377857208251953, "memory(GiB)": 46.94, "step": 60, "token_acc": 0.4837662337662338, "train_speed(iter/s)": 0.329266 }, { "epoch": 0.03944174757281554, "grad_norm": 5.419928073883057, "learning_rate": 9.995735699498394e-05, "loss": 2.5187705993652343, "memory(GiB)": 46.94, "step": 65, "token_acc": 0.44648318042813456, "train_speed(iter/s)": 0.334167 }, { "epoch": 0.04247572815533981, "grad_norm": 5.46040678024292, "learning_rate": 9.995054533048777e-05, "loss": 2.362880897521973, "memory(GiB)": 46.94, "step": 70, "token_acc": 0.48059701492537316, "train_speed(iter/s)": 0.340524 }, { "epoch": 0.04550970873786408, "grad_norm": 9.648872375488281, "learning_rate": 9.994322944293763e-05, "loss": 2.4048995971679688, "memory(GiB)": 46.94, "step": 75, "token_acc": 0.4601449275362319, "train_speed(iter/s)": 0.34581 }, { "epoch": 0.04854368932038835, "grad_norm": 6.08892297744751, "learning_rate": 9.993540940618334e-05, "loss": 2.628174591064453, "memory(GiB)": 46.94, "step": 80, "token_acc": 0.4896551724137931, "train_speed(iter/s)": 0.337622 }, { "epoch": 0.05157766990291262, "grad_norm": 4.922782897949219, "learning_rate": 9.992708529916379e-05, "loss": 2.406493377685547, "memory(GiB)": 46.94, "step": 85, "token_acc": 0.4779220779220779, "train_speed(iter/s)": 0.339155 }, { "epoch": 0.05461165048543689, "grad_norm": 6.307302474975586, "learning_rate": 9.991825720590626e-05, "loss": 2.5711450576782227, "memory(GiB)": 46.94, "step": 90, "token_acc": 0.47635135135135137, "train_speed(iter/s)": 0.33887 }, { "epoch": 0.05764563106796117, "grad_norm": 5.0484619140625, "learning_rate": 9.990892521552546e-05, "loss": 2.614651679992676, "memory(GiB)": 46.94, "step": 95, "token_acc": 0.45287958115183247, "train_speed(iter/s)": 0.337986 }, { "epoch": 0.06067961165048544, "grad_norm": 8.132536888122559, "learning_rate": 9.989908942222264e-05, "loss": 2.427901840209961, "memory(GiB)": 46.94, "step": 100, "token_acc": 0.4864864864864865, "train_speed(iter/s)": 0.337264 }, { "epoch": 0.06371359223300971, "grad_norm": 8.484292984008789, "learning_rate": 9.988874992528468e-05, "loss": 2.7749258041381837, "memory(GiB)": 46.94, "step": 105, "token_acc": 0.423841059602649, "train_speed(iter/s)": 0.334967 }, { "epoch": 0.06674757281553398, "grad_norm": 5.119959354400635, "learning_rate": 9.987790682908306e-05, "loss": 2.3933948516845702, "memory(GiB)": 46.94, "step": 110, "token_acc": 0.4835164835164835, "train_speed(iter/s)": 0.333499 }, { "epoch": 0.06978155339805825, "grad_norm": 4.2784743309021, "learning_rate": 9.986656024307286e-05, "loss": 2.3542047500610352, "memory(GiB)": 46.94, "step": 115, "token_acc": 0.5085324232081911, "train_speed(iter/s)": 0.338043 }, { "epoch": 0.07281553398058252, "grad_norm": 5.737245559692383, "learning_rate": 9.985471028179154e-05, "loss": 2.531406021118164, "memory(GiB)": 46.94, "step": 120, "token_acc": 0.45317220543806647, "train_speed(iter/s)": 0.336373 }, { "epoch": 0.07584951456310679, "grad_norm": 7.097526550292969, "learning_rate": 9.984235706485789e-05, "loss": 2.2263906478881834, "memory(GiB)": 46.94, "step": 125, "token_acc": 0.48104956268221577, "train_speed(iter/s)": 0.332569 }, { "epoch": 0.07888349514563107, "grad_norm": 5.05879545211792, "learning_rate": 9.98295007169708e-05, "loss": 2.215901756286621, "memory(GiB)": 46.94, "step": 130, "token_acc": 0.517799352750809, "train_speed(iter/s)": 0.335329 }, { "epoch": 0.08191747572815535, "grad_norm": 4.92686128616333, "learning_rate": 9.981614136790796e-05, "loss": 2.5729293823242188, "memory(GiB)": 46.94, "step": 135, "token_acc": 0.45481049562682213, "train_speed(iter/s)": 0.335352 }, { "epoch": 0.08495145631067962, "grad_norm": 5.164937973022461, "learning_rate": 9.980227915252459e-05, "loss": 2.6663639068603517, "memory(GiB)": 46.94, "step": 140, "token_acc": 0.46938775510204084, "train_speed(iter/s)": 0.335154 }, { "epoch": 0.08798543689320389, "grad_norm": 6.683323383331299, "learning_rate": 9.978791421075206e-05, "loss": 2.473577308654785, "memory(GiB)": 46.94, "step": 145, "token_acc": 0.4791666666666667, "train_speed(iter/s)": 0.335126 }, { "epoch": 0.09101941747572816, "grad_norm": 6.357656002044678, "learning_rate": 9.97730466875965e-05, "loss": 2.2693323135375976, "memory(GiB)": 46.94, "step": 150, "token_acc": 0.506896551724138, "train_speed(iter/s)": 0.333768 }, { "epoch": 0.09405339805825243, "grad_norm": 7.356832981109619, "learning_rate": 9.975767673313734e-05, "loss": 2.823617935180664, "memory(GiB)": 46.94, "step": 155, "token_acc": 0.42394822006472493, "train_speed(iter/s)": 0.333359 }, { "epoch": 0.0970873786407767, "grad_norm": 5.493913173675537, "learning_rate": 9.974180450252569e-05, "loss": 2.3958545684814454, "memory(GiB)": 46.94, "step": 160, "token_acc": 0.44642857142857145, "train_speed(iter/s)": 0.333225 }, { "epoch": 0.10012135922330097, "grad_norm": 3.637171506881714, "learning_rate": 9.972543015598295e-05, "loss": 2.221699523925781, "memory(GiB)": 46.94, "step": 165, "token_acc": 0.5497237569060773, "train_speed(iter/s)": 0.333266 }, { "epoch": 0.10315533980582524, "grad_norm": 10.502406120300293, "learning_rate": 9.970855385879908e-05, "loss": 2.613112449645996, "memory(GiB)": 46.94, "step": 170, "token_acc": 0.42424242424242425, "train_speed(iter/s)": 0.334264 }, { "epoch": 0.10618932038834951, "grad_norm": 4.684762001037598, "learning_rate": 9.969117578133089e-05, "loss": 2.458433723449707, "memory(GiB)": 46.94, "step": 175, "token_acc": 0.46153846153846156, "train_speed(iter/s)": 0.336742 }, { "epoch": 0.10922330097087378, "grad_norm": 5.488919734954834, "learning_rate": 9.96732960990005e-05, "loss": 2.51656436920166, "memory(GiB)": 46.94, "step": 180, "token_acc": 0.4376731301939058, "train_speed(iter/s)": 0.336421 }, { "epoch": 0.11225728155339806, "grad_norm": 4.314403533935547, "learning_rate": 9.965491499229332e-05, "loss": 2.2378902435302734, "memory(GiB)": 46.94, "step": 185, "token_acc": 0.5116279069767442, "train_speed(iter/s)": 0.335765 }, { "epoch": 0.11529126213592233, "grad_norm": 5.447028636932373, "learning_rate": 9.963603264675648e-05, "loss": 2.411647415161133, "memory(GiB)": 46.94, "step": 190, "token_acc": 0.4738562091503268, "train_speed(iter/s)": 0.335762 }, { "epoch": 0.1183252427184466, "grad_norm": 4.604159832000732, "learning_rate": 9.961664925299677e-05, "loss": 2.277849006652832, "memory(GiB)": 46.94, "step": 195, "token_acc": 0.5154639175257731, "train_speed(iter/s)": 0.33742 }, { "epoch": 0.12135922330097088, "grad_norm": 4.791499137878418, "learning_rate": 9.95967650066788e-05, "loss": 2.239641571044922, "memory(GiB)": 46.94, "step": 200, "token_acc": 0.5163398692810458, "train_speed(iter/s)": 0.337319 }, { "epoch": 0.12439320388349515, "grad_norm": 4.106400489807129, "learning_rate": 9.957638010852301e-05, "loss": 2.55577392578125, "memory(GiB)": 46.94, "step": 205, "token_acc": 0.47352941176470587, "train_speed(iter/s)": 0.337896 }, { "epoch": 0.12742718446601942, "grad_norm": 6.23160982131958, "learning_rate": 9.955549476430364e-05, "loss": 2.423176956176758, "memory(GiB)": 46.94, "step": 210, "token_acc": 0.4904214559386973, "train_speed(iter/s)": 0.336441 }, { "epoch": 0.1304611650485437, "grad_norm": 3.543123722076416, "learning_rate": 9.953410918484667e-05, "loss": 2.557846450805664, "memory(GiB)": 46.94, "step": 215, "token_acc": 0.4782608695652174, "train_speed(iter/s)": 0.337343 }, { "epoch": 0.13349514563106796, "grad_norm": 7.55106782913208, "learning_rate": 9.951222358602763e-05, "loss": 2.653213119506836, "memory(GiB)": 46.94, "step": 220, "token_acc": 0.4511494252873563, "train_speed(iter/s)": 0.338826 }, { "epoch": 0.13652912621359223, "grad_norm": 4.811387062072754, "learning_rate": 9.948983818876954e-05, "loss": 2.500718116760254, "memory(GiB)": 46.94, "step": 225, "token_acc": 0.4430379746835443, "train_speed(iter/s)": 0.339177 }, { "epoch": 0.1395631067961165, "grad_norm": 4.239647388458252, "learning_rate": 9.946695321904056e-05, "loss": 2.3520763397216795, "memory(GiB)": 46.94, "step": 230, "token_acc": 0.5028089887640449, "train_speed(iter/s)": 0.339208 }, { "epoch": 0.14259708737864077, "grad_norm": 5.49220085144043, "learning_rate": 9.944356890785177e-05, "loss": 2.4051780700683594, "memory(GiB)": 46.94, "step": 235, "token_acc": 0.476027397260274, "train_speed(iter/s)": 0.337951 }, { "epoch": 0.14563106796116504, "grad_norm": 6.164605140686035, "learning_rate": 9.941968549125481e-05, "loss": 2.5280595779418946, "memory(GiB)": 46.94, "step": 240, "token_acc": 0.45161290322580644, "train_speed(iter/s)": 0.337787 }, { "epoch": 0.1486650485436893, "grad_norm": 14.162618637084961, "learning_rate": 9.939530321033955e-05, "loss": 2.2646312713623047, "memory(GiB)": 46.94, "step": 245, "token_acc": 0.50920245398773, "train_speed(iter/s)": 0.337724 }, { "epoch": 0.15169902912621358, "grad_norm": 7.441068649291992, "learning_rate": 9.937042231123155e-05, "loss": 2.124452018737793, "memory(GiB)": 46.94, "step": 250, "token_acc": 0.5174603174603175, "train_speed(iter/s)": 0.338222 }, { "epoch": 0.15473300970873785, "grad_norm": 5.451231002807617, "learning_rate": 9.934504304508974e-05, "loss": 2.4457843780517576, "memory(GiB)": 46.94, "step": 255, "token_acc": 0.4959349593495935, "train_speed(iter/s)": 0.33759 }, { "epoch": 0.15776699029126215, "grad_norm": 4.345837116241455, "learning_rate": 9.931916566810371e-05, "loss": 2.2872390747070312, "memory(GiB)": 46.94, "step": 260, "token_acc": 0.519298245614035, "train_speed(iter/s)": 0.337904 }, { "epoch": 0.16080097087378642, "grad_norm": 4.379652500152588, "learning_rate": 9.929279044149123e-05, "loss": 2.400668716430664, "memory(GiB)": 46.94, "step": 265, "token_acc": 0.4960212201591512, "train_speed(iter/s)": 0.338122 }, { "epoch": 0.1638349514563107, "grad_norm": 9.584769248962402, "learning_rate": 9.926591763149559e-05, "loss": 2.2210363388061523, "memory(GiB)": 46.94, "step": 270, "token_acc": 0.5457627118644067, "train_speed(iter/s)": 0.338785 }, { "epoch": 0.16686893203883496, "grad_norm": 5.150815963745117, "learning_rate": 9.923854750938291e-05, "loss": 2.4257923126220704, "memory(GiB)": 46.94, "step": 275, "token_acc": 0.48338368580060426, "train_speed(iter/s)": 0.338839 }, { "epoch": 0.16990291262135923, "grad_norm": 5.8772382736206055, "learning_rate": 9.921068035143936e-05, "loss": 2.192362976074219, "memory(GiB)": 46.94, "step": 280, "token_acc": 0.5173501577287066, "train_speed(iter/s)": 0.337216 }, { "epoch": 0.1729368932038835, "grad_norm": 4.844017028808594, "learning_rate": 9.918231643896852e-05, "loss": 2.5290973663330076, "memory(GiB)": 46.94, "step": 285, "token_acc": 0.4835820895522388, "train_speed(iter/s)": 0.338311 }, { "epoch": 0.17597087378640777, "grad_norm": 7.689733028411865, "learning_rate": 9.915345605828828e-05, "loss": 2.318752670288086, "memory(GiB)": 46.94, "step": 290, "token_acc": 0.5, "train_speed(iter/s)": 0.340411 }, { "epoch": 0.17900485436893204, "grad_norm": 3.9893710613250732, "learning_rate": 9.912409950072821e-05, "loss": 2.054383087158203, "memory(GiB)": 46.94, "step": 295, "token_acc": 0.5361111111111111, "train_speed(iter/s)": 0.341817 }, { "epoch": 0.1820388349514563, "grad_norm": 6.386891841888428, "learning_rate": 9.909424706262647e-05, "loss": 2.6359893798828127, "memory(GiB)": 46.94, "step": 300, "token_acc": 0.4585635359116022, "train_speed(iter/s)": 0.342204 }, { "epoch": 0.18507281553398058, "grad_norm": 4.928503036499023, "learning_rate": 9.906389904532688e-05, "loss": 2.573761749267578, "memory(GiB)": 46.94, "step": 305, "token_acc": 0.4782608695652174, "train_speed(iter/s)": 0.341991 }, { "epoch": 0.18810679611650485, "grad_norm": 4.7213006019592285, "learning_rate": 9.903305575517584e-05, "loss": 2.4224880218505858, "memory(GiB)": 46.94, "step": 310, "token_acc": 0.5, "train_speed(iter/s)": 0.341701 }, { "epoch": 0.19114077669902912, "grad_norm": 15.000086784362793, "learning_rate": 9.900171750351925e-05, "loss": 2.2887813568115236, "memory(GiB)": 46.94, "step": 315, "token_acc": 0.525, "train_speed(iter/s)": 0.34191 }, { "epoch": 0.1941747572815534, "grad_norm": 4.775755405426025, "learning_rate": 9.89698846066994e-05, "loss": 2.412546157836914, "memory(GiB)": 47.49, "step": 320, "token_acc": 0.5144694533762058, "train_speed(iter/s)": 0.340568 }, { "epoch": 0.19720873786407767, "grad_norm": 5.56797456741333, "learning_rate": 9.893755738605171e-05, "loss": 2.503924560546875, "memory(GiB)": 47.49, "step": 325, "token_acc": 0.5210355987055016, "train_speed(iter/s)": 0.34167 }, { "epoch": 0.20024271844660194, "grad_norm": 6.093564987182617, "learning_rate": 9.890473616790154e-05, "loss": 2.3153804779052733, "memory(GiB)": 47.49, "step": 330, "token_acc": 0.5245901639344263, "train_speed(iter/s)": 0.340922 }, { "epoch": 0.2032766990291262, "grad_norm": 4.194920063018799, "learning_rate": 9.887142128356092e-05, "loss": 2.421812057495117, "memory(GiB)": 47.49, "step": 335, "token_acc": 0.4606413994169096, "train_speed(iter/s)": 0.34066 }, { "epoch": 0.20631067961165048, "grad_norm": 3.8001320362091064, "learning_rate": 9.88376130693251e-05, "loss": 2.1935026168823244, "memory(GiB)": 47.49, "step": 340, "token_acc": 0.4923547400611621, "train_speed(iter/s)": 0.339691 }, { "epoch": 0.20934466019417475, "grad_norm": 4.428964614868164, "learning_rate": 9.880331186646925e-05, "loss": 2.449082946777344, "memory(GiB)": 47.49, "step": 345, "token_acc": 0.4525993883792049, "train_speed(iter/s)": 0.33968 }, { "epoch": 0.21237864077669902, "grad_norm": 5.185049057006836, "learning_rate": 9.876851802124503e-05, "loss": 2.2941963195800783, "memory(GiB)": 47.49, "step": 350, "token_acc": 0.4879725085910653, "train_speed(iter/s)": 0.340828 }, { "epoch": 0.2154126213592233, "grad_norm": 4.480109691619873, "learning_rate": 9.873323188487697e-05, "loss": 2.6083600997924803, "memory(GiB)": 47.49, "step": 355, "token_acc": 0.4504792332268371, "train_speed(iter/s)": 0.340513 }, { "epoch": 0.21844660194174756, "grad_norm": 5.908204078674316, "learning_rate": 9.869745381355906e-05, "loss": 2.277309226989746, "memory(GiB)": 47.49, "step": 360, "token_acc": 0.5047318611987381, "train_speed(iter/s)": 0.34089 }, { "epoch": 0.22148058252427186, "grad_norm": 5.289805889129639, "learning_rate": 9.86611841684511e-05, "loss": 2.2966209411621095, "memory(GiB)": 47.49, "step": 365, "token_acc": 0.4819672131147541, "train_speed(iter/s)": 0.341544 }, { "epoch": 0.22451456310679613, "grad_norm": 5.663209915161133, "learning_rate": 9.862442331567503e-05, "loss": 2.407209014892578, "memory(GiB)": 47.49, "step": 370, "token_acc": 0.4819277108433735, "train_speed(iter/s)": 0.340918 }, { "epoch": 0.2275485436893204, "grad_norm": 5.247391700744629, "learning_rate": 9.858717162631128e-05, "loss": 2.2594379425048827, "memory(GiB)": 47.49, "step": 375, "token_acc": 0.49363057324840764, "train_speed(iter/s)": 0.340675 }, { "epoch": 0.23058252427184467, "grad_norm": 4.221131324768066, "learning_rate": 9.854942947639501e-05, "loss": 2.1822589874267577, "memory(GiB)": 47.49, "step": 380, "token_acc": 0.5451263537906137, "train_speed(iter/s)": 0.340714 }, { "epoch": 0.23361650485436894, "grad_norm": 5.7013936042785645, "learning_rate": 9.851119724691225e-05, "loss": 2.4558753967285156, "memory(GiB)": 47.49, "step": 385, "token_acc": 0.49382716049382713, "train_speed(iter/s)": 0.341181 }, { "epoch": 0.2366504854368932, "grad_norm": 5.81847620010376, "learning_rate": 9.84724753237962e-05, "loss": 2.4878921508789062, "memory(GiB)": 47.49, "step": 390, "token_acc": 0.4583333333333333, "train_speed(iter/s)": 0.341 }, { "epoch": 0.23968446601941748, "grad_norm": 4.679526329040527, "learning_rate": 9.843326409792317e-05, "loss": 2.4688491821289062, "memory(GiB)": 47.49, "step": 395, "token_acc": 0.46703296703296704, "train_speed(iter/s)": 0.341035 }, { "epoch": 0.24271844660194175, "grad_norm": 7.632129192352295, "learning_rate": 9.839356396510875e-05, "loss": 2.200436019897461, "memory(GiB)": 47.49, "step": 400, "token_acc": 0.5294117647058824, "train_speed(iter/s)": 0.34216 }, { "epoch": 0.24575242718446602, "grad_norm": 4.0452985763549805, "learning_rate": 9.835337532610376e-05, "loss": 2.252799415588379, "memory(GiB)": 47.49, "step": 405, "token_acc": 0.5017667844522968, "train_speed(iter/s)": 0.343292 }, { "epoch": 0.2487864077669903, "grad_norm": 4.371352672576904, "learning_rate": 9.831269858659023e-05, "loss": 2.4701959609985353, "memory(GiB)": 47.95, "step": 410, "token_acc": 0.46745562130177515, "train_speed(iter/s)": 0.343245 }, { "epoch": 0.2518203883495146, "grad_norm": 3.472031354904175, "learning_rate": 9.827153415717729e-05, "loss": 2.2846242904663088, "memory(GiB)": 47.95, "step": 415, "token_acc": 0.5481049562682215, "train_speed(iter/s)": 0.342968 }, { "epoch": 0.25485436893203883, "grad_norm": 4.385293006896973, "learning_rate": 9.822988245339701e-05, "loss": 2.3196638107299803, "memory(GiB)": 47.95, "step": 420, "token_acc": 0.5052631578947369, "train_speed(iter/s)": 0.342984 }, { "epoch": 0.25788834951456313, "grad_norm": 3.1495554447174072, "learning_rate": 9.818774389570027e-05, "loss": 2.1482851028442385, "memory(GiB)": 47.95, "step": 425, "token_acc": 0.5048231511254019, "train_speed(iter/s)": 0.343458 }, { "epoch": 0.2609223300970874, "grad_norm": 3.995600461959839, "learning_rate": 9.814511890945241e-05, "loss": 2.2113052368164063, "memory(GiB)": 47.95, "step": 430, "token_acc": 0.5386996904024768, "train_speed(iter/s)": 0.344144 }, { "epoch": 0.26395631067961167, "grad_norm": 3.8057103157043457, "learning_rate": 9.810200792492904e-05, "loss": 2.4170047760009767, "memory(GiB)": 47.95, "step": 435, "token_acc": 0.5114754098360655, "train_speed(iter/s)": 0.34434 }, { "epoch": 0.2669902912621359, "grad_norm": 12.608009338378906, "learning_rate": 9.805841137731164e-05, "loss": 2.269071578979492, "memory(GiB)": 47.95, "step": 440, "token_acc": 0.4939759036144578, "train_speed(iter/s)": 0.344796 }, { "epoch": 0.2700242718446602, "grad_norm": 3.0036213397979736, "learning_rate": 9.801432970668318e-05, "loss": 2.0752655029296876, "memory(GiB)": 47.95, "step": 445, "token_acc": 0.5205479452054794, "train_speed(iter/s)": 0.345262 }, { "epoch": 0.27305825242718446, "grad_norm": 3.9543864727020264, "learning_rate": 9.79697633580237e-05, "loss": 2.3695743560791014, "memory(GiB)": 47.95, "step": 450, "token_acc": 0.5055555555555555, "train_speed(iter/s)": 0.345598 }, { "epoch": 0.27609223300970875, "grad_norm": 2.8131890296936035, "learning_rate": 9.792471278120573e-05, "loss": 2.109127998352051, "memory(GiB)": 47.95, "step": 455, "token_acc": 0.5109489051094891, "train_speed(iter/s)": 0.345345 }, { "epoch": 0.279126213592233, "grad_norm": 6.712737083435059, "learning_rate": 9.787917843098989e-05, "loss": 2.4151309967041015, "memory(GiB)": 47.95, "step": 460, "token_acc": 0.47987616099071206, "train_speed(iter/s)": 0.344994 }, { "epoch": 0.2821601941747573, "grad_norm": 6.335672855377197, "learning_rate": 9.783316076702019e-05, "loss": 2.563591957092285, "memory(GiB)": 47.95, "step": 465, "token_acc": 0.45964912280701753, "train_speed(iter/s)": 0.344628 }, { "epoch": 0.28519417475728154, "grad_norm": 3.936614751815796, "learning_rate": 9.778666025381943e-05, "loss": 2.273646926879883, "memory(GiB)": 47.95, "step": 470, "token_acc": 0.504885993485342, "train_speed(iter/s)": 0.343891 }, { "epoch": 0.28822815533980584, "grad_norm": 4.484070777893066, "learning_rate": 9.77396773607845e-05, "loss": 2.071964073181152, "memory(GiB)": 47.95, "step": 475, "token_acc": 0.5503597122302158, "train_speed(iter/s)": 0.343417 }, { "epoch": 0.2912621359223301, "grad_norm": 3.9730005264282227, "learning_rate": 9.769221256218164e-05, "loss": 2.3989818572998045, "memory(GiB)": 47.95, "step": 480, "token_acc": 0.49244712990936557, "train_speed(iter/s)": 0.343769 }, { "epoch": 0.2942961165048544, "grad_norm": 5.614996433258057, "learning_rate": 9.764426633714167e-05, "loss": 2.472169876098633, "memory(GiB)": 47.95, "step": 485, "token_acc": 0.4774193548387097, "train_speed(iter/s)": 0.344198 }, { "epoch": 0.2973300970873786, "grad_norm": 6.481118202209473, "learning_rate": 9.759583916965517e-05, "loss": 2.3003664016723633, "memory(GiB)": 47.95, "step": 490, "token_acc": 0.5435435435435435, "train_speed(iter/s)": 0.343586 }, { "epoch": 0.3003640776699029, "grad_norm": 3.180483341217041, "learning_rate": 9.754693154856751e-05, "loss": 2.4743316650390623, "memory(GiB)": 47.95, "step": 495, "token_acc": 0.47305389221556887, "train_speed(iter/s)": 0.343396 }, { "epoch": 0.30339805825242716, "grad_norm": 4.278759956359863, "learning_rate": 9.7497543967574e-05, "loss": 2.2425241470336914, "memory(GiB)": 47.95, "step": 500, "token_acc": 0.5353846153846153, "train_speed(iter/s)": 0.343543 }, { "epoch": 0.30339805825242716, "eval_loss": 2.2120792865753174, "eval_runtime": 13.3757, "eval_samples_per_second": 7.476, "eval_steps_per_second": 7.476, "eval_token_acc": 0.4887955182072829, "step": 500 }, { "epoch": 0.30643203883495146, "grad_norm": 4.150091648101807, "learning_rate": 9.74476769252149e-05, "loss": 2.1912309646606447, "memory(GiB)": 47.95, "step": 505, "token_acc": 0.49282296650717705, "train_speed(iter/s)": 0.340142 }, { "epoch": 0.3094660194174757, "grad_norm": 10.498276710510254, "learning_rate": 9.739733092487035e-05, "loss": 2.2416282653808595, "memory(GiB)": 47.95, "step": 510, "token_acc": 0.49666666666666665, "train_speed(iter/s)": 0.339871 }, { "epoch": 0.3125, "grad_norm": 4.736756324768066, "learning_rate": 9.73465064747553e-05, "loss": 2.364884948730469, "memory(GiB)": 47.95, "step": 515, "token_acc": 0.4557377049180328, "train_speed(iter/s)": 0.340057 }, { "epoch": 0.3155339805825243, "grad_norm": 4.276501178741455, "learning_rate": 9.729520408791434e-05, "loss": 2.2216148376464844, "memory(GiB)": 47.95, "step": 520, "token_acc": 0.5059523809523809, "train_speed(iter/s)": 0.340342 }, { "epoch": 0.31856796116504854, "grad_norm": 5.066372394561768, "learning_rate": 9.72434242822167e-05, "loss": 2.112790107727051, "memory(GiB)": 47.95, "step": 525, "token_acc": 0.5528169014084507, "train_speed(iter/s)": 0.339839 }, { "epoch": 0.32160194174757284, "grad_norm": 4.777381896972656, "learning_rate": 9.719116758035074e-05, "loss": 2.4403156280517577, "memory(GiB)": 47.95, "step": 530, "token_acc": 0.4748427672955975, "train_speed(iter/s)": 0.340157 }, { "epoch": 0.3246359223300971, "grad_norm": 4.124401092529297, "learning_rate": 9.71384345098189e-05, "loss": 2.0084070205688476, "memory(GiB)": 47.95, "step": 535, "token_acc": 0.5546875, "train_speed(iter/s)": 0.340219 }, { "epoch": 0.3276699029126214, "grad_norm": 6.198371887207031, "learning_rate": 9.70852256029323e-05, "loss": 2.3408830642700194, "memory(GiB)": 47.95, "step": 540, "token_acc": 0.4204946996466431, "train_speed(iter/s)": 0.340319 }, { "epoch": 0.3307038834951456, "grad_norm": 6.003149509429932, "learning_rate": 9.703154139680533e-05, "loss": 2.531490707397461, "memory(GiB)": 47.95, "step": 545, "token_acc": 0.438871473354232, "train_speed(iter/s)": 0.341419 }, { "epoch": 0.3337378640776699, "grad_norm": 3.8410537242889404, "learning_rate": 9.697738243335028e-05, "loss": 2.4120954513549804, "memory(GiB)": 47.95, "step": 550, "token_acc": 0.4782608695652174, "train_speed(iter/s)": 0.34105 }, { "epoch": 0.33677184466019416, "grad_norm": 3.459972858428955, "learning_rate": 9.692274925927185e-05, "loss": 2.178766441345215, "memory(GiB)": 47.95, "step": 555, "token_acc": 0.49722222222222223, "train_speed(iter/s)": 0.340827 }, { "epoch": 0.33980582524271846, "grad_norm": 4.943912029266357, "learning_rate": 9.686764242606163e-05, "loss": 2.3330970764160157, "memory(GiB)": 47.95, "step": 560, "token_acc": 0.5076452599388379, "train_speed(iter/s)": 0.340405 }, { "epoch": 0.3428398058252427, "grad_norm": 6.074972152709961, "learning_rate": 9.681206248999257e-05, "loss": 2.4544017791748045, "memory(GiB)": 47.95, "step": 565, "token_acc": 0.4630225080385852, "train_speed(iter/s)": 0.340832 }, { "epoch": 0.345873786407767, "grad_norm": 4.912213325500488, "learning_rate": 9.675601001211326e-05, "loss": 2.4787572860717773, "memory(GiB)": 47.95, "step": 570, "token_acc": 0.46564885496183206, "train_speed(iter/s)": 0.340481 }, { "epoch": 0.34890776699029125, "grad_norm": 4.58314847946167, "learning_rate": 9.669948555824242e-05, "loss": 2.3357780456542967, "memory(GiB)": 47.95, "step": 575, "token_acc": 0.4735099337748344, "train_speed(iter/s)": 0.340294 }, { "epoch": 0.35194174757281554, "grad_norm": 3.581042528152466, "learning_rate": 9.664248969896303e-05, "loss": 2.6192626953125, "memory(GiB)": 47.95, "step": 580, "token_acc": 0.4479638009049774, "train_speed(iter/s)": 0.340168 }, { "epoch": 0.3549757281553398, "grad_norm": 4.23941707611084, "learning_rate": 9.65850230096167e-05, "loss": 2.0317205429077148, "memory(GiB)": 47.95, "step": 585, "token_acc": 0.5238095238095238, "train_speed(iter/s)": 0.339705 }, { "epoch": 0.3580097087378641, "grad_norm": 4.4633097648620605, "learning_rate": 9.652708607029779e-05, "loss": 2.206028175354004, "memory(GiB)": 47.95, "step": 590, "token_acc": 0.5315985130111525, "train_speed(iter/s)": 0.338448 }, { "epoch": 0.36104368932038833, "grad_norm": 4.798620223999023, "learning_rate": 9.646867946584757e-05, "loss": 1.9743743896484376, "memory(GiB)": 47.95, "step": 595, "token_acc": 0.589041095890411, "train_speed(iter/s)": 0.338476 }, { "epoch": 0.3640776699029126, "grad_norm": 4.184017658233643, "learning_rate": 9.64098037858483e-05, "loss": 2.322881317138672, "memory(GiB)": 47.95, "step": 600, "token_acc": 0.5329153605015674, "train_speed(iter/s)": 0.338117 }, { "epoch": 0.36711165048543687, "grad_norm": 13.235075950622559, "learning_rate": 9.635045962461735e-05, "loss": 2.190867233276367, "memory(GiB)": 47.95, "step": 605, "token_acc": 0.5303030303030303, "train_speed(iter/s)": 0.338685 }, { "epoch": 0.37014563106796117, "grad_norm": 3.9224014282226562, "learning_rate": 9.62906475812011e-05, "loss": 2.1764543533325194, "memory(GiB)": 47.95, "step": 610, "token_acc": 0.5015384615384615, "train_speed(iter/s)": 0.338644 }, { "epoch": 0.3731796116504854, "grad_norm": 4.106988906860352, "learning_rate": 9.623036825936898e-05, "loss": 2.198202705383301, "memory(GiB)": 47.95, "step": 615, "token_acc": 0.5208333333333334, "train_speed(iter/s)": 0.338425 }, { "epoch": 0.3762135922330097, "grad_norm": 4.208174705505371, "learning_rate": 9.616962226760728e-05, "loss": 2.1807647705078126, "memory(GiB)": 47.95, "step": 620, "token_acc": 0.5, "train_speed(iter/s)": 0.338536 }, { "epoch": 0.379247572815534, "grad_norm": 3.6400604248046875, "learning_rate": 9.610841021911312e-05, "loss": 2.0912832260131835, "memory(GiB)": 47.95, "step": 625, "token_acc": 0.5407166123778502, "train_speed(iter/s)": 0.33845 }, { "epoch": 0.38228155339805825, "grad_norm": 3.9780805110931396, "learning_rate": 9.604673273178819e-05, "loss": 2.3044837951660155, "memory(GiB)": 47.95, "step": 630, "token_acc": 0.5351681957186545, "train_speed(iter/s)": 0.338528 }, { "epoch": 0.38531553398058255, "grad_norm": 4.758601188659668, "learning_rate": 9.59845904282325e-05, "loss": 2.1180328369140624, "memory(GiB)": 47.95, "step": 635, "token_acc": 0.525, "train_speed(iter/s)": 0.33854 }, { "epoch": 0.3883495145631068, "grad_norm": 5.266363143920898, "learning_rate": 9.592198393573816e-05, "loss": 2.077616310119629, "memory(GiB)": 47.95, "step": 640, "token_acc": 0.5, "train_speed(iter/s)": 0.338495 }, { "epoch": 0.3913834951456311, "grad_norm": 3.8194117546081543, "learning_rate": 9.585891388628298e-05, "loss": 2.1584760665893556, "memory(GiB)": 47.95, "step": 645, "token_acc": 0.5368421052631579, "train_speed(iter/s)": 0.338618 }, { "epoch": 0.39441747572815533, "grad_norm": 3.915362596511841, "learning_rate": 9.579538091652414e-05, "loss": 2.1824935913085937, "memory(GiB)": 47.95, "step": 650, "token_acc": 0.5311572700296736, "train_speed(iter/s)": 0.338632 }, { "epoch": 0.39745145631067963, "grad_norm": 6.392123222351074, "learning_rate": 9.573138566779171e-05, "loss": 2.14979248046875, "memory(GiB)": 47.95, "step": 655, "token_acc": 0.49193548387096775, "train_speed(iter/s)": 0.339159 }, { "epoch": 0.40048543689320387, "grad_norm": 4.176852226257324, "learning_rate": 9.566692878608229e-05, "loss": 2.1492748260498047, "memory(GiB)": 47.95, "step": 660, "token_acc": 0.5482758620689655, "train_speed(iter/s)": 0.339294 }, { "epoch": 0.40351941747572817, "grad_norm": 4.81455135345459, "learning_rate": 9.560201092205231e-05, "loss": 2.3684976577758787, "memory(GiB)": 47.95, "step": 665, "token_acc": 0.4852941176470588, "train_speed(iter/s)": 0.339577 }, { "epoch": 0.4065533980582524, "grad_norm": 4.782194137573242, "learning_rate": 9.553663273101162e-05, "loss": 2.3068775177001952, "memory(GiB)": 47.95, "step": 670, "token_acc": 0.47560975609756095, "train_speed(iter/s)": 0.339625 }, { "epoch": 0.4095873786407767, "grad_norm": 3.8881473541259766, "learning_rate": 9.54707948729168e-05, "loss": 2.236045265197754, "memory(GiB)": 47.95, "step": 675, "token_acc": 0.5185185185185185, "train_speed(iter/s)": 0.339433 }, { "epoch": 0.41262135922330095, "grad_norm": 4.458930969238281, "learning_rate": 9.540449801236451e-05, "loss": 2.4408639907836913, "memory(GiB)": 47.95, "step": 680, "token_acc": 0.47686832740213525, "train_speed(iter/s)": 0.33987 }, { "epoch": 0.41565533980582525, "grad_norm": 4.160996913909912, "learning_rate": 9.533774281858481e-05, "loss": 2.1458906173706054, "memory(GiB)": 47.95, "step": 685, "token_acc": 0.5025906735751295, "train_speed(iter/s)": 0.34033 }, { "epoch": 0.4186893203883495, "grad_norm": 5.265639305114746, "learning_rate": 9.527052996543436e-05, "loss": 2.09564208984375, "memory(GiB)": 47.95, "step": 690, "token_acc": 0.5392491467576792, "train_speed(iter/s)": 0.340478 }, { "epoch": 0.4217233009708738, "grad_norm": 4.328799247741699, "learning_rate": 9.520286013138959e-05, "loss": 2.367569160461426, "memory(GiB)": 47.95, "step": 695, "token_acc": 0.4715447154471545, "train_speed(iter/s)": 0.340697 }, { "epoch": 0.42475728155339804, "grad_norm": 4.285299777984619, "learning_rate": 9.513473399954001e-05, "loss": 2.2947181701660155, "memory(GiB)": 47.95, "step": 700, "token_acc": 0.516728624535316, "train_speed(iter/s)": 0.340904 }, { "epoch": 0.42779126213592233, "grad_norm": 5.728127479553223, "learning_rate": 9.506615225758111e-05, "loss": 2.398201751708984, "memory(GiB)": 47.95, "step": 705, "token_acc": 0.5151515151515151, "train_speed(iter/s)": 0.341299 }, { "epoch": 0.4308252427184466, "grad_norm": 6.604185104370117, "learning_rate": 9.499711559780756e-05, "loss": 2.197690010070801, "memory(GiB)": 47.95, "step": 710, "token_acc": 0.5369774919614148, "train_speed(iter/s)": 0.341395 }, { "epoch": 0.4338592233009709, "grad_norm": 3.069992780685425, "learning_rate": 9.492762471710612e-05, "loss": 2.174225616455078, "memory(GiB)": 47.95, "step": 715, "token_acc": 0.5400593471810089, "train_speed(iter/s)": 0.341566 }, { "epoch": 0.4368932038834951, "grad_norm": 7.041407108306885, "learning_rate": 9.485768031694872e-05, "loss": 2.0840965270996095, "memory(GiB)": 47.95, "step": 720, "token_acc": 0.5148148148148148, "train_speed(iter/s)": 0.34142 }, { "epoch": 0.4399271844660194, "grad_norm": 4.811408042907715, "learning_rate": 9.478728310338527e-05, "loss": 2.5931447982788085, "memory(GiB)": 47.95, "step": 725, "token_acc": 0.46710526315789475, "train_speed(iter/s)": 0.341014 }, { "epoch": 0.4429611650485437, "grad_norm": 5.489245414733887, "learning_rate": 9.471643378703662e-05, "loss": 2.5609107971191407, "memory(GiB)": 47.95, "step": 730, "token_acc": 0.4740484429065744, "train_speed(iter/s)": 0.341113 }, { "epoch": 0.44599514563106796, "grad_norm": 5.343659400939941, "learning_rate": 9.464513308308734e-05, "loss": 2.1627069473266602, "memory(GiB)": 47.95, "step": 735, "token_acc": 0.5125448028673835, "train_speed(iter/s)": 0.341064 }, { "epoch": 0.44902912621359226, "grad_norm": 4.827804088592529, "learning_rate": 9.457338171127847e-05, "loss": 2.5902605056762695, "memory(GiB)": 47.95, "step": 740, "token_acc": 0.474025974025974, "train_speed(iter/s)": 0.340956 }, { "epoch": 0.4520631067961165, "grad_norm": 3.625948905944824, "learning_rate": 9.450118039590032e-05, "loss": 1.9722644805908203, "memory(GiB)": 47.95, "step": 745, "token_acc": 0.5486111111111112, "train_speed(iter/s)": 0.341245 }, { "epoch": 0.4550970873786408, "grad_norm": 4.791515827178955, "learning_rate": 9.442852986578514e-05, "loss": 2.426283073425293, "memory(GiB)": 47.95, "step": 750, "token_acc": 0.510989010989011, "train_speed(iter/s)": 0.340837 }, { "epoch": 0.45813106796116504, "grad_norm": 7.5137786865234375, "learning_rate": 9.435543085429972e-05, "loss": 2.4069377899169924, "memory(GiB)": 47.95, "step": 755, "token_acc": 0.4591194968553459, "train_speed(iter/s)": 0.341108 }, { "epoch": 0.46116504854368934, "grad_norm": 4.976150989532471, "learning_rate": 9.428188409933806e-05, "loss": 2.2120594024658202, "memory(GiB)": 47.95, "step": 760, "token_acc": 0.5274390243902439, "train_speed(iter/s)": 0.34104 }, { "epoch": 0.4641990291262136, "grad_norm": 4.0205979347229, "learning_rate": 9.420789034331387e-05, "loss": 2.1097999572753907, "memory(GiB)": 47.95, "step": 765, "token_acc": 0.4880239520958084, "train_speed(iter/s)": 0.341367 }, { "epoch": 0.4672330097087379, "grad_norm": 5.951793193817139, "learning_rate": 9.413345033315307e-05, "loss": 1.9745277404785155, "memory(GiB)": 47.95, "step": 770, "token_acc": 0.5680933852140078, "train_speed(iter/s)": 0.34128 }, { "epoch": 0.4702669902912621, "grad_norm": 4.966895580291748, "learning_rate": 9.405856482028627e-05, "loss": 2.2937433242797853, "memory(GiB)": 47.95, "step": 775, "token_acc": 0.49050632911392406, "train_speed(iter/s)": 0.341139 }, { "epoch": 0.4733009708737864, "grad_norm": 4.2331438064575195, "learning_rate": 9.398323456064123e-05, "loss": 2.361500930786133, "memory(GiB)": 47.95, "step": 780, "token_acc": 0.5015290519877675, "train_speed(iter/s)": 0.341178 }, { "epoch": 0.47633495145631066, "grad_norm": 7.8501763343811035, "learning_rate": 9.39074603146351e-05, "loss": 2.2374454498291017, "memory(GiB)": 47.95, "step": 785, "token_acc": 0.518840579710145, "train_speed(iter/s)": 0.341516 }, { "epoch": 0.47936893203883496, "grad_norm": 3.8284718990325928, "learning_rate": 9.383124284716691e-05, "loss": 1.9690286636352539, "memory(GiB)": 47.95, "step": 790, "token_acc": 0.5555555555555556, "train_speed(iter/s)": 0.342018 }, { "epoch": 0.4824029126213592, "grad_norm": 3.7183237075805664, "learning_rate": 9.37545829276097e-05, "loss": 2.0185150146484374, "memory(GiB)": 47.95, "step": 795, "token_acc": 0.5015015015015015, "train_speed(iter/s)": 0.34192 }, { "epoch": 0.4854368932038835, "grad_norm": 6.458893775939941, "learning_rate": 9.367748132980287e-05, "loss": 2.0081209182739257, "memory(GiB)": 47.95, "step": 800, "token_acc": 0.5471698113207547, "train_speed(iter/s)": 0.341396 }, { "epoch": 0.48847087378640774, "grad_norm": 4.59126091003418, "learning_rate": 9.359993883204425e-05, "loss": 2.121066856384277, "memory(GiB)": 47.95, "step": 805, "token_acc": 0.564179104477612, "train_speed(iter/s)": 0.340443 }, { "epoch": 0.49150485436893204, "grad_norm": 5.332643508911133, "learning_rate": 9.352195621708239e-05, "loss": 2.3226802825927733, "memory(GiB)": 47.95, "step": 810, "token_acc": 0.4887459807073955, "train_speed(iter/s)": 0.340627 }, { "epoch": 0.4945388349514563, "grad_norm": 4.230836868286133, "learning_rate": 9.344353427210852e-05, "loss": 2.303377151489258, "memory(GiB)": 47.95, "step": 815, "token_acc": 0.48859934853420195, "train_speed(iter/s)": 0.340948 }, { "epoch": 0.4975728155339806, "grad_norm": 4.910395622253418, "learning_rate": 9.336467378874871e-05, "loss": 2.3208490371704102, "memory(GiB)": 47.95, "step": 820, "token_acc": 0.48936170212765956, "train_speed(iter/s)": 0.340934 }, { "epoch": 0.5006067961165048, "grad_norm": 5.890263080596924, "learning_rate": 9.328537556305578e-05, "loss": 2.190505027770996, "memory(GiB)": 47.95, "step": 825, "token_acc": 0.5104166666666666, "train_speed(iter/s)": 0.341431 }, { "epoch": 0.5036407766990292, "grad_norm": 5.387331485748291, "learning_rate": 9.320564039550134e-05, "loss": 2.3586183547973634, "memory(GiB)": 47.95, "step": 830, "token_acc": 0.5128205128205128, "train_speed(iter/s)": 0.341664 }, { "epoch": 0.5066747572815534, "grad_norm": 6.672213077545166, "learning_rate": 9.31254690909677e-05, "loss": 2.0102565765380858, "memory(GiB)": 47.95, "step": 835, "token_acc": 0.5333333333333333, "train_speed(iter/s)": 0.341771 }, { "epoch": 0.5097087378640777, "grad_norm": 4.3900580406188965, "learning_rate": 9.304486245873972e-05, "loss": 2.1370363235473633, "memory(GiB)": 47.95, "step": 840, "token_acc": 0.5508771929824562, "train_speed(iter/s)": 0.34162 }, { "epoch": 0.5127427184466019, "grad_norm": 4.079644680023193, "learning_rate": 9.296382131249666e-05, "loss": 2.4044963836669924, "memory(GiB)": 47.95, "step": 845, "token_acc": 0.4807692307692308, "train_speed(iter/s)": 0.341681 }, { "epoch": 0.5157766990291263, "grad_norm": 7.195343971252441, "learning_rate": 9.288234647030391e-05, "loss": 2.45920352935791, "memory(GiB)": 47.95, "step": 850, "token_acc": 0.4384858044164038, "train_speed(iter/s)": 0.341519 }, { "epoch": 0.5188106796116505, "grad_norm": 3.0323264598846436, "learning_rate": 9.280043875460485e-05, "loss": 2.1931310653686524, "memory(GiB)": 47.95, "step": 855, "token_acc": 0.5272727272727272, "train_speed(iter/s)": 0.341432 }, { "epoch": 0.5218446601941747, "grad_norm": 3.355743408203125, "learning_rate": 9.271809899221246e-05, "loss": 2.111046028137207, "memory(GiB)": 47.95, "step": 860, "token_acc": 0.5235602094240838, "train_speed(iter/s)": 0.34145 }, { "epoch": 0.524878640776699, "grad_norm": 5.7925519943237305, "learning_rate": 9.263532801430094e-05, "loss": 1.933934211730957, "memory(GiB)": 47.95, "step": 865, "token_acc": 0.5664556962025317, "train_speed(iter/s)": 0.342026 }, { "epoch": 0.5279126213592233, "grad_norm": 4.084681987762451, "learning_rate": 9.255212665639744e-05, "loss": 1.6992759704589844, "memory(GiB)": 47.95, "step": 870, "token_acc": 0.5886792452830188, "train_speed(iter/s)": 0.341873 }, { "epoch": 0.5309466019417476, "grad_norm": 5.349972724914551, "learning_rate": 9.246849575837349e-05, "loss": 2.2484100341796873, "memory(GiB)": 47.95, "step": 875, "token_acc": 0.5318352059925093, "train_speed(iter/s)": 0.341923 }, { "epoch": 0.5339805825242718, "grad_norm": 6.48075008392334, "learning_rate": 9.238443616443666e-05, "loss": 2.5302217483520506, "memory(GiB)": 47.95, "step": 880, "token_acc": 0.4666666666666667, "train_speed(iter/s)": 0.341716 }, { "epoch": 0.5370145631067961, "grad_norm": 4.047274589538574, "learning_rate": 9.229994872312193e-05, "loss": 1.9528484344482422, "memory(GiB)": 47.95, "step": 885, "token_acc": 0.5680272108843537, "train_speed(iter/s)": 0.341568 }, { "epoch": 0.5400485436893204, "grad_norm": 5.688394546508789, "learning_rate": 9.221503428728316e-05, "loss": 1.8645639419555664, "memory(GiB)": 47.95, "step": 890, "token_acc": 0.5708812260536399, "train_speed(iter/s)": 0.34154 }, { "epoch": 0.5430825242718447, "grad_norm": 5.595506191253662, "learning_rate": 9.212969371408449e-05, "loss": 2.1312353134155275, "memory(GiB)": 47.95, "step": 895, "token_acc": 0.5241935483870968, "train_speed(iter/s)": 0.341373 }, { "epoch": 0.5461165048543689, "grad_norm": 5.208035945892334, "learning_rate": 9.204392786499168e-05, "loss": 2.0691448211669923, "memory(GiB)": 47.95, "step": 900, "token_acc": 0.5160256410256411, "train_speed(iter/s)": 0.341122 }, { "epoch": 0.5491504854368932, "grad_norm": 4.882120609283447, "learning_rate": 9.19577376057634e-05, "loss": 2.201594352722168, "memory(GiB)": 47.95, "step": 905, "token_acc": 0.49498327759197325, "train_speed(iter/s)": 0.341111 }, { "epoch": 0.5521844660194175, "grad_norm": 4.964993000030518, "learning_rate": 9.187112380644254e-05, "loss": 2.2276058197021484, "memory(GiB)": 47.95, "step": 910, "token_acc": 0.49032258064516127, "train_speed(iter/s)": 0.340571 }, { "epoch": 0.5552184466019418, "grad_norm": 4.424059867858887, "learning_rate": 9.178408734134736e-05, "loss": 2.4902462005615233, "memory(GiB)": 47.95, "step": 915, "token_acc": 0.4861878453038674, "train_speed(iter/s)": 0.340643 }, { "epoch": 0.558252427184466, "grad_norm": 5.663638114929199, "learning_rate": 9.16966290890627e-05, "loss": 2.0774208068847657, "memory(GiB)": 47.95, "step": 920, "token_acc": 0.5244299674267101, "train_speed(iter/s)": 0.340731 }, { "epoch": 0.5612864077669902, "grad_norm": 5.591066360473633, "learning_rate": 9.160874993243113e-05, "loss": 2.063447952270508, "memory(GiB)": 47.95, "step": 925, "token_acc": 0.5606060606060606, "train_speed(iter/s)": 0.340774 }, { "epoch": 0.5643203883495146, "grad_norm": 4.221002101898193, "learning_rate": 9.152045075854398e-05, "loss": 2.34899959564209, "memory(GiB)": 47.95, "step": 930, "token_acc": 0.5073529411764706, "train_speed(iter/s)": 0.34102 }, { "epoch": 0.5673543689320388, "grad_norm": 5.272368907928467, "learning_rate": 9.143173245873247e-05, "loss": 2.1581178665161134, "memory(GiB)": 47.95, "step": 935, "token_acc": 0.55, "train_speed(iter/s)": 0.341262 }, { "epoch": 0.5703883495145631, "grad_norm": 5.044839382171631, "learning_rate": 9.134259592855861e-05, "loss": 2.4777673721313476, "memory(GiB)": 47.95, "step": 940, "token_acc": 0.4873417721518987, "train_speed(iter/s)": 0.341236 }, { "epoch": 0.5734223300970874, "grad_norm": 3.423116683959961, "learning_rate": 9.125304206780627e-05, "loss": 2.2395530700683595, "memory(GiB)": 47.95, "step": 945, "token_acc": 0.5208333333333334, "train_speed(iter/s)": 0.341405 }, { "epoch": 0.5764563106796117, "grad_norm": 5.214086055755615, "learning_rate": 9.116307178047198e-05, "loss": 2.4418460845947267, "memory(GiB)": 47.95, "step": 950, "token_acc": 0.4652014652014652, "train_speed(iter/s)": 0.341766 }, { "epoch": 0.5794902912621359, "grad_norm": 4.812069416046143, "learning_rate": 9.10726859747559e-05, "loss": 2.18359489440918, "memory(GiB)": 47.95, "step": 955, "token_acc": 0.5351351351351351, "train_speed(iter/s)": 0.341167 }, { "epoch": 0.5825242718446602, "grad_norm": 4.6082987785339355, "learning_rate": 9.098188556305263e-05, "loss": 2.017580032348633, "memory(GiB)": 47.95, "step": 960, "token_acc": 0.5369774919614148, "train_speed(iter/s)": 0.341505 }, { "epoch": 0.5855582524271845, "grad_norm": 3.7764463424682617, "learning_rate": 9.089067146194196e-05, "loss": 2.3622528076171876, "memory(GiB)": 47.95, "step": 965, "token_acc": 0.47530864197530864, "train_speed(iter/s)": 0.34165 }, { "epoch": 0.5885922330097088, "grad_norm": 4.654725551605225, "learning_rate": 9.079904459217966e-05, "loss": 2.0436727523803713, "memory(GiB)": 47.95, "step": 970, "token_acc": 0.554016620498615, "train_speed(iter/s)": 0.341874 }, { "epoch": 0.591626213592233, "grad_norm": 5.9028544425964355, "learning_rate": 9.070700587868817e-05, "loss": 2.3862071990966798, "memory(GiB)": 47.95, "step": 975, "token_acc": 0.5054945054945055, "train_speed(iter/s)": 0.341755 }, { "epoch": 0.5946601941747572, "grad_norm": 4.00972318649292, "learning_rate": 9.061455625054725e-05, "loss": 1.9884757995605469, "memory(GiB)": 47.95, "step": 980, "token_acc": 0.5290102389078498, "train_speed(iter/s)": 0.341801 }, { "epoch": 0.5976941747572816, "grad_norm": 3.2706916332244873, "learning_rate": 9.052169664098461e-05, "loss": 2.3900108337402344, "memory(GiB)": 47.95, "step": 985, "token_acc": 0.4854368932038835, "train_speed(iter/s)": 0.342143 }, { "epoch": 0.6007281553398058, "grad_norm": 8.22692584991455, "learning_rate": 9.042842798736654e-05, "loss": 2.078816032409668, "memory(GiB)": 47.95, "step": 990, "token_acc": 0.5658682634730539, "train_speed(iter/s)": 0.342213 }, { "epoch": 0.6037621359223301, "grad_norm": 4.992088317871094, "learning_rate": 9.03347512311884e-05, "loss": 2.3420574188232424, "memory(GiB)": 47.95, "step": 995, "token_acc": 0.4918032786885246, "train_speed(iter/s)": 0.34243 }, { "epoch": 0.6067961165048543, "grad_norm": 3.9104936122894287, "learning_rate": 9.024066731806501e-05, "loss": 2.182134246826172, "memory(GiB)": 47.95, "step": 1000, "token_acc": 0.5284810126582279, "train_speed(iter/s)": 0.342591 }, { "epoch": 0.6067961165048543, "eval_loss": 2.2492244243621826, "eval_runtime": 12.8515, "eval_samples_per_second": 7.781, "eval_steps_per_second": 7.781, "eval_token_acc": 0.4798962386511025, "step": 1000 }, { "epoch": 0.6098300970873787, "grad_norm": 7.750658988952637, "learning_rate": 9.01461771977214e-05, "loss": 2.5756719589233397, "memory(GiB)": 47.95, "step": 1005, "token_acc": 0.4888268156424581, "train_speed(iter/s)": 0.340926 }, { "epoch": 0.6128640776699029, "grad_norm": 4.210326194763184, "learning_rate": 9.005128182398283e-05, "loss": 2.276882362365723, "memory(GiB)": 47.95, "step": 1010, "token_acc": 0.5032679738562091, "train_speed(iter/s)": 0.340958 }, { "epoch": 0.6158980582524272, "grad_norm": 3.898102045059204, "learning_rate": 8.995598215476555e-05, "loss": 2.379372406005859, "memory(GiB)": 47.95, "step": 1015, "token_acc": 0.5186335403726708, "train_speed(iter/s)": 0.340758 }, { "epoch": 0.6189320388349514, "grad_norm": 3.819361925125122, "learning_rate": 8.986027915206686e-05, "loss": 2.2519060134887696, "memory(GiB)": 47.95, "step": 1020, "token_acc": 0.5205047318611987, "train_speed(iter/s)": 0.340981 }, { "epoch": 0.6219660194174758, "grad_norm": 4.484439373016357, "learning_rate": 8.976417378195544e-05, "loss": 2.068866157531738, "memory(GiB)": 47.95, "step": 1025, "token_acc": 0.5629370629370629, "train_speed(iter/s)": 0.34129 }, { "epoch": 0.625, "grad_norm": 5.354769229888916, "learning_rate": 8.966766701456177e-05, "loss": 2.4065807342529295, "memory(GiB)": 47.95, "step": 1030, "token_acc": 0.4921135646687697, "train_speed(iter/s)": 0.341495 }, { "epoch": 0.6280339805825242, "grad_norm": 3.579660177230835, "learning_rate": 8.957075982406811e-05, "loss": 2.0959863662719727, "memory(GiB)": 47.95, "step": 1035, "token_acc": 0.5633802816901409, "train_speed(iter/s)": 0.341546 }, { "epoch": 0.6310679611650486, "grad_norm": 3.6382579803466797, "learning_rate": 8.947345318869882e-05, "loss": 2.21999454498291, "memory(GiB)": 47.95, "step": 1040, "token_acc": 0.49859154929577465, "train_speed(iter/s)": 0.341375 }, { "epoch": 0.6341019417475728, "grad_norm": 5.252345085144043, "learning_rate": 8.937574809071041e-05, "loss": 2.06900577545166, "memory(GiB)": 47.95, "step": 1045, "token_acc": 0.5088967971530249, "train_speed(iter/s)": 0.340951 }, { "epoch": 0.6371359223300971, "grad_norm": 5.333433151245117, "learning_rate": 8.927764551638169e-05, "loss": 2.190864372253418, "memory(GiB)": 47.95, "step": 1050, "token_acc": 0.5083612040133779, "train_speed(iter/s)": 0.340756 }, { "epoch": 0.6401699029126213, "grad_norm": 4.690061569213867, "learning_rate": 8.917914645600369e-05, "loss": 2.0215267181396483, "memory(GiB)": 47.95, "step": 1055, "token_acc": 0.5377906976744186, "train_speed(iter/s)": 0.340559 }, { "epoch": 0.6432038834951457, "grad_norm": 8.77694320678711, "learning_rate": 8.908025190386985e-05, "loss": 2.0633663177490233, "memory(GiB)": 47.95, "step": 1060, "token_acc": 0.546875, "train_speed(iter/s)": 0.340807 }, { "epoch": 0.6462378640776699, "grad_norm": 4.13136625289917, "learning_rate": 8.898096285826582e-05, "loss": 2.3428586959838866, "memory(GiB)": 47.95, "step": 1065, "token_acc": 0.47435897435897434, "train_speed(iter/s)": 0.340441 }, { "epoch": 0.6492718446601942, "grad_norm": 4.561661243438721, "learning_rate": 8.888128032145941e-05, "loss": 2.2523740768432616, "memory(GiB)": 47.95, "step": 1070, "token_acc": 0.494475138121547, "train_speed(iter/s)": 0.340492 }, { "epoch": 0.6523058252427184, "grad_norm": 5.565670490264893, "learning_rate": 8.878120529969061e-05, "loss": 2.4203561782836913, "memory(GiB)": 47.95, "step": 1075, "token_acc": 0.4954682779456193, "train_speed(iter/s)": 0.340725 }, { "epoch": 0.6553398058252428, "grad_norm": 3.4084830284118652, "learning_rate": 8.868073880316124e-05, "loss": 2.394355583190918, "memory(GiB)": 47.95, "step": 1080, "token_acc": 0.4782608695652174, "train_speed(iter/s)": 0.340473 }, { "epoch": 0.658373786407767, "grad_norm": 4.823993682861328, "learning_rate": 8.857988184602484e-05, "loss": 2.310639762878418, "memory(GiB)": 47.95, "step": 1085, "token_acc": 0.4750733137829912, "train_speed(iter/s)": 0.340648 }, { "epoch": 0.6614077669902912, "grad_norm": 4.821571350097656, "learning_rate": 8.84786354463765e-05, "loss": 2.1617660522460938, "memory(GiB)": 47.95, "step": 1090, "token_acc": 0.5212765957446809, "train_speed(iter/s)": 0.34065 }, { "epoch": 0.6644417475728155, "grad_norm": 5.226224899291992, "learning_rate": 8.837700062624245e-05, "loss": 2.1462074279785157, "memory(GiB)": 47.95, "step": 1095, "token_acc": 0.5164835164835165, "train_speed(iter/s)": 0.340273 }, { "epoch": 0.6674757281553398, "grad_norm": 6.323085784912109, "learning_rate": 8.827497841156986e-05, "loss": 2.472101402282715, "memory(GiB)": 47.95, "step": 1100, "token_acc": 0.4720496894409938, "train_speed(iter/s)": 0.340276 }, { "epoch": 0.6705097087378641, "grad_norm": 4.203432559967041, "learning_rate": 8.817256983221637e-05, "loss": 2.290559768676758, "memory(GiB)": 47.95, "step": 1105, "token_acc": 0.49572649572649574, "train_speed(iter/s)": 0.340202 }, { "epoch": 0.6735436893203883, "grad_norm": 5.223368167877197, "learning_rate": 8.806977592193985e-05, "loss": 2.3806716918945314, "memory(GiB)": 47.95, "step": 1110, "token_acc": 0.49222797927461137, "train_speed(iter/s)": 0.340071 }, { "epoch": 0.6765776699029126, "grad_norm": 3.787109613418579, "learning_rate": 8.796659771838777e-05, "loss": 2.3185195922851562, "memory(GiB)": 47.95, "step": 1115, "token_acc": 0.5047318611987381, "train_speed(iter/s)": 0.340161 }, { "epoch": 0.6796116504854369, "grad_norm": 4.999514102935791, "learning_rate": 8.786303626308689e-05, "loss": 2.1115589141845703, "memory(GiB)": 47.95, "step": 1120, "token_acc": 0.543046357615894, "train_speed(iter/s)": 0.339965 }, { "epoch": 0.6826456310679612, "grad_norm": 3.995405673980713, "learning_rate": 8.775909260143266e-05, "loss": 1.858725357055664, "memory(GiB)": 47.95, "step": 1125, "token_acc": 0.5618729096989966, "train_speed(iter/s)": 0.3399 }, { "epoch": 0.6856796116504854, "grad_norm": 4.423548698425293, "learning_rate": 8.765476778267874e-05, "loss": 2.1476119995117187, "memory(GiB)": 47.95, "step": 1130, "token_acc": 0.5287769784172662, "train_speed(iter/s)": 0.340171 }, { "epoch": 0.6887135922330098, "grad_norm": 3.980705976486206, "learning_rate": 8.755006285992629e-05, "loss": 2.184649848937988, "memory(GiB)": 47.95, "step": 1135, "token_acc": 0.4927536231884058, "train_speed(iter/s)": 0.340022 }, { "epoch": 0.691747572815534, "grad_norm": 4.438331127166748, "learning_rate": 8.744497889011343e-05, "loss": 2.271321487426758, "memory(GiB)": 47.95, "step": 1140, "token_acc": 0.48757763975155277, "train_speed(iter/s)": 0.340464 }, { "epoch": 0.6947815533980582, "grad_norm": 3.692849636077881, "learning_rate": 8.733951693400458e-05, "loss": 2.380019950866699, "memory(GiB)": 47.95, "step": 1145, "token_acc": 0.5017301038062284, "train_speed(iter/s)": 0.340513 }, { "epoch": 0.6978155339805825, "grad_norm": 4.832522869110107, "learning_rate": 8.723367805617965e-05, "loss": 2.1678884506225584, "memory(GiB)": 47.95, "step": 1150, "token_acc": 0.4943820224719101, "train_speed(iter/s)": 0.340676 }, { "epoch": 0.7008495145631068, "grad_norm": 6.489658355712891, "learning_rate": 8.712746332502351e-05, "loss": 2.1112688064575194, "memory(GiB)": 47.95, "step": 1155, "token_acc": 0.5625, "train_speed(iter/s)": 0.340364 }, { "epoch": 0.7038834951456311, "grad_norm": 4.604823112487793, "learning_rate": 8.702087381271488e-05, "loss": 2.1586540222167967, "memory(GiB)": 47.95, "step": 1160, "token_acc": 0.506578947368421, "train_speed(iter/s)": 0.340462 }, { "epoch": 0.7069174757281553, "grad_norm": 9.543238639831543, "learning_rate": 8.691391059521583e-05, "loss": 2.230578804016113, "memory(GiB)": 47.95, "step": 1165, "token_acc": 0.5132450331125827, "train_speed(iter/s)": 0.340554 }, { "epoch": 0.7099514563106796, "grad_norm": 3.811716079711914, "learning_rate": 8.680657475226069e-05, "loss": 2.368003082275391, "memory(GiB)": 48.13, "step": 1170, "token_acc": 0.5024390243902439, "train_speed(iter/s)": 0.340669 }, { "epoch": 0.7129854368932039, "grad_norm": 5.8555588722229, "learning_rate": 8.669886736734527e-05, "loss": 2.076985168457031, "memory(GiB)": 48.13, "step": 1175, "token_acc": 0.5297619047619048, "train_speed(iter/s)": 0.340516 }, { "epoch": 0.7160194174757282, "grad_norm": 5.868568420410156, "learning_rate": 8.659078952771592e-05, "loss": 2.270547103881836, "memory(GiB)": 48.13, "step": 1180, "token_acc": 0.5069444444444444, "train_speed(iter/s)": 0.340168 }, { "epoch": 0.7190533980582524, "grad_norm": 5.378441333770752, "learning_rate": 8.648234232435845e-05, "loss": 2.4539085388183595, "memory(GiB)": 48.13, "step": 1185, "token_acc": 0.4289855072463768, "train_speed(iter/s)": 0.340089 }, { "epoch": 0.7220873786407767, "grad_norm": 4.5980963706970215, "learning_rate": 8.63735268519873e-05, "loss": 2.4108238220214844, "memory(GiB)": 48.13, "step": 1190, "token_acc": 0.5188679245283019, "train_speed(iter/s)": 0.339992 }, { "epoch": 0.725121359223301, "grad_norm": 4.462465286254883, "learning_rate": 8.626434420903424e-05, "loss": 2.20062141418457, "memory(GiB)": 48.13, "step": 1195, "token_acc": 0.49283667621776506, "train_speed(iter/s)": 0.33982 }, { "epoch": 0.7281553398058253, "grad_norm": 3.034787654876709, "learning_rate": 8.615479549763756e-05, "loss": 2.0883810043334963, "memory(GiB)": 48.13, "step": 1200, "token_acc": 0.5190615835777126, "train_speed(iter/s)": 0.339948 }, { "epoch": 0.7311893203883495, "grad_norm": 5.098156452178955, "learning_rate": 8.604488182363074e-05, "loss": 2.131735610961914, "memory(GiB)": 48.13, "step": 1205, "token_acc": 0.504950495049505, "train_speed(iter/s)": 0.340055 }, { "epoch": 0.7342233009708737, "grad_norm": 4.831148147583008, "learning_rate": 8.593460429653133e-05, "loss": 2.2659610748291015, "memory(GiB)": 48.13, "step": 1210, "token_acc": 0.5236593059936908, "train_speed(iter/s)": 0.340248 }, { "epoch": 0.7372572815533981, "grad_norm": 3.812371015548706, "learning_rate": 8.582396402952984e-05, "loss": 2.0403234481811525, "memory(GiB)": 48.13, "step": 1215, "token_acc": 0.5467836257309941, "train_speed(iter/s)": 0.340187 }, { "epoch": 0.7402912621359223, "grad_norm": 5.171321392059326, "learning_rate": 8.571296213947838e-05, "loss": 2.442454528808594, "memory(GiB)": 48.13, "step": 1220, "token_acc": 0.49698795180722893, "train_speed(iter/s)": 0.339966 }, { "epoch": 0.7433252427184466, "grad_norm": 4.190127372741699, "learning_rate": 8.560159974687952e-05, "loss": 2.24304256439209, "memory(GiB)": 48.13, "step": 1225, "token_acc": 0.5171232876712328, "train_speed(iter/s)": 0.339932 }, { "epoch": 0.7463592233009708, "grad_norm": 4.305750846862793, "learning_rate": 8.54898779758748e-05, "loss": 2.122520637512207, "memory(GiB)": 48.13, "step": 1230, "token_acc": 0.5125, "train_speed(iter/s)": 0.340058 }, { "epoch": 0.7493932038834952, "grad_norm": 4.0683722496032715, "learning_rate": 8.537779795423359e-05, "loss": 2.5035858154296875, "memory(GiB)": 48.13, "step": 1235, "token_acc": 0.4743202416918429, "train_speed(iter/s)": 0.340066 }, { "epoch": 0.7524271844660194, "grad_norm": 3.54524827003479, "learning_rate": 8.526536081334152e-05, "loss": 2.468919563293457, "memory(GiB)": 48.13, "step": 1240, "token_acc": 0.48010610079575594, "train_speed(iter/s)": 0.339943 }, { "epoch": 0.7554611650485437, "grad_norm": 4.291368007659912, "learning_rate": 8.515256768818918e-05, "loss": 2.5299869537353517, "memory(GiB)": 48.13, "step": 1245, "token_acc": 0.494475138121547, "train_speed(iter/s)": 0.339822 }, { "epoch": 0.758495145631068, "grad_norm": 6.096860885620117, "learning_rate": 8.503941971736062e-05, "loss": 2.04672794342041, "memory(GiB)": 48.13, "step": 1250, "token_acc": 0.5066666666666667, "train_speed(iter/s)": 0.339949 }, { "epoch": 0.7615291262135923, "grad_norm": 3.5838451385498047, "learning_rate": 8.492591804302186e-05, "loss": 2.1460617065429686, "memory(GiB)": 48.13, "step": 1255, "token_acc": 0.5392491467576792, "train_speed(iter/s)": 0.340196 }, { "epoch": 0.7645631067961165, "grad_norm": 4.5352983474731445, "learning_rate": 8.481206381090934e-05, "loss": 2.333251190185547, "memory(GiB)": 48.13, "step": 1260, "token_acc": 0.48857142857142855, "train_speed(iter/s)": 0.340421 }, { "epoch": 0.7675970873786407, "grad_norm": 4.353206634521484, "learning_rate": 8.469785817031841e-05, "loss": 1.9905767440795898, "memory(GiB)": 48.13, "step": 1265, "token_acc": 0.5431654676258992, "train_speed(iter/s)": 0.340515 }, { "epoch": 0.7706310679611651, "grad_norm": 5.2052130699157715, "learning_rate": 8.458330227409168e-05, "loss": 2.4288896560668944, "memory(GiB)": 48.13, "step": 1270, "token_acc": 0.4250871080139373, "train_speed(iter/s)": 0.340714 }, { "epoch": 0.7736650485436893, "grad_norm": 4.87480354309082, "learning_rate": 8.446839727860738e-05, "loss": 2.2712291717529296, "memory(GiB)": 48.13, "step": 1275, "token_acc": 0.5067567567567568, "train_speed(iter/s)": 0.340651 }, { "epoch": 0.7766990291262136, "grad_norm": 4.463343620300293, "learning_rate": 8.435314434376773e-05, "loss": 2.43187255859375, "memory(GiB)": 48.13, "step": 1280, "token_acc": 0.47592067988668557, "train_speed(iter/s)": 0.341034 }, { "epoch": 0.7797330097087378, "grad_norm": 3.1532158851623535, "learning_rate": 8.423754463298717e-05, "loss": 2.240469551086426, "memory(GiB)": 48.13, "step": 1285, "token_acc": 0.5173210161662818, "train_speed(iter/s)": 0.34096 }, { "epoch": 0.7827669902912622, "grad_norm": 4.581279277801514, "learning_rate": 8.412159931318068e-05, "loss": 2.2193695068359376, "memory(GiB)": 48.13, "step": 1290, "token_acc": 0.5123456790123457, "train_speed(iter/s)": 0.341025 }, { "epoch": 0.7858009708737864, "grad_norm": 4.517386436462402, "learning_rate": 8.400530955475198e-05, "loss": 2.241312789916992, "memory(GiB)": 48.13, "step": 1295, "token_acc": 0.5227272727272727, "train_speed(iter/s)": 0.340967 }, { "epoch": 0.7888349514563107, "grad_norm": 4.086625576019287, "learning_rate": 8.38886765315817e-05, "loss": 2.1092390060424804, "memory(GiB)": 48.13, "step": 1300, "token_acc": 0.5186440677966102, "train_speed(iter/s)": 0.341106 }, { "epoch": 0.7918689320388349, "grad_norm": 3.8964576721191406, "learning_rate": 8.377170142101548e-05, "loss": 1.7721071243286133, "memory(GiB)": 48.13, "step": 1305, "token_acc": 0.5936395759717314, "train_speed(iter/s)": 0.340864 }, { "epoch": 0.7949029126213593, "grad_norm": 5.37955379486084, "learning_rate": 8.365438540385223e-05, "loss": 2.2174589157104494, "memory(GiB)": 48.13, "step": 1310, "token_acc": 0.5215686274509804, "train_speed(iter/s)": 0.340851 }, { "epoch": 0.7979368932038835, "grad_norm": 5.972751140594482, "learning_rate": 8.353672966433206e-05, "loss": 2.2027944564819335, "memory(GiB)": 48.13, "step": 1315, "token_acc": 0.5305555555555556, "train_speed(iter/s)": 0.340754 }, { "epoch": 0.8009708737864077, "grad_norm": 4.488302707672119, "learning_rate": 8.341873539012444e-05, "loss": 2.296087646484375, "memory(GiB)": 48.13, "step": 1320, "token_acc": 0.4896142433234421, "train_speed(iter/s)": 0.340792 }, { "epoch": 0.804004854368932, "grad_norm": 4.65601110458374, "learning_rate": 8.33004037723161e-05, "loss": 2.225381851196289, "memory(GiB)": 48.13, "step": 1325, "token_acc": 0.5164835164835165, "train_speed(iter/s)": 0.34092 }, { "epoch": 0.8070388349514563, "grad_norm": 3.5977799892425537, "learning_rate": 8.318173600539911e-05, "loss": 2.1774517059326173, "memory(GiB)": 48.13, "step": 1330, "token_acc": 0.5434173669467787, "train_speed(iter/s)": 0.341276 }, { "epoch": 0.8100728155339806, "grad_norm": 4.62757682800293, "learning_rate": 8.306273328725878e-05, "loss": 2.2286869049072267, "memory(GiB)": 48.13, "step": 1335, "token_acc": 0.5615384615384615, "train_speed(iter/s)": 0.341449 }, { "epoch": 0.8131067961165048, "grad_norm": 7.3607683181762695, "learning_rate": 8.294339681916154e-05, "loss": 1.880356216430664, "memory(GiB)": 48.13, "step": 1340, "token_acc": 0.5470383275261324, "train_speed(iter/s)": 0.341408 }, { "epoch": 0.8161407766990292, "grad_norm": 4.139468669891357, "learning_rate": 8.282372780574285e-05, "loss": 2.120819854736328, "memory(GiB)": 48.13, "step": 1345, "token_acc": 0.547112462006079, "train_speed(iter/s)": 0.341452 }, { "epoch": 0.8191747572815534, "grad_norm": 3.9162635803222656, "learning_rate": 8.270372745499506e-05, "loss": 2.360477828979492, "memory(GiB)": 48.13, "step": 1350, "token_acc": 0.4984709480122324, "train_speed(iter/s)": 0.3418 }, { "epoch": 0.8222087378640777, "grad_norm": 4.735358715057373, "learning_rate": 8.258339697825515e-05, "loss": 2.056313705444336, "memory(GiB)": 48.13, "step": 1355, "token_acc": 0.5121951219512195, "train_speed(iter/s)": 0.342025 }, { "epoch": 0.8252427184466019, "grad_norm": 7.3946709632873535, "learning_rate": 8.246273759019252e-05, "loss": 2.165603256225586, "memory(GiB)": 48.13, "step": 1360, "token_acc": 0.5220883534136547, "train_speed(iter/s)": 0.341884 }, { "epoch": 0.8282766990291263, "grad_norm": 5.227313995361328, "learning_rate": 8.234175050879684e-05, "loss": 2.5020654678344725, "memory(GiB)": 48.13, "step": 1365, "token_acc": 0.4585635359116022, "train_speed(iter/s)": 0.342147 }, { "epoch": 0.8313106796116505, "grad_norm": 5.750569820404053, "learning_rate": 8.222043695536555e-05, "loss": 2.6139543533325194, "memory(GiB)": 48.58, "step": 1370, "token_acc": 0.44481605351170567, "train_speed(iter/s)": 0.34204 }, { "epoch": 0.8343446601941747, "grad_norm": 4.407741546630859, "learning_rate": 8.20987981544917e-05, "loss": 2.5359169006347657, "memory(GiB)": 48.58, "step": 1375, "token_acc": 0.4559748427672956, "train_speed(iter/s)": 0.342226 }, { "epoch": 0.837378640776699, "grad_norm": 3.6558454036712646, "learning_rate": 8.197683533405157e-05, "loss": 2.2956287384033205, "memory(GiB)": 48.58, "step": 1380, "token_acc": 0.48148148148148145, "train_speed(iter/s)": 0.342383 }, { "epoch": 0.8404126213592233, "grad_norm": 14.889799118041992, "learning_rate": 8.185454972519213e-05, "loss": 2.3386974334716797, "memory(GiB)": 48.58, "step": 1385, "token_acc": 0.49393939393939396, "train_speed(iter/s)": 0.34219 }, { "epoch": 0.8434466019417476, "grad_norm": 3.9922854900360107, "learning_rate": 8.173194256231884e-05, "loss": 2.3400365829467775, "memory(GiB)": 48.58, "step": 1390, "token_acc": 0.5027322404371585, "train_speed(iter/s)": 0.342181 }, { "epoch": 0.8464805825242718, "grad_norm": 5.3954596519470215, "learning_rate": 8.1609015083083e-05, "loss": 2.5830408096313477, "memory(GiB)": 48.58, "step": 1395, "token_acc": 0.5051546391752577, "train_speed(iter/s)": 0.342173 }, { "epoch": 0.8495145631067961, "grad_norm": 3.9419169425964355, "learning_rate": 8.148576852836933e-05, "loss": 2.117966079711914, "memory(GiB)": 48.58, "step": 1400, "token_acc": 0.5236363636363637, "train_speed(iter/s)": 0.342202 }, { "epoch": 0.8525485436893204, "grad_norm": 5.104124069213867, "learning_rate": 8.136220414228347e-05, "loss": 2.1816055297851564, "memory(GiB)": 48.58, "step": 1405, "token_acc": 0.5094339622641509, "train_speed(iter/s)": 0.342292 }, { "epoch": 0.8555825242718447, "grad_norm": 4.876266002655029, "learning_rate": 8.123832317213933e-05, "loss": 1.918435287475586, "memory(GiB)": 48.58, "step": 1410, "token_acc": 0.56, "train_speed(iter/s)": 0.342201 }, { "epoch": 0.8586165048543689, "grad_norm": 3.879775047302246, "learning_rate": 8.111412686844664e-05, "loss": 2.233755874633789, "memory(GiB)": 48.58, "step": 1415, "token_acc": 0.5299684542586751, "train_speed(iter/s)": 0.34226 }, { "epoch": 0.8616504854368932, "grad_norm": 4.957579612731934, "learning_rate": 8.098961648489821e-05, "loss": 1.976656723022461, "memory(GiB)": 48.58, "step": 1420, "token_acc": 0.5569620253164557, "train_speed(iter/s)": 0.342173 }, { "epoch": 0.8646844660194175, "grad_norm": 4.2408294677734375, "learning_rate": 8.08647932783573e-05, "loss": 2.27285213470459, "memory(GiB)": 48.58, "step": 1425, "token_acc": 0.5102040816326531, "train_speed(iter/s)": 0.342014 }, { "epoch": 0.8677184466019418, "grad_norm": 6.029801845550537, "learning_rate": 8.073965850884496e-05, "loss": 2.1388595581054686, "memory(GiB)": 48.58, "step": 1430, "token_acc": 0.5356037151702786, "train_speed(iter/s)": 0.341982 }, { "epoch": 0.870752427184466, "grad_norm": 3.7228658199310303, "learning_rate": 8.061421343952731e-05, "loss": 1.9840200424194336, "memory(GiB)": 48.58, "step": 1435, "token_acc": 0.5352941176470588, "train_speed(iter/s)": 0.341868 }, { "epoch": 0.8737864077669902, "grad_norm": 3.244697093963623, "learning_rate": 8.048845933670273e-05, "loss": 2.0865535736083984, "memory(GiB)": 48.58, "step": 1440, "token_acc": 0.5350140056022409, "train_speed(iter/s)": 0.34217 }, { "epoch": 0.8768203883495146, "grad_norm": 3.9532413482666016, "learning_rate": 8.036239746978914e-05, "loss": 2.2547679901123048, "memory(GiB)": 48.58, "step": 1445, "token_acc": 0.5099337748344371, "train_speed(iter/s)": 0.342376 }, { "epoch": 0.8798543689320388, "grad_norm": 3.8620779514312744, "learning_rate": 8.02360291113112e-05, "loss": 2.2382217407226563, "memory(GiB)": 48.58, "step": 1450, "token_acc": 0.5445205479452054, "train_speed(iter/s)": 0.342483 }, { "epoch": 0.8828883495145631, "grad_norm": 5.215780735015869, "learning_rate": 8.010935553688741e-05, "loss": 2.3056978225708007, "memory(GiB)": 48.58, "step": 1455, "token_acc": 0.515625, "train_speed(iter/s)": 0.342328 }, { "epoch": 0.8859223300970874, "grad_norm": 4.153907299041748, "learning_rate": 7.998237802521726e-05, "loss": 2.1756689071655275, "memory(GiB)": 48.58, "step": 1460, "token_acc": 0.5348837209302325, "train_speed(iter/s)": 0.342403 }, { "epoch": 0.8889563106796117, "grad_norm": 6.095757961273193, "learning_rate": 7.985509785806827e-05, "loss": 2.0519405364990235, "memory(GiB)": 48.58, "step": 1465, "token_acc": 0.5373134328358209, "train_speed(iter/s)": 0.342281 }, { "epoch": 0.8919902912621359, "grad_norm": 6.475569248199463, "learning_rate": 7.97275163202632e-05, "loss": 2.274300765991211, "memory(GiB)": 48.58, "step": 1470, "token_acc": 0.5067567567567568, "train_speed(iter/s)": 0.342463 }, { "epoch": 0.8950242718446602, "grad_norm": 11.25387191772461, "learning_rate": 7.959963469966687e-05, "loss": 2.1999380111694338, "memory(GiB)": 48.58, "step": 1475, "token_acc": 0.5186335403726708, "train_speed(iter/s)": 0.342428 }, { "epoch": 0.8980582524271845, "grad_norm": 3.7553141117095947, "learning_rate": 7.947145428717335e-05, "loss": 2.3262218475341796, "memory(GiB)": 48.58, "step": 1480, "token_acc": 0.4919093851132686, "train_speed(iter/s)": 0.34253 }, { "epoch": 0.9010922330097088, "grad_norm": 4.932270526885986, "learning_rate": 7.934297637669281e-05, "loss": 2.239975166320801, "memory(GiB)": 48.58, "step": 1485, "token_acc": 0.5272206303724928, "train_speed(iter/s)": 0.342599 }, { "epoch": 0.904126213592233, "grad_norm": 4.741405963897705, "learning_rate": 7.921420226513852e-05, "loss": 2.41599178314209, "memory(GiB)": 48.58, "step": 1490, "token_acc": 0.4617737003058104, "train_speed(iter/s)": 0.342684 }, { "epoch": 0.9071601941747572, "grad_norm": 3.99161958694458, "learning_rate": 7.90851332524137e-05, "loss": 2.7072750091552735, "memory(GiB)": 48.58, "step": 1495, "token_acc": 0.48427672955974843, "train_speed(iter/s)": 0.342664 }, { "epoch": 0.9101941747572816, "grad_norm": 3.8044540882110596, "learning_rate": 7.895577064139848e-05, "loss": 2.2672801971435548, "memory(GiB)": 48.58, "step": 1500, "token_acc": 0.5154929577464789, "train_speed(iter/s)": 0.342326 }, { "epoch": 0.9101941747572816, "eval_loss": 2.119637966156006, "eval_runtime": 12.8007, "eval_samples_per_second": 7.812, "eval_steps_per_second": 7.812, "eval_token_acc": 0.5078597339782346, "step": 1500 }, { "epoch": 0.9132281553398058, "grad_norm": 11.186798095703125, "learning_rate": 7.882611573793663e-05, "loss": 2.142319679260254, "memory(GiB)": 48.58, "step": 1505, "token_acc": 0.5177797051170858, "train_speed(iter/s)": 0.341393 }, { "epoch": 0.9162621359223301, "grad_norm": 3.814943552017212, "learning_rate": 7.869616985082255e-05, "loss": 2.3513526916503906, "memory(GiB)": 48.58, "step": 1510, "token_acc": 0.49855907780979825, "train_speed(iter/s)": 0.341464 }, { "epoch": 0.9192961165048543, "grad_norm": 5.959221363067627, "learning_rate": 7.856593429178789e-05, "loss": 2.3082876205444336, "memory(GiB)": 48.58, "step": 1515, "token_acc": 0.4962962962962963, "train_speed(iter/s)": 0.341447 }, { "epoch": 0.9223300970873787, "grad_norm": 4.519277095794678, "learning_rate": 7.843541037548838e-05, "loss": 2.1605640411376954, "memory(GiB)": 48.58, "step": 1520, "token_acc": 0.5338078291814946, "train_speed(iter/s)": 0.341331 }, { "epoch": 0.9253640776699029, "grad_norm": 4.679305076599121, "learning_rate": 7.830459941949058e-05, "loss": 2.190970802307129, "memory(GiB)": 48.58, "step": 1525, "token_acc": 0.51010101010101, "train_speed(iter/s)": 0.34144 }, { "epoch": 0.9283980582524272, "grad_norm": 4.741668701171875, "learning_rate": 7.817350274425856e-05, "loss": 2.3047306060791017, "memory(GiB)": 48.58, "step": 1530, "token_acc": 0.5180722891566265, "train_speed(iter/s)": 0.341582 }, { "epoch": 0.9314320388349514, "grad_norm": 6.428112506866455, "learning_rate": 7.804212167314054e-05, "loss": 2.288783836364746, "memory(GiB)": 48.58, "step": 1535, "token_acc": 0.5031055900621118, "train_speed(iter/s)": 0.341857 }, { "epoch": 0.9344660194174758, "grad_norm": 4.361258029937744, "learning_rate": 7.791045753235555e-05, "loss": 2.17122745513916, "memory(GiB)": 48.58, "step": 1540, "token_acc": 0.5256410256410257, "train_speed(iter/s)": 0.341941 }, { "epoch": 0.9375, "grad_norm": 4.059023380279541, "learning_rate": 7.777851165098012e-05, "loss": 2.033574676513672, "memory(GiB)": 48.58, "step": 1545, "token_acc": 0.5512367491166078, "train_speed(iter/s)": 0.342205 }, { "epoch": 0.9405339805825242, "grad_norm": 4.767461776733398, "learning_rate": 7.76462853609347e-05, "loss": 2.313043785095215, "memory(GiB)": 48.58, "step": 1550, "token_acc": 0.5014005602240896, "train_speed(iter/s)": 0.342114 }, { "epoch": 0.9435679611650486, "grad_norm": 5.499074935913086, "learning_rate": 7.751377999697043e-05, "loss": 2.171670913696289, "memory(GiB)": 48.58, "step": 1555, "token_acc": 0.49337748344370863, "train_speed(iter/s)": 0.342101 }, { "epoch": 0.9466019417475728, "grad_norm": 4.537415981292725, "learning_rate": 7.73809968966554e-05, "loss": 2.1898992538452147, "memory(GiB)": 48.58, "step": 1560, "token_acc": 0.5316091954022989, "train_speed(iter/s)": 0.342011 }, { "epoch": 0.9496359223300971, "grad_norm": 5.201609134674072, "learning_rate": 7.724793740036142e-05, "loss": 2.514649200439453, "memory(GiB)": 48.58, "step": 1565, "token_acc": 0.4584717607973422, "train_speed(iter/s)": 0.342007 }, { "epoch": 0.9526699029126213, "grad_norm": 4.430958271026611, "learning_rate": 7.711460285125028e-05, "loss": 1.884146499633789, "memory(GiB)": 48.58, "step": 1570, "token_acc": 0.5508771929824562, "train_speed(iter/s)": 0.342026 }, { "epoch": 0.9557038834951457, "grad_norm": 3.8485047817230225, "learning_rate": 7.698099459526034e-05, "loss": 2.2105873107910154, "memory(GiB)": 48.58, "step": 1575, "token_acc": 0.5223367697594502, "train_speed(iter/s)": 0.341925 }, { "epoch": 0.9587378640776699, "grad_norm": 4.212836742401123, "learning_rate": 7.684711398109284e-05, "loss": 2.2711381912231445, "memory(GiB)": 48.58, "step": 1580, "token_acc": 0.5085714285714286, "train_speed(iter/s)": 0.341977 }, { "epoch": 0.9617718446601942, "grad_norm": 3.729053497314453, "learning_rate": 7.67129623601983e-05, "loss": 2.011947250366211, "memory(GiB)": 48.58, "step": 1585, "token_acc": 0.5614035087719298, "train_speed(iter/s)": 0.341976 }, { "epoch": 0.9648058252427184, "grad_norm": 3.948821544647217, "learning_rate": 7.657854108676299e-05, "loss": 2.060837173461914, "memory(GiB)": 48.58, "step": 1590, "token_acc": 0.5283582089552239, "train_speed(iter/s)": 0.341767 }, { "epoch": 0.9678398058252428, "grad_norm": 3.723236560821533, "learning_rate": 7.644385151769509e-05, "loss": 2.3746822357177733, "memory(GiB)": 48.58, "step": 1595, "token_acc": 0.49865951742627346, "train_speed(iter/s)": 0.341682 }, { "epoch": 0.970873786407767, "grad_norm": 4.396999359130859, "learning_rate": 7.630889501261109e-05, "loss": 1.9808526992797852, "memory(GiB)": 48.58, "step": 1600, "token_acc": 0.5483870967741935, "train_speed(iter/s)": 0.341715 }, { "epoch": 0.9739077669902912, "grad_norm": 5.338048458099365, "learning_rate": 7.617367293382211e-05, "loss": 2.3883081436157227, "memory(GiB)": 48.58, "step": 1605, "token_acc": 0.504885993485342, "train_speed(iter/s)": 0.341843 }, { "epoch": 0.9769417475728155, "grad_norm": 4.621726036071777, "learning_rate": 7.603818664632001e-05, "loss": 2.209703254699707, "memory(GiB)": 48.58, "step": 1610, "token_acc": 0.5437956204379562, "train_speed(iter/s)": 0.341818 }, { "epoch": 0.9799757281553398, "grad_norm": 4.83400297164917, "learning_rate": 7.590243751776374e-05, "loss": 2.2092563629150392, "memory(GiB)": 48.58, "step": 1615, "token_acc": 0.47896440129449835, "train_speed(iter/s)": 0.341839 }, { "epoch": 0.9830097087378641, "grad_norm": 5.153246879577637, "learning_rate": 7.576642691846546e-05, "loss": 2.221535491943359, "memory(GiB)": 48.58, "step": 1620, "token_acc": 0.5216049382716049, "train_speed(iter/s)": 0.341961 }, { "epoch": 0.9860436893203883, "grad_norm": 3.5656988620758057, "learning_rate": 7.563015622137674e-05, "loss": 2.3794229507446287, "memory(GiB)": 48.58, "step": 1625, "token_acc": 0.5082417582417582, "train_speed(iter/s)": 0.341866 }, { "epoch": 0.9890776699029126, "grad_norm": 3.4351837635040283, "learning_rate": 7.549362680207472e-05, "loss": 2.264466094970703, "memory(GiB)": 48.58, "step": 1630, "token_acc": 0.5377643504531722, "train_speed(iter/s)": 0.341987 }, { "epoch": 0.9921116504854369, "grad_norm": 4.98211669921875, "learning_rate": 7.535684003874816e-05, "loss": 2.1651094436645506, "memory(GiB)": 48.58, "step": 1635, "token_acc": 0.562111801242236, "train_speed(iter/s)": 0.342079 }, { "epoch": 0.9951456310679612, "grad_norm": 5.339380741119385, "learning_rate": 7.521979731218356e-05, "loss": 2.2094818115234376, "memory(GiB)": 48.58, "step": 1640, "token_acc": 0.5306122448979592, "train_speed(iter/s)": 0.342034 }, { "epoch": 0.9981796116504854, "grad_norm": 4.058453559875488, "learning_rate": 7.508250000575125e-05, "loss": 2.265620231628418, "memory(GiB)": 48.58, "step": 1645, "token_acc": 0.5127272727272727, "train_speed(iter/s)": 0.341844 }, { "epoch": 1.0012135922330097, "grad_norm": 4.797982215881348, "learning_rate": 7.494494950539143e-05, "loss": 2.067191314697266, "memory(GiB)": 48.58, "step": 1650, "token_acc": 0.5459770114942529, "train_speed(iter/s)": 0.342021 }, { "epoch": 1.004247572815534, "grad_norm": 4.9101762771606445, "learning_rate": 7.480714719960007e-05, "loss": 1.926223373413086, "memory(GiB)": 48.58, "step": 1655, "token_acc": 0.5277777777777778, "train_speed(iter/s)": 0.341734 }, { "epoch": 1.0072815533980584, "grad_norm": 5.355608940124512, "learning_rate": 7.466909447941508e-05, "loss": 1.9348087310791016, "memory(GiB)": 48.58, "step": 1660, "token_acc": 0.5570934256055363, "train_speed(iter/s)": 0.341642 }, { "epoch": 1.0103155339805825, "grad_norm": 6.056564807891846, "learning_rate": 7.453079273840207e-05, "loss": 2.060457229614258, "memory(GiB)": 48.58, "step": 1665, "token_acc": 0.5436893203883495, "train_speed(iter/s)": 0.341622 }, { "epoch": 1.0133495145631068, "grad_norm": 3.841477632522583, "learning_rate": 7.439224337264043e-05, "loss": 1.9137187957763673, "memory(GiB)": 48.58, "step": 1670, "token_acc": 0.5869565217391305, "train_speed(iter/s)": 0.341438 }, { "epoch": 1.016383495145631, "grad_norm": 2.8432776927948, "learning_rate": 7.425344778070917e-05, "loss": 1.8410741806030273, "memory(GiB)": 48.58, "step": 1675, "token_acc": 0.5627240143369175, "train_speed(iter/s)": 0.341365 }, { "epoch": 1.0194174757281553, "grad_norm": 5.52289342880249, "learning_rate": 7.411440736367281e-05, "loss": 2.0187950134277344, "memory(GiB)": 48.58, "step": 1680, "token_acc": 0.5666666666666667, "train_speed(iter/s)": 0.341423 }, { "epoch": 1.0224514563106797, "grad_norm": 4.637285232543945, "learning_rate": 7.397512352506727e-05, "loss": 1.7935161590576172, "memory(GiB)": 48.58, "step": 1685, "token_acc": 0.565068493150685, "train_speed(iter/s)": 0.34143 }, { "epoch": 1.0254854368932038, "grad_norm": 5.479421615600586, "learning_rate": 7.383559767088566e-05, "loss": 2.355337142944336, "memory(GiB)": 48.58, "step": 1690, "token_acc": 0.5069444444444444, "train_speed(iter/s)": 0.341486 }, { "epoch": 1.0285194174757282, "grad_norm": 3.832714557647705, "learning_rate": 7.369583120956407e-05, "loss": 2.019994926452637, "memory(GiB)": 48.58, "step": 1695, "token_acc": 0.5340136054421769, "train_speed(iter/s)": 0.341666 }, { "epoch": 1.0315533980582525, "grad_norm": 4.21201753616333, "learning_rate": 7.355582555196745e-05, "loss": 2.0614465713500976, "memory(GiB)": 48.58, "step": 1700, "token_acc": 0.5492063492063493, "train_speed(iter/s)": 0.341771 }, { "epoch": 1.0345873786407767, "grad_norm": 5.133928298950195, "learning_rate": 7.341558211137526e-05, "loss": 2.0001636505126954, "memory(GiB)": 48.58, "step": 1705, "token_acc": 0.5394736842105263, "train_speed(iter/s)": 0.341705 }, { "epoch": 1.037621359223301, "grad_norm": 4.81807279586792, "learning_rate": 7.327510230346726e-05, "loss": 2.3588188171386717, "memory(GiB)": 48.58, "step": 1710, "token_acc": 0.4850498338870432, "train_speed(iter/s)": 0.341603 }, { "epoch": 1.0406553398058251, "grad_norm": 3.6796188354492188, "learning_rate": 7.313438754630918e-05, "loss": 2.20300178527832, "memory(GiB)": 48.58, "step": 1715, "token_acc": 0.5125, "train_speed(iter/s)": 0.34189 }, { "epoch": 1.0436893203883495, "grad_norm": 4.132872581481934, "learning_rate": 7.299343926033851e-05, "loss": 2.326984405517578, "memory(GiB)": 48.58, "step": 1720, "token_acc": 0.48606811145510836, "train_speed(iter/s)": 0.341702 }, { "epoch": 1.0467233009708738, "grad_norm": 3.4396705627441406, "learning_rate": 7.285225886834997e-05, "loss": 2.050062561035156, "memory(GiB)": 48.58, "step": 1725, "token_acc": 0.4968944099378882, "train_speed(iter/s)": 0.341829 }, { "epoch": 1.049757281553398, "grad_norm": 4.9279985427856445, "learning_rate": 7.271084779548136e-05, "loss": 1.93597412109375, "memory(GiB)": 48.58, "step": 1730, "token_acc": 0.5696969696969697, "train_speed(iter/s)": 0.342031 }, { "epoch": 1.0527912621359223, "grad_norm": 5.022060871124268, "learning_rate": 7.256920746919904e-05, "loss": 2.4032569885253907, "memory(GiB)": 48.58, "step": 1735, "token_acc": 0.4864864864864865, "train_speed(iter/s)": 0.342023 }, { "epoch": 1.0558252427184467, "grad_norm": 3.5440573692321777, "learning_rate": 7.242733931928352e-05, "loss": 2.14691047668457, "memory(GiB)": 48.58, "step": 1740, "token_acc": 0.5369127516778524, "train_speed(iter/s)": 0.342162 }, { "epoch": 1.0588592233009708, "grad_norm": 8.292412757873535, "learning_rate": 7.228524477781514e-05, "loss": 2.236528778076172, "memory(GiB)": 48.58, "step": 1745, "token_acc": 0.5181159420289855, "train_speed(iter/s)": 0.342361 }, { "epoch": 1.0618932038834952, "grad_norm": 3.835604190826416, "learning_rate": 7.214292527915949e-05, "loss": 2.2530035018920898, "memory(GiB)": 48.58, "step": 1750, "token_acc": 0.5201072386058981, "train_speed(iter/s)": 0.34242 }, { "epoch": 1.0649271844660193, "grad_norm": 4.860381126403809, "learning_rate": 7.200038225995294e-05, "loss": 1.919698715209961, "memory(GiB)": 48.58, "step": 1755, "token_acc": 0.556497175141243, "train_speed(iter/s)": 0.342549 }, { "epoch": 1.0679611650485437, "grad_norm": 5.044214725494385, "learning_rate": 7.185761715908825e-05, "loss": 2.1757881164550783, "memory(GiB)": 48.58, "step": 1760, "token_acc": 0.5277777777777778, "train_speed(iter/s)": 0.34263 }, { "epoch": 1.070995145631068, "grad_norm": 6.300179481506348, "learning_rate": 7.171463141769994e-05, "loss": 1.898487663269043, "memory(GiB)": 48.58, "step": 1765, "token_acc": 0.5714285714285714, "train_speed(iter/s)": 0.342719 }, { "epoch": 1.0740291262135921, "grad_norm": 5.766304969787598, "learning_rate": 7.157142647914979e-05, "loss": 2.0412919998168944, "memory(GiB)": 48.58, "step": 1770, "token_acc": 0.5673758865248227, "train_speed(iter/s)": 0.342859 }, { "epoch": 1.0770631067961165, "grad_norm": 4.097749710083008, "learning_rate": 7.14280037890122e-05, "loss": 1.9373649597167968, "memory(GiB)": 48.58, "step": 1775, "token_acc": 0.5607843137254902, "train_speed(iter/s)": 0.342985 }, { "epoch": 1.0800970873786409, "grad_norm": 4.749823570251465, "learning_rate": 7.128436479505971e-05, "loss": 1.9778429031372071, "memory(GiB)": 48.58, "step": 1780, "token_acc": 0.5407166123778502, "train_speed(iter/s)": 0.343072 }, { "epoch": 1.083131067961165, "grad_norm": 4.9614129066467285, "learning_rate": 7.114051094724831e-05, "loss": 2.216476058959961, "memory(GiB)": 48.58, "step": 1785, "token_acc": 0.5106382978723404, "train_speed(iter/s)": 0.343268 }, { "epoch": 1.0861650485436893, "grad_norm": 3.5989792346954346, "learning_rate": 7.09964436977028e-05, "loss": 2.0698268890380858, "memory(GiB)": 48.58, "step": 1790, "token_acc": 0.5252225519287834, "train_speed(iter/s)": 0.343266 }, { "epoch": 1.0891990291262137, "grad_norm": 6.2526702880859375, "learning_rate": 7.085216450070218e-05, "loss": 2.1593814849853517, "memory(GiB)": 48.58, "step": 1795, "token_acc": 0.5152354570637119, "train_speed(iter/s)": 0.343202 }, { "epoch": 1.0922330097087378, "grad_norm": 4.004349231719971, "learning_rate": 7.070767481266492e-05, "loss": 2.3234344482421876, "memory(GiB)": 48.58, "step": 1800, "token_acc": 0.5234899328859061, "train_speed(iter/s)": 0.3435 }, { "epoch": 1.0952669902912622, "grad_norm": 4.335809230804443, "learning_rate": 7.056297609213432e-05, "loss": 2.0612245559692384, "memory(GiB)": 48.58, "step": 1805, "token_acc": 0.48854961832061067, "train_speed(iter/s)": 0.343493 }, { "epoch": 1.0983009708737863, "grad_norm": 3.820521831512451, "learning_rate": 7.041806979976368e-05, "loss": 2.3006450653076174, "memory(GiB)": 48.58, "step": 1810, "token_acc": 0.5, "train_speed(iter/s)": 0.343654 }, { "epoch": 1.1013349514563107, "grad_norm": 5.044717311859131, "learning_rate": 7.027295739830169e-05, "loss": 2.227552795410156, "memory(GiB)": 48.58, "step": 1815, "token_acc": 0.528023598820059, "train_speed(iter/s)": 0.343564 }, { "epoch": 1.104368932038835, "grad_norm": 5.343876838684082, "learning_rate": 7.012764035257756e-05, "loss": 2.2927597045898436, "memory(GiB)": 48.58, "step": 1820, "token_acc": 0.49299719887955185, "train_speed(iter/s)": 0.343722 }, { "epoch": 1.1074029126213591, "grad_norm": 7.605224132537842, "learning_rate": 6.998212012948626e-05, "loss": 2.1167228698730467, "memory(GiB)": 48.58, "step": 1825, "token_acc": 0.5152542372881356, "train_speed(iter/s)": 0.343618 }, { "epoch": 1.1104368932038835, "grad_norm": 6.60134220123291, "learning_rate": 6.983639819797377e-05, "loss": 1.9928577423095704, "memory(GiB)": 48.58, "step": 1830, "token_acc": 0.5773584905660377, "train_speed(iter/s)": 0.343604 }, { "epoch": 1.1134708737864079, "grad_norm": 5.24697732925415, "learning_rate": 6.969047602902213e-05, "loss": 2.319891357421875, "memory(GiB)": 48.58, "step": 1835, "token_acc": 0.48264984227129337, "train_speed(iter/s)": 0.343622 }, { "epoch": 1.116504854368932, "grad_norm": 4.553162097930908, "learning_rate": 6.954435509563478e-05, "loss": 2.2016368865966798, "memory(GiB)": 48.58, "step": 1840, "token_acc": 0.5473684210526316, "train_speed(iter/s)": 0.343783 }, { "epoch": 1.1195388349514563, "grad_norm": 4.612880706787109, "learning_rate": 6.939803687282146e-05, "loss": 2.2357139587402344, "memory(GiB)": 48.58, "step": 1845, "token_acc": 0.5064102564102564, "train_speed(iter/s)": 0.343682 }, { "epoch": 1.1225728155339807, "grad_norm": 3.8921847343444824, "learning_rate": 6.925152283758348e-05, "loss": 2.321514701843262, "memory(GiB)": 48.58, "step": 1850, "token_acc": 0.5087719298245614, "train_speed(iter/s)": 0.343812 }, { "epoch": 1.1256067961165048, "grad_norm": 5.028384208679199, "learning_rate": 6.91048144688988e-05, "loss": 1.983698844909668, "memory(GiB)": 48.58, "step": 1855, "token_acc": 0.5689655172413793, "train_speed(iter/s)": 0.343792 }, { "epoch": 1.1286407766990292, "grad_norm": 6.004411220550537, "learning_rate": 6.895791324770701e-05, "loss": 2.239480972290039, "memory(GiB)": 48.58, "step": 1860, "token_acc": 0.5445205479452054, "train_speed(iter/s)": 0.343677 }, { "epoch": 1.1316747572815533, "grad_norm": 4.040624141693115, "learning_rate": 6.881082065689453e-05, "loss": 2.1818286895751955, "memory(GiB)": 48.58, "step": 1865, "token_acc": 0.5062111801242236, "train_speed(iter/s)": 0.343504 }, { "epoch": 1.1347087378640777, "grad_norm": 5.721238136291504, "learning_rate": 6.866353818127942e-05, "loss": 1.9519981384277343, "memory(GiB)": 48.58, "step": 1870, "token_acc": 0.5530973451327433, "train_speed(iter/s)": 0.343403 }, { "epoch": 1.137742718446602, "grad_norm": 5.189139366149902, "learning_rate": 6.851606730759664e-05, "loss": 1.690359878540039, "memory(GiB)": 48.58, "step": 1875, "token_acc": 0.5765765765765766, "train_speed(iter/s)": 0.343485 }, { "epoch": 1.1407766990291262, "grad_norm": 7.511564254760742, "learning_rate": 6.836840952448285e-05, "loss": 2.1529712677001953, "memory(GiB)": 48.58, "step": 1880, "token_acc": 0.4938650306748466, "train_speed(iter/s)": 0.343653 }, { "epoch": 1.1438106796116505, "grad_norm": 4.377907752990723, "learning_rate": 6.82205663224615e-05, "loss": 2.254614067077637, "memory(GiB)": 48.58, "step": 1885, "token_acc": 0.5298245614035088, "train_speed(iter/s)": 0.34361 }, { "epoch": 1.1468446601941746, "grad_norm": 7.221999645233154, "learning_rate": 6.807253919392773e-05, "loss": 1.817204475402832, "memory(GiB)": 48.58, "step": 1890, "token_acc": 0.5785714285714286, "train_speed(iter/s)": 0.343532 }, { "epoch": 1.149878640776699, "grad_norm": 4.545262336730957, "learning_rate": 6.792432963313328e-05, "loss": 2.210196304321289, "memory(GiB)": 48.58, "step": 1895, "token_acc": 0.5540983606557377, "train_speed(iter/s)": 0.343535 }, { "epoch": 1.1529126213592233, "grad_norm": 4.8057780265808105, "learning_rate": 6.777593913617152e-05, "loss": 2.015823173522949, "memory(GiB)": 48.58, "step": 1900, "token_acc": 0.5443037974683544, "train_speed(iter/s)": 0.343463 }, { "epoch": 1.1559466019417475, "grad_norm": 3.566347122192383, "learning_rate": 6.762736920096218e-05, "loss": 2.1104352951049803, "memory(GiB)": 48.58, "step": 1905, "token_acc": 0.52, "train_speed(iter/s)": 0.343355 }, { "epoch": 1.1589805825242718, "grad_norm": 5.208457946777344, "learning_rate": 6.747862132723641e-05, "loss": 2.0910247802734374, "memory(GiB)": 48.58, "step": 1910, "token_acc": 0.5251572327044025, "train_speed(iter/s)": 0.343367 }, { "epoch": 1.1620145631067962, "grad_norm": 4.095450401306152, "learning_rate": 6.732969701652145e-05, "loss": 2.177256965637207, "memory(GiB)": 48.58, "step": 1915, "token_acc": 0.504225352112676, "train_speed(iter/s)": 0.343392 }, { "epoch": 1.1650485436893203, "grad_norm": 4.347114086151123, "learning_rate": 6.718059777212567e-05, "loss": 2.0958702087402346, "memory(GiB)": 48.58, "step": 1920, "token_acc": 0.5451388888888888, "train_speed(iter/s)": 0.343334 }, { "epoch": 1.1680825242718447, "grad_norm": 4.913068771362305, "learning_rate": 6.703132509912322e-05, "loss": 1.880838966369629, "memory(GiB)": 48.58, "step": 1925, "token_acc": 0.56657223796034, "train_speed(iter/s)": 0.343531 }, { "epoch": 1.171116504854369, "grad_norm": 5.108087062835693, "learning_rate": 6.688188050433897e-05, "loss": 2.1963933944702148, "memory(GiB)": 48.58, "step": 1930, "token_acc": 0.5331010452961672, "train_speed(iter/s)": 0.343475 }, { "epoch": 1.1741504854368932, "grad_norm": 4.714470863342285, "learning_rate": 6.673226549633325e-05, "loss": 2.2074684143066405, "memory(GiB)": 48.58, "step": 1935, "token_acc": 0.5238095238095238, "train_speed(iter/s)": 0.343522 }, { "epoch": 1.1771844660194175, "grad_norm": 3.4806647300720215, "learning_rate": 6.658248158538655e-05, "loss": 1.9368829727172852, "memory(GiB)": 48.58, "step": 1940, "token_acc": 0.5827338129496403, "train_speed(iter/s)": 0.343768 }, { "epoch": 1.1802184466019416, "grad_norm": 5.499170780181885, "learning_rate": 6.643253028348443e-05, "loss": 2.1628402709960937, "memory(GiB)": 48.58, "step": 1945, "token_acc": 0.5033333333333333, "train_speed(iter/s)": 0.343738 }, { "epoch": 1.183252427184466, "grad_norm": 3.957730531692505, "learning_rate": 6.628241310430208e-05, "loss": 1.742011260986328, "memory(GiB)": 48.58, "step": 1950, "token_acc": 0.6021505376344086, "train_speed(iter/s)": 0.343527 }, { "epoch": 1.1862864077669903, "grad_norm": 4.180270195007324, "learning_rate": 6.613213156318921e-05, "loss": 1.8580095291137695, "memory(GiB)": 48.58, "step": 1955, "token_acc": 0.5582089552238806, "train_speed(iter/s)": 0.343565 }, { "epoch": 1.1893203883495145, "grad_norm": 7.1107587814331055, "learning_rate": 6.598168717715462e-05, "loss": 2.3417797088623047, "memory(GiB)": 48.58, "step": 1960, "token_acc": 0.5205992509363296, "train_speed(iter/s)": 0.343421 }, { "epoch": 1.1923543689320388, "grad_norm": 6.8834547996521, "learning_rate": 6.583108146485092e-05, "loss": 1.9168407440185546, "memory(GiB)": 48.58, "step": 1965, "token_acc": 0.5140845070422535, "train_speed(iter/s)": 0.34352 }, { "epoch": 1.1953883495145632, "grad_norm": 3.3027169704437256, "learning_rate": 6.568031594655933e-05, "loss": 2.27227668762207, "memory(GiB)": 48.58, "step": 1970, "token_acc": 0.5558659217877095, "train_speed(iter/s)": 0.343451 }, { "epoch": 1.1984223300970873, "grad_norm": 5.00934362411499, "learning_rate": 6.552939214417411e-05, "loss": 2.115768814086914, "memory(GiB)": 48.58, "step": 1975, "token_acc": 0.5588235294117647, "train_speed(iter/s)": 0.34355 }, { "epoch": 1.2014563106796117, "grad_norm": 7.091533184051514, "learning_rate": 6.537831158118732e-05, "loss": 2.265652656555176, "memory(GiB)": 48.58, "step": 1980, "token_acc": 0.5192307692307693, "train_speed(iter/s)": 0.343554 }, { "epoch": 1.204490291262136, "grad_norm": 3.855525255203247, "learning_rate": 6.522707578267349e-05, "loss": 2.0944122314453124, "memory(GiB)": 48.58, "step": 1985, "token_acc": 0.5326370757180157, "train_speed(iter/s)": 0.343645 }, { "epoch": 1.2075242718446602, "grad_norm": 4.063835620880127, "learning_rate": 6.507568627527411e-05, "loss": 2.325888442993164, "memory(GiB)": 48.58, "step": 1990, "token_acc": 0.5261538461538462, "train_speed(iter/s)": 0.343594 }, { "epoch": 1.2105582524271845, "grad_norm": 4.776934623718262, "learning_rate": 6.492414458718235e-05, "loss": 2.093099594116211, "memory(GiB)": 48.58, "step": 1995, "token_acc": 0.5302013422818792, "train_speed(iter/s)": 0.343629 }, { "epoch": 1.2135922330097086, "grad_norm": 4.554028034210205, "learning_rate": 6.477245224812745e-05, "loss": 1.8396465301513671, "memory(GiB)": 48.58, "step": 2000, "token_acc": 0.5603112840466926, "train_speed(iter/s)": 0.343697 }, { "epoch": 1.2135922330097086, "eval_loss": 2.046851873397827, "eval_runtime": 12.7671, "eval_samples_per_second": 7.833, "eval_steps_per_second": 7.833, "eval_token_acc": 0.5554035567715458, "step": 2000 }, { "epoch": 1.216626213592233, "grad_norm": 4.073495864868164, "learning_rate": 6.462061078935951e-05, "loss": 2.3618484497070313, "memory(GiB)": 48.58, "step": 2005, "token_acc": 0.53719723183391, "train_speed(iter/s)": 0.342965 }, { "epoch": 1.2196601941747574, "grad_norm": 7.868297100067139, "learning_rate": 6.446862174363378e-05, "loss": 2.216505432128906, "memory(GiB)": 48.58, "step": 2010, "token_acc": 0.51, "train_speed(iter/s)": 0.34307 }, { "epoch": 1.2226941747572815, "grad_norm": 4.013204097747803, "learning_rate": 6.431648664519544e-05, "loss": 2.0036340713500977, "memory(GiB)": 48.58, "step": 2015, "token_acc": 0.5758620689655173, "train_speed(iter/s)": 0.34307 }, { "epoch": 1.2257281553398058, "grad_norm": 4.843589782714844, "learning_rate": 6.416420702976393e-05, "loss": 2.211021614074707, "memory(GiB)": 48.58, "step": 2020, "token_acc": 0.5163636363636364, "train_speed(iter/s)": 0.343153 }, { "epoch": 1.2287621359223302, "grad_norm": 4.74764347076416, "learning_rate": 6.401178443451751e-05, "loss": 1.9067876815795899, "memory(GiB)": 48.58, "step": 2025, "token_acc": 0.5563139931740614, "train_speed(iter/s)": 0.343034 }, { "epoch": 1.2317961165048543, "grad_norm": 5.631340503692627, "learning_rate": 6.385922039807773e-05, "loss": 2.320001220703125, "memory(GiB)": 48.58, "step": 2030, "token_acc": 0.47678018575851394, "train_speed(iter/s)": 0.343001 }, { "epoch": 1.2348300970873787, "grad_norm": 16.38920021057129, "learning_rate": 6.370651646049398e-05, "loss": 1.9658702850341796, "memory(GiB)": 48.58, "step": 2035, "token_acc": 0.576271186440678, "train_speed(iter/s)": 0.342744 }, { "epoch": 1.237864077669903, "grad_norm": 5.151001453399658, "learning_rate": 6.355367416322779e-05, "loss": 2.308607482910156, "memory(GiB)": 48.58, "step": 2040, "token_acc": 0.4844290657439446, "train_speed(iter/s)": 0.342947 }, { "epoch": 1.2408980582524272, "grad_norm": 5.743799209594727, "learning_rate": 6.340069504913737e-05, "loss": 2.211442756652832, "memory(GiB)": 48.58, "step": 2045, "token_acc": 0.4870967741935484, "train_speed(iter/s)": 0.343038 }, { "epoch": 1.2439320388349515, "grad_norm": 3.5934197902679443, "learning_rate": 6.324758066246211e-05, "loss": 2.199032211303711, "memory(GiB)": 48.58, "step": 2050, "token_acc": 0.5111731843575419, "train_speed(iter/s)": 0.343234 }, { "epoch": 1.2469660194174756, "grad_norm": 5.036136150360107, "learning_rate": 6.309433254880675e-05, "loss": 2.0224218368530273, "memory(GiB)": 48.58, "step": 2055, "token_acc": 0.5432098765432098, "train_speed(iter/s)": 0.343337 }, { "epoch": 1.25, "grad_norm": 6.216937065124512, "learning_rate": 6.294095225512603e-05, "loss": 1.941237449645996, "memory(GiB)": 48.58, "step": 2060, "token_acc": 0.5746268656716418, "train_speed(iter/s)": 0.343276 }, { "epoch": 1.2530339805825244, "grad_norm": 5.691662311553955, "learning_rate": 6.278744132970899e-05, "loss": 2.1386714935302735, "memory(GiB)": 48.58, "step": 2065, "token_acc": 0.5145631067961165, "train_speed(iter/s)": 0.343377 }, { "epoch": 1.2560679611650485, "grad_norm": 3.808877468109131, "learning_rate": 6.263380132216328e-05, "loss": 2.1295469284057615, "memory(GiB)": 48.58, "step": 2070, "token_acc": 0.50814332247557, "train_speed(iter/s)": 0.343455 }, { "epoch": 1.2591019417475728, "grad_norm": 4.166950225830078, "learning_rate": 6.248003378339958e-05, "loss": 2.106120491027832, "memory(GiB)": 48.58, "step": 2075, "token_acc": 0.5261437908496732, "train_speed(iter/s)": 0.343588 }, { "epoch": 1.262135922330097, "grad_norm": 4.258691787719727, "learning_rate": 6.232614026561587e-05, "loss": 2.0506404876708983, "memory(GiB)": 48.58, "step": 2080, "token_acc": 0.5397350993377483, "train_speed(iter/s)": 0.343482 }, { "epoch": 1.2651699029126213, "grad_norm": 4.840569496154785, "learning_rate": 6.217212232228189e-05, "loss": 2.3917665481567383, "memory(GiB)": 48.58, "step": 2085, "token_acc": 0.45098039215686275, "train_speed(iter/s)": 0.343456 }, { "epoch": 1.2682038834951457, "grad_norm": 3.033198833465576, "learning_rate": 6.201798150812338e-05, "loss": 2.2491912841796875, "memory(GiB)": 48.58, "step": 2090, "token_acc": 0.5337423312883436, "train_speed(iter/s)": 0.343298 }, { "epoch": 1.27123786407767, "grad_norm": 5.938803195953369, "learning_rate": 6.186371937910637e-05, "loss": 1.9746942520141602, "memory(GiB)": 48.58, "step": 2095, "token_acc": 0.5381944444444444, "train_speed(iter/s)": 0.343224 }, { "epoch": 1.2742718446601942, "grad_norm": 5.278983116149902, "learning_rate": 6.170933749242152e-05, "loss": 2.1614307403564452, "memory(GiB)": 48.58, "step": 2100, "token_acc": 0.554006968641115, "train_speed(iter/s)": 0.343109 }, { "epoch": 1.2773058252427185, "grad_norm": 5.103209972381592, "learning_rate": 6.155483740646832e-05, "loss": 2.300009536743164, "memory(GiB)": 48.58, "step": 2105, "token_acc": 0.5179640718562875, "train_speed(iter/s)": 0.342943 }, { "epoch": 1.2803398058252426, "grad_norm": 8.197809219360352, "learning_rate": 6.140022068083948e-05, "loss": 2.3231576919555663, "memory(GiB)": 48.58, "step": 2110, "token_acc": 0.5, "train_speed(iter/s)": 0.342977 }, { "epoch": 1.283373786407767, "grad_norm": 4.649695873260498, "learning_rate": 6.124548887630508e-05, "loss": 1.9816240310668944, "memory(GiB)": 48.58, "step": 2115, "token_acc": 0.5169712793733682, "train_speed(iter/s)": 0.342965 }, { "epoch": 1.2864077669902914, "grad_norm": 4.234870433807373, "learning_rate": 6.109064355479692e-05, "loss": 2.1336917877197266, "memory(GiB)": 48.58, "step": 2120, "token_acc": 0.51875, "train_speed(iter/s)": 0.342927 }, { "epoch": 1.2894417475728155, "grad_norm": 4.564899444580078, "learning_rate": 6.093568627939261e-05, "loss": 2.1313234329223634, "memory(GiB)": 48.58, "step": 2125, "token_acc": 0.5229357798165137, "train_speed(iter/s)": 0.342915 }, { "epoch": 1.2924757281553398, "grad_norm": 4.541316032409668, "learning_rate": 6.078061861429995e-05, "loss": 2.2283203125, "memory(GiB)": 48.58, "step": 2130, "token_acc": 0.5156794425087108, "train_speed(iter/s)": 0.342938 }, { "epoch": 1.295509708737864, "grad_norm": 4.162906169891357, "learning_rate": 6.062544212484096e-05, "loss": 2.115195465087891, "memory(GiB)": 48.58, "step": 2135, "token_acc": 0.4984984984984985, "train_speed(iter/s)": 0.342993 }, { "epoch": 1.2985436893203883, "grad_norm": 4.833617210388184, "learning_rate": 6.047015837743629e-05, "loss": 2.023375129699707, "memory(GiB)": 49.62, "step": 2140, "token_acc": 0.574468085106383, "train_speed(iter/s)": 0.343009 }, { "epoch": 1.3015776699029127, "grad_norm": 4.030360221862793, "learning_rate": 6.031476893958926e-05, "loss": 2.1666656494140626, "memory(GiB)": 49.62, "step": 2145, "token_acc": 0.4728434504792332, "train_speed(iter/s)": 0.342938 }, { "epoch": 1.3046116504854368, "grad_norm": 3.855311632156372, "learning_rate": 6.015927537987004e-05, "loss": 2.200743865966797, "memory(GiB)": 49.62, "step": 2150, "token_acc": 0.5251989389920424, "train_speed(iter/s)": 0.342784 }, { "epoch": 1.3076456310679612, "grad_norm": 7.273021221160889, "learning_rate": 6.0003679267899904e-05, "loss": 1.8679014205932618, "memory(GiB)": 49.62, "step": 2155, "token_acc": 0.5667870036101083, "train_speed(iter/s)": 0.342791 }, { "epoch": 1.3106796116504853, "grad_norm": 6.096770286560059, "learning_rate": 5.9847982174335316e-05, "loss": 1.9540796279907227, "memory(GiB)": 49.62, "step": 2160, "token_acc": 0.5377358490566038, "train_speed(iter/s)": 0.342674 }, { "epoch": 1.3137135922330097, "grad_norm": 4.696122646331787, "learning_rate": 5.969218567085206e-05, "loss": 2.2308191299438476, "memory(GiB)": 49.62, "step": 2165, "token_acc": 0.5049180327868853, "train_speed(iter/s)": 0.342608 }, { "epoch": 1.316747572815534, "grad_norm": 4.60797643661499, "learning_rate": 5.953629133012949e-05, "loss": 2.2384479522705076, "memory(GiB)": 49.62, "step": 2170, "token_acc": 0.49343832020997375, "train_speed(iter/s)": 0.342686 }, { "epoch": 1.3197815533980584, "grad_norm": 6.001910209655762, "learning_rate": 5.938030072583447e-05, "loss": 2.312704658508301, "memory(GiB)": 49.62, "step": 2175, "token_acc": 0.5277777777777778, "train_speed(iter/s)": 0.342797 }, { "epoch": 1.3228155339805825, "grad_norm": 5.268497467041016, "learning_rate": 5.922421543260567e-05, "loss": 1.8038692474365234, "memory(GiB)": 49.62, "step": 2180, "token_acc": 0.5466666666666666, "train_speed(iter/s)": 0.34279 }, { "epoch": 1.3258495145631068, "grad_norm": 5.056935787200928, "learning_rate": 5.906803702603755e-05, "loss": 2.1694931030273437, "memory(GiB)": 49.62, "step": 2185, "token_acc": 0.53125, "train_speed(iter/s)": 0.342771 }, { "epoch": 1.328883495145631, "grad_norm": 4.458377838134766, "learning_rate": 5.891176708266454e-05, "loss": 2.272746276855469, "memory(GiB)": 49.62, "step": 2190, "token_acc": 0.5278688524590164, "train_speed(iter/s)": 0.342828 }, { "epoch": 1.3319174757281553, "grad_norm": 3.5730910301208496, "learning_rate": 5.875540717994503e-05, "loss": 1.6815277099609376, "memory(GiB)": 49.62, "step": 2195, "token_acc": 0.6044303797468354, "train_speed(iter/s)": 0.343008 }, { "epoch": 1.3349514563106797, "grad_norm": 4.826231956481934, "learning_rate": 5.859895889624554e-05, "loss": 2.1504987716674804, "memory(GiB)": 49.62, "step": 2200, "token_acc": 0.5174603174603175, "train_speed(iter/s)": 0.343252 }, { "epoch": 1.3379854368932038, "grad_norm": 4.638493537902832, "learning_rate": 5.84424238108247e-05, "loss": 1.9677400588989258, "memory(GiB)": 49.62, "step": 2205, "token_acc": 0.5351681957186545, "train_speed(iter/s)": 0.343253 }, { "epoch": 1.3410194174757282, "grad_norm": 4.795359134674072, "learning_rate": 5.8285803503817425e-05, "loss": 1.9439615249633788, "memory(GiB)": 49.62, "step": 2210, "token_acc": 0.5362776025236593, "train_speed(iter/s)": 0.343391 }, { "epoch": 1.3440533980582523, "grad_norm": 5.863192081451416, "learning_rate": 5.812909955621886e-05, "loss": 2.1396678924560546, "memory(GiB)": 49.62, "step": 2215, "token_acc": 0.5357142857142857, "train_speed(iter/s)": 0.343409 }, { "epoch": 1.3470873786407767, "grad_norm": 4.897850513458252, "learning_rate": 5.7972313549868415e-05, "loss": 2.1459718704223634, "memory(GiB)": 49.62, "step": 2220, "token_acc": 0.532967032967033, "train_speed(iter/s)": 0.343359 }, { "epoch": 1.350121359223301, "grad_norm": 3.850672483444214, "learning_rate": 5.7815447067433917e-05, "loss": 2.033509635925293, "memory(GiB)": 49.62, "step": 2225, "token_acc": 0.5120481927710844, "train_speed(iter/s)": 0.343335 }, { "epoch": 1.3531553398058254, "grad_norm": 6.333509922027588, "learning_rate": 5.7658501692395475e-05, "loss": 2.2355020523071287, "memory(GiB)": 49.62, "step": 2230, "token_acc": 0.5165562913907285, "train_speed(iter/s)": 0.343323 }, { "epoch": 1.3561893203883495, "grad_norm": 6.353528022766113, "learning_rate": 5.7501479009029636e-05, "loss": 2.2727027893066407, "memory(GiB)": 49.62, "step": 2235, "token_acc": 0.5209580838323353, "train_speed(iter/s)": 0.343409 }, { "epoch": 1.3592233009708738, "grad_norm": 7.123393535614014, "learning_rate": 5.734438060239331e-05, "loss": 2.0144338607788086, "memory(GiB)": 49.62, "step": 2240, "token_acc": 0.5541666666666667, "train_speed(iter/s)": 0.343284 }, { "epoch": 1.362257281553398, "grad_norm": 4.7920331954956055, "learning_rate": 5.718720805830777e-05, "loss": 2.2678756713867188, "memory(GiB)": 49.62, "step": 2245, "token_acc": 0.531986531986532, "train_speed(iter/s)": 0.343355 }, { "epoch": 1.3652912621359223, "grad_norm": 4.0656609535217285, "learning_rate": 5.70299629633427e-05, "loss": 2.105788803100586, "memory(GiB)": 49.62, "step": 2250, "token_acc": 0.5501519756838906, "train_speed(iter/s)": 0.343214 }, { "epoch": 1.3683252427184467, "grad_norm": 3.9841389656066895, "learning_rate": 5.687264690480014e-05, "loss": 2.2956415176391602, "memory(GiB)": 49.62, "step": 2255, "token_acc": 0.5013192612137203, "train_speed(iter/s)": 0.343266 }, { "epoch": 1.3713592233009708, "grad_norm": 5.072377681732178, "learning_rate": 5.6715261470698434e-05, "loss": 2.1527652740478516, "memory(GiB)": 49.62, "step": 2260, "token_acc": 0.49157303370786515, "train_speed(iter/s)": 0.343375 }, { "epoch": 1.3743932038834952, "grad_norm": 4.474189281463623, "learning_rate": 5.655780824975628e-05, "loss": 1.7831310272216796, "memory(GiB)": 49.62, "step": 2265, "token_acc": 0.5351681957186545, "train_speed(iter/s)": 0.34328 }, { "epoch": 1.3774271844660193, "grad_norm": 6.300962448120117, "learning_rate": 5.6400288831376604e-05, "loss": 2.126059341430664, "memory(GiB)": 49.62, "step": 2270, "token_acc": 0.5446153846153846, "train_speed(iter/s)": 0.343293 }, { "epoch": 1.3804611650485437, "grad_norm": 3.6221325397491455, "learning_rate": 5.624270480563059e-05, "loss": 2.1459629058837892, "memory(GiB)": 49.62, "step": 2275, "token_acc": 0.501432664756447, "train_speed(iter/s)": 0.343264 }, { "epoch": 1.383495145631068, "grad_norm": 4.326887607574463, "learning_rate": 5.608505776324158e-05, "loss": 2.266560935974121, "memory(GiB)": 49.62, "step": 2280, "token_acc": 0.5186440677966102, "train_speed(iter/s)": 0.343383 }, { "epoch": 1.3865291262135924, "grad_norm": 4.584870338439941, "learning_rate": 5.592734929556907e-05, "loss": 2.1797780990600586, "memory(GiB)": 49.62, "step": 2285, "token_acc": 0.5424242424242425, "train_speed(iter/s)": 0.343486 }, { "epoch": 1.3895631067961165, "grad_norm": 4.851840972900391, "learning_rate": 5.576958099459254e-05, "loss": 2.0150262832641603, "memory(GiB)": 49.62, "step": 2290, "token_acc": 0.5570032573289903, "train_speed(iter/s)": 0.343288 }, { "epoch": 1.3925970873786409, "grad_norm": 6.114690780639648, "learning_rate": 5.5611754452895516e-05, "loss": 2.0670263290405275, "memory(GiB)": 49.62, "step": 2295, "token_acc": 0.5416666666666666, "train_speed(iter/s)": 0.343234 }, { "epoch": 1.395631067961165, "grad_norm": 4.03108549118042, "learning_rate": 5.5453871263649395e-05, "loss": 1.9458183288574218, "memory(GiB)": 49.62, "step": 2300, "token_acc": 0.5369318181818182, "train_speed(iter/s)": 0.343119 }, { "epoch": 1.3986650485436893, "grad_norm": 5.796468734741211, "learning_rate": 5.5295933020597426e-05, "loss": 2.245075225830078, "memory(GiB)": 49.62, "step": 2305, "token_acc": 0.5017421602787456, "train_speed(iter/s)": 0.342798 }, { "epoch": 1.4016990291262137, "grad_norm": 4.434948921203613, "learning_rate": 5.5137941318038596e-05, "loss": 2.1706329345703126, "memory(GiB)": 49.62, "step": 2310, "token_acc": 0.5098039215686274, "train_speed(iter/s)": 0.342906 }, { "epoch": 1.4047330097087378, "grad_norm": 4.494696617126465, "learning_rate": 5.4979897750811506e-05, "loss": 2.2465625762939454, "memory(GiB)": 49.62, "step": 2315, "token_acc": 0.4882154882154882, "train_speed(iter/s)": 0.343042 }, { "epoch": 1.4077669902912622, "grad_norm": 5.618915557861328, "learning_rate": 5.4821803914278336e-05, "loss": 2.154610824584961, "memory(GiB)": 49.62, "step": 2320, "token_acc": 0.5269461077844312, "train_speed(iter/s)": 0.343209 }, { "epoch": 1.4108009708737863, "grad_norm": 4.559238910675049, "learning_rate": 5.4663661404308677e-05, "loss": 2.1724952697753905, "memory(GiB)": 49.62, "step": 2325, "token_acc": 0.5575539568345323, "train_speed(iter/s)": 0.343157 }, { "epoch": 1.4138349514563107, "grad_norm": 4.715585708618164, "learning_rate": 5.4505471817263475e-05, "loss": 2.0474773406982423, "memory(GiB)": 49.62, "step": 2330, "token_acc": 0.5361930294906166, "train_speed(iter/s)": 0.343076 }, { "epoch": 1.416868932038835, "grad_norm": 5.772848129272461, "learning_rate": 5.434723674997888e-05, "loss": 1.8260784149169922, "memory(GiB)": 49.62, "step": 2335, "token_acc": 0.5566343042071198, "train_speed(iter/s)": 0.343095 }, { "epoch": 1.4199029126213591, "grad_norm": 5.415388584136963, "learning_rate": 5.418895779975014e-05, "loss": 2.160201835632324, "memory(GiB)": 49.62, "step": 2340, "token_acc": 0.5379061371841155, "train_speed(iter/s)": 0.343136 }, { "epoch": 1.4229368932038835, "grad_norm": 6.025780200958252, "learning_rate": 5.403063656431548e-05, "loss": 2.3115482330322266, "memory(GiB)": 49.62, "step": 2345, "token_acc": 0.4808743169398907, "train_speed(iter/s)": 0.34321 }, { "epoch": 1.4259708737864076, "grad_norm": 5.439345836639404, "learning_rate": 5.387227464183999e-05, "loss": 2.1189598083496093, "memory(GiB)": 49.62, "step": 2350, "token_acc": 0.5304659498207885, "train_speed(iter/s)": 0.343213 }, { "epoch": 1.429004854368932, "grad_norm": 7.2754106521606445, "learning_rate": 5.371387363089945e-05, "loss": 2.2224246978759767, "memory(GiB)": 49.62, "step": 2355, "token_acc": 0.5123287671232877, "train_speed(iter/s)": 0.343262 }, { "epoch": 1.4320388349514563, "grad_norm": 5.184502601623535, "learning_rate": 5.355543513046419e-05, "loss": 2.1893695831298827, "memory(GiB)": 49.62, "step": 2360, "token_acc": 0.5508196721311476, "train_speed(iter/s)": 0.34323 }, { "epoch": 1.4350728155339807, "grad_norm": 5.3074140548706055, "learning_rate": 5.3396960739883037e-05, "loss": 2.0637306213378905, "memory(GiB)": 49.62, "step": 2365, "token_acc": 0.5506756756756757, "train_speed(iter/s)": 0.34317 }, { "epoch": 1.4381067961165048, "grad_norm": 4.266348838806152, "learning_rate": 5.323845205886707e-05, "loss": 1.9541097640991212, "memory(GiB)": 49.62, "step": 2370, "token_acc": 0.5509554140127388, "train_speed(iter/s)": 0.343311 }, { "epoch": 1.4411407766990292, "grad_norm": 6.474839210510254, "learning_rate": 5.307991068747353e-05, "loss": 1.9641443252563477, "memory(GiB)": 49.62, "step": 2375, "token_acc": 0.5815602836879432, "train_speed(iter/s)": 0.343447 }, { "epoch": 1.4441747572815533, "grad_norm": 4.766982555389404, "learning_rate": 5.292133822608961e-05, "loss": 2.058863067626953, "memory(GiB)": 49.62, "step": 2380, "token_acc": 0.5314465408805031, "train_speed(iter/s)": 0.343556 }, { "epoch": 1.4472087378640777, "grad_norm": 4.205195426940918, "learning_rate": 5.2762736275416416e-05, "loss": 2.2327764511108397, "memory(GiB)": 49.62, "step": 2385, "token_acc": 0.5137614678899083, "train_speed(iter/s)": 0.343537 }, { "epoch": 1.450242718446602, "grad_norm": 5.749515533447266, "learning_rate": 5.260410643645263e-05, "loss": 1.9898389816284179, "memory(GiB)": 49.62, "step": 2390, "token_acc": 0.6174242424242424, "train_speed(iter/s)": 0.343588 }, { "epoch": 1.4532766990291262, "grad_norm": 6.119357585906982, "learning_rate": 5.2445450310478525e-05, "loss": 2.109103965759277, "memory(GiB)": 49.62, "step": 2395, "token_acc": 0.5382165605095541, "train_speed(iter/s)": 0.343685 }, { "epoch": 1.4563106796116505, "grad_norm": 5.528497219085693, "learning_rate": 5.228676949903973e-05, "loss": 2.0721433639526365, "memory(GiB)": 49.62, "step": 2400, "token_acc": 0.531986531986532, "train_speed(iter/s)": 0.343774 }, { "epoch": 1.4593446601941746, "grad_norm": 4.549882411956787, "learning_rate": 5.2128065603931006e-05, "loss": 1.9939346313476562, "memory(GiB)": 49.62, "step": 2405, "token_acc": 0.552901023890785, "train_speed(iter/s)": 0.343803 }, { "epoch": 1.462378640776699, "grad_norm": 4.605144023895264, "learning_rate": 5.196934022718017e-05, "loss": 2.117628288269043, "memory(GiB)": 49.62, "step": 2410, "token_acc": 0.5017543859649123, "train_speed(iter/s)": 0.343843 }, { "epoch": 1.4654126213592233, "grad_norm": 6.057964324951172, "learning_rate": 5.18105949710319e-05, "loss": 2.3108386993408203, "memory(GiB)": 49.62, "step": 2415, "token_acc": 0.49393939393939396, "train_speed(iter/s)": 0.343969 }, { "epoch": 1.4684466019417477, "grad_norm": 5.627535343170166, "learning_rate": 5.165183143793149e-05, "loss": 2.2582305908203124, "memory(GiB)": 49.62, "step": 2420, "token_acc": 0.5106382978723404, "train_speed(iter/s)": 0.343938 }, { "epoch": 1.4714805825242718, "grad_norm": 5.74299430847168, "learning_rate": 5.149305123050877e-05, "loss": 2.1035749435424806, "memory(GiB)": 49.62, "step": 2425, "token_acc": 0.5358255451713395, "train_speed(iter/s)": 0.343999 }, { "epoch": 1.4745145631067962, "grad_norm": 4.360835075378418, "learning_rate": 5.133425595156187e-05, "loss": 2.192898750305176, "memory(GiB)": 49.83, "step": 2430, "token_acc": 0.5072886297376094, "train_speed(iter/s)": 0.344146 }, { "epoch": 1.4775485436893203, "grad_norm": 7.765800952911377, "learning_rate": 5.1175447204041096e-05, "loss": 2.050806999206543, "memory(GiB)": 49.83, "step": 2435, "token_acc": 0.56, "train_speed(iter/s)": 0.344358 }, { "epoch": 1.4805825242718447, "grad_norm": 4.39985990524292, "learning_rate": 5.101662659103265e-05, "loss": 1.9693151473999024, "memory(GiB)": 49.83, "step": 2440, "token_acc": 0.5413793103448276, "train_speed(iter/s)": 0.344396 }, { "epoch": 1.483616504854369, "grad_norm": 7.249627113342285, "learning_rate": 5.0857795715742575e-05, "loss": 2.1018869400024416, "memory(GiB)": 49.83, "step": 2445, "token_acc": 0.49736842105263157, "train_speed(iter/s)": 0.344432 }, { "epoch": 1.4866504854368932, "grad_norm": 5.259532451629639, "learning_rate": 5.0698956181480465e-05, "loss": 2.101152801513672, "memory(GiB)": 49.83, "step": 2450, "token_acc": 0.5503597122302158, "train_speed(iter/s)": 0.344466 }, { "epoch": 1.4896844660194175, "grad_norm": 4.838339328765869, "learning_rate": 5.054010959164329e-05, "loss": 2.0922826766967773, "memory(GiB)": 49.83, "step": 2455, "token_acc": 0.5552147239263804, "train_speed(iter/s)": 0.344417 }, { "epoch": 1.4927184466019416, "grad_norm": 5.462285995483398, "learning_rate": 5.038125754969933e-05, "loss": 2.317370796203613, "memory(GiB)": 49.83, "step": 2460, "token_acc": 0.5, "train_speed(iter/s)": 0.344638 }, { "epoch": 1.495752427184466, "grad_norm": 4.665005683898926, "learning_rate": 5.0222401659171846e-05, "loss": 2.17450065612793, "memory(GiB)": 49.83, "step": 2465, "token_acc": 0.5308219178082192, "train_speed(iter/s)": 0.344794 }, { "epoch": 1.4987864077669903, "grad_norm": 4.870514392852783, "learning_rate": 5.006354352362296e-05, "loss": 2.430714797973633, "memory(GiB)": 49.83, "step": 2470, "token_acc": 0.46853146853146854, "train_speed(iter/s)": 0.344663 }, { "epoch": 1.5018203883495147, "grad_norm": 4.969100475311279, "learning_rate": 4.9904684746637445e-05, "loss": 2.029414749145508, "memory(GiB)": 49.83, "step": 2475, "token_acc": 0.5678233438485805, "train_speed(iter/s)": 0.344674 }, { "epoch": 1.5048543689320388, "grad_norm": 3.692633867263794, "learning_rate": 4.9745826931806524e-05, "loss": 2.291071319580078, "memory(GiB)": 49.83, "step": 2480, "token_acc": 0.48450704225352115, "train_speed(iter/s)": 0.344764 }, { "epoch": 1.507888349514563, "grad_norm": 3.932565927505493, "learning_rate": 4.958697168271179e-05, "loss": 1.8374879837036133, "memory(GiB)": 49.83, "step": 2485, "token_acc": 0.6081504702194357, "train_speed(iter/s)": 0.344746 }, { "epoch": 1.5109223300970873, "grad_norm": 4.920394420623779, "learning_rate": 4.942812060290886e-05, "loss": 1.9674636840820312, "memory(GiB)": 49.83, "step": 2490, "token_acc": 0.5086505190311419, "train_speed(iter/s)": 0.344745 }, { "epoch": 1.5139563106796117, "grad_norm": 6.033856391906738, "learning_rate": 4.92692752959113e-05, "loss": 2.0914737701416017, "memory(GiB)": 49.83, "step": 2495, "token_acc": 0.4931972789115646, "train_speed(iter/s)": 0.344843 }, { "epoch": 1.516990291262136, "grad_norm": 5.79646635055542, "learning_rate": 4.91104373651744e-05, "loss": 2.0259641647338866, "memory(GiB)": 49.83, "step": 2500, "token_acc": 0.5184049079754601, "train_speed(iter/s)": 0.344917 }, { "epoch": 1.516990291262136, "eval_loss": 1.955409288406372, "eval_runtime": 11.7765, "eval_samples_per_second": 8.492, "eval_steps_per_second": 8.492, "eval_token_acc": 0.5192307692307693, "step": 2500 }, { "epoch": 1.5200242718446602, "grad_norm": 5.9858927726745605, "learning_rate": 4.8951608414078944e-05, "loss": 2.319754791259766, "memory(GiB)": 49.83, "step": 2505, "token_acc": 0.5185891325071497, "train_speed(iter/s)": 0.344328 }, { "epoch": 1.5230582524271845, "grad_norm": 6.414853096008301, "learning_rate": 4.8792790045915167e-05, "loss": 2.330271911621094, "memory(GiB)": 49.83, "step": 2510, "token_acc": 0.5448504983388704, "train_speed(iter/s)": 0.344264 }, { "epoch": 1.5260922330097086, "grad_norm": 3.873676061630249, "learning_rate": 4.863398386386638e-05, "loss": 1.745873260498047, "memory(GiB)": 49.83, "step": 2515, "token_acc": 0.5886075949367089, "train_speed(iter/s)": 0.344134 }, { "epoch": 1.529126213592233, "grad_norm": 6.27769660949707, "learning_rate": 4.847519147099294e-05, "loss": 2.2258495330810546, "memory(GiB)": 49.83, "step": 2520, "token_acc": 0.48942598187311176, "train_speed(iter/s)": 0.344069 }, { "epoch": 1.5321601941747574, "grad_norm": 6.054216384887695, "learning_rate": 4.831641447021599e-05, "loss": 2.1401975631713865, "memory(GiB)": 49.83, "step": 2525, "token_acc": 0.5165745856353591, "train_speed(iter/s)": 0.343914 }, { "epoch": 1.5351941747572817, "grad_norm": 4.984788417816162, "learning_rate": 4.8157654464301275e-05, "loss": 2.14502010345459, "memory(GiB)": 49.83, "step": 2530, "token_acc": 0.5302491103202847, "train_speed(iter/s)": 0.343866 }, { "epoch": 1.5382281553398058, "grad_norm": 4.374177932739258, "learning_rate": 4.7998913055843054e-05, "loss": 2.0998167037963866, "memory(GiB)": 49.83, "step": 2535, "token_acc": 0.5319767441860465, "train_speed(iter/s)": 0.343699 }, { "epoch": 1.54126213592233, "grad_norm": 3.3926897048950195, "learning_rate": 4.7840191847247774e-05, "loss": 2.164424514770508, "memory(GiB)": 49.83, "step": 2540, "token_acc": 0.542319749216301, "train_speed(iter/s)": 0.343775 }, { "epoch": 1.5442961165048543, "grad_norm": 4.463199615478516, "learning_rate": 4.7681492440718045e-05, "loss": 2.0967248916625976, "memory(GiB)": 49.83, "step": 2545, "token_acc": 0.5063694267515924, "train_speed(iter/s)": 0.343891 }, { "epoch": 1.5473300970873787, "grad_norm": 8.36011028289795, "learning_rate": 4.752281643823633e-05, "loss": 2.083425521850586, "memory(GiB)": 49.83, "step": 2550, "token_acc": 0.5467625899280576, "train_speed(iter/s)": 0.343878 }, { "epoch": 1.550364077669903, "grad_norm": 4.297141075134277, "learning_rate": 4.736416544154891e-05, "loss": 1.9757675170898437, "memory(GiB)": 49.83, "step": 2555, "token_acc": 0.5866666666666667, "train_speed(iter/s)": 0.343812 }, { "epoch": 1.5533980582524272, "grad_norm": 4.152921199798584, "learning_rate": 4.720554105214961e-05, "loss": 2.062337875366211, "memory(GiB)": 49.83, "step": 2560, "token_acc": 0.5418060200668896, "train_speed(iter/s)": 0.343915 }, { "epoch": 1.5564320388349513, "grad_norm": 12.788975715637207, "learning_rate": 4.704694487126365e-05, "loss": 1.8080659866333009, "memory(GiB)": 49.83, "step": 2565, "token_acc": 0.5925925925925926, "train_speed(iter/s)": 0.343915 }, { "epoch": 1.5594660194174756, "grad_norm": 4.3370184898376465, "learning_rate": 4.688837849983154e-05, "loss": 2.2551008224487306, "memory(GiB)": 49.83, "step": 2570, "token_acc": 0.5207253886010362, "train_speed(iter/s)": 0.343936 }, { "epoch": 1.5625, "grad_norm": 5.775643825531006, "learning_rate": 4.6729843538492847e-05, "loss": 2.208485412597656, "memory(GiB)": 49.83, "step": 2575, "token_acc": 0.5207253886010362, "train_speed(iter/s)": 0.343936 }, { "epoch": 1.5655339805825244, "grad_norm": 5.196451663970947, "learning_rate": 4.657134158757012e-05, "loss": 1.8371131896972657, "memory(GiB)": 49.83, "step": 2580, "token_acc": 0.5615942028985508, "train_speed(iter/s)": 0.343904 }, { "epoch": 1.5685679611650487, "grad_norm": 4.364643096923828, "learning_rate": 4.6412874247052615e-05, "loss": 2.0728885650634767, "memory(GiB)": 49.83, "step": 2585, "token_acc": 0.562962962962963, "train_speed(iter/s)": 0.344007 }, { "epoch": 1.5716019417475728, "grad_norm": 5.949049949645996, "learning_rate": 4.625444311658028e-05, "loss": 2.0201629638671874, "memory(GiB)": 49.83, "step": 2590, "token_acc": 0.5399449035812672, "train_speed(iter/s)": 0.344019 }, { "epoch": 1.574635922330097, "grad_norm": 4.6345038414001465, "learning_rate": 4.6096049795427514e-05, "loss": 2.3515548706054688, "memory(GiB)": 49.83, "step": 2595, "token_acc": 0.5, "train_speed(iter/s)": 0.344026 }, { "epoch": 1.5776699029126213, "grad_norm": 3.8500425815582275, "learning_rate": 4.593769588248702e-05, "loss": 2.171489143371582, "memory(GiB)": 49.83, "step": 2600, "token_acc": 0.5053475935828877, "train_speed(iter/s)": 0.343969 }, { "epoch": 1.5807038834951457, "grad_norm": 3.6574926376342773, "learning_rate": 4.577938297625378e-05, "loss": 2.0082653045654295, "memory(GiB)": 49.83, "step": 2605, "token_acc": 0.5978260869565217, "train_speed(iter/s)": 0.344039 }, { "epoch": 1.58373786407767, "grad_norm": 5.567201137542725, "learning_rate": 4.5621112674808756e-05, "loss": 1.6284290313720704, "memory(GiB)": 49.83, "step": 2610, "token_acc": 0.6095238095238096, "train_speed(iter/s)": 0.344039 }, { "epoch": 1.5867718446601942, "grad_norm": 6.214598178863525, "learning_rate": 4.5462886575802884e-05, "loss": 1.7911537170410157, "memory(GiB)": 49.83, "step": 2615, "token_acc": 0.5609756097560976, "train_speed(iter/s)": 0.343996 }, { "epoch": 1.5898058252427183, "grad_norm": 3.896662950515747, "learning_rate": 4.530470627644088e-05, "loss": 2.238356018066406, "memory(GiB)": 49.83, "step": 2620, "token_acc": 0.5161290322580645, "train_speed(iter/s)": 0.344069 }, { "epoch": 1.5928398058252426, "grad_norm": 4.645061492919922, "learning_rate": 4.514657337346512e-05, "loss": 1.9629156112670898, "memory(GiB)": 49.83, "step": 2625, "token_acc": 0.5701219512195121, "train_speed(iter/s)": 0.344201 }, { "epoch": 1.595873786407767, "grad_norm": 6.88320779800415, "learning_rate": 4.4988489463139605e-05, "loss": 2.066257286071777, "memory(GiB)": 50.28, "step": 2630, "token_acc": 0.5264900662251656, "train_speed(iter/s)": 0.344229 }, { "epoch": 1.5989077669902914, "grad_norm": 3.9658396244049072, "learning_rate": 4.483045614123371e-05, "loss": 2.031512451171875, "memory(GiB)": 50.28, "step": 2635, "token_acc": 0.5466666666666666, "train_speed(iter/s)": 0.344167 }, { "epoch": 1.6019417475728155, "grad_norm": 5.622066974639893, "learning_rate": 4.46724750030062e-05, "loss": 1.8012283325195313, "memory(GiB)": 50.28, "step": 2640, "token_acc": 0.5728155339805825, "train_speed(iter/s)": 0.344152 }, { "epoch": 1.6049757281553398, "grad_norm": 4.002096176147461, "learning_rate": 4.451454764318903e-05, "loss": 2.0324357986450194, "memory(GiB)": 50.28, "step": 2645, "token_acc": 0.5264900662251656, "train_speed(iter/s)": 0.344168 }, { "epoch": 1.608009708737864, "grad_norm": 4.180598735809326, "learning_rate": 4.4356675655971344e-05, "loss": 2.1916387557983397, "memory(GiB)": 50.28, "step": 2650, "token_acc": 0.5203488372093024, "train_speed(iter/s)": 0.344231 }, { "epoch": 1.6110436893203883, "grad_norm": 6.329728126525879, "learning_rate": 4.419886063498329e-05, "loss": 1.9965480804443358, "memory(GiB)": 50.28, "step": 2655, "token_acc": 0.549407114624506, "train_speed(iter/s)": 0.344253 }, { "epoch": 1.6140776699029127, "grad_norm": 5.174497604370117, "learning_rate": 4.404110417327998e-05, "loss": 1.9759635925292969, "memory(GiB)": 50.28, "step": 2660, "token_acc": 0.5544217687074829, "train_speed(iter/s)": 0.344345 }, { "epoch": 1.617111650485437, "grad_norm": 6.921404838562012, "learning_rate": 4.388340786332541e-05, "loss": 2.2114421844482424, "memory(GiB)": 50.28, "step": 2665, "token_acc": 0.5247813411078717, "train_speed(iter/s)": 0.344499 }, { "epoch": 1.6201456310679612, "grad_norm": 5.644909381866455, "learning_rate": 4.372577329697636e-05, "loss": 2.2224405288696287, "memory(GiB)": 50.28, "step": 2670, "token_acc": 0.5075987841945289, "train_speed(iter/s)": 0.344387 }, { "epoch": 1.6231796116504853, "grad_norm": 5.899232387542725, "learning_rate": 4.35682020654664e-05, "loss": 2.0828458786010744, "memory(GiB)": 50.28, "step": 2675, "token_acc": 0.555205047318612, "train_speed(iter/s)": 0.344376 }, { "epoch": 1.6262135922330097, "grad_norm": 6.157591342926025, "learning_rate": 4.341069575938968e-05, "loss": 2.098480224609375, "memory(GiB)": 50.28, "step": 2680, "token_acc": 0.5451127819548872, "train_speed(iter/s)": 0.34441 }, { "epoch": 1.629247572815534, "grad_norm": 5.411756992340088, "learning_rate": 4.3253255968685044e-05, "loss": 1.9910245895385743, "memory(GiB)": 50.28, "step": 2685, "token_acc": 0.5525525525525525, "train_speed(iter/s)": 0.344595 }, { "epoch": 1.6322815533980584, "grad_norm": 6.132040023803711, "learning_rate": 4.3095884282619866e-05, "loss": 2.1848316192626953, "memory(GiB)": 50.28, "step": 2690, "token_acc": 0.5486111111111112, "train_speed(iter/s)": 0.34469 }, { "epoch": 1.6353155339805825, "grad_norm": 5.636852741241455, "learning_rate": 4.2938582289774e-05, "loss": 2.0264423370361326, "memory(GiB)": 50.28, "step": 2695, "token_acc": 0.5677083333333334, "train_speed(iter/s)": 0.34473 }, { "epoch": 1.6383495145631068, "grad_norm": 4.577986240386963, "learning_rate": 4.278135157802389e-05, "loss": 1.9610004425048828, "memory(GiB)": 50.28, "step": 2700, "token_acc": 0.5178571428571429, "train_speed(iter/s)": 0.34478 }, { "epoch": 1.641383495145631, "grad_norm": 5.811229228973389, "learning_rate": 4.262419373452634e-05, "loss": 2.1040117263793947, "memory(GiB)": 50.28, "step": 2705, "token_acc": 0.514792899408284, "train_speed(iter/s)": 0.34463 }, { "epoch": 1.6444174757281553, "grad_norm": 5.271268367767334, "learning_rate": 4.246711034570264e-05, "loss": 2.0251487731933593, "memory(GiB)": 50.28, "step": 2710, "token_acc": 0.5434782608695652, "train_speed(iter/s)": 0.344624 }, { "epoch": 1.6474514563106797, "grad_norm": 5.542680740356445, "learning_rate": 4.231010299722248e-05, "loss": 2.254073715209961, "memory(GiB)": 50.28, "step": 2715, "token_acc": 0.5092592592592593, "train_speed(iter/s)": 0.344655 }, { "epoch": 1.650485436893204, "grad_norm": 4.919449329376221, "learning_rate": 4.2153173273987946e-05, "loss": 2.275184440612793, "memory(GiB)": 50.28, "step": 2720, "token_acc": 0.5197568389057751, "train_speed(iter/s)": 0.344718 }, { "epoch": 1.6535194174757282, "grad_norm": 4.951817512512207, "learning_rate": 4.199632276011761e-05, "loss": 2.1903215408325196, "memory(GiB)": 50.28, "step": 2725, "token_acc": 0.5369774919614148, "train_speed(iter/s)": 0.344783 }, { "epoch": 1.6565533980582523, "grad_norm": 4.197880268096924, "learning_rate": 4.1839553038930396e-05, "loss": 2.153451919555664, "memory(GiB)": 50.28, "step": 2730, "token_acc": 0.5184210526315789, "train_speed(iter/s)": 0.344774 }, { "epoch": 1.6595873786407767, "grad_norm": 5.517632007598877, "learning_rate": 4.168286569292972e-05, "loss": 2.303066444396973, "memory(GiB)": 50.28, "step": 2735, "token_acc": 0.4902597402597403, "train_speed(iter/s)": 0.344839 }, { "epoch": 1.662621359223301, "grad_norm": 7.003429412841797, "learning_rate": 4.152626230378741e-05, "loss": 2.215951156616211, "memory(GiB)": 50.28, "step": 2740, "token_acc": 0.5368663594470046, "train_speed(iter/s)": 0.344935 }, { "epoch": 1.6656553398058254, "grad_norm": 5.832604885101318, "learning_rate": 4.136974445232788e-05, "loss": 2.100927543640137, "memory(GiB)": 50.28, "step": 2745, "token_acc": 0.5171428571428571, "train_speed(iter/s)": 0.344901 }, { "epoch": 1.6686893203883495, "grad_norm": 4.277848243713379, "learning_rate": 4.121331371851201e-05, "loss": 2.0746074676513673, "memory(GiB)": 50.28, "step": 2750, "token_acc": 0.5445859872611465, "train_speed(iter/s)": 0.344995 }, { "epoch": 1.6717233009708736, "grad_norm": 4.857871055603027, "learning_rate": 4.10569716814213e-05, "loss": 2.2131591796875, "memory(GiB)": 50.28, "step": 2755, "token_acc": 0.5389830508474577, "train_speed(iter/s)": 0.345011 }, { "epoch": 1.674757281553398, "grad_norm": 4.403052806854248, "learning_rate": 4.0900719919241935e-05, "loss": 2.2471851348876952, "memory(GiB)": 50.28, "step": 2760, "token_acc": 0.5073746312684366, "train_speed(iter/s)": 0.344897 }, { "epoch": 1.6777912621359223, "grad_norm": 5.228289604187012, "learning_rate": 4.0744560009248766e-05, "loss": 2.1872901916503906, "memory(GiB)": 50.28, "step": 2765, "token_acc": 0.5070422535211268, "train_speed(iter/s)": 0.344938 }, { "epoch": 1.6808252427184467, "grad_norm": 4.925614356994629, "learning_rate": 4.0588493527789537e-05, "loss": 2.337154197692871, "memory(GiB)": 50.28, "step": 2770, "token_acc": 0.45103857566765576, "train_speed(iter/s)": 0.344939 }, { "epoch": 1.6838592233009708, "grad_norm": 4.150095462799072, "learning_rate": 4.043252205026879e-05, "loss": 1.9851512908935547, "memory(GiB)": 50.28, "step": 2775, "token_acc": 0.528125, "train_speed(iter/s)": 0.34482 }, { "epoch": 1.6868932038834952, "grad_norm": 4.304738998413086, "learning_rate": 4.027664715113209e-05, "loss": 2.0736413955688477, "memory(GiB)": 50.28, "step": 2780, "token_acc": 0.5306748466257669, "train_speed(iter/s)": 0.344858 }, { "epoch": 1.6899271844660193, "grad_norm": 7.609385013580322, "learning_rate": 4.012087040385012e-05, "loss": 2.259031295776367, "memory(GiB)": 50.28, "step": 2785, "token_acc": 0.5091463414634146, "train_speed(iter/s)": 0.344882 }, { "epoch": 1.6929611650485437, "grad_norm": 4.6254401206970215, "learning_rate": 3.996519338090273e-05, "loss": 2.064649963378906, "memory(GiB)": 50.28, "step": 2790, "token_acc": 0.5041666666666667, "train_speed(iter/s)": 0.344654 }, { "epoch": 1.695995145631068, "grad_norm": 5.290862083435059, "learning_rate": 3.980961765376316e-05, "loss": 2.277753448486328, "memory(GiB)": 50.28, "step": 2795, "token_acc": 0.4657039711191336, "train_speed(iter/s)": 0.344587 }, { "epoch": 1.6990291262135924, "grad_norm": 7.1083831787109375, "learning_rate": 3.965414479288209e-05, "loss": 2.1571630477905273, "memory(GiB)": 50.28, "step": 2800, "token_acc": 0.5381944444444444, "train_speed(iter/s)": 0.344566 }, { "epoch": 1.7020631067961165, "grad_norm": 6.405331134796143, "learning_rate": 3.9498776367671825e-05, "loss": 2.208438491821289, "memory(GiB)": 50.28, "step": 2805, "token_acc": 0.5236486486486487, "train_speed(iter/s)": 0.344533 }, { "epoch": 1.7050970873786406, "grad_norm": 5.49265193939209, "learning_rate": 3.9343513946490454e-05, "loss": 2.0562555313110353, "memory(GiB)": 50.28, "step": 2810, "token_acc": 0.4982698961937716, "train_speed(iter/s)": 0.344555 }, { "epoch": 1.708131067961165, "grad_norm": 3.735551118850708, "learning_rate": 3.9188359096626e-05, "loss": 2.136612892150879, "memory(GiB)": 50.28, "step": 2815, "token_acc": 0.5819935691318328, "train_speed(iter/s)": 0.344598 }, { "epoch": 1.7111650485436893, "grad_norm": 4.8528008460998535, "learning_rate": 3.903331338428067e-05, "loss": 2.201272201538086, "memory(GiB)": 50.28, "step": 2820, "token_acc": 0.5258620689655172, "train_speed(iter/s)": 0.344561 }, { "epoch": 1.7141990291262137, "grad_norm": 4.708635330200195, "learning_rate": 3.88783783745549e-05, "loss": 2.2453622817993164, "memory(GiB)": 50.28, "step": 2825, "token_acc": 0.5259515570934256, "train_speed(iter/s)": 0.344538 }, { "epoch": 1.7172330097087378, "grad_norm": 6.6683502197265625, "learning_rate": 3.872355563143173e-05, "loss": 1.9784585952758789, "memory(GiB)": 50.28, "step": 2830, "token_acc": 0.551948051948052, "train_speed(iter/s)": 0.34462 }, { "epoch": 1.7202669902912622, "grad_norm": 5.398106575012207, "learning_rate": 3.856884671776085e-05, "loss": 1.895383071899414, "memory(GiB)": 50.28, "step": 2835, "token_acc": 0.5604026845637584, "train_speed(iter/s)": 0.344681 }, { "epoch": 1.7233009708737863, "grad_norm": 4.595198631286621, "learning_rate": 3.8414253195242986e-05, "loss": 2.176414680480957, "memory(GiB)": 50.28, "step": 2840, "token_acc": 0.52046783625731, "train_speed(iter/s)": 0.344697 }, { "epoch": 1.7263349514563107, "grad_norm": 4.330265998840332, "learning_rate": 3.8259776624414e-05, "loss": 1.9469837188720702, "memory(GiB)": 50.28, "step": 2845, "token_acc": 0.576, "train_speed(iter/s)": 0.344588 }, { "epoch": 1.729368932038835, "grad_norm": 5.9306488037109375, "learning_rate": 3.81054185646292e-05, "loss": 1.991217041015625, "memory(GiB)": 50.28, "step": 2850, "token_acc": 0.5448504983388704, "train_speed(iter/s)": 0.344689 }, { "epoch": 1.7324029126213594, "grad_norm": 4.61736536026001, "learning_rate": 3.795118057404761e-05, "loss": 1.9766633987426758, "memory(GiB)": 50.28, "step": 2855, "token_acc": 0.5606557377049181, "train_speed(iter/s)": 0.344689 }, { "epoch": 1.7354368932038835, "grad_norm": 4.509167194366455, "learning_rate": 3.779706420961617e-05, "loss": 1.9337194442749024, "memory(GiB)": 50.28, "step": 2860, "token_acc": 0.5830721003134797, "train_speed(iter/s)": 0.344727 }, { "epoch": 1.7384708737864076, "grad_norm": 6.19570255279541, "learning_rate": 3.764307102705417e-05, "loss": 2.1587066650390625, "memory(GiB)": 50.28, "step": 2865, "token_acc": 0.5454545454545454, "train_speed(iter/s)": 0.344715 }, { "epoch": 1.741504854368932, "grad_norm": 5.265482425689697, "learning_rate": 3.748920258083736e-05, "loss": 2.0900060653686525, "memory(GiB)": 50.28, "step": 2870, "token_acc": 0.5259515570934256, "train_speed(iter/s)": 0.34471 }, { "epoch": 1.7445388349514563, "grad_norm": 5.675642490386963, "learning_rate": 3.7335460424182356e-05, "loss": 2.3027883529663087, "memory(GiB)": 50.28, "step": 2875, "token_acc": 0.5136054421768708, "train_speed(iter/s)": 0.344601 }, { "epoch": 1.7475728155339807, "grad_norm": 5.00628137588501, "learning_rate": 3.7181846109031005e-05, "loss": 1.9302787780761719, "memory(GiB)": 50.28, "step": 2880, "token_acc": 0.5439739413680782, "train_speed(iter/s)": 0.344598 }, { "epoch": 1.7506067961165048, "grad_norm": 5.7854509353637695, "learning_rate": 3.702836118603458e-05, "loss": 2.1817916870117187, "memory(GiB)": 50.28, "step": 2885, "token_acc": 0.5562130177514792, "train_speed(iter/s)": 0.344523 }, { "epoch": 1.7536407766990292, "grad_norm": 4.599446773529053, "learning_rate": 3.687500720453831e-05, "loss": 2.1247314453125, "memory(GiB)": 50.28, "step": 2890, "token_acc": 0.5470588235294118, "train_speed(iter/s)": 0.344549 }, { "epoch": 1.7566747572815533, "grad_norm": 7.078361988067627, "learning_rate": 3.672178571256556e-05, "loss": 2.412958526611328, "memory(GiB)": 50.28, "step": 2895, "token_acc": 0.5287009063444109, "train_speed(iter/s)": 0.344576 }, { "epoch": 1.7597087378640777, "grad_norm": 6.372231483459473, "learning_rate": 3.656869825680234e-05, "loss": 1.9741497039794922, "memory(GiB)": 50.28, "step": 2900, "token_acc": 0.5566666666666666, "train_speed(iter/s)": 0.344445 }, { "epoch": 1.762742718446602, "grad_norm": 6.948692321777344, "learning_rate": 3.641574638258162e-05, "loss": 2.0564809799194337, "memory(GiB)": 50.28, "step": 2905, "token_acc": 0.5368098159509203, "train_speed(iter/s)": 0.344486 }, { "epoch": 1.7657766990291264, "grad_norm": 4.16779899597168, "learning_rate": 3.62629316338677e-05, "loss": 2.2558427810668946, "memory(GiB)": 50.28, "step": 2910, "token_acc": 0.5045045045045045, "train_speed(iter/s)": 0.344525 }, { "epoch": 1.7688106796116505, "grad_norm": 6.0249342918396, "learning_rate": 3.611025555324079e-05, "loss": 2.115042495727539, "memory(GiB)": 50.28, "step": 2915, "token_acc": 0.5384615384615384, "train_speed(iter/s)": 0.344466 }, { "epoch": 1.7718446601941746, "grad_norm": 5.627461910247803, "learning_rate": 3.595771968188121e-05, "loss": 1.9789396286010743, "memory(GiB)": 50.28, "step": 2920, "token_acc": 0.5547445255474452, "train_speed(iter/s)": 0.344617 }, { "epoch": 1.774878640776699, "grad_norm": 4.772979259490967, "learning_rate": 3.5805325559554006e-05, "loss": 2.0461313247680666, "memory(GiB)": 50.28, "step": 2925, "token_acc": 0.5488721804511278, "train_speed(iter/s)": 0.344639 }, { "epoch": 1.7779126213592233, "grad_norm": 3.8835439682006836, "learning_rate": 3.5653074724593306e-05, "loss": 2.321491241455078, "memory(GiB)": 50.28, "step": 2930, "token_acc": 0.5038560411311054, "train_speed(iter/s)": 0.344663 }, { "epoch": 1.7809466019417477, "grad_norm": 6.731331825256348, "learning_rate": 3.550096871388689e-05, "loss": 2.049272918701172, "memory(GiB)": 50.28, "step": 2935, "token_acc": 0.5290322580645161, "train_speed(iter/s)": 0.344802 }, { "epoch": 1.7839805825242718, "grad_norm": 4.779376983642578, "learning_rate": 3.5349009062860586e-05, "loss": 2.100991439819336, "memory(GiB)": 50.28, "step": 2940, "token_acc": 0.5340501792114696, "train_speed(iter/s)": 0.344489 }, { "epoch": 1.787014563106796, "grad_norm": 3.728651762008667, "learning_rate": 3.519719730546275e-05, "loss": 1.9749870300292969, "memory(GiB)": 50.28, "step": 2945, "token_acc": 0.5852941176470589, "train_speed(iter/s)": 0.344493 }, { "epoch": 1.7900485436893203, "grad_norm": 4.912387847900391, "learning_rate": 3.504553497414893e-05, "loss": 2.27115535736084, "memory(GiB)": 50.28, "step": 2950, "token_acc": 0.5703971119133574, "train_speed(iter/s)": 0.344609 }, { "epoch": 1.7930825242718447, "grad_norm": 4.941476345062256, "learning_rate": 3.489402359986621e-05, "loss": 1.9939556121826172, "memory(GiB)": 50.28, "step": 2955, "token_acc": 0.5392156862745098, "train_speed(iter/s)": 0.344682 }, { "epoch": 1.796116504854369, "grad_norm": 5.409017086029053, "learning_rate": 3.474266471203794e-05, "loss": 2.3090913772583006, "memory(GiB)": 50.28, "step": 2960, "token_acc": 0.5093167701863354, "train_speed(iter/s)": 0.344684 }, { "epoch": 1.7991504854368932, "grad_norm": 4.6386237144470215, "learning_rate": 3.459145983854813e-05, "loss": 2.065108299255371, "memory(GiB)": 50.28, "step": 2965, "token_acc": 0.5580645161290323, "train_speed(iter/s)": 0.344723 }, { "epoch": 1.8021844660194175, "grad_norm": 4.531946182250977, "learning_rate": 3.444041050572611e-05, "loss": 2.0947938919067384, "memory(GiB)": 50.28, "step": 2970, "token_acc": 0.5443037974683544, "train_speed(iter/s)": 0.344847 }, { "epoch": 1.8052184466019416, "grad_norm": 6.083227157592773, "learning_rate": 3.4289518238331145e-05, "loss": 2.10572509765625, "memory(GiB)": 50.28, "step": 2975, "token_acc": 0.5039370078740157, "train_speed(iter/s)": 0.344793 }, { "epoch": 1.808252427184466, "grad_norm": 7.034914970397949, "learning_rate": 3.413878455953698e-05, "loss": 2.5399431228637694, "memory(GiB)": 50.28, "step": 2980, "token_acc": 0.4965034965034965, "train_speed(iter/s)": 0.344693 }, { "epoch": 1.8112864077669903, "grad_norm": 5.111301422119141, "learning_rate": 3.398821099091652e-05, "loss": 1.8892005920410155, "memory(GiB)": 50.28, "step": 2985, "token_acc": 0.5907473309608541, "train_speed(iter/s)": 0.344726 }, { "epoch": 1.8143203883495147, "grad_norm": 5.859311103820801, "learning_rate": 3.3837799052426434e-05, "loss": 2.418396759033203, "memory(GiB)": 50.28, "step": 2990, "token_acc": 0.5, "train_speed(iter/s)": 0.344828 }, { "epoch": 1.8173543689320388, "grad_norm": 6.873039722442627, "learning_rate": 3.3687550262391836e-05, "loss": 1.8656806945800781, "memory(GiB)": 50.28, "step": 2995, "token_acc": 0.5703971119133574, "train_speed(iter/s)": 0.344881 }, { "epoch": 1.820388349514563, "grad_norm": 5.602076530456543, "learning_rate": 3.353746613749094e-05, "loss": 1.7854610443115235, "memory(GiB)": 50.28, "step": 3000, "token_acc": 0.612, "train_speed(iter/s)": 0.344801 }, { "epoch": 1.820388349514563, "eval_loss": 2.0100154876708984, "eval_runtime": 12.1138, "eval_samples_per_second": 8.255, "eval_steps_per_second": 8.255, "eval_token_acc": 0.528023598820059, "step": 3000 }, { "epoch": 1.8234223300970873, "grad_norm": 8.399064064025879, "learning_rate": 3.33875481927397e-05, "loss": 1.9528318405151368, "memory(GiB)": 50.28, "step": 3005, "token_acc": 0.5252918287937743, "train_speed(iter/s)": 0.344402 }, { "epoch": 1.8264563106796117, "grad_norm": 5.018898010253906, "learning_rate": 3.3237797941476715e-05, "loss": 1.8331533432006837, "memory(GiB)": 50.28, "step": 3010, "token_acc": 0.5666666666666667, "train_speed(iter/s)": 0.344502 }, { "epoch": 1.829490291262136, "grad_norm": 5.958186149597168, "learning_rate": 3.308821689534766e-05, "loss": 2.132673645019531, "memory(GiB)": 50.28, "step": 3015, "token_acc": 0.5258064516129032, "train_speed(iter/s)": 0.344343 }, { "epoch": 1.8325242718446602, "grad_norm": 6.066761016845703, "learning_rate": 3.293880656429028e-05, "loss": 1.9015365600585938, "memory(GiB)": 50.28, "step": 3020, "token_acc": 0.55, "train_speed(iter/s)": 0.344319 }, { "epoch": 1.8355582524271845, "grad_norm": 4.496471881866455, "learning_rate": 3.278956845651897e-05, "loss": 2.1529977798461912, "memory(GiB)": 50.28, "step": 3025, "token_acc": 0.5177514792899408, "train_speed(iter/s)": 0.344319 }, { "epoch": 1.8385922330097086, "grad_norm": 6.538642883300781, "learning_rate": 3.2640504078509706e-05, "loss": 2.196062469482422, "memory(GiB)": 50.28, "step": 3030, "token_acc": 0.5352480417754569, "train_speed(iter/s)": 0.344311 }, { "epoch": 1.841626213592233, "grad_norm": 5.559994220733643, "learning_rate": 3.2491614934984706e-05, "loss": 2.1528417587280275, "memory(GiB)": 50.28, "step": 3035, "token_acc": 0.5307443365695793, "train_speed(iter/s)": 0.344272 }, { "epoch": 1.8446601941747574, "grad_norm": 5.954353332519531, "learning_rate": 3.2342902528897276e-05, "loss": 2.0980907440185548, "memory(GiB)": 50.28, "step": 3040, "token_acc": 0.5434782608695652, "train_speed(iter/s)": 0.344373 }, { "epoch": 1.8476941747572817, "grad_norm": 7.812261581420898, "learning_rate": 3.219436836141672e-05, "loss": 1.9015705108642578, "memory(GiB)": 50.28, "step": 3045, "token_acc": 0.5936395759717314, "train_speed(iter/s)": 0.344512 }, { "epoch": 1.8507281553398058, "grad_norm": 4.1047563552856445, "learning_rate": 3.204601393191305e-05, "loss": 2.1011117935180663, "memory(GiB)": 50.28, "step": 3050, "token_acc": 0.5641025641025641, "train_speed(iter/s)": 0.34452 }, { "epoch": 1.85376213592233, "grad_norm": 6.6495041847229, "learning_rate": 3.1897840737941996e-05, "loss": 2.0118566513061524, "memory(GiB)": 50.28, "step": 3055, "token_acc": 0.5370919881305638, "train_speed(iter/s)": 0.344597 }, { "epoch": 1.8567961165048543, "grad_norm": 5.358304977416992, "learning_rate": 3.174985027522978e-05, "loss": 2.109381103515625, "memory(GiB)": 50.28, "step": 3060, "token_acc": 0.5518394648829431, "train_speed(iter/s)": 0.344617 }, { "epoch": 1.8598300970873787, "grad_norm": 6.459053993225098, "learning_rate": 3.1602044037657994e-05, "loss": 2.4759895324707033, "memory(GiB)": 50.28, "step": 3065, "token_acc": 0.4924924924924925, "train_speed(iter/s)": 0.344655 }, { "epoch": 1.862864077669903, "grad_norm": 7.753528594970703, "learning_rate": 3.1454423517248704e-05, "loss": 1.9918514251708985, "memory(GiB)": 50.28, "step": 3070, "token_acc": 0.5389408099688473, "train_speed(iter/s)": 0.344668 }, { "epoch": 1.8658980582524272, "grad_norm": 5.160576343536377, "learning_rate": 3.1306990204149146e-05, "loss": 2.038582992553711, "memory(GiB)": 50.28, "step": 3075, "token_acc": 0.56, "train_speed(iter/s)": 0.344604 }, { "epoch": 1.8689320388349513, "grad_norm": 4.737534523010254, "learning_rate": 3.115974558661691e-05, "loss": 2.2147716522216796, "memory(GiB)": 50.28, "step": 3080, "token_acc": 0.5241635687732342, "train_speed(iter/s)": 0.344661 }, { "epoch": 1.8719660194174756, "grad_norm": 13.144882202148438, "learning_rate": 3.1012691151004694e-05, "loss": 1.9811866760253907, "memory(GiB)": 50.28, "step": 3085, "token_acc": 0.5842696629213483, "train_speed(iter/s)": 0.34478 }, { "epoch": 1.875, "grad_norm": 4.721546649932861, "learning_rate": 3.086582838174551e-05, "loss": 1.773356819152832, "memory(GiB)": 50.28, "step": 3090, "token_acc": 0.6068965517241379, "train_speed(iter/s)": 0.344689 }, { "epoch": 1.8780339805825244, "grad_norm": 6.987473487854004, "learning_rate": 3.0719158761337574e-05, "loss": 2.129978561401367, "memory(GiB)": 50.28, "step": 3095, "token_acc": 0.5648854961832062, "train_speed(iter/s)": 0.344623 }, { "epoch": 1.8810679611650487, "grad_norm": 4.66473388671875, "learning_rate": 3.0572683770329316e-05, "loss": 2.3790740966796875, "memory(GiB)": 50.28, "step": 3100, "token_acc": 0.5029411764705882, "train_speed(iter/s)": 0.344523 }, { "epoch": 1.8841019417475728, "grad_norm": 5.27858829498291, "learning_rate": 3.0426404887304605e-05, "loss": 2.170367431640625, "memory(GiB)": 50.28, "step": 3105, "token_acc": 0.5327380952380952, "train_speed(iter/s)": 0.344555 }, { "epoch": 1.887135922330097, "grad_norm": 6.859893798828125, "learning_rate": 3.0280323588867586e-05, "loss": 2.046696090698242, "memory(GiB)": 50.28, "step": 3110, "token_acc": 0.5733333333333334, "train_speed(iter/s)": 0.344515 }, { "epoch": 1.8901699029126213, "grad_norm": 4.379154205322266, "learning_rate": 3.0134441349627997e-05, "loss": 2.3432376861572264, "memory(GiB)": 50.28, "step": 3115, "token_acc": 0.5090361445783133, "train_speed(iter/s)": 0.344554 }, { "epoch": 1.8932038834951457, "grad_norm": 4.856401443481445, "learning_rate": 2.9988759642186097e-05, "loss": 2.2709232330322267, "memory(GiB)": 50.28, "step": 3120, "token_acc": 0.5228070175438596, "train_speed(iter/s)": 0.344676 }, { "epoch": 1.89623786407767, "grad_norm": 6.317505359649658, "learning_rate": 2.9843279937117997e-05, "loss": 1.8118804931640624, "memory(GiB)": 50.28, "step": 3125, "token_acc": 0.6027397260273972, "train_speed(iter/s)": 0.344764 }, { "epoch": 1.8992718446601942, "grad_norm": 5.960622787475586, "learning_rate": 2.9698003702960586e-05, "loss": 2.2873558044433593, "memory(GiB)": 50.28, "step": 3130, "token_acc": 0.5168539325842697, "train_speed(iter/s)": 0.34475 }, { "epoch": 1.9023058252427183, "grad_norm": 5.5217061042785645, "learning_rate": 2.9552932406196876e-05, "loss": 2.0904094696044924, "memory(GiB)": 50.28, "step": 3135, "token_acc": 0.5114754098360655, "train_speed(iter/s)": 0.344856 }, { "epoch": 1.9053398058252426, "grad_norm": 4.304378509521484, "learning_rate": 2.94080675112412e-05, "loss": 1.8553009033203125, "memory(GiB)": 50.28, "step": 3140, "token_acc": 0.6013986013986014, "train_speed(iter/s)": 0.344799 }, { "epoch": 1.908373786407767, "grad_norm": 5.418023109436035, "learning_rate": 2.9263410480424303e-05, "loss": 1.9853260040283203, "memory(GiB)": 50.28, "step": 3145, "token_acc": 0.5751445086705202, "train_speed(iter/s)": 0.344862 }, { "epoch": 1.9114077669902914, "grad_norm": 3.7519288063049316, "learning_rate": 2.9118962773978693e-05, "loss": 2.1828121185302733, "memory(GiB)": 50.28, "step": 3150, "token_acc": 0.5623188405797102, "train_speed(iter/s)": 0.344783 }, { "epoch": 1.9144417475728155, "grad_norm": 5.235729217529297, "learning_rate": 2.8974725850023886e-05, "loss": 1.8463653564453124, "memory(GiB)": 50.28, "step": 3155, "token_acc": 0.5777777777777777, "train_speed(iter/s)": 0.344806 }, { "epoch": 1.9174757281553398, "grad_norm": 4.126675605773926, "learning_rate": 2.8830701164551598e-05, "loss": 2.193348693847656, "memory(GiB)": 50.28, "step": 3160, "token_acc": 0.5086206896551724, "train_speed(iter/s)": 0.344847 }, { "epoch": 1.920509708737864, "grad_norm": 5.895939826965332, "learning_rate": 2.8686890171411175e-05, "loss": 1.9792446136474608, "memory(GiB)": 50.28, "step": 3165, "token_acc": 0.5618729096989966, "train_speed(iter/s)": 0.34482 }, { "epoch": 1.9235436893203883, "grad_norm": 4.5731940269470215, "learning_rate": 2.8543294322294846e-05, "loss": 1.9657699584960937, "memory(GiB)": 50.28, "step": 3170, "token_acc": 0.593167701863354, "train_speed(iter/s)": 0.344849 }, { "epoch": 1.9265776699029127, "grad_norm": 4.8557281494140625, "learning_rate": 2.8399915066723072e-05, "loss": 1.9642379760742188, "memory(GiB)": 50.28, "step": 3175, "token_acc": 0.5833333333333334, "train_speed(iter/s)": 0.344834 }, { "epoch": 1.929611650485437, "grad_norm": 4.523667335510254, "learning_rate": 2.8256753852029915e-05, "loss": 1.915951919555664, "memory(GiB)": 50.28, "step": 3180, "token_acc": 0.54, "train_speed(iter/s)": 0.344865 }, { "epoch": 1.9326456310679612, "grad_norm": 5.198642253875732, "learning_rate": 2.811381212334847e-05, "loss": 2.083959197998047, "memory(GiB)": 50.28, "step": 3185, "token_acc": 0.5530546623794212, "train_speed(iter/s)": 0.344957 }, { "epoch": 1.9356796116504853, "grad_norm": 5.4120283126831055, "learning_rate": 2.7971091323596177e-05, "loss": 2.0416549682617187, "memory(GiB)": 50.28, "step": 3190, "token_acc": 0.5405405405405406, "train_speed(iter/s)": 0.344962 }, { "epoch": 1.9387135922330097, "grad_norm": 4.352965354919434, "learning_rate": 2.782859289346038e-05, "loss": 1.838254737854004, "memory(GiB)": 50.28, "step": 3195, "token_acc": 0.554858934169279, "train_speed(iter/s)": 0.344883 }, { "epoch": 1.941747572815534, "grad_norm": 3.9446985721588135, "learning_rate": 2.7686318271383714e-05, "loss": 2.068651580810547, "memory(GiB)": 50.28, "step": 3200, "token_acc": 0.55, "train_speed(iter/s)": 0.344964 }, { "epoch": 1.9447815533980584, "grad_norm": 4.529702663421631, "learning_rate": 2.7544268893549573e-05, "loss": 2.049180603027344, "memory(GiB)": 50.28, "step": 3205, "token_acc": 0.5379537953795379, "train_speed(iter/s)": 0.344891 }, { "epoch": 1.9478155339805825, "grad_norm": 4.878699779510498, "learning_rate": 2.740244619386768e-05, "loss": 1.9800500869750977, "memory(GiB)": 50.28, "step": 3210, "token_acc": 0.5594202898550724, "train_speed(iter/s)": 0.34495 }, { "epoch": 1.9508495145631068, "grad_norm": 5.12588357925415, "learning_rate": 2.726085160395948e-05, "loss": 1.7163261413574218, "memory(GiB)": 50.28, "step": 3215, "token_acc": 0.599250936329588, "train_speed(iter/s)": 0.344894 }, { "epoch": 1.953883495145631, "grad_norm": 4.685471534729004, "learning_rate": 2.7119486553143904e-05, "loss": 2.2038547515869142, "memory(GiB)": 50.28, "step": 3220, "token_acc": 0.4924924924924925, "train_speed(iter/s)": 0.344812 }, { "epoch": 1.9569174757281553, "grad_norm": 5.362670421600342, "learning_rate": 2.6978352468422685e-05, "loss": 1.9291072845458985, "memory(GiB)": 50.28, "step": 3225, "token_acc": 0.5529801324503312, "train_speed(iter/s)": 0.34484 }, { "epoch": 1.9599514563106797, "grad_norm": 6.871734142303467, "learning_rate": 2.683745077446616e-05, "loss": 1.9386585235595704, "memory(GiB)": 50.28, "step": 3230, "token_acc": 0.5510204081632653, "train_speed(iter/s)": 0.344843 }, { "epoch": 1.962985436893204, "grad_norm": 4.2407941818237305, "learning_rate": 2.6696782893598816e-05, "loss": 2.020560646057129, "memory(GiB)": 50.28, "step": 3235, "token_acc": 0.5619596541786743, "train_speed(iter/s)": 0.344852 }, { "epoch": 1.9660194174757282, "grad_norm": 6.340878009796143, "learning_rate": 2.6556350245784833e-05, "loss": 2.3107511520385744, "memory(GiB)": 50.28, "step": 3240, "token_acc": 0.4657142857142857, "train_speed(iter/s)": 0.344822 }, { "epoch": 1.9690533980582523, "grad_norm": 4.866008758544922, "learning_rate": 2.641615424861399e-05, "loss": 1.95379581451416, "memory(GiB)": 50.28, "step": 3245, "token_acc": 0.550314465408805, "train_speed(iter/s)": 0.344856 }, { "epoch": 1.9720873786407767, "grad_norm": 5.4675092697143555, "learning_rate": 2.6276196317287083e-05, "loss": 2.146615982055664, "memory(GiB)": 50.28, "step": 3250, "token_acc": 0.5358361774744027, "train_speed(iter/s)": 0.344869 }, { "epoch": 1.975121359223301, "grad_norm": 5.971146106719971, "learning_rate": 2.6136477864601817e-05, "loss": 2.317397117614746, "memory(GiB)": 50.28, "step": 3255, "token_acc": 0.5, "train_speed(iter/s)": 0.344937 }, { "epoch": 1.9781553398058254, "grad_norm": 4.265781402587891, "learning_rate": 2.5997000300938506e-05, "loss": 1.8758274078369142, "memory(GiB)": 50.28, "step": 3260, "token_acc": 0.5298245614035088, "train_speed(iter/s)": 0.345039 }, { "epoch": 1.9811893203883495, "grad_norm": 6.009401798248291, "learning_rate": 2.585776503424576e-05, "loss": 2.0564142227172852, "memory(GiB)": 50.28, "step": 3265, "token_acc": 0.5391849529780565, "train_speed(iter/s)": 0.345002 }, { "epoch": 1.9842233009708736, "grad_norm": 5.191855430603027, "learning_rate": 2.5718773470026448e-05, "loss": 2.049026679992676, "memory(GiB)": 50.28, "step": 3270, "token_acc": 0.5283687943262412, "train_speed(iter/s)": 0.344984 }, { "epoch": 1.987257281553398, "grad_norm": 5.83769416809082, "learning_rate": 2.5580027011323282e-05, "loss": 2.318960952758789, "memory(GiB)": 50.28, "step": 3275, "token_acc": 0.5061728395061729, "train_speed(iter/s)": 0.344997 }, { "epoch": 1.9902912621359223, "grad_norm": 5.0391645431518555, "learning_rate": 2.544152705870483e-05, "loss": 2.2063325881958007, "memory(GiB)": 50.28, "step": 3280, "token_acc": 0.5030487804878049, "train_speed(iter/s)": 0.344892 }, { "epoch": 1.9933252427184467, "grad_norm": 4.2786784172058105, "learning_rate": 2.5303275010251315e-05, "loss": 2.3689651489257812, "memory(GiB)": 50.28, "step": 3285, "token_acc": 0.48009950248756217, "train_speed(iter/s)": 0.344878 }, { "epoch": 1.9963592233009708, "grad_norm": 6.388525485992432, "learning_rate": 2.5165272261540458e-05, "loss": 2.2494415283203124, "memory(GiB)": 50.28, "step": 3290, "token_acc": 0.5064935064935064, "train_speed(iter/s)": 0.34487 }, { "epoch": 1.9993932038834952, "grad_norm": 5.210384368896484, "learning_rate": 2.5027520205633537e-05, "loss": 1.9783794403076171, "memory(GiB)": 50.28, "step": 3295, "token_acc": 0.6063492063492063, "train_speed(iter/s)": 0.34485 }, { "epoch": 2.0024271844660193, "grad_norm": 5.193016052246094, "learning_rate": 2.4890020233061117e-05, "loss": 1.8787994384765625, "memory(GiB)": 50.28, "step": 3300, "token_acc": 0.563265306122449, "train_speed(iter/s)": 0.344928 }, { "epoch": 2.0054611650485437, "grad_norm": 3.543552875518799, "learning_rate": 2.4752773731809176e-05, "loss": 2.162800598144531, "memory(GiB)": 50.28, "step": 3305, "token_acc": 0.5657142857142857, "train_speed(iter/s)": 0.345013 }, { "epoch": 2.008495145631068, "grad_norm": 5.575774669647217, "learning_rate": 2.461578208730504e-05, "loss": 1.7372661590576173, "memory(GiB)": 50.28, "step": 3310, "token_acc": 0.5851393188854489, "train_speed(iter/s)": 0.345054 }, { "epoch": 2.0115291262135924, "grad_norm": 4.383781433105469, "learning_rate": 2.447904668240338e-05, "loss": 1.8831472396850586, "memory(GiB)": 50.28, "step": 3315, "token_acc": 0.5666666666666667, "train_speed(iter/s)": 0.344992 }, { "epoch": 2.0145631067961167, "grad_norm": 7.744244575500488, "learning_rate": 2.4342568897372304e-05, "loss": 2.075815200805664, "memory(GiB)": 50.28, "step": 3320, "token_acc": 0.5613382899628253, "train_speed(iter/s)": 0.345055 }, { "epoch": 2.0175970873786406, "grad_norm": 3.8814077377319336, "learning_rate": 2.4206350109879322e-05, "loss": 1.916240119934082, "memory(GiB)": 50.28, "step": 3325, "token_acc": 0.5642458100558659, "train_speed(iter/s)": 0.345123 }, { "epoch": 2.020631067961165, "grad_norm": 5.217380523681641, "learning_rate": 2.4070391694977578e-05, "loss": 1.935063934326172, "memory(GiB)": 50.28, "step": 3330, "token_acc": 0.5573770491803278, "train_speed(iter/s)": 0.345207 }, { "epoch": 2.0236650485436893, "grad_norm": 12.581655502319336, "learning_rate": 2.3934695025091863e-05, "loss": 2.122936248779297, "memory(GiB)": 50.28, "step": 3335, "token_acc": 0.5524475524475524, "train_speed(iter/s)": 0.345273 }, { "epoch": 2.0266990291262137, "grad_norm": 3.3827898502349854, "learning_rate": 2.3799261470004817e-05, "loss": 1.861661148071289, "memory(GiB)": 50.28, "step": 3340, "token_acc": 0.6027397260273972, "train_speed(iter/s)": 0.345258 }, { "epoch": 2.029733009708738, "grad_norm": 5.65352725982666, "learning_rate": 2.3664092396843078e-05, "loss": 2.132010078430176, "memory(GiB)": 50.6, "step": 3345, "token_acc": 0.540625, "train_speed(iter/s)": 0.34524 }, { "epoch": 2.032766990291262, "grad_norm": 4.553690433502197, "learning_rate": 2.3529189170063448e-05, "loss": 1.958633041381836, "memory(GiB)": 50.6, "step": 3350, "token_acc": 0.5442622950819672, "train_speed(iter/s)": 0.345229 }, { "epoch": 2.0358009708737863, "grad_norm": 5.222290992736816, "learning_rate": 2.3394553151439207e-05, "loss": 2.0045536041259764, "memory(GiB)": 50.6, "step": 3355, "token_acc": 0.5376712328767124, "train_speed(iter/s)": 0.345218 }, { "epoch": 2.0388349514563107, "grad_norm": 5.207190036773682, "learning_rate": 2.3260185700046294e-05, "loss": 2.203332710266113, "memory(GiB)": 50.6, "step": 3360, "token_acc": 0.5584795321637427, "train_speed(iter/s)": 0.345264 }, { "epoch": 2.041868932038835, "grad_norm": 4.448570251464844, "learning_rate": 2.3126088172249617e-05, "loss": 2.05941162109375, "memory(GiB)": 50.6, "step": 3365, "token_acc": 0.5466666666666666, "train_speed(iter/s)": 0.345265 }, { "epoch": 2.0449029126213594, "grad_norm": 6.660695552825928, "learning_rate": 2.299226192168935e-05, "loss": 2.0592878341674803, "memory(GiB)": 50.6, "step": 3370, "token_acc": 0.5466666666666666, "train_speed(iter/s)": 0.34532 }, { "epoch": 2.0479368932038833, "grad_norm": 5.232614040374756, "learning_rate": 2.28587082992673e-05, "loss": 2.029874801635742, "memory(GiB)": 50.6, "step": 3375, "token_acc": 0.5746268656716418, "train_speed(iter/s)": 0.345325 }, { "epoch": 2.0509708737864076, "grad_norm": 5.230781078338623, "learning_rate": 2.2725428653133178e-05, "loss": 1.9541156768798829, "memory(GiB)": 50.6, "step": 3380, "token_acc": 0.5203488372093024, "train_speed(iter/s)": 0.345262 }, { "epoch": 2.054004854368932, "grad_norm": 4.539322853088379, "learning_rate": 2.2592424328671125e-05, "loss": 2.32218017578125, "memory(GiB)": 50.6, "step": 3385, "token_acc": 0.49604221635883905, "train_speed(iter/s)": 0.345325 }, { "epoch": 2.0570388349514563, "grad_norm": 4.8435282707214355, "learning_rate": 2.2459696668486025e-05, "loss": 1.9362192153930664, "memory(GiB)": 50.6, "step": 3390, "token_acc": 0.5582191780821918, "train_speed(iter/s)": 0.345323 }, { "epoch": 2.0600728155339807, "grad_norm": 4.828400135040283, "learning_rate": 2.2327247012390005e-05, "loss": 1.8824956893920899, "memory(GiB)": 50.6, "step": 3395, "token_acc": 0.58125, "train_speed(iter/s)": 0.345272 }, { "epoch": 2.063106796116505, "grad_norm": 5.68388032913208, "learning_rate": 2.2195076697388915e-05, "loss": 2.0521530151367187, "memory(GiB)": 50.6, "step": 3400, "token_acc": 0.5298507462686567, "train_speed(iter/s)": 0.345171 }, { "epoch": 2.066140776699029, "grad_norm": 3.8343381881713867, "learning_rate": 2.2063187057668727e-05, "loss": 2.1346080780029295, "memory(GiB)": 50.6, "step": 3405, "token_acc": 0.5532467532467532, "train_speed(iter/s)": 0.34522 }, { "epoch": 2.0691747572815533, "grad_norm": 7.211141586303711, "learning_rate": 2.1931579424582283e-05, "loss": 1.811175537109375, "memory(GiB)": 50.6, "step": 3410, "token_acc": 0.5818815331010453, "train_speed(iter/s)": 0.345304 }, { "epoch": 2.0722087378640777, "grad_norm": 5.827749729156494, "learning_rate": 2.18002551266356e-05, "loss": 2.0785232543945313, "memory(GiB)": 50.6, "step": 3415, "token_acc": 0.5510204081632653, "train_speed(iter/s)": 0.345316 }, { "epoch": 2.075242718446602, "grad_norm": 4.561173915863037, "learning_rate": 2.166921548947466e-05, "loss": 1.8665559768676758, "memory(GiB)": 50.6, "step": 3420, "token_acc": 0.562015503875969, "train_speed(iter/s)": 0.345309 }, { "epoch": 2.0782766990291264, "grad_norm": 6.844289302825928, "learning_rate": 2.1538461835871937e-05, "loss": 1.875143814086914, "memory(GiB)": 50.6, "step": 3425, "token_acc": 0.5476190476190477, "train_speed(iter/s)": 0.345292 }, { "epoch": 2.0813106796116503, "grad_norm": 4.902288913726807, "learning_rate": 2.1407995485713007e-05, "loss": 2.0950822830200195, "memory(GiB)": 50.6, "step": 3430, "token_acc": 0.5290322580645161, "train_speed(iter/s)": 0.345388 }, { "epoch": 2.0843446601941746, "grad_norm": 4.216062545776367, "learning_rate": 2.127781775598339e-05, "loss": 1.9906414031982422, "memory(GiB)": 50.6, "step": 3435, "token_acc": 0.5490196078431373, "train_speed(iter/s)": 0.345371 }, { "epoch": 2.087378640776699, "grad_norm": 5.434879779815674, "learning_rate": 2.1147929960755032e-05, "loss": 1.9950527191162108, "memory(GiB)": 50.6, "step": 3440, "token_acc": 0.5496894409937888, "train_speed(iter/s)": 0.345468 }, { "epoch": 2.0904126213592233, "grad_norm": 5.217255592346191, "learning_rate": 2.101833341117319e-05, "loss": 1.9016313552856445, "memory(GiB)": 50.6, "step": 3445, "token_acc": 0.5552147239263804, "train_speed(iter/s)": 0.345428 }, { "epoch": 2.0934466019417477, "grad_norm": 6.1612396240234375, "learning_rate": 2.08890294154432e-05, "loss": 2.1277400970458986, "memory(GiB)": 50.6, "step": 3450, "token_acc": 0.5154639175257731, "train_speed(iter/s)": 0.345396 }, { "epoch": 2.096480582524272, "grad_norm": 4.644462585449219, "learning_rate": 2.0760019278817123e-05, "loss": 2.06710147857666, "memory(GiB)": 50.6, "step": 3455, "token_acc": 0.5277777777777778, "train_speed(iter/s)": 0.345323 }, { "epoch": 2.099514563106796, "grad_norm": 6.732641696929932, "learning_rate": 2.0631304303580824e-05, "loss": 2.186777114868164, "memory(GiB)": 50.6, "step": 3460, "token_acc": 0.5123674911660777, "train_speed(iter/s)": 0.34533 }, { "epoch": 2.1025485436893203, "grad_norm": 5.6535258293151855, "learning_rate": 2.0502885789040537e-05, "loss": 2.0752628326416014, "memory(GiB)": 50.6, "step": 3465, "token_acc": 0.5432525951557093, "train_speed(iter/s)": 0.345331 }, { "epoch": 2.1055825242718447, "grad_norm": 5.06096076965332, "learning_rate": 2.037476503150997e-05, "loss": 1.8622182846069335, "memory(GiB)": 50.6, "step": 3470, "token_acc": 0.5967741935483871, "train_speed(iter/s)": 0.345315 }, { "epoch": 2.108616504854369, "grad_norm": 7.963958263397217, "learning_rate": 2.024694332429713e-05, "loss": 2.3141984939575195, "memory(GiB)": 50.6, "step": 3475, "token_acc": 0.4986449864498645, "train_speed(iter/s)": 0.345289 }, { "epoch": 2.1116504854368934, "grad_norm": 4.8425703048706055, "learning_rate": 2.011942195769122e-05, "loss": 1.6697425842285156, "memory(GiB)": 50.6, "step": 3480, "token_acc": 0.6095890410958904, "train_speed(iter/s)": 0.345347 }, { "epoch": 2.1146844660194173, "grad_norm": 5.740769386291504, "learning_rate": 1.9992202218949784e-05, "loss": 2.2240673065185548, "memory(GiB)": 50.6, "step": 3485, "token_acc": 0.5434083601286174, "train_speed(iter/s)": 0.345436 }, { "epoch": 2.1177184466019416, "grad_norm": 6.1080169677734375, "learning_rate": 1.986528539228548e-05, "loss": 1.9705156326293944, "memory(GiB)": 50.6, "step": 3490, "token_acc": 0.5714285714285714, "train_speed(iter/s)": 0.34534 }, { "epoch": 2.120752427184466, "grad_norm": 7.133843898773193, "learning_rate": 1.9738672758853305e-05, "loss": 1.8085777282714843, "memory(GiB)": 50.6, "step": 3495, "token_acc": 0.5772058823529411, "train_speed(iter/s)": 0.345304 }, { "epoch": 2.1237864077669903, "grad_norm": 4.645880699157715, "learning_rate": 1.9612365596737598e-05, "loss": 1.8919126510620117, "memory(GiB)": 50.6, "step": 3500, "token_acc": 0.5700636942675159, "train_speed(iter/s)": 0.345233 }, { "epoch": 2.1237864077669903, "eval_loss": 2.029945135116577, "eval_runtime": 12.0306, "eval_samples_per_second": 8.312, "eval_steps_per_second": 8.312, "eval_token_acc": 0.5397653194263363, "step": 3500 }, { "epoch": 2.1268203883495147, "grad_norm": 4.594156265258789, "learning_rate": 1.948636518093906e-05, "loss": 1.894108772277832, "memory(GiB)": 50.6, "step": 3505, "token_acc": 0.540796963946869, "train_speed(iter/s)": 0.344826 }, { "epoch": 2.1298543689320386, "grad_norm": 4.802730560302734, "learning_rate": 1.9360672783362076e-05, "loss": 1.9896692276000976, "memory(GiB)": 50.6, "step": 3510, "token_acc": 0.5321100917431193, "train_speed(iter/s)": 0.344885 }, { "epoch": 2.132888349514563, "grad_norm": 4.689046859741211, "learning_rate": 1.9235289672801653e-05, "loss": 1.9118257522583009, "memory(GiB)": 50.6, "step": 3515, "token_acc": 0.5634674922600619, "train_speed(iter/s)": 0.344884 }, { "epoch": 2.1359223300970873, "grad_norm": 5.41667366027832, "learning_rate": 1.911021711493077e-05, "loss": 1.9389488220214843, "memory(GiB)": 50.6, "step": 3520, "token_acc": 0.5668789808917197, "train_speed(iter/s)": 0.344798 }, { "epoch": 2.1389563106796117, "grad_norm": 5.899313449859619, "learning_rate": 1.8985456372287534e-05, "loss": 1.9425783157348633, "memory(GiB)": 50.6, "step": 3525, "token_acc": 0.5548780487804879, "train_speed(iter/s)": 0.344813 }, { "epoch": 2.141990291262136, "grad_norm": 6.026679992675781, "learning_rate": 1.8861008704262457e-05, "loss": 2.3336254119873048, "memory(GiB)": 50.6, "step": 3530, "token_acc": 0.49843260188087773, "train_speed(iter/s)": 0.344837 }, { "epoch": 2.1450242718446604, "grad_norm": 5.279869556427002, "learning_rate": 1.8736875367085755e-05, "loss": 1.6840200424194336, "memory(GiB)": 50.6, "step": 3535, "token_acc": 0.5759717314487632, "train_speed(iter/s)": 0.344885 }, { "epoch": 2.1480582524271843, "grad_norm": 5.156026363372803, "learning_rate": 1.8613057613814584e-05, "loss": 1.8844125747680665, "memory(GiB)": 50.6, "step": 3540, "token_acc": 0.5714285714285714, "train_speed(iter/s)": 0.344753 }, { "epoch": 2.1510922330097086, "grad_norm": 5.9521918296813965, "learning_rate": 1.8489556694320513e-05, "loss": 2.0156713485717774, "memory(GiB)": 50.6, "step": 3545, "token_acc": 0.5215827338129496, "train_speed(iter/s)": 0.344823 }, { "epoch": 2.154126213592233, "grad_norm": 6.482142925262451, "learning_rate": 1.836637385527684e-05, "loss": 1.9887617111206055, "memory(GiB)": 50.6, "step": 3550, "token_acc": 0.5275862068965518, "train_speed(iter/s)": 0.344801 }, { "epoch": 2.1571601941747574, "grad_norm": 6.972325325012207, "learning_rate": 1.8243510340146015e-05, "loss": 2.128932762145996, "memory(GiB)": 50.6, "step": 3555, "token_acc": 0.534375, "train_speed(iter/s)": 0.344698 }, { "epoch": 2.1601941747572817, "grad_norm": 5.6347479820251465, "learning_rate": 1.8120967389167076e-05, "loss": 2.149120903015137, "memory(GiB)": 50.6, "step": 3560, "token_acc": 0.5462686567164179, "train_speed(iter/s)": 0.344718 }, { "epoch": 2.163228155339806, "grad_norm": 5.050625324249268, "learning_rate": 1.799874623934318e-05, "loss": 2.237873077392578, "memory(GiB)": 50.6, "step": 3565, "token_acc": 0.5313432835820896, "train_speed(iter/s)": 0.344743 }, { "epoch": 2.16626213592233, "grad_norm": 6.398519992828369, "learning_rate": 1.7876848124429014e-05, "loss": 1.987525177001953, "memory(GiB)": 50.6, "step": 3570, "token_acc": 0.5639097744360902, "train_speed(iter/s)": 0.34477 }, { "epoch": 2.1692961165048543, "grad_norm": 5.828600883483887, "learning_rate": 1.775527427491847e-05, "loss": 2.3566856384277344, "memory(GiB)": 50.6, "step": 3575, "token_acc": 0.48014440433212996, "train_speed(iter/s)": 0.344847 }, { "epoch": 2.1723300970873787, "grad_norm": 5.7275896072387695, "learning_rate": 1.7634025918032132e-05, "loss": 2.3688285827636717, "memory(GiB)": 50.6, "step": 3580, "token_acc": 0.5451895043731778, "train_speed(iter/s)": 0.344903 }, { "epoch": 2.175364077669903, "grad_norm": 5.7674431800842285, "learning_rate": 1.7513104277704926e-05, "loss": 1.837372398376465, "memory(GiB)": 50.6, "step": 3585, "token_acc": 0.5619335347432024, "train_speed(iter/s)": 0.344878 }, { "epoch": 2.1783980582524274, "grad_norm": 4.379698753356934, "learning_rate": 1.739251057457377e-05, "loss": 1.9321279525756836, "memory(GiB)": 50.6, "step": 3590, "token_acc": 0.56657223796034, "train_speed(iter/s)": 0.344793 }, { "epoch": 2.1814320388349513, "grad_norm": 4.948322772979736, "learning_rate": 1.7272246025965178e-05, "loss": 2.120316505432129, "memory(GiB)": 50.6, "step": 3595, "token_acc": 0.5451807228915663, "train_speed(iter/s)": 0.344763 }, { "epoch": 2.1844660194174756, "grad_norm": 4.791065216064453, "learning_rate": 1.7152311845883095e-05, "loss": 2.0224021911621093, "memory(GiB)": 50.6, "step": 3600, "token_acc": 0.5647058823529412, "train_speed(iter/s)": 0.344784 }, { "epoch": 2.1875, "grad_norm": 5.331698894500732, "learning_rate": 1.703270924499656e-05, "loss": 2.056295394897461, "memory(GiB)": 50.6, "step": 3605, "token_acc": 0.5485714285714286, "train_speed(iter/s)": 0.34475 }, { "epoch": 2.1905339805825244, "grad_norm": 5.583906173706055, "learning_rate": 1.691343943062749e-05, "loss": 1.9945308685302734, "memory(GiB)": 50.6, "step": 3610, "token_acc": 0.5709779179810726, "train_speed(iter/s)": 0.344778 }, { "epoch": 2.1935679611650487, "grad_norm": 4.002586364746094, "learning_rate": 1.6794503606738548e-05, "loss": 1.7582271575927735, "memory(GiB)": 50.6, "step": 3615, "token_acc": 0.6151315789473685, "train_speed(iter/s)": 0.344822 }, { "epoch": 2.1966019417475726, "grad_norm": 6.799562454223633, "learning_rate": 1.667590297392086e-05, "loss": 1.8144449234008788, "memory(GiB)": 50.6, "step": 3620, "token_acc": 0.5977859778597786, "train_speed(iter/s)": 0.344814 }, { "epoch": 2.199635922330097, "grad_norm": 4.525378704071045, "learning_rate": 1.6557638729382107e-05, "loss": 1.9035455703735351, "memory(GiB)": 50.6, "step": 3625, "token_acc": 0.5570934256055363, "train_speed(iter/s)": 0.344871 }, { "epoch": 2.2026699029126213, "grad_norm": 4.928961277008057, "learning_rate": 1.6439712066934204e-05, "loss": 1.8028743743896485, "memory(GiB)": 50.6, "step": 3630, "token_acc": 0.5808823529411765, "train_speed(iter/s)": 0.344845 }, { "epoch": 2.2057038834951457, "grad_norm": 5.1491007804870605, "learning_rate": 1.632212417698143e-05, "loss": 2.139560508728027, "memory(GiB)": 50.6, "step": 3635, "token_acc": 0.5, "train_speed(iter/s)": 0.344768 }, { "epoch": 2.20873786407767, "grad_norm": 6.033669471740723, "learning_rate": 1.620487624650834e-05, "loss": 1.7967906951904298, "memory(GiB)": 50.6, "step": 3640, "token_acc": 0.5793650793650794, "train_speed(iter/s)": 0.34487 }, { "epoch": 2.211771844660194, "grad_norm": 5.841419219970703, "learning_rate": 1.6087969459067708e-05, "loss": 1.8101051330566407, "memory(GiB)": 50.6, "step": 3645, "token_acc": 0.6012861736334405, "train_speed(iter/s)": 0.344821 }, { "epoch": 2.2148058252427183, "grad_norm": 5.800178527832031, "learning_rate": 1.5971404994768797e-05, "loss": 2.175870895385742, "memory(GiB)": 50.6, "step": 3650, "token_acc": 0.5154320987654321, "train_speed(iter/s)": 0.34484 }, { "epoch": 2.2178398058252426, "grad_norm": 4.805243968963623, "learning_rate": 1.585518403026518e-05, "loss": 2.0600021362304686, "memory(GiB)": 50.6, "step": 3655, "token_acc": 0.5444126074498568, "train_speed(iter/s)": 0.344817 }, { "epoch": 2.220873786407767, "grad_norm": 3.8938114643096924, "learning_rate": 1.5739307738743057e-05, "loss": 1.9185747146606444, "memory(GiB)": 50.6, "step": 3660, "token_acc": 0.5596330275229358, "train_speed(iter/s)": 0.344873 }, { "epoch": 2.2239077669902914, "grad_norm": 5.153964042663574, "learning_rate": 1.5623777289909347e-05, "loss": 1.6549354553222657, "memory(GiB)": 50.6, "step": 3665, "token_acc": 0.6027397260273972, "train_speed(iter/s)": 0.34477 }, { "epoch": 2.2269417475728157, "grad_norm": 5.778039932250977, "learning_rate": 1.5508593849979812e-05, "loss": 1.9642074584960938, "memory(GiB)": 50.6, "step": 3670, "token_acc": 0.57, "train_speed(iter/s)": 0.344657 }, { "epoch": 2.2299757281553396, "grad_norm": 5.437472343444824, "learning_rate": 1.5393758581667462e-05, "loss": 2.049532890319824, "memory(GiB)": 50.6, "step": 3675, "token_acc": 0.5331125827814569, "train_speed(iter/s)": 0.344694 }, { "epoch": 2.233009708737864, "grad_norm": 11.309470176696777, "learning_rate": 1.52792726441706e-05, "loss": 2.142658805847168, "memory(GiB)": 50.6, "step": 3680, "token_acc": 0.5335463258785943, "train_speed(iter/s)": 0.344721 }, { "epoch": 2.2360436893203883, "grad_norm": 7.32981014251709, "learning_rate": 1.5165137193161289e-05, "loss": 2.171151351928711, "memory(GiB)": 50.6, "step": 3685, "token_acc": 0.553030303030303, "train_speed(iter/s)": 0.344757 }, { "epoch": 2.2390776699029127, "grad_norm": 7.668642044067383, "learning_rate": 1.505135338077363e-05, "loss": 1.9927787780761719, "memory(GiB)": 50.6, "step": 3690, "token_acc": 0.5691823899371069, "train_speed(iter/s)": 0.344756 }, { "epoch": 2.242111650485437, "grad_norm": 3.881864547729492, "learning_rate": 1.4937922355592054e-05, "loss": 1.7769926071166993, "memory(GiB)": 50.6, "step": 3695, "token_acc": 0.6126984126984127, "train_speed(iter/s)": 0.344734 }, { "epoch": 2.2451456310679614, "grad_norm": 4.81337308883667, "learning_rate": 1.482484526263993e-05, "loss": 1.9315889358520508, "memory(GiB)": 50.6, "step": 3700, "token_acc": 0.5588235294117647, "train_speed(iter/s)": 0.344778 }, { "epoch": 2.2481796116504853, "grad_norm": 6.5428690910339355, "learning_rate": 1.4712123243367742e-05, "loss": 2.0162689208984377, "memory(GiB)": 50.6, "step": 3705, "token_acc": 0.5749128919860628, "train_speed(iter/s)": 0.344848 }, { "epoch": 2.2512135922330097, "grad_norm": 5.6883111000061035, "learning_rate": 1.459975743564178e-05, "loss": 1.9167594909667969, "memory(GiB)": 50.6, "step": 3710, "token_acc": 0.576271186440678, "train_speed(iter/s)": 0.344906 }, { "epoch": 2.254247572815534, "grad_norm": 4.094846725463867, "learning_rate": 1.4487748973732567e-05, "loss": 1.8676132202148437, "memory(GiB)": 50.6, "step": 3715, "token_acc": 0.5803571428571429, "train_speed(iter/s)": 0.344941 }, { "epoch": 2.2572815533980584, "grad_norm": 5.352729320526123, "learning_rate": 1.4376098988303405e-05, "loss": 2.115581512451172, "memory(GiB)": 50.6, "step": 3720, "token_acc": 0.5292397660818714, "train_speed(iter/s)": 0.34496 }, { "epoch": 2.2603155339805827, "grad_norm": 5.883962154388428, "learning_rate": 1.4264808606398988e-05, "loss": 1.9366184234619142, "memory(GiB)": 50.6, "step": 3725, "token_acc": 0.5743944636678201, "train_speed(iter/s)": 0.344983 }, { "epoch": 2.2633495145631066, "grad_norm": 10.41577434539795, "learning_rate": 1.4153878951433985e-05, "loss": 2.1848495483398436, "memory(GiB)": 50.6, "step": 3730, "token_acc": 0.5300353356890459, "train_speed(iter/s)": 0.345025 }, { "epoch": 2.266383495145631, "grad_norm": 6.488175868988037, "learning_rate": 1.4043311143181743e-05, "loss": 1.988439178466797, "memory(GiB)": 50.6, "step": 3735, "token_acc": 0.5735735735735735, "train_speed(iter/s)": 0.345065 }, { "epoch": 2.2694174757281553, "grad_norm": 5.074716567993164, "learning_rate": 1.3933106297762983e-05, "loss": 2.0763900756835936, "memory(GiB)": 50.6, "step": 3740, "token_acc": 0.5311475409836065, "train_speed(iter/s)": 0.344947 }, { "epoch": 2.2724514563106797, "grad_norm": 4.931432247161865, "learning_rate": 1.38232655276345e-05, "loss": 2.1887067794799804, "memory(GiB)": 50.6, "step": 3745, "token_acc": 0.5389408099688473, "train_speed(iter/s)": 0.344861 }, { "epoch": 2.275485436893204, "grad_norm": 5.03133487701416, "learning_rate": 1.3713789941577947e-05, "loss": 2.2176355361938476, "memory(GiB)": 50.6, "step": 3750, "token_acc": 0.5140449438202247, "train_speed(iter/s)": 0.344929 }, { "epoch": 2.278519417475728, "grad_norm": 6.510335922241211, "learning_rate": 1.3604680644688673e-05, "loss": 2.0438154220581053, "memory(GiB)": 50.6, "step": 3755, "token_acc": 0.5434782608695652, "train_speed(iter/s)": 0.344964 }, { "epoch": 2.2815533980582523, "grad_norm": 5.163710117340088, "learning_rate": 1.3495938738364495e-05, "loss": 1.8949357986450195, "memory(GiB)": 50.6, "step": 3760, "token_acc": 0.5844155844155844, "train_speed(iter/s)": 0.3449 }, { "epoch": 2.2845873786407767, "grad_norm": 8.918208122253418, "learning_rate": 1.338756532029466e-05, "loss": 2.229213905334473, "memory(GiB)": 50.6, "step": 3765, "token_acc": 0.5403508771929825, "train_speed(iter/s)": 0.344878 }, { "epoch": 2.287621359223301, "grad_norm": 4.586476802825928, "learning_rate": 1.3279561484448726e-05, "loss": 2.0009828567504884, "memory(GiB)": 50.6, "step": 3770, "token_acc": 0.56282722513089, "train_speed(iter/s)": 0.344892 }, { "epoch": 2.2906553398058254, "grad_norm": 5.315647125244141, "learning_rate": 1.3171928321065525e-05, "loss": 2.008698654174805, "memory(GiB)": 50.6, "step": 3775, "token_acc": 0.4956772334293948, "train_speed(iter/s)": 0.344907 }, { "epoch": 2.2936893203883493, "grad_norm": 4.923291206359863, "learning_rate": 1.306466691664216e-05, "loss": 1.7733882904052733, "memory(GiB)": 50.6, "step": 3780, "token_acc": 0.5912698412698413, "train_speed(iter/s)": 0.344892 }, { "epoch": 2.2967233009708736, "grad_norm": 5.02618408203125, "learning_rate": 1.2957778353922994e-05, "loss": 1.716277503967285, "memory(GiB)": 50.6, "step": 3785, "token_acc": 0.5909090909090909, "train_speed(iter/s)": 0.344929 }, { "epoch": 2.299757281553398, "grad_norm": 4.985109806060791, "learning_rate": 1.285126371188881e-05, "loss": 2.0128692626953124, "memory(GiB)": 50.6, "step": 3790, "token_acc": 0.5719063545150501, "train_speed(iter/s)": 0.344931 }, { "epoch": 2.3027912621359223, "grad_norm": 3.823289155960083, "learning_rate": 1.2745124065745845e-05, "loss": 1.594803524017334, "memory(GiB)": 50.6, "step": 3795, "token_acc": 0.6047297297297297, "train_speed(iter/s)": 0.344942 }, { "epoch": 2.3058252427184467, "grad_norm": 5.443478107452393, "learning_rate": 1.2639360486914964e-05, "loss": 2.235613250732422, "memory(GiB)": 50.6, "step": 3800, "token_acc": 0.5244565217391305, "train_speed(iter/s)": 0.34485 }, { "epoch": 2.308859223300971, "grad_norm": 6.027771472930908, "learning_rate": 1.2533974043020862e-05, "loss": 2.0554859161376955, "memory(GiB)": 50.6, "step": 3805, "token_acc": 0.5654596100278552, "train_speed(iter/s)": 0.344729 }, { "epoch": 2.311893203883495, "grad_norm": 5.283422946929932, "learning_rate": 1.2428965797881204e-05, "loss": 2.196268081665039, "memory(GiB)": 50.6, "step": 3810, "token_acc": 0.5163934426229508, "train_speed(iter/s)": 0.344726 }, { "epoch": 2.3149271844660193, "grad_norm": 5.5613555908203125, "learning_rate": 1.232433681149604e-05, "loss": 2.181218910217285, "memory(GiB)": 50.6, "step": 3815, "token_acc": 0.5236486486486487, "train_speed(iter/s)": 0.344767 }, { "epoch": 2.3179611650485437, "grad_norm": 7.4019622802734375, "learning_rate": 1.2220088140036934e-05, "loss": 1.8360416412353515, "memory(GiB)": 50.6, "step": 3820, "token_acc": 0.5543478260869565, "train_speed(iter/s)": 0.34476 }, { "epoch": 2.320995145631068, "grad_norm": 5.247798442840576, "learning_rate": 1.2116220835836389e-05, "loss": 2.0518161773681642, "memory(GiB)": 50.6, "step": 3825, "token_acc": 0.5630252100840336, "train_speed(iter/s)": 0.344765 }, { "epoch": 2.3240291262135924, "grad_norm": 4.724814414978027, "learning_rate": 1.2012735947377297e-05, "loss": 2.1411914825439453, "memory(GiB)": 50.6, "step": 3830, "token_acc": 0.5244565217391305, "train_speed(iter/s)": 0.344772 }, { "epoch": 2.3270631067961167, "grad_norm": 4.550099849700928, "learning_rate": 1.1909634519282154e-05, "loss": 1.9134037017822265, "memory(GiB)": 50.6, "step": 3835, "token_acc": 0.5538922155688623, "train_speed(iter/s)": 0.344763 }, { "epoch": 2.3300970873786406, "grad_norm": 4.154212951660156, "learning_rate": 1.1806917592302762e-05, "loss": 2.0584575653076174, "memory(GiB)": 50.6, "step": 3840, "token_acc": 0.5657142857142857, "train_speed(iter/s)": 0.344787 }, { "epoch": 2.333131067961165, "grad_norm": 7.262927532196045, "learning_rate": 1.1704586203309486e-05, "loss": 2.36254825592041, "memory(GiB)": 50.6, "step": 3845, "token_acc": 0.5129682997118156, "train_speed(iter/s)": 0.344775 }, { "epoch": 2.3361650485436893, "grad_norm": 6.921282768249512, "learning_rate": 1.1602641385280971e-05, "loss": 2.1352420806884767, "memory(GiB)": 50.6, "step": 3850, "token_acc": 0.5218978102189781, "train_speed(iter/s)": 0.344697 }, { "epoch": 2.3391990291262137, "grad_norm": 4.498334884643555, "learning_rate": 1.1501084167293624e-05, "loss": 1.9374540328979493, "memory(GiB)": 50.6, "step": 3855, "token_acc": 0.5634674922600619, "train_speed(iter/s)": 0.344653 }, { "epoch": 2.342233009708738, "grad_norm": 8.23636245727539, "learning_rate": 1.1399915574511205e-05, "loss": 2.0232606887817384, "memory(GiB)": 50.6, "step": 3860, "token_acc": 0.5609756097560976, "train_speed(iter/s)": 0.344645 }, { "epoch": 2.345266990291262, "grad_norm": 4.897376537322998, "learning_rate": 1.1299136628174606e-05, "loss": 2.0116275787353515, "memory(GiB)": 50.6, "step": 3865, "token_acc": 0.5598705501618123, "train_speed(iter/s)": 0.344712 }, { "epoch": 2.3483009708737863, "grad_norm": 4.954316139221191, "learning_rate": 1.1198748345591358e-05, "loss": 1.7617246627807617, "memory(GiB)": 50.6, "step": 3870, "token_acc": 0.5543071161048689, "train_speed(iter/s)": 0.344704 }, { "epoch": 2.3513349514563107, "grad_norm": 4.603342056274414, "learning_rate": 1.1098751740125518e-05, "loss": 2.026332664489746, "memory(GiB)": 50.6, "step": 3875, "token_acc": 0.5602605863192183, "train_speed(iter/s)": 0.344668 }, { "epoch": 2.354368932038835, "grad_norm": 5.0532145500183105, "learning_rate": 1.0999147821187378e-05, "loss": 1.9312347412109374, "memory(GiB)": 50.6, "step": 3880, "token_acc": 0.5323076923076923, "train_speed(iter/s)": 0.344607 }, { "epoch": 2.3574029126213594, "grad_norm": 6.357297420501709, "learning_rate": 1.0899937594223225e-05, "loss": 2.2144514083862306, "memory(GiB)": 50.6, "step": 3885, "token_acc": 0.5471204188481675, "train_speed(iter/s)": 0.344619 }, { "epoch": 2.3604368932038833, "grad_norm": 6.343308448791504, "learning_rate": 1.080112206070531e-05, "loss": 1.9430349349975586, "memory(GiB)": 50.6, "step": 3890, "token_acc": 0.5888157894736842, "train_speed(iter/s)": 0.344557 }, { "epoch": 2.3634708737864076, "grad_norm": 8.07403564453125, "learning_rate": 1.070270221812163e-05, "loss": 1.907916259765625, "memory(GiB)": 50.6, "step": 3895, "token_acc": 0.5579937304075235, "train_speed(iter/s)": 0.344571 }, { "epoch": 2.366504854368932, "grad_norm": 6.639111518859863, "learning_rate": 1.0604679059965922e-05, "loss": 1.9934438705444335, "memory(GiB)": 50.6, "step": 3900, "token_acc": 0.5283582089552239, "train_speed(iter/s)": 0.344519 }, { "epoch": 2.3695388349514563, "grad_norm": 5.882744312286377, "learning_rate": 1.050705357572761e-05, "loss": 2.051889991760254, "memory(GiB)": 50.6, "step": 3905, "token_acc": 0.5085714285714286, "train_speed(iter/s)": 0.344542 }, { "epoch": 2.3725728155339807, "grad_norm": 7.371013164520264, "learning_rate": 1.0409826750881824e-05, "loss": 2.107813262939453, "memory(GiB)": 50.6, "step": 3910, "token_acc": 0.5399449035812672, "train_speed(iter/s)": 0.344602 }, { "epoch": 2.375606796116505, "grad_norm": 5.27397346496582, "learning_rate": 1.031299956687941e-05, "loss": 2.1293098449707033, "memory(GiB)": 50.6, "step": 3915, "token_acc": 0.5506849315068493, "train_speed(iter/s)": 0.344645 }, { "epoch": 2.378640776699029, "grad_norm": 5.882137775421143, "learning_rate": 1.0216573001137126e-05, "loss": 2.3114339828491213, "memory(GiB)": 50.6, "step": 3920, "token_acc": 0.4958217270194986, "train_speed(iter/s)": 0.344622 }, { "epoch": 2.3816747572815533, "grad_norm": 4.661534786224365, "learning_rate": 1.0120548027027655e-05, "loss": 1.9488998413085938, "memory(GiB)": 50.6, "step": 3925, "token_acc": 0.5640326975476839, "train_speed(iter/s)": 0.344658 }, { "epoch": 2.3847087378640777, "grad_norm": 5.2108354568481445, "learning_rate": 1.0024925613869874e-05, "loss": 2.0759273529052735, "memory(GiB)": 50.6, "step": 3930, "token_acc": 0.5298013245033113, "train_speed(iter/s)": 0.34465 }, { "epoch": 2.387742718446602, "grad_norm": 5.205474853515625, "learning_rate": 9.929706726919019e-06, "loss": 2.2559911727905275, "memory(GiB)": 50.6, "step": 3935, "token_acc": 0.5112359550561798, "train_speed(iter/s)": 0.344666 }, { "epoch": 2.3907766990291264, "grad_norm": 6.178864002227783, "learning_rate": 9.834892327356909e-06, "loss": 1.9785406112670898, "memory(GiB)": 50.6, "step": 3940, "token_acc": 0.5492957746478874, "train_speed(iter/s)": 0.344639 }, { "epoch": 2.3938106796116507, "grad_norm": 4.592073917388916, "learning_rate": 9.740483372282383e-06, "loss": 1.8985654830932617, "memory(GiB)": 50.6, "step": 3945, "token_acc": 0.5588235294117647, "train_speed(iter/s)": 0.344563 }, { "epoch": 2.3968446601941746, "grad_norm": 7.324235916137695, "learning_rate": 9.646480814701447e-06, "loss": 1.7632612228393554, "memory(GiB)": 50.6, "step": 3950, "token_acc": 0.6046511627906976, "train_speed(iter/s)": 0.344501 }, { "epoch": 2.399878640776699, "grad_norm": 5.564411163330078, "learning_rate": 9.552885603517797e-06, "loss": 1.9235519409179687, "memory(GiB)": 50.6, "step": 3955, "token_acc": 0.5936507936507937, "train_speed(iter/s)": 0.344408 }, { "epoch": 2.4029126213592233, "grad_norm": 4.379534721374512, "learning_rate": 9.459698683523204e-06, "loss": 2.0796350479125976, "memory(GiB)": 50.6, "step": 3960, "token_acc": 0.5470383275261324, "train_speed(iter/s)": 0.344434 }, { "epoch": 2.4059466019417477, "grad_norm": 6.752010345458984, "learning_rate": 9.366920995387901e-06, "loss": 1.7882482528686523, "memory(GiB)": 50.6, "step": 3965, "token_acc": 0.5898876404494382, "train_speed(iter/s)": 0.344417 }, { "epoch": 2.408980582524272, "grad_norm": 4.32159948348999, "learning_rate": 9.274553475651254e-06, "loss": 1.9115240097045898, "memory(GiB)": 50.6, "step": 3970, "token_acc": 0.5867052023121387, "train_speed(iter/s)": 0.344498 }, { "epoch": 2.412014563106796, "grad_norm": 3.9558327198028564, "learning_rate": 9.182597056712111e-06, "loss": 2.1894863128662108, "memory(GiB)": 50.6, "step": 3975, "token_acc": 0.5454545454545454, "train_speed(iter/s)": 0.344506 }, { "epoch": 2.4150485436893203, "grad_norm": 7.3276472091674805, "learning_rate": 9.09105266681954e-06, "loss": 2.0905841827392577, "memory(GiB)": 50.6, "step": 3980, "token_acc": 0.5159420289855072, "train_speed(iter/s)": 0.344349 }, { "epoch": 2.4180825242718447, "grad_norm": 6.179529190063477, "learning_rate": 8.99992123006339e-06, "loss": 1.8323509216308593, "memory(GiB)": 50.6, "step": 3985, "token_acc": 0.5662251655629139, "train_speed(iter/s)": 0.344376 }, { "epoch": 2.421116504854369, "grad_norm": 4.686245918273926, "learning_rate": 8.909203666364957e-06, "loss": 2.0102216720581056, "memory(GiB)": 50.6, "step": 3990, "token_acc": 0.5437262357414449, "train_speed(iter/s)": 0.344379 }, { "epoch": 2.4241504854368934, "grad_norm": 4.411731719970703, "learning_rate": 8.818900891467773e-06, "loss": 2.2799463272094727, "memory(GiB)": 50.6, "step": 3995, "token_acc": 0.5174603174603175, "train_speed(iter/s)": 0.344396 }, { "epoch": 2.4271844660194173, "grad_norm": 4.712284088134766, "learning_rate": 8.729013816928239e-06, "loss": 2.1641727447509767, "memory(GiB)": 50.6, "step": 4000, "token_acc": 0.5325443786982249, "train_speed(iter/s)": 0.344431 }, { "epoch": 2.4271844660194173, "eval_loss": 1.8463255167007446, "eval_runtime": 11.4039, "eval_samples_per_second": 8.769, "eval_steps_per_second": 8.769, "eval_token_acc": 0.5323033707865169, "step": 4000 }, { "epoch": 2.4302184466019416, "grad_norm": 5.290402412414551, "learning_rate": 8.639543350106532e-06, "loss": 1.9997318267822266, "memory(GiB)": 50.6, "step": 4005, "token_acc": 0.5501930501930502, "train_speed(iter/s)": 0.344049 }, { "epoch": 2.433252427184466, "grad_norm": 4.340409278869629, "learning_rate": 8.550490394157417e-06, "loss": 2.0915660858154297, "memory(GiB)": 50.6, "step": 4010, "token_acc": 0.550314465408805, "train_speed(iter/s)": 0.343987 }, { "epoch": 2.4362864077669903, "grad_norm": 6.429068088531494, "learning_rate": 8.46185584802106e-06, "loss": 2.058956527709961, "memory(GiB)": 50.6, "step": 4015, "token_acc": 0.5704918032786885, "train_speed(iter/s)": 0.344036 }, { "epoch": 2.4393203883495147, "grad_norm": 7.687424659729004, "learning_rate": 8.373640606414096e-06, "loss": 2.192945098876953, "memory(GiB)": 50.6, "step": 4020, "token_acc": 0.5291828793774319, "train_speed(iter/s)": 0.343944 }, { "epoch": 2.4423543689320386, "grad_norm": 5.217949867248535, "learning_rate": 8.285845559820427e-06, "loss": 2.209462356567383, "memory(GiB)": 50.6, "step": 4025, "token_acc": 0.5206611570247934, "train_speed(iter/s)": 0.343926 }, { "epoch": 2.445388349514563, "grad_norm": 4.875537872314453, "learning_rate": 8.198471594482376e-06, "loss": 2.2033403396606444, "memory(GiB)": 50.6, "step": 4030, "token_acc": 0.5159420289855072, "train_speed(iter/s)": 0.343885 }, { "epoch": 2.4484223300970873, "grad_norm": 6.047563076019287, "learning_rate": 8.111519592391669e-06, "loss": 2.232436752319336, "memory(GiB)": 50.6, "step": 4035, "token_acc": 0.5225563909774437, "train_speed(iter/s)": 0.343965 }, { "epoch": 2.4514563106796117, "grad_norm": 4.142251014709473, "learning_rate": 8.024990431280543e-06, "loss": 2.2608406066894533, "memory(GiB)": 50.6, "step": 4040, "token_acc": 0.5108695652173914, "train_speed(iter/s)": 0.343903 }, { "epoch": 2.454490291262136, "grad_norm": 6.838778972625732, "learning_rate": 7.93888498461291e-06, "loss": 1.822138214111328, "memory(GiB)": 50.6, "step": 4045, "token_acc": 0.571875, "train_speed(iter/s)": 0.343911 }, { "epoch": 2.4575242718446604, "grad_norm": 5.675113677978516, "learning_rate": 7.853204121575475e-06, "loss": 2.105767822265625, "memory(GiB)": 50.6, "step": 4050, "token_acc": 0.536741214057508, "train_speed(iter/s)": 0.343931 }, { "epoch": 2.4605582524271843, "grad_norm": 4.500559329986572, "learning_rate": 7.76794870706905e-06, "loss": 2.0516807556152346, "memory(GiB)": 50.6, "step": 4055, "token_acc": 0.546448087431694, "train_speed(iter/s)": 0.343791 }, { "epoch": 2.4635922330097086, "grad_norm": 4.457030296325684, "learning_rate": 7.683119601699757e-06, "loss": 2.059749412536621, "memory(GiB)": 50.6, "step": 4060, "token_acc": 0.5271084337349398, "train_speed(iter/s)": 0.343652 }, { "epoch": 2.466626213592233, "grad_norm": 5.025282859802246, "learning_rate": 7.598717661770377e-06, "loss": 1.8586057662963866, "memory(GiB)": 50.6, "step": 4065, "token_acc": 0.5772870662460567, "train_speed(iter/s)": 0.343685 }, { "epoch": 2.4696601941747574, "grad_norm": 4.709406852722168, "learning_rate": 7.514743739271696e-06, "loss": 1.9511306762695313, "memory(GiB)": 50.6, "step": 4070, "token_acc": 0.5512048192771084, "train_speed(iter/s)": 0.343689 }, { "epoch": 2.4726941747572817, "grad_norm": 4.842862129211426, "learning_rate": 7.4311986818738685e-06, "loss": 1.8791656494140625, "memory(GiB)": 50.6, "step": 4075, "token_acc": 0.5798611111111112, "train_speed(iter/s)": 0.343586 }, { "epoch": 2.475728155339806, "grad_norm": 5.616004467010498, "learning_rate": 7.348083332917926e-06, "loss": 2.0656101226806642, "memory(GiB)": 50.6, "step": 4080, "token_acc": 0.543859649122807, "train_speed(iter/s)": 0.343662 }, { "epoch": 2.47876213592233, "grad_norm": 4.97971773147583, "learning_rate": 7.26539853140723e-06, "loss": 2.1649560928344727, "memory(GiB)": 50.6, "step": 4085, "token_acc": 0.5404624277456648, "train_speed(iter/s)": 0.34368 }, { "epoch": 2.4817961165048543, "grad_norm": 4.55922794342041, "learning_rate": 7.1831451119989955e-06, "loss": 2.267315673828125, "memory(GiB)": 50.6, "step": 4090, "token_acc": 0.5028571428571429, "train_speed(iter/s)": 0.343691 }, { "epoch": 2.4848300970873787, "grad_norm": 4.6038289070129395, "learning_rate": 7.1013239049958714e-06, "loss": 1.751937484741211, "memory(GiB)": 50.6, "step": 4095, "token_acc": 0.6020761245674741, "train_speed(iter/s)": 0.343657 }, { "epoch": 2.487864077669903, "grad_norm": 4.496725082397461, "learning_rate": 7.019935736337585e-06, "loss": 2.0439178466796877, "memory(GiB)": 50.6, "step": 4100, "token_acc": 0.5085158150851582, "train_speed(iter/s)": 0.343536 }, { "epoch": 2.4908980582524274, "grad_norm": 7.153860569000244, "learning_rate": 6.938981427592534e-06, "loss": 2.105657768249512, "memory(GiB)": 50.6, "step": 4105, "token_acc": 0.5506329113924051, "train_speed(iter/s)": 0.343499 }, { "epoch": 2.4939320388349513, "grad_norm": 6.969353199005127, "learning_rate": 6.858461795949583e-06, "loss": 2.242031288146973, "memory(GiB)": 50.6, "step": 4110, "token_acc": 0.5148514851485149, "train_speed(iter/s)": 0.343445 }, { "epoch": 2.4969660194174756, "grad_norm": 5.8284406661987305, "learning_rate": 6.778377654209761e-06, "loss": 1.8477231979370117, "memory(GiB)": 50.6, "step": 4115, "token_acc": 0.5765765765765766, "train_speed(iter/s)": 0.343448 }, { "epoch": 2.5, "grad_norm": 5.155461311340332, "learning_rate": 6.698729810778065e-06, "loss": 2.0302501678466798, "memory(GiB)": 50.6, "step": 4120, "token_acc": 0.5272727272727272, "train_speed(iter/s)": 0.343437 }, { "epoch": 2.5030339805825244, "grad_norm": 6.494997024536133, "learning_rate": 6.619519069655322e-06, "loss": 2.1096899032592775, "memory(GiB)": 50.6, "step": 4125, "token_acc": 0.5030674846625767, "train_speed(iter/s)": 0.343419 }, { "epoch": 2.5060679611650487, "grad_norm": 5.513462543487549, "learning_rate": 6.54074623042999e-06, "loss": 1.889683151245117, "memory(GiB)": 50.6, "step": 4130, "token_acc": 0.5728813559322034, "train_speed(iter/s)": 0.343424 }, { "epoch": 2.5091019417475726, "grad_norm": 5.953548431396484, "learning_rate": 6.4624120882702535e-06, "loss": 1.8970447540283204, "memory(GiB)": 50.6, "step": 4135, "token_acc": 0.5863309352517986, "train_speed(iter/s)": 0.343407 }, { "epoch": 2.512135922330097, "grad_norm": 7.94427490234375, "learning_rate": 6.384517433915793e-06, "loss": 2.2867000579833983, "memory(GiB)": 50.6, "step": 4140, "token_acc": 0.5186440677966102, "train_speed(iter/s)": 0.343309 }, { "epoch": 2.5151699029126213, "grad_norm": 7.19397497177124, "learning_rate": 6.30706305366996e-06, "loss": 1.942053985595703, "memory(GiB)": 50.6, "step": 4145, "token_acc": 0.5708955223880597, "train_speed(iter/s)": 0.343273 }, { "epoch": 2.5182038834951457, "grad_norm": 5.771632671356201, "learning_rate": 6.230049729391779e-06, "loss": 1.7596673965454102, "memory(GiB)": 50.6, "step": 4150, "token_acc": 0.6215277777777778, "train_speed(iter/s)": 0.343187 }, { "epoch": 2.52123786407767, "grad_norm": 5.398373603820801, "learning_rate": 6.153478238488019e-06, "loss": 1.7413190841674804, "memory(GiB)": 50.6, "step": 4155, "token_acc": 0.6111111111111112, "train_speed(iter/s)": 0.343126 }, { "epoch": 2.524271844660194, "grad_norm": 4.223130702972412, "learning_rate": 6.077349353905465e-06, "loss": 1.87268009185791, "memory(GiB)": 50.6, "step": 4160, "token_acc": 0.5669781931464174, "train_speed(iter/s)": 0.343151 }, { "epoch": 2.5273058252427183, "grad_norm": 6.797779560089111, "learning_rate": 6.00166384412294e-06, "loss": 1.96234130859375, "memory(GiB)": 50.6, "step": 4165, "token_acc": 0.5709459459459459, "train_speed(iter/s)": 0.343145 }, { "epoch": 2.5303398058252426, "grad_norm": 4.209136009216309, "learning_rate": 5.926422473143717e-06, "loss": 2.101329040527344, "memory(GiB)": 50.6, "step": 4170, "token_acc": 0.5149051490514905, "train_speed(iter/s)": 0.343096 }, { "epoch": 2.533373786407767, "grad_norm": 5.318611145019531, "learning_rate": 5.851626000487714e-06, "loss": 2.2430967330932616, "memory(GiB)": 50.6, "step": 4175, "token_acc": 0.5277777777777778, "train_speed(iter/s)": 0.343074 }, { "epoch": 2.5364077669902914, "grad_norm": 4.254490375518799, "learning_rate": 5.7772751811838165e-06, "loss": 2.113543701171875, "memory(GiB)": 50.6, "step": 4180, "token_acc": 0.529126213592233, "train_speed(iter/s)": 0.343029 }, { "epoch": 2.5394417475728153, "grad_norm": 5.927794456481934, "learning_rate": 5.703370765762345e-06, "loss": 2.20776424407959, "memory(GiB)": 50.6, "step": 4185, "token_acc": 0.5565749235474006, "train_speed(iter/s)": 0.342995 }, { "epoch": 2.54247572815534, "grad_norm": 5.130653381347656, "learning_rate": 5.629913500247364e-06, "loss": 2.2823274612426756, "memory(GiB)": 50.6, "step": 4190, "token_acc": 0.5159420289855072, "train_speed(iter/s)": 0.343035 }, { "epoch": 2.545509708737864, "grad_norm": 4.409907817840576, "learning_rate": 5.556904126149237e-06, "loss": 1.7890758514404297, "memory(GiB)": 50.6, "step": 4195, "token_acc": 0.6131147540983607, "train_speed(iter/s)": 0.343054 }, { "epoch": 2.5485436893203883, "grad_norm": 7.719186782836914, "learning_rate": 5.484343380457125e-06, "loss": 1.792487907409668, "memory(GiB)": 50.6, "step": 4200, "token_acc": 0.5826330532212886, "train_speed(iter/s)": 0.343122 }, { "epoch": 2.5515776699029127, "grad_norm": 5.2396440505981445, "learning_rate": 5.412231995631473e-06, "loss": 2.1427207946777345, "memory(GiB)": 50.6, "step": 4205, "token_acc": 0.524390243902439, "train_speed(iter/s)": 0.343025 }, { "epoch": 2.554611650485437, "grad_norm": 5.071186542510986, "learning_rate": 5.340570699596769e-06, "loss": 1.802716064453125, "memory(GiB)": 50.6, "step": 4210, "token_acc": 0.5798045602605864, "train_speed(iter/s)": 0.343017 }, { "epoch": 2.5576456310679614, "grad_norm": 5.367197036743164, "learning_rate": 5.269360215734026e-06, "loss": 2.1213735580444335, "memory(GiB)": 50.6, "step": 4215, "token_acc": 0.5601374570446735, "train_speed(iter/s)": 0.343062 }, { "epoch": 2.5606796116504853, "grad_norm": 6.4417643547058105, "learning_rate": 5.198601262873593e-06, "loss": 2.0216903686523438, "memory(GiB)": 50.6, "step": 4220, "token_acc": 0.49714285714285716, "train_speed(iter/s)": 0.34305 }, { "epoch": 2.5637135922330097, "grad_norm": 5.322556495666504, "learning_rate": 5.12829455528786e-06, "loss": 1.7803016662597657, "memory(GiB)": 50.6, "step": 4225, "token_acc": 0.60546875, "train_speed(iter/s)": 0.343063 }, { "epoch": 2.566747572815534, "grad_norm": 8.737229347229004, "learning_rate": 5.0584408026840555e-06, "loss": 1.7748079299926758, "memory(GiB)": 50.6, "step": 4230, "token_acc": 0.5626740947075209, "train_speed(iter/s)": 0.343098 }, { "epoch": 2.5697815533980584, "grad_norm": 4.922790050506592, "learning_rate": 4.989040710197068e-06, "loss": 1.7644372940063477, "memory(GiB)": 50.6, "step": 4235, "token_acc": 0.6094276094276094, "train_speed(iter/s)": 0.343019 }, { "epoch": 2.5728155339805827, "grad_norm": 4.8600969314575195, "learning_rate": 4.920094978382339e-06, "loss": 2.0481204986572266, "memory(GiB)": 50.6, "step": 4240, "token_acc": 0.551622418879056, "train_speed(iter/s)": 0.343025 }, { "epoch": 2.5758495145631066, "grad_norm": 4.3228068351745605, "learning_rate": 4.851604303208801e-06, "loss": 1.7788314819335938, "memory(GiB)": 50.6, "step": 4245, "token_acc": 0.5877192982456141, "train_speed(iter/s)": 0.343036 }, { "epoch": 2.578883495145631, "grad_norm": 6.610318660736084, "learning_rate": 4.783569376051833e-06, "loss": 2.1976871490478516, "memory(GiB)": 50.6, "step": 4250, "token_acc": 0.4968553459119497, "train_speed(iter/s)": 0.342995 }, { "epoch": 2.5819174757281553, "grad_norm": 4.59051513671875, "learning_rate": 4.7159908836862994e-06, "loss": 1.9117321014404296, "memory(GiB)": 50.6, "step": 4255, "token_acc": 0.5505952380952381, "train_speed(iter/s)": 0.343047 }, { "epoch": 2.5849514563106797, "grad_norm": 7.418725490570068, "learning_rate": 4.648869508279613e-06, "loss": 1.9904735565185547, "memory(GiB)": 50.6, "step": 4260, "token_acc": 0.5390946502057613, "train_speed(iter/s)": 0.343103 }, { "epoch": 2.587985436893204, "grad_norm": 5.6940693855285645, "learning_rate": 4.582205927384814e-06, "loss": 2.0271472930908203, "memory(GiB)": 50.6, "step": 4265, "token_acc": 0.5282392026578073, "train_speed(iter/s)": 0.343055 }, { "epoch": 2.591019417475728, "grad_norm": 5.059177875518799, "learning_rate": 4.51600081393379e-06, "loss": 1.774452018737793, "memory(GiB)": 50.6, "step": 4270, "token_acc": 0.6190476190476191, "train_speed(iter/s)": 0.343052 }, { "epoch": 2.5940533980582523, "grad_norm": 6.3451247215271, "learning_rate": 4.450254836230449e-06, "loss": 1.775105857849121, "memory(GiB)": 50.6, "step": 4275, "token_acc": 0.6013071895424836, "train_speed(iter/s)": 0.343023 }, { "epoch": 2.5970873786407767, "grad_norm": 6.179014682769775, "learning_rate": 4.384968657943972e-06, "loss": 2.1539392471313477, "memory(GiB)": 50.6, "step": 4280, "token_acc": 0.5183374083129584, "train_speed(iter/s)": 0.342948 }, { "epoch": 2.600121359223301, "grad_norm": 6.580298900604248, "learning_rate": 4.3201429381021285e-06, "loss": 2.038502311706543, "memory(GiB)": 50.6, "step": 4285, "token_acc": 0.525, "train_speed(iter/s)": 0.342918 }, { "epoch": 2.6031553398058254, "grad_norm": 5.3828864097595215, "learning_rate": 4.255778331084609e-06, "loss": 2.141800308227539, "memory(GiB)": 50.6, "step": 4290, "token_acc": 0.5368098159509203, "train_speed(iter/s)": 0.342952 }, { "epoch": 2.6061893203883493, "grad_norm": 5.281216144561768, "learning_rate": 4.1918754866164205e-06, "loss": 2.2497278213500977, "memory(GiB)": 50.6, "step": 4295, "token_acc": 0.4925373134328358, "train_speed(iter/s)": 0.342979 }, { "epoch": 2.6092233009708736, "grad_norm": 6.10798978805542, "learning_rate": 4.1284350497613426e-06, "loss": 2.0990455627441404, "memory(GiB)": 50.6, "step": 4300, "token_acc": 0.5714285714285714, "train_speed(iter/s)": 0.343006 }, { "epoch": 2.612257281553398, "grad_norm": 4.555973529815674, "learning_rate": 4.065457660915401e-06, "loss": 1.7031167984008788, "memory(GiB)": 50.6, "step": 4305, "token_acc": 0.6493506493506493, "train_speed(iter/s)": 0.342957 }, { "epoch": 2.6152912621359223, "grad_norm": 5.801638126373291, "learning_rate": 4.002943955800409e-06, "loss": 2.1469167709350585, "memory(GiB)": 50.6, "step": 4310, "token_acc": 0.5160256410256411, "train_speed(iter/s)": 0.342985 }, { "epoch": 2.6183252427184467, "grad_norm": 4.538209915161133, "learning_rate": 3.94089456545757e-06, "loss": 2.061663055419922, "memory(GiB)": 50.6, "step": 4315, "token_acc": 0.5606469002695418, "train_speed(iter/s)": 0.343102 }, { "epoch": 2.6213592233009706, "grad_norm": 8.23294448852539, "learning_rate": 3.879310116241042e-06, "loss": 2.0767585754394533, "memory(GiB)": 50.6, "step": 4320, "token_acc": 0.5395189003436426, "train_speed(iter/s)": 0.343092 }, { "epoch": 2.6243932038834954, "grad_norm": 7.09895658493042, "learning_rate": 3.818191229811696e-06, "loss": 2.361587142944336, "memory(GiB)": 50.6, "step": 4325, "token_acc": 0.5314685314685315, "train_speed(iter/s)": 0.343065 }, { "epoch": 2.6274271844660193, "grad_norm": 9.5071382522583, "learning_rate": 3.757538523130799e-06, "loss": 2.3378437042236326, "memory(GiB)": 50.6, "step": 4330, "token_acc": 0.5211726384364821, "train_speed(iter/s)": 0.343054 }, { "epoch": 2.6304611650485437, "grad_norm": 6.152956485748291, "learning_rate": 3.697352608453791e-06, "loss": 2.0282224655151366, "memory(GiB)": 50.6, "step": 4335, "token_acc": 0.5164473684210527, "train_speed(iter/s)": 0.34308 }, { "epoch": 2.633495145631068, "grad_norm": 6.063511371612549, "learning_rate": 3.6376340933241104e-06, "loss": 1.7369041442871094, "memory(GiB)": 50.6, "step": 4340, "token_acc": 0.5838709677419355, "train_speed(iter/s)": 0.343086 }, { "epoch": 2.6365291262135924, "grad_norm": 5.01829719543457, "learning_rate": 3.5783835805670183e-06, "loss": 1.9777847290039063, "memory(GiB)": 50.6, "step": 4345, "token_acc": 0.5505952380952381, "train_speed(iter/s)": 0.343142 }, { "epoch": 2.6395631067961167, "grad_norm": 4.848254203796387, "learning_rate": 3.519601668283623e-06, "loss": 2.128549575805664, "memory(GiB)": 50.6, "step": 4350, "token_acc": 0.5505050505050505, "train_speed(iter/s)": 0.34321 }, { "epoch": 2.6425970873786406, "grad_norm": 5.600897312164307, "learning_rate": 3.4612889498447043e-06, "loss": 1.967574691772461, "memory(GiB)": 50.6, "step": 4355, "token_acc": 0.5773195876288659, "train_speed(iter/s)": 0.343184 }, { "epoch": 2.645631067961165, "grad_norm": 6.377498149871826, "learning_rate": 3.40344601388482e-06, "loss": 2.1233291625976562, "memory(GiB)": 50.6, "step": 4360, "token_acc": 0.5705329153605015, "train_speed(iter/s)": 0.343257 }, { "epoch": 2.6486650485436893, "grad_norm": 4.153836727142334, "learning_rate": 3.346073444296338e-06, "loss": 1.9815092086791992, "memory(GiB)": 50.6, "step": 4365, "token_acc": 0.5586592178770949, "train_speed(iter/s)": 0.343295 }, { "epoch": 2.6516990291262137, "grad_norm": 5.802259922027588, "learning_rate": 3.289171820223519e-06, "loss": 1.9684083938598633, "memory(GiB)": 50.6, "step": 4370, "token_acc": 0.5445859872611465, "train_speed(iter/s)": 0.343252 }, { "epoch": 2.654733009708738, "grad_norm": 4.702697277069092, "learning_rate": 3.2327417160567196e-06, "loss": 2.079043388366699, "memory(GiB)": 50.6, "step": 4375, "token_acc": 0.5541795665634675, "train_speed(iter/s)": 0.343281 }, { "epoch": 2.657766990291262, "grad_norm": 6.295142650604248, "learning_rate": 3.176783701426528e-06, "loss": 2.014111137390137, "memory(GiB)": 50.6, "step": 4380, "token_acc": 0.5728813559322034, "train_speed(iter/s)": 0.343317 }, { "epoch": 2.6608009708737863, "grad_norm": 3.821598768234253, "learning_rate": 3.121298341198081e-06, "loss": 1.9962123870849608, "memory(GiB)": 50.6, "step": 4385, "token_acc": 0.5344262295081967, "train_speed(iter/s)": 0.343258 }, { "epoch": 2.6638349514563107, "grad_norm": 6.481614589691162, "learning_rate": 3.0662861954653232e-06, "loss": 1.724216842651367, "memory(GiB)": 50.6, "step": 4390, "token_acc": 0.5787878787878787, "train_speed(iter/s)": 0.343301 }, { "epoch": 2.666868932038835, "grad_norm": 6.811887741088867, "learning_rate": 3.0117478195453353e-06, "loss": 1.8783321380615234, "memory(GiB)": 50.6, "step": 4395, "token_acc": 0.5751633986928104, "train_speed(iter/s)": 0.343223 }, { "epoch": 2.6699029126213594, "grad_norm": 5.35984468460083, "learning_rate": 2.9576837639728073e-06, "loss": 1.9953695297241212, "memory(GiB)": 50.6, "step": 4400, "token_acc": 0.551829268292683, "train_speed(iter/s)": 0.343204 }, { "epoch": 2.6729368932038833, "grad_norm": 5.454570293426514, "learning_rate": 2.9040945744943757e-06, "loss": 1.8858226776123046, "memory(GiB)": 50.6, "step": 4405, "token_acc": 0.5683453237410072, "train_speed(iter/s)": 0.343146 }, { "epoch": 2.6759708737864076, "grad_norm": 5.413751125335693, "learning_rate": 2.850980792063196e-06, "loss": 2.124787712097168, "memory(GiB)": 50.6, "step": 4410, "token_acc": 0.5204819277108433, "train_speed(iter/s)": 0.343224 }, { "epoch": 2.679004854368932, "grad_norm": 6.198506832122803, "learning_rate": 2.798342952833455e-06, "loss": 2.0300464630126953, "memory(GiB)": 50.6, "step": 4415, "token_acc": 0.5505226480836237, "train_speed(iter/s)": 0.343355 }, { "epoch": 2.6820388349514563, "grad_norm": 6.12868070602417, "learning_rate": 2.7461815881549225e-06, "loss": 2.18955020904541, "memory(GiB)": 50.6, "step": 4420, "token_acc": 0.5088967971530249, "train_speed(iter/s)": 0.343336 }, { "epoch": 2.6850728155339807, "grad_norm": 6.154922008514404, "learning_rate": 2.694497224567688e-06, "loss": 1.9575935363769532, "memory(GiB)": 50.6, "step": 4425, "token_acc": 0.5302491103202847, "train_speed(iter/s)": 0.343309 }, { "epoch": 2.6881067961165046, "grad_norm": 4.614657878875732, "learning_rate": 2.6432903837967036e-06, "loss": 2.2028165817260743, "memory(GiB)": 50.6, "step": 4430, "token_acc": 0.5287356321839081, "train_speed(iter/s)": 0.343288 }, { "epoch": 2.6911407766990294, "grad_norm": 4.579322814941406, "learning_rate": 2.5925615827466444e-06, "loss": 1.9925561904907227, "memory(GiB)": 50.6, "step": 4435, "token_acc": 0.5451977401129944, "train_speed(iter/s)": 0.343387 }, { "epoch": 2.6941747572815533, "grad_norm": 5.844526767730713, "learning_rate": 2.542311333496622e-06, "loss": 1.7176218032836914, "memory(GiB)": 50.6, "step": 4440, "token_acc": 0.6280701754385964, "train_speed(iter/s)": 0.34341 }, { "epoch": 2.6972087378640777, "grad_norm": 5.508568286895752, "learning_rate": 2.492540143295036e-06, "loss": 2.3741428375244142, "memory(GiB)": 50.6, "step": 4445, "token_acc": 0.5195195195195195, "train_speed(iter/s)": 0.343488 }, { "epoch": 2.700242718446602, "grad_norm": 4.3963189125061035, "learning_rate": 2.4432485145544527e-06, "loss": 2.1507076263427733, "memory(GiB)": 50.6, "step": 4450, "token_acc": 0.5, "train_speed(iter/s)": 0.343484 }, { "epoch": 2.7032766990291264, "grad_norm": 5.949728965759277, "learning_rate": 2.394436944846523e-06, "loss": 1.9588092803955077, "memory(GiB)": 50.6, "step": 4455, "token_acc": 0.5179856115107914, "train_speed(iter/s)": 0.343532 }, { "epoch": 2.7063106796116507, "grad_norm": 4.5839314460754395, "learning_rate": 2.3461059268969744e-06, "loss": 2.0510395050048826, "memory(GiB)": 50.6, "step": 4460, "token_acc": 0.5755627009646302, "train_speed(iter/s)": 0.343575 }, { "epoch": 2.7093446601941746, "grad_norm": 6.653810501098633, "learning_rate": 2.29825594858063e-06, "loss": 2.081754684448242, "memory(GiB)": 50.6, "step": 4465, "token_acc": 0.5536912751677853, "train_speed(iter/s)": 0.343482 }, { "epoch": 2.712378640776699, "grad_norm": 7.647735595703125, "learning_rate": 2.250887492916487e-06, "loss": 2.182686424255371, "memory(GiB)": 50.6, "step": 4470, "token_acc": 0.5186440677966102, "train_speed(iter/s)": 0.34348 }, { "epoch": 2.7154126213592233, "grad_norm": 6.021912097930908, "learning_rate": 2.204001038062836e-06, "loss": 2.3439844131469725, "memory(GiB)": 50.6, "step": 4475, "token_acc": 0.528052805280528, "train_speed(iter/s)": 0.343509 }, { "epoch": 2.7184466019417477, "grad_norm": 5.825223445892334, "learning_rate": 2.157597057312444e-06, "loss": 1.743293571472168, "memory(GiB)": 50.6, "step": 4480, "token_acc": 0.6242774566473989, "train_speed(iter/s)": 0.343537 }, { "epoch": 2.721480582524272, "grad_norm": 4.740017890930176, "learning_rate": 2.1116760190877437e-06, "loss": 2.289973831176758, "memory(GiB)": 50.6, "step": 4485, "token_acc": 0.5013192612137203, "train_speed(iter/s)": 0.34356 }, { "epoch": 2.724514563106796, "grad_norm": 10.523870468139648, "learning_rate": 2.0662383869361645e-06, "loss": 2.2363439559936524, "memory(GiB)": 50.6, "step": 4490, "token_acc": 0.5310344827586206, "train_speed(iter/s)": 0.343621 }, { "epoch": 2.7275485436893203, "grad_norm": 4.651697635650635, "learning_rate": 2.0212846195253987e-06, "loss": 2.3429183959960938, "memory(GiB)": 50.6, "step": 4495, "token_acc": 0.5176848874598071, "train_speed(iter/s)": 0.343527 }, { "epoch": 2.7305825242718447, "grad_norm": 5.93331241607666, "learning_rate": 1.976815170638802e-06, "loss": 2.079600143432617, "memory(GiB)": 50.6, "step": 4500, "token_acc": 0.565359477124183, "train_speed(iter/s)": 0.343566 }, { "epoch": 2.7305825242718447, "eval_loss": 1.8709077835083008, "eval_runtime": 11.9618, "eval_samples_per_second": 8.36, "eval_steps_per_second": 8.36, "eval_token_acc": 0.5143229166666666, "step": 4500 }, { "epoch": 2.733616504854369, "grad_norm": 5.982638835906982, "learning_rate": 1.9328304891708003e-06, "loss": 2.1300792694091797, "memory(GiB)": 50.6, "step": 4505, "token_acc": 0.5265654648956357, "train_speed(iter/s)": 0.343235 }, { "epoch": 2.7366504854368934, "grad_norm": 5.518435001373291, "learning_rate": 1.8893310191223535e-06, "loss": 2.2343555450439454, "memory(GiB)": 50.6, "step": 4510, "token_acc": 0.5051546391752577, "train_speed(iter/s)": 0.34321 }, { "epoch": 2.7396844660194173, "grad_norm": 4.82221794128418, "learning_rate": 1.8463171995964978e-06, "loss": 1.9464319229125977, "memory(GiB)": 50.6, "step": 4515, "token_acc": 0.5579937304075235, "train_speed(iter/s)": 0.34329 }, { "epoch": 2.7427184466019416, "grad_norm": 6.244739532470703, "learning_rate": 1.8037894647938758e-06, "loss": 1.8333675384521484, "memory(GiB)": 50.6, "step": 4520, "token_acc": 0.5793103448275863, "train_speed(iter/s)": 0.343371 }, { "epoch": 2.745752427184466, "grad_norm": 5.5475382804870605, "learning_rate": 1.7617482440083931e-06, "loss": 2.038858985900879, "memory(GiB)": 50.6, "step": 4525, "token_acc": 0.5127272727272727, "train_speed(iter/s)": 0.34339 }, { "epoch": 2.7487864077669903, "grad_norm": 5.996771812438965, "learning_rate": 1.7201939616228569e-06, "loss": 2.1655454635620117, "memory(GiB)": 50.6, "step": 4530, "token_acc": 0.5552238805970149, "train_speed(iter/s)": 0.343524 }, { "epoch": 2.7518203883495147, "grad_norm": 6.879034519195557, "learning_rate": 1.6791270371046997e-06, "loss": 1.9134620666503905, "memory(GiB)": 50.6, "step": 4535, "token_acc": 0.5653409090909091, "train_speed(iter/s)": 0.343538 }, { "epoch": 2.7548543689320386, "grad_norm": 5.388585090637207, "learning_rate": 1.638547885001762e-06, "loss": 1.992584800720215, "memory(GiB)": 50.6, "step": 4540, "token_acc": 0.5640138408304498, "train_speed(iter/s)": 0.343366 }, { "epoch": 2.757888349514563, "grad_norm": 5.858697414398193, "learning_rate": 1.5984569149380678e-06, "loss": 1.8246040344238281, "memory(GiB)": 50.6, "step": 4545, "token_acc": 0.5719424460431655, "train_speed(iter/s)": 0.343391 }, { "epoch": 2.7609223300970873, "grad_norm": 4.910253047943115, "learning_rate": 1.5588545316097269e-06, "loss": 1.7403861999511718, "memory(GiB)": 50.6, "step": 4550, "token_acc": 0.6114649681528662, "train_speed(iter/s)": 0.34342 }, { "epoch": 2.7639563106796117, "grad_norm": 5.3084235191345215, "learning_rate": 1.51974113478085e-06, "loss": 1.9259815216064453, "memory(GiB)": 50.6, "step": 4555, "token_acc": 0.5428571428571428, "train_speed(iter/s)": 0.343435 }, { "epoch": 2.766990291262136, "grad_norm": 6.652512550354004, "learning_rate": 1.4811171192794627e-06, "loss": 1.8169631958007812, "memory(GiB)": 50.6, "step": 4560, "token_acc": 0.5634674922600619, "train_speed(iter/s)": 0.343331 }, { "epoch": 2.77002427184466, "grad_norm": 5.558200359344482, "learning_rate": 1.4429828749936092e-06, "loss": 1.821939468383789, "memory(GiB)": 50.6, "step": 4565, "token_acc": 0.5714285714285714, "train_speed(iter/s)": 0.343276 }, { "epoch": 2.7730582524271847, "grad_norm": 5.8728790283203125, "learning_rate": 1.4053387868673217e-06, "loss": 1.9860660552978515, "memory(GiB)": 50.6, "step": 4570, "token_acc": 0.5532544378698225, "train_speed(iter/s)": 0.343319 }, { "epoch": 2.7760922330097086, "grad_norm": 4.538073539733887, "learning_rate": 1.368185234896796e-06, "loss": 1.7361505508422852, "memory(GiB)": 50.6, "step": 4575, "token_acc": 0.5862068965517241, "train_speed(iter/s)": 0.343298 }, { "epoch": 2.779126213592233, "grad_norm": 5.922171115875244, "learning_rate": 1.3315225941265386e-06, "loss": 1.6432966232299804, "memory(GiB)": 50.6, "step": 4580, "token_acc": 0.5964285714285714, "train_speed(iter/s)": 0.343254 }, { "epoch": 2.7821601941747574, "grad_norm": 4.502394676208496, "learning_rate": 1.2953512346455643e-06, "loss": 1.9708721160888671, "memory(GiB)": 50.6, "step": 4585, "token_acc": 0.5224274406332454, "train_speed(iter/s)": 0.343346 }, { "epoch": 2.7851941747572817, "grad_norm": 5.806447982788086, "learning_rate": 1.2596715215836996e-06, "loss": 2.084531784057617, "memory(GiB)": 50.6, "step": 4590, "token_acc": 0.531986531986532, "train_speed(iter/s)": 0.343315 }, { "epoch": 2.788228155339806, "grad_norm": 5.160143852233887, "learning_rate": 1.224483815107863e-06, "loss": 1.9821706771850587, "memory(GiB)": 50.6, "step": 4595, "token_acc": 0.5302013422818792, "train_speed(iter/s)": 0.343205 }, { "epoch": 2.79126213592233, "grad_norm": 6.206644535064697, "learning_rate": 1.1897884704184236e-06, "loss": 2.0337772369384766, "memory(GiB)": 50.6, "step": 4600, "token_acc": 0.551829268292683, "train_speed(iter/s)": 0.343222 }, { "epoch": 2.7942961165048543, "grad_norm": 8.812095642089844, "learning_rate": 1.1555858377456596e-06, "loss": 2.3754322052001955, "memory(GiB)": 50.6, "step": 4605, "token_acc": 0.5207547169811321, "train_speed(iter/s)": 0.343271 }, { "epoch": 2.7973300970873787, "grad_norm": 6.804746150970459, "learning_rate": 1.1218762623461666e-06, "loss": 2.0446744918823243, "memory(GiB)": 50.6, "step": 4610, "token_acc": 0.5857605177993528, "train_speed(iter/s)": 0.343278 }, { "epoch": 2.800364077669903, "grad_norm": 7.311898708343506, "learning_rate": 1.0886600844994266e-06, "loss": 2.203943634033203, "memory(GiB)": 50.6, "step": 4615, "token_acc": 0.5364238410596026, "train_speed(iter/s)": 0.343275 }, { "epoch": 2.8033980582524274, "grad_norm": 3.9792897701263428, "learning_rate": 1.0559376395043285e-06, "loss": 1.7650745391845704, "memory(GiB)": 50.6, "step": 4620, "token_acc": 0.6132075471698113, "train_speed(iter/s)": 0.343265 }, { "epoch": 2.8064320388349513, "grad_norm": 3.9612162113189697, "learning_rate": 1.0237092576758034e-06, "loss": 1.9585369110107422, "memory(GiB)": 50.6, "step": 4625, "token_acc": 0.5705521472392638, "train_speed(iter/s)": 0.34332 }, { "epoch": 2.8094660194174756, "grad_norm": 5.008122444152832, "learning_rate": 9.919752643414992e-07, "loss": 1.8453510284423829, "memory(GiB)": 50.6, "step": 4630, "token_acc": 0.5451807228915663, "train_speed(iter/s)": 0.343341 }, { "epoch": 2.8125, "grad_norm": 4.290339469909668, "learning_rate": 9.607359798384785e-07, "loss": 2.1010000228881838, "memory(GiB)": 50.6, "step": 4635, "token_acc": 0.5186246418338109, "train_speed(iter/s)": 0.343371 }, { "epoch": 2.8155339805825244, "grad_norm": 5.311565399169922, "learning_rate": 9.299917195099927e-07, "loss": 1.9364433288574219, "memory(GiB)": 50.6, "step": 4640, "token_acc": 0.5844155844155844, "train_speed(iter/s)": 0.343404 }, { "epoch": 2.8185679611650487, "grad_norm": 7.349529266357422, "learning_rate": 8.997427937023018e-07, "loss": 2.1145442962646483, "memory(GiB)": 50.6, "step": 4645, "token_acc": 0.5551839464882943, "train_speed(iter/s)": 0.343399 }, { "epoch": 2.8216019417475726, "grad_norm": 9.552902221679688, "learning_rate": 8.699895077615316e-07, "loss": 2.0455989837646484, "memory(GiB)": 50.6, "step": 4650, "token_acc": 0.5608974358974359, "train_speed(iter/s)": 0.343317 }, { "epoch": 2.824635922330097, "grad_norm": 4.865658283233643, "learning_rate": 8.407321620306108e-07, "loss": 1.8446220397949218, "memory(GiB)": 50.6, "step": 4655, "token_acc": 0.6019417475728155, "train_speed(iter/s)": 0.343338 }, { "epoch": 2.8276699029126213, "grad_norm": 5.059858322143555, "learning_rate": 8.119710518462164e-07, "loss": 2.0013347625732423, "memory(GiB)": 50.6, "step": 4660, "token_acc": 0.5722543352601156, "train_speed(iter/s)": 0.343307 }, { "epoch": 2.8307038834951457, "grad_norm": 5.6271257400512695, "learning_rate": 7.837064675357997e-07, "loss": 1.8029741287231444, "memory(GiB)": 50.6, "step": 4665, "token_acc": 0.6020066889632107, "train_speed(iter/s)": 0.343277 }, { "epoch": 2.83373786407767, "grad_norm": 3.851916551589966, "learning_rate": 7.559386944146762e-07, "loss": 2.1071809768676757, "memory(GiB)": 50.6, "step": 4670, "token_acc": 0.5684754521963824, "train_speed(iter/s)": 0.343342 }, { "epoch": 2.836771844660194, "grad_norm": 7.78564453125, "learning_rate": 7.28668012783107e-07, "loss": 2.0491716384887697, "memory(GiB)": 50.6, "step": 4675, "token_acc": 0.55, "train_speed(iter/s)": 0.343284 }, { "epoch": 2.8398058252427183, "grad_norm": 4.6299638748168945, "learning_rate": 7.018946979234997e-07, "loss": 1.9229818344116212, "memory(GiB)": 50.6, "step": 4680, "token_acc": 0.5814696485623003, "train_speed(iter/s)": 0.343353 }, { "epoch": 2.8428398058252426, "grad_norm": 8.45301342010498, "learning_rate": 6.756190200976287e-07, "loss": 2.2035804748535157, "memory(GiB)": 50.6, "step": 4685, "token_acc": 0.5, "train_speed(iter/s)": 0.343387 }, { "epoch": 2.845873786407767, "grad_norm": 5.428055763244629, "learning_rate": 6.498412445438751e-07, "loss": 2.035969352722168, "memory(GiB)": 50.6, "step": 4690, "token_acc": 0.5316804407713499, "train_speed(iter/s)": 0.343386 }, { "epoch": 2.8489077669902914, "grad_norm": 4.995771884918213, "learning_rate": 6.245616314746072e-07, "loss": 2.2498245239257812, "memory(GiB)": 50.6, "step": 4695, "token_acc": 0.5194805194805194, "train_speed(iter/s)": 0.343396 }, { "epoch": 2.8519417475728153, "grad_norm": 5.4094696044921875, "learning_rate": 5.997804360734827e-07, "loss": 1.9600257873535156, "memory(GiB)": 50.6, "step": 4700, "token_acc": 0.5555555555555556, "train_speed(iter/s)": 0.343398 }, { "epoch": 2.85497572815534, "grad_norm": 5.476764678955078, "learning_rate": 5.754979084929335e-07, "loss": 1.9202001571655274, "memory(GiB)": 50.6, "step": 4705, "token_acc": 0.5249169435215947, "train_speed(iter/s)": 0.343407 }, { "epoch": 2.858009708737864, "grad_norm": 5.412562370300293, "learning_rate": 5.517142938516074e-07, "loss": 1.6411771774291992, "memory(GiB)": 50.6, "step": 4710, "token_acc": 0.6142857142857143, "train_speed(iter/s)": 0.343454 }, { "epoch": 2.8610436893203883, "grad_norm": 5.939364910125732, "learning_rate": 5.284298322319026e-07, "loss": 1.9455268859863282, "memory(GiB)": 50.6, "step": 4715, "token_acc": 0.5833333333333334, "train_speed(iter/s)": 0.343524 }, { "epoch": 2.8640776699029127, "grad_norm": 5.847973346710205, "learning_rate": 5.056447586775593e-07, "loss": 1.91121826171875, "memory(GiB)": 50.6, "step": 4720, "token_acc": 0.5449101796407185, "train_speed(iter/s)": 0.343579 }, { "epoch": 2.867111650485437, "grad_norm": 6.285111427307129, "learning_rate": 4.833593031912387e-07, "loss": 2.198866271972656, "memory(GiB)": 50.6, "step": 4725, "token_acc": 0.5271565495207667, "train_speed(iter/s)": 0.343581 }, { "epoch": 2.8701456310679614, "grad_norm": 6.059581756591797, "learning_rate": 4.6157369073226984e-07, "loss": 1.9775382995605468, "memory(GiB)": 50.6, "step": 4730, "token_acc": 0.5668789808917197, "train_speed(iter/s)": 0.343631 }, { "epoch": 2.8731796116504853, "grad_norm": 6.790964126586914, "learning_rate": 4.402881412143234e-07, "loss": 2.208084297180176, "memory(GiB)": 50.6, "step": 4735, "token_acc": 0.5373665480427047, "train_speed(iter/s)": 0.343638 }, { "epoch": 2.8762135922330097, "grad_norm": 5.4126081466674805, "learning_rate": 4.1950286950321327e-07, "loss": 1.8405704498291016, "memory(GiB)": 50.6, "step": 4740, "token_acc": 0.541795665634675, "train_speed(iter/s)": 0.343622 }, { "epoch": 2.879247572815534, "grad_norm": 4.3289103507995605, "learning_rate": 3.9921808541474316e-07, "loss": 1.7929954528808594, "memory(GiB)": 50.6, "step": 4745, "token_acc": 0.5727554179566563, "train_speed(iter/s)": 0.343668 }, { "epoch": 2.8822815533980584, "grad_norm": 11.337580680847168, "learning_rate": 3.7943399371254686e-07, "loss": 2.043023109436035, "memory(GiB)": 50.6, "step": 4750, "token_acc": 0.5896551724137931, "train_speed(iter/s)": 0.343674 }, { "epoch": 2.8853155339805827, "grad_norm": 7.296205043792725, "learning_rate": 3.601507941060622e-07, "loss": 1.9796852111816405, "memory(GiB)": 50.6, "step": 4755, "token_acc": 0.5571030640668524, "train_speed(iter/s)": 0.343703 }, { "epoch": 2.8883495145631066, "grad_norm": 5.731447219848633, "learning_rate": 3.41368681248494e-07, "loss": 1.9645160675048827, "memory(GiB)": 50.6, "step": 4760, "token_acc": 0.5179153094462541, "train_speed(iter/s)": 0.343727 }, { "epoch": 2.891383495145631, "grad_norm": 4.664777755737305, "learning_rate": 3.2308784473485956e-07, "loss": 2.353625679016113, "memory(GiB)": 50.6, "step": 4765, "token_acc": 0.5016393442622951, "train_speed(iter/s)": 0.343736 }, { "epoch": 2.8944174757281553, "grad_norm": 5.083872318267822, "learning_rate": 3.053084691000685e-07, "loss": 1.8501104354858398, "memory(GiB)": 50.6, "step": 4770, "token_acc": 0.5844155844155844, "train_speed(iter/s)": 0.343705 }, { "epoch": 2.8974514563106797, "grad_norm": 4.665907382965088, "learning_rate": 2.8803073381704626e-07, "loss": 1.8698816299438477, "memory(GiB)": 50.6, "step": 4775, "token_acc": 0.5827586206896552, "train_speed(iter/s)": 0.343694 }, { "epoch": 2.900485436893204, "grad_norm": 4.781173229217529, "learning_rate": 2.712548132949577e-07, "loss": 1.9878482818603516, "memory(GiB)": 50.6, "step": 4780, "token_acc": 0.5573248407643312, "train_speed(iter/s)": 0.343603 }, { "epoch": 2.903519417475728, "grad_norm": 6.290499210357666, "learning_rate": 2.5498087687741424e-07, "loss": 1.8117362976074218, "memory(GiB)": 50.6, "step": 4785, "token_acc": 0.583941605839416, "train_speed(iter/s)": 0.343624 }, { "epoch": 2.9065533980582523, "grad_norm": 6.009178161621094, "learning_rate": 2.3920908884078053e-07, "loss": 1.7990478515625, "memory(GiB)": 50.6, "step": 4790, "token_acc": 0.5659340659340659, "train_speed(iter/s)": 0.343604 }, { "epoch": 2.9095873786407767, "grad_norm": 5.173957824707031, "learning_rate": 2.239396083925094e-07, "loss": 2.0423053741455077, "memory(GiB)": 50.6, "step": 4795, "token_acc": 0.5399361022364217, "train_speed(iter/s)": 0.343609 }, { "epoch": 2.912621359223301, "grad_norm": 7.16471529006958, "learning_rate": 2.0917258966953733e-07, "loss": 2.215689849853516, "memory(GiB)": 50.6, "step": 4800, "token_acc": 0.5189504373177842, "train_speed(iter/s)": 0.343609 }, { "epoch": 2.9156553398058254, "grad_norm": 4.602297782897949, "learning_rate": 1.9490818173672486e-07, "loss": 2.090025520324707, "memory(GiB)": 50.6, "step": 4805, "token_acc": 0.5120481927710844, "train_speed(iter/s)": 0.343562 }, { "epoch": 2.9186893203883493, "grad_norm": 4.454275608062744, "learning_rate": 1.8114652858536862e-07, "loss": 1.920309066772461, "memory(GiB)": 50.6, "step": 4810, "token_acc": 0.5407407407407407, "train_speed(iter/s)": 0.343546 }, { "epoch": 2.9217233009708736, "grad_norm": 5.600799083709717, "learning_rate": 1.6788776913171932e-07, "loss": 1.9104179382324218, "memory(GiB)": 50.6, "step": 4815, "token_acc": 0.5428571428571428, "train_speed(iter/s)": 0.343574 }, { "epoch": 2.924757281553398, "grad_norm": 8.521369934082031, "learning_rate": 1.5513203721559955e-07, "loss": 2.028363037109375, "memory(GiB)": 50.6, "step": 4820, "token_acc": 0.5513196480938416, "train_speed(iter/s)": 0.343579 }, { "epoch": 2.9277912621359223, "grad_norm": 7.325007915496826, "learning_rate": 1.428794615990603e-07, "loss": 1.8625770568847657, "memory(GiB)": 50.6, "step": 4825, "token_acc": 0.5857605177993528, "train_speed(iter/s)": 0.343551 }, { "epoch": 2.9308252427184467, "grad_norm": 8.010647773742676, "learning_rate": 1.3113016596503769e-07, "loss": 2.0838871002197266, "memory(GiB)": 50.6, "step": 4830, "token_acc": 0.5321428571428571, "train_speed(iter/s)": 0.343522 }, { "epoch": 2.9338592233009706, "grad_norm": 7.505803108215332, "learning_rate": 1.1988426891617054e-07, "loss": 2.1630321502685548, "memory(GiB)": 50.6, "step": 4835, "token_acc": 0.541795665634675, "train_speed(iter/s)": 0.343572 }, { "epoch": 2.9368932038834954, "grad_norm": 6.283768653869629, "learning_rate": 1.0914188397355141e-07, "loss": 1.9630155563354492, "memory(GiB)": 50.6, "step": 4840, "token_acc": 0.5917721518987342, "train_speed(iter/s)": 0.343545 }, { "epoch": 2.9399271844660193, "grad_norm": 6.98088264465332, "learning_rate": 9.890311957559406e-08, "loss": 2.001510238647461, "memory(GiB)": 50.6, "step": 4845, "token_acc": 0.5897435897435898, "train_speed(iter/s)": 0.343526 }, { "epoch": 2.9429611650485437, "grad_norm": 6.317451000213623, "learning_rate": 8.916807907695113e-08, "loss": 2.060597610473633, "memory(GiB)": 50.6, "step": 4850, "token_acc": 0.5375, "train_speed(iter/s)": 0.343579 }, { "epoch": 2.945995145631068, "grad_norm": 4.475234031677246, "learning_rate": 7.993686074744821e-08, "loss": 1.6695629119873048, "memory(GiB)": 50.6, "step": 4855, "token_acc": 0.6179775280898876, "train_speed(iter/s)": 0.343561 }, { "epoch": 2.9490291262135924, "grad_norm": 7.141847133636475, "learning_rate": 7.120955777112914e-08, "loss": 2.255269241333008, "memory(GiB)": 50.6, "step": 4860, "token_acc": 0.5102739726027398, "train_speed(iter/s)": 0.343536 }, { "epoch": 2.9520631067961167, "grad_norm": 4.163337230682373, "learning_rate": 6.298625824527337e-08, "loss": 2.051700210571289, "memory(GiB)": 50.6, "step": 4865, "token_acc": 0.5659340659340659, "train_speed(iter/s)": 0.343508 }, { "epoch": 2.9550970873786406, "grad_norm": 6.36781644821167, "learning_rate": 5.526704517951897e-08, "loss": 1.9743682861328125, "memory(GiB)": 50.6, "step": 4870, "token_acc": 0.5667752442996743, "train_speed(iter/s)": 0.34352 }, { "epoch": 2.958131067961165, "grad_norm": 4.282163619995117, "learning_rate": 4.8051996495052096e-08, "loss": 1.831898307800293, "memory(GiB)": 50.6, "step": 4875, "token_acc": 0.5661538461538461, "train_speed(iter/s)": 0.34355 }, { "epoch": 2.9611650485436893, "grad_norm": 5.308740615844727, "learning_rate": 4.134118502378548e-08, "loss": 1.9177091598510743, "memory(GiB)": 50.6, "step": 4880, "token_acc": 0.5698324022346368, "train_speed(iter/s)": 0.343597 }, { "epoch": 2.9641990291262137, "grad_norm": 4.954492092132568, "learning_rate": 3.5134678507636745e-08, "loss": 1.8720256805419921, "memory(GiB)": 50.6, "step": 4885, "token_acc": 0.5813953488372093, "train_speed(iter/s)": 0.343608 }, { "epoch": 2.967233009708738, "grad_norm": 4.621790885925293, "learning_rate": 2.9432539597851195e-08, "loss": 1.8929941177368164, "memory(GiB)": 50.6, "step": 4890, "token_acc": 0.6056782334384858, "train_speed(iter/s)": 0.343644 }, { "epoch": 2.970266990291262, "grad_norm": 8.908370018005371, "learning_rate": 2.423482585435788e-08, "loss": 2.0634042739868166, "memory(GiB)": 50.6, "step": 4895, "token_acc": 0.5187319884726225, "train_speed(iter/s)": 0.343588 }, { "epoch": 2.9733009708737863, "grad_norm": 5.2636308670043945, "learning_rate": 1.9541589745186717e-08, "loss": 2.0188762664794924, "memory(GiB)": 50.6, "step": 4900, "token_acc": 0.5611111111111111, "train_speed(iter/s)": 0.343613 }, { "epoch": 2.9763349514563107, "grad_norm": 4.989402770996094, "learning_rate": 1.5352878645963352e-08, "loss": 2.3414190292358397, "memory(GiB)": 50.6, "step": 4905, "token_acc": 0.47352941176470587, "train_speed(iter/s)": 0.34365 }, { "epoch": 2.979368932038835, "grad_norm": 5.3599724769592285, "learning_rate": 1.1668734839404006e-08, "loss": 1.681999969482422, "memory(GiB)": 50.6, "step": 4910, "token_acc": 0.6286764705882353, "train_speed(iter/s)": 0.343695 }, { "epoch": 2.9824029126213594, "grad_norm": 5.3972883224487305, "learning_rate": 8.489195514888027e-09, "loss": 2.041118049621582, "memory(GiB)": 50.6, "step": 4915, "token_acc": 0.584192439862543, "train_speed(iter/s)": 0.343722 }, { "epoch": 2.9854368932038833, "grad_norm": 22.398517608642578, "learning_rate": 5.814292768108187e-09, "loss": 2.087300109863281, "memory(GiB)": 50.6, "step": 4920, "token_acc": 0.5209003215434084, "train_speed(iter/s)": 0.343746 }, { "epoch": 2.9884708737864076, "grad_norm": 4.857041358947754, "learning_rate": 3.644053600726505e-09, "loss": 2.011292266845703, "memory(GiB)": 50.6, "step": 4925, "token_acc": 0.5572289156626506, "train_speed(iter/s)": 0.343802 }, { "epoch": 2.991504854368932, "grad_norm": 5.026576042175293, "learning_rate": 1.978499920096688e-09, "loss": 2.1435102462768554, "memory(GiB)": 50.6, "step": 4930, "token_acc": 0.5440251572327044, "train_speed(iter/s)": 0.343891 }, { "epoch": 2.9945388349514563, "grad_norm": 5.564507007598877, "learning_rate": 8.176485390642974e-10, "loss": 1.9260440826416017, "memory(GiB)": 50.6, "step": 4935, "token_acc": 0.5105105105105106, "train_speed(iter/s)": 0.343988 }, { "epoch": 2.9975728155339807, "grad_norm": 4.94175386428833, "learning_rate": 1.6151117577800633e-10, "loss": 1.9196613311767579, "memory(GiB)": 50.6, "step": 4940, "token_acc": 0.5775577557755776, "train_speed(iter/s)": 0.344023 }, { "epoch": 3.0, "eval_loss": 2.129533529281616, "eval_runtime": 12.9052, "eval_samples_per_second": 7.749, "eval_steps_per_second": 7.749, "eval_token_acc": 0.5140758873929009, "step": 4944 } ], "logging_steps": 5, "max_steps": 4944, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2.677863945070756e+17, "train_batch_size": 8, "trial_name": null, "trial_params": null }