{ "best_metric": 0.19586918, "best_model_checkpoint": "/share/project/gsai/kch/output/v9-20250120-041149/checkpoint-614", "epoch": 4.996548223350254, "eval_steps": 500, "global_step": 1535, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.003248730964467005, "grad_norm": 9.773202050760368, "learning_rate": 1.2987012987012988e-06, "loss": 1.5496090650558472, "memory(GiB)": 35.94, "step": 1, "token_acc": 0.7444444444444445, "train_speed(iter/s)": 0.013018 }, { "epoch": 0.016243654822335026, "grad_norm": 9.248677372850217, "learning_rate": 6.493506493506493e-06, "loss": 1.707068681716919, "memory(GiB)": 36.33, "step": 5, "token_acc": 0.7226890756302521, "train_speed(iter/s)": 0.016033 }, { "epoch": 0.03248730964467005, "grad_norm": 7.883848099402922, "learning_rate": 1.2987012987012986e-05, "loss": 1.7282501220703126, "memory(GiB)": 36.33, "step": 10, "token_acc": 0.6761363636363636, "train_speed(iter/s)": 0.016046 }, { "epoch": 0.048730964467005075, "grad_norm": 6.0889640814527155, "learning_rate": 1.9480519480519483e-05, "loss": 1.2976716995239257, "memory(GiB)": 36.33, "step": 15, "token_acc": 0.7266355140186916, "train_speed(iter/s)": 0.015999 }, { "epoch": 0.0649746192893401, "grad_norm": 2.6158303198283113, "learning_rate": 2.5974025974025972e-05, "loss": 0.7637146949768067, "memory(GiB)": 36.33, "step": 20, "token_acc": 0.8190709046454768, "train_speed(iter/s)": 0.016127 }, { "epoch": 0.08121827411167512, "grad_norm": 1.1712343980644169, "learning_rate": 3.246753246753247e-05, "loss": 0.5213486194610596, "memory(GiB)": 36.91, "step": 25, "token_acc": 0.8802992518703242, "train_speed(iter/s)": 0.016159 }, { "epoch": 0.09746192893401015, "grad_norm": 1.4361934956753106, "learning_rate": 3.8961038961038966e-05, "loss": 0.4833333969116211, "memory(GiB)": 36.91, "step": 30, "token_acc": 0.8929440389294404, "train_speed(iter/s)": 0.01608 }, { "epoch": 0.11370558375634518, "grad_norm": 1.1662861682771686, "learning_rate": 4.545454545454546e-05, "loss": 0.4054920196533203, "memory(GiB)": 36.91, "step": 35, "token_acc": 0.8860103626943006, "train_speed(iter/s)": 0.016078 }, { "epoch": 0.1299492385786802, "grad_norm": 1.0429297515235254, "learning_rate": 5.1948051948051944e-05, "loss": 0.43406662940979, "memory(GiB)": 36.91, "step": 40, "token_acc": 0.8708333333333333, "train_speed(iter/s)": 0.016008 }, { "epoch": 0.14619289340101524, "grad_norm": 1.2238750692730618, "learning_rate": 5.844155844155844e-05, "loss": 0.36366307735443115, "memory(GiB)": 36.91, "step": 45, "token_acc": 0.9007832898172323, "train_speed(iter/s)": 0.01607 }, { "epoch": 0.16243654822335024, "grad_norm": 1.2558032464123954, "learning_rate": 6.493506493506494e-05, "loss": 0.327667236328125, "memory(GiB)": 36.91, "step": 50, "token_acc": 0.9095890410958904, "train_speed(iter/s)": 0.016095 }, { "epoch": 0.17868020304568527, "grad_norm": 1.1346516950379935, "learning_rate": 7.142857142857143e-05, "loss": 0.2869602680206299, "memory(GiB)": 36.91, "step": 55, "token_acc": 0.9400428265524625, "train_speed(iter/s)": 0.016146 }, { "epoch": 0.1949238578680203, "grad_norm": 1.062592286052222, "learning_rate": 7.792207792207793e-05, "loss": 0.32817542552948, "memory(GiB)": 36.91, "step": 60, "token_acc": 0.9162162162162162, "train_speed(iter/s)": 0.016208 }, { "epoch": 0.21116751269035533, "grad_norm": 1.0747418170911354, "learning_rate": 8.441558441558442e-05, "loss": 0.3106029987335205, "memory(GiB)": 36.91, "step": 65, "token_acc": 0.8882235528942116, "train_speed(iter/s)": 0.016166 }, { "epoch": 0.22741116751269036, "grad_norm": 1.3626948899821127, "learning_rate": 9.090909090909092e-05, "loss": 0.2963001251220703, "memory(GiB)": 36.91, "step": 70, "token_acc": 0.9046511627906977, "train_speed(iter/s)": 0.016127 }, { "epoch": 0.2436548223350254, "grad_norm": 1.767990529674908, "learning_rate": 9.74025974025974e-05, "loss": 0.30068559646606446, "memory(GiB)": 36.91, "step": 75, "token_acc": 0.9203539823008849, "train_speed(iter/s)": 0.016153 }, { "epoch": 0.2598984771573604, "grad_norm": 1.1682901865357622, "learning_rate": 9.99989553622803e-05, "loss": 0.2741088390350342, "memory(GiB)": 36.91, "step": 80, "token_acc": 0.9041394335511983, "train_speed(iter/s)": 0.016134 }, { "epoch": 0.27614213197969545, "grad_norm": 1.3278362200249414, "learning_rate": 9.999257162318026e-05, "loss": 0.25543942451477053, "memory(GiB)": 36.91, "step": 85, "token_acc": 0.9399538106235565, "train_speed(iter/s)": 0.016119 }, { "epoch": 0.2923857868020305, "grad_norm": 1.1803595161351554, "learning_rate": 9.998038523933224e-05, "loss": 0.3038362503051758, "memory(GiB)": 36.91, "step": 90, "token_acc": 0.9416058394160584, "train_speed(iter/s)": 0.016104 }, { "epoch": 0.3086294416243655, "grad_norm": 1.1025992286590631, "learning_rate": 9.996239762521151e-05, "loss": 0.24188714027404784, "memory(GiB)": 36.91, "step": 95, "token_acc": 0.9402298850574713, "train_speed(iter/s)": 0.016109 }, { "epoch": 0.3248730964467005, "grad_norm": 1.7473005302414135, "learning_rate": 9.993861086864293e-05, "loss": 0.2190408945083618, "memory(GiB)": 36.91, "step": 100, "token_acc": 0.9553349875930521, "train_speed(iter/s)": 0.016079 }, { "epoch": 0.3411167512690355, "grad_norm": 0.9780470952963239, "learning_rate": 9.990902773055866e-05, "loss": 0.22316210269927977, "memory(GiB)": 36.91, "step": 105, "token_acc": 0.9384236453201971, "train_speed(iter/s)": 0.016094 }, { "epoch": 0.35736040609137054, "grad_norm": 1.2071939622104944, "learning_rate": 9.987365164467767e-05, "loss": 0.1844509482383728, "memory(GiB)": 36.91, "step": 110, "token_acc": 0.9557291666666666, "train_speed(iter/s)": 0.016096 }, { "epoch": 0.37360406091370557, "grad_norm": 1.3488873859555934, "learning_rate": 9.983248671710714e-05, "loss": 0.24020743370056152, "memory(GiB)": 36.91, "step": 115, "token_acc": 0.91792656587473, "train_speed(iter/s)": 0.016103 }, { "epoch": 0.3898477157360406, "grad_norm": 1.3346849143090171, "learning_rate": 9.978553772586596e-05, "loss": 0.17928496599197388, "memory(GiB)": 36.91, "step": 120, "token_acc": 0.9523809523809523, "train_speed(iter/s)": 0.016107 }, { "epoch": 0.40609137055837563, "grad_norm": 1.5370257812561328, "learning_rate": 9.973281012033007e-05, "loss": 0.22673540115356444, "memory(GiB)": 36.91, "step": 125, "token_acc": 0.9307692307692308, "train_speed(iter/s)": 0.016132 }, { "epoch": 0.42233502538071066, "grad_norm": 1.564641958549246, "learning_rate": 9.967431002060002e-05, "loss": 0.2366321563720703, "memory(GiB)": 36.91, "step": 130, "token_acc": 0.9309576837416481, "train_speed(iter/s)": 0.016147 }, { "epoch": 0.4385786802030457, "grad_norm": 2.562291519667129, "learning_rate": 9.961004421679047e-05, "loss": 0.1997455835342407, "memory(GiB)": 36.91, "step": 135, "token_acc": 0.9694656488549618, "train_speed(iter/s)": 0.01615 }, { "epoch": 0.4548223350253807, "grad_norm": 1.3505627897575587, "learning_rate": 9.954002016824227e-05, "loss": 0.23050181865692138, "memory(GiB)": 36.91, "step": 140, "token_acc": 0.9395248380129589, "train_speed(iter/s)": 0.016177 }, { "epoch": 0.47106598984771575, "grad_norm": 1.1439093152874722, "learning_rate": 9.946424600265646e-05, "loss": 0.2069091796875, "memory(GiB)": 36.91, "step": 145, "token_acc": 0.9485294117647058, "train_speed(iter/s)": 0.016153 }, { "epoch": 0.4873096446700508, "grad_norm": 1.3223308004820944, "learning_rate": 9.938273051515098e-05, "loss": 0.21799993515014648, "memory(GiB)": 36.91, "step": 150, "token_acc": 0.9325581395348838, "train_speed(iter/s)": 0.016154 }, { "epoch": 0.5035532994923858, "grad_norm": 1.2523275744092777, "learning_rate": 9.929548316723982e-05, "loss": 0.25325832366943357, "memory(GiB)": 36.91, "step": 155, "token_acc": 0.9368421052631579, "train_speed(iter/s)": 0.016145 }, { "epoch": 0.5197969543147208, "grad_norm": 0.9022910796931503, "learning_rate": 9.920251408573483e-05, "loss": 0.2051997184753418, "memory(GiB)": 36.91, "step": 160, "token_acc": 0.9321266968325792, "train_speed(iter/s)": 0.016138 }, { "epoch": 0.5360406091370559, "grad_norm": 1.3630797879167007, "learning_rate": 9.910383406157018e-05, "loss": 0.19534312486648558, "memory(GiB)": 36.91, "step": 165, "token_acc": 0.9489795918367347, "train_speed(iter/s)": 0.016146 }, { "epoch": 0.5522842639593909, "grad_norm": 1.2845653777954962, "learning_rate": 9.899945454855006e-05, "loss": 0.25403494834899903, "memory(GiB)": 36.91, "step": 170, "token_acc": 0.9025974025974026, "train_speed(iter/s)": 0.01615 }, { "epoch": 0.5685279187817259, "grad_norm": 1.2637865638643238, "learning_rate": 9.888938766201907e-05, "loss": 0.21994171142578126, "memory(GiB)": 36.91, "step": 175, "token_acc": 0.9292452830188679, "train_speed(iter/s)": 0.016148 }, { "epoch": 0.584771573604061, "grad_norm": 1.3035045872952578, "learning_rate": 9.877364617745604e-05, "loss": 0.21233229637145995, "memory(GiB)": 36.91, "step": 180, "token_acc": 0.936046511627907, "train_speed(iter/s)": 0.016163 }, { "epoch": 0.601015228426396, "grad_norm": 1.0837997073678936, "learning_rate": 9.865224352899119e-05, "loss": 0.20809760093688964, "memory(GiB)": 36.91, "step": 185, "token_acc": 0.9612403100775194, "train_speed(iter/s)": 0.016158 }, { "epoch": 0.617258883248731, "grad_norm": 1.6131697829206757, "learning_rate": 9.852519380784686e-05, "loss": 0.16450556516647338, "memory(GiB)": 36.91, "step": 190, "token_acc": 0.9518716577540107, "train_speed(iter/s)": 0.01615 }, { "epoch": 0.6335025380710659, "grad_norm": 1.0897399385105642, "learning_rate": 9.839251176070184e-05, "loss": 0.21039419174194335, "memory(GiB)": 36.91, "step": 195, "token_acc": 0.943089430894309, "train_speed(iter/s)": 0.016128 }, { "epoch": 0.649746192893401, "grad_norm": 1.0509670789538326, "learning_rate": 9.825421278797983e-05, "loss": 0.2035764217376709, "memory(GiB)": 36.91, "step": 200, "token_acc": 0.9397260273972603, "train_speed(iter/s)": 0.016132 }, { "epoch": 0.665989847715736, "grad_norm": 1.2329373260124112, "learning_rate": 9.811031294206184e-05, "loss": 0.21548199653625488, "memory(GiB)": 36.91, "step": 205, "token_acc": 0.9368191721132898, "train_speed(iter/s)": 0.01613 }, { "epoch": 0.682233502538071, "grad_norm": 0.8421449582235737, "learning_rate": 9.796082892542302e-05, "loss": 0.166330087184906, "memory(GiB)": 36.91, "step": 210, "token_acc": 0.9555555555555556, "train_speed(iter/s)": 0.016131 }, { "epoch": 0.698477157360406, "grad_norm": 3.430879327858774, "learning_rate": 9.780577808869398e-05, "loss": 0.2193552017211914, "memory(GiB)": 36.91, "step": 215, "token_acc": 0.952020202020202, "train_speed(iter/s)": 0.016136 }, { "epoch": 0.7147208121827411, "grad_norm": 1.5093266746905538, "learning_rate": 9.764517842864696e-05, "loss": 0.21606364250183105, "memory(GiB)": 36.91, "step": 220, "token_acc": 0.9311926605504587, "train_speed(iter/s)": 0.016141 }, { "epoch": 0.7309644670050761, "grad_norm": 1.3437389442959786, "learning_rate": 9.747904858610681e-05, "loss": 0.18983598947525024, "memory(GiB)": 36.91, "step": 225, "token_acc": 0.9449035812672176, "train_speed(iter/s)": 0.016146 }, { "epoch": 0.7472081218274111, "grad_norm": 0.9560028124850986, "learning_rate": 9.730740784378753e-05, "loss": 0.15862367153167725, "memory(GiB)": 36.91, "step": 230, "token_acc": 0.9284009546539379, "train_speed(iter/s)": 0.016153 }, { "epoch": 0.7634517766497462, "grad_norm": 0.9944797001481037, "learning_rate": 9.713027612405395e-05, "loss": 0.2057633638381958, "memory(GiB)": 36.91, "step": 235, "token_acc": 0.9560975609756097, "train_speed(iter/s)": 0.016148 }, { "epoch": 0.7796954314720812, "grad_norm": 1.3080304212648073, "learning_rate": 9.694767398660942e-05, "loss": 0.20023531913757325, "memory(GiB)": 36.91, "step": 240, "token_acc": 0.930835734870317, "train_speed(iter/s)": 0.016152 }, { "epoch": 0.7959390862944162, "grad_norm": 1.0766984239588557, "learning_rate": 9.67596226261095e-05, "loss": 0.17447829246520996, "memory(GiB)": 36.91, "step": 245, "token_acc": 0.9543269230769231, "train_speed(iter/s)": 0.016152 }, { "epoch": 0.8121827411167513, "grad_norm": 1.507367869013474, "learning_rate": 9.656614386970173e-05, "loss": 0.1656266212463379, "memory(GiB)": 36.91, "step": 250, "token_acc": 0.9447368421052632, "train_speed(iter/s)": 0.016157 }, { "epoch": 0.8284263959390863, "grad_norm": 0.9746901508793566, "learning_rate": 9.636726017449236e-05, "loss": 0.1971142530441284, "memory(GiB)": 36.91, "step": 255, "token_acc": 0.9336384439359268, "train_speed(iter/s)": 0.016144 }, { "epoch": 0.8446700507614213, "grad_norm": 1.2090195353569724, "learning_rate": 9.616299462493952e-05, "loss": 0.13225051164627075, "memory(GiB)": 36.91, "step": 260, "token_acc": 0.9502369668246445, "train_speed(iter/s)": 0.016132 }, { "epoch": 0.8609137055837564, "grad_norm": 2.0461505378854024, "learning_rate": 9.595337093017404e-05, "loss": 0.15409984588623046, "memory(GiB)": 36.91, "step": 265, "token_acc": 0.9537444933920705, "train_speed(iter/s)": 0.016136 }, { "epoch": 0.8771573604060914, "grad_norm": 2.0732984340431178, "learning_rate": 9.57384134212473e-05, "loss": 0.21368227005004883, "memory(GiB)": 36.91, "step": 270, "token_acc": 0.9419642857142857, "train_speed(iter/s)": 0.016136 }, { "epoch": 0.8934010152284264, "grad_norm": 0.7925657032904146, "learning_rate": 9.551814704830734e-05, "loss": 0.1758435010910034, "memory(GiB)": 36.91, "step": 275, "token_acc": 0.948509485094851, "train_speed(iter/s)": 0.016143 }, { "epoch": 0.9096446700507614, "grad_norm": 1.493944081608633, "learning_rate": 9.529259737770269e-05, "loss": 0.1807725191116333, "memory(GiB)": 36.91, "step": 280, "token_acc": 0.9431524547803618, "train_speed(iter/s)": 0.016126 }, { "epoch": 0.9258883248730965, "grad_norm": 1.6848601658017734, "learning_rate": 9.506179058901503e-05, "loss": 0.20769875049591063, "memory(GiB)": 36.91, "step": 285, "token_acc": 0.9391304347826087, "train_speed(iter/s)": 0.016132 }, { "epoch": 0.9421319796954315, "grad_norm": 1.3210817601987923, "learning_rate": 9.482575347202047e-05, "loss": 0.162405526638031, "memory(GiB)": 36.91, "step": 290, "token_acc": 0.9507042253521126, "train_speed(iter/s)": 0.016136 }, { "epoch": 0.9583756345177665, "grad_norm": 1.3496077516635223, "learning_rate": 9.458451342358002e-05, "loss": 0.19487454891204833, "memory(GiB)": 36.91, "step": 295, "token_acc": 0.9321608040201005, "train_speed(iter/s)": 0.016132 }, { "epoch": 0.9746192893401016, "grad_norm": 0.990005748680569, "learning_rate": 9.433809844445969e-05, "loss": 0.18303027153015136, "memory(GiB)": 36.91, "step": 300, "token_acc": 0.9073170731707317, "train_speed(iter/s)": 0.016134 }, { "epoch": 0.9908629441624366, "grad_norm": 1.2295422719869937, "learning_rate": 9.40865371360804e-05, "loss": 0.17322018146514892, "memory(GiB)": 36.91, "step": 305, "token_acc": 0.9247311827956989, "train_speed(iter/s)": 0.016141 }, { "epoch": 0.9973604060913706, "eval_loss": 0.1993405520915985, "eval_runtime": 62.0419, "eval_samples_per_second": 3.191, "eval_steps_per_second": 0.806, "eval_token_acc": 0.9332079021636877, "step": 307 }, { "epoch": 1.0095431472081218, "grad_norm": 0.6300057786945967, "learning_rate": 9.382985869719825e-05, "loss": 0.18641979694366456, "memory(GiB)": 36.91, "step": 310, "token_acc": 0.9554234769687965, "train_speed(iter/s)": 0.016081 }, { "epoch": 1.0257868020304568, "grad_norm": 0.6546315853574257, "learning_rate": 9.35680929205154e-05, "loss": 0.09114786386489868, "memory(GiB)": 36.91, "step": 315, "token_acc": 0.972972972972973, "train_speed(iter/s)": 0.016095 }, { "epoch": 1.0420304568527918, "grad_norm": 1.0908662736650971, "learning_rate": 9.330127018922194e-05, "loss": 0.10798045396804809, "memory(GiB)": 36.91, "step": 320, "token_acc": 0.9705093833780161, "train_speed(iter/s)": 0.016104 }, { "epoch": 1.0582741116751269, "grad_norm": 1.3297407747084764, "learning_rate": 9.302942147346945e-05, "loss": 0.1425997495651245, "memory(GiB)": 36.91, "step": 325, "token_acc": 0.9705014749262537, "train_speed(iter/s)": 0.016103 }, { "epoch": 1.074517766497462, "grad_norm": 0.9696985174488663, "learning_rate": 9.275257832677623e-05, "loss": 0.09851968884468079, "memory(GiB)": 36.91, "step": 330, "token_acc": 0.9644549763033176, "train_speed(iter/s)": 0.016115 }, { "epoch": 1.090761421319797, "grad_norm": 0.9656710998245678, "learning_rate": 9.247077288236488e-05, "loss": 0.11144424676895141, "memory(GiB)": 36.91, "step": 335, "token_acc": 0.972568578553616, "train_speed(iter/s)": 0.016119 }, { "epoch": 1.107005076142132, "grad_norm": 2.119365217816497, "learning_rate": 9.21840378494325e-05, "loss": 0.11279252767562867, "memory(GiB)": 36.91, "step": 340, "token_acc": 0.9637462235649547, "train_speed(iter/s)": 0.016124 }, { "epoch": 1.123248730964467, "grad_norm": 1.0607496749665157, "learning_rate": 9.189240650935433e-05, "loss": 0.15501840114593507, "memory(GiB)": 36.91, "step": 345, "token_acc": 0.9662337662337662, "train_speed(iter/s)": 0.016118 }, { "epoch": 1.139492385786802, "grad_norm": 1.1350038539205582, "learning_rate": 9.159591271182058e-05, "loss": 0.12092633247375488, "memory(GiB)": 36.91, "step": 350, "token_acc": 0.9680232558139535, "train_speed(iter/s)": 0.016126 }, { "epoch": 1.155736040609137, "grad_norm": 0.6471881138956326, "learning_rate": 9.129459087090763e-05, "loss": 0.09021483659744263, "memory(GiB)": 36.91, "step": 355, "token_acc": 0.9718670076726342, "train_speed(iter/s)": 0.016119 }, { "epoch": 1.171979695431472, "grad_norm": 0.5557368721254966, "learning_rate": 9.098847596108351e-05, "loss": 0.09125213623046875, "memory(GiB)": 36.91, "step": 360, "token_acc": 0.9772727272727273, "train_speed(iter/s)": 0.01612 }, { "epoch": 1.188223350253807, "grad_norm": 0.8767747521686889, "learning_rate": 9.067760351314838e-05, "loss": 0.10847616195678711, "memory(GiB)": 36.91, "step": 365, "token_acc": 0.9425587467362925, "train_speed(iter/s)": 0.016116 }, { "epoch": 1.2044670050761421, "grad_norm": 0.7043233347928591, "learning_rate": 9.036200961011059e-05, "loss": 0.14046638011932372, "memory(GiB)": 36.91, "step": 370, "token_acc": 0.9632034632034632, "train_speed(iter/s)": 0.016126 }, { "epoch": 1.2207106598984772, "grad_norm": 1.0689456764149206, "learning_rate": 9.004173088299837e-05, "loss": 0.13291985988616944, "memory(GiB)": 36.91, "step": 375, "token_acc": 0.9565217391304348, "train_speed(iter/s)": 0.016139 }, { "epoch": 1.2369543147208122, "grad_norm": 1.3657829465422844, "learning_rate": 8.97168045066082e-05, "loss": 0.11737120151519775, "memory(GiB)": 36.91, "step": 380, "token_acc": 0.973421926910299, "train_speed(iter/s)": 0.01615 }, { "epoch": 1.2531979695431472, "grad_norm": 0.991725434659403, "learning_rate": 8.938726819518977e-05, "loss": 0.1285269021987915, "memory(GiB)": 36.91, "step": 385, "token_acc": 0.97, "train_speed(iter/s)": 0.016149 }, { "epoch": 1.2694416243654822, "grad_norm": 0.7615458350738632, "learning_rate": 8.905316019806868e-05, "loss": 0.08999634981155395, "memory(GiB)": 36.91, "step": 390, "token_acc": 0.9392405063291139, "train_speed(iter/s)": 0.016141 }, { "epoch": 1.2856852791878173, "grad_norm": 1.0176469569030087, "learning_rate": 8.871451929520663e-05, "loss": 0.12240591049194335, "memory(GiB)": 36.91, "step": 395, "token_acc": 0.9611872146118722, "train_speed(iter/s)": 0.016137 }, { "epoch": 1.3019289340101523, "grad_norm": 1.5999057477034428, "learning_rate": 8.837138479270036e-05, "loss": 0.1078599214553833, "memory(GiB)": 36.91, "step": 400, "token_acc": 0.9562982005141388, "train_speed(iter/s)": 0.016137 }, { "epoch": 1.3181725888324873, "grad_norm": 1.8517636831594235, "learning_rate": 8.802379651821938e-05, "loss": 0.14071439504623412, "memory(GiB)": 36.91, "step": 405, "token_acc": 0.9592875318066157, "train_speed(iter/s)": 0.016131 }, { "epoch": 1.3344162436548224, "grad_norm": 1.333329930877741, "learning_rate": 8.767179481638303e-05, "loss": 0.13171937465667724, "memory(GiB)": 36.91, "step": 410, "token_acc": 0.9744897959183674, "train_speed(iter/s)": 0.016135 }, { "epoch": 1.3506598984771574, "grad_norm": 1.1709434640964491, "learning_rate": 8.731542054407793e-05, "loss": 0.10031242370605468, "memory(GiB)": 36.91, "step": 415, "token_acc": 0.9507829977628636, "train_speed(iter/s)": 0.016142 }, { "epoch": 1.3669035532994924, "grad_norm": 0.8550588073511182, "learning_rate": 8.695471506571542e-05, "loss": 0.09321081638336182, "memory(GiB)": 36.91, "step": 420, "token_acc": 0.9667519181585678, "train_speed(iter/s)": 0.016135 }, { "epoch": 1.3831472081218275, "grad_norm": 0.8651388677420173, "learning_rate": 8.658972024843062e-05, "loss": 0.11361520290374756, "memory(GiB)": 36.91, "step": 425, "token_acc": 0.9243243243243243, "train_speed(iter/s)": 0.016131 }, { "epoch": 1.3993908629441625, "grad_norm": 1.1539120381770573, "learning_rate": 8.622047845722275e-05, "loss": 0.11814072132110595, "memory(GiB)": 36.91, "step": 430, "token_acc": 0.9747368421052631, "train_speed(iter/s)": 0.016135 }, { "epoch": 1.4156345177664975, "grad_norm": 0.8277592112279485, "learning_rate": 8.584703255003795e-05, "loss": 0.11146994829177856, "memory(GiB)": 36.91, "step": 435, "token_acc": 0.9720101781170484, "train_speed(iter/s)": 0.016134 }, { "epoch": 1.4318781725888325, "grad_norm": 0.613271329664299, "learning_rate": 8.546942587279465e-05, "loss": 0.09394789338111878, "memory(GiB)": 36.91, "step": 440, "token_acc": 0.9636803874092009, "train_speed(iter/s)": 0.016134 }, { "epoch": 1.4481218274111676, "grad_norm": 1.0271786482031176, "learning_rate": 8.508770225435244e-05, "loss": 0.09493039846420288, "memory(GiB)": 36.91, "step": 445, "token_acc": 0.9743589743589743, "train_speed(iter/s)": 0.016139 }, { "epoch": 1.4643654822335026, "grad_norm": 1.0170609694346187, "learning_rate": 8.470190600142486e-05, "loss": 0.0872123122215271, "memory(GiB)": 36.91, "step": 450, "token_acc": 0.9763779527559056, "train_speed(iter/s)": 0.016139 }, { "epoch": 1.4806091370558376, "grad_norm": 1.6562131424643847, "learning_rate": 8.43120818934367e-05, "loss": 0.12921547889709473, "memory(GiB)": 36.91, "step": 455, "token_acc": 0.9691516709511568, "train_speed(iter/s)": 0.016142 }, { "epoch": 1.4968527918781727, "grad_norm": 1.9551348110028592, "learning_rate": 8.39182751773264e-05, "loss": 0.10002539157867432, "memory(GiB)": 36.91, "step": 460, "token_acc": 0.9665924276169265, "train_speed(iter/s)": 0.016147 }, { "epoch": 1.5130964467005077, "grad_norm": 1.376875063389563, "learning_rate": 8.352053156229438e-05, "loss": 0.0880006194114685, "memory(GiB)": 36.91, "step": 465, "token_acc": 0.958128078817734, "train_speed(iter/s)": 0.016149 }, { "epoch": 1.5293401015228425, "grad_norm": 1.688502126127077, "learning_rate": 8.31188972144974e-05, "loss": 0.08950616717338562, "memory(GiB)": 36.91, "step": 470, "token_acc": 0.96996996996997, "train_speed(iter/s)": 0.016152 }, { "epoch": 1.5455837563451778, "grad_norm": 1.3189009566745062, "learning_rate": 8.27134187516901e-05, "loss": 0.08834458589553833, "memory(GiB)": 36.91, "step": 475, "token_acc": 0.9663865546218487, "train_speed(iter/s)": 0.016152 }, { "epoch": 1.5618274111675126, "grad_norm": 1.4946742975658185, "learning_rate": 8.23041432378141e-05, "loss": 0.14390041828155517, "memory(GiB)": 36.91, "step": 480, "token_acc": 0.9621212121212122, "train_speed(iter/s)": 0.016158 }, { "epoch": 1.5780710659898478, "grad_norm": 1.3879821905262077, "learning_rate": 8.18911181775353e-05, "loss": 0.1267578125, "memory(GiB)": 36.91, "step": 485, "token_acc": 0.9685230024213075, "train_speed(iter/s)": 0.016166 }, { "epoch": 1.5943147208121826, "grad_norm": 1.0017173842059925, "learning_rate": 8.147439151072994e-05, "loss": 0.11637402772903442, "memory(GiB)": 36.91, "step": 490, "token_acc": 0.945031712473573, "train_speed(iter/s)": 0.016169 }, { "epoch": 1.6105583756345179, "grad_norm": 3.517464157304767, "learning_rate": 8.105401160692023e-05, "loss": 0.11228004693984986, "memory(GiB)": 36.91, "step": 495, "token_acc": 0.9544513457556936, "train_speed(iter/s)": 0.016174 }, { "epoch": 1.6268020304568527, "grad_norm": 1.123471909128111, "learning_rate": 8.063002725966015e-05, "loss": 0.1422884702682495, "memory(GiB)": 36.91, "step": 500, "token_acc": 0.9502487562189055, "train_speed(iter/s)": 0.016173 }, { "epoch": 1.643045685279188, "grad_norm": 0.6219224292611003, "learning_rate": 8.020248768087188e-05, "loss": 0.09764043688774109, "memory(GiB)": 36.91, "step": 505, "token_acc": 0.9696312364425163, "train_speed(iter/s)": 0.016171 }, { "epoch": 1.6592893401015227, "grad_norm": 0.6599500918289528, "learning_rate": 7.977144249513391e-05, "loss": 0.11226143836975097, "memory(GiB)": 36.91, "step": 510, "token_acc": 0.9662447257383966, "train_speed(iter/s)": 0.016171 }, { "epoch": 1.675532994923858, "grad_norm": 1.1327299497198065, "learning_rate": 7.93369417339209e-05, "loss": 0.15791513919830322, "memory(GiB)": 36.91, "step": 515, "token_acc": 0.9535962877030162, "train_speed(iter/s)": 0.016172 }, { "epoch": 1.6917766497461928, "grad_norm": 1.246895523664307, "learning_rate": 7.88990358297967e-05, "loss": 0.1254945158958435, "memory(GiB)": 36.91, "step": 520, "token_acc": 0.9494252873563218, "train_speed(iter/s)": 0.016169 }, { "epoch": 1.708020304568528, "grad_norm": 0.7907689981367572, "learning_rate": 7.84577756105606e-05, "loss": 0.11963515281677246, "memory(GiB)": 36.91, "step": 525, "token_acc": 0.9851116625310173, "train_speed(iter/s)": 0.016171 }, { "epoch": 1.7242639593908629, "grad_norm": 0.9327837359999639, "learning_rate": 7.801321229334764e-05, "loss": 0.0870942771434784, "memory(GiB)": 36.91, "step": 530, "token_acc": 0.9618320610687023, "train_speed(iter/s)": 0.01617 }, { "epoch": 1.740507614213198, "grad_norm": 1.0881384151057631, "learning_rate": 7.756539747868394e-05, "loss": 0.08531727194786072, "memory(GiB)": 36.91, "step": 535, "token_acc": 0.9748603351955307, "train_speed(iter/s)": 0.016168 }, { "epoch": 1.756751269035533, "grad_norm": 0.7767069783252919, "learning_rate": 7.71143831444974e-05, "loss": 0.11042824983596802, "memory(GiB)": 36.91, "step": 540, "token_acc": 0.957983193277311, "train_speed(iter/s)": 0.016168 }, { "epoch": 1.7729949238578682, "grad_norm": 1.4499212106775468, "learning_rate": 7.666022164008457e-05, "loss": 0.11432676315307617, "memory(GiB)": 36.91, "step": 545, "token_acc": 0.975, "train_speed(iter/s)": 0.016171 }, { "epoch": 1.789238578680203, "grad_norm": 1.0081688781849556, "learning_rate": 7.620296568003449e-05, "loss": 0.12327454090118409, "memory(GiB)": 36.91, "step": 550, "token_acc": 0.9525, "train_speed(iter/s)": 0.016174 }, { "epoch": 1.8054822335025382, "grad_norm": 0.9935491377578084, "learning_rate": 7.57426683381101e-05, "loss": 0.09574033617973328, "memory(GiB)": 36.91, "step": 555, "token_acc": 0.9694117647058823, "train_speed(iter/s)": 0.016174 }, { "epoch": 1.821725888324873, "grad_norm": 1.0191162814710237, "learning_rate": 7.527938304108795e-05, "loss": 0.10299128293991089, "memory(GiB)": 36.91, "step": 560, "token_acc": 0.9694793536804309, "train_speed(iter/s)": 0.016176 }, { "epoch": 1.8379695431472083, "grad_norm": 1.322632268427317, "learning_rate": 7.481316356255698e-05, "loss": 0.12594590187072754, "memory(GiB)": 36.91, "step": 565, "token_acc": 0.946257197696737, "train_speed(iter/s)": 0.016169 }, { "epoch": 1.854213197969543, "grad_norm": 1.2990436559927216, "learning_rate": 7.434406401667695e-05, "loss": 0.10811959505081177, "memory(GiB)": 36.91, "step": 570, "token_acc": 0.9556650246305419, "train_speed(iter/s)": 0.016173 }, { "epoch": 1.8704568527918781, "grad_norm": 1.141255912127714, "learning_rate": 7.387213885189746e-05, "loss": 0.10128064155578613, "memory(GiB)": 36.91, "step": 575, "token_acc": 0.9654255319148937, "train_speed(iter/s)": 0.016178 }, { "epoch": 1.8867005076142132, "grad_norm": 1.6575287534795722, "learning_rate": 7.339744284463808e-05, "loss": 0.09879794716835022, "memory(GiB)": 36.91, "step": 580, "token_acc": 0.9805555555555555, "train_speed(iter/s)": 0.016182 }, { "epoch": 1.9029441624365482, "grad_norm": 1.1141293923635756, "learning_rate": 7.292003109293048e-05, "loss": 0.0816422462463379, "memory(GiB)": 36.91, "step": 585, "token_acc": 0.961038961038961, "train_speed(iter/s)": 0.016187 }, { "epoch": 1.9191878172588832, "grad_norm": 0.9384463374768481, "learning_rate": 7.243995901002312e-05, "loss": 0.10118494033813477, "memory(GiB)": 36.91, "step": 590, "token_acc": 0.978021978021978, "train_speed(iter/s)": 0.016179 }, { "epoch": 1.9354314720812182, "grad_norm": 1.2458643327317989, "learning_rate": 7.19572823179495e-05, "loss": 0.13551709651947022, "memory(GiB)": 36.91, "step": 595, "token_acc": 0.96, "train_speed(iter/s)": 0.016178 }, { "epoch": 1.9516751269035533, "grad_norm": 1.2473685164472739, "learning_rate": 7.147205704106046e-05, "loss": 0.12769120931625366, "memory(GiB)": 36.91, "step": 600, "token_acc": 0.9561586638830898, "train_speed(iter/s)": 0.016179 }, { "epoch": 1.9679187817258883, "grad_norm": 0.7203387342947396, "learning_rate": 7.098433949952146e-05, "loss": 0.09962844252586364, "memory(GiB)": 36.91, "step": 605, "token_acc": 0.9623115577889447, "train_speed(iter/s)": 0.016178 }, { "epoch": 1.9841624365482233, "grad_norm": 0.9094364008463653, "learning_rate": 7.049418630277542e-05, "loss": 0.10799739360809327, "memory(GiB)": 36.91, "step": 610, "token_acc": 0.9705159705159705, "train_speed(iter/s)": 0.016178 }, { "epoch": 1.9971573604060913, "eval_loss": 0.19586917757987976, "eval_runtime": 62.6829, "eval_samples_per_second": 3.159, "eval_steps_per_second": 0.798, "eval_token_acc": 0.9416745061147695, "step": 614 }, { "epoch": 2.0028426395939087, "grad_norm": 11.787067733742486, "learning_rate": 7.000165434297214e-05, "loss": 0.12140052318572998, "memory(GiB)": 36.91, "step": 615, "token_acc": 0.951048951048951, "train_speed(iter/s)": 0.016146 }, { "epoch": 2.0190862944162435, "grad_norm": 0.9036939767517369, "learning_rate": 6.950680078836474e-05, "loss": 0.0476156622171402, "memory(GiB)": 36.91, "step": 620, "token_acc": 0.9901477832512315, "train_speed(iter/s)": 0.016145 }, { "epoch": 2.035329949238579, "grad_norm": 0.8045933316745676, "learning_rate": 6.900968307667423e-05, "loss": 0.0368287205696106, "memory(GiB)": 36.91, "step": 625, "token_acc": 0.9932584269662922, "train_speed(iter/s)": 0.016144 }, { "epoch": 2.0515736040609136, "grad_norm": 0.9084110351960255, "learning_rate": 6.851035890842259e-05, "loss": 0.03829330801963806, "memory(GiB)": 36.91, "step": 630, "token_acc": 0.9928741092636579, "train_speed(iter/s)": 0.016139 }, { "epoch": 2.067817258883249, "grad_norm": 0.6115130889160721, "learning_rate": 6.800888624023553e-05, "loss": 0.04897831082344055, "memory(GiB)": 36.91, "step": 635, "token_acc": 0.995, "train_speed(iter/s)": 0.016139 }, { "epoch": 2.0840609137055837, "grad_norm": 0.2929609590178906, "learning_rate": 6.750532327811547e-05, "loss": 0.027808183431625368, "memory(GiB)": 36.91, "step": 640, "token_acc": 0.9877750611246944, "train_speed(iter/s)": 0.016139 }, { "epoch": 2.100304568527919, "grad_norm": 1.6659772014622232, "learning_rate": 6.699972847068553e-05, "loss": 0.04012786149978638, "memory(GiB)": 36.91, "step": 645, "token_acc": 0.9892183288409704, "train_speed(iter/s)": 0.016136 }, { "epoch": 2.1165482233502537, "grad_norm": 1.6942318499082378, "learning_rate": 6.649216050240539e-05, "loss": 0.03581180572509766, "memory(GiB)": 36.91, "step": 650, "token_acc": 0.9848866498740554, "train_speed(iter/s)": 0.016138 }, { "epoch": 2.132791878172589, "grad_norm": 1.7750332328595628, "learning_rate": 6.598267828675979e-05, "loss": 0.038441383838653566, "memory(GiB)": 36.91, "step": 655, "token_acc": 0.9860724233983287, "train_speed(iter/s)": 0.016136 }, { "epoch": 2.149035532994924, "grad_norm": 0.948452800180108, "learning_rate": 6.547134095942044e-05, "loss": 0.03809022605419159, "memory(GiB)": 36.91, "step": 660, "token_acc": 0.9917355371900827, "train_speed(iter/s)": 0.016135 }, { "epoch": 2.165279187817259, "grad_norm": 1.185267349759789, "learning_rate": 6.495820787138209e-05, "loss": 0.033171114325523374, "memory(GiB)": 36.91, "step": 665, "token_acc": 0.9947916666666666, "train_speed(iter/s)": 0.016132 }, { "epoch": 2.181522842639594, "grad_norm": 1.1780464513130944, "learning_rate": 6.44433385820737e-05, "loss": 0.03416465222835541, "memory(GiB)": 36.91, "step": 670, "token_acc": 0.9948051948051948, "train_speed(iter/s)": 0.01614 }, { "epoch": 2.197766497461929, "grad_norm": 0.5862751780031482, "learning_rate": 6.392679285244538e-05, "loss": 0.043843358755111694, "memory(GiB)": 36.91, "step": 675, "token_acc": 0.9854014598540146, "train_speed(iter/s)": 0.016137 }, { "epoch": 2.214010152284264, "grad_norm": 0.7314774852745054, "learning_rate": 6.340863063803188e-05, "loss": 0.03051617741584778, "memory(GiB)": 36.91, "step": 680, "token_acc": 0.9970326409495549, "train_speed(iter/s)": 0.016136 }, { "epoch": 2.230253807106599, "grad_norm": 1.4305053109603272, "learning_rate": 6.288891208199353e-05, "loss": 0.03859332203865051, "memory(GiB)": 36.91, "step": 685, "token_acc": 0.9813829787234043, "train_speed(iter/s)": 0.016138 }, { "epoch": 2.246497461928934, "grad_norm": 1.2676862153868658, "learning_rate": 6.23676975081355e-05, "loss": 0.03608715534210205, "memory(GiB)": 36.91, "step": 690, "token_acc": 0.9923076923076923, "train_speed(iter/s)": 0.016143 }, { "epoch": 2.262741116751269, "grad_norm": 0.717797595223322, "learning_rate": 6.184504741390596e-05, "loss": 0.024200823903083802, "memory(GiB)": 36.91, "step": 695, "token_acc": 0.9932885906040269, "train_speed(iter/s)": 0.016142 }, { "epoch": 2.278984771573604, "grad_norm": 1.2738346733999926, "learning_rate": 6.132102246337407e-05, "loss": 0.04924860596656799, "memory(GiB)": 36.91, "step": 700, "token_acc": 0.989769820971867, "train_speed(iter/s)": 0.016144 }, { "epoch": 2.2952284263959393, "grad_norm": 0.9709229547354659, "learning_rate": 6.079568348018882e-05, "loss": 0.04101951122283935, "memory(GiB)": 36.91, "step": 705, "token_acc": 0.9838709677419355, "train_speed(iter/s)": 0.016145 }, { "epoch": 2.311472081218274, "grad_norm": 0.34074159031019935, "learning_rate": 6.02690914405191e-05, "loss": 0.012625060975551605, "memory(GiB)": 36.91, "step": 710, "token_acc": 0.9893162393162394, "train_speed(iter/s)": 0.016143 }, { "epoch": 2.3277157360406093, "grad_norm": 1.405033686903226, "learning_rate": 5.974130746597628e-05, "loss": 0.023314157128334047, "memory(GiB)": 36.91, "step": 715, "token_acc": 0.9845261121856866, "train_speed(iter/s)": 0.016146 }, { "epoch": 2.343959390862944, "grad_norm": 0.393622080479984, "learning_rate": 5.921239281651976e-05, "loss": 0.03884749114513397, "memory(GiB)": 36.91, "step": 720, "token_acc": 0.9844961240310077, "train_speed(iter/s)": 0.016147 }, { "epoch": 2.360203045685279, "grad_norm": 0.8205162732404321, "learning_rate": 5.868240888334653e-05, "loss": 0.0408410519361496, "memory(GiB)": 36.91, "step": 725, "token_acc": 0.9696969696969697, "train_speed(iter/s)": 0.016147 }, { "epoch": 2.376446700507614, "grad_norm": 0.9254262259522679, "learning_rate": 5.815141718176549e-05, "loss": 0.03491292595863342, "memory(GiB)": 36.91, "step": 730, "token_acc": 0.9818731117824774, "train_speed(iter/s)": 0.016148 }, { "epoch": 2.3926903553299494, "grad_norm": 0.4613013276623316, "learning_rate": 5.761947934405736e-05, "loss": 0.041343241930007935, "memory(GiB)": 36.91, "step": 735, "token_acc": 0.9923076923076923, "train_speed(iter/s)": 0.01615 }, { "epoch": 2.4089340101522843, "grad_norm": 0.5995425123829327, "learning_rate": 5.708665711232103e-05, "loss": 0.026265931129455567, "memory(GiB)": 36.91, "step": 740, "token_acc": 0.980225988700565, "train_speed(iter/s)": 0.016147 }, { "epoch": 2.425177664974619, "grad_norm": 0.8947399880614664, "learning_rate": 5.655301233130711e-05, "loss": 0.026338309049606323, "memory(GiB)": 36.91, "step": 745, "token_acc": 0.9891304347826086, "train_speed(iter/s)": 0.01615 }, { "epoch": 2.4414213197969543, "grad_norm": 0.6528954286261448, "learning_rate": 5.6018606941239615e-05, "loss": 0.031349584460258484, "memory(GiB)": 36.91, "step": 750, "token_acc": 0.9825870646766169, "train_speed(iter/s)": 0.016153 }, { "epoch": 2.4576649746192896, "grad_norm": 0.9124965491201447, "learning_rate": 5.548350297062659e-05, "loss": 0.04390305280685425, "memory(GiB)": 36.91, "step": 755, "token_acc": 0.9971181556195965, "train_speed(iter/s)": 0.016158 }, { "epoch": 2.4739086294416244, "grad_norm": 1.2758793187917294, "learning_rate": 5.494776252906036e-05, "loss": 0.03932673335075378, "memory(GiB)": 36.91, "step": 760, "token_acc": 0.9852941176470589, "train_speed(iter/s)": 0.016155 }, { "epoch": 2.490152284263959, "grad_norm": 1.6183527750946778, "learning_rate": 5.44114478000086e-05, "loss": 0.040107494592666625, "memory(GiB)": 36.91, "step": 765, "token_acc": 0.980722891566265, "train_speed(iter/s)": 0.01616 }, { "epoch": 2.5063959390862944, "grad_norm": 0.8155608212943981, "learning_rate": 5.387462103359655e-05, "loss": 0.034613233804702756, "memory(GiB)": 36.91, "step": 770, "token_acc": 0.9809885931558935, "train_speed(iter/s)": 0.016158 }, { "epoch": 2.5226395939086297, "grad_norm": 0.72914335142115, "learning_rate": 5.333734453938174e-05, "loss": 0.03472020030021668, "memory(GiB)": 36.91, "step": 775, "token_acc": 0.980722891566265, "train_speed(iter/s)": 0.016157 }, { "epoch": 2.5388832487309645, "grad_norm": 0.715640193227215, "learning_rate": 5.279968067912161e-05, "loss": 0.03267112672328949, "memory(GiB)": 36.91, "step": 780, "token_acc": 0.9949109414758269, "train_speed(iter/s)": 0.016159 }, { "epoch": 2.5551269035532993, "grad_norm": 0.5201766196940287, "learning_rate": 5.226169185953532e-05, "loss": 0.06324458122253418, "memory(GiB)": 36.91, "step": 785, "token_acc": 0.9822784810126582, "train_speed(iter/s)": 0.016157 }, { "epoch": 2.5713705583756346, "grad_norm": 0.716527670309396, "learning_rate": 5.1723440525060026e-05, "loss": 0.036973622441291806, "memory(GiB)": 36.91, "step": 790, "token_acc": 0.9828009828009828, "train_speed(iter/s)": 0.016157 }, { "epoch": 2.58761421319797, "grad_norm": 0.9508048665101771, "learning_rate": 5.118498915060307e-05, "loss": 0.04134515523910522, "memory(GiB)": 36.91, "step": 795, "token_acc": 0.9832402234636871, "train_speed(iter/s)": 0.016159 }, { "epoch": 2.6038578680203046, "grad_norm": 0.1695737988935869, "learning_rate": 5.064640023429043e-05, "loss": 0.0396234929561615, "memory(GiB)": 36.91, "step": 800, "token_acc": 0.9937888198757764, "train_speed(iter/s)": 0.01616 }, { "epoch": 2.6201015228426394, "grad_norm": 1.353410357397197, "learning_rate": 5.0107736290212603e-05, "loss": 0.032366597652435304, "memory(GiB)": 36.91, "step": 805, "token_acc": 0.9853658536585366, "train_speed(iter/s)": 0.016161 }, { "epoch": 2.6363451776649747, "grad_norm": 0.9287301884362714, "learning_rate": 4.956905984116858e-05, "loss": 0.02025129795074463, "memory(GiB)": 36.91, "step": 810, "token_acc": 1.0, "train_speed(iter/s)": 0.016156 }, { "epoch": 2.65258883248731, "grad_norm": 0.6605215469870417, "learning_rate": 4.903043341140879e-05, "loss": 0.027498137950897217, "memory(GiB)": 36.91, "step": 815, "token_acc": 0.9890590809628009, "train_speed(iter/s)": 0.016158 }, { "epoch": 2.6688324873096447, "grad_norm": 1.284202747583917, "learning_rate": 4.84919195193779e-05, "loss": 0.04052730202674866, "memory(GiB)": 36.91, "step": 820, "token_acc": 0.9691714836223507, "train_speed(iter/s)": 0.016161 }, { "epoch": 2.6850761421319795, "grad_norm": 1.054572423840406, "learning_rate": 4.7953580670458345e-05, "loss": 0.029700332880020143, "memory(GiB)": 36.91, "step": 825, "token_acc": 0.9903381642512077, "train_speed(iter/s)": 0.016161 }, { "epoch": 2.701319796954315, "grad_norm": 1.515148160249309, "learning_rate": 4.7415479349715275e-05, "loss": 0.03995212614536285, "memory(GiB)": 36.91, "step": 830, "token_acc": 0.9887005649717514, "train_speed(iter/s)": 0.016163 }, { "epoch": 2.7175634517766496, "grad_norm": 0.7966857436927859, "learning_rate": 4.687767801464388e-05, "loss": 0.029492130875587462, "memory(GiB)": 36.91, "step": 835, "token_acc": 0.9946091644204852, "train_speed(iter/s)": 0.016162 }, { "epoch": 2.733807106598985, "grad_norm": 0.6747809015160623, "learning_rate": 4.634023908791999e-05, "loss": 0.028040975332260132, "memory(GiB)": 36.91, "step": 840, "token_acc": 0.9950372208436724, "train_speed(iter/s)": 0.016165 }, { "epoch": 2.7500507614213197, "grad_norm": 0.7236373548114289, "learning_rate": 4.5803224950154656e-05, "loss": 0.022182533144950868, "memory(GiB)": 36.91, "step": 845, "token_acc": 0.9973753280839895, "train_speed(iter/s)": 0.016167 }, { "epoch": 2.766294416243655, "grad_norm": 0.8702609694851884, "learning_rate": 4.5266697932653616e-05, "loss": 0.03542717695236206, "memory(GiB)": 36.91, "step": 850, "token_acc": 0.9930394431554525, "train_speed(iter/s)": 0.016168 }, { "epoch": 2.7825380710659897, "grad_norm": 0.2339976820774803, "learning_rate": 4.473072031018248e-05, "loss": 0.017447268962860106, "memory(GiB)": 36.91, "step": 855, "token_acc": 0.9897172236503856, "train_speed(iter/s)": 0.016172 }, { "epoch": 2.798781725888325, "grad_norm": 1.7564108472908913, "learning_rate": 4.4195354293738484e-05, "loss": 0.040924933552742, "memory(GiB)": 36.91, "step": 860, "token_acc": 0.9693396226415094, "train_speed(iter/s)": 0.016172 }, { "epoch": 2.8150253807106598, "grad_norm": 1.749637468786309, "learning_rate": 4.366066202332974e-05, "loss": 0.0398847758769989, "memory(GiB)": 36.91, "step": 865, "token_acc": 0.9884726224783862, "train_speed(iter/s)": 0.016173 }, { "epoch": 2.831269035532995, "grad_norm": 1.6657986428559317, "learning_rate": 4.312670556076244e-05, "loss": 0.027478563785552978, "memory(GiB)": 36.91, "step": 870, "token_acc": 0.9953379953379954, "train_speed(iter/s)": 0.016178 }, { "epoch": 2.84751269035533, "grad_norm": 0.8830417040757416, "learning_rate": 4.259354688243757e-05, "loss": 0.05422350764274597, "memory(GiB)": 36.91, "step": 875, "token_acc": 0.9813953488372092, "train_speed(iter/s)": 0.016176 }, { "epoch": 2.863756345177665, "grad_norm": 1.4037166255295264, "learning_rate": 4.206124787215714e-05, "loss": 0.03585241138935089, "memory(GiB)": 36.91, "step": 880, "token_acc": 0.9929577464788732, "train_speed(iter/s)": 0.016178 }, { "epoch": 2.88, "grad_norm": 0.40929439648007787, "learning_rate": 4.1529870313941386e-05, "loss": 0.037713998556137086, "memory(GiB)": 36.91, "step": 885, "token_acc": 0.9755555555555555, "train_speed(iter/s)": 0.016182 }, { "epoch": 2.896243654822335, "grad_norm": 0.5649136450093045, "learning_rate": 4.099947588485744e-05, "loss": 0.02235218584537506, "memory(GiB)": 36.91, "step": 890, "token_acc": 0.9738562091503268, "train_speed(iter/s)": 0.016179 }, { "epoch": 2.91248730964467, "grad_norm": 0.9411441260021843, "learning_rate": 4.047012614786055e-05, "loss": 0.03756971955299378, "memory(GiB)": 36.91, "step": 895, "token_acc": 0.9953596287703016, "train_speed(iter/s)": 0.016182 }, { "epoch": 2.928730964467005, "grad_norm": 0.493632814272918, "learning_rate": 3.994188254464838e-05, "loss": 0.03068949580192566, "memory(GiB)": 36.91, "step": 900, "token_acc": 0.9681372549019608, "train_speed(iter/s)": 0.016183 }, { "epoch": 2.94497461928934, "grad_norm": 0.9098057371042104, "learning_rate": 3.941480638852948e-05, "loss": 0.060313427448272706, "memory(GiB)": 36.91, "step": 905, "token_acc": 0.9809976247030879, "train_speed(iter/s)": 0.016186 }, { "epoch": 2.9612182741116753, "grad_norm": 0.7111307711774197, "learning_rate": 3.888895885730666e-05, "loss": 0.017010049521923067, "memory(GiB)": 36.91, "step": 910, "token_acc": 0.9949748743718593, "train_speed(iter/s)": 0.016184 }, { "epoch": 2.97746192893401, "grad_norm": 1.1085076966021257, "learning_rate": 3.836440098617611e-05, "loss": 0.0352476716041565, "memory(GiB)": 36.91, "step": 915, "token_acc": 0.9971264367816092, "train_speed(iter/s)": 0.016185 }, { "epoch": 2.9937055837563453, "grad_norm": 1.0414881730973389, "learning_rate": 3.784119366064293e-05, "loss": 0.036097651720046996, "memory(GiB)": 36.91, "step": 920, "token_acc": 0.9859484777517564, "train_speed(iter/s)": 0.016183 }, { "epoch": 2.996954314720812, "eval_loss": 0.2438431978225708, "eval_runtime": 61.9093, "eval_samples_per_second": 3.198, "eval_steps_per_second": 0.808, "eval_token_acc": 0.9426152398871119, "step": 921 }, { "epoch": 3.0123857868020303, "grad_norm": 0.40292122284066784, "learning_rate": 3.731939760945423e-05, "loss": 0.02739437222480774, "memory(GiB)": 36.91, "step": 925, "token_acc": 0.9686609686609686, "train_speed(iter/s)": 0.016163 }, { "epoch": 3.0286294416243655, "grad_norm": 2.9493043319197345, "learning_rate": 3.6799073397550324e-05, "loss": 0.023541851341724394, "memory(GiB)": 36.91, "step": 930, "token_acc": 1.0, "train_speed(iter/s)": 0.016168 }, { "epoch": 3.0448730964467003, "grad_norm": 0.17930096671859505, "learning_rate": 3.628028141903493e-05, "loss": 0.011585032194852829, "memory(GiB)": 36.91, "step": 935, "token_acc": 0.9955849889624724, "train_speed(iter/s)": 0.016168 }, { "epoch": 3.0611167512690356, "grad_norm": 0.32421421634457975, "learning_rate": 3.576308189016521e-05, "loss": 0.01218060329556465, "memory(GiB)": 36.91, "step": 940, "token_acc": 1.0, "train_speed(iter/s)": 0.016169 }, { "epoch": 3.0773604060913704, "grad_norm": 0.6594419595560748, "learning_rate": 3.5247534842362486e-05, "loss": 0.02207506597042084, "memory(GiB)": 36.91, "step": 945, "token_acc": 0.988558352402746, "train_speed(iter/s)": 0.016162 }, { "epoch": 3.0936040609137057, "grad_norm": 0.2767332960437252, "learning_rate": 3.473370011524435e-05, "loss": 0.007218687236309052, "memory(GiB)": 36.91, "step": 950, "token_acc": 1.0, "train_speed(iter/s)": 0.016165 }, { "epoch": 3.1098477157360405, "grad_norm": 0.35071543831944074, "learning_rate": 3.422163734967913e-05, "loss": 0.01153595745563507, "memory(GiB)": 36.91, "step": 955, "token_acc": 1.0, "train_speed(iter/s)": 0.016173 }, { "epoch": 3.1260913705583757, "grad_norm": 0.09053944993100493, "learning_rate": 3.371140598086332e-05, "loss": 0.0028192587196826935, "memory(GiB)": 36.91, "step": 960, "token_acc": 0.9975247524752475, "train_speed(iter/s)": 0.016172 }, { "epoch": 3.1423350253807105, "grad_norm": 0.2428779518534084, "learning_rate": 3.3203065231422904e-05, "loss": 0.0033150166273117065, "memory(GiB)": 36.91, "step": 965, "token_acc": 1.0, "train_speed(iter/s)": 0.016172 }, { "epoch": 3.1585786802030458, "grad_norm": 0.3634314044068558, "learning_rate": 3.269667410453944e-05, "loss": 0.006601892411708832, "memory(GiB)": 36.91, "step": 970, "token_acc": 0.9974160206718347, "train_speed(iter/s)": 0.016171 }, { "epoch": 3.1748223350253806, "grad_norm": 0.09528591509222967, "learning_rate": 3.2192291377101544e-05, "loss": 0.006571587175130844, "memory(GiB)": 36.91, "step": 975, "token_acc": 1.0, "train_speed(iter/s)": 0.016173 }, { "epoch": 3.191065989847716, "grad_norm": 1.3857004471442305, "learning_rate": 3.1689975592882603e-05, "loss": 0.010420820116996765, "memory(GiB)": 36.91, "step": 980, "token_acc": 1.0, "train_speed(iter/s)": 0.016176 }, { "epoch": 3.2073096446700506, "grad_norm": 0.6960737288379213, "learning_rate": 3.11897850557456e-05, "loss": 0.013220900297164917, "memory(GiB)": 36.91, "step": 985, "token_acc": 0.9951807228915662, "train_speed(iter/s)": 0.016176 }, { "epoch": 3.223553299492386, "grad_norm": 0.9453732221306024, "learning_rate": 3.0691777822875846e-05, "loss": 0.01793895959854126, "memory(GiB)": 36.91, "step": 990, "token_acc": 1.0, "train_speed(iter/s)": 0.016176 }, { "epoch": 3.2397969543147207, "grad_norm": 0.7409992990444315, "learning_rate": 3.019601169804216e-05, "loss": 0.019229742884635925, "memory(GiB)": 36.91, "step": 995, "token_acc": 0.9945054945054945, "train_speed(iter/s)": 0.016174 }, { "epoch": 3.256040609137056, "grad_norm": 0.5679417621370911, "learning_rate": 2.9702544224887684e-05, "loss": 0.024555668234825134, "memory(GiB)": 36.91, "step": 1000, "token_acc": 0.9953161592505855, "train_speed(iter/s)": 0.016175 }, { "epoch": 3.2722842639593908, "grad_norm": 0.08818412948467023, "learning_rate": 2.9211432680250717e-05, "loss": 0.009600495547056198, "memory(GiB)": 36.91, "step": 1005, "token_acc": 1.0, "train_speed(iter/s)": 0.016176 }, { "epoch": 3.288527918781726, "grad_norm": 0.597788232010352, "learning_rate": 2.872273406751664e-05, "loss": 0.015477313101291657, "memory(GiB)": 36.91, "step": 1010, "token_acc": 1.0, "train_speed(iter/s)": 0.016178 }, { "epoch": 3.304771573604061, "grad_norm": 0.5815875303347526, "learning_rate": 2.823650511000142e-05, "loss": 0.007314224541187286, "memory(GiB)": 36.91, "step": 1015, "token_acc": 0.9928741092636579, "train_speed(iter/s)": 0.016175 }, { "epoch": 3.321015228426396, "grad_norm": 0.06303638116527722, "learning_rate": 2.7752802244367875e-05, "loss": 0.0048162821680307385, "memory(GiB)": 36.91, "step": 1020, "token_acc": 0.9976359338061466, "train_speed(iter/s)": 0.016175 }, { "epoch": 3.337258883248731, "grad_norm": 1.530822467857818, "learning_rate": 2.7271681614074973e-05, "loss": 0.011756302416324615, "memory(GiB)": 36.91, "step": 1025, "token_acc": 0.9976744186046511, "train_speed(iter/s)": 0.016173 }, { "epoch": 3.353502538071066, "grad_norm": 0.03790601751186608, "learning_rate": 2.679319906286122e-05, "loss": 0.008612405508756638, "memory(GiB)": 36.91, "step": 1030, "token_acc": 0.9927184466019418, "train_speed(iter/s)": 0.016176 }, { "epoch": 3.369746192893401, "grad_norm": 0.21401768725028367, "learning_rate": 2.6317410128262954e-05, "loss": 0.006316320598125457, "memory(GiB)": 36.91, "step": 1035, "token_acc": 0.9950124688279302, "train_speed(iter/s)": 0.016179 }, { "epoch": 3.385989847715736, "grad_norm": 0.19540220508166592, "learning_rate": 2.5844370035168073e-05, "loss": 0.004939628392457962, "memory(GiB)": 36.91, "step": 1040, "token_acc": 1.0, "train_speed(iter/s)": 0.016181 }, { "epoch": 3.402233502538071, "grad_norm": 0.8965894055639708, "learning_rate": 2.537413368940601e-05, "loss": 0.016151268780231477, "memory(GiB)": 36.91, "step": 1045, "token_acc": 0.9898785425101214, "train_speed(iter/s)": 0.016182 }, { "epoch": 3.4184771573604062, "grad_norm": 0.21427146738429803, "learning_rate": 2.4906755671374903e-05, "loss": 0.010773959755897521, "memory(GiB)": 36.91, "step": 1050, "token_acc": 0.9977827050997783, "train_speed(iter/s)": 0.016182 }, { "epoch": 3.434720812182741, "grad_norm": 0.09286838269357345, "learning_rate": 2.4442290229706344e-05, "loss": 0.004091666638851165, "memory(GiB)": 36.91, "step": 1055, "token_acc": 0.9954233409610984, "train_speed(iter/s)": 0.016183 }, { "epoch": 3.4509644670050763, "grad_norm": 0.13489614133107514, "learning_rate": 2.3980791274968837e-05, "loss": 0.018990179896354674, "memory(GiB)": 36.91, "step": 1060, "token_acc": 0.9945054945054945, "train_speed(iter/s)": 0.016184 }, { "epoch": 3.467208121827411, "grad_norm": 0.1825955700626613, "learning_rate": 2.3522312373410276e-05, "loss": 0.011526491492986679, "memory(GiB)": 36.91, "step": 1065, "token_acc": 0.997275204359673, "train_speed(iter/s)": 0.016188 }, { "epoch": 3.4834517766497464, "grad_norm": 0.2440094791459664, "learning_rate": 2.3066906740740623e-05, "loss": 0.019795812666416168, "memory(GiB)": 36.91, "step": 1070, "token_acc": 0.9896373056994818, "train_speed(iter/s)": 0.016187 }, { "epoch": 3.499695431472081, "grad_norm": 0.4913730237430669, "learning_rate": 2.2614627235955026e-05, "loss": 0.007270602881908417, "memory(GiB)": 36.91, "step": 1075, "token_acc": 1.0, "train_speed(iter/s)": 0.016189 }, { "epoch": 3.5159390862944164, "grad_norm": 0.6922284750457558, "learning_rate": 2.2165526355198605e-05, "loss": 0.0127563938498497, "memory(GiB)": 36.91, "step": 1080, "token_acc": 1.0, "train_speed(iter/s)": 0.016191 }, { "epoch": 3.5321827411167512, "grad_norm": 0.6450602563278425, "learning_rate": 2.171965622567308e-05, "loss": 0.007853203266859055, "memory(GiB)": 36.91, "step": 1085, "token_acc": 1.0, "train_speed(iter/s)": 0.016193 }, { "epoch": 3.548426395939086, "grad_norm": 0.3234875973475892, "learning_rate": 2.127706859958647e-05, "loss": 0.008352670073509216, "memory(GiB)": 36.91, "step": 1090, "token_acc": 1.0, "train_speed(iter/s)": 0.016193 }, { "epoch": 3.5646700507614213, "grad_norm": 0.09371017997182811, "learning_rate": 2.0837814848146166e-05, "loss": 0.001982194371521473, "memory(GiB)": 36.91, "step": 1095, "token_acc": 1.0, "train_speed(iter/s)": 0.016191 }, { "epoch": 3.5809137055837565, "grad_norm": 0.8724610494447905, "learning_rate": 2.0401945955596206e-05, "loss": 0.0030656153336167335, "memory(GiB)": 36.91, "step": 1100, "token_acc": 1.0, "train_speed(iter/s)": 0.016188 }, { "epoch": 3.5971573604060914, "grad_norm": 0.5650605008223917, "learning_rate": 1.9969512513299664e-05, "loss": 0.00554112084209919, "memory(GiB)": 36.91, "step": 1105, "token_acc": 1.0, "train_speed(iter/s)": 0.01619 }, { "epoch": 3.613401015228426, "grad_norm": 0.39939968413297244, "learning_rate": 1.9540564713866387e-05, "loss": 0.006034587323665619, "memory(GiB)": 36.91, "step": 1110, "token_acc": 0.9948586118251928, "train_speed(iter/s)": 0.016191 }, { "epoch": 3.6296446700507614, "grad_norm": 0.1065247660653177, "learning_rate": 1.9115152345327152e-05, "loss": 0.005482121184468269, "memory(GiB)": 36.91, "step": 1115, "token_acc": 1.0, "train_speed(iter/s)": 0.016191 }, { "epoch": 3.6458883248730967, "grad_norm": 0.8174090560458377, "learning_rate": 1.8693324785354822e-05, "loss": 0.011324305832386018, "memory(GiB)": 36.91, "step": 1120, "token_acc": 1.0, "train_speed(iter/s)": 0.016193 }, { "epoch": 3.6621319796954315, "grad_norm": 0.17850770204119407, "learning_rate": 1.8275130995532974e-05, "loss": 0.0144767165184021, "memory(GiB)": 36.91, "step": 1125, "token_acc": 0.9978586723768736, "train_speed(iter/s)": 0.016195 }, { "epoch": 3.6783756345177663, "grad_norm": 0.33877743892749795, "learning_rate": 1.7860619515673033e-05, "loss": 0.01116895154118538, "memory(GiB)": 36.91, "step": 1130, "token_acc": 0.9953271028037384, "train_speed(iter/s)": 0.016195 }, { "epoch": 3.6946192893401015, "grad_norm": 0.5168488777536275, "learning_rate": 1.744983845818019e-05, "loss": 0.0068625412881374356, "memory(GiB)": 36.91, "step": 1135, "token_acc": 0.9978213507625272, "train_speed(iter/s)": 0.0162 }, { "epoch": 3.710862944162437, "grad_norm": 0.7346145409084535, "learning_rate": 1.7042835502468934e-05, "loss": 0.002322973683476448, "memory(GiB)": 36.91, "step": 1140, "token_acc": 1.0, "train_speed(iter/s)": 0.016197 }, { "epoch": 3.7271065989847716, "grad_norm": 0.6646625028373466, "learning_rate": 1.6639657889429018e-05, "loss": 0.018248292803764343, "memory(GiB)": 36.91, "step": 1145, "token_acc": 0.9840182648401826, "train_speed(iter/s)": 0.016195 }, { "epoch": 3.7433502538071064, "grad_norm": 0.8354437881107281, "learning_rate": 1.624035241594213e-05, "loss": 0.006459401547908783, "memory(GiB)": 36.91, "step": 1150, "token_acc": 1.0, "train_speed(iter/s)": 0.016193 }, { "epoch": 3.7595939086294416, "grad_norm": 0.2958093671449778, "learning_rate": 1.5844965429450132e-05, "loss": 0.008441635966300964, "memory(GiB)": 36.91, "step": 1155, "token_acc": 0.9834368530020704, "train_speed(iter/s)": 0.016192 }, { "epoch": 3.775837563451777, "grad_norm": 0.4306627690474224, "learning_rate": 1.545354282257562e-05, "loss": 0.015231077373027802, "memory(GiB)": 36.91, "step": 1160, "token_acc": 0.9976851851851852, "train_speed(iter/s)": 0.016196 }, { "epoch": 3.7920812182741117, "grad_norm": 0.0801666210860899, "learning_rate": 1.5066130027795044e-05, "loss": 0.02225690186023712, "memory(GiB)": 36.91, "step": 1165, "token_acc": 0.9886363636363636, "train_speed(iter/s)": 0.0162 }, { "epoch": 3.8083248730964465, "grad_norm": 1.390297822775598, "learning_rate": 1.4682772012165436e-05, "loss": 0.011767344176769256, "memory(GiB)": 36.91, "step": 1170, "token_acc": 0.9953810623556582, "train_speed(iter/s)": 0.0162 }, { "epoch": 3.8245685279187818, "grad_norm": 0.576269037629794, "learning_rate": 1.4303513272105057e-05, "loss": 0.01135575920343399, "memory(GiB)": 36.91, "step": 1175, "token_acc": 0.9976744186046511, "train_speed(iter/s)": 0.016199 }, { "epoch": 3.840812182741117, "grad_norm": 0.6175307257021349, "learning_rate": 1.3928397828228628e-05, "loss": 0.00802643597126007, "memory(GiB)": 36.91, "step": 1180, "token_acc": 0.9950738916256158, "train_speed(iter/s)": 0.016201 }, { "epoch": 3.857055837563452, "grad_norm": 0.13098006216818975, "learning_rate": 1.3557469220237962e-05, "loss": 0.011502113938331605, "memory(GiB)": 36.91, "step": 1185, "token_acc": 0.9935344827586207, "train_speed(iter/s)": 0.016204 }, { "epoch": 3.8732994923857866, "grad_norm": 0.3987654668677921, "learning_rate": 1.3190770501868243e-05, "loss": 0.011363585293293, "memory(GiB)": 36.91, "step": 1190, "token_acc": 0.9974160206718347, "train_speed(iter/s)": 0.016203 }, { "epoch": 3.889543147208122, "grad_norm": 0.14976124575026759, "learning_rate": 1.2828344235890726e-05, "loss": 0.01089974120259285, "memory(GiB)": 36.91, "step": 1195, "token_acc": 0.9933481152993349, "train_speed(iter/s)": 0.016203 }, { "epoch": 3.9057868020304567, "grad_norm": 1.5199866835408566, "learning_rate": 1.247023248917259e-05, "loss": 0.009822697192430497, "memory(GiB)": 36.91, "step": 1200, "token_acc": 0.9929742388758782, "train_speed(iter/s)": 0.016204 }, { "epoch": 3.922030456852792, "grad_norm": 1.6580131250235997, "learning_rate": 1.2116476827794104e-05, "loss": 0.024014970660209654, "memory(GiB)": 36.91, "step": 1205, "token_acc": 1.0, "train_speed(iter/s)": 0.016204 }, { "epoch": 3.9382741116751268, "grad_norm": 1.157754837023021, "learning_rate": 1.1767118312224151e-05, "loss": 0.007532584667205811, "memory(GiB)": 36.91, "step": 1210, "token_acc": 0.9972375690607734, "train_speed(iter/s)": 0.016207 }, { "epoch": 3.954517766497462, "grad_norm": 0.6972765226059477, "learning_rate": 1.142219749255427e-05, "loss": 0.004430451989173889, "memory(GiB)": 36.91, "step": 1215, "token_acc": 0.9972677595628415, "train_speed(iter/s)": 0.016207 }, { "epoch": 3.970761421319797, "grad_norm": 2.2979580480692188, "learning_rate": 1.1081754403791999e-05, "loss": 0.015141716599464417, "memory(GiB)": 36.91, "step": 1220, "token_acc": 0.9954337899543378, "train_speed(iter/s)": 0.016206 }, { "epoch": 3.987005076142132, "grad_norm": 0.2965970510784761, "learning_rate": 1.0745828561214056e-05, "loss": 0.021216361224651335, "memory(GiB)": 36.91, "step": 1225, "token_acc": 0.9954337899543378, "train_speed(iter/s)": 0.016206 }, { "epoch": 3.996751269035533, "eval_loss": 0.29802748560905457, "eval_runtime": 62.08, "eval_samples_per_second": 3.189, "eval_steps_per_second": 0.805, "eval_token_acc": 0.9388523047977423, "step": 1228 }, { "epoch": 4.0056852791878175, "grad_norm": 0.7419564842144963, "learning_rate": 1.041445895577977e-05, "loss": 0.009254975616931916, "memory(GiB)": 36.91, "step": 1230, "token_acc": 0.9668174962292609, "train_speed(iter/s)": 0.016191 }, { "epoch": 4.021928934010153, "grad_norm": 0.1343462548929871, "learning_rate": 1.008768404960535e-05, "loss": 0.002759779617190361, "memory(GiB)": 36.91, "step": 1235, "token_acc": 1.0, "train_speed(iter/s)": 0.016192 }, { "epoch": 4.038172588832487, "grad_norm": 0.08229350773537837, "learning_rate": 9.765541771499659e-06, "loss": 0.0012123636901378632, "memory(GiB)": 36.91, "step": 1240, "token_acc": 1.0, "train_speed(iter/s)": 0.016192 }, { "epoch": 4.054416243654822, "grad_norm": 0.08190000464747839, "learning_rate": 9.448069512561775e-06, "loss": 0.0066297553479671475, "memory(GiB)": 36.91, "step": 1245, "token_acc": 1.0, "train_speed(iter/s)": 0.016195 }, { "epoch": 4.070659898477158, "grad_norm": 0.12397302242146173, "learning_rate": 9.135304121840976e-06, "loss": 0.0012923330999910832, "memory(GiB)": 36.91, "step": 1250, "token_acc": 1.0, "train_speed(iter/s)": 0.016192 }, { "epoch": 4.086903553299492, "grad_norm": 0.057048418793994596, "learning_rate": 8.827281902059698e-06, "loss": 0.0007107659243047237, "memory(GiB)": 36.91, "step": 1255, "token_acc": 1.0, "train_speed(iter/s)": 0.016191 }, { "epoch": 4.103147208121827, "grad_norm": 0.16324844745357645, "learning_rate": 8.524038605399886e-06, "loss": 0.0021383626386523246, "memory(GiB)": 36.91, "step": 1260, "token_acc": 1.0, "train_speed(iter/s)": 0.016189 }, { "epoch": 4.1193908629441625, "grad_norm": 0.06874787839714207, "learning_rate": 8.225609429353187e-06, "loss": 0.0028022559359669684, "memory(GiB)": 36.91, "step": 1265, "token_acc": 1.0, "train_speed(iter/s)": 0.016188 }, { "epoch": 4.135634517766498, "grad_norm": 0.2526140368602798, "learning_rate": 7.932029012635623e-06, "loss": 0.003260459750890732, "memory(GiB)": 36.91, "step": 1270, "token_acc": 1.0, "train_speed(iter/s)": 0.016187 }, { "epoch": 4.151878172588832, "grad_norm": 0.14918347721067196, "learning_rate": 7.643331431167017e-06, "loss": 0.004188637435436249, "memory(GiB)": 36.91, "step": 1275, "token_acc": 1.0, "train_speed(iter/s)": 0.016188 }, { "epoch": 4.168121827411167, "grad_norm": 0.46928271799249704, "learning_rate": 7.35955019411585e-06, "loss": 0.011932872980833054, "memory(GiB)": 36.91, "step": 1280, "token_acc": 1.0, "train_speed(iter/s)": 0.016191 }, { "epoch": 4.184365482233503, "grad_norm": 0.07080459315091195, "learning_rate": 7.080718240009826e-06, "loss": 0.004019932448863983, "memory(GiB)": 36.91, "step": 1285, "token_acc": 1.0, "train_speed(iter/s)": 0.01619 }, { "epoch": 4.200609137055838, "grad_norm": 0.7271340874397169, "learning_rate": 6.806867932912653e-06, "loss": 0.0061328854411840435, "memory(GiB)": 36.91, "step": 1290, "token_acc": 1.0, "train_speed(iter/s)": 0.016194 }, { "epoch": 4.216852791878172, "grad_norm": 0.1265328539578886, "learning_rate": 6.53803105866761e-06, "loss": 0.006417517364025116, "memory(GiB)": 36.91, "step": 1295, "token_acc": 1.0, "train_speed(iter/s)": 0.016196 }, { "epoch": 4.233096446700507, "grad_norm": 0.057288978819073196, "learning_rate": 6.274238821208128e-06, "loss": 0.003987757861614228, "memory(GiB)": 36.91, "step": 1300, "token_acc": 0.9975062344139651, "train_speed(iter/s)": 0.016195 }, { "epoch": 4.249340101522843, "grad_norm": 0.1481683428098521, "learning_rate": 6.015521838935905e-06, "loss": 0.0010721445083618163, "memory(GiB)": 36.91, "step": 1305, "token_acc": 1.0, "train_speed(iter/s)": 0.016194 }, { "epoch": 4.265583756345178, "grad_norm": 0.10590383120253814, "learning_rate": 5.7619101411671095e-06, "loss": 0.002213609591126442, "memory(GiB)": 36.91, "step": 1310, "token_acc": 1.0, "train_speed(iter/s)": 0.016195 }, { "epoch": 4.281827411167512, "grad_norm": 0.04714189372424805, "learning_rate": 5.513433164646814e-06, "loss": 0.0011348580941557885, "memory(GiB)": 36.91, "step": 1315, "token_acc": 0.9976689976689976, "train_speed(iter/s)": 0.016199 }, { "epoch": 4.298071065989848, "grad_norm": 0.476391282204877, "learning_rate": 5.270119750132258e-06, "loss": 0.004196888953447342, "memory(GiB)": 36.91, "step": 1320, "token_acc": 1.0, "train_speed(iter/s)": 0.016201 }, { "epoch": 4.314314720812183, "grad_norm": 0.35042552841819846, "learning_rate": 5.031998139045352e-06, "loss": 0.0034095611423254012, "memory(GiB)": 36.91, "step": 1325, "token_acc": 1.0, "train_speed(iter/s)": 0.016203 }, { "epoch": 4.330558375634518, "grad_norm": 0.05524764971116243, "learning_rate": 4.799095970194628e-06, "loss": 0.0037711452692747115, "memory(GiB)": 36.91, "step": 1330, "token_acc": 1.0, "train_speed(iter/s)": 0.016203 }, { "epoch": 4.346802030456852, "grad_norm": 0.5445980593755461, "learning_rate": 4.571440276567257e-06, "loss": 0.0024499524384737014, "memory(GiB)": 36.91, "step": 1335, "token_acc": 0.997624703087886, "train_speed(iter/s)": 0.016206 }, { "epoch": 4.363045685279188, "grad_norm": 0.10598886435572437, "learning_rate": 4.349057482191299e-06, "loss": 0.004410183429718018, "memory(GiB)": 36.91, "step": 1340, "token_acc": 1.0, "train_speed(iter/s)": 0.016206 }, { "epoch": 4.379289340101523, "grad_norm": 0.04699969388550453, "learning_rate": 4.1319733990686446e-06, "loss": 0.0011100947856903076, "memory(GiB)": 36.91, "step": 1345, "token_acc": 1.0, "train_speed(iter/s)": 0.016205 }, { "epoch": 4.395532994923858, "grad_norm": 0.017045928815902597, "learning_rate": 3.920213224179042e-06, "loss": 0.00034863052424043416, "memory(GiB)": 36.91, "step": 1350, "token_acc": 1.0, "train_speed(iter/s)": 0.016206 }, { "epoch": 4.4117766497461925, "grad_norm": 0.7161935581935048, "learning_rate": 3.7138015365554833e-06, "loss": 0.0035605177283287047, "memory(GiB)": 36.91, "step": 1355, "token_acc": 0.9977220956719818, "train_speed(iter/s)": 0.016207 }, { "epoch": 4.428020304568528, "grad_norm": 0.06887525802872778, "learning_rate": 3.512762294431271e-06, "loss": 0.006134101003408432, "memory(GiB)": 36.91, "step": 1360, "token_acc": 0.9975186104218362, "train_speed(iter/s)": 0.016208 }, { "epoch": 4.444263959390863, "grad_norm": 0.041826315852571724, "learning_rate": 3.3171188324592427e-06, "loss": 0.0012344198301434516, "memory(GiB)": 36.91, "step": 1365, "token_acc": 1.0, "train_speed(iter/s)": 0.016208 }, { "epoch": 4.460507614213198, "grad_norm": 0.07787992465189252, "learning_rate": 3.126893859003249e-06, "loss": 0.0013754777610301971, "memory(GiB)": 36.91, "step": 1370, "token_acc": 1.0, "train_speed(iter/s)": 0.016208 }, { "epoch": 4.476751269035533, "grad_norm": 0.9611581457799497, "learning_rate": 2.9421094535024507e-06, "loss": 0.004121043905615807, "memory(GiB)": 36.91, "step": 1375, "token_acc": 0.9933920704845814, "train_speed(iter/s)": 0.016206 }, { "epoch": 4.492994923857868, "grad_norm": 0.11072593270596472, "learning_rate": 2.762787063908523e-06, "loss": 0.0024029091000556946, "memory(GiB)": 36.91, "step": 1380, "token_acc": 1.0, "train_speed(iter/s)": 0.016205 }, { "epoch": 4.509238578680203, "grad_norm": 0.02340550565254115, "learning_rate": 2.5889475041961765e-06, "loss": 0.001028289459645748, "memory(GiB)": 36.91, "step": 1385, "token_acc": 1.0, "train_speed(iter/s)": 0.01621 }, { "epoch": 4.525482233502538, "grad_norm": 0.08895116218405089, "learning_rate": 2.4206109519473163e-06, "loss": 0.0021161407232284544, "memory(GiB)": 36.91, "step": 1390, "token_acc": 1.0, "train_speed(iter/s)": 0.016211 }, { "epoch": 4.541725888324873, "grad_norm": 0.24076601170504602, "learning_rate": 2.2577969460089997e-06, "loss": 0.0007429494522511959, "memory(GiB)": 36.91, "step": 1395, "token_acc": 1.0, "train_speed(iter/s)": 0.016212 }, { "epoch": 4.557969543147208, "grad_norm": 0.19664829308024404, "learning_rate": 2.100524384225555e-06, "loss": 0.0008249727077782154, "memory(GiB)": 36.91, "step": 1400, "token_acc": 1.0, "train_speed(iter/s)": 0.01621 }, { "epoch": 4.574213197969543, "grad_norm": 0.06599531052332817, "learning_rate": 1.948811521245131e-06, "loss": 0.000786225963383913, "memory(GiB)": 36.91, "step": 1405, "token_acc": 1.0, "train_speed(iter/s)": 0.016209 }, { "epoch": 4.5904568527918785, "grad_norm": 0.10702737644857346, "learning_rate": 1.8026759664008465e-06, "loss": 0.003063713386654854, "memory(GiB)": 36.91, "step": 1410, "token_acc": 1.0, "train_speed(iter/s)": 0.016207 }, { "epoch": 4.606700507614213, "grad_norm": 0.41678449867799244, "learning_rate": 1.6621346816668992e-06, "loss": 0.00532943345606327, "memory(GiB)": 36.91, "step": 1415, "token_acc": 0.9937629937629938, "train_speed(iter/s)": 0.016207 }, { "epoch": 4.622944162436548, "grad_norm": 0.029982460463042173, "learning_rate": 1.5272039796897786e-06, "loss": 0.0017097776755690575, "memory(GiB)": 36.91, "step": 1420, "token_acc": 1.0, "train_speed(iter/s)": 0.016209 }, { "epoch": 4.639187817258883, "grad_norm": 0.03591858354249925, "learning_rate": 1.397899521894841e-06, "loss": 0.0013645312748849392, "memory(GiB)": 36.91, "step": 1425, "token_acc": 1.0, "train_speed(iter/s)": 0.016212 }, { "epoch": 4.655431472081219, "grad_norm": 0.04773799774300644, "learning_rate": 1.2742363166685034e-06, "loss": 0.0009639391675591469, "memory(GiB)": 36.91, "step": 1430, "token_acc": 1.0, "train_speed(iter/s)": 0.01621 }, { "epoch": 4.671675126903553, "grad_norm": 0.129000803673704, "learning_rate": 1.15622871761622e-06, "loss": 0.0005136763211339712, "memory(GiB)": 36.91, "step": 1435, "token_acc": 1.0, "train_speed(iter/s)": 0.016212 }, { "epoch": 4.687918781725888, "grad_norm": 0.029179325549530243, "learning_rate": 1.0438904218964319e-06, "loss": 0.0004105303902179003, "memory(GiB)": 36.91, "step": 1440, "token_acc": 1.0, "train_speed(iter/s)": 0.016213 }, { "epoch": 4.7041624365482235, "grad_norm": 0.04897256940654327, "learning_rate": 9.372344686307655e-07, "loss": 0.0009922079741954803, "memory(GiB)": 36.91, "step": 1445, "token_acc": 1.0, "train_speed(iter/s)": 0.016212 }, { "epoch": 4.720406091370558, "grad_norm": 0.0393178010532892, "learning_rate": 8.362732373905723e-07, "loss": 0.0008288329467177391, "memory(GiB)": 36.91, "step": 1450, "token_acc": 1.0, "train_speed(iter/s)": 0.016214 }, { "epoch": 4.736649746192893, "grad_norm": 0.08771738931354985, "learning_rate": 7.410184467600001e-07, "loss": 0.0005111692938953638, "memory(GiB)": 36.91, "step": 1455, "token_acc": 1.0, "train_speed(iter/s)": 0.016214 }, { "epoch": 4.752893401015228, "grad_norm": 0.04916799951696976, "learning_rate": 6.514811529758747e-07, "loss": 0.007441927492618561, "memory(GiB)": 36.91, "step": 1460, "token_acc": 1.0, "train_speed(iter/s)": 0.016214 }, { "epoch": 4.769137055837564, "grad_norm": 0.44716598217302617, "learning_rate": 5.676717486443439e-07, "loss": 0.0024275451898574827, "memory(GiB)": 36.91, "step": 1465, "token_acc": 1.0, "train_speed(iter/s)": 0.016214 }, { "epoch": 4.785380710659899, "grad_norm": 0.12117859136787597, "learning_rate": 4.895999615346314e-07, "loss": 0.001637093722820282, "memory(GiB)": 36.91, "step": 1470, "token_acc": 1.0, "train_speed(iter/s)": 0.016214 }, { "epoch": 4.801624365482233, "grad_norm": 0.01706819131966345, "learning_rate": 4.1727485344994486e-07, "loss": 0.0003483247943222523, "memory(GiB)": 36.91, "step": 1475, "token_acc": 1.0, "train_speed(iter/s)": 0.016212 }, { "epoch": 4.8178680203045685, "grad_norm": 0.04859669108953238, "learning_rate": 3.507048191756401e-07, "loss": 0.0021356761455535887, "memory(GiB)": 36.91, "step": 1480, "token_acc": 1.0, "train_speed(iter/s)": 0.016212 }, { "epoch": 4.834111675126904, "grad_norm": 0.03682429514387162, "learning_rate": 2.8989758550487245e-07, "loss": 0.0021858945488929748, "memory(GiB)": 36.91, "step": 1485, "token_acc": 1.0, "train_speed(iter/s)": 0.016213 }, { "epoch": 4.850355329949238, "grad_norm": 0.06507640939116277, "learning_rate": 2.3486021034170857e-07, "loss": 0.002923069894313812, "memory(GiB)": 36.91, "step": 1490, "token_acc": 0.9977064220183486, "train_speed(iter/s)": 0.016212 }, { "epoch": 4.866598984771573, "grad_norm": 0.04259804746440851, "learning_rate": 1.8559908188195418e-07, "loss": 0.0019719479605555534, "memory(GiB)": 36.91, "step": 1495, "token_acc": 1.0, "train_speed(iter/s)": 0.016212 }, { "epoch": 4.882842639593909, "grad_norm": 0.25393381486977334, "learning_rate": 1.4211991787164147e-07, "loss": 0.0011512625962495804, "memory(GiB)": 36.91, "step": 1500, "token_acc": 1.0, "train_speed(iter/s)": 0.016215 }, { "epoch": 4.899086294416244, "grad_norm": 0.21720000107148496, "learning_rate": 1.044277649433989e-07, "loss": 0.003379678726196289, "memory(GiB)": 36.91, "step": 1505, "token_acc": 1.0, "train_speed(iter/s)": 0.016213 }, { "epoch": 4.915329949238579, "grad_norm": 0.6636335728606932, "learning_rate": 7.252699803065311e-08, "loss": 0.014554958045482635, "memory(GiB)": 36.91, "step": 1510, "token_acc": 0.9886792452830189, "train_speed(iter/s)": 0.016211 }, { "epoch": 4.9315736040609135, "grad_norm": 0.042674818413491626, "learning_rate": 4.6421319859862864e-08, "loss": 0.0024311095476150513, "memory(GiB)": 36.91, "step": 1515, "token_acc": 1.0, "train_speed(iter/s)": 0.016212 }, { "epoch": 4.947817258883249, "grad_norm": 0.07981897617268605, "learning_rate": 2.6113760520735108e-08, "loss": 0.0024462098255753515, "memory(GiB)": 36.91, "step": 1520, "token_acc": 1.0, "train_speed(iter/s)": 0.01621 }, { "epoch": 4.964060913705584, "grad_norm": 0.01695528976036472, "learning_rate": 1.1606677114500697e-08, "loss": 0.011407441645860671, "memory(GiB)": 36.91, "step": 1525, "token_acc": 1.0, "train_speed(iter/s)": 0.01621 }, { "epoch": 4.980304568527918, "grad_norm": 0.05383783400729952, "learning_rate": 2.901753480361036e-09, "loss": 0.005226198583841324, "memory(GiB)": 36.91, "step": 1530, "token_acc": 0.9956521739130435, "train_speed(iter/s)": 0.01621 }, { "epoch": 4.996548223350254, "grad_norm": 0.9774296313594534, "learning_rate": 0.0, "loss": 0.003532126545906067, "memory(GiB)": 36.91, "step": 1535, "token_acc": 1.0, "train_speed(iter/s)": 0.016209 }, { "epoch": 4.996548223350254, "eval_loss": 0.31882038712501526, "eval_runtime": 62.2556, "eval_samples_per_second": 3.18, "eval_steps_per_second": 0.803, "eval_token_acc": 0.9397930385700847, "step": 1535 }, { "epoch": 4.996548223350254, "eval_loss": 0.31882038712501526, "eval_runtime": 62.6813, "eval_samples_per_second": 3.159, "eval_steps_per_second": 0.798, "eval_token_acc": 0.9397930385700847, "step": 1535 } ], "logging_steps": 5, "max_steps": 1535, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2.2119246482890555e+19, "train_batch_size": 1, "trial_name": null, "trial_params": null }