{ "best_global_step": 4639, "best_metric": 0.71991217, "best_model_checkpoint": "/mnt/hdd2/yansc/log/phi4mm/full_zhifei_0429v2/v5-20250430-125512/checkpoint-4639", "epoch": 1.0, "eval_steps": 2000, "global_step": 4639, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0002155636990730761, "grad_norm": 5.661046504974365, "learning_rate": 8.620689655172415e-07, "loss": 1.0767223834991455, "memory(GiB)": 71.28, "step": 1, "token_acc": 0.709544146318819, "train_speed(iter/s)": 0.030934 }, { "epoch": 0.0010778184953653806, "grad_norm": 3.837540864944458, "learning_rate": 4.310344827586207e-06, "loss": 1.0872713327407837, "memory(GiB)": 71.28, "step": 5, "token_acc": 0.6938249020849457, "train_speed(iter/s)": 0.065614 }, { "epoch": 0.002155636990730761, "grad_norm": 1.2554740905761719, "learning_rate": 8.620689655172414e-06, "loss": 0.9432369232177734, "memory(GiB)": 87.63, "step": 10, "token_acc": 0.748069228621941, "train_speed(iter/s)": 0.075942 }, { "epoch": 0.0032334554860961413, "grad_norm": 1.0317494869232178, "learning_rate": 1.2931034482758622e-05, "loss": 0.8897874832153321, "memory(GiB)": 87.63, "step": 15, "token_acc": 0.7574592376233963, "train_speed(iter/s)": 0.080485 }, { "epoch": 0.004311273981461522, "grad_norm": 0.7988346219062805, "learning_rate": 1.7241379310344828e-05, "loss": 0.8504341125488282, "memory(GiB)": 88.73, "step": 20, "token_acc": 0.7576599223021077, "train_speed(iter/s)": 0.082373 }, { "epoch": 0.0053890924768269025, "grad_norm": 0.6801260113716125, "learning_rate": 2.1551724137931033e-05, "loss": 0.8576517105102539, "memory(GiB)": 88.73, "step": 25, "token_acc": 0.7605484222813296, "train_speed(iter/s)": 0.08369 }, { "epoch": 0.006466910972192283, "grad_norm": 0.5753203630447388, "learning_rate": 2.5862068965517244e-05, "loss": 0.8260368347167969, "memory(GiB)": 88.73, "step": 30, "token_acc": 0.7544627128596594, "train_speed(iter/s)": 0.0853 }, { "epoch": 0.007544729467557664, "grad_norm": 0.611558735370636, "learning_rate": 3.017241379310345e-05, "loss": 0.8421600341796875, "memory(GiB)": 88.73, "step": 35, "token_acc": 0.7551620360273913, "train_speed(iter/s)": 0.085916 }, { "epoch": 0.008622547962923045, "grad_norm": 0.5788382887840271, "learning_rate": 3.4482758620689657e-05, "loss": 0.7987819194793702, "memory(GiB)": 88.73, "step": 40, "token_acc": 0.7656490519366119, "train_speed(iter/s)": 0.086277 }, { "epoch": 0.009700366458288425, "grad_norm": 0.5692189931869507, "learning_rate": 3.8793103448275865e-05, "loss": 0.8142763137817383, "memory(GiB)": 88.73, "step": 45, "token_acc": 0.7633664461195377, "train_speed(iter/s)": 0.086814 }, { "epoch": 0.010778184953653805, "grad_norm": 0.6034435033798218, "learning_rate": 4.3103448275862066e-05, "loss": 0.794434928894043, "memory(GiB)": 88.73, "step": 50, "token_acc": 0.7644429064449453, "train_speed(iter/s)": 0.087196 }, { "epoch": 0.011856003449019185, "grad_norm": 0.6427741646766663, "learning_rate": 4.741379310344828e-05, "loss": 0.8116023063659668, "memory(GiB)": 88.73, "step": 55, "token_acc": 0.7686651427941946, "train_speed(iter/s)": 0.087579 }, { "epoch": 0.012933821944384565, "grad_norm": 0.6185195446014404, "learning_rate": 5.172413793103449e-05, "loss": 0.8255170822143555, "memory(GiB)": 88.73, "step": 60, "token_acc": 0.7442739731245774, "train_speed(iter/s)": 0.088041 }, { "epoch": 0.014011640439749945, "grad_norm": 0.6371958255767822, "learning_rate": 5.603448275862069e-05, "loss": 0.8135019302368164, "memory(GiB)": 88.73, "step": 65, "token_acc": 0.7529014819295328, "train_speed(iter/s)": 0.088152 }, { "epoch": 0.015089458935115327, "grad_norm": 0.608048141002655, "learning_rate": 6.03448275862069e-05, "loss": 0.7952300548553467, "memory(GiB)": 88.73, "step": 70, "token_acc": 0.7666317933005773, "train_speed(iter/s)": 0.088469 }, { "epoch": 0.016167277430480707, "grad_norm": 0.6459090709686279, "learning_rate": 6.46551724137931e-05, "loss": 0.8116374969482422, "memory(GiB)": 88.73, "step": 75, "token_acc": 0.7533916304701402, "train_speed(iter/s)": 0.088659 }, { "epoch": 0.01724509592584609, "grad_norm": 0.5890176296234131, "learning_rate": 6.896551724137931e-05, "loss": 0.8122952461242676, "memory(GiB)": 88.73, "step": 80, "token_acc": 0.7620805081108918, "train_speed(iter/s)": 0.088779 }, { "epoch": 0.018322914421211468, "grad_norm": 0.6364602446556091, "learning_rate": 7.327586206896552e-05, "loss": 0.8073968887329102, "memory(GiB)": 88.73, "step": 85, "token_acc": 0.7719013927989332, "train_speed(iter/s)": 0.088849 }, { "epoch": 0.01940073291657685, "grad_norm": 0.5791258215904236, "learning_rate": 7.758620689655173e-05, "loss": 0.8298323631286622, "memory(GiB)": 88.73, "step": 90, "token_acc": 0.7563602442333786, "train_speed(iter/s)": 0.089139 }, { "epoch": 0.020478551411942228, "grad_norm": 0.5789495706558228, "learning_rate": 8.189655172413794e-05, "loss": 0.8007688522338867, "memory(GiB)": 88.73, "step": 95, "token_acc": 0.7786222361093241, "train_speed(iter/s)": 0.089148 }, { "epoch": 0.02155636990730761, "grad_norm": 0.6254374384880066, "learning_rate": 8.620689655172413e-05, "loss": 0.8283987045288086, "memory(GiB)": 88.73, "step": 100, "token_acc": 0.7691091196626252, "train_speed(iter/s)": 0.089245 }, { "epoch": 0.022634188402672988, "grad_norm": 0.5493739247322083, "learning_rate": 9.051724137931035e-05, "loss": 0.8227524757385254, "memory(GiB)": 88.73, "step": 105, "token_acc": 0.7648992695216268, "train_speed(iter/s)": 0.089212 }, { "epoch": 0.02371200689803837, "grad_norm": 0.6340286731719971, "learning_rate": 9.482758620689656e-05, "loss": 0.8371359825134277, "memory(GiB)": 88.73, "step": 110, "token_acc": 0.7527082188554831, "train_speed(iter/s)": 0.089353 }, { "epoch": 0.024789825393403752, "grad_norm": 0.6534638404846191, "learning_rate": 9.913793103448277e-05, "loss": 0.8190339088439942, "memory(GiB)": 88.73, "step": 115, "token_acc": 0.7620466783072943, "train_speed(iter/s)": 0.089498 }, { "epoch": 0.02586764388876913, "grad_norm": 0.5949416160583496, "learning_rate": 0.00010344827586206898, "loss": 0.8367951393127442, "memory(GiB)": 88.73, "step": 120, "token_acc": 0.7603366502371431, "train_speed(iter/s)": 0.089636 }, { "epoch": 0.026945462384134512, "grad_norm": 0.5666251182556152, "learning_rate": 0.00010775862068965518, "loss": 0.8100610733032226, "memory(GiB)": 88.73, "step": 125, "token_acc": 0.7573579610103922, "train_speed(iter/s)": 0.089737 }, { "epoch": 0.02802328087949989, "grad_norm": 0.5843753814697266, "learning_rate": 0.00011206896551724138, "loss": 0.8317647933959961, "memory(GiB)": 88.73, "step": 130, "token_acc": 0.7613499397348332, "train_speed(iter/s)": 0.089796 }, { "epoch": 0.029101099374865273, "grad_norm": 0.9183714985847473, "learning_rate": 0.00011637931034482759, "loss": 0.8461246490478516, "memory(GiB)": 88.73, "step": 135, "token_acc": 0.7635938567150587, "train_speed(iter/s)": 0.089723 }, { "epoch": 0.030178917870230654, "grad_norm": 1.239608645439148, "learning_rate": 0.0001206896551724138, "loss": 0.8860382080078125, "memory(GiB)": 88.73, "step": 140, "token_acc": 0.7574622400986681, "train_speed(iter/s)": 0.089786 }, { "epoch": 0.03125673636559603, "grad_norm": 1.4744077920913696, "learning_rate": 0.000125, "loss": 0.8804110527038574, "memory(GiB)": 88.73, "step": 145, "token_acc": 0.7445354941376598, "train_speed(iter/s)": 0.089746 }, { "epoch": 0.032334554860961415, "grad_norm": 2.341097116470337, "learning_rate": 0.0001293103448275862, "loss": 0.893686580657959, "memory(GiB)": 88.73, "step": 150, "token_acc": 0.7399476454319251, "train_speed(iter/s)": 0.089811 }, { "epoch": 0.0334123733563268, "grad_norm": 0.6831941604614258, "learning_rate": 0.00013362068965517242, "loss": 0.8777801513671875, "memory(GiB)": 88.73, "step": 155, "token_acc": 0.7535869565217391, "train_speed(iter/s)": 0.089816 }, { "epoch": 0.03449019185169218, "grad_norm": 0.5728196501731873, "learning_rate": 0.00013793103448275863, "loss": 0.863647174835205, "memory(GiB)": 88.73, "step": 160, "token_acc": 0.7561969654524259, "train_speed(iter/s)": 0.089822 }, { "epoch": 0.03556801034705755, "grad_norm": 0.5426782369613647, "learning_rate": 0.00014224137931034483, "loss": 0.8596952438354493, "memory(GiB)": 88.73, "step": 165, "token_acc": 0.7409669895945461, "train_speed(iter/s)": 0.089768 }, { "epoch": 0.036645828842422935, "grad_norm": 0.5218408107757568, "learning_rate": 0.00014655172413793104, "loss": 0.8550775527954102, "memory(GiB)": 88.73, "step": 170, "token_acc": 0.7513248628732122, "train_speed(iter/s)": 0.089854 }, { "epoch": 0.03772364733778832, "grad_norm": 0.511150062084198, "learning_rate": 0.00015086206896551725, "loss": 0.8629619598388671, "memory(GiB)": 88.73, "step": 175, "token_acc": 0.7471277353866477, "train_speed(iter/s)": 0.089819 }, { "epoch": 0.0388014658331537, "grad_norm": 0.5180395245552063, "learning_rate": 0.00015517241379310346, "loss": 0.8539790153503418, "memory(GiB)": 88.73, "step": 180, "token_acc": 0.7535670340602665, "train_speed(iter/s)": 0.089885 }, { "epoch": 0.039879284328519074, "grad_norm": 0.5233175158500671, "learning_rate": 0.00015948275862068967, "loss": 0.8610179901123047, "memory(GiB)": 88.73, "step": 185, "token_acc": 0.7630914341060712, "train_speed(iter/s)": 0.089831 }, { "epoch": 0.040957102823884456, "grad_norm": 0.640143871307373, "learning_rate": 0.00016379310344827587, "loss": 0.8914051055908203, "memory(GiB)": 88.73, "step": 190, "token_acc": 0.7471746066079057, "train_speed(iter/s)": 0.089858 }, { "epoch": 0.04203492131924984, "grad_norm": 0.6472980380058289, "learning_rate": 0.00016810344827586208, "loss": 0.8640270233154297, "memory(GiB)": 88.73, "step": 195, "token_acc": 0.747571778321834, "train_speed(iter/s)": 0.089866 }, { "epoch": 0.04311273981461522, "grad_norm": 0.5480747222900391, "learning_rate": 0.00017241379310344826, "loss": 0.8949162483215332, "memory(GiB)": 88.73, "step": 200, "token_acc": 0.7529637377963738, "train_speed(iter/s)": 0.089938 }, { "epoch": 0.0441905583099806, "grad_norm": 0.5225749611854553, "learning_rate": 0.00017672413793103447, "loss": 0.886573314666748, "memory(GiB)": 88.73, "step": 205, "token_acc": 0.7422848705509038, "train_speed(iter/s)": 0.089905 }, { "epoch": 0.045268376805345976, "grad_norm": 0.4496360421180725, "learning_rate": 0.0001810344827586207, "loss": 0.8933658599853516, "memory(GiB)": 88.73, "step": 210, "token_acc": 0.7473035168464323, "train_speed(iter/s)": 0.089904 }, { "epoch": 0.04634619530071136, "grad_norm": 0.6532354354858398, "learning_rate": 0.0001853448275862069, "loss": 0.8972723960876465, "memory(GiB)": 88.73, "step": 215, "token_acc": 0.7440370107024461, "train_speed(iter/s)": 0.089957 }, { "epoch": 0.04742401379607674, "grad_norm": 0.5721102952957153, "learning_rate": 0.00018965517241379312, "loss": 0.8774005889892578, "memory(GiB)": 88.73, "step": 220, "token_acc": 0.7458653250483396, "train_speed(iter/s)": 0.089951 }, { "epoch": 0.04850183229144212, "grad_norm": 0.5491283535957336, "learning_rate": 0.00019396551724137933, "loss": 0.912078857421875, "memory(GiB)": 88.73, "step": 225, "token_acc": 0.7377543196447995, "train_speed(iter/s)": 0.089951 }, { "epoch": 0.049579650786807504, "grad_norm": 0.5220667123794556, "learning_rate": 0.00019827586206896554, "loss": 0.8997896194458008, "memory(GiB)": 88.73, "step": 230, "token_acc": 0.7448181020365696, "train_speed(iter/s)": 0.089998 }, { "epoch": 0.05065746928217288, "grad_norm": 0.5665777921676636, "learning_rate": 0.00019999977132115367, "loss": 0.8994318008422851, "memory(GiB)": 88.73, "step": 235, "token_acc": 0.754438030437379, "train_speed(iter/s)": 0.090004 }, { "epoch": 0.05173528777753826, "grad_norm": 0.611210286617279, "learning_rate": 0.0001999983738431023, "loss": 0.9068660736083984, "memory(GiB)": 88.73, "step": 240, "token_acc": 0.7481563865175744, "train_speed(iter/s)": 0.090053 }, { "epoch": 0.05281310627290364, "grad_norm": 0.599448025226593, "learning_rate": 0.00019999570594853575, "loss": 0.9139809608459473, "memory(GiB)": 88.73, "step": 245, "token_acc": 0.7481180749386789, "train_speed(iter/s)": 0.090038 }, { "epoch": 0.053890924768269025, "grad_norm": 0.6098793745040894, "learning_rate": 0.00019999176767134788, "loss": 0.9120397567749023, "memory(GiB)": 88.73, "step": 250, "token_acc": 0.7417220914307198, "train_speed(iter/s)": 0.090035 }, { "epoch": 0.054968743263634406, "grad_norm": 0.5907915830612183, "learning_rate": 0.00019998655906157203, "loss": 0.9487130165100097, "memory(GiB)": 88.73, "step": 255, "token_acc": 0.7332112793024046, "train_speed(iter/s)": 0.089997 }, { "epoch": 0.05604656175899978, "grad_norm": 0.6379203200340271, "learning_rate": 0.00019998008018538036, "loss": 0.9220156669616699, "memory(GiB)": 88.73, "step": 260, "token_acc": 0.7371329783810128, "train_speed(iter/s)": 0.090054 }, { "epoch": 0.05712438025436516, "grad_norm": 0.5306751132011414, "learning_rate": 0.0001999723311250828, "loss": 0.9344444274902344, "memory(GiB)": 88.73, "step": 265, "token_acc": 0.7357421711218985, "train_speed(iter/s)": 0.09012 }, { "epoch": 0.058202198749730545, "grad_norm": 0.5425118207931519, "learning_rate": 0.00019996331197912645, "loss": 0.9182867050170899, "memory(GiB)": 88.73, "step": 270, "token_acc": 0.7378928785920978, "train_speed(iter/s)": 0.090134 }, { "epoch": 0.05928001724509593, "grad_norm": 0.53849196434021, "learning_rate": 0.00019995302286209378, "loss": 0.9262136459350586, "memory(GiB)": 88.73, "step": 275, "token_acc": 0.7300669089178329, "train_speed(iter/s)": 0.090123 }, { "epoch": 0.06035783574046131, "grad_norm": 0.5792683959007263, "learning_rate": 0.00019994146390470163, "loss": 0.9218643188476563, "memory(GiB)": 88.73, "step": 280, "token_acc": 0.7431568104134476, "train_speed(iter/s)": 0.090126 }, { "epoch": 0.061435654235826684, "grad_norm": 0.5885177254676819, "learning_rate": 0.00019992863525379923, "loss": 0.9088655471801758, "memory(GiB)": 88.73, "step": 285, "token_acc": 0.7371447058279799, "train_speed(iter/s)": 0.090159 }, { "epoch": 0.06251347273119207, "grad_norm": 0.5630527138710022, "learning_rate": 0.0001999145370723666, "loss": 0.898719596862793, "memory(GiB)": 88.73, "step": 290, "token_acc": 0.7447793981420215, "train_speed(iter/s)": 0.090186 }, { "epoch": 0.06359129122655745, "grad_norm": 0.4674566686153412, "learning_rate": 0.0001998991695395122, "loss": 0.9114974021911622, "memory(GiB)": 88.73, "step": 295, "token_acc": 0.7391409693682611, "train_speed(iter/s)": 0.090211 }, { "epoch": 0.06466910972192283, "grad_norm": 0.4685205817222595, "learning_rate": 0.00019988253285047094, "loss": 0.9294984817504883, "memory(GiB)": 88.73, "step": 300, "token_acc": 0.7292210352402231, "train_speed(iter/s)": 0.090284 }, { "epoch": 0.06574692821728821, "grad_norm": 0.5084885358810425, "learning_rate": 0.00019986462721660147, "loss": 0.9087018013000489, "memory(GiB)": 88.73, "step": 305, "token_acc": 0.7447656936670213, "train_speed(iter/s)": 0.090321 }, { "epoch": 0.0668247467126536, "grad_norm": 0.5176272392272949, "learning_rate": 0.0001998454528653836, "loss": 0.9171270370483399, "memory(GiB)": 88.73, "step": 310, "token_acc": 0.7495327645811344, "train_speed(iter/s)": 0.090389 }, { "epoch": 0.06790256520801898, "grad_norm": 0.47679659724235535, "learning_rate": 0.00019982501004041549, "loss": 0.9229598999023437, "memory(GiB)": 88.73, "step": 315, "token_acc": 0.740474781704409, "train_speed(iter/s)": 0.090362 }, { "epoch": 0.06898038370338436, "grad_norm": 0.45314136147499084, "learning_rate": 0.0001998032990014103, "loss": 0.9118033409118652, "memory(GiB)": 88.73, "step": 320, "token_acc": 0.7418006337953553, "train_speed(iter/s)": 0.090354 }, { "epoch": 0.07005820219874972, "grad_norm": 0.45677947998046875, "learning_rate": 0.00019978032002419322, "loss": 0.9114517211914063, "memory(GiB)": 88.73, "step": 325, "token_acc": 0.7536293766011956, "train_speed(iter/s)": 0.090354 }, { "epoch": 0.0711360206941151, "grad_norm": 0.4702832102775574, "learning_rate": 0.00019975607340069767, "loss": 0.911367130279541, "memory(GiB)": 88.73, "step": 330, "token_acc": 0.7442103801101903, "train_speed(iter/s)": 0.090394 }, { "epoch": 0.07221383918948049, "grad_norm": 0.44406020641326904, "learning_rate": 0.00019973055943896178, "loss": 0.9199405670166015, "memory(GiB)": 88.73, "step": 335, "token_acc": 0.7474635294117647, "train_speed(iter/s)": 0.090431 }, { "epoch": 0.07329165768484587, "grad_norm": 0.43742668628692627, "learning_rate": 0.00019970377846312442, "loss": 0.9072576522827148, "memory(GiB)": 88.73, "step": 340, "token_acc": 0.7448765059690305, "train_speed(iter/s)": 0.090392 }, { "epoch": 0.07436947618021125, "grad_norm": 0.48286423087120056, "learning_rate": 0.00019967573081342103, "loss": 0.9394813537597656, "memory(GiB)": 88.73, "step": 345, "token_acc": 0.7324060658746258, "train_speed(iter/s)": 0.09043 }, { "epoch": 0.07544729467557663, "grad_norm": 0.44604936242103577, "learning_rate": 0.00019964641684617937, "loss": 0.9257543563842774, "memory(GiB)": 88.73, "step": 350, "token_acc": 0.7362590183213089, "train_speed(iter/s)": 0.090483 }, { "epoch": 0.07652511317094202, "grad_norm": 0.4677391052246094, "learning_rate": 0.00019961583693381502, "loss": 0.9080276489257812, "memory(GiB)": 88.73, "step": 355, "token_acc": 0.748174919335941, "train_speed(iter/s)": 0.090491 }, { "epoch": 0.0776029316663074, "grad_norm": 0.4772037863731384, "learning_rate": 0.00019958399146482656, "loss": 0.9210508346557618, "memory(GiB)": 88.73, "step": 360, "token_acc": 0.7503972686080616, "train_speed(iter/s)": 0.09046 }, { "epoch": 0.07868075016167278, "grad_norm": 0.4248864948749542, "learning_rate": 0.00019955088084379062, "loss": 0.9053157806396485, "memory(GiB)": 88.73, "step": 365, "token_acc": 0.7477634306327001, "train_speed(iter/s)": 0.090452 }, { "epoch": 0.07975856865703815, "grad_norm": 0.4409038722515106, "learning_rate": 0.00019951650549135687, "loss": 0.9242048263549805, "memory(GiB)": 88.73, "step": 370, "token_acc": 0.744416670454029, "train_speed(iter/s)": 0.090439 }, { "epoch": 0.08083638715240353, "grad_norm": 0.41120997071266174, "learning_rate": 0.00019948086584424256, "loss": 0.9199428558349609, "memory(GiB)": 88.73, "step": 375, "token_acc": 0.74176891286139, "train_speed(iter/s)": 0.090445 }, { "epoch": 0.08191420564776891, "grad_norm": 0.47451648116111755, "learning_rate": 0.00019944396235522702, "loss": 0.9232440948486328, "memory(GiB)": 88.73, "step": 380, "token_acc": 0.7535579532666976, "train_speed(iter/s)": 0.090473 }, { "epoch": 0.0829920241431343, "grad_norm": 0.40214264392852783, "learning_rate": 0.00019940579549314588, "loss": 0.9306692123413086, "memory(GiB)": 88.73, "step": 385, "token_acc": 0.7444515951089454, "train_speed(iter/s)": 0.090469 }, { "epoch": 0.08406984263849968, "grad_norm": 0.37348198890686035, "learning_rate": 0.00019936636574288517, "loss": 0.9222702980041504, "memory(GiB)": 88.73, "step": 390, "token_acc": 0.7455256485821475, "train_speed(iter/s)": 0.090446 }, { "epoch": 0.08514766113386506, "grad_norm": 0.5010365843772888, "learning_rate": 0.00019932567360537507, "loss": 0.901220703125, "memory(GiB)": 88.73, "step": 395, "token_acc": 0.7542318975420341, "train_speed(iter/s)": 0.090464 }, { "epoch": 0.08622547962923044, "grad_norm": 0.4462658166885376, "learning_rate": 0.0001992837195975836, "loss": 0.903138542175293, "memory(GiB)": 88.73, "step": 400, "token_acc": 0.7556640745950441, "train_speed(iter/s)": 0.090429 }, { "epoch": 0.08730329812459582, "grad_norm": 0.4861457645893097, "learning_rate": 0.00019924050425251013, "loss": 0.9164201736450195, "memory(GiB)": 88.73, "step": 405, "token_acc": 0.7416369510367108, "train_speed(iter/s)": 0.090412 }, { "epoch": 0.0883811166199612, "grad_norm": 0.5217711925506592, "learning_rate": 0.00019919602811917844, "loss": 0.9071553230285645, "memory(GiB)": 88.73, "step": 410, "token_acc": 0.7415488543118854, "train_speed(iter/s)": 0.090428 }, { "epoch": 0.08945893511532658, "grad_norm": 0.4896453320980072, "learning_rate": 0.00019915029176262986, "loss": 0.9114551544189453, "memory(GiB)": 88.73, "step": 415, "token_acc": 0.7443950735646595, "train_speed(iter/s)": 0.090445 }, { "epoch": 0.09053675361069195, "grad_norm": 0.44842737913131714, "learning_rate": 0.00019910329576391605, "loss": 0.9312037467956543, "memory(GiB)": 88.73, "step": 420, "token_acc": 0.7431434649019144, "train_speed(iter/s)": 0.090444 }, { "epoch": 0.09161457210605733, "grad_norm": 0.4130711853504181, "learning_rate": 0.00019905504072009167, "loss": 0.9252151489257813, "memory(GiB)": 88.73, "step": 425, "token_acc": 0.7375694937444105, "train_speed(iter/s)": 0.090467 }, { "epoch": 0.09269239060142272, "grad_norm": 0.4788070619106293, "learning_rate": 0.00019900552724420672, "loss": 0.9283370971679688, "memory(GiB)": 88.73, "step": 430, "token_acc": 0.7336248630528218, "train_speed(iter/s)": 0.090477 }, { "epoch": 0.0937702090967881, "grad_norm": 0.4697573781013489, "learning_rate": 0.00019895475596529884, "loss": 0.9044897079467773, "memory(GiB)": 88.73, "step": 435, "token_acc": 0.75642031868874, "train_speed(iter/s)": 0.090477 }, { "epoch": 0.09484802759215348, "grad_norm": 0.44446617364883423, "learning_rate": 0.00019890272752838518, "loss": 0.9221466064453125, "memory(GiB)": 88.73, "step": 440, "token_acc": 0.7484308860020435, "train_speed(iter/s)": 0.090498 }, { "epoch": 0.09592584608751886, "grad_norm": 0.38471779227256775, "learning_rate": 0.0001988494425944544, "loss": 0.9338311195373535, "memory(GiB)": 88.73, "step": 445, "token_acc": 0.7368623622402043, "train_speed(iter/s)": 0.090525 }, { "epoch": 0.09700366458288424, "grad_norm": 0.3869895935058594, "learning_rate": 0.00019879490184045805, "loss": 0.9229958534240723, "memory(GiB)": 88.73, "step": 450, "token_acc": 0.7350404992572801, "train_speed(iter/s)": 0.090513 }, { "epoch": 0.09808148307824963, "grad_norm": 0.4316529929637909, "learning_rate": 0.00019873910595930217, "loss": 0.901104736328125, "memory(GiB)": 88.73, "step": 455, "token_acc": 0.7458434221146085, "train_speed(iter/s)": 0.0905 }, { "epoch": 0.09915930157361501, "grad_norm": 0.4023454487323761, "learning_rate": 0.0001986820556598384, "loss": 0.9106796264648438, "memory(GiB)": 88.73, "step": 460, "token_acc": 0.7401160930572696, "train_speed(iter/s)": 0.090519 }, { "epoch": 0.10023712006898039, "grad_norm": 0.4310077428817749, "learning_rate": 0.00019862375166685494, "loss": 0.9086391448974609, "memory(GiB)": 88.73, "step": 465, "token_acc": 0.7475860418450097, "train_speed(iter/s)": 0.090538 }, { "epoch": 0.10131493856434576, "grad_norm": 0.39574119448661804, "learning_rate": 0.00019856419472106735, "loss": 0.9008966445922851, "memory(GiB)": 88.73, "step": 470, "token_acc": 0.7470332334232211, "train_speed(iter/s)": 0.090559 }, { "epoch": 0.10239275705971114, "grad_norm": 0.4013592302799225, "learning_rate": 0.00019850338557910927, "loss": 0.8932973861694335, "memory(GiB)": 88.73, "step": 475, "token_acc": 0.7484444026694238, "train_speed(iter/s)": 0.090558 }, { "epoch": 0.10347057555507652, "grad_norm": 0.38656729459762573, "learning_rate": 0.0001984413250135226, "loss": 0.9177732467651367, "memory(GiB)": 88.73, "step": 480, "token_acc": 0.7433336474135495, "train_speed(iter/s)": 0.090554 }, { "epoch": 0.1045483940504419, "grad_norm": 0.3958161473274231, "learning_rate": 0.00019837801381274785, "loss": 0.9225729942321778, "memory(GiB)": 88.73, "step": 485, "token_acc": 0.7341326625841029, "train_speed(iter/s)": 0.090529 }, { "epoch": 0.10562621254580729, "grad_norm": 0.3869446814060211, "learning_rate": 0.00019831345278111407, "loss": 0.9007123947143555, "memory(GiB)": 88.73, "step": 490, "token_acc": 0.7423751483525799, "train_speed(iter/s)": 0.09056 }, { "epoch": 0.10670403104117267, "grad_norm": 0.4161109924316406, "learning_rate": 0.00019824764273882858, "loss": 0.9075139999389649, "memory(GiB)": 88.73, "step": 495, "token_acc": 0.7485750474984167, "train_speed(iter/s)": 0.090576 }, { "epoch": 0.10778184953653805, "grad_norm": 0.40261998772621155, "learning_rate": 0.0001981805845219666, "loss": 0.9044816970825196, "memory(GiB)": 88.73, "step": 500, "token_acc": 0.7489236986538269, "train_speed(iter/s)": 0.090592 }, { "epoch": 0.10885966803190343, "grad_norm": 0.41422584652900696, "learning_rate": 0.0001981122789824607, "loss": 0.9133739471435547, "memory(GiB)": 88.73, "step": 505, "token_acc": 0.7409159083171541, "train_speed(iter/s)": 0.090597 }, { "epoch": 0.10993748652726881, "grad_norm": 0.3509552478790283, "learning_rate": 0.00019804272698808984, "loss": 0.931029224395752, "memory(GiB)": 88.73, "step": 510, "token_acc": 0.745147528323743, "train_speed(iter/s)": 0.090595 }, { "epoch": 0.1110153050226342, "grad_norm": 0.45505374670028687, "learning_rate": 0.0001979719294224684, "loss": 0.9063175201416016, "memory(GiB)": 88.73, "step": 515, "token_acc": 0.7390973773630953, "train_speed(iter/s)": 0.090602 }, { "epoch": 0.11209312351799956, "grad_norm": 0.3943646252155304, "learning_rate": 0.00019789988718503493, "loss": 0.90817232131958, "memory(GiB)": 88.73, "step": 520, "token_acc": 0.7525533722590791, "train_speed(iter/s)": 0.09056 }, { "epoch": 0.11317094201336494, "grad_norm": 0.3542667627334595, "learning_rate": 0.00019782660119104084, "loss": 0.9087008476257324, "memory(GiB)": 88.73, "step": 525, "token_acc": 0.7427368843927469, "train_speed(iter/s)": 0.090552 }, { "epoch": 0.11424876050873033, "grad_norm": 0.40044885873794556, "learning_rate": 0.00019775207237153865, "loss": 0.9111812591552735, "memory(GiB)": 88.73, "step": 530, "token_acc": 0.7547386819536139, "train_speed(iter/s)": 0.090552 }, { "epoch": 0.11532657900409571, "grad_norm": 0.37696319818496704, "learning_rate": 0.00019767630167337016, "loss": 0.9171828269958496, "memory(GiB)": 88.73, "step": 535, "token_acc": 0.7493121683252348, "train_speed(iter/s)": 0.090561 }, { "epoch": 0.11640439749946109, "grad_norm": 0.3833931088447571, "learning_rate": 0.00019759929005915453, "loss": 0.9039739608764649, "memory(GiB)": 88.73, "step": 540, "token_acc": 0.7577092101717906, "train_speed(iter/s)": 0.090572 }, { "epoch": 0.11748221599482647, "grad_norm": 0.41779956221580505, "learning_rate": 0.00019752103850727588, "loss": 0.9042394638061524, "memory(GiB)": 88.73, "step": 545, "token_acc": 0.7526614458161615, "train_speed(iter/s)": 0.090588 }, { "epoch": 0.11856003449019185, "grad_norm": 0.3538309633731842, "learning_rate": 0.000197441548011871, "loss": 0.9034321784973145, "memory(GiB)": 88.73, "step": 550, "token_acc": 0.7472319764941953, "train_speed(iter/s)": 0.090562 }, { "epoch": 0.11963785298555724, "grad_norm": 0.3503566384315491, "learning_rate": 0.0001973608195828168, "loss": 0.8913958549499512, "memory(GiB)": 88.73, "step": 555, "token_acc": 0.7465905802344089, "train_speed(iter/s)": 0.090569 }, { "epoch": 0.12071567148092262, "grad_norm": 0.38121363520622253, "learning_rate": 0.00019727885424571716, "loss": 0.9377856254577637, "memory(GiB)": 88.73, "step": 560, "token_acc": 0.734819903704204, "train_speed(iter/s)": 0.09059 }, { "epoch": 0.121793489976288, "grad_norm": 0.45212697982788086, "learning_rate": 0.00019719565304189023, "loss": 0.9161121368408203, "memory(GiB)": 88.73, "step": 565, "token_acc": 0.7443009649899771, "train_speed(iter/s)": 0.090601 }, { "epoch": 0.12287130847165337, "grad_norm": 0.3871101140975952, "learning_rate": 0.00019711121702835504, "loss": 0.9285185813903809, "memory(GiB)": 88.73, "step": 570, "token_acc": 0.735828178569485, "train_speed(iter/s)": 0.090608 }, { "epoch": 0.12394912696701875, "grad_norm": 0.4280000329017639, "learning_rate": 0.00019702554727781804, "loss": 0.9073677062988281, "memory(GiB)": 88.73, "step": 575, "token_acc": 0.7401557868357318, "train_speed(iter/s)": 0.0906 }, { "epoch": 0.12502694546238413, "grad_norm": 0.3899902403354645, "learning_rate": 0.00019693864487865966, "loss": 0.9002187728881836, "memory(GiB)": 88.73, "step": 580, "token_acc": 0.7439659863318178, "train_speed(iter/s)": 0.090613 }, { "epoch": 0.12610476395774953, "grad_norm": 0.3306461274623871, "learning_rate": 0.00019685051093492023, "loss": 0.8804924964904786, "memory(GiB)": 88.73, "step": 585, "token_acc": 0.7577803677554205, "train_speed(iter/s)": 0.090612 }, { "epoch": 0.1271825824531149, "grad_norm": 0.39648500084877014, "learning_rate": 0.00019676114656628612, "loss": 0.8946271896362304, "memory(GiB)": 88.73, "step": 590, "token_acc": 0.7538411938098747, "train_speed(iter/s)": 0.090601 }, { "epoch": 0.12826040094848026, "grad_norm": 0.37641412019729614, "learning_rate": 0.00019667055290807558, "loss": 0.8973848342895507, "memory(GiB)": 88.73, "step": 595, "token_acc": 0.7390115937775169, "train_speed(iter/s)": 0.090616 }, { "epoch": 0.12933821944384566, "grad_norm": 0.4099547266960144, "learning_rate": 0.000196578731111224, "loss": 0.9093647003173828, "memory(GiB)": 88.73, "step": 600, "token_acc": 0.7397350743452336, "train_speed(iter/s)": 0.090612 }, { "epoch": 0.13041603793921103, "grad_norm": 0.3916568160057068, "learning_rate": 0.00019648568234226975, "loss": 0.9189321517944335, "memory(GiB)": 88.73, "step": 605, "token_acc": 0.7469369468217395, "train_speed(iter/s)": 0.090615 }, { "epoch": 0.13149385643457642, "grad_norm": 0.3671991229057312, "learning_rate": 0.0001963914077833389, "loss": 0.9084060668945313, "memory(GiB)": 88.73, "step": 610, "token_acc": 0.739792536324059, "train_speed(iter/s)": 0.090604 }, { "epoch": 0.1325716749299418, "grad_norm": 0.37689363956451416, "learning_rate": 0.00019629590863213053, "loss": 0.9002798080444336, "memory(GiB)": 88.73, "step": 615, "token_acc": 0.7426240741957589, "train_speed(iter/s)": 0.090635 }, { "epoch": 0.1336494934253072, "grad_norm": 0.3764330744743347, "learning_rate": 0.00019619918610190134, "loss": 0.9026242256164551, "memory(GiB)": 88.73, "step": 620, "token_acc": 0.7549048882019216, "train_speed(iter/s)": 0.090632 }, { "epoch": 0.13472731192067255, "grad_norm": 0.39102256298065186, "learning_rate": 0.00019610124142145025, "loss": 0.9003114700317383, "memory(GiB)": 88.73, "step": 625, "token_acc": 0.7490746046037853, "train_speed(iter/s)": 0.090639 }, { "epoch": 0.13580513041603795, "grad_norm": 0.35801634192466736, "learning_rate": 0.000196002075835103, "loss": 0.8972740173339844, "memory(GiB)": 88.73, "step": 630, "token_acc": 0.7442300586682047, "train_speed(iter/s)": 0.090654 }, { "epoch": 0.13688294891140332, "grad_norm": 0.41567403078079224, "learning_rate": 0.00019590169060269602, "loss": 0.9096755981445312, "memory(GiB)": 88.73, "step": 635, "token_acc": 0.7445315908018553, "train_speed(iter/s)": 0.090675 }, { "epoch": 0.13796076740676871, "grad_norm": 0.3889220952987671, "learning_rate": 0.0001958000869995606, "loss": 0.8927362442016602, "memory(GiB)": 88.73, "step": 640, "token_acc": 0.7533186625967204, "train_speed(iter/s)": 0.090674 }, { "epoch": 0.13903858590213408, "grad_norm": 0.3593297600746155, "learning_rate": 0.0001956972663165067, "loss": 0.8948752403259277, "memory(GiB)": 88.73, "step": 645, "token_acc": 0.7487679182365424, "train_speed(iter/s)": 0.090658 }, { "epoch": 0.14011640439749945, "grad_norm": 0.38027486205101013, "learning_rate": 0.00019559322985980652, "loss": 0.9069482803344726, "memory(GiB)": 88.73, "step": 650, "token_acc": 0.7427287503585429, "train_speed(iter/s)": 0.090667 }, { "epoch": 0.14119422289286485, "grad_norm": 0.4165724813938141, "learning_rate": 0.00019548797895117785, "loss": 0.8847145080566406, "memory(GiB)": 88.73, "step": 655, "token_acc": 0.7531168509812133, "train_speed(iter/s)": 0.090676 }, { "epoch": 0.1422720413882302, "grad_norm": 0.4067953824996948, "learning_rate": 0.00019538151492776734, "loss": 0.9011445045471191, "memory(GiB)": 88.73, "step": 660, "token_acc": 0.7474171389005302, "train_speed(iter/s)": 0.090678 }, { "epoch": 0.1433498598835956, "grad_norm": 0.3498924970626831, "learning_rate": 0.00019527383914213359, "loss": 0.903720474243164, "memory(GiB)": 88.73, "step": 665, "token_acc": 0.756180741041969, "train_speed(iter/s)": 0.090659 }, { "epoch": 0.14442767837896098, "grad_norm": 0.40953177213668823, "learning_rate": 0.00019516495296222977, "loss": 0.8954207420349121, "memory(GiB)": 88.73, "step": 670, "token_acc": 0.7536564299424184, "train_speed(iter/s)": 0.090653 }, { "epoch": 0.14550549687432637, "grad_norm": 0.4148671329021454, "learning_rate": 0.0001950548577713864, "loss": 0.8987580299377441, "memory(GiB)": 88.73, "step": 675, "token_acc": 0.7409970166490232, "train_speed(iter/s)": 0.09066 }, { "epoch": 0.14658331536969174, "grad_norm": 0.3642005920410156, "learning_rate": 0.00019494355496829367, "loss": 0.8843393325805664, "memory(GiB)": 88.73, "step": 680, "token_acc": 0.7476116501241848, "train_speed(iter/s)": 0.090668 }, { "epoch": 0.14766113386505714, "grad_norm": 0.3430444002151489, "learning_rate": 0.0001948310459669838, "loss": 0.9020472526550293, "memory(GiB)": 88.73, "step": 685, "token_acc": 0.7397709621711319, "train_speed(iter/s)": 0.090683 }, { "epoch": 0.1487389523604225, "grad_norm": 0.37738940119743347, "learning_rate": 0.00019471733219681296, "loss": 0.8922735214233398, "memory(GiB)": 88.73, "step": 690, "token_acc": 0.7580809206001724, "train_speed(iter/s)": 0.090709 }, { "epoch": 0.14981677085578787, "grad_norm": 0.3643823266029358, "learning_rate": 0.00019460241510244316, "loss": 0.9264860153198242, "memory(GiB)": 88.73, "step": 695, "token_acc": 0.7378929783717499, "train_speed(iter/s)": 0.090711 }, { "epoch": 0.15089458935115327, "grad_norm": 0.3655630052089691, "learning_rate": 0.0001944862961438239, "loss": 0.9023061752319336, "memory(GiB)": 88.73, "step": 700, "token_acc": 0.7475738499649285, "train_speed(iter/s)": 0.090716 }, { "epoch": 0.15197240784651864, "grad_norm": 0.3566775321960449, "learning_rate": 0.00019436897679617362, "loss": 0.8793312072753906, "memory(GiB)": 88.73, "step": 705, "token_acc": 0.7467480880959445, "train_speed(iter/s)": 0.090702 }, { "epoch": 0.15305022634188403, "grad_norm": 0.3326597213745117, "learning_rate": 0.00019425045854996084, "loss": 0.8805784225463867, "memory(GiB)": 88.73, "step": 710, "token_acc": 0.7470807156514614, "train_speed(iter/s)": 0.090697 }, { "epoch": 0.1541280448372494, "grad_norm": 0.31456875801086426, "learning_rate": 0.0001941307429108855, "loss": 0.8999569892883301, "memory(GiB)": 88.73, "step": 715, "token_acc": 0.7521138928715856, "train_speed(iter/s)": 0.090696 }, { "epoch": 0.1552058633326148, "grad_norm": 0.46562129259109497, "learning_rate": 0.00019400983139985955, "loss": 0.9167325019836425, "memory(GiB)": 88.73, "step": 720, "token_acc": 0.7451846240542463, "train_speed(iter/s)": 0.090696 }, { "epoch": 0.15628368182798016, "grad_norm": 0.41583460569381714, "learning_rate": 0.00019388772555298776, "loss": 0.9117277145385743, "memory(GiB)": 88.73, "step": 725, "token_acc": 0.7365026751454666, "train_speed(iter/s)": 0.090709 }, { "epoch": 0.15736150032334556, "grad_norm": 0.34480807185173035, "learning_rate": 0.00019376442692154824, "loss": 0.914979362487793, "memory(GiB)": 88.73, "step": 730, "token_acc": 0.750650248433787, "train_speed(iter/s)": 0.090699 }, { "epoch": 0.15843931881871093, "grad_norm": 0.34430089592933655, "learning_rate": 0.00019363993707197262, "loss": 0.8847478866577149, "memory(GiB)": 88.73, "step": 735, "token_acc": 0.7562554485931037, "train_speed(iter/s)": 0.090703 }, { "epoch": 0.1595171373140763, "grad_norm": 0.35195833444595337, "learning_rate": 0.0001935142575858262, "loss": 0.8971000671386719, "memory(GiB)": 88.73, "step": 740, "token_acc": 0.7591823669288458, "train_speed(iter/s)": 0.090705 }, { "epoch": 0.1605949558094417, "grad_norm": 0.6069426536560059, "learning_rate": 0.00019338739005978793, "loss": 0.9084870338439941, "memory(GiB)": 88.73, "step": 745, "token_acc": 0.7469669451507447, "train_speed(iter/s)": 0.090692 }, { "epoch": 0.16167277430480706, "grad_norm": 0.377095103263855, "learning_rate": 0.00019325933610563002, "loss": 0.8869780540466309, "memory(GiB)": 88.73, "step": 750, "token_acc": 0.7545626405526238, "train_speed(iter/s)": 0.090682 }, { "epoch": 0.16275059280017246, "grad_norm": 0.37895113229751587, "learning_rate": 0.0001931300973501975, "loss": 0.8869126319885254, "memory(GiB)": 88.73, "step": 755, "token_acc": 0.7465400034861426, "train_speed(iter/s)": 0.09067 }, { "epoch": 0.16382841129553782, "grad_norm": 0.40123969316482544, "learning_rate": 0.00019299967543538754, "loss": 0.9023336410522461, "memory(GiB)": 88.73, "step": 760, "token_acc": 0.7491372421605388, "train_speed(iter/s)": 0.090649 }, { "epoch": 0.16490622979090322, "grad_norm": 0.3021450340747833, "learning_rate": 0.00019286807201812867, "loss": 0.8775389671325684, "memory(GiB)": 88.73, "step": 765, "token_acc": 0.7665111016829302, "train_speed(iter/s)": 0.090654 }, { "epoch": 0.1659840482862686, "grad_norm": 0.3383391499519348, "learning_rate": 0.0001927352887703596, "loss": 0.8915297508239746, "memory(GiB)": 88.73, "step": 770, "token_acc": 0.7411729918036574, "train_speed(iter/s)": 0.090658 }, { "epoch": 0.16706186678163398, "grad_norm": 0.32876160740852356, "learning_rate": 0.00019260132737900808, "loss": 0.8692303657531738, "memory(GiB)": 88.73, "step": 775, "token_acc": 0.7506594648913453, "train_speed(iter/s)": 0.090675 }, { "epoch": 0.16813968527699935, "grad_norm": 0.34362098574638367, "learning_rate": 0.0001924661895459694, "loss": 0.9025952339172363, "memory(GiB)": 88.73, "step": 780, "token_acc": 0.7397033717456253, "train_speed(iter/s)": 0.09067 }, { "epoch": 0.16921750377236475, "grad_norm": 0.3517121374607086, "learning_rate": 0.0001923298769880849, "loss": 0.8839033126831055, "memory(GiB)": 88.73, "step": 785, "token_acc": 0.7358573967326099, "train_speed(iter/s)": 0.090686 }, { "epoch": 0.17029532226773011, "grad_norm": 0.37859269976615906, "learning_rate": 0.0001921923914371199, "loss": 0.9059449195861816, "memory(GiB)": 88.73, "step": 790, "token_acc": 0.747459754633829, "train_speed(iter/s)": 0.090687 }, { "epoch": 0.17137314076309548, "grad_norm": 0.31685298681259155, "learning_rate": 0.00019205373463974202, "loss": 0.8852771759033203, "memory(GiB)": 88.73, "step": 795, "token_acc": 0.7394449767495012, "train_speed(iter/s)": 0.090674 }, { "epoch": 0.17245095925846088, "grad_norm": 0.34116417169570923, "learning_rate": 0.0001919139083574987, "loss": 0.8852038383483887, "memory(GiB)": 88.73, "step": 800, "token_acc": 0.7471696281192349, "train_speed(iter/s)": 0.090685 }, { "epoch": 0.17352877775382625, "grad_norm": 0.33934485912323, "learning_rate": 0.00019177291436679505, "loss": 0.9038798332214355, "memory(GiB)": 88.73, "step": 805, "token_acc": 0.745712899297767, "train_speed(iter/s)": 0.090691 }, { "epoch": 0.17460659624919164, "grad_norm": 0.3100833296775818, "learning_rate": 0.00019163075445887114, "loss": 0.8809244155883789, "memory(GiB)": 88.73, "step": 810, "token_acc": 0.7441435642272463, "train_speed(iter/s)": 0.09067 }, { "epoch": 0.175684414744557, "grad_norm": 0.319581001996994, "learning_rate": 0.00019148743043977926, "loss": 0.9050381660461426, "memory(GiB)": 88.73, "step": 815, "token_acc": 0.7535669733354124, "train_speed(iter/s)": 0.090672 }, { "epoch": 0.1767622332399224, "grad_norm": 0.35274654626846313, "learning_rate": 0.00019134294413036105, "loss": 0.8914363861083985, "memory(GiB)": 88.73, "step": 820, "token_acc": 0.7443278759937562, "train_speed(iter/s)": 0.090678 }, { "epoch": 0.17784005173528777, "grad_norm": 0.35693061351776123, "learning_rate": 0.0001911972973662243, "loss": 0.8710582733154297, "memory(GiB)": 88.73, "step": 825, "token_acc": 0.7484593037100539, "train_speed(iter/s)": 0.090678 }, { "epoch": 0.17891787023065317, "grad_norm": 0.3604641556739807, "learning_rate": 0.00019105049199771962, "loss": 0.8978882789611816, "memory(GiB)": 88.73, "step": 830, "token_acc": 0.761647535449021, "train_speed(iter/s)": 0.090676 }, { "epoch": 0.17999568872601854, "grad_norm": 0.35102489590644836, "learning_rate": 0.000190902529889917, "loss": 0.8884178161621094, "memory(GiB)": 88.73, "step": 835, "token_acc": 0.7559235832514757, "train_speed(iter/s)": 0.090681 }, { "epoch": 0.1810735072213839, "grad_norm": 0.3431907892227173, "learning_rate": 0.00019075341292258208, "loss": 0.8857933044433594, "memory(GiB)": 88.73, "step": 840, "token_acc": 0.7473226659947083, "train_speed(iter/s)": 0.09068 }, { "epoch": 0.1821513257167493, "grad_norm": 0.35465049743652344, "learning_rate": 0.00019060314299015226, "loss": 0.8795783042907714, "memory(GiB)": 88.73, "step": 845, "token_acc": 0.7535403403923606, "train_speed(iter/s)": 0.090679 }, { "epoch": 0.18322914421211467, "grad_norm": 0.347788542509079, "learning_rate": 0.00019045172200171255, "loss": 0.9144908905029296, "memory(GiB)": 88.73, "step": 850, "token_acc": 0.7509559221324629, "train_speed(iter/s)": 0.090685 }, { "epoch": 0.18430696270748007, "grad_norm": 0.3288102447986603, "learning_rate": 0.00019029915188097153, "loss": 0.8947425842285156, "memory(GiB)": 88.73, "step": 855, "token_acc": 0.7507136322049406, "train_speed(iter/s)": 0.090693 }, { "epoch": 0.18538478120284543, "grad_norm": 0.32153844833374023, "learning_rate": 0.00019014543456623674, "loss": 0.8780288696289062, "memory(GiB)": 88.73, "step": 860, "token_acc": 0.7500145402376844, "train_speed(iter/s)": 0.090681 }, { "epoch": 0.18646259969821083, "grad_norm": 0.31643739342689514, "learning_rate": 0.00018999057201039005, "loss": 0.8857738494873046, "memory(GiB)": 88.73, "step": 865, "token_acc": 0.7584359663324207, "train_speed(iter/s)": 0.090686 }, { "epoch": 0.1875404181935762, "grad_norm": 0.3230378329753876, "learning_rate": 0.000189834566180863, "loss": 0.8978780746459961, "memory(GiB)": 88.73, "step": 870, "token_acc": 0.7440366556276369, "train_speed(iter/s)": 0.0907 }, { "epoch": 0.1886182366889416, "grad_norm": 0.3118763566017151, "learning_rate": 0.00018967741905961155, "loss": 0.8761393547058105, "memory(GiB)": 88.73, "step": 875, "token_acc": 0.7547009704355676, "train_speed(iter/s)": 0.0907 }, { "epoch": 0.18969605518430696, "grad_norm": 0.3721042275428772, "learning_rate": 0.00018951913264309128, "loss": 0.8656279563903808, "memory(GiB)": 88.73, "step": 880, "token_acc": 0.7639512450991723, "train_speed(iter/s)": 0.090696 }, { "epoch": 0.19077387367967236, "grad_norm": 0.41057825088500977, "learning_rate": 0.00018935970894223153, "loss": 0.8707589149475098, "memory(GiB)": 88.73, "step": 885, "token_acc": 0.753261570978331, "train_speed(iter/s)": 0.090704 }, { "epoch": 0.19185169217503772, "grad_norm": 0.34374094009399414, "learning_rate": 0.00018919914998241034, "loss": 0.9091244697570801, "memory(GiB)": 88.73, "step": 890, "token_acc": 0.7314947881610303, "train_speed(iter/s)": 0.090695 }, { "epoch": 0.1929295106704031, "grad_norm": 0.3008347749710083, "learning_rate": 0.00018903745780342839, "loss": 0.8685267448425293, "memory(GiB)": 88.73, "step": 895, "token_acc": 0.7563539043406343, "train_speed(iter/s)": 0.090701 }, { "epoch": 0.1940073291657685, "grad_norm": 0.34797072410583496, "learning_rate": 0.00018887463445948325, "loss": 0.8904449462890625, "memory(GiB)": 88.73, "step": 900, "token_acc": 0.7543671228852844, "train_speed(iter/s)": 0.090705 }, { "epoch": 0.19508514766113386, "grad_norm": 0.4099235236644745, "learning_rate": 0.0001887106820191432, "loss": 0.8687204360961914, "memory(GiB)": 88.73, "step": 905, "token_acc": 0.7498371458512315, "train_speed(iter/s)": 0.09069 }, { "epoch": 0.19616296615649925, "grad_norm": 0.29887574911117554, "learning_rate": 0.000188545602565321, "loss": 0.8879858016967773, "memory(GiB)": 88.73, "step": 910, "token_acc": 0.752286512901447, "train_speed(iter/s)": 0.090694 }, { "epoch": 0.19724078465186462, "grad_norm": 0.3665757179260254, "learning_rate": 0.00018837939819524738, "loss": 0.8874917984008789, "memory(GiB)": 88.73, "step": 915, "token_acc": 0.7575448501957928, "train_speed(iter/s)": 0.090698 }, { "epoch": 0.19831860314723002, "grad_norm": 0.3288756310939789, "learning_rate": 0.00018821207102044448, "loss": 0.8806957244873047, "memory(GiB)": 88.73, "step": 920, "token_acc": 0.7463729331931568, "train_speed(iter/s)": 0.090695 }, { "epoch": 0.19939642164259538, "grad_norm": 0.32552266120910645, "learning_rate": 0.0001880436231666989, "loss": 0.9038246154785157, "memory(GiB)": 88.73, "step": 925, "token_acc": 0.741014004510399, "train_speed(iter/s)": 0.090701 }, { "epoch": 0.20047424013796078, "grad_norm": 0.3395576477050781, "learning_rate": 0.0001878740567740348, "loss": 0.9073991775512695, "memory(GiB)": 88.73, "step": 930, "token_acc": 0.7383182601205857, "train_speed(iter/s)": 0.090697 }, { "epoch": 0.20155205863332615, "grad_norm": 0.3467939496040344, "learning_rate": 0.00018770337399668668, "loss": 0.9035392761230469, "memory(GiB)": 88.73, "step": 935, "token_acc": 0.7394750154572537, "train_speed(iter/s)": 0.090702 }, { "epoch": 0.20262987712869152, "grad_norm": 0.3339230716228485, "learning_rate": 0.00018753157700307195, "loss": 0.8912302017211914, "memory(GiB)": 88.73, "step": 940, "token_acc": 0.7486076616241439, "train_speed(iter/s)": 0.090707 }, { "epoch": 0.2037076956240569, "grad_norm": 0.2919709384441376, "learning_rate": 0.00018735866797576356, "loss": 0.9139111518859864, "memory(GiB)": 88.73, "step": 945, "token_acc": 0.7504745068850018, "train_speed(iter/s)": 0.090715 }, { "epoch": 0.20478551411942228, "grad_norm": 0.31341609358787537, "learning_rate": 0.000187184649111462, "loss": 0.8773763656616211, "memory(GiB)": 88.73, "step": 950, "token_acc": 0.7500311338041951, "train_speed(iter/s)": 0.090705 }, { "epoch": 0.20586333261478768, "grad_norm": 0.3128372132778168, "learning_rate": 0.00018700952262096766, "loss": 0.8929924011230469, "memory(GiB)": 88.73, "step": 955, "token_acc": 0.7369707292688797, "train_speed(iter/s)": 0.090685 }, { "epoch": 0.20694115111015304, "grad_norm": 0.3436991274356842, "learning_rate": 0.00018683329072915252, "loss": 0.8958606719970703, "memory(GiB)": 88.73, "step": 960, "token_acc": 0.7552114573624812, "train_speed(iter/s)": 0.090692 }, { "epoch": 0.20801896960551844, "grad_norm": 0.35602518916130066, "learning_rate": 0.0001866559556749321, "loss": 0.9087234497070312, "memory(GiB)": 88.73, "step": 965, "token_acc": 0.7405176928256153, "train_speed(iter/s)": 0.090693 }, { "epoch": 0.2090967881008838, "grad_norm": 0.33853679895401, "learning_rate": 0.00018647751971123685, "loss": 0.8780548095703125, "memory(GiB)": 88.73, "step": 970, "token_acc": 0.7471214011562015, "train_speed(iter/s)": 0.090698 }, { "epoch": 0.2101746065962492, "grad_norm": 0.31715530157089233, "learning_rate": 0.00018629798510498354, "loss": 0.8864520072937012, "memory(GiB)": 88.73, "step": 975, "token_acc": 0.745318245574131, "train_speed(iter/s)": 0.090692 }, { "epoch": 0.21125242509161457, "grad_norm": 0.3075437545776367, "learning_rate": 0.00018611735413704664, "loss": 0.880126667022705, "memory(GiB)": 88.73, "step": 980, "token_acc": 0.7409638554216867, "train_speed(iter/s)": 0.0907 }, { "epoch": 0.21233024358697994, "grad_norm": 0.3181777596473694, "learning_rate": 0.000185935629102229, "loss": 0.8719268798828125, "memory(GiB)": 88.73, "step": 985, "token_acc": 0.7483757807885741, "train_speed(iter/s)": 0.090698 }, { "epoch": 0.21340806208234533, "grad_norm": 0.36789169907569885, "learning_rate": 0.00018575281230923308, "loss": 0.8861351013183594, "memory(GiB)": 88.73, "step": 990, "token_acc": 0.749486327039921, "train_speed(iter/s)": 0.090703 }, { "epoch": 0.2144858805777107, "grad_norm": 0.33004286885261536, "learning_rate": 0.0001855689060806314, "loss": 0.8615049362182617, "memory(GiB)": 88.73, "step": 995, "token_acc": 0.7494673378055248, "train_speed(iter/s)": 0.090699 }, { "epoch": 0.2155636990730761, "grad_norm": 0.30279314517974854, "learning_rate": 0.00018538391275283703, "loss": 0.8792425155639648, "memory(GiB)": 88.73, "step": 1000, "token_acc": 0.7383059418457648, "train_speed(iter/s)": 0.090706 }, { "epoch": 0.21664151756844147, "grad_norm": 0.31002071499824524, "learning_rate": 0.00018519783467607395, "loss": 0.8697021484375, "memory(GiB)": 88.73, "step": 1005, "token_acc": 0.7535877889137105, "train_speed(iter/s)": 0.090711 }, { "epoch": 0.21771933606380686, "grad_norm": 0.39754360914230347, "learning_rate": 0.00018501067421434727, "loss": 0.883603572845459, "memory(GiB)": 88.73, "step": 1010, "token_acc": 0.7586370048453094, "train_speed(iter/s)": 0.090697 }, { "epoch": 0.21879715455917223, "grad_norm": 0.3268727958202362, "learning_rate": 0.00018482243374541303, "loss": 0.8844954490661621, "memory(GiB)": 88.73, "step": 1015, "token_acc": 0.7483813093408888, "train_speed(iter/s)": 0.09068 }, { "epoch": 0.21987497305453763, "grad_norm": 0.3095914423465729, "learning_rate": 0.00018463311566074812, "loss": 0.8940093994140625, "memory(GiB)": 88.73, "step": 1020, "token_acc": 0.7396316007802226, "train_speed(iter/s)": 0.09069 }, { "epoch": 0.220952791549903, "grad_norm": 0.31101733446121216, "learning_rate": 0.0001844427223655199, "loss": 0.8754401206970215, "memory(GiB)": 88.73, "step": 1025, "token_acc": 0.7427362827856497, "train_speed(iter/s)": 0.09069 }, { "epoch": 0.2220306100452684, "grad_norm": 0.31033429503440857, "learning_rate": 0.00018425125627855558, "loss": 0.8796735763549804, "memory(GiB)": 88.73, "step": 1030, "token_acc": 0.7453862710079223, "train_speed(iter/s)": 0.09069 }, { "epoch": 0.22310842854063376, "grad_norm": 0.3059726059436798, "learning_rate": 0.00018405871983231148, "loss": 0.8886590003967285, "memory(GiB)": 88.73, "step": 1035, "token_acc": 0.7429312168321743, "train_speed(iter/s)": 0.090687 }, { "epoch": 0.22418624703599913, "grad_norm": 0.3207603693008423, "learning_rate": 0.0001838651154728422, "loss": 0.9112618446350098, "memory(GiB)": 88.73, "step": 1040, "token_acc": 0.7486375762195122, "train_speed(iter/s)": 0.090685 }, { "epoch": 0.22526406553136452, "grad_norm": 0.34268030524253845, "learning_rate": 0.00018367044565976946, "loss": 0.8889823913574219, "memory(GiB)": 88.73, "step": 1045, "token_acc": 0.7414696799418248, "train_speed(iter/s)": 0.09068 }, { "epoch": 0.2263418840267299, "grad_norm": 0.33225882053375244, "learning_rate": 0.00018347471286625099, "loss": 0.8507140159606934, "memory(GiB)": 88.73, "step": 1050, "token_acc": 0.7703410970038398, "train_speed(iter/s)": 0.090689 }, { "epoch": 0.22741970252209529, "grad_norm": 0.30311882495880127, "learning_rate": 0.00018327791957894888, "loss": 0.8598455429077149, "memory(GiB)": 88.73, "step": 1055, "token_acc": 0.758344484091585, "train_speed(iter/s)": 0.090681 }, { "epoch": 0.22849752101746065, "grad_norm": 0.33225154876708984, "learning_rate": 0.0001830800682979983, "loss": 0.867893123626709, "memory(GiB)": 88.73, "step": 1060, "token_acc": 0.7509079820497183, "train_speed(iter/s)": 0.090684 }, { "epoch": 0.22957533951282605, "grad_norm": 0.3222039043903351, "learning_rate": 0.00018288116153697542, "loss": 0.8614047050476075, "memory(GiB)": 88.73, "step": 1065, "token_acc": 0.7547905556951541, "train_speed(iter/s)": 0.090694 }, { "epoch": 0.23065315800819142, "grad_norm": 0.3672635555267334, "learning_rate": 0.00018268120182286568, "loss": 0.8906951904296875, "memory(GiB)": 88.73, "step": 1070, "token_acc": 0.7362211656981863, "train_speed(iter/s)": 0.090694 }, { "epoch": 0.2317309765035568, "grad_norm": 0.3156149387359619, "learning_rate": 0.00018248019169603166, "loss": 0.8801830291748047, "memory(GiB)": 88.73, "step": 1075, "token_acc": 0.7391451710728174, "train_speed(iter/s)": 0.090693 }, { "epoch": 0.23280879499892218, "grad_norm": 0.31747502088546753, "learning_rate": 0.00018227813371018064, "loss": 0.8821067810058594, "memory(GiB)": 88.73, "step": 1080, "token_acc": 0.746450067200374, "train_speed(iter/s)": 0.090703 }, { "epoch": 0.23388661349428755, "grad_norm": 0.2803831994533539, "learning_rate": 0.0001820750304323325, "loss": 0.8797122001647949, "memory(GiB)": 88.73, "step": 1085, "token_acc": 0.7454967310269162, "train_speed(iter/s)": 0.090697 }, { "epoch": 0.23496443198965294, "grad_norm": 0.31105804443359375, "learning_rate": 0.00018187088444278674, "loss": 0.8659972190856934, "memory(GiB)": 88.73, "step": 1090, "token_acc": 0.7436706912774695, "train_speed(iter/s)": 0.090698 }, { "epoch": 0.2360422504850183, "grad_norm": 0.31607556343078613, "learning_rate": 0.0001816656983350899, "loss": 0.8563562393188476, "memory(GiB)": 88.73, "step": 1095, "token_acc": 0.769275520066058, "train_speed(iter/s)": 0.090692 }, { "epoch": 0.2371200689803837, "grad_norm": 0.3036384582519531, "learning_rate": 0.00018145947471600256, "loss": 0.874875259399414, "memory(GiB)": 88.73, "step": 1100, "token_acc": 0.7615142016499683, "train_speed(iter/s)": 0.090701 }, { "epoch": 0.23819788747574908, "grad_norm": 0.3247828185558319, "learning_rate": 0.00018125221620546623, "loss": 0.8694080352783203, "memory(GiB)": 88.73, "step": 1105, "token_acc": 0.7480637854820443, "train_speed(iter/s)": 0.090715 }, { "epoch": 0.23927570597111447, "grad_norm": 0.29979124665260315, "learning_rate": 0.0001810439254365701, "loss": 0.8815859794616699, "memory(GiB)": 88.73, "step": 1110, "token_acc": 0.7436300637222186, "train_speed(iter/s)": 0.090715 }, { "epoch": 0.24035352446647984, "grad_norm": 0.32453835010528564, "learning_rate": 0.0001808346050555175, "loss": 0.8731791496276855, "memory(GiB)": 88.73, "step": 1115, "token_acc": 0.7523246899661782, "train_speed(iter/s)": 0.090718 }, { "epoch": 0.24143134296184524, "grad_norm": 0.2976701855659485, "learning_rate": 0.0001806242577215924, "loss": 0.8836530685424805, "memory(GiB)": 88.73, "step": 1120, "token_acc": 0.7466696812118472, "train_speed(iter/s)": 0.090723 }, { "epoch": 0.2425091614572106, "grad_norm": 0.3211086690425873, "learning_rate": 0.00018041288610712555, "loss": 0.8603124618530273, "memory(GiB)": 88.73, "step": 1125, "token_acc": 0.7471335522420197, "train_speed(iter/s)": 0.090719 }, { "epoch": 0.243586979952576, "grad_norm": 0.31973403692245483, "learning_rate": 0.00018020049289746044, "loss": 0.8624408721923829, "memory(GiB)": 88.73, "step": 1130, "token_acc": 0.7540864887233603, "train_speed(iter/s)": 0.090728 }, { "epoch": 0.24466479844794137, "grad_norm": 0.31327131390571594, "learning_rate": 0.00017998708079091942, "loss": 0.8567747116088867, "memory(GiB)": 88.73, "step": 1135, "token_acc": 0.7555419952136231, "train_speed(iter/s)": 0.090734 }, { "epoch": 0.24574261694330674, "grad_norm": 0.34130340814590454, "learning_rate": 0.00017977265249876918, "loss": 0.8735370635986328, "memory(GiB)": 88.73, "step": 1140, "token_acc": 0.7571458170058284, "train_speed(iter/s)": 0.09072 }, { "epoch": 0.24682043543867213, "grad_norm": 0.3064236342906952, "learning_rate": 0.00017955721074518647, "loss": 0.871483039855957, "memory(GiB)": 88.73, "step": 1145, "token_acc": 0.7530870893673953, "train_speed(iter/s)": 0.090727 }, { "epoch": 0.2478982539340375, "grad_norm": 0.32768213748931885, "learning_rate": 0.00017934075826722348, "loss": 0.8899338722229004, "memory(GiB)": 88.73, "step": 1150, "token_acc": 0.7570873061406624, "train_speed(iter/s)": 0.090737 }, { "epoch": 0.2489760724294029, "grad_norm": 0.30989205837249756, "learning_rate": 0.00017912329781477287, "loss": 0.8955196380615235, "memory(GiB)": 88.73, "step": 1155, "token_acc": 0.7495791869805248, "train_speed(iter/s)": 0.090724 }, { "epoch": 0.25005389092476826, "grad_norm": 0.2907588481903076, "learning_rate": 0.0001789048321505331, "loss": 0.8667471885681153, "memory(GiB)": 88.73, "step": 1160, "token_acc": 0.7604598129101318, "train_speed(iter/s)": 0.090723 }, { "epoch": 0.25113170942013363, "grad_norm": 0.3075993061065674, "learning_rate": 0.0001786853640499732, "loss": 0.8907232284545898, "memory(GiB)": 88.73, "step": 1165, "token_acc": 0.7471097602409528, "train_speed(iter/s)": 0.090736 }, { "epoch": 0.25220952791549905, "grad_norm": 0.3170734643936157, "learning_rate": 0.0001784648963012975, "loss": 0.8890779495239258, "memory(GiB)": 88.73, "step": 1170, "token_acc": 0.7411138636572162, "train_speed(iter/s)": 0.090724 }, { "epoch": 0.2532873464108644, "grad_norm": 0.28377771377563477, "learning_rate": 0.0001782434317054102, "loss": 0.865540885925293, "memory(GiB)": 88.73, "step": 1175, "token_acc": 0.7560227575784514, "train_speed(iter/s)": 0.090727 }, { "epoch": 0.2543651649062298, "grad_norm": 0.2996067702770233, "learning_rate": 0.00017802097307587992, "loss": 0.8845622062683105, "memory(GiB)": 88.73, "step": 1180, "token_acc": 0.7599559021533846, "train_speed(iter/s)": 0.090734 }, { "epoch": 0.25544298340159516, "grad_norm": 0.31120091676712036, "learning_rate": 0.0001777975232389037, "loss": 0.8686803817749024, "memory(GiB)": 88.73, "step": 1185, "token_acc": 0.7443069125101006, "train_speed(iter/s)": 0.090738 }, { "epoch": 0.2565208018969605, "grad_norm": 0.3394160568714142, "learning_rate": 0.0001775730850332714, "loss": 0.874500846862793, "memory(GiB)": 88.73, "step": 1190, "token_acc": 0.7424460431654676, "train_speed(iter/s)": 0.090739 }, { "epoch": 0.25759862039232595, "grad_norm": 0.3104134202003479, "learning_rate": 0.0001773476613103294, "loss": 0.8683677673339844, "memory(GiB)": 88.73, "step": 1195, "token_acc": 0.7540851836441351, "train_speed(iter/s)": 0.090743 }, { "epoch": 0.2586764388876913, "grad_norm": 0.27440857887268066, "learning_rate": 0.00017712125493394448, "loss": 0.8579555511474609, "memory(GiB)": 88.73, "step": 1200, "token_acc": 0.7529050403375028, "train_speed(iter/s)": 0.09074 }, { "epoch": 0.2597542573830567, "grad_norm": 0.3069484233856201, "learning_rate": 0.00017689386878046745, "loss": 0.8774654388427734, "memory(GiB)": 88.73, "step": 1205, "token_acc": 0.7612315367217967, "train_speed(iter/s)": 0.090735 }, { "epoch": 0.26083207587842205, "grad_norm": 0.2612002491950989, "learning_rate": 0.00017666550573869648, "loss": 0.8495336532592773, "memory(GiB)": 88.73, "step": 1210, "token_acc": 0.750131291633268, "train_speed(iter/s)": 0.090727 }, { "epoch": 0.2619098943737875, "grad_norm": 0.3644395172595978, "learning_rate": 0.00017643616870984053, "loss": 0.8798219680786132, "memory(GiB)": 88.73, "step": 1215, "token_acc": 0.7507542353214203, "train_speed(iter/s)": 0.090733 }, { "epoch": 0.26298771286915285, "grad_norm": 0.3031451404094696, "learning_rate": 0.00017620586060748252, "loss": 0.8655767440795898, "memory(GiB)": 88.73, "step": 1220, "token_acc": 0.7503253955159581, "train_speed(iter/s)": 0.090735 }, { "epoch": 0.2640655313645182, "grad_norm": 0.2845922112464905, "learning_rate": 0.00017597458435754217, "loss": 0.8762717247009277, "memory(GiB)": 88.73, "step": 1225, "token_acc": 0.7442910686534419, "train_speed(iter/s)": 0.090727 }, { "epoch": 0.2651433498598836, "grad_norm": 0.2965712249279022, "learning_rate": 0.00017574234289823894, "loss": 0.8423262596130371, "memory(GiB)": 88.73, "step": 1230, "token_acc": 0.7561903888481292, "train_speed(iter/s)": 0.09072 }, { "epoch": 0.26622116835524895, "grad_norm": 0.27077746391296387, "learning_rate": 0.00017550913918005465, "loss": 0.8373355865478516, "memory(GiB)": 88.73, "step": 1235, "token_acc": 0.7643992279524505, "train_speed(iter/s)": 0.090722 }, { "epoch": 0.2672989868506144, "grad_norm": 0.31012821197509766, "learning_rate": 0.00017527497616569608, "loss": 0.8678425788879395, "memory(GiB)": 88.73, "step": 1240, "token_acc": 0.7507263439737187, "train_speed(iter/s)": 0.090722 }, { "epoch": 0.26837680534597974, "grad_norm": 0.28305676579475403, "learning_rate": 0.00017503985683005717, "loss": 0.8675318717956543, "memory(GiB)": 88.73, "step": 1245, "token_acc": 0.7443765107351059, "train_speed(iter/s)": 0.09073 }, { "epoch": 0.2694546238413451, "grad_norm": 0.2915281057357788, "learning_rate": 0.00017480378416018135, "loss": 0.8683904647827149, "memory(GiB)": 88.73, "step": 1250, "token_acc": 0.7417681634758895, "train_speed(iter/s)": 0.090735 }, { "epoch": 0.2705324423367105, "grad_norm": 0.3140674829483032, "learning_rate": 0.00017456676115522362, "loss": 0.8825206756591797, "memory(GiB)": 88.73, "step": 1255, "token_acc": 0.7448676541559937, "train_speed(iter/s)": 0.090735 }, { "epoch": 0.2716102608320759, "grad_norm": 0.3252476453781128, "learning_rate": 0.00017432879082641232, "loss": 0.8588446617126465, "memory(GiB)": 88.73, "step": 1260, "token_acc": 0.7611674090901024, "train_speed(iter/s)": 0.090727 }, { "epoch": 0.27268807932744127, "grad_norm": 0.30587416887283325, "learning_rate": 0.000174089876197011, "loss": 0.8756070137023926, "memory(GiB)": 88.73, "step": 1265, "token_acc": 0.7418996995608967, "train_speed(iter/s)": 0.090736 }, { "epoch": 0.27376589782280664, "grad_norm": 0.30624493956565857, "learning_rate": 0.00017385002030227985, "loss": 0.8816617012023926, "memory(GiB)": 88.73, "step": 1270, "token_acc": 0.7478603911082656, "train_speed(iter/s)": 0.090743 }, { "epoch": 0.274843716318172, "grad_norm": 0.3082628846168518, "learning_rate": 0.0001736092261894374, "loss": 0.8604524612426758, "memory(GiB)": 88.73, "step": 1275, "token_acc": 0.7608873622951111, "train_speed(iter/s)": 0.090746 }, { "epoch": 0.27592153481353743, "grad_norm": 0.29148080945014954, "learning_rate": 0.00017336749691762156, "loss": 0.8643165588378906, "memory(GiB)": 88.73, "step": 1280, "token_acc": 0.7477799262362588, "train_speed(iter/s)": 0.090748 }, { "epoch": 0.2769993533089028, "grad_norm": 0.2830874025821686, "learning_rate": 0.00017312483555785086, "loss": 0.8682438850402832, "memory(GiB)": 88.73, "step": 1285, "token_acc": 0.7435218619618702, "train_speed(iter/s)": 0.090757 }, { "epoch": 0.27807717180426816, "grad_norm": 0.28748950362205505, "learning_rate": 0.0001728812451929854, "loss": 0.8669134140014648, "memory(GiB)": 88.73, "step": 1290, "token_acc": 0.768884883982879, "train_speed(iter/s)": 0.090758 }, { "epoch": 0.27915499029963353, "grad_norm": 0.312134325504303, "learning_rate": 0.00017263672891768776, "loss": 0.8656195640563965, "memory(GiB)": 88.73, "step": 1295, "token_acc": 0.749933881131955, "train_speed(iter/s)": 0.090761 }, { "epoch": 0.2802328087949989, "grad_norm": 0.3052614629268646, "learning_rate": 0.00017239128983838358, "loss": 0.8570053100585937, "memory(GiB)": 88.73, "step": 1300, "token_acc": 0.7570742883109752, "train_speed(iter/s)": 0.090755 }, { "epoch": 0.2813106272903643, "grad_norm": 0.27821168303489685, "learning_rate": 0.00017214493107322215, "loss": 0.8532657623291016, "memory(GiB)": 88.73, "step": 1305, "token_acc": 0.7516083767970135, "train_speed(iter/s)": 0.090749 }, { "epoch": 0.2823884457857297, "grad_norm": 0.2938804030418396, "learning_rate": 0.00017189765575203686, "loss": 0.8696994781494141, "memory(GiB)": 88.73, "step": 1310, "token_acc": 0.7574029372201441, "train_speed(iter/s)": 0.09076 }, { "epoch": 0.28346626428109506, "grad_norm": 0.3080523908138275, "learning_rate": 0.00017164946701630525, "loss": 0.8871194839477539, "memory(GiB)": 88.73, "step": 1315, "token_acc": 0.7487196046246715, "train_speed(iter/s)": 0.090764 }, { "epoch": 0.2845440827764604, "grad_norm": 0.30644580721855164, "learning_rate": 0.0001714003680191093, "loss": 0.8475534439086914, "memory(GiB)": 88.73, "step": 1320, "token_acc": 0.7622229312512904, "train_speed(iter/s)": 0.090768 }, { "epoch": 0.28562190127182585, "grad_norm": 0.30902040004730225, "learning_rate": 0.0001711503619250953, "loss": 0.8722443580627441, "memory(GiB)": 88.73, "step": 1325, "token_acc": 0.7579166741848757, "train_speed(iter/s)": 0.090764 }, { "epoch": 0.2866997197671912, "grad_norm": 0.2873646914958954, "learning_rate": 0.00017089945191043357, "loss": 0.8771150588989258, "memory(GiB)": 88.73, "step": 1330, "token_acc": 0.7469671026266895, "train_speed(iter/s)": 0.090764 }, { "epoch": 0.2877775382625566, "grad_norm": 0.29235923290252686, "learning_rate": 0.00017064764116277823, "loss": 0.8778883934020996, "memory(GiB)": 88.73, "step": 1335, "token_acc": 0.7509075884608599, "train_speed(iter/s)": 0.090762 }, { "epoch": 0.28885535675792195, "grad_norm": 0.30336469411849976, "learning_rate": 0.00017039493288122662, "loss": 0.854495906829834, "memory(GiB)": 88.73, "step": 1340, "token_acc": 0.7552121725044819, "train_speed(iter/s)": 0.090764 }, { "epoch": 0.2899331752532873, "grad_norm": 0.28838422894477844, "learning_rate": 0.0001701413302762787, "loss": 0.8434688568115234, "memory(GiB)": 88.73, "step": 1345, "token_acc": 0.7535295955760508, "train_speed(iter/s)": 0.090762 }, { "epoch": 0.29101099374865275, "grad_norm": 0.28007248044013977, "learning_rate": 0.00016988683656979624, "loss": 0.8616040229797364, "memory(GiB)": 88.73, "step": 1350, "token_acc": 0.7500597371565113, "train_speed(iter/s)": 0.09077 }, { "epoch": 0.2920888122440181, "grad_norm": 0.28495824337005615, "learning_rate": 0.0001696314549949619, "loss": 0.8752744674682618, "memory(GiB)": 88.73, "step": 1355, "token_acc": 0.7557785725724657, "train_speed(iter/s)": 0.090769 }, { "epoch": 0.2931666307393835, "grad_norm": 0.2759030759334564, "learning_rate": 0.00016937518879623812, "loss": 0.8472925186157226, "memory(GiB)": 88.73, "step": 1360, "token_acc": 0.7566133694339761, "train_speed(iter/s)": 0.090757 }, { "epoch": 0.29424444923474885, "grad_norm": 0.27506211400032043, "learning_rate": 0.00016911804122932597, "loss": 0.8349401473999023, "memory(GiB)": 88.73, "step": 1365, "token_acc": 0.7681410443407235, "train_speed(iter/s)": 0.090751 }, { "epoch": 0.2953222677301143, "grad_norm": 0.2526291608810425, "learning_rate": 0.00016886001556112374, "loss": 0.8469820022583008, "memory(GiB)": 88.73, "step": 1370, "token_acc": 0.754791336186685, "train_speed(iter/s)": 0.090749 }, { "epoch": 0.29640008622547964, "grad_norm": 0.25075405836105347, "learning_rate": 0.00016860111506968544, "loss": 0.8490007400512696, "memory(GiB)": 88.73, "step": 1375, "token_acc": 0.7515119592923184, "train_speed(iter/s)": 0.090756 }, { "epoch": 0.297477904720845, "grad_norm": 0.28928419947624207, "learning_rate": 0.00016834134304417916, "loss": 0.8343512535095214, "memory(GiB)": 88.73, "step": 1380, "token_acc": 0.753833918128655, "train_speed(iter/s)": 0.090747 }, { "epoch": 0.2985557232162104, "grad_norm": 0.3065609931945801, "learning_rate": 0.00016808070278484522, "loss": 0.8761491775512695, "memory(GiB)": 88.73, "step": 1385, "token_acc": 0.753328705187084, "train_speed(iter/s)": 0.090748 }, { "epoch": 0.29963354171157575, "grad_norm": 0.28008612990379333, "learning_rate": 0.00016781919760295444, "loss": 0.8467257499694825, "memory(GiB)": 88.73, "step": 1390, "token_acc": 0.7441130151985313, "train_speed(iter/s)": 0.090752 }, { "epoch": 0.30071136020694117, "grad_norm": 0.2821221351623535, "learning_rate": 0.00016755683082076583, "loss": 0.8849431991577148, "memory(GiB)": 88.73, "step": 1395, "token_acc": 0.7602497398543184, "train_speed(iter/s)": 0.090764 }, { "epoch": 0.30178917870230654, "grad_norm": 0.284231960773468, "learning_rate": 0.00016729360577148454, "loss": 0.8684951782226562, "memory(GiB)": 88.73, "step": 1400, "token_acc": 0.7410936826521242, "train_speed(iter/s)": 0.090764 }, { "epoch": 0.3028669971976719, "grad_norm": 0.30633628368377686, "learning_rate": 0.00016702952579921947, "loss": 0.851997184753418, "memory(GiB)": 88.73, "step": 1405, "token_acc": 0.7561066979165666, "train_speed(iter/s)": 0.090765 }, { "epoch": 0.3039448156930373, "grad_norm": 0.3097163736820221, "learning_rate": 0.00016676459425894069, "loss": 0.8619900703430176, "memory(GiB)": 88.73, "step": 1410, "token_acc": 0.7518935111914989, "train_speed(iter/s)": 0.090771 }, { "epoch": 0.3050226341884027, "grad_norm": 0.298454225063324, "learning_rate": 0.00016649881451643705, "loss": 0.8493214607238769, "memory(GiB)": 88.73, "step": 1415, "token_acc": 0.7556153204117593, "train_speed(iter/s)": 0.090779 }, { "epoch": 0.30610045268376807, "grad_norm": 0.27557069063186646, "learning_rate": 0.00016623218994827314, "loss": 0.8758691787719727, "memory(GiB)": 88.73, "step": 1420, "token_acc": 0.7531293341057916, "train_speed(iter/s)": 0.090776 }, { "epoch": 0.30717827117913343, "grad_norm": 0.2853390872478485, "learning_rate": 0.00016596472394174664, "loss": 0.8561724662780762, "memory(GiB)": 88.73, "step": 1425, "token_acc": 0.7580813158347598, "train_speed(iter/s)": 0.090785 }, { "epoch": 0.3082560896744988, "grad_norm": 0.29730504751205444, "learning_rate": 0.0001656964198948451, "loss": 0.8730886459350586, "memory(GiB)": 88.73, "step": 1430, "token_acc": 0.7496378204180587, "train_speed(iter/s)": 0.090795 }, { "epoch": 0.30933390816986417, "grad_norm": 0.3178441524505615, "learning_rate": 0.00016542728121620286, "loss": 0.8687295913696289, "memory(GiB)": 88.73, "step": 1435, "token_acc": 0.7492734889946188, "train_speed(iter/s)": 0.090798 }, { "epoch": 0.3104117266652296, "grad_norm": 0.29485249519348145, "learning_rate": 0.00016515731132505778, "loss": 0.8828972816467285, "memory(GiB)": 88.73, "step": 1440, "token_acc": 0.7503785081240768, "train_speed(iter/s)": 0.090802 }, { "epoch": 0.31148954516059496, "grad_norm": 0.26084184646606445, "learning_rate": 0.0001648865136512077, "loss": 0.8388940811157226, "memory(GiB)": 88.73, "step": 1445, "token_acc": 0.7596254734932688, "train_speed(iter/s)": 0.090797 }, { "epoch": 0.31256736365596033, "grad_norm": 0.3608708083629608, "learning_rate": 0.00016461489163496692, "loss": 0.8594511032104493, "memory(GiB)": 88.73, "step": 1450, "token_acc": 0.7597757146796581, "train_speed(iter/s)": 0.090797 }, { "epoch": 0.3136451821513257, "grad_norm": 0.3070800006389618, "learning_rate": 0.0001643424487271225, "loss": 0.8477870941162109, "memory(GiB)": 88.73, "step": 1455, "token_acc": 0.7567479096911768, "train_speed(iter/s)": 0.090799 }, { "epoch": 0.3147230006466911, "grad_norm": 0.29502132534980774, "learning_rate": 0.00016406918838889048, "loss": 0.849368667602539, "memory(GiB)": 88.73, "step": 1460, "token_acc": 0.7555982146234445, "train_speed(iter/s)": 0.090798 }, { "epoch": 0.3158008191420565, "grad_norm": 0.27850180864334106, "learning_rate": 0.00016379511409187175, "loss": 0.858004379272461, "memory(GiB)": 88.73, "step": 1465, "token_acc": 0.7653620513950478, "train_speed(iter/s)": 0.090806 }, { "epoch": 0.31687863763742186, "grad_norm": 0.29244697093963623, "learning_rate": 0.00016352022931800803, "loss": 0.8236812591552735, "memory(GiB)": 88.73, "step": 1470, "token_acc": 0.7596997657947459, "train_speed(iter/s)": 0.09081 }, { "epoch": 0.3179564561327872, "grad_norm": 0.2818242907524109, "learning_rate": 0.00016324453755953773, "loss": 0.8692058563232422, "memory(GiB)": 88.73, "step": 1475, "token_acc": 0.7557902879718567, "train_speed(iter/s)": 0.090814 }, { "epoch": 0.3190342746281526, "grad_norm": 0.2580603361129761, "learning_rate": 0.00016296804231895142, "loss": 0.8455587387084961, "memory(GiB)": 88.73, "step": 1480, "token_acc": 0.757762658111148, "train_speed(iter/s)": 0.090806 }, { "epoch": 0.320112093123518, "grad_norm": 0.28830286860466003, "learning_rate": 0.00016269074710894738, "loss": 0.8390979766845703, "memory(GiB)": 88.73, "step": 1485, "token_acc": 0.7586961944621525, "train_speed(iter/s)": 0.090798 }, { "epoch": 0.3211899116188834, "grad_norm": 0.29304251074790955, "learning_rate": 0.0001624126554523871, "loss": 0.8586416244506836, "memory(GiB)": 88.73, "step": 1490, "token_acc": 0.7434280810522076, "train_speed(iter/s)": 0.090798 }, { "epoch": 0.32226773011424875, "grad_norm": 0.2886076271533966, "learning_rate": 0.0001621337708822503, "loss": 0.8587904930114746, "memory(GiB)": 88.73, "step": 1495, "token_acc": 0.7581880187890098, "train_speed(iter/s)": 0.090792 }, { "epoch": 0.3233455486096141, "grad_norm": 0.28654685616493225, "learning_rate": 0.0001618540969415903, "loss": 0.8535709381103516, "memory(GiB)": 88.73, "step": 1500, "token_acc": 0.7639902016613788, "train_speed(iter/s)": 0.090785 }, { "epoch": 0.32442336710497954, "grad_norm": 0.3027206361293793, "learning_rate": 0.00016157363718348879, "loss": 0.8511140823364258, "memory(GiB)": 88.73, "step": 1505, "token_acc": 0.7487131994540847, "train_speed(iter/s)": 0.090797 }, { "epoch": 0.3255011856003449, "grad_norm": 0.28740787506103516, "learning_rate": 0.00016129239517101077, "loss": 0.8478855133056641, "memory(GiB)": 88.73, "step": 1510, "token_acc": 0.7563486300929202, "train_speed(iter/s)": 0.090795 }, { "epoch": 0.3265790040957103, "grad_norm": 0.27861881256103516, "learning_rate": 0.00016101037447715933, "loss": 0.8379668235778809, "memory(GiB)": 88.73, "step": 1515, "token_acc": 0.7659103278719291, "train_speed(iter/s)": 0.090793 }, { "epoch": 0.32765682259107565, "grad_norm": 0.26536089181900024, "learning_rate": 0.00016072757868483022, "loss": 0.8418020248413086, "memory(GiB)": 88.73, "step": 1520, "token_acc": 0.7583801429822001, "train_speed(iter/s)": 0.090791 }, { "epoch": 0.32873464108644107, "grad_norm": 0.2488608956336975, "learning_rate": 0.00016044401138676627, "loss": 0.8171049118041992, "memory(GiB)": 88.73, "step": 1525, "token_acc": 0.7500110107905748, "train_speed(iter/s)": 0.090785 }, { "epoch": 0.32981245958180644, "grad_norm": 0.2889085114002228, "learning_rate": 0.00016015967618551193, "loss": 0.8649381637573242, "memory(GiB)": 88.73, "step": 1530, "token_acc": 0.7400435444403769, "train_speed(iter/s)": 0.090789 }, { "epoch": 0.3308902780771718, "grad_norm": 0.2682845890522003, "learning_rate": 0.00015987457669336722, "loss": 0.8556795120239258, "memory(GiB)": 88.73, "step": 1535, "token_acc": 0.7534068191405462, "train_speed(iter/s)": 0.090788 }, { "epoch": 0.3319680965725372, "grad_norm": 0.27802756428718567, "learning_rate": 0.00015958871653234212, "loss": 0.8605093955993652, "memory(GiB)": 88.73, "step": 1540, "token_acc": 0.755724278662757, "train_speed(iter/s)": 0.09079 }, { "epoch": 0.33304591506790254, "grad_norm": 0.3015299141407013, "learning_rate": 0.00015930209933411036, "loss": 0.852234172821045, "memory(GiB)": 88.73, "step": 1545, "token_acc": 0.7593119698397738, "train_speed(iter/s)": 0.090791 }, { "epoch": 0.33412373356326797, "grad_norm": 0.2580326795578003, "learning_rate": 0.00015901472873996336, "loss": 0.8513265609741211, "memory(GiB)": 88.73, "step": 1550, "token_acc": 0.7602723007241463, "train_speed(iter/s)": 0.090787 }, { "epoch": 0.33520155205863333, "grad_norm": 0.2698589563369751, "learning_rate": 0.000158726608400764, "loss": 0.8556797981262207, "memory(GiB)": 88.73, "step": 1555, "token_acc": 0.7580406380695066, "train_speed(iter/s)": 0.090782 }, { "epoch": 0.3362793705539987, "grad_norm": 0.2768225371837616, "learning_rate": 0.00015843774197690018, "loss": 0.847596549987793, "memory(GiB)": 88.73, "step": 1560, "token_acc": 0.7578975458485548, "train_speed(iter/s)": 0.090778 }, { "epoch": 0.33735718904936407, "grad_norm": 0.2590539753437042, "learning_rate": 0.00015814813313823835, "loss": 0.8389090538024903, "memory(GiB)": 88.73, "step": 1565, "token_acc": 0.7606040500509293, "train_speed(iter/s)": 0.090783 }, { "epoch": 0.3384350075447295, "grad_norm": 0.2635687589645386, "learning_rate": 0.0001578577855640768, "loss": 0.8476555824279786, "memory(GiB)": 88.73, "step": 1570, "token_acc": 0.752173832088927, "train_speed(iter/s)": 0.090787 }, { "epoch": 0.33951282604009486, "grad_norm": 0.2843003273010254, "learning_rate": 0.00015756670294309906, "loss": 0.8478913307189941, "memory(GiB)": 88.73, "step": 1575, "token_acc": 0.7639012703365278, "train_speed(iter/s)": 0.090782 }, { "epoch": 0.34059064453546023, "grad_norm": 0.2567037045955658, "learning_rate": 0.00015727488897332705, "loss": 0.8460362434387207, "memory(GiB)": 88.73, "step": 1580, "token_acc": 0.7501041502726423, "train_speed(iter/s)": 0.090779 }, { "epoch": 0.3416684630308256, "grad_norm": 0.2692916989326477, "learning_rate": 0.00015698234736207377, "loss": 0.8291967391967774, "memory(GiB)": 88.73, "step": 1585, "token_acc": 0.7688668086113083, "train_speed(iter/s)": 0.090776 }, { "epoch": 0.34274628152619097, "grad_norm": 0.2946663796901703, "learning_rate": 0.00015668908182589673, "loss": 0.8523354530334473, "memory(GiB)": 88.73, "step": 1590, "token_acc": 0.7536383481899218, "train_speed(iter/s)": 0.090779 }, { "epoch": 0.3438241000215564, "grad_norm": 0.28657376766204834, "learning_rate": 0.00015639509609055024, "loss": 0.8492325782775879, "memory(GiB)": 88.73, "step": 1595, "token_acc": 0.7621988475602305, "train_speed(iter/s)": 0.090777 }, { "epoch": 0.34490191851692176, "grad_norm": 0.28782057762145996, "learning_rate": 0.00015610039389093842, "loss": 0.8399961471557618, "memory(GiB)": 88.73, "step": 1600, "token_acc": 0.7603227249012446, "train_speed(iter/s)": 0.090771 }, { "epoch": 0.3459797370122871, "grad_norm": 0.29190579056739807, "learning_rate": 0.00015580497897106757, "loss": 0.8523462295532227, "memory(GiB)": 88.73, "step": 1605, "token_acc": 0.7505423784434194, "train_speed(iter/s)": 0.090778 }, { "epoch": 0.3470575555076525, "grad_norm": 0.2903738021850586, "learning_rate": 0.00015550885508399856, "loss": 0.8610392570495605, "memory(GiB)": 88.73, "step": 1610, "token_acc": 0.7487929746570227, "train_speed(iter/s)": 0.090775 }, { "epoch": 0.3481353740030179, "grad_norm": 0.27031436562538147, "learning_rate": 0.00015521202599179948, "loss": 0.8268445014953614, "memory(GiB)": 88.73, "step": 1615, "token_acc": 0.756410851777206, "train_speed(iter/s)": 0.090778 }, { "epoch": 0.3492131924983833, "grad_norm": 0.28598150610923767, "learning_rate": 0.00015491449546549735, "loss": 0.8427545547485351, "memory(GiB)": 88.73, "step": 1620, "token_acc": 0.7698424200264726, "train_speed(iter/s)": 0.090774 }, { "epoch": 0.35029101099374865, "grad_norm": 0.2641347348690033, "learning_rate": 0.00015461626728503061, "loss": 0.8433626174926758, "memory(GiB)": 88.73, "step": 1625, "token_acc": 0.7575746244238986, "train_speed(iter/s)": 0.090779 }, { "epoch": 0.351368829489114, "grad_norm": 0.29311028122901917, "learning_rate": 0.00015431734523920095, "loss": 0.8445764541625976, "memory(GiB)": 88.73, "step": 1630, "token_acc": 0.7567330890894469, "train_speed(iter/s)": 0.090779 }, { "epoch": 0.3524466479844794, "grad_norm": 0.274105042219162, "learning_rate": 0.00015401773312562515, "loss": 0.8542932510375977, "memory(GiB)": 88.73, "step": 1635, "token_acc": 0.7480866833158785, "train_speed(iter/s)": 0.090782 }, { "epoch": 0.3535244664798448, "grad_norm": 0.2926783859729767, "learning_rate": 0.00015371743475068689, "loss": 0.8462813377380372, "memory(GiB)": 88.73, "step": 1640, "token_acc": 0.7616641480921799, "train_speed(iter/s)": 0.09079 }, { "epoch": 0.3546022849752102, "grad_norm": 0.26269713044166565, "learning_rate": 0.0001534164539294883, "loss": 0.8508657455444336, "memory(GiB)": 88.73, "step": 1645, "token_acc": 0.7613146408029372, "train_speed(iter/s)": 0.090784 }, { "epoch": 0.35568010347057555, "grad_norm": 0.26450392603874207, "learning_rate": 0.00015311479448580176, "loss": 0.8444323539733887, "memory(GiB)": 88.73, "step": 1650, "token_acc": 0.7599128773677539, "train_speed(iter/s)": 0.090786 }, { "epoch": 0.3567579219659409, "grad_norm": 0.28955137729644775, "learning_rate": 0.0001528124602520208, "loss": 0.8380136489868164, "memory(GiB)": 88.73, "step": 1655, "token_acc": 0.7655002899831533, "train_speed(iter/s)": 0.09079 }, { "epoch": 0.35783574046130634, "grad_norm": 0.26275134086608887, "learning_rate": 0.00015250945506911209, "loss": 0.8346182823181152, "memory(GiB)": 88.73, "step": 1660, "token_acc": 0.7570767167149842, "train_speed(iter/s)": 0.090791 }, { "epoch": 0.3589135589566717, "grad_norm": 0.26187482476234436, "learning_rate": 0.000152205782786566, "loss": 0.8270325660705566, "memory(GiB)": 88.73, "step": 1665, "token_acc": 0.7658582174193138, "train_speed(iter/s)": 0.090794 }, { "epoch": 0.3599913774520371, "grad_norm": 0.2534135580062866, "learning_rate": 0.0001519014472623481, "loss": 0.8564752578735352, "memory(GiB)": 88.73, "step": 1670, "token_acc": 0.7526134642742417, "train_speed(iter/s)": 0.09079 }, { "epoch": 0.36106919594740244, "grad_norm": 0.2668834924697876, "learning_rate": 0.0001515964523628501, "loss": 0.8505046844482422, "memory(GiB)": 88.73, "step": 1675, "token_acc": 0.7571771792684413, "train_speed(iter/s)": 0.090782 }, { "epoch": 0.3621470144427678, "grad_norm": 0.2583718001842499, "learning_rate": 0.00015129080196284053, "loss": 0.843173599243164, "memory(GiB)": 88.73, "step": 1680, "token_acc": 0.7551168968679391, "train_speed(iter/s)": 0.090779 }, { "epoch": 0.36322483293813324, "grad_norm": 0.2715505063533783, "learning_rate": 0.00015098449994541572, "loss": 0.8361042022705079, "memory(GiB)": 88.73, "step": 1685, "token_acc": 0.7535256266030689, "train_speed(iter/s)": 0.090782 }, { "epoch": 0.3643026514334986, "grad_norm": 0.29546570777893066, "learning_rate": 0.00015067755020195035, "loss": 0.8499635696411133, "memory(GiB)": 88.73, "step": 1690, "token_acc": 0.7473378875062596, "train_speed(iter/s)": 0.090792 }, { "epoch": 0.36538046992886397, "grad_norm": 0.2710934281349182, "learning_rate": 0.00015036995663204804, "loss": 0.8565677642822266, "memory(GiB)": 88.73, "step": 1695, "token_acc": 0.7524623077505872, "train_speed(iter/s)": 0.090795 }, { "epoch": 0.36645828842422934, "grad_norm": 0.2586804926395416, "learning_rate": 0.0001500617231434919, "loss": 0.8506926536560059, "memory(GiB)": 88.73, "step": 1700, "token_acc": 0.7692879786551637, "train_speed(iter/s)": 0.090792 }, { "epoch": 0.36753610691959476, "grad_norm": 0.2694554328918457, "learning_rate": 0.00014975285365219473, "loss": 0.8342960357666016, "memory(GiB)": 88.73, "step": 1705, "token_acc": 0.7567960654106345, "train_speed(iter/s)": 0.090791 }, { "epoch": 0.36861392541496013, "grad_norm": 0.2820327579975128, "learning_rate": 0.00014944335208214938, "loss": 0.8701129913330078, "memory(GiB)": 88.73, "step": 1710, "token_acc": 0.7567878588609895, "train_speed(iter/s)": 0.090773 }, { "epoch": 0.3696917439103255, "grad_norm": 0.271011620759964, "learning_rate": 0.00014913322236537882, "loss": 0.8407217025756836, "memory(GiB)": 88.73, "step": 1715, "token_acc": 0.7501652223287505, "train_speed(iter/s)": 0.090783 }, { "epoch": 0.37076956240569087, "grad_norm": 0.3109191358089447, "learning_rate": 0.00014882246844188628, "loss": 0.8364040374755859, "memory(GiB)": 88.73, "step": 1720, "token_acc": 0.762428254451433, "train_speed(iter/s)": 0.090787 }, { "epoch": 0.37184738090105623, "grad_norm": 0.260318785905838, "learning_rate": 0.00014851109425960515, "loss": 0.8386424064636231, "memory(GiB)": 88.73, "step": 1725, "token_acc": 0.7667616538523576, "train_speed(iter/s)": 0.090775 }, { "epoch": 0.37292519939642166, "grad_norm": 0.2563639283180237, "learning_rate": 0.00014819910377434877, "loss": 0.8344226837158203, "memory(GiB)": 88.73, "step": 1730, "token_acc": 0.7555631728150303, "train_speed(iter/s)": 0.090769 }, { "epoch": 0.374003017891787, "grad_norm": 0.2606707513332367, "learning_rate": 0.00014788650094976023, "loss": 0.8303499221801758, "memory(GiB)": 88.73, "step": 1735, "token_acc": 0.7551928923913352, "train_speed(iter/s)": 0.090777 }, { "epoch": 0.3750808363871524, "grad_norm": 0.2770051658153534, "learning_rate": 0.00014757328975726207, "loss": 0.8249828338623046, "memory(GiB)": 88.73, "step": 1740, "token_acc": 0.7559673255772835, "train_speed(iter/s)": 0.090779 }, { "epoch": 0.37615865488251776, "grad_norm": 0.27481648325920105, "learning_rate": 0.00014725947417600565, "loss": 0.8278720855712891, "memory(GiB)": 88.73, "step": 1745, "token_acc": 0.7585601247390069, "train_speed(iter/s)": 0.090774 }, { "epoch": 0.3772364733778832, "grad_norm": 0.25115635991096497, "learning_rate": 0.00014694505819282084, "loss": 0.8418206214904785, "memory(GiB)": 88.73, "step": 1750, "token_acc": 0.756392900120337, "train_speed(iter/s)": 0.090775 }, { "epoch": 0.37831429187324855, "grad_norm": 0.277896910905838, "learning_rate": 0.00014663004580216512, "loss": 0.858989143371582, "memory(GiB)": 88.73, "step": 1755, "token_acc": 0.7504904887827348, "train_speed(iter/s)": 0.090783 }, { "epoch": 0.3793921103686139, "grad_norm": 0.2511812746524811, "learning_rate": 0.00014631444100607303, "loss": 0.8558472633361817, "memory(GiB)": 88.73, "step": 1760, "token_acc": 0.7618693798285979, "train_speed(iter/s)": 0.090777 }, { "epoch": 0.3804699288639793, "grad_norm": 0.2717522978782654, "learning_rate": 0.0001459982478141052, "loss": 0.8595260620117188, "memory(GiB)": 88.73, "step": 1765, "token_acc": 0.7469075656765309, "train_speed(iter/s)": 0.090778 }, { "epoch": 0.3815477473593447, "grad_norm": 0.2521330714225769, "learning_rate": 0.00014568147024329752, "loss": 0.8391046524047852, "memory(GiB)": 88.73, "step": 1770, "token_acc": 0.7599521772430928, "train_speed(iter/s)": 0.090776 }, { "epoch": 0.3826255658547101, "grad_norm": 0.2767956852912903, "learning_rate": 0.00014536411231811, "loss": 0.836371898651123, "memory(GiB)": 88.73, "step": 1775, "token_acc": 0.7622231352481978, "train_speed(iter/s)": 0.090776 }, { "epoch": 0.38370338435007545, "grad_norm": 0.246454656124115, "learning_rate": 0.0001450461780703757, "loss": 0.8268041610717773, "memory(GiB)": 88.73, "step": 1780, "token_acc": 0.763464311134634, "train_speed(iter/s)": 0.090773 }, { "epoch": 0.3847812028454408, "grad_norm": 0.24952833354473114, "learning_rate": 0.00014472767153924949, "loss": 0.8168627738952636, "memory(GiB)": 88.73, "step": 1785, "token_acc": 0.7720252378790546, "train_speed(iter/s)": 0.090772 }, { "epoch": 0.3858590213408062, "grad_norm": 0.2589110732078552, "learning_rate": 0.00014440859677115672, "loss": 0.8476753234863281, "memory(GiB)": 88.73, "step": 1790, "token_acc": 0.7595088982350456, "train_speed(iter/s)": 0.090769 }, { "epoch": 0.3869368398361716, "grad_norm": 0.25669485330581665, "learning_rate": 0.00014408895781974188, "loss": 0.8393096923828125, "memory(GiB)": 88.73, "step": 1795, "token_acc": 0.7572721303987896, "train_speed(iter/s)": 0.090776 }, { "epoch": 0.388014658331537, "grad_norm": 0.24903720617294312, "learning_rate": 0.00014376875874581705, "loss": 0.8376697540283203, "memory(GiB)": 88.73, "step": 1800, "token_acc": 0.7525457957457348, "train_speed(iter/s)": 0.090773 }, { "epoch": 0.38909247682690234, "grad_norm": 0.24215489625930786, "learning_rate": 0.00014344800361731027, "loss": 0.8335319519042969, "memory(GiB)": 88.73, "step": 1805, "token_acc": 0.7576350146661298, "train_speed(iter/s)": 0.090774 }, { "epoch": 0.3901702953222677, "grad_norm": 0.25726985931396484, "learning_rate": 0.00014312669650921395, "loss": 0.8506484985351562, "memory(GiB)": 88.73, "step": 1810, "token_acc": 0.7584201667958925, "train_speed(iter/s)": 0.090769 }, { "epoch": 0.39124811381763314, "grad_norm": 0.2677796185016632, "learning_rate": 0.00014280484150353302, "loss": 0.8381481170654297, "memory(GiB)": 88.73, "step": 1815, "token_acc": 0.7473938132626998, "train_speed(iter/s)": 0.090767 }, { "epoch": 0.3923259323129985, "grad_norm": 0.26906439661979675, "learning_rate": 0.00014248244268923313, "loss": 0.8291493415832519, "memory(GiB)": 88.73, "step": 1820, "token_acc": 0.7598213781018597, "train_speed(iter/s)": 0.090773 }, { "epoch": 0.3934037508083639, "grad_norm": 0.2693934142589569, "learning_rate": 0.00014215950416218866, "loss": 0.8182261466979981, "memory(GiB)": 88.73, "step": 1825, "token_acc": 0.7708800732001302, "train_speed(iter/s)": 0.09076 }, { "epoch": 0.39448156930372924, "grad_norm": 0.2523048520088196, "learning_rate": 0.0001418360300251307, "loss": 0.8171014785766602, "memory(GiB)": 88.73, "step": 1830, "token_acc": 0.766314257910301, "train_speed(iter/s)": 0.090755 }, { "epoch": 0.3955593877990946, "grad_norm": 0.2815280556678772, "learning_rate": 0.000141512024387595, "loss": 0.8358818054199219, "memory(GiB)": 88.73, "step": 1835, "token_acc": 0.7661857126919788, "train_speed(iter/s)": 0.090761 }, { "epoch": 0.39663720629446003, "grad_norm": 0.3000071942806244, "learning_rate": 0.00014118749136586952, "loss": 0.8240570068359375, "memory(GiB)": 88.73, "step": 1840, "token_acc": 0.7618869303947005, "train_speed(iter/s)": 0.090759 }, { "epoch": 0.3977150247898254, "grad_norm": 0.25292786955833435, "learning_rate": 0.0001408624350829425, "loss": 0.8406278610229492, "memory(GiB)": 88.73, "step": 1845, "token_acc": 0.7540501258463611, "train_speed(iter/s)": 0.090754 }, { "epoch": 0.39879284328519077, "grad_norm": 0.26157331466674805, "learning_rate": 0.00014053685966844976, "loss": 0.842344093322754, "memory(GiB)": 88.73, "step": 1850, "token_acc": 0.7485835760373842, "train_speed(iter/s)": 0.090755 }, { "epoch": 0.39987066178055614, "grad_norm": 0.22903767228126526, "learning_rate": 0.0001402107692586224, "loss": 0.8267465591430664, "memory(GiB)": 88.73, "step": 1855, "token_acc": 0.7616672862622068, "train_speed(iter/s)": 0.090748 }, { "epoch": 0.40094848027592156, "grad_norm": 0.25253620743751526, "learning_rate": 0.0001398841679962342, "loss": 0.8252335548400879, "memory(GiB)": 88.73, "step": 1860, "token_acc": 0.7591205758739322, "train_speed(iter/s)": 0.090751 }, { "epoch": 0.4020262987712869, "grad_norm": 0.25643235445022583, "learning_rate": 0.00013955706003054903, "loss": 0.8341848373413085, "memory(GiB)": 88.73, "step": 1865, "token_acc": 0.7556679533253472, "train_speed(iter/s)": 0.090744 }, { "epoch": 0.4031041172666523, "grad_norm": 0.2488357424736023, "learning_rate": 0.0001392294495172681, "loss": 0.8118571281433106, "memory(GiB)": 88.73, "step": 1870, "token_acc": 0.7718059478318657, "train_speed(iter/s)": 0.09074 }, { "epoch": 0.40418193576201766, "grad_norm": 0.25288254022598267, "learning_rate": 0.0001389013406184771, "loss": 0.8068958282470703, "memory(GiB)": 88.73, "step": 1875, "token_acc": 0.7613111640723914, "train_speed(iter/s)": 0.090747 }, { "epoch": 0.40525975425738303, "grad_norm": 0.28890395164489746, "learning_rate": 0.00013857273750259353, "loss": 0.8436056137084961, "memory(GiB)": 88.73, "step": 1880, "token_acc": 0.7517505705201973, "train_speed(iter/s)": 0.090744 }, { "epoch": 0.40633757275274845, "grad_norm": 0.25125473737716675, "learning_rate": 0.00013824364434431347, "loss": 0.8183420181274415, "memory(GiB)": 88.73, "step": 1885, "token_acc": 0.7585830812941069, "train_speed(iter/s)": 0.090745 }, { "epoch": 0.4074153912481138, "grad_norm": 0.2506405711174011, "learning_rate": 0.00013791406532455877, "loss": 0.835018539428711, "memory(GiB)": 88.73, "step": 1890, "token_acc": 0.7571616413511675, "train_speed(iter/s)": 0.090748 }, { "epoch": 0.4084932097434792, "grad_norm": 0.2447991520166397, "learning_rate": 0.00013758400463042381, "loss": 0.8347936630249023, "memory(GiB)": 88.73, "step": 1895, "token_acc": 0.757547472233222, "train_speed(iter/s)": 0.090751 }, { "epoch": 0.40957102823884456, "grad_norm": 0.23907671868801117, "learning_rate": 0.00013725346645512233, "loss": 0.8481650352478027, "memory(GiB)": 88.73, "step": 1900, "token_acc": 0.7565812663488117, "train_speed(iter/s)": 0.090752 }, { "epoch": 0.41064884673421, "grad_norm": 0.29096952080726624, "learning_rate": 0.0001369224549979342, "loss": 0.8256063461303711, "memory(GiB)": 88.73, "step": 1905, "token_acc": 0.760840500108405, "train_speed(iter/s)": 0.090749 }, { "epoch": 0.41172666522957535, "grad_norm": 0.2669379711151123, "learning_rate": 0.00013659097446415202, "loss": 0.8266623497009278, "memory(GiB)": 88.73, "step": 1910, "token_acc": 0.7574177481175944, "train_speed(iter/s)": 0.090748 }, { "epoch": 0.4128044837249407, "grad_norm": 0.27574971318244934, "learning_rate": 0.00013625902906502777, "loss": 0.8126872062683106, "memory(GiB)": 88.73, "step": 1915, "token_acc": 0.7587317338026455, "train_speed(iter/s)": 0.090746 }, { "epoch": 0.4138823022203061, "grad_norm": 0.25856342911720276, "learning_rate": 0.0001359266230177191, "loss": 0.8150705337524414, "memory(GiB)": 88.73, "step": 1920, "token_acc": 0.7623235854466086, "train_speed(iter/s)": 0.090752 }, { "epoch": 0.41496012071567145, "grad_norm": 0.25781509280204773, "learning_rate": 0.0001355937605452361, "loss": 0.8201316833496094, "memory(GiB)": 88.73, "step": 1925, "token_acc": 0.759454894985537, "train_speed(iter/s)": 0.090746 }, { "epoch": 0.4160379392110369, "grad_norm": 0.25978022813796997, "learning_rate": 0.00013526044587638727, "loss": 0.8365470886230468, "memory(GiB)": 88.73, "step": 1930, "token_acc": 0.7520437990070988, "train_speed(iter/s)": 0.090752 }, { "epoch": 0.41711575770640225, "grad_norm": 0.2575080096721649, "learning_rate": 0.00013492668324572614, "loss": 0.8076364517211914, "memory(GiB)": 88.73, "step": 1935, "token_acc": 0.7560267051605919, "train_speed(iter/s)": 0.090753 }, { "epoch": 0.4181935762017676, "grad_norm": 0.2628234624862671, "learning_rate": 0.00013459247689349724, "loss": 0.8321111679077149, "memory(GiB)": 88.73, "step": 1940, "token_acc": 0.7549204587495375, "train_speed(iter/s)": 0.090757 }, { "epoch": 0.419271394697133, "grad_norm": 0.2532503008842468, "learning_rate": 0.0001342578310655823, "loss": 0.8314437866210938, "memory(GiB)": 88.73, "step": 1945, "token_acc": 0.7539388957391423, "train_speed(iter/s)": 0.090748 }, { "epoch": 0.4203492131924984, "grad_norm": 0.2680584490299225, "learning_rate": 0.00013392275001344628, "loss": 0.8158219337463379, "memory(GiB)": 88.73, "step": 1950, "token_acc": 0.7561378971441531, "train_speed(iter/s)": 0.090754 }, { "epoch": 0.4214270316878638, "grad_norm": 0.2609739303588867, "learning_rate": 0.00013358723799408346, "loss": 0.8303052902221679, "memory(GiB)": 88.73, "step": 1955, "token_acc": 0.7630726951022099, "train_speed(iter/s)": 0.090753 }, { "epoch": 0.42250485018322914, "grad_norm": 0.26549097895622253, "learning_rate": 0.0001332512992699633, "loss": 0.824283218383789, "memory(GiB)": 88.73, "step": 1960, "token_acc": 0.7660631162262215, "train_speed(iter/s)": 0.090752 }, { "epoch": 0.4235826686785945, "grad_norm": 0.24012601375579834, "learning_rate": 0.00013291493810897617, "loss": 0.8251029014587402, "memory(GiB)": 88.73, "step": 1965, "token_acc": 0.7619554667696917, "train_speed(iter/s)": 0.090756 }, { "epoch": 0.4246604871739599, "grad_norm": 0.257487416267395, "learning_rate": 0.00013257815878437934, "loss": 0.8098674774169922, "memory(GiB)": 88.73, "step": 1970, "token_acc": 0.7693086128621961, "train_speed(iter/s)": 0.090756 }, { "epoch": 0.4257383056693253, "grad_norm": 0.2526766061782837, "learning_rate": 0.00013224096557474252, "loss": 0.8363197326660157, "memory(GiB)": 88.73, "step": 1975, "token_acc": 0.7619065431048188, "train_speed(iter/s)": 0.09076 }, { "epoch": 0.42681612416469067, "grad_norm": 0.2466905266046524, "learning_rate": 0.00013190336276389363, "loss": 0.8133687973022461, "memory(GiB)": 88.73, "step": 1980, "token_acc": 0.7717355702744013, "train_speed(iter/s)": 0.090757 }, { "epoch": 0.42789394266005604, "grad_norm": 0.2572924494743347, "learning_rate": 0.00013156535464086425, "loss": 0.8235321044921875, "memory(GiB)": 88.73, "step": 1985, "token_acc": 0.7606015935174048, "train_speed(iter/s)": 0.090759 }, { "epoch": 0.4289717611554214, "grad_norm": 0.24721023440361023, "learning_rate": 0.00013122694549983524, "loss": 0.8133857727050782, "memory(GiB)": 88.73, "step": 1990, "token_acc": 0.7634598990097712, "train_speed(iter/s)": 0.090759 }, { "epoch": 0.43004957965078683, "grad_norm": 0.24329617619514465, "learning_rate": 0.00013088813964008214, "loss": 0.8140478134155273, "memory(GiB)": 88.73, "step": 1995, "token_acc": 0.7591894090673766, "train_speed(iter/s)": 0.090758 }, { "epoch": 0.4311273981461522, "grad_norm": 0.2472774237394333, "learning_rate": 0.00013054894136592052, "loss": 0.8275058746337891, "memory(GiB)": 88.73, "step": 2000, "token_acc": 0.761151970406819, "train_speed(iter/s)": 0.090754 }, { "epoch": 0.4311273981461522, "eval_loss": 0.8233972191810608, "eval_runtime": 144.337, "eval_samples_per_second": 41.549, "eval_steps_per_second": 5.196, "eval_token_acc": 0.7612176727090534, "step": 2000 }, { "epoch": 0.43220521664151756, "grad_norm": 0.26200464367866516, "learning_rate": 0.00013020935498665135, "loss": 0.8358120918273926, "memory(GiB)": 90.49, "step": 2005, "token_acc": 0.7592802881979794, "train_speed(iter/s)": 0.089892 }, { "epoch": 0.43328303513688293, "grad_norm": 0.24313980340957642, "learning_rate": 0.00012986938481650623, "loss": 0.810368824005127, "memory(GiB)": 90.49, "step": 2010, "token_acc": 0.7658244002041859, "train_speed(iter/s)": 0.089889 }, { "epoch": 0.43436085363224836, "grad_norm": 0.27997440099716187, "learning_rate": 0.00012952903517459262, "loss": 0.8068580627441406, "memory(GiB)": 90.49, "step": 2015, "token_acc": 0.7714860515021459, "train_speed(iter/s)": 0.089886 }, { "epoch": 0.4354386721276137, "grad_norm": 0.2458069920539856, "learning_rate": 0.00012918831038483887, "loss": 0.833229923248291, "memory(GiB)": 90.49, "step": 2020, "token_acc": 0.7514918190567854, "train_speed(iter/s)": 0.089888 }, { "epoch": 0.4365164906229791, "grad_norm": 0.26310858130455017, "learning_rate": 0.00012884721477593938, "loss": 0.8056654930114746, "memory(GiB)": 90.49, "step": 2025, "token_acc": 0.7624665576694412, "train_speed(iter/s)": 0.089892 }, { "epoch": 0.43759430911834446, "grad_norm": 0.23649801313877106, "learning_rate": 0.0001285057526812995, "loss": 0.823182201385498, "memory(GiB)": 90.49, "step": 2030, "token_acc": 0.7644926264044943, "train_speed(iter/s)": 0.089896 }, { "epoch": 0.43867212761370983, "grad_norm": 0.25168582797050476, "learning_rate": 0.00012816392843898074, "loss": 0.8447717666625977, "memory(GiB)": 90.49, "step": 2035, "token_acc": 0.7493874727365316, "train_speed(iter/s)": 0.089899 }, { "epoch": 0.43974994610907525, "grad_norm": 0.2361283153295517, "learning_rate": 0.0001278217463916453, "loss": 0.8150223731994629, "memory(GiB)": 90.49, "step": 2040, "token_acc": 0.7581041827141703, "train_speed(iter/s)": 0.089891 }, { "epoch": 0.4408277646044406, "grad_norm": 0.26395317912101746, "learning_rate": 0.00012747921088650103, "loss": 0.830561637878418, "memory(GiB)": 90.49, "step": 2045, "token_acc": 0.7607638509452519, "train_speed(iter/s)": 0.089882 }, { "epoch": 0.441905583099806, "grad_norm": 0.24244140088558197, "learning_rate": 0.0001271363262752464, "loss": 0.8133491516113281, "memory(GiB)": 90.49, "step": 2050, "token_acc": 0.7658636275764299, "train_speed(iter/s)": 0.089885 }, { "epoch": 0.44298340159517136, "grad_norm": 0.2592514455318451, "learning_rate": 0.00012679309691401503, "loss": 0.8247940063476562, "memory(GiB)": 90.49, "step": 2055, "token_acc": 0.7486207836392516, "train_speed(iter/s)": 0.089888 }, { "epoch": 0.4440612200905368, "grad_norm": 0.2564166784286499, "learning_rate": 0.00012644952716332018, "loss": 0.8435881614685059, "memory(GiB)": 90.49, "step": 2060, "token_acc": 0.7475987668620917, "train_speed(iter/s)": 0.089895 }, { "epoch": 0.44513903858590215, "grad_norm": 0.2479657530784607, "learning_rate": 0.00012610562138799978, "loss": 0.8264467239379882, "memory(GiB)": 90.49, "step": 2065, "token_acc": 0.7544757772957339, "train_speed(iter/s)": 0.089905 }, { "epoch": 0.4462168570812675, "grad_norm": 0.25349298119544983, "learning_rate": 0.00012576138395716064, "loss": 0.811968994140625, "memory(GiB)": 90.49, "step": 2070, "token_acc": 0.7626092682970834, "train_speed(iter/s)": 0.089908 }, { "epoch": 0.4472946755766329, "grad_norm": 0.23237302899360657, "learning_rate": 0.00012541681924412303, "loss": 0.8137406349182129, "memory(GiB)": 90.49, "step": 2075, "token_acc": 0.7569761448976586, "train_speed(iter/s)": 0.089908 }, { "epoch": 0.44837249407199825, "grad_norm": 0.2557789087295532, "learning_rate": 0.00012507193162636515, "loss": 0.8129938125610352, "memory(GiB)": 90.49, "step": 2080, "token_acc": 0.7630777392592863, "train_speed(iter/s)": 0.089912 }, { "epoch": 0.4494503125673637, "grad_norm": 0.25231945514678955, "learning_rate": 0.0001247267254854675, "loss": 0.8210895538330079, "memory(GiB)": 90.49, "step": 2085, "token_acc": 0.7629830495839274, "train_speed(iter/s)": 0.089919 }, { "epoch": 0.45052813106272904, "grad_norm": 0.26368072628974915, "learning_rate": 0.00012438120520705728, "loss": 0.8206973075866699, "memory(GiB)": 90.49, "step": 2090, "token_acc": 0.7579125830696561, "train_speed(iter/s)": 0.089914 }, { "epoch": 0.4516059495580944, "grad_norm": 0.25619271397590637, "learning_rate": 0.00012403537518075245, "loss": 0.8239354133605957, "memory(GiB)": 90.49, "step": 2095, "token_acc": 0.7552358036573629, "train_speed(iter/s)": 0.089917 }, { "epoch": 0.4526837680534598, "grad_norm": 0.23952969908714294, "learning_rate": 0.00012368923980010633, "loss": 0.8132918357849122, "memory(GiB)": 90.49, "step": 2100, "token_acc": 0.7683812015253956, "train_speed(iter/s)": 0.089922 }, { "epoch": 0.4537615865488252, "grad_norm": 0.24730557203292847, "learning_rate": 0.0001233428034625514, "loss": 0.8206700325012207, "memory(GiB)": 90.49, "step": 2105, "token_acc": 0.7606898047313287, "train_speed(iter/s)": 0.089928 }, { "epoch": 0.45483940504419057, "grad_norm": 0.27493369579315186, "learning_rate": 0.0001229960705693437, "loss": 0.8215415000915527, "memory(GiB)": 90.49, "step": 2110, "token_acc": 0.7561696342591718, "train_speed(iter/s)": 0.089935 }, { "epoch": 0.45591722353955594, "grad_norm": 0.24349282681941986, "learning_rate": 0.0001226490455255068, "loss": 0.7817745208740234, "memory(GiB)": 90.49, "step": 2115, "token_acc": 0.7686622919535885, "train_speed(iter/s)": 0.089942 }, { "epoch": 0.4569950420349213, "grad_norm": 0.23706865310668945, "learning_rate": 0.0001223017327397759, "loss": 0.8135540962219239, "memory(GiB)": 90.49, "step": 2120, "token_acc": 0.7619580656388453, "train_speed(iter/s)": 0.089942 }, { "epoch": 0.4580728605302867, "grad_norm": 0.2437783181667328, "learning_rate": 0.0001219541366245417, "loss": 0.8251320838928222, "memory(GiB)": 90.49, "step": 2125, "token_acc": 0.7628615767724706, "train_speed(iter/s)": 0.089944 }, { "epoch": 0.4591506790256521, "grad_norm": 0.23320017755031586, "learning_rate": 0.00012160626159579447, "loss": 0.825019359588623, "memory(GiB)": 90.49, "step": 2130, "token_acc": 0.7548112251067353, "train_speed(iter/s)": 0.089943 }, { "epoch": 0.46022849752101747, "grad_norm": 0.2282591015100479, "learning_rate": 0.00012125811207306788, "loss": 0.8097822189331054, "memory(GiB)": 90.49, "step": 2135, "token_acc": 0.7616832529860074, "train_speed(iter/s)": 0.089946 }, { "epoch": 0.46130631601638283, "grad_norm": 0.2396487295627594, "learning_rate": 0.00012090969247938294, "loss": 0.8249451637268066, "memory(GiB)": 90.49, "step": 2140, "token_acc": 0.7554881636717512, "train_speed(iter/s)": 0.089952 }, { "epoch": 0.4623841345117482, "grad_norm": 0.2606123089790344, "learning_rate": 0.00012056100724119164, "loss": 0.8271636009216309, "memory(GiB)": 90.49, "step": 2145, "token_acc": 0.7578725256424813, "train_speed(iter/s)": 0.089954 }, { "epoch": 0.4634619530071136, "grad_norm": 0.25488388538360596, "learning_rate": 0.00012021206078832088, "loss": 0.7970648765563965, "memory(GiB)": 90.49, "step": 2150, "token_acc": 0.7617907148120855, "train_speed(iter/s)": 0.089953 }, { "epoch": 0.464539771502479, "grad_norm": 0.257005900144577, "learning_rate": 0.00011986285755391613, "loss": 0.8295022964477539, "memory(GiB)": 90.49, "step": 2155, "token_acc": 0.7523143757733401, "train_speed(iter/s)": 0.089955 }, { "epoch": 0.46561758999784436, "grad_norm": 0.2528907060623169, "learning_rate": 0.00011951340197438513, "loss": 0.8002513885498047, "memory(GiB)": 90.49, "step": 2160, "token_acc": 0.7603967528329593, "train_speed(iter/s)": 0.089962 }, { "epoch": 0.46669540849320973, "grad_norm": 0.23222331702709198, "learning_rate": 0.0001191636984893414, "loss": 0.8309502601623535, "memory(GiB)": 90.49, "step": 2165, "token_acc": 0.7551370801293175, "train_speed(iter/s)": 0.089968 }, { "epoch": 0.4677732269885751, "grad_norm": 0.2550753653049469, "learning_rate": 0.00011881375154154803, "loss": 0.8236516952514649, "memory(GiB)": 90.49, "step": 2170, "token_acc": 0.7533982885397851, "train_speed(iter/s)": 0.089971 }, { "epoch": 0.4688510454839405, "grad_norm": 0.25124311447143555, "learning_rate": 0.00011846356557686116, "loss": 0.8262309074401856, "memory(GiB)": 90.49, "step": 2175, "token_acc": 0.7625445988005769, "train_speed(iter/s)": 0.089976 }, { "epoch": 0.4699288639793059, "grad_norm": 0.24924340844154358, "learning_rate": 0.00011811314504417348, "loss": 0.8426900863647461, "memory(GiB)": 90.49, "step": 2180, "token_acc": 0.7594443136331528, "train_speed(iter/s)": 0.089981 }, { "epoch": 0.47100668247467126, "grad_norm": 0.2655542194843292, "learning_rate": 0.00011776249439535765, "loss": 0.8152193069458008, "memory(GiB)": 90.49, "step": 2185, "token_acc": 0.7592781873275745, "train_speed(iter/s)": 0.089981 }, { "epoch": 0.4720845009700366, "grad_norm": 0.24368460476398468, "learning_rate": 0.0001174116180852099, "loss": 0.8074836730957031, "memory(GiB)": 90.49, "step": 2190, "token_acc": 0.7596219142883363, "train_speed(iter/s)": 0.089979 }, { "epoch": 0.47316231946540205, "grad_norm": 0.24505382776260376, "learning_rate": 0.00011706052057139335, "loss": 0.8138942718505859, "memory(GiB)": 90.49, "step": 2195, "token_acc": 0.7624359929462576, "train_speed(iter/s)": 0.089977 }, { "epoch": 0.4742401379607674, "grad_norm": 0.24057433009147644, "learning_rate": 0.00011670920631438125, "loss": 0.8016160011291504, "memory(GiB)": 90.49, "step": 2200, "token_acc": 0.7726475953196601, "train_speed(iter/s)": 0.089978 }, { "epoch": 0.4753179564561328, "grad_norm": 0.23985306918621063, "learning_rate": 0.00011635767977740057, "loss": 0.8280704498291016, "memory(GiB)": 90.49, "step": 2205, "token_acc": 0.7550264651003389, "train_speed(iter/s)": 0.089985 }, { "epoch": 0.47639577495149815, "grad_norm": 0.2835563123226166, "learning_rate": 0.00011600594542637511, "loss": 0.7942589282989502, "memory(GiB)": 90.49, "step": 2210, "token_acc": 0.769363358354184, "train_speed(iter/s)": 0.089986 }, { "epoch": 0.4774735934468636, "grad_norm": 0.24187520146369934, "learning_rate": 0.00011565400772986883, "loss": 0.7944571495056152, "memory(GiB)": 90.49, "step": 2215, "token_acc": 0.7644922164566346, "train_speed(iter/s)": 0.089985 }, { "epoch": 0.47855141194222894, "grad_norm": 0.23560170829296112, "learning_rate": 0.00011530187115902901, "loss": 0.8190946578979492, "memory(GiB)": 90.49, "step": 2220, "token_acc": 0.7697421715690382, "train_speed(iter/s)": 0.089987 }, { "epoch": 0.4796292304375943, "grad_norm": 0.263911634683609, "learning_rate": 0.00011494954018752961, "loss": 0.8122806549072266, "memory(GiB)": 90.49, "step": 2225, "token_acc": 0.7655443820785085, "train_speed(iter/s)": 0.089986 }, { "epoch": 0.4807070489329597, "grad_norm": 0.23923230171203613, "learning_rate": 0.00011459701929151423, "loss": 0.8029682159423828, "memory(GiB)": 90.49, "step": 2230, "token_acc": 0.7657172881110266, "train_speed(iter/s)": 0.089989 }, { "epoch": 0.48178486742832505, "grad_norm": 0.22699642181396484, "learning_rate": 0.00011424431294953944, "loss": 0.8009753227233887, "memory(GiB)": 90.49, "step": 2235, "token_acc": 0.7650473599446785, "train_speed(iter/s)": 0.089995 }, { "epoch": 0.48286268592369047, "grad_norm": 0.2517281770706177, "learning_rate": 0.00011389142564251765, "loss": 0.8190506935119629, "memory(GiB)": 90.49, "step": 2240, "token_acc": 0.7596615616460741, "train_speed(iter/s)": 0.089997 }, { "epoch": 0.48394050441905584, "grad_norm": 0.23365731537342072, "learning_rate": 0.00011353836185366041, "loss": 0.796909236907959, "memory(GiB)": 90.49, "step": 2245, "token_acc": 0.7631809423716939, "train_speed(iter/s)": 0.089997 }, { "epoch": 0.4850183229144212, "grad_norm": 0.2798527479171753, "learning_rate": 0.0001131851260684214, "loss": 0.8070747375488281, "memory(GiB)": 90.49, "step": 2250, "token_acc": 0.7626837816923847, "train_speed(iter/s)": 0.089995 }, { "epoch": 0.4860961414097866, "grad_norm": 0.2652362883090973, "learning_rate": 0.00011283172277443928, "loss": 0.8144262313842774, "memory(GiB)": 90.49, "step": 2255, "token_acc": 0.7725116304727581, "train_speed(iter/s)": 0.09 }, { "epoch": 0.487173959905152, "grad_norm": 0.23246917128562927, "learning_rate": 0.00011247815646148087, "loss": 0.7937706470489502, "memory(GiB)": 90.49, "step": 2260, "token_acc": 0.757537482524225, "train_speed(iter/s)": 0.090004 }, { "epoch": 0.48825177840051737, "grad_norm": 0.25758039951324463, "learning_rate": 0.00011212443162138411, "loss": 0.8088253974914551, "memory(GiB)": 90.49, "step": 2265, "token_acc": 0.758174871156922, "train_speed(iter/s)": 0.090004 }, { "epoch": 0.48932959689588273, "grad_norm": 0.2619822323322296, "learning_rate": 0.00011177055274800084, "loss": 0.8077878952026367, "memory(GiB)": 90.49, "step": 2270, "token_acc": 0.7659545700378583, "train_speed(iter/s)": 0.090009 }, { "epoch": 0.4904074153912481, "grad_norm": 0.24306602776050568, "learning_rate": 0.0001114165243371398, "loss": 0.8075483322143555, "memory(GiB)": 90.49, "step": 2275, "token_acc": 0.7623187618297245, "train_speed(iter/s)": 0.090008 }, { "epoch": 0.49148523388661347, "grad_norm": 0.25107342004776, "learning_rate": 0.00011106235088650953, "loss": 0.8278181076049804, "memory(GiB)": 90.49, "step": 2280, "token_acc": 0.7581400535236396, "train_speed(iter/s)": 0.090015 }, { "epoch": 0.4925630523819789, "grad_norm": 0.2615970969200134, "learning_rate": 0.00011070803689566122, "loss": 0.804340934753418, "memory(GiB)": 90.49, "step": 2285, "token_acc": 0.7669444852401234, "train_speed(iter/s)": 0.090019 }, { "epoch": 0.49364087087734426, "grad_norm": 0.2547217607498169, "learning_rate": 0.00011035358686593156, "loss": 0.8005382537841796, "memory(GiB)": 90.49, "step": 2290, "token_acc": 0.7720539907221307, "train_speed(iter/s)": 0.090017 }, { "epoch": 0.49471868937270963, "grad_norm": 0.2419973462820053, "learning_rate": 0.00010999900530038543, "loss": 0.8091569900512695, "memory(GiB)": 90.49, "step": 2295, "token_acc": 0.7539686041278777, "train_speed(iter/s)": 0.090014 }, { "epoch": 0.495796507868075, "grad_norm": 0.22358399629592896, "learning_rate": 0.00010964429670375886, "loss": 0.7994516372680665, "memory(GiB)": 90.49, "step": 2300, "token_acc": 0.7691098625137758, "train_speed(iter/s)": 0.09002 }, { "epoch": 0.4968743263634404, "grad_norm": 0.22894488275051117, "learning_rate": 0.00010928946558240176, "loss": 0.7922653198242188, "memory(GiB)": 90.49, "step": 2305, "token_acc": 0.7719715808170515, "train_speed(iter/s)": 0.090018 }, { "epoch": 0.4979521448588058, "grad_norm": 0.2362579107284546, "learning_rate": 0.00010893451644422061, "loss": 0.8029891014099121, "memory(GiB)": 90.49, "step": 2310, "token_acc": 0.7659860023959862, "train_speed(iter/s)": 0.090017 }, { "epoch": 0.49902996335417116, "grad_norm": 0.2389765977859497, "learning_rate": 0.00010857945379862115, "loss": 0.8136133193969727, "memory(GiB)": 90.49, "step": 2315, "token_acc": 0.7517747834999201, "train_speed(iter/s)": 0.09002 }, { "epoch": 0.5001077818495365, "grad_norm": 0.2386116087436676, "learning_rate": 0.00010822428215645128, "loss": 0.7974858283996582, "memory(GiB)": 90.49, "step": 2320, "token_acc": 0.7615702068865872, "train_speed(iter/s)": 0.090024 }, { "epoch": 0.5011856003449019, "grad_norm": 0.24667155742645264, "learning_rate": 0.00010786900602994359, "loss": 0.8109920501708985, "memory(GiB)": 90.49, "step": 2325, "token_acc": 0.7672210350274087, "train_speed(iter/s)": 0.090027 }, { "epoch": 0.5022634188402673, "grad_norm": 0.23570172488689423, "learning_rate": 0.00010751362993265797, "loss": 0.7962733268737793, "memory(GiB)": 90.49, "step": 2330, "token_acc": 0.7677694912674555, "train_speed(iter/s)": 0.090031 }, { "epoch": 0.5033412373356326, "grad_norm": 0.24215206503868103, "learning_rate": 0.00010715815837942456, "loss": 0.7874496459960938, "memory(GiB)": 90.49, "step": 2335, "token_acc": 0.7700788967112814, "train_speed(iter/s)": 0.090032 }, { "epoch": 0.5044190558309981, "grad_norm": 0.2400427907705307, "learning_rate": 0.00010680259588628607, "loss": 0.8028629302978516, "memory(GiB)": 90.49, "step": 2340, "token_acc": 0.7619983737076784, "train_speed(iter/s)": 0.090035 }, { "epoch": 0.5054968743263635, "grad_norm": 0.21412144601345062, "learning_rate": 0.00010644694697044062, "loss": 0.7881648540496826, "memory(GiB)": 90.49, "step": 2345, "token_acc": 0.7793253614777624, "train_speed(iter/s)": 0.090032 }, { "epoch": 0.5065746928217288, "grad_norm": 0.22316822409629822, "learning_rate": 0.00010609121615018423, "loss": 0.8190513610839844, "memory(GiB)": 90.49, "step": 2350, "token_acc": 0.773549755602693, "train_speed(iter/s)": 0.090028 }, { "epoch": 0.5076525113170942, "grad_norm": 0.23365463316440582, "learning_rate": 0.00010573540794485348, "loss": 0.7784983158111572, "memory(GiB)": 90.49, "step": 2355, "token_acc": 0.7662065283709579, "train_speed(iter/s)": 0.090033 }, { "epoch": 0.5087303298124596, "grad_norm": 0.26420271396636963, "learning_rate": 0.00010537952687476808, "loss": 0.7946662902832031, "memory(GiB)": 90.49, "step": 2360, "token_acc": 0.7655792515914543, "train_speed(iter/s)": 0.090038 }, { "epoch": 0.509808148307825, "grad_norm": 0.22949139773845673, "learning_rate": 0.00010502357746117346, "loss": 0.7891981124877929, "memory(GiB)": 90.49, "step": 2365, "token_acc": 0.764585764294049, "train_speed(iter/s)": 0.09004 }, { "epoch": 0.5108859668031903, "grad_norm": 0.2354673147201538, "learning_rate": 0.00010466756422618329, "loss": 0.7972259998321534, "memory(GiB)": 90.49, "step": 2370, "token_acc": 0.7619544287548139, "train_speed(iter/s)": 0.090037 }, { "epoch": 0.5119637852985557, "grad_norm": 0.29993072152137756, "learning_rate": 0.00010431149169272203, "loss": 0.7985096931457519, "memory(GiB)": 90.49, "step": 2375, "token_acc": 0.7665638368267519, "train_speed(iter/s)": 0.090036 }, { "epoch": 0.513041603793921, "grad_norm": 0.261380136013031, "learning_rate": 0.00010395536438446756, "loss": 0.7701329231262207, "memory(GiB)": 90.49, "step": 2380, "token_acc": 0.7722420897852488, "train_speed(iter/s)": 0.090034 }, { "epoch": 0.5141194222892865, "grad_norm": 0.24752232432365417, "learning_rate": 0.00010359918682579353, "loss": 0.7936477661132812, "memory(GiB)": 90.49, "step": 2385, "token_acc": 0.7662251775619655, "train_speed(iter/s)": 0.090036 }, { "epoch": 0.5151972407846519, "grad_norm": 0.23216986656188965, "learning_rate": 0.00010324296354171207, "loss": 0.8033089637756348, "memory(GiB)": 90.49, "step": 2390, "token_acc": 0.7741091980480619, "train_speed(iter/s)": 0.090033 }, { "epoch": 0.5162750592800173, "grad_norm": 0.22938647866249084, "learning_rate": 0.00010288669905781628, "loss": 0.8016170501708985, "memory(GiB)": 90.49, "step": 2395, "token_acc": 0.7699955949908753, "train_speed(iter/s)": 0.090039 }, { "epoch": 0.5173528777753826, "grad_norm": 0.22938017547130585, "learning_rate": 0.00010253039790022248, "loss": 0.8087236404418945, "memory(GiB)": 90.49, "step": 2400, "token_acc": 0.7678265622153397, "train_speed(iter/s)": 0.090031 }, { "epoch": 0.518430696270748, "grad_norm": 0.24423062801361084, "learning_rate": 0.0001021740645955131, "loss": 0.8220039367675781, "memory(GiB)": 90.49, "step": 2405, "token_acc": 0.7542258406045782, "train_speed(iter/s)": 0.09003 }, { "epoch": 0.5195085147661134, "grad_norm": 0.23492908477783203, "learning_rate": 0.00010181770367067893, "loss": 0.7787254333496094, "memory(GiB)": 90.49, "step": 2410, "token_acc": 0.7715783510623949, "train_speed(iter/s)": 0.090034 }, { "epoch": 0.5205863332614787, "grad_norm": 0.22952096164226532, "learning_rate": 0.0001014613196530616, "loss": 0.7899215698242188, "memory(GiB)": 90.49, "step": 2415, "token_acc": 0.7659656587716986, "train_speed(iter/s)": 0.090041 }, { "epoch": 0.5216641517568441, "grad_norm": 0.23223388195037842, "learning_rate": 0.00010110491707029614, "loss": 0.8032407760620117, "memory(GiB)": 90.49, "step": 2420, "token_acc": 0.7670482056049426, "train_speed(iter/s)": 0.090043 }, { "epoch": 0.5227419702522095, "grad_norm": 0.22685404121875763, "learning_rate": 0.00010074850045025348, "loss": 0.8070381164550782, "memory(GiB)": 90.49, "step": 2425, "token_acc": 0.7678414852655255, "train_speed(iter/s)": 0.090046 }, { "epoch": 0.523819788747575, "grad_norm": 0.24959486722946167, "learning_rate": 0.00010039207432098285, "loss": 0.8024883270263672, "memory(GiB)": 90.49, "step": 2430, "token_acc": 0.7584186606800635, "train_speed(iter/s)": 0.090052 }, { "epoch": 0.5248976072429403, "grad_norm": 0.24010279774665833, "learning_rate": 0.00010003564321065431, "loss": 0.7975437641143799, "memory(GiB)": 90.49, "step": 2435, "token_acc": 0.7665948751932847, "train_speed(iter/s)": 0.090049 }, { "epoch": 0.5259754257383057, "grad_norm": 0.22265958786010742, "learning_rate": 9.967921164750115e-05, "loss": 0.8073368072509766, "memory(GiB)": 90.49, "step": 2440, "token_acc": 0.7628278113021367, "train_speed(iter/s)": 0.090046 }, { "epoch": 0.5270532442336711, "grad_norm": 0.24697081744670868, "learning_rate": 9.932278415976247e-05, "loss": 0.7791303157806396, "memory(GiB)": 90.49, "step": 2445, "token_acc": 0.7810365608678184, "train_speed(iter/s)": 0.090045 }, { "epoch": 0.5281310627290364, "grad_norm": 0.25603240728378296, "learning_rate": 9.896636527562561e-05, "loss": 0.8048822402954101, "memory(GiB)": 90.49, "step": 2450, "token_acc": 0.7660052449155496, "train_speed(iter/s)": 0.090046 }, { "epoch": 0.5292088812244018, "grad_norm": 0.23026348650455475, "learning_rate": 9.860995952316851e-05, "loss": 0.8059176445007324, "memory(GiB)": 90.49, "step": 2455, "token_acc": 0.7628015234870926, "train_speed(iter/s)": 0.090048 }, { "epoch": 0.5302866997197672, "grad_norm": 0.23278585076332092, "learning_rate": 9.825357143030235e-05, "loss": 0.7987946033477783, "memory(GiB)": 90.49, "step": 2460, "token_acc": 0.7612270920784431, "train_speed(iter/s)": 0.090052 }, { "epoch": 0.5313645182151325, "grad_norm": 0.23164083063602448, "learning_rate": 9.789720552471399e-05, "loss": 0.7995065689086914, "memory(GiB)": 90.49, "step": 2465, "token_acc": 0.7599940375818668, "train_speed(iter/s)": 0.090061 }, { "epoch": 0.5324423367104979, "grad_norm": 0.24287478625774384, "learning_rate": 9.754086633380835e-05, "loss": 0.7774336814880372, "memory(GiB)": 90.49, "step": 2470, "token_acc": 0.7687850612419201, "train_speed(iter/s)": 0.090065 }, { "epoch": 0.5335201552058634, "grad_norm": 0.22623340785503387, "learning_rate": 9.718455838465097e-05, "loss": 0.7926499843597412, "memory(GiB)": 90.49, "step": 2475, "token_acc": 0.7571172734657706, "train_speed(iter/s)": 0.090066 }, { "epoch": 0.5345979737012287, "grad_norm": 0.23705744743347168, "learning_rate": 9.68282862039105e-05, "loss": 0.8012151718139648, "memory(GiB)": 90.49, "step": 2480, "token_acc": 0.7579143565386871, "train_speed(iter/s)": 0.090064 }, { "epoch": 0.5356757921965941, "grad_norm": 0.23197300732135773, "learning_rate": 9.647205431780115e-05, "loss": 0.7646932601928711, "memory(GiB)": 90.49, "step": 2485, "token_acc": 0.7727113777745058, "train_speed(iter/s)": 0.090068 }, { "epoch": 0.5367536106919595, "grad_norm": 0.23407894372940063, "learning_rate": 9.611586725202528e-05, "loss": 0.7824219703674317, "memory(GiB)": 90.49, "step": 2490, "token_acc": 0.7705786772037372, "train_speed(iter/s)": 0.090072 }, { "epoch": 0.5378314291873248, "grad_norm": 0.26215818524360657, "learning_rate": 9.575972953171572e-05, "loss": 0.8078936576843262, "memory(GiB)": 90.49, "step": 2495, "token_acc": 0.7628058191249395, "train_speed(iter/s)": 0.090072 }, { "epoch": 0.5389092476826902, "grad_norm": 0.23686009645462036, "learning_rate": 9.540364568137849e-05, "loss": 0.80023193359375, "memory(GiB)": 90.49, "step": 2500, "token_acc": 0.76637590520079, "train_speed(iter/s)": 0.090074 }, { "epoch": 0.5399870661780556, "grad_norm": 0.23283424973487854, "learning_rate": 9.504762022483521e-05, "loss": 0.8061882019042969, "memory(GiB)": 90.49, "step": 2505, "token_acc": 0.7757427314373838, "train_speed(iter/s)": 0.090078 }, { "epoch": 0.541064884673421, "grad_norm": 0.24969616532325745, "learning_rate": 9.469165768516562e-05, "loss": 0.7914113998413086, "memory(GiB)": 90.49, "step": 2510, "token_acc": 0.7580570337017328, "train_speed(iter/s)": 0.090084 }, { "epoch": 0.5421427031687863, "grad_norm": 0.23261533677577972, "learning_rate": 9.433576258465014e-05, "loss": 0.791796350479126, "memory(GiB)": 90.49, "step": 2515, "token_acc": 0.7693231480478203, "train_speed(iter/s)": 0.090088 }, { "epoch": 0.5432205216641518, "grad_norm": 0.25374144315719604, "learning_rate": 9.397993944471244e-05, "loss": 0.7908319473266602, "memory(GiB)": 90.49, "step": 2520, "token_acc": 0.7658196955324499, "train_speed(iter/s)": 0.090083 }, { "epoch": 0.5442983401595172, "grad_norm": 0.23715470731258392, "learning_rate": 9.362419278586198e-05, "loss": 0.8054104804992676, "memory(GiB)": 90.49, "step": 2525, "token_acc": 0.7563167177202265, "train_speed(iter/s)": 0.090085 }, { "epoch": 0.5453761586548825, "grad_norm": 0.23133426904678345, "learning_rate": 9.326852712763652e-05, "loss": 0.7887297153472901, "memory(GiB)": 90.49, "step": 2530, "token_acc": 0.7681709229188819, "train_speed(iter/s)": 0.090085 }, { "epoch": 0.5464539771502479, "grad_norm": 0.243832528591156, "learning_rate": 9.291294698854484e-05, "loss": 0.784972095489502, "memory(GiB)": 90.49, "step": 2535, "token_acc": 0.7719180475603701, "train_speed(iter/s)": 0.090086 }, { "epoch": 0.5475317956456133, "grad_norm": 0.23673878610134125, "learning_rate": 9.255745688600917e-05, "loss": 0.8034584045410156, "memory(GiB)": 90.49, "step": 2540, "token_acc": 0.7769572731725801, "train_speed(iter/s)": 0.090089 }, { "epoch": 0.5486096141409786, "grad_norm": 0.23620684444904327, "learning_rate": 9.2202061336308e-05, "loss": 0.7817193031311035, "memory(GiB)": 90.49, "step": 2545, "token_acc": 0.7686464329321472, "train_speed(iter/s)": 0.090083 }, { "epoch": 0.549687432636344, "grad_norm": 0.2243748903274536, "learning_rate": 9.184676485451841e-05, "loss": 0.7674824714660644, "memory(GiB)": 90.49, "step": 2550, "token_acc": 0.7720758976066905, "train_speed(iter/s)": 0.090086 }, { "epoch": 0.5507652511317094, "grad_norm": 0.2502426207065582, "learning_rate": 9.149157195445906e-05, "loss": 0.7758408069610596, "memory(GiB)": 90.49, "step": 2555, "token_acc": 0.7728167409095618, "train_speed(iter/s)": 0.09009 }, { "epoch": 0.5518430696270749, "grad_norm": 0.24001875519752502, "learning_rate": 9.113648714863257e-05, "loss": 0.8170134544372558, "memory(GiB)": 90.49, "step": 2560, "token_acc": 0.7705133025798109, "train_speed(iter/s)": 0.090089 }, { "epoch": 0.5529208881224402, "grad_norm": 0.24663668870925903, "learning_rate": 9.078151494816832e-05, "loss": 0.805938720703125, "memory(GiB)": 90.49, "step": 2565, "token_acc": 0.7613029630523804, "train_speed(iter/s)": 0.090088 }, { "epoch": 0.5539987066178056, "grad_norm": 0.2387453019618988, "learning_rate": 9.042665986276509e-05, "loss": 0.8034687042236328, "memory(GiB)": 90.49, "step": 2570, "token_acc": 0.7557641426231574, "train_speed(iter/s)": 0.090094 }, { "epoch": 0.555076525113171, "grad_norm": 0.2210591733455658, "learning_rate": 9.00719264006338e-05, "loss": 0.8042877197265625, "memory(GiB)": 90.49, "step": 2575, "token_acc": 0.7667553983292762, "train_speed(iter/s)": 0.090095 }, { "epoch": 0.5561543436085363, "grad_norm": 0.2292827069759369, "learning_rate": 8.971731906844026e-05, "loss": 0.7645621299743652, "memory(GiB)": 90.49, "step": 2580, "token_acc": 0.7757638776269472, "train_speed(iter/s)": 0.090092 }, { "epoch": 0.5572321621039017, "grad_norm": 0.24992945790290833, "learning_rate": 8.936284237124778e-05, "loss": 0.7888860702514648, "memory(GiB)": 90.49, "step": 2585, "token_acc": 0.7636960434170404, "train_speed(iter/s)": 0.090094 }, { "epoch": 0.5583099805992671, "grad_norm": 0.23239120841026306, "learning_rate": 8.900850081246013e-05, "loss": 0.7885291099548339, "memory(GiB)": 91.26, "step": 2590, "token_acc": 0.7717062089853609, "train_speed(iter/s)": 0.09009 }, { "epoch": 0.5593877990946324, "grad_norm": 0.22316664457321167, "learning_rate": 8.865429889376421e-05, "loss": 0.7822419166564941, "memory(GiB)": 91.26, "step": 2595, "token_acc": 0.770089067240873, "train_speed(iter/s)": 0.090095 }, { "epoch": 0.5604656175899978, "grad_norm": 0.22324971854686737, "learning_rate": 8.830024111507287e-05, "loss": 0.7921559810638428, "memory(GiB)": 91.26, "step": 2600, "token_acc": 0.7609959101671809, "train_speed(iter/s)": 0.090095 }, { "epoch": 0.5615434360853633, "grad_norm": 0.2255001664161682, "learning_rate": 8.79463319744677e-05, "loss": 0.7612745285034179, "memory(GiB)": 91.26, "step": 2605, "token_acc": 0.7796043334903439, "train_speed(iter/s)": 0.090094 }, { "epoch": 0.5626212545807286, "grad_norm": 0.22387778759002686, "learning_rate": 8.759257596814206e-05, "loss": 0.7995076656341553, "memory(GiB)": 91.26, "step": 2610, "token_acc": 0.76524016543064, "train_speed(iter/s)": 0.090089 }, { "epoch": 0.563699073076094, "grad_norm": 0.2278072088956833, "learning_rate": 8.72389775903437e-05, "loss": 0.7870664119720459, "memory(GiB)": 91.26, "step": 2615, "token_acc": 0.770896323907687, "train_speed(iter/s)": 0.09009 }, { "epoch": 0.5647768915714594, "grad_norm": 0.23164436221122742, "learning_rate": 8.68855413333179e-05, "loss": 0.8056714057922363, "memory(GiB)": 91.26, "step": 2620, "token_acc": 0.7751472912774824, "train_speed(iter/s)": 0.090092 }, { "epoch": 0.5658547100668248, "grad_norm": 0.23355039954185486, "learning_rate": 8.653227168725022e-05, "loss": 0.7567886352539063, "memory(GiB)": 91.26, "step": 2625, "token_acc": 0.7753473221163216, "train_speed(iter/s)": 0.090095 }, { "epoch": 0.5669325285621901, "grad_norm": 0.23572733998298645, "learning_rate": 8.61791731402096e-05, "loss": 0.7818819046020508, "memory(GiB)": 91.26, "step": 2630, "token_acc": 0.7616350177019633, "train_speed(iter/s)": 0.090092 }, { "epoch": 0.5680103470575555, "grad_norm": 0.24394263327121735, "learning_rate": 8.582625017809124e-05, "loss": 0.7560413360595704, "memory(GiB)": 91.26, "step": 2635, "token_acc": 0.7735571011554454, "train_speed(iter/s)": 0.090092 }, { "epoch": 0.5690881655529209, "grad_norm": 0.23602162301540375, "learning_rate": 8.547350728455963e-05, "loss": 0.8012054443359375, "memory(GiB)": 91.26, "step": 2640, "token_acc": 0.7568030738980238, "train_speed(iter/s)": 0.090089 }, { "epoch": 0.5701659840482862, "grad_norm": 0.23921601474285126, "learning_rate": 8.512094894099163e-05, "loss": 0.7841882228851318, "memory(GiB)": 91.26, "step": 2645, "token_acc": 0.759445009470669, "train_speed(iter/s)": 0.090087 }, { "epoch": 0.5712438025436517, "grad_norm": 0.21789836883544922, "learning_rate": 8.47685796264195e-05, "loss": 0.7572050571441651, "memory(GiB)": 91.26, "step": 2650, "token_acc": 0.7719941551101431, "train_speed(iter/s)": 0.090089 }, { "epoch": 0.5723216210390171, "grad_norm": 0.22752448916435242, "learning_rate": 8.441640381747405e-05, "loss": 0.768179988861084, "memory(GiB)": 91.26, "step": 2655, "token_acc": 0.7617492533016661, "train_speed(iter/s)": 0.090089 }, { "epoch": 0.5733994395343824, "grad_norm": 0.21738629043102264, "learning_rate": 8.40644259883276e-05, "loss": 0.788800048828125, "memory(GiB)": 91.26, "step": 2660, "token_acc": 0.755604815455594, "train_speed(iter/s)": 0.090086 }, { "epoch": 0.5744772580297478, "grad_norm": 0.23287540674209595, "learning_rate": 8.371265061063736e-05, "loss": 0.7773524761199951, "memory(GiB)": 91.26, "step": 2665, "token_acc": 0.7619732627500624, "train_speed(iter/s)": 0.09009 }, { "epoch": 0.5755550765251132, "grad_norm": 0.2312609702348709, "learning_rate": 8.33610821534885e-05, "loss": 0.7647912502288818, "memory(GiB)": 91.26, "step": 2670, "token_acc": 0.7669564130939207, "train_speed(iter/s)": 0.090084 }, { "epoch": 0.5766328950204785, "grad_norm": 0.21868187189102173, "learning_rate": 8.300972508333744e-05, "loss": 0.7689993858337403, "memory(GiB)": 91.26, "step": 2675, "token_acc": 0.7752472948497655, "train_speed(iter/s)": 0.090084 }, { "epoch": 0.5777107135158439, "grad_norm": 0.22506676614284515, "learning_rate": 8.265858386395495e-05, "loss": 0.7717408657073974, "memory(GiB)": 91.26, "step": 2680, "token_acc": 0.7766688824338229, "train_speed(iter/s)": 0.09009 }, { "epoch": 0.5787885320112093, "grad_norm": 0.2356419861316681, "learning_rate": 8.230766295636966e-05, "loss": 0.7695927619934082, "memory(GiB)": 91.26, "step": 2685, "token_acc": 0.7649457781073316, "train_speed(iter/s)": 0.09009 }, { "epoch": 0.5798663505065746, "grad_norm": 0.22720186412334442, "learning_rate": 8.19569668188112e-05, "loss": 0.7834826469421386, "memory(GiB)": 91.26, "step": 2690, "token_acc": 0.7782550985632545, "train_speed(iter/s)": 0.090088 }, { "epoch": 0.5809441690019401, "grad_norm": 0.27695170044898987, "learning_rate": 8.160649990665372e-05, "loss": 0.7773715496063233, "memory(GiB)": 91.26, "step": 2695, "token_acc": 0.7719570860976449, "train_speed(iter/s)": 0.090085 }, { "epoch": 0.5820219874973055, "grad_norm": 0.22598253190517426, "learning_rate": 8.125626667235908e-05, "loss": 0.7652709007263183, "memory(GiB)": 91.26, "step": 2700, "token_acc": 0.7841398530529516, "train_speed(iter/s)": 0.090087 }, { "epoch": 0.5830998059926709, "grad_norm": 0.22998595237731934, "learning_rate": 8.090627156542056e-05, "loss": 0.7950494289398193, "memory(GiB)": 91.26, "step": 2705, "token_acc": 0.764484827099506, "train_speed(iter/s)": 0.090086 }, { "epoch": 0.5841776244880362, "grad_norm": 0.22094503045082092, "learning_rate": 8.055651903230607e-05, "loss": 0.7882239341735839, "memory(GiB)": 91.26, "step": 2710, "token_acc": 0.7731024156882794, "train_speed(iter/s)": 0.090086 }, { "epoch": 0.5852554429834016, "grad_norm": 0.21647648513317108, "learning_rate": 8.020701351640182e-05, "loss": 0.772715950012207, "memory(GiB)": 91.26, "step": 2715, "token_acc": 0.7717122031927738, "train_speed(iter/s)": 0.090086 }, { "epoch": 0.586333261478767, "grad_norm": 0.22469334304332733, "learning_rate": 7.98577594579558e-05, "loss": 0.7961447715759278, "memory(GiB)": 91.26, "step": 2720, "token_acc": 0.7713599685804056, "train_speed(iter/s)": 0.090085 }, { "epoch": 0.5874110799741323, "grad_norm": 0.25474849343299866, "learning_rate": 7.95087612940214e-05, "loss": 0.7800432682037354, "memory(GiB)": 91.26, "step": 2725, "token_acc": 0.7743721393398442, "train_speed(iter/s)": 0.090083 }, { "epoch": 0.5884888984694977, "grad_norm": 0.224869504570961, "learning_rate": 7.916002345840108e-05, "loss": 0.7897933483123779, "memory(GiB)": 91.26, "step": 2730, "token_acc": 0.7660389102122804, "train_speed(iter/s)": 0.090085 }, { "epoch": 0.5895667169648631, "grad_norm": 0.22615066170692444, "learning_rate": 7.881155038158989e-05, "loss": 0.7665471076965332, "memory(GiB)": 91.26, "step": 2735, "token_acc": 0.7782169805221792, "train_speed(iter/s)": 0.090083 }, { "epoch": 0.5906445354602285, "grad_norm": 0.21062037348747253, "learning_rate": 7.846334649071935e-05, "loss": 0.775861406326294, "memory(GiB)": 91.26, "step": 2740, "token_acc": 0.7693980531340023, "train_speed(iter/s)": 0.09008 }, { "epoch": 0.5917223539555939, "grad_norm": 0.22038547694683075, "learning_rate": 7.811541620950112e-05, "loss": 0.7700429916381836, "memory(GiB)": 91.26, "step": 2745, "token_acc": 0.7807179507113426, "train_speed(iter/s)": 0.09008 }, { "epoch": 0.5928001724509593, "grad_norm": 0.22273953258991241, "learning_rate": 7.776776395817088e-05, "loss": 0.7884064197540284, "memory(GiB)": 91.26, "step": 2750, "token_acc": 0.7703299725022915, "train_speed(iter/s)": 0.090082 }, { "epoch": 0.5938779909463247, "grad_norm": 0.21273677051067352, "learning_rate": 7.742039415343205e-05, "loss": 0.7765313148498535, "memory(GiB)": 91.26, "step": 2755, "token_acc": 0.7684413718800502, "train_speed(iter/s)": 0.090081 }, { "epoch": 0.59495580944169, "grad_norm": 0.24257369339466095, "learning_rate": 7.707331120839977e-05, "loss": 0.7813675880432129, "memory(GiB)": 91.26, "step": 2760, "token_acc": 0.7657473874636747, "train_speed(iter/s)": 0.090084 }, { "epoch": 0.5960336279370554, "grad_norm": 0.2433798462152481, "learning_rate": 7.672651953254479e-05, "loss": 0.7873180389404297, "memory(GiB)": 91.26, "step": 2765, "token_acc": 0.7722831791715636, "train_speed(iter/s)": 0.090084 }, { "epoch": 0.5971114464324208, "grad_norm": 0.2182365208864212, "learning_rate": 7.638002353163745e-05, "loss": 0.7649801254272461, "memory(GiB)": 91.26, "step": 2770, "token_acc": 0.7776335571347197, "train_speed(iter/s)": 0.090087 }, { "epoch": 0.5981892649277861, "grad_norm": 0.24012009799480438, "learning_rate": 7.603382760769182e-05, "loss": 0.7850162029266358, "memory(GiB)": 91.26, "step": 2775, "token_acc": 0.7783084819878574, "train_speed(iter/s)": 0.090085 }, { "epoch": 0.5992670834231515, "grad_norm": 0.26574277877807617, "learning_rate": 7.568793615890954e-05, "loss": 0.7820779800415039, "memory(GiB)": 91.26, "step": 2780, "token_acc": 0.7635955181863943, "train_speed(iter/s)": 0.090087 }, { "epoch": 0.600344901918517, "grad_norm": 0.22515758872032166, "learning_rate": 7.534235357962421e-05, "loss": 0.7796339511871337, "memory(GiB)": 91.26, "step": 2785, "token_acc": 0.7708018989438045, "train_speed(iter/s)": 0.090088 }, { "epoch": 0.6014227204138823, "grad_norm": 0.2376565933227539, "learning_rate": 7.499708426024536e-05, "loss": 0.77886962890625, "memory(GiB)": 91.26, "step": 2790, "token_acc": 0.7638633828198748, "train_speed(iter/s)": 0.09009 }, { "epoch": 0.6025005389092477, "grad_norm": 0.2498820722103119, "learning_rate": 7.465213258720282e-05, "loss": 0.7755109786987304, "memory(GiB)": 91.26, "step": 2795, "token_acc": 0.7709488445038978, "train_speed(iter/s)": 0.09009 }, { "epoch": 0.6035783574046131, "grad_norm": 0.22499103844165802, "learning_rate": 7.43075029428908e-05, "loss": 0.7520609855651855, "memory(GiB)": 91.26, "step": 2800, "token_acc": 0.7690969136206719, "train_speed(iter/s)": 0.090094 }, { "epoch": 0.6046561758999784, "grad_norm": 0.22083409130573273, "learning_rate": 7.396319970561248e-05, "loss": 0.7658527851104736, "memory(GiB)": 91.26, "step": 2805, "token_acc": 0.7764925851923817, "train_speed(iter/s)": 0.090095 }, { "epoch": 0.6057339943953438, "grad_norm": 0.22696027159690857, "learning_rate": 7.361922724952414e-05, "loss": 0.7576547145843506, "memory(GiB)": 91.26, "step": 2810, "token_acc": 0.7665655550317123, "train_speed(iter/s)": 0.090092 }, { "epoch": 0.6068118128907092, "grad_norm": 0.2216215580701828, "learning_rate": 7.327558994457979e-05, "loss": 0.7674195289611816, "memory(GiB)": 91.26, "step": 2815, "token_acc": 0.7733462297071765, "train_speed(iter/s)": 0.090093 }, { "epoch": 0.6078896313860745, "grad_norm": 0.2342635989189148, "learning_rate": 7.293229215647543e-05, "loss": 0.7803627014160156, "memory(GiB)": 91.26, "step": 2820, "token_acc": 0.777587108733356, "train_speed(iter/s)": 0.090095 }, { "epoch": 0.6089674498814399, "grad_norm": 0.23657168447971344, "learning_rate": 7.258933824659381e-05, "loss": 0.7683358192443848, "memory(GiB)": 91.26, "step": 2825, "token_acc": 0.758906552339037, "train_speed(iter/s)": 0.090096 }, { "epoch": 0.6100452683768054, "grad_norm": 0.2253877967596054, "learning_rate": 7.224673257194892e-05, "loss": 0.7791478633880615, "memory(GiB)": 91.26, "step": 2830, "token_acc": 0.7712530255073543, "train_speed(iter/s)": 0.090096 }, { "epoch": 0.6111230868721708, "grad_norm": 0.23033136129379272, "learning_rate": 7.190447948513058e-05, "loss": 0.7801113128662109, "memory(GiB)": 91.26, "step": 2835, "token_acc": 0.772783578127765, "train_speed(iter/s)": 0.0901 }, { "epoch": 0.6122009053675361, "grad_norm": 0.23552431166172028, "learning_rate": 7.156258333424921e-05, "loss": 0.7833035945892334, "memory(GiB)": 91.26, "step": 2840, "token_acc": 0.7633513878867039, "train_speed(iter/s)": 0.090103 }, { "epoch": 0.6132787238629015, "grad_norm": 0.21898065507411957, "learning_rate": 7.122104846288064e-05, "loss": 0.7648891925811767, "memory(GiB)": 91.26, "step": 2845, "token_acc": 0.7768114659824825, "train_speed(iter/s)": 0.090104 }, { "epoch": 0.6143565423582669, "grad_norm": 0.24331991374492645, "learning_rate": 7.087987921001086e-05, "loss": 0.7661304473876953, "memory(GiB)": 91.26, "step": 2850, "token_acc": 0.774717295783093, "train_speed(iter/s)": 0.090107 }, { "epoch": 0.6154343608536322, "grad_norm": 0.24027590453624725, "learning_rate": 7.053907990998088e-05, "loss": 0.7590895175933838, "memory(GiB)": 91.26, "step": 2855, "token_acc": 0.7795478826499258, "train_speed(iter/s)": 0.090106 }, { "epoch": 0.6165121793489976, "grad_norm": 0.2232464998960495, "learning_rate": 7.019865489243165e-05, "loss": 0.7666661739349365, "memory(GiB)": 91.26, "step": 2860, "token_acc": 0.7837294913480585, "train_speed(iter/s)": 0.090109 }, { "epoch": 0.617589997844363, "grad_norm": 0.23570337891578674, "learning_rate": 6.985860848224919e-05, "loss": 0.7735125064849854, "memory(GiB)": 91.26, "step": 2865, "token_acc": 0.7649516270888302, "train_speed(iter/s)": 0.090115 }, { "epoch": 0.6186678163397283, "grad_norm": 0.2324032485485077, "learning_rate": 6.95189449995095e-05, "loss": 0.7750429153442383, "memory(GiB)": 91.26, "step": 2870, "token_acc": 0.7662459599516097, "train_speed(iter/s)": 0.090116 }, { "epoch": 0.6197456348350938, "grad_norm": 0.22324007749557495, "learning_rate": 6.917966875942372e-05, "loss": 0.784095287322998, "memory(GiB)": 91.26, "step": 2875, "token_acc": 0.7626368688891276, "train_speed(iter/s)": 0.09012 }, { "epoch": 0.6208234533304592, "grad_norm": 0.21696695685386658, "learning_rate": 6.884078407228333e-05, "loss": 0.775106143951416, "memory(GiB)": 91.26, "step": 2880, "token_acc": 0.7778737964926982, "train_speed(iter/s)": 0.090121 }, { "epoch": 0.6219012718258246, "grad_norm": 0.2195284515619278, "learning_rate": 6.85022952434054e-05, "loss": 0.7607449531555176, "memory(GiB)": 91.26, "step": 2885, "token_acc": 0.7780740394018673, "train_speed(iter/s)": 0.090122 }, { "epoch": 0.6229790903211899, "grad_norm": 0.23662586510181427, "learning_rate": 6.816420657307781e-05, "loss": 0.7792176246643067, "memory(GiB)": 91.26, "step": 2890, "token_acc": 0.767479587834706, "train_speed(iter/s)": 0.090124 }, { "epoch": 0.6240569088165553, "grad_norm": 0.22549855709075928, "learning_rate": 6.782652235650471e-05, "loss": 0.7570499897003173, "memory(GiB)": 91.26, "step": 2895, "token_acc": 0.7775567741170245, "train_speed(iter/s)": 0.090124 }, { "epoch": 0.6251347273119207, "grad_norm": 0.22042401134967804, "learning_rate": 6.748924688375196e-05, "loss": 0.7415219783782959, "memory(GiB)": 91.26, "step": 2900, "token_acc": 0.7904641034303297, "train_speed(iter/s)": 0.090124 }, { "epoch": 0.626212545807286, "grad_norm": 0.21684907376766205, "learning_rate": 6.715238443969252e-05, "loss": 0.7441881656646728, "memory(GiB)": 91.26, "step": 2905, "token_acc": 0.7692964967245799, "train_speed(iter/s)": 0.090127 }, { "epoch": 0.6272903643026514, "grad_norm": 0.22271089255809784, "learning_rate": 6.681593930395209e-05, "loss": 0.7633299350738525, "memory(GiB)": 91.26, "step": 2910, "token_acc": 0.7758491180756039, "train_speed(iter/s)": 0.090127 }, { "epoch": 0.6283681827980168, "grad_norm": 0.21738263964653015, "learning_rate": 6.647991575085476e-05, "loss": 0.7626672267913819, "memory(GiB)": 91.26, "step": 2915, "token_acc": 0.7785829593230562, "train_speed(iter/s)": 0.090128 }, { "epoch": 0.6294460012933822, "grad_norm": 0.23286406695842743, "learning_rate": 6.614431804936867e-05, "loss": 0.7610584259033203, "memory(GiB)": 91.26, "step": 2920, "token_acc": 0.770055081684167, "train_speed(iter/s)": 0.090125 }, { "epoch": 0.6305238197887476, "grad_norm": 0.23772740364074707, "learning_rate": 6.58091504630518e-05, "loss": 0.7727019309997558, "memory(GiB)": 91.26, "step": 2925, "token_acc": 0.7643894814269953, "train_speed(iter/s)": 0.090129 }, { "epoch": 0.631601638284113, "grad_norm": 0.22674471139907837, "learning_rate": 6.54744172499977e-05, "loss": 0.7692279815673828, "memory(GiB)": 91.26, "step": 2930, "token_acc": 0.7630296373839475, "train_speed(iter/s)": 0.090126 }, { "epoch": 0.6326794567794783, "grad_norm": 0.23084431886672974, "learning_rate": 6.51401226627816e-05, "loss": 0.7705416679382324, "memory(GiB)": 91.26, "step": 2935, "token_acc": 0.7666235185032177, "train_speed(iter/s)": 0.090125 }, { "epoch": 0.6337572752748437, "grad_norm": 0.21725726127624512, "learning_rate": 6.480627094840618e-05, "loss": 0.7639439105987549, "memory(GiB)": 91.26, "step": 2940, "token_acc": 0.7689072467586233, "train_speed(iter/s)": 0.090129 }, { "epoch": 0.6348350937702091, "grad_norm": 0.2143336981534958, "learning_rate": 6.447286634824777e-05, "loss": 0.7613083839416503, "memory(GiB)": 91.26, "step": 2945, "token_acc": 0.7773929895517357, "train_speed(iter/s)": 0.090123 }, { "epoch": 0.6359129122655744, "grad_norm": 0.22228439152240753, "learning_rate": 6.413991309800227e-05, "loss": 0.7811121463775634, "memory(GiB)": 91.26, "step": 2950, "token_acc": 0.769351063013278, "train_speed(iter/s)": 0.090123 }, { "epoch": 0.6369907307609398, "grad_norm": 0.23207572102546692, "learning_rate": 6.380741542763162e-05, "loss": 0.7662136077880859, "memory(GiB)": 91.26, "step": 2955, "token_acc": 0.7722528202637439, "train_speed(iter/s)": 0.090126 }, { "epoch": 0.6380685492563052, "grad_norm": 0.24606452882289886, "learning_rate": 6.347537756130981e-05, "loss": 0.7438564300537109, "memory(GiB)": 91.26, "step": 2960, "token_acc": 0.7774567306307777, "train_speed(iter/s)": 0.090125 }, { "epoch": 0.6391463677516707, "grad_norm": 0.23862026631832123, "learning_rate": 6.314380371736931e-05, "loss": 0.7705308437347412, "memory(GiB)": 91.26, "step": 2965, "token_acc": 0.7645277755709153, "train_speed(iter/s)": 0.090122 }, { "epoch": 0.640224186247036, "grad_norm": 0.21817252039909363, "learning_rate": 6.28126981082475e-05, "loss": 0.7643505096435547, "memory(GiB)": 91.26, "step": 2970, "token_acc": 0.7754815953694493, "train_speed(iter/s)": 0.090122 }, { "epoch": 0.6413020047424014, "grad_norm": 0.23491759598255157, "learning_rate": 6.248206494043313e-05, "loss": 0.7558906555175782, "memory(GiB)": 91.26, "step": 2975, "token_acc": 0.7746024026068771, "train_speed(iter/s)": 0.090124 }, { "epoch": 0.6423798232377668, "grad_norm": 0.22570013999938965, "learning_rate": 6.21519084144129e-05, "loss": 0.7753072738647461, "memory(GiB)": 91.26, "step": 2980, "token_acc": 0.7639809109020989, "train_speed(iter/s)": 0.090126 }, { "epoch": 0.6434576417331321, "grad_norm": 0.22683808207511902, "learning_rate": 6.182223272461798e-05, "loss": 0.7664592266082764, "memory(GiB)": 91.26, "step": 2985, "token_acc": 0.7762008424980916, "train_speed(iter/s)": 0.090126 }, { "epoch": 0.6445354602284975, "grad_norm": 0.21660880744457245, "learning_rate": 6.149304205937096e-05, "loss": 0.7602437496185303, "memory(GiB)": 91.26, "step": 2990, "token_acc": 0.7691259095514414, "train_speed(iter/s)": 0.090128 }, { "epoch": 0.6456132787238629, "grad_norm": 0.21284456551074982, "learning_rate": 6.116434060083241e-05, "loss": 0.7751906394958497, "memory(GiB)": 91.26, "step": 2995, "token_acc": 0.7663774318302283, "train_speed(iter/s)": 0.090131 }, { "epoch": 0.6466910972192282, "grad_norm": 0.21843643486499786, "learning_rate": 6.083613252494784e-05, "loss": 0.7361810684204102, "memory(GiB)": 91.26, "step": 3000, "token_acc": 0.7789757673042335, "train_speed(iter/s)": 0.090131 }, { "epoch": 0.6477689157145936, "grad_norm": 0.22340431809425354, "learning_rate": 6.0508422001394635e-05, "loss": 0.748606014251709, "memory(GiB)": 91.26, "step": 3005, "token_acc": 0.7700260078023407, "train_speed(iter/s)": 0.090134 }, { "epoch": 0.6488467342099591, "grad_norm": 0.2277083396911621, "learning_rate": 6.0181213193529154e-05, "loss": 0.7752620697021484, "memory(GiB)": 91.26, "step": 3010, "token_acc": 0.7564338402695565, "train_speed(iter/s)": 0.090131 }, { "epoch": 0.6499245527053245, "grad_norm": 0.22204837203025818, "learning_rate": 5.985451025833371e-05, "loss": 0.7694263458251953, "memory(GiB)": 91.26, "step": 3015, "token_acc": 0.7730884033196638, "train_speed(iter/s)": 0.090127 }, { "epoch": 0.6510023712006898, "grad_norm": 0.22877106070518494, "learning_rate": 5.952831734636386e-05, "loss": 0.7575160026550293, "memory(GiB)": 91.26, "step": 3020, "token_acc": 0.7706601067499365, "train_speed(iter/s)": 0.090129 }, { "epoch": 0.6520801896960552, "grad_norm": 0.2226395457983017, "learning_rate": 5.920263860169557e-05, "loss": 0.7475278854370118, "memory(GiB)": 91.26, "step": 3025, "token_acc": 0.7707781844592464, "train_speed(iter/s)": 0.090134 }, { "epoch": 0.6531580081914206, "grad_norm": 0.23140117526054382, "learning_rate": 5.8877478161872726e-05, "loss": 0.7617078304290772, "memory(GiB)": 91.26, "step": 3030, "token_acc": 0.7727054039088244, "train_speed(iter/s)": 0.090135 }, { "epoch": 0.6542358266867859, "grad_norm": 0.23837780952453613, "learning_rate": 5.855284015785444e-05, "loss": 0.7633691310882569, "memory(GiB)": 91.26, "step": 3035, "token_acc": 0.7702246219122454, "train_speed(iter/s)": 0.090134 }, { "epoch": 0.6553136451821513, "grad_norm": 0.22968235611915588, "learning_rate": 5.8228728713962543e-05, "loss": 0.7584036827087403, "memory(GiB)": 91.26, "step": 3040, "token_acc": 0.7641401077608344, "train_speed(iter/s)": 0.090139 }, { "epoch": 0.6563914636775167, "grad_norm": 0.21449266374111176, "learning_rate": 5.790514794782938e-05, "loss": 0.7355292320251465, "memory(GiB)": 91.26, "step": 3045, "token_acc": 0.7813894016305184, "train_speed(iter/s)": 0.090139 }, { "epoch": 0.6574692821728821, "grad_norm": 0.22127164900302887, "learning_rate": 5.758210197034524e-05, "loss": 0.7592841148376465, "memory(GiB)": 91.26, "step": 3050, "token_acc": 0.7820042675435761, "train_speed(iter/s)": 0.090145 }, { "epoch": 0.6585471006682475, "grad_norm": 0.2207377403974533, "learning_rate": 5.7259594885606286e-05, "loss": 0.7387511730194092, "memory(GiB)": 91.26, "step": 3055, "token_acc": 0.7735070385590627, "train_speed(iter/s)": 0.090143 }, { "epoch": 0.6596249191636129, "grad_norm": 0.2256963849067688, "learning_rate": 5.69376307908625e-05, "loss": 0.7541752815246582, "memory(GiB)": 91.26, "step": 3060, "token_acc": 0.7658658392163703, "train_speed(iter/s)": 0.090145 }, { "epoch": 0.6607027376589782, "grad_norm": 0.22818222641944885, "learning_rate": 5.6616213776465376e-05, "loss": 0.7558613300323487, "memory(GiB)": 91.26, "step": 3065, "token_acc": 0.7741589278270048, "train_speed(iter/s)": 0.090149 }, { "epoch": 0.6617805561543436, "grad_norm": 0.24055013060569763, "learning_rate": 5.6295347925816124e-05, "loss": 0.743309497833252, "memory(GiB)": 91.26, "step": 3070, "token_acc": 0.7741477148909093, "train_speed(iter/s)": 0.09015 }, { "epoch": 0.662858374649709, "grad_norm": 0.22053426504135132, "learning_rate": 5.597503731531386e-05, "loss": 0.7593018054962158, "memory(GiB)": 91.26, "step": 3075, "token_acc": 0.7726624413189885, "train_speed(iter/s)": 0.090154 }, { "epoch": 0.6639361931450743, "grad_norm": 0.22992464900016785, "learning_rate": 5.5655286014303585e-05, "loss": 0.7653859615325928, "memory(GiB)": 91.26, "step": 3080, "token_acc": 0.7662768632509792, "train_speed(iter/s)": 0.090156 }, { "epoch": 0.6650140116404397, "grad_norm": 0.2153659164905548, "learning_rate": 5.5336098085024755e-05, "loss": 0.7788686752319336, "memory(GiB)": 91.26, "step": 3085, "token_acc": 0.764640501439452, "train_speed(iter/s)": 0.090153 }, { "epoch": 0.6660918301358051, "grad_norm": 0.22455787658691406, "learning_rate": 5.501747758255946e-05, "loss": 0.7530778884887696, "memory(GiB)": 91.26, "step": 3090, "token_acc": 0.7756308130479285, "train_speed(iter/s)": 0.090151 }, { "epoch": 0.6671696486311706, "grad_norm": 0.22393487393856049, "learning_rate": 5.469942855478096e-05, "loss": 0.7602992057800293, "memory(GiB)": 91.26, "step": 3095, "token_acc": 0.7723962091691923, "train_speed(iter/s)": 0.090154 }, { "epoch": 0.6682474671265359, "grad_norm": 0.2411319762468338, "learning_rate": 5.438195504230241e-05, "loss": 0.7764713764190674, "memory(GiB)": 91.26, "step": 3100, "token_acc": 0.7749024695856179, "train_speed(iter/s)": 0.090153 }, { "epoch": 0.6693252856219013, "grad_norm": 0.2184331715106964, "learning_rate": 5.4065061078425315e-05, "loss": 0.7678329467773437, "memory(GiB)": 91.26, "step": 3105, "token_acc": 0.7564337158258142, "train_speed(iter/s)": 0.090153 }, { "epoch": 0.6704031041172667, "grad_norm": 0.22201864421367645, "learning_rate": 5.374875068908833e-05, "loss": 0.7805136203765869, "memory(GiB)": 91.26, "step": 3110, "token_acc": 0.7772749342689887, "train_speed(iter/s)": 0.090147 }, { "epoch": 0.671480922612632, "grad_norm": 0.21732836961746216, "learning_rate": 5.343302789281632e-05, "loss": 0.7617403030395508, "memory(GiB)": 91.26, "step": 3115, "token_acc": 0.7772888122674894, "train_speed(iter/s)": 0.090147 }, { "epoch": 0.6725587411079974, "grad_norm": 0.22248712182044983, "learning_rate": 5.311789670066901e-05, "loss": 0.7297797203063965, "memory(GiB)": 91.26, "step": 3120, "token_acc": 0.7818937148010631, "train_speed(iter/s)": 0.090143 }, { "epoch": 0.6736365596033628, "grad_norm": 0.23490993678569794, "learning_rate": 5.2803361116190176e-05, "loss": 0.7482430458068847, "memory(GiB)": 91.26, "step": 3125, "token_acc": 0.7840391280634775, "train_speed(iter/s)": 0.090145 }, { "epoch": 0.6747143780987281, "grad_norm": 0.22323934733867645, "learning_rate": 5.248942513535691e-05, "loss": 0.7464156150817871, "memory(GiB)": 91.26, "step": 3130, "token_acc": 0.7829117092141235, "train_speed(iter/s)": 0.090149 }, { "epoch": 0.6757921965940935, "grad_norm": 0.2088039666414261, "learning_rate": 5.2176092746528505e-05, "loss": 0.7474268436431885, "memory(GiB)": 91.26, "step": 3135, "token_acc": 0.7679528501703656, "train_speed(iter/s)": 0.090148 }, { "epoch": 0.676870015089459, "grad_norm": 0.2152319848537445, "learning_rate": 5.1863367930396214e-05, "loss": 0.7568321228027344, "memory(GiB)": 91.26, "step": 3140, "token_acc": 0.7745667986631842, "train_speed(iter/s)": 0.09015 }, { "epoch": 0.6779478335848244, "grad_norm": 0.2346608191728592, "learning_rate": 5.155125465993229e-05, "loss": 0.7468510150909424, "memory(GiB)": 91.26, "step": 3145, "token_acc": 0.7745515242672145, "train_speed(iter/s)": 0.090157 }, { "epoch": 0.6790256520801897, "grad_norm": 0.2164594829082489, "learning_rate": 5.123975690033973e-05, "loss": 0.7778005599975586, "memory(GiB)": 91.26, "step": 3150, "token_acc": 0.7709694015469266, "train_speed(iter/s)": 0.09016 }, { "epoch": 0.6801034705755551, "grad_norm": 0.22819051146507263, "learning_rate": 5.092887860900195e-05, "loss": 0.7454059600830079, "memory(GiB)": 91.26, "step": 3155, "token_acc": 0.7850124665250716, "train_speed(iter/s)": 0.090161 }, { "epoch": 0.6811812890709205, "grad_norm": 0.22146260738372803, "learning_rate": 5.061862373543228e-05, "loss": 0.7504120349884034, "memory(GiB)": 91.26, "step": 3160, "token_acc": 0.7779123951537744, "train_speed(iter/s)": 0.090161 }, { "epoch": 0.6822591075662858, "grad_norm": 0.21454276144504547, "learning_rate": 5.030899622122392e-05, "loss": 0.7563515663146972, "memory(GiB)": 91.26, "step": 3165, "token_acc": 0.7774136232657909, "train_speed(iter/s)": 0.090156 }, { "epoch": 0.6833369260616512, "grad_norm": 0.2144879549741745, "learning_rate": 5.000000000000002e-05, "loss": 0.7628767967224122, "memory(GiB)": 91.26, "step": 3170, "token_acc": 0.7688384351752737, "train_speed(iter/s)": 0.090151 }, { "epoch": 0.6844147445570166, "grad_norm": 0.21993757784366608, "learning_rate": 4.969163899736339e-05, "loss": 0.7447363853454589, "memory(GiB)": 91.26, "step": 3175, "token_acc": 0.7740219550827206, "train_speed(iter/s)": 0.090148 }, { "epoch": 0.6854925630523819, "grad_norm": 0.2607014775276184, "learning_rate": 4.938391713084679e-05, "loss": 0.7386722564697266, "memory(GiB)": 91.26, "step": 3180, "token_acc": 0.7841583614245933, "train_speed(iter/s)": 0.09015 }, { "epoch": 0.6865703815477474, "grad_norm": 0.21617192029953003, "learning_rate": 4.9076838309863295e-05, "loss": 0.7312001705169677, "memory(GiB)": 91.26, "step": 3185, "token_acc": 0.7820340064892466, "train_speed(iter/s)": 0.090151 }, { "epoch": 0.6876482000431128, "grad_norm": 0.22409582138061523, "learning_rate": 4.877040643565631e-05, "loss": 0.7611231803894043, "memory(GiB)": 91.26, "step": 3190, "token_acc": 0.7722326524131102, "train_speed(iter/s)": 0.09015 }, { "epoch": 0.6887260185384781, "grad_norm": 0.22063037753105164, "learning_rate": 4.846462540125039e-05, "loss": 0.7668973922729492, "memory(GiB)": 91.26, "step": 3195, "token_acc": 0.7743311923031626, "train_speed(iter/s)": 0.090152 }, { "epoch": 0.6898038370338435, "grad_norm": 0.20992635190486908, "learning_rate": 4.81594990914014e-05, "loss": 0.7423861026763916, "memory(GiB)": 91.26, "step": 3200, "token_acc": 0.7781442989813417, "train_speed(iter/s)": 0.090155 }, { "epoch": 0.6908816555292089, "grad_norm": 0.2253515124320984, "learning_rate": 4.785503138254737e-05, "loss": 0.7453560829162598, "memory(GiB)": 91.26, "step": 3205, "token_acc": 0.7736721841386613, "train_speed(iter/s)": 0.090151 }, { "epoch": 0.6919594740245743, "grad_norm": 0.20901232957839966, "learning_rate": 4.755122614275935e-05, "loss": 0.7460142612457276, "memory(GiB)": 91.26, "step": 3210, "token_acc": 0.7859684010800555, "train_speed(iter/s)": 0.090152 }, { "epoch": 0.6930372925199396, "grad_norm": 0.22076484560966492, "learning_rate": 4.724808723169201e-05, "loss": 0.7663829803466797, "memory(GiB)": 91.26, "step": 3215, "token_acc": 0.7756962402098487, "train_speed(iter/s)": 0.090153 }, { "epoch": 0.694115111015305, "grad_norm": 0.2104821652173996, "learning_rate": 4.69456185005347e-05, "loss": 0.746420431137085, "memory(GiB)": 91.26, "step": 3220, "token_acc": 0.7723843883240407, "train_speed(iter/s)": 0.090153 }, { "epoch": 0.6951929295106704, "grad_norm": 0.23057498037815094, "learning_rate": 4.664382379196274e-05, "loss": 0.771373987197876, "memory(GiB)": 91.26, "step": 3225, "token_acc": 0.7804209360288009, "train_speed(iter/s)": 0.090154 }, { "epoch": 0.6962707480060358, "grad_norm": 0.21673372387886047, "learning_rate": 4.634270694008823e-05, "loss": 0.7372934818267822, "memory(GiB)": 91.26, "step": 3230, "token_acc": 0.7794896311597406, "train_speed(iter/s)": 0.090155 }, { "epoch": 0.6973485665014012, "grad_norm": 0.22335149347782135, "learning_rate": 4.604227177041156e-05, "loss": 0.7637501716613769, "memory(GiB)": 91.26, "step": 3235, "token_acc": 0.7808158625915635, "train_speed(iter/s)": 0.09016 }, { "epoch": 0.6984263849967666, "grad_norm": 0.23247286677360535, "learning_rate": 4.574252209977289e-05, "loss": 0.7529802322387695, "memory(GiB)": 91.26, "step": 3240, "token_acc": 0.7755937604209122, "train_speed(iter/s)": 0.090163 }, { "epoch": 0.6995042034921319, "grad_norm": 0.21034903824329376, "learning_rate": 4.5443461736303406e-05, "loss": 0.7426942825317383, "memory(GiB)": 91.26, "step": 3245, "token_acc": 0.7806444017821567, "train_speed(iter/s)": 0.090162 }, { "epoch": 0.7005820219874973, "grad_norm": 0.2284092903137207, "learning_rate": 4.514509447937719e-05, "loss": 0.7492968082427979, "memory(GiB)": 91.26, "step": 3250, "token_acc": 0.7798717272887015, "train_speed(iter/s)": 0.09016 }, { "epoch": 0.7016598404828627, "grad_norm": 0.22834619879722595, "learning_rate": 4.4847424119562765e-05, "loss": 0.7654409885406495, "memory(GiB)": 91.26, "step": 3255, "token_acc": 0.7742565225323846, "train_speed(iter/s)": 0.090163 }, { "epoch": 0.702737658978228, "grad_norm": 0.2358095645904541, "learning_rate": 4.4550454438575006e-05, "loss": 0.7570336818695068, "memory(GiB)": 91.26, "step": 3260, "token_acc": 0.7754214490674318, "train_speed(iter/s)": 0.090159 }, { "epoch": 0.7038154774735934, "grad_norm": 0.22980476915836334, "learning_rate": 4.425418920922717e-05, "loss": 0.720023250579834, "memory(GiB)": 91.26, "step": 3265, "token_acc": 0.7856520555404302, "train_speed(iter/s)": 0.090162 }, { "epoch": 0.7048932959689588, "grad_norm": 0.22382865846157074, "learning_rate": 4.39586321953829e-05, "loss": 0.7472883224487304, "memory(GiB)": 91.26, "step": 3270, "token_acc": 0.775493376830558, "train_speed(iter/s)": 0.090159 }, { "epoch": 0.7059711144643243, "grad_norm": 0.24370037019252777, "learning_rate": 4.3663787151908296e-05, "loss": 0.7584941387176514, "memory(GiB)": 91.26, "step": 3275, "token_acc": 0.7727525351458221, "train_speed(iter/s)": 0.090158 }, { "epoch": 0.7070489329596896, "grad_norm": 0.23202946782112122, "learning_rate": 4.336965782462449e-05, "loss": 0.7554913520812988, "memory(GiB)": 91.26, "step": 3280, "token_acc": 0.7682838388158506, "train_speed(iter/s)": 0.090158 }, { "epoch": 0.708126751455055, "grad_norm": 0.2162495255470276, "learning_rate": 4.307624795025974e-05, "loss": 0.7500974655151367, "memory(GiB)": 91.26, "step": 3285, "token_acc": 0.7727522210335365, "train_speed(iter/s)": 0.09016 }, { "epoch": 0.7092045699504204, "grad_norm": 0.21806137263774872, "learning_rate": 4.278356125640214e-05, "loss": 0.7314608573913575, "memory(GiB)": 91.26, "step": 3290, "token_acc": 0.7806178226251173, "train_speed(iter/s)": 0.09016 }, { "epoch": 0.7102823884457857, "grad_norm": 0.22850319743156433, "learning_rate": 4.2491601461452316e-05, "loss": 0.753504467010498, "memory(GiB)": 91.26, "step": 3295, "token_acc": 0.7871383219954649, "train_speed(iter/s)": 0.090161 }, { "epoch": 0.7113602069411511, "grad_norm": 0.22878149151802063, "learning_rate": 4.2200372274576e-05, "loss": 0.7398993015289307, "memory(GiB)": 91.26, "step": 3300, "token_acc": 0.7761068716532642, "train_speed(iter/s)": 0.090166 }, { "epoch": 0.7124380254365165, "grad_norm": 0.214716374874115, "learning_rate": 4.190987739565702e-05, "loss": 0.7725460052490234, "memory(GiB)": 91.26, "step": 3305, "token_acc": 0.7562892037312937, "train_speed(iter/s)": 0.090167 }, { "epoch": 0.7135158439318818, "grad_norm": 0.22094842791557312, "learning_rate": 4.162012051525037e-05, "loss": 0.7432554244995118, "memory(GiB)": 91.26, "step": 3310, "token_acc": 0.7831101288359162, "train_speed(iter/s)": 0.09017 }, { "epoch": 0.7145936624272472, "grad_norm": 0.21602699160575867, "learning_rate": 4.1331105314535156e-05, "loss": 0.7369527339935302, "memory(GiB)": 91.26, "step": 3315, "token_acc": 0.7781488592222172, "train_speed(iter/s)": 0.090169 }, { "epoch": 0.7156714809226127, "grad_norm": 0.22370749711990356, "learning_rate": 4.104283546526787e-05, "loss": 0.7471486568450928, "memory(GiB)": 91.26, "step": 3320, "token_acc": 0.7659143982744175, "train_speed(iter/s)": 0.090169 }, { "epoch": 0.716749299417978, "grad_norm": 0.22144624590873718, "learning_rate": 4.075531462973593e-05, "loss": 0.7566558837890625, "memory(GiB)": 91.26, "step": 3325, "token_acc": 0.7666900060682444, "train_speed(iter/s)": 0.090171 }, { "epoch": 0.7178271179133434, "grad_norm": 0.230716273188591, "learning_rate": 4.0468546460710835e-05, "loss": 0.7519946575164795, "memory(GiB)": 91.26, "step": 3330, "token_acc": 0.7685696324381667, "train_speed(iter/s)": 0.090176 }, { "epoch": 0.7189049364087088, "grad_norm": 0.22184519469738007, "learning_rate": 4.018253460140207e-05, "loss": 0.7530190467834472, "memory(GiB)": 91.26, "step": 3335, "token_acc": 0.7732952778235176, "train_speed(iter/s)": 0.090181 }, { "epoch": 0.7199827549040742, "grad_norm": 0.2071908861398697, "learning_rate": 3.9897282685410596e-05, "loss": 0.7389517784118652, "memory(GiB)": 91.26, "step": 3340, "token_acc": 0.7844724364193687, "train_speed(iter/s)": 0.090184 }, { "epoch": 0.7210605733994395, "grad_norm": 0.2232424020767212, "learning_rate": 3.9612794336682754e-05, "loss": 0.7318838119506836, "memory(GiB)": 91.26, "step": 3345, "token_acc": 0.7723964450527601, "train_speed(iter/s)": 0.09019 }, { "epoch": 0.7221383918948049, "grad_norm": 0.2113451063632965, "learning_rate": 3.932907316946437e-05, "loss": 0.7293927669525146, "memory(GiB)": 91.26, "step": 3350, "token_acc": 0.7894883100522435, "train_speed(iter/s)": 0.090189 }, { "epoch": 0.7232162103901703, "grad_norm": 0.21689848601818085, "learning_rate": 3.904612278825457e-05, "loss": 0.736536693572998, "memory(GiB)": 91.26, "step": 3355, "token_acc": 0.7852294636018698, "train_speed(iter/s)": 0.090184 }, { "epoch": 0.7242940288855356, "grad_norm": 0.21303310990333557, "learning_rate": 3.876394678776015e-05, "loss": 0.7246241092681884, "memory(GiB)": 91.26, "step": 3360, "token_acc": 0.78660529082488, "train_speed(iter/s)": 0.090188 }, { "epoch": 0.7253718473809011, "grad_norm": 0.20606185495853424, "learning_rate": 3.848254875285e-05, "loss": 0.7368947505950928, "memory(GiB)": 91.26, "step": 3365, "token_acc": 0.777546721642092, "train_speed(iter/s)": 0.09019 }, { "epoch": 0.7264496658762665, "grad_norm": 0.22658182680606842, "learning_rate": 3.820193225850932e-05, "loss": 0.7449198722839355, "memory(GiB)": 91.26, "step": 3370, "token_acc": 0.7846545959476001, "train_speed(iter/s)": 0.090195 }, { "epoch": 0.7275274843716318, "grad_norm": 0.2219938039779663, "learning_rate": 3.792210086979433e-05, "loss": 0.7521435737609863, "memory(GiB)": 91.26, "step": 3375, "token_acc": 0.7787693622079418, "train_speed(iter/s)": 0.090195 }, { "epoch": 0.7286053028669972, "grad_norm": 0.23862245678901672, "learning_rate": 3.76430581417871e-05, "loss": 0.7577279090881348, "memory(GiB)": 91.26, "step": 3380, "token_acc": 0.7782073878579272, "train_speed(iter/s)": 0.090198 }, { "epoch": 0.7296831213623626, "grad_norm": 0.21582721173763275, "learning_rate": 3.736480761955011e-05, "loss": 0.7534392356872559, "memory(GiB)": 91.26, "step": 3385, "token_acc": 0.7800474971241975, "train_speed(iter/s)": 0.090198 }, { "epoch": 0.7307609398577279, "grad_norm": 0.2180568277835846, "learning_rate": 3.708735283808149e-05, "loss": 0.7454691410064698, "memory(GiB)": 91.26, "step": 3390, "token_acc": 0.7852503052503053, "train_speed(iter/s)": 0.090204 }, { "epoch": 0.7318387583530933, "grad_norm": 0.22647275030612946, "learning_rate": 3.68106973222699e-05, "loss": 0.7586719989776611, "memory(GiB)": 91.26, "step": 3395, "token_acc": 0.7686355147613939, "train_speed(iter/s)": 0.090205 }, { "epoch": 0.7329165768484587, "grad_norm": 0.2219868153333664, "learning_rate": 3.653484458684984e-05, "loss": 0.7421576499938964, "memory(GiB)": 91.26, "step": 3400, "token_acc": 0.7807745379280293, "train_speed(iter/s)": 0.090204 }, { "epoch": 0.733994395343824, "grad_norm": 0.21715261042118073, "learning_rate": 3.625979813635705e-05, "loss": 0.7303240776062012, "memory(GiB)": 91.26, "step": 3405, "token_acc": 0.7778514890175301, "train_speed(iter/s)": 0.090203 }, { "epoch": 0.7350722138391895, "grad_norm": 0.22335954010486603, "learning_rate": 3.598556146508385e-05, "loss": 0.7431679725646972, "memory(GiB)": 91.26, "step": 3410, "token_acc": 0.7747177899499881, "train_speed(iter/s)": 0.090206 }, { "epoch": 0.7361500323345549, "grad_norm": 0.2138655185699463, "learning_rate": 3.571213805703485e-05, "loss": 0.7320580005645752, "memory(GiB)": 91.26, "step": 3415, "token_acc": 0.7836731003932559, "train_speed(iter/s)": 0.090209 }, { "epoch": 0.7372278508299203, "grad_norm": 0.21447625756263733, "learning_rate": 3.543953138588271e-05, "loss": 0.7324469566345215, "memory(GiB)": 91.26, "step": 3420, "token_acc": 0.7862471281212476, "train_speed(iter/s)": 0.09021 }, { "epoch": 0.7383056693252856, "grad_norm": 0.22362476587295532, "learning_rate": 3.5167744914923904e-05, "loss": 0.746140432357788, "memory(GiB)": 91.26, "step": 3425, "token_acc": 0.7883556116899976, "train_speed(iter/s)": 0.090212 }, { "epoch": 0.739383487820651, "grad_norm": 0.21837963163852692, "learning_rate": 3.489678209703475e-05, "loss": 0.7347356796264648, "memory(GiB)": 91.26, "step": 3430, "token_acc": 0.7798784057644674, "train_speed(iter/s)": 0.090213 }, { "epoch": 0.7404613063160164, "grad_norm": 0.22204995155334473, "learning_rate": 3.4626646374627694e-05, "loss": 0.7275646209716797, "memory(GiB)": 91.26, "step": 3435, "token_acc": 0.7782892519468539, "train_speed(iter/s)": 0.090212 }, { "epoch": 0.7415391248113817, "grad_norm": 0.21104831993579865, "learning_rate": 3.4357341179607295e-05, "loss": 0.7460927963256836, "memory(GiB)": 91.26, "step": 3440, "token_acc": 0.7821705791033542, "train_speed(iter/s)": 0.090216 }, { "epoch": 0.7426169433067471, "grad_norm": 0.2254546731710434, "learning_rate": 3.4088869933326936e-05, "loss": 0.7526806831359864, "memory(GiB)": 91.26, "step": 3445, "token_acc": 0.7835085501660183, "train_speed(iter/s)": 0.090218 }, { "epoch": 0.7436947618021125, "grad_norm": 0.20820407569408417, "learning_rate": 3.382123604654511e-05, "loss": 0.7293185710906982, "memory(GiB)": 91.26, "step": 3450, "token_acc": 0.772092980639723, "train_speed(iter/s)": 0.090221 }, { "epoch": 0.744772580297478, "grad_norm": 0.2137994021177292, "learning_rate": 3.3554442919382165e-05, "loss": 0.7403603553771972, "memory(GiB)": 91.26, "step": 3455, "token_acc": 0.769674310084339, "train_speed(iter/s)": 0.090223 }, { "epoch": 0.7458503987928433, "grad_norm": 0.2104395031929016, "learning_rate": 3.3288493941277234e-05, "loss": 0.7423724174499512, "memory(GiB)": 91.26, "step": 3460, "token_acc": 0.7768750757865478, "train_speed(iter/s)": 0.090225 }, { "epoch": 0.7469282172882087, "grad_norm": 0.2210676521062851, "learning_rate": 3.302339249094495e-05, "loss": 0.7549271583557129, "memory(GiB)": 91.26, "step": 3465, "token_acc": 0.7730852695775807, "train_speed(iter/s)": 0.090223 }, { "epoch": 0.748006035783574, "grad_norm": 0.22691284120082855, "learning_rate": 3.2759141936332685e-05, "loss": 0.745268440246582, "memory(GiB)": 91.26, "step": 3470, "token_acc": 0.7730993719301815, "train_speed(iter/s)": 0.090223 }, { "epoch": 0.7490838542789394, "grad_norm": 0.21874092519283295, "learning_rate": 3.249574563457777e-05, "loss": 0.7463291645050049, "memory(GiB)": 91.26, "step": 3475, "token_acc": 0.7731472916524669, "train_speed(iter/s)": 0.090228 }, { "epoch": 0.7501616727743048, "grad_norm": 0.21490636467933655, "learning_rate": 3.22332069319647e-05, "loss": 0.7352697849273682, "memory(GiB)": 91.26, "step": 3480, "token_acc": 0.7692009991907962, "train_speed(iter/s)": 0.09023 }, { "epoch": 0.7512394912696702, "grad_norm": 0.2209213823080063, "learning_rate": 3.197152916388273e-05, "loss": 0.736881160736084, "memory(GiB)": 91.26, "step": 3485, "token_acc": 0.7790096151176672, "train_speed(iter/s)": 0.09023 }, { "epoch": 0.7523173097650355, "grad_norm": 0.21888989210128784, "learning_rate": 3.171071565478356e-05, "loss": 0.7501363277435302, "memory(GiB)": 91.26, "step": 3490, "token_acc": 0.7830340641266934, "train_speed(iter/s)": 0.090225 }, { "epoch": 0.753395128260401, "grad_norm": 0.2083372324705124, "learning_rate": 3.145076971813891e-05, "loss": 0.7555474281311035, "memory(GiB)": 91.26, "step": 3495, "token_acc": 0.7804491843127856, "train_speed(iter/s)": 0.09023 }, { "epoch": 0.7544729467557664, "grad_norm": 0.22007182240486145, "learning_rate": 3.119169465639864e-05, "loss": 0.7215785980224609, "memory(GiB)": 91.26, "step": 3500, "token_acc": 0.7855005343783399, "train_speed(iter/s)": 0.090227 }, { "epoch": 0.7555507652511317, "grad_norm": 0.2094966322183609, "learning_rate": 3.093349376094863e-05, "loss": 0.7351235389709473, "memory(GiB)": 91.26, "step": 3505, "token_acc": 0.7794548771522187, "train_speed(iter/s)": 0.090226 }, { "epoch": 0.7566285837464971, "grad_norm": 0.2145504355430603, "learning_rate": 3.0676170312068995e-05, "loss": 0.7237605094909668, "memory(GiB)": 91.26, "step": 3510, "token_acc": 0.7761054193089448, "train_speed(iter/s)": 0.090222 }, { "epoch": 0.7577064022418625, "grad_norm": 0.21505559980869293, "learning_rate": 3.0419727578892576e-05, "loss": 0.7318454265594483, "memory(GiB)": 91.26, "step": 3515, "token_acc": 0.7890855192238592, "train_speed(iter/s)": 0.09022 }, { "epoch": 0.7587842207372278, "grad_norm": 0.22119516134262085, "learning_rate": 3.0164168819363126e-05, "loss": 0.7459331512451172, "memory(GiB)": 91.26, "step": 3520, "token_acc": 0.7886430407638634, "train_speed(iter/s)": 0.090223 }, { "epoch": 0.7598620392325932, "grad_norm": 0.21449056267738342, "learning_rate": 2.990949728019411e-05, "loss": 0.7196578979492188, "memory(GiB)": 91.26, "step": 3525, "token_acc": 0.7897769499674279, "train_speed(iter/s)": 0.090222 }, { "epoch": 0.7609398577279586, "grad_norm": 0.22222140431404114, "learning_rate": 2.9655716196827497e-05, "loss": 0.7405521869659424, "memory(GiB)": 91.26, "step": 3530, "token_acc": 0.7791397603672591, "train_speed(iter/s)": 0.090223 }, { "epoch": 0.762017676223324, "grad_norm": 0.24021229147911072, "learning_rate": 2.9402828793392477e-05, "loss": 0.7592340469360351, "memory(GiB)": 91.26, "step": 3535, "token_acc": 0.7743402117176448, "train_speed(iter/s)": 0.090223 }, { "epoch": 0.7630954947186894, "grad_norm": 0.22116945683956146, "learning_rate": 2.9150838282664618e-05, "loss": 0.7599722385406494, "memory(GiB)": 91.26, "step": 3540, "token_acc": 0.7705139145035647, "train_speed(iter/s)": 0.090226 }, { "epoch": 0.7641733132140548, "grad_norm": 0.21651774644851685, "learning_rate": 2.8899747866025085e-05, "loss": 0.7462778568267823, "memory(GiB)": 91.26, "step": 3545, "token_acc": 0.775051163858233, "train_speed(iter/s)": 0.090229 }, { "epoch": 0.7652511317094202, "grad_norm": 0.2135532945394516, "learning_rate": 2.8649560733419823e-05, "loss": 0.7431081771850586, "memory(GiB)": 91.26, "step": 3550, "token_acc": 0.7737618820651636, "train_speed(iter/s)": 0.090233 }, { "epoch": 0.7663289502047855, "grad_norm": 0.21978868544101715, "learning_rate": 2.8400280063319228e-05, "loss": 0.7337148666381836, "memory(GiB)": 91.26, "step": 3555, "token_acc": 0.7757618312034826, "train_speed(iter/s)": 0.090236 }, { "epoch": 0.7674067687001509, "grad_norm": 0.21880528330802917, "learning_rate": 2.815190902267757e-05, "loss": 0.7292414665222168, "memory(GiB)": 91.26, "step": 3560, "token_acc": 0.7730221098117539, "train_speed(iter/s)": 0.090238 }, { "epoch": 0.7684845871955163, "grad_norm": 0.21093672513961792, "learning_rate": 2.7904450766892853e-05, "loss": 0.7311521053314209, "memory(GiB)": 91.26, "step": 3565, "token_acc": 0.7777675183054634, "train_speed(iter/s)": 0.090239 }, { "epoch": 0.7695624056908816, "grad_norm": 0.21627680957317352, "learning_rate": 2.765790843976681e-05, "loss": 0.7288570404052734, "memory(GiB)": 91.26, "step": 3570, "token_acc": 0.7887634655053862, "train_speed(iter/s)": 0.090243 }, { "epoch": 0.770640224186247, "grad_norm": 0.22756144404411316, "learning_rate": 2.741228517346478e-05, "loss": 0.7312281608581543, "memory(GiB)": 91.26, "step": 3575, "token_acc": 0.7820076765220035, "train_speed(iter/s)": 0.090244 }, { "epoch": 0.7717180426816124, "grad_norm": 0.21160663664340973, "learning_rate": 2.7167584088476027e-05, "loss": 0.7360856056213378, "memory(GiB)": 91.26, "step": 3580, "token_acc": 0.7783525324731101, "train_speed(iter/s)": 0.090248 }, { "epoch": 0.7727958611769778, "grad_norm": 0.21797838807106018, "learning_rate": 2.692380829357416e-05, "loss": 0.7523116588592529, "memory(GiB)": 91.26, "step": 3585, "token_acc": 0.776992390513862, "train_speed(iter/s)": 0.090248 }, { "epoch": 0.7738736796723432, "grad_norm": 0.20867909491062164, "learning_rate": 2.668096088577744e-05, "loss": 0.7388627529144287, "memory(GiB)": 91.26, "step": 3590, "token_acc": 0.7780267437658114, "train_speed(iter/s)": 0.090249 }, { "epoch": 0.7749514981677086, "grad_norm": 0.22310753166675568, "learning_rate": 2.6439044950309654e-05, "loss": 0.7644324779510498, "memory(GiB)": 91.26, "step": 3595, "token_acc": 0.7635491034558702, "train_speed(iter/s)": 0.090248 }, { "epoch": 0.776029316663074, "grad_norm": 0.2116236835718155, "learning_rate": 2.619806356056078e-05, "loss": 0.711247730255127, "memory(GiB)": 91.26, "step": 3600, "token_acc": 0.7842002791859526, "train_speed(iter/s)": 0.090243 }, { "epoch": 0.7771071351584393, "grad_norm": 0.21723444759845734, "learning_rate": 2.5958019778047928e-05, "loss": 0.7216385841369629, "memory(GiB)": 91.26, "step": 3605, "token_acc": 0.770698463638208, "train_speed(iter/s)": 0.090244 }, { "epoch": 0.7781849536538047, "grad_norm": 0.21666820347309113, "learning_rate": 2.571891665237659e-05, "loss": 0.7315185546875, "memory(GiB)": 91.26, "step": 3610, "token_acc": 0.7842318767091038, "train_speed(iter/s)": 0.090244 }, { "epoch": 0.7792627721491701, "grad_norm": 0.20962965488433838, "learning_rate": 2.5480757221201767e-05, "loss": 0.7355643272399902, "memory(GiB)": 91.26, "step": 3615, "token_acc": 0.7759540979127431, "train_speed(iter/s)": 0.090245 }, { "epoch": 0.7803405906445354, "grad_norm": 0.21728304028511047, "learning_rate": 2.5243544510189354e-05, "loss": 0.7153668880462647, "memory(GiB)": 91.26, "step": 3620, "token_acc": 0.780889462431865, "train_speed(iter/s)": 0.090247 }, { "epoch": 0.7814184091399008, "grad_norm": 0.21996697783470154, "learning_rate": 2.500728153297788e-05, "loss": 0.7327059745788574, "memory(GiB)": 91.26, "step": 3625, "token_acc": 0.7826423541482068, "train_speed(iter/s)": 0.090245 }, { "epoch": 0.7824962276352663, "grad_norm": 0.21488246321678162, "learning_rate": 2.4771971291139996e-05, "loss": 0.7275919437408447, "memory(GiB)": 91.26, "step": 3630, "token_acc": 0.7865772907860603, "train_speed(iter/s)": 0.090245 }, { "epoch": 0.7835740461306316, "grad_norm": 0.2308480143547058, "learning_rate": 2.453761677414448e-05, "loss": 0.7229132175445556, "memory(GiB)": 91.26, "step": 3635, "token_acc": 0.7775997135055285, "train_speed(iter/s)": 0.090249 }, { "epoch": 0.784651864625997, "grad_norm": 0.21999061107635498, "learning_rate": 2.430422095931828e-05, "loss": 0.7217930793762207, "memory(GiB)": 91.26, "step": 3640, "token_acc": 0.7824345520409306, "train_speed(iter/s)": 0.090255 }, { "epoch": 0.7857296831213624, "grad_norm": 0.21987846493721008, "learning_rate": 2.407178681180855e-05, "loss": 0.7320925235748291, "memory(GiB)": 91.26, "step": 3645, "token_acc": 0.7750923659646544, "train_speed(iter/s)": 0.090256 }, { "epoch": 0.7868075016167277, "grad_norm": 0.21633771061897278, "learning_rate": 2.3840317284545143e-05, "loss": 0.7295530319213868, "memory(GiB)": 91.26, "step": 3650, "token_acc": 0.7935295102627149, "train_speed(iter/s)": 0.090259 }, { "epoch": 0.7878853201120931, "grad_norm": 0.2289315015077591, "learning_rate": 2.3609815318202966e-05, "loss": 0.7523348808288575, "memory(GiB)": 91.26, "step": 3655, "token_acc": 0.7728126580449389, "train_speed(iter/s)": 0.090259 }, { "epoch": 0.7889631386074585, "grad_norm": 0.21634483337402344, "learning_rate": 2.3380283841164653e-05, "loss": 0.751465129852295, "memory(GiB)": 91.26, "step": 3660, "token_acc": 0.7813996101714706, "train_speed(iter/s)": 0.090259 }, { "epoch": 0.7900409571028238, "grad_norm": 0.20615601539611816, "learning_rate": 2.3151725769483468e-05, "loss": 0.7358405113220214, "memory(GiB)": 91.26, "step": 3665, "token_acc": 0.7708992525335902, "train_speed(iter/s)": 0.090262 }, { "epoch": 0.7911187755981892, "grad_norm": 0.21192476153373718, "learning_rate": 2.2924144006846103e-05, "loss": 0.7246754169464111, "memory(GiB)": 91.26, "step": 3670, "token_acc": 0.7757742663656885, "train_speed(iter/s)": 0.090263 }, { "epoch": 0.7921965940935547, "grad_norm": 0.2214789241552353, "learning_rate": 2.269754144453584e-05, "loss": 0.7348628044128418, "memory(GiB)": 91.26, "step": 3675, "token_acc": 0.7870527956430606, "train_speed(iter/s)": 0.090266 }, { "epoch": 0.7932744125889201, "grad_norm": 0.21100376546382904, "learning_rate": 2.247192096139592e-05, "loss": 0.7363664150238037, "memory(GiB)": 91.26, "step": 3680, "token_acc": 0.7728937061663175, "train_speed(iter/s)": 0.090267 }, { "epoch": 0.7943522310842854, "grad_norm": 0.22565259039402008, "learning_rate": 2.2247285423792808e-05, "loss": 0.7249825477600098, "memory(GiB)": 91.26, "step": 3685, "token_acc": 0.7784305226365154, "train_speed(iter/s)": 0.090267 }, { "epoch": 0.7954300495796508, "grad_norm": 0.20841084420681, "learning_rate": 2.2023637685579856e-05, "loss": 0.7256580352783203, "memory(GiB)": 91.26, "step": 3690, "token_acc": 0.7666788538880444, "train_speed(iter/s)": 0.090265 }, { "epoch": 0.7965078680750162, "grad_norm": 0.22089894115924835, "learning_rate": 2.1800980588061137e-05, "loss": 0.7265109062194824, "memory(GiB)": 91.26, "step": 3695, "token_acc": 0.7798090851019802, "train_speed(iter/s)": 0.090268 }, { "epoch": 0.7975856865703815, "grad_norm": 0.23983483016490936, "learning_rate": 2.157931695995511e-05, "loss": 0.741006326675415, "memory(GiB)": 91.26, "step": 3700, "token_acc": 0.7746998563018588, "train_speed(iter/s)": 0.090266 }, { "epoch": 0.7986635050657469, "grad_norm": 0.23100592195987701, "learning_rate": 2.1358649617358984e-05, "loss": 0.7226755142211914, "memory(GiB)": 91.26, "step": 3705, "token_acc": 0.7789008093355919, "train_speed(iter/s)": 0.090269 }, { "epoch": 0.7997413235611123, "grad_norm": 0.2205343395471573, "learning_rate": 2.113898136371266e-05, "loss": 0.7344723701477051, "memory(GiB)": 91.26, "step": 3710, "token_acc": 0.7833235784070955, "train_speed(iter/s)": 0.090272 }, { "epoch": 0.8008191420564776, "grad_norm": 0.22770698368549347, "learning_rate": 2.09203149897633e-05, "loss": 0.7412842750549317, "memory(GiB)": 91.26, "step": 3715, "token_acc": 0.7689801578632421, "train_speed(iter/s)": 0.090277 }, { "epoch": 0.8018969605518431, "grad_norm": 0.2095748782157898, "learning_rate": 2.0702653273529838e-05, "loss": 0.7262363910675049, "memory(GiB)": 91.26, "step": 3720, "token_acc": 0.7773935634959841, "train_speed(iter/s)": 0.09028 }, { "epoch": 0.8029747790472085, "grad_norm": 0.22464071214199066, "learning_rate": 2.0485998980267606e-05, "loss": 0.7542255878448486, "memory(GiB)": 91.26, "step": 3725, "token_acc": 0.7728358043200622, "train_speed(iter/s)": 0.090281 }, { "epoch": 0.8040525975425739, "grad_norm": 0.21433407068252563, "learning_rate": 2.0270354862433262e-05, "loss": 0.7307805061340332, "memory(GiB)": 91.26, "step": 3730, "token_acc": 0.7782866068496677, "train_speed(iter/s)": 0.090282 }, { "epoch": 0.8051304160379392, "grad_norm": 0.21284368634223938, "learning_rate": 2.0055723659649904e-05, "loss": 0.7180994033813477, "memory(GiB)": 91.26, "step": 3735, "token_acc": 0.790243001016354, "train_speed(iter/s)": 0.090282 }, { "epoch": 0.8062082345333046, "grad_norm": 0.21861092746257782, "learning_rate": 1.9842108098672084e-05, "loss": 0.7462057113647461, "memory(GiB)": 91.26, "step": 3740, "token_acc": 0.7765615173582945, "train_speed(iter/s)": 0.090285 }, { "epoch": 0.80728605302867, "grad_norm": 0.21885690093040466, "learning_rate": 1.9629510893351277e-05, "loss": 0.7438394546508789, "memory(GiB)": 91.26, "step": 3745, "token_acc": 0.7701719121439914, "train_speed(iter/s)": 0.090286 }, { "epoch": 0.8083638715240353, "grad_norm": 0.21034783124923706, "learning_rate": 1.9417934744601462e-05, "loss": 0.7343828201293945, "memory(GiB)": 91.26, "step": 3750, "token_acc": 0.7818049584342174, "train_speed(iter/s)": 0.090289 }, { "epoch": 0.8094416900194007, "grad_norm": 0.2085954248905182, "learning_rate": 1.9207382340364634e-05, "loss": 0.7291169166564941, "memory(GiB)": 91.26, "step": 3755, "token_acc": 0.7821942697414396, "train_speed(iter/s)": 0.090291 }, { "epoch": 0.8105195085147661, "grad_norm": 0.21062400937080383, "learning_rate": 1.8997856355576872e-05, "loss": 0.7378106117248535, "memory(GiB)": 91.26, "step": 3760, "token_acc": 0.7687735165786359, "train_speed(iter/s)": 0.090289 }, { "epoch": 0.8115973270101315, "grad_norm": 0.22174476087093353, "learning_rate": 1.8789359452134127e-05, "loss": 0.7401809692382812, "memory(GiB)": 91.26, "step": 3765, "token_acc": 0.7844627156842658, "train_speed(iter/s)": 0.090288 }, { "epoch": 0.8126751455054969, "grad_norm": 0.23539355397224426, "learning_rate": 1.8581894278858557e-05, "loss": 0.743313217163086, "memory(GiB)": 91.26, "step": 3770, "token_acc": 0.7794260493978517, "train_speed(iter/s)": 0.090292 }, { "epoch": 0.8137529640008623, "grad_norm": 0.21716360747814178, "learning_rate": 1.837546347146487e-05, "loss": 0.7407948017120362, "memory(GiB)": 91.26, "step": 3775, "token_acc": 0.7697542754891512, "train_speed(iter/s)": 0.090293 }, { "epoch": 0.8148307824962276, "grad_norm": 0.20780131220817566, "learning_rate": 1.8170069652526812e-05, "loss": 0.710191297531128, "memory(GiB)": 91.26, "step": 3780, "token_acc": 0.7904268370841039, "train_speed(iter/s)": 0.090296 }, { "epoch": 0.815908600991593, "grad_norm": 0.22094883024692535, "learning_rate": 1.7965715431443764e-05, "loss": 0.7360224723815918, "memory(GiB)": 91.26, "step": 3785, "token_acc": 0.7861472547001328, "train_speed(iter/s)": 0.090296 }, { "epoch": 0.8169864194869584, "grad_norm": 0.20635469257831573, "learning_rate": 1.7762403404407755e-05, "loss": 0.7203975200653077, "memory(GiB)": 91.26, "step": 3790, "token_acc": 0.7798997441682637, "train_speed(iter/s)": 0.090293 }, { "epoch": 0.8180642379823237, "grad_norm": 0.2102869153022766, "learning_rate": 1.7560136154370355e-05, "loss": 0.7180530071258545, "memory(GiB)": 91.26, "step": 3795, "token_acc": 0.7874331906294041, "train_speed(iter/s)": 0.090292 }, { "epoch": 0.8191420564776891, "grad_norm": 0.21749019622802734, "learning_rate": 1.735891625100985e-05, "loss": 0.7199493408203125, "memory(GiB)": 91.26, "step": 3800, "token_acc": 0.7769795365523301, "train_speed(iter/s)": 0.090292 }, { "epoch": 0.8202198749730545, "grad_norm": 0.23268502950668335, "learning_rate": 1.7158746250698765e-05, "loss": 0.7174210548400879, "memory(GiB)": 91.26, "step": 3805, "token_acc": 0.785713629473104, "train_speed(iter/s)": 0.090293 }, { "epoch": 0.82129769346842, "grad_norm": 0.22326399385929108, "learning_rate": 1.6959628696471153e-05, "loss": 0.7395180702209473, "memory(GiB)": 91.26, "step": 3810, "token_acc": 0.7797154919934578, "train_speed(iter/s)": 0.090292 }, { "epoch": 0.8223755119637853, "grad_norm": 0.21281059086322784, "learning_rate": 1.6761566117990435e-05, "loss": 0.717953109741211, "memory(GiB)": 91.26, "step": 3815, "token_acc": 0.7848441255736033, "train_speed(iter/s)": 0.090295 }, { "epoch": 0.8234533304591507, "grad_norm": 0.2102707028388977, "learning_rate": 1.656456103151728e-05, "loss": 0.7190811157226562, "memory(GiB)": 91.26, "step": 3820, "token_acc": 0.7855489262389189, "train_speed(iter/s)": 0.090295 }, { "epoch": 0.8245311489545161, "grad_norm": 0.20796917378902435, "learning_rate": 1.6368615939877518e-05, "loss": 0.7178693771362304, "memory(GiB)": 91.26, "step": 3825, "token_acc": 0.7806976765384159, "train_speed(iter/s)": 0.090295 }, { "epoch": 0.8256089674498814, "grad_norm": 0.2144007682800293, "learning_rate": 1.6173733332430407e-05, "loss": 0.7328081607818604, "memory(GiB)": 91.26, "step": 3830, "token_acc": 0.7746453220858895, "train_speed(iter/s)": 0.090297 }, { "epoch": 0.8266867859452468, "grad_norm": 0.21255382895469666, "learning_rate": 1.5979915685037093e-05, "loss": 0.7303975105285645, "memory(GiB)": 91.26, "step": 3835, "token_acc": 0.7875252371848702, "train_speed(iter/s)": 0.09029 }, { "epoch": 0.8277646044406122, "grad_norm": 0.2178426831960678, "learning_rate": 1.5787165460028975e-05, "loss": 0.7141714572906495, "memory(GiB)": 91.26, "step": 3840, "token_acc": 0.7887297125759362, "train_speed(iter/s)": 0.090291 }, { "epoch": 0.8288424229359775, "grad_norm": 0.21086178719997406, "learning_rate": 1.55954851061766e-05, "loss": 0.7301407337188721, "memory(GiB)": 91.26, "step": 3845, "token_acc": 0.7809336580499399, "train_speed(iter/s)": 0.090295 }, { "epoch": 0.8299202414313429, "grad_norm": 0.2197170853614807, "learning_rate": 1.540487705865843e-05, "loss": 0.7205963134765625, "memory(GiB)": 91.26, "step": 3850, "token_acc": 0.7783930333483422, "train_speed(iter/s)": 0.0903 }, { "epoch": 0.8309980599267084, "grad_norm": 0.21476158499717712, "learning_rate": 1.521534373902992e-05, "loss": 0.7239388465881348, "memory(GiB)": 91.26, "step": 3855, "token_acc": 0.7774284902530451, "train_speed(iter/s)": 0.090304 }, { "epoch": 0.8320758784220738, "grad_norm": 0.21131457388401031, "learning_rate": 1.5026887555192881e-05, "loss": 0.708393669128418, "memory(GiB)": 91.26, "step": 3860, "token_acc": 0.7778655911048128, "train_speed(iter/s)": 0.090305 }, { "epoch": 0.8331536969174391, "grad_norm": 0.21772447228431702, "learning_rate": 1.4839510901364695e-05, "loss": 0.7233445167541503, "memory(GiB)": 91.26, "step": 3865, "token_acc": 0.787391829921139, "train_speed(iter/s)": 0.090307 }, { "epoch": 0.8342315154128045, "grad_norm": 0.22316232323646545, "learning_rate": 1.465321615804801e-05, "loss": 0.7450509071350098, "memory(GiB)": 91.26, "step": 3870, "token_acc": 0.7769494384038469, "train_speed(iter/s)": 0.090309 }, { "epoch": 0.8353093339081699, "grad_norm": 0.21373701095581055, "learning_rate": 1.4468005692000552e-05, "loss": 0.7242658615112305, "memory(GiB)": 91.26, "step": 3875, "token_acc": 0.7793184001808556, "train_speed(iter/s)": 0.09031 }, { "epoch": 0.8363871524035352, "grad_norm": 0.2227841168642044, "learning_rate": 1.4283881856204907e-05, "loss": 0.7298080921173096, "memory(GiB)": 91.26, "step": 3880, "token_acc": 0.7764365971107544, "train_speed(iter/s)": 0.09031 }, { "epoch": 0.8374649708989006, "grad_norm": 0.2111506164073944, "learning_rate": 1.41008469898387e-05, "loss": 0.7114417076110839, "memory(GiB)": 91.26, "step": 3885, "token_acc": 0.7916662887411452, "train_speed(iter/s)": 0.090312 }, { "epoch": 0.838542789394266, "grad_norm": 0.21742264926433563, "learning_rate": 1.3918903418244977e-05, "loss": 0.7313467025756836, "memory(GiB)": 91.26, "step": 3890, "token_acc": 0.7761684742959972, "train_speed(iter/s)": 0.090311 }, { "epoch": 0.8396206078896313, "grad_norm": 0.2127697914838791, "learning_rate": 1.3738053452902443e-05, "loss": 0.7329813957214355, "memory(GiB)": 91.26, "step": 3895, "token_acc": 0.7789306760116682, "train_speed(iter/s)": 0.090314 }, { "epoch": 0.8406984263849968, "grad_norm": 0.22200961410999298, "learning_rate": 1.3558299391396357e-05, "loss": 0.7417447090148925, "memory(GiB)": 91.26, "step": 3900, "token_acc": 0.7608699477397899, "train_speed(iter/s)": 0.090315 }, { "epoch": 0.8417762448803622, "grad_norm": 0.20828963816165924, "learning_rate": 1.3379643517389096e-05, "loss": 0.721207857131958, "memory(GiB)": 91.26, "step": 3905, "token_acc": 0.7748451537302606, "train_speed(iter/s)": 0.090313 }, { "epoch": 0.8428540633757275, "grad_norm": 0.20882220566272736, "learning_rate": 1.320208810059128e-05, "loss": 0.7161104202270507, "memory(GiB)": 91.26, "step": 3910, "token_acc": 0.7871740728870119, "train_speed(iter/s)": 0.090317 }, { "epoch": 0.8439318818710929, "grad_norm": 0.21312852203845978, "learning_rate": 1.3025635396732982e-05, "loss": 0.7434062480926513, "memory(GiB)": 91.26, "step": 3915, "token_acc": 0.7749831511157705, "train_speed(iter/s)": 0.090317 }, { "epoch": 0.8450097003664583, "grad_norm": 0.22652384638786316, "learning_rate": 1.285028764753493e-05, "loss": 0.705132007598877, "memory(GiB)": 91.26, "step": 3920, "token_acc": 0.7802787292266986, "train_speed(iter/s)": 0.090316 }, { "epoch": 0.8460875188618237, "grad_norm": 0.21455134451389313, "learning_rate": 1.2676047080680065e-05, "loss": 0.7197843551635742, "memory(GiB)": 91.26, "step": 3925, "token_acc": 0.7899542495671603, "train_speed(iter/s)": 0.090315 }, { "epoch": 0.847165337357189, "grad_norm": 0.20179928839206696, "learning_rate": 1.2502915909785396e-05, "loss": 0.7213821887969971, "memory(GiB)": 91.26, "step": 3930, "token_acc": 0.7870127045689327, "train_speed(iter/s)": 0.090313 }, { "epoch": 0.8482431558525544, "grad_norm": 0.22223185002803802, "learning_rate": 1.2330896334373644e-05, "loss": 0.7294008255004882, "memory(GiB)": 91.26, "step": 3935, "token_acc": 0.7800534533689689, "train_speed(iter/s)": 0.090315 }, { "epoch": 0.8493209743479198, "grad_norm": 0.2090303897857666, "learning_rate": 1.2159990539845412e-05, "loss": 0.7357729911804199, "memory(GiB)": 91.26, "step": 3940, "token_acc": 0.7864400967404253, "train_speed(iter/s)": 0.090315 }, { "epoch": 0.8503987928432852, "grad_norm": 0.22618499398231506, "learning_rate": 1.1990200697451504e-05, "loss": 0.7185474395751953, "memory(GiB)": 91.26, "step": 3945, "token_acc": 0.788130767631667, "train_speed(iter/s)": 0.090319 }, { "epoch": 0.8514766113386506, "grad_norm": 0.21615612506866455, "learning_rate": 1.182152896426515e-05, "loss": 0.7185166358947754, "memory(GiB)": 91.26, "step": 3950, "token_acc": 0.7862251606225347, "train_speed(iter/s)": 0.090319 }, { "epoch": 0.852554429834016, "grad_norm": 0.2168940305709839, "learning_rate": 1.1653977483154787e-05, "loss": 0.7254057884216308, "memory(GiB)": 91.26, "step": 3955, "token_acc": 0.785362836596601, "train_speed(iter/s)": 0.090315 }, { "epoch": 0.8536322483293813, "grad_norm": 0.2053513526916504, "learning_rate": 1.1487548382756707e-05, "loss": 0.7019991874694824, "memory(GiB)": 91.26, "step": 3960, "token_acc": 0.7868677888607744, "train_speed(iter/s)": 0.090318 }, { "epoch": 0.8547100668247467, "grad_norm": 0.20588794350624084, "learning_rate": 1.132224377744805e-05, "loss": 0.7231695175170898, "memory(GiB)": 91.26, "step": 3965, "token_acc": 0.791230605971897, "train_speed(iter/s)": 0.090317 }, { "epoch": 0.8557878853201121, "grad_norm": 0.2177513986825943, "learning_rate": 1.1158065767320003e-05, "loss": 0.7094979286193848, "memory(GiB)": 91.26, "step": 3970, "token_acc": 0.7757531698750932, "train_speed(iter/s)": 0.090317 }, { "epoch": 0.8568657038154774, "grad_norm": 0.22888074815273285, "learning_rate": 1.0995016438151019e-05, "loss": 0.7391958236694336, "memory(GiB)": 91.26, "step": 3975, "token_acc": 0.7745472773983332, "train_speed(iter/s)": 0.090319 }, { "epoch": 0.8579435223108428, "grad_norm": 0.22281309962272644, "learning_rate": 1.083309786138038e-05, "loss": 0.727555227279663, "memory(GiB)": 91.26, "step": 3980, "token_acc": 0.7851633622148824, "train_speed(iter/s)": 0.090319 }, { "epoch": 0.8590213408062083, "grad_norm": 0.220356285572052, "learning_rate": 1.0672312094081915e-05, "loss": 0.7377647399902344, "memory(GiB)": 91.26, "step": 3985, "token_acc": 0.7696491705194451, "train_speed(iter/s)": 0.090318 }, { "epoch": 0.8600991593015737, "grad_norm": 0.21038594841957092, "learning_rate": 1.0512661178937755e-05, "loss": 0.7139994621276855, "memory(GiB)": 91.26, "step": 3990, "token_acc": 0.7869846214106522, "train_speed(iter/s)": 0.090319 }, { "epoch": 0.861176977796939, "grad_norm": 0.21818184852600098, "learning_rate": 1.0354147144212467e-05, "loss": 0.726657772064209, "memory(GiB)": 91.26, "step": 3995, "token_acc": 0.7807688404660645, "train_speed(iter/s)": 0.090318 }, { "epoch": 0.8622547962923044, "grad_norm": 0.22787801921367645, "learning_rate": 1.0196772003727306e-05, "loss": 0.7396413326263428, "memory(GiB)": 91.26, "step": 4000, "token_acc": 0.7768146931917981, "train_speed(iter/s)": 0.090321 }, { "epoch": 0.8622547962923044, "eval_loss": 0.725006103515625, "eval_runtime": 141.4734, "eval_samples_per_second": 42.39, "eval_steps_per_second": 5.301, "eval_token_acc": 0.7813240048850564, "step": 4000 }, { "epoch": 0.8633326147876698, "grad_norm": 0.21285924315452576, "learning_rate": 1.0040537756834522e-05, "loss": 0.7252083778381347, "memory(GiB)": 91.26, "step": 4005, "token_acc": 0.7784757765471818, "train_speed(iter/s)": 0.089879 }, { "epoch": 0.8644104332830351, "grad_norm": 0.21655626595020294, "learning_rate": 9.885446388392105e-06, "loss": 0.7161072731018067, "memory(GiB)": 91.26, "step": 4010, "token_acc": 0.7930660027545291, "train_speed(iter/s)": 0.089879 }, { "epoch": 0.8654882517784005, "grad_norm": 0.21160610020160675, "learning_rate": 9.731499868738447e-06, "loss": 0.7164231300354004, "memory(GiB)": 91.26, "step": 4015, "token_acc": 0.7804700176849168, "train_speed(iter/s)": 0.089885 }, { "epoch": 0.8665660702737659, "grad_norm": 0.2155599743127823, "learning_rate": 9.57870015366732e-06, "loss": 0.7201996803283691, "memory(GiB)": 91.26, "step": 4020, "token_acc": 0.7801446196280145, "train_speed(iter/s)": 0.089885 }, { "epoch": 0.8676438887691312, "grad_norm": 0.20782513916492462, "learning_rate": 9.42704918440317e-06, "loss": 0.7189927101135254, "memory(GiB)": 91.26, "step": 4025, "token_acc": 0.7851236030260064, "train_speed(iter/s)": 0.089882 }, { "epoch": 0.8687217072644967, "grad_norm": 0.22428876161575317, "learning_rate": 9.276548887576242e-06, "loss": 0.7213934898376465, "memory(GiB)": 91.26, "step": 4030, "token_acc": 0.7872976227643727, "train_speed(iter/s)": 0.08988 }, { "epoch": 0.8697995257598621, "grad_norm": 0.22328446805477142, "learning_rate": 9.127201175198263e-06, "loss": 0.7398235321044921, "memory(GiB)": 91.26, "step": 4035, "token_acc": 0.7837513611280291, "train_speed(iter/s)": 0.089882 }, { "epoch": 0.8708773442552274, "grad_norm": 0.21689820289611816, "learning_rate": 8.979007944638106e-06, "loss": 0.7423833847045899, "memory(GiB)": 91.26, "step": 4040, "token_acc": 0.7748093764155217, "train_speed(iter/s)": 0.089886 }, { "epoch": 0.8719551627505928, "grad_norm": 0.2030118703842163, "learning_rate": 8.831971078597634e-06, "loss": 0.710521125793457, "memory(GiB)": 91.26, "step": 4045, "token_acc": 0.7879786709519548, "train_speed(iter/s)": 0.089886 }, { "epoch": 0.8730329812459582, "grad_norm": 0.2114829272031784, "learning_rate": 8.686092445087902e-06, "loss": 0.7261528968811035, "memory(GiB)": 91.26, "step": 4050, "token_acc": 0.7891857278728354, "train_speed(iter/s)": 0.089884 }, { "epoch": 0.8741107997413236, "grad_norm": 0.21785010397434235, "learning_rate": 8.541373897405258e-06, "loss": 0.7252052307128907, "memory(GiB)": 91.26, "step": 4055, "token_acc": 0.7844998596001775, "train_speed(iter/s)": 0.089884 }, { "epoch": 0.8751886182366889, "grad_norm": 0.21336394548416138, "learning_rate": 8.397817274107921e-06, "loss": 0.7264320373535156, "memory(GiB)": 91.26, "step": 4060, "token_acc": 0.7743598475174286, "train_speed(iter/s)": 0.089888 }, { "epoch": 0.8762664367320543, "grad_norm": 0.20781666040420532, "learning_rate": 8.25542439899264e-06, "loss": 0.7248138427734375, "memory(GiB)": 91.26, "step": 4065, "token_acc": 0.7855904563218054, "train_speed(iter/s)": 0.08989 }, { "epoch": 0.8773442552274197, "grad_norm": 0.21697667241096497, "learning_rate": 8.114197081071417e-06, "loss": 0.7247468948364257, "memory(GiB)": 91.26, "step": 4070, "token_acc": 0.7777007798351628, "train_speed(iter/s)": 0.089895 }, { "epoch": 0.8784220737227851, "grad_norm": 0.20338594913482666, "learning_rate": 7.974137114548575e-06, "loss": 0.7135403156280518, "memory(GiB)": 91.26, "step": 4075, "token_acc": 0.7828581475620703, "train_speed(iter/s)": 0.089897 }, { "epoch": 0.8794998922181505, "grad_norm": 0.21408383548259735, "learning_rate": 7.835246278798037e-06, "loss": 0.7347275733947753, "memory(GiB)": 91.26, "step": 4080, "token_acc": 0.7841351559813771, "train_speed(iter/s)": 0.089901 }, { "epoch": 0.8805777107135159, "grad_norm": 0.2233501374721527, "learning_rate": 7.697526338340589e-06, "loss": 0.713504695892334, "memory(GiB)": 91.26, "step": 4085, "token_acc": 0.7834428554215751, "train_speed(iter/s)": 0.089901 }, { "epoch": 0.8816555292088812, "grad_norm": 0.21559599041938782, "learning_rate": 7.5609790428215475e-06, "loss": 0.7058677673339844, "memory(GiB)": 91.26, "step": 4090, "token_acc": 0.7796736191579761, "train_speed(iter/s)": 0.089901 }, { "epoch": 0.8827333477042466, "grad_norm": 0.2206195443868637, "learning_rate": 7.425606126988538e-06, "loss": 0.7355635643005372, "memory(GiB)": 91.26, "step": 4095, "token_acc": 0.7820152811336772, "train_speed(iter/s)": 0.089904 }, { "epoch": 0.883811166199612, "grad_norm": 0.2182418704032898, "learning_rate": 7.291409310669372e-06, "loss": 0.7407171249389648, "memory(GiB)": 91.26, "step": 4100, "token_acc": 0.778044463630477, "train_speed(iter/s)": 0.0899 }, { "epoch": 0.8848889846949773, "grad_norm": 0.20796217024326324, "learning_rate": 7.158390298750328e-06, "loss": 0.7264743328094483, "memory(GiB)": 91.26, "step": 4105, "token_acc": 0.7811329826086153, "train_speed(iter/s)": 0.089902 }, { "epoch": 0.8859668031903427, "grad_norm": 0.22652900218963623, "learning_rate": 7.0265507811543614e-06, "loss": 0.7330987453460693, "memory(GiB)": 91.26, "step": 4110, "token_acc": 0.7818911123129619, "train_speed(iter/s)": 0.089907 }, { "epoch": 0.8870446216857081, "grad_norm": 0.21662814915180206, "learning_rate": 6.895892432819706e-06, "loss": 0.7088597297668457, "memory(GiB)": 91.26, "step": 4115, "token_acc": 0.7843480161766313, "train_speed(iter/s)": 0.089908 }, { "epoch": 0.8881224401810736, "grad_norm": 0.21882246434688568, "learning_rate": 6.76641691367863e-06, "loss": 0.717680549621582, "memory(GiB)": 91.26, "step": 4120, "token_acc": 0.7810422455136493, "train_speed(iter/s)": 0.089911 }, { "epoch": 0.8892002586764389, "grad_norm": 0.2076176106929779, "learning_rate": 6.6381258686362245e-06, "loss": 0.7085257530212402, "memory(GiB)": 91.26, "step": 4125, "token_acc": 0.7804230805552932, "train_speed(iter/s)": 0.089912 }, { "epoch": 0.8902780771718043, "grad_norm": 0.21008741855621338, "learning_rate": 6.511020927549616e-06, "loss": 0.728643274307251, "memory(GiB)": 91.26, "step": 4130, "token_acc": 0.7864778241329299, "train_speed(iter/s)": 0.089913 }, { "epoch": 0.8913558956671697, "grad_norm": 0.2145233303308487, "learning_rate": 6.3851037052072515e-06, "loss": 0.7383733749389648, "memory(GiB)": 91.26, "step": 4135, "token_acc": 0.7747296885531989, "train_speed(iter/s)": 0.089914 }, { "epoch": 0.892433714162535, "grad_norm": 0.20893704891204834, "learning_rate": 6.260375801308316e-06, "loss": 0.717233943939209, "memory(GiB)": 91.26, "step": 4140, "token_acc": 0.7840868226203539, "train_speed(iter/s)": 0.089917 }, { "epoch": 0.8935115326579004, "grad_norm": 0.22545728087425232, "learning_rate": 6.136838800442457e-06, "loss": 0.7292487621307373, "memory(GiB)": 91.26, "step": 4145, "token_acc": 0.779019674200364, "train_speed(iter/s)": 0.089916 }, { "epoch": 0.8945893511532658, "grad_norm": 0.21525290608406067, "learning_rate": 6.014494272069693e-06, "loss": 0.7334854125976562, "memory(GiB)": 91.26, "step": 4150, "token_acc": 0.7845460329978581, "train_speed(iter/s)": 0.089913 }, { "epoch": 0.8956671696486311, "grad_norm": 0.2091246247291565, "learning_rate": 5.8933437705003484e-06, "loss": 0.7236259460449219, "memory(GiB)": 91.26, "step": 4155, "token_acc": 0.7823016089446414, "train_speed(iter/s)": 0.089917 }, { "epoch": 0.8967449881439965, "grad_norm": 0.217746302485466, "learning_rate": 5.773388834875482e-06, "loss": 0.7281242370605469, "memory(GiB)": 91.26, "step": 4160, "token_acc": 0.7805790573943314, "train_speed(iter/s)": 0.08992 }, { "epoch": 0.897822806639362, "grad_norm": 0.21932078897953033, "learning_rate": 5.654630989147158e-06, "loss": 0.738051462173462, "memory(GiB)": 91.26, "step": 4165, "token_acc": 0.7768033794984226, "train_speed(iter/s)": 0.089919 }, { "epoch": 0.8989006251347273, "grad_norm": 0.2130691260099411, "learning_rate": 5.537071742059186e-06, "loss": 0.727278470993042, "memory(GiB)": 91.26, "step": 4170, "token_acc": 0.7791666666666667, "train_speed(iter/s)": 0.089919 }, { "epoch": 0.8999784436300927, "grad_norm": 0.20682166516780853, "learning_rate": 5.420712587127974e-06, "loss": 0.7233345031738281, "memory(GiB)": 91.26, "step": 4175, "token_acc": 0.7727156004340587, "train_speed(iter/s)": 0.089914 }, { "epoch": 0.9010562621254581, "grad_norm": 0.2179419994354248, "learning_rate": 5.305555002623463e-06, "loss": 0.737426471710205, "memory(GiB)": 91.26, "step": 4180, "token_acc": 0.7787119938190247, "train_speed(iter/s)": 0.089913 }, { "epoch": 0.9021340806208235, "grad_norm": 0.20999833941459656, "learning_rate": 5.191600451550405e-06, "loss": 0.6949337959289551, "memory(GiB)": 91.26, "step": 4185, "token_acc": 0.7928867331628464, "train_speed(iter/s)": 0.089914 }, { "epoch": 0.9032118991161888, "grad_norm": 0.20399320125579834, "learning_rate": 5.078850381629807e-06, "loss": 0.7265397071838379, "memory(GiB)": 91.26, "step": 4190, "token_acc": 0.7701281629970423, "train_speed(iter/s)": 0.089913 }, { "epoch": 0.9042897176115542, "grad_norm": 0.21370166540145874, "learning_rate": 4.967306225280455e-06, "loss": 0.7199459075927734, "memory(GiB)": 91.26, "step": 4195, "token_acc": 0.7845414092380112, "train_speed(iter/s)": 0.089915 }, { "epoch": 0.9053675361069196, "grad_norm": 0.20612017810344696, "learning_rate": 4.856969399600775e-06, "loss": 0.7387835502624511, "memory(GiB)": 91.26, "step": 4200, "token_acc": 0.7733367834468482, "train_speed(iter/s)": 0.089919 }, { "epoch": 0.9064453546022849, "grad_norm": 0.20685206353664398, "learning_rate": 4.747841306350831e-06, "loss": 0.7214616775512696, "memory(GiB)": 91.26, "step": 4205, "token_acc": 0.7836486533570259, "train_speed(iter/s)": 0.089917 }, { "epoch": 0.9075231730976504, "grad_norm": 0.20757175981998444, "learning_rate": 4.639923331934471e-06, "loss": 0.73880615234375, "memory(GiB)": 91.26, "step": 4210, "token_acc": 0.7699718283840576, "train_speed(iter/s)": 0.089921 }, { "epoch": 0.9086009915930158, "grad_norm": 0.21011170744895935, "learning_rate": 4.533216847381771e-06, "loss": 0.7277022361755371, "memory(GiB)": 91.26, "step": 4215, "token_acc": 0.7858974358974359, "train_speed(iter/s)": 0.089923 }, { "epoch": 0.9096788100883811, "grad_norm": 0.2149413377046585, "learning_rate": 4.42772320833158e-06, "loss": 0.7130107879638672, "memory(GiB)": 91.26, "step": 4220, "token_acc": 0.7750419901039539, "train_speed(iter/s)": 0.089925 }, { "epoch": 0.9107566285837465, "grad_norm": 0.2179485708475113, "learning_rate": 4.323443755014289e-06, "loss": 0.7284105300903321, "memory(GiB)": 91.26, "step": 4225, "token_acc": 0.774435755780334, "train_speed(iter/s)": 0.089926 }, { "epoch": 0.9118344470791119, "grad_norm": 0.20638763904571533, "learning_rate": 4.220379812234865e-06, "loss": 0.7003241062164307, "memory(GiB)": 91.26, "step": 4230, "token_acc": 0.7856413331709191, "train_speed(iter/s)": 0.089929 }, { "epoch": 0.9129122655744772, "grad_norm": 0.21764962375164032, "learning_rate": 4.118532689355936e-06, "loss": 0.7062634468078614, "memory(GiB)": 91.26, "step": 4235, "token_acc": 0.7901115401115401, "train_speed(iter/s)": 0.089927 }, { "epoch": 0.9139900840698426, "grad_norm": 0.22755637764930725, "learning_rate": 4.017903680281199e-06, "loss": 0.7358386039733886, "memory(GiB)": 91.26, "step": 4240, "token_acc": 0.7729881812707016, "train_speed(iter/s)": 0.08993 }, { "epoch": 0.915067902565208, "grad_norm": 0.20248976349830627, "learning_rate": 3.9184940634390155e-06, "loss": 0.7128472805023194, "memory(GiB)": 91.26, "step": 4245, "token_acc": 0.7767263062977552, "train_speed(iter/s)": 0.089932 }, { "epoch": 0.9161457210605733, "grad_norm": 0.20032113790512085, "learning_rate": 3.8203051017660975e-06, "loss": 0.6896315097808838, "memory(GiB)": 91.26, "step": 4250, "token_acc": 0.7838508028398752, "train_speed(iter/s)": 0.089934 }, { "epoch": 0.9172235395559388, "grad_norm": 0.23072639107704163, "learning_rate": 3.7233380426915267e-06, "loss": 0.7190867900848389, "memory(GiB)": 91.26, "step": 4255, "token_acc": 0.790508657204429, "train_speed(iter/s)": 0.089935 }, { "epoch": 0.9183013580513042, "grad_norm": 0.20757725834846497, "learning_rate": 3.6275941181208583e-06, "loss": 0.7324123382568359, "memory(GiB)": 91.26, "step": 4260, "token_acc": 0.7728340232658022, "train_speed(iter/s)": 0.089935 }, { "epoch": 0.9193791765466696, "grad_norm": 0.21443401277065277, "learning_rate": 3.5330745444205117e-06, "loss": 0.7228023529052734, "memory(GiB)": 91.26, "step": 4265, "token_acc": 0.7769706322396093, "train_speed(iter/s)": 0.089939 }, { "epoch": 0.9204569950420349, "grad_norm": 0.2143612653017044, "learning_rate": 3.4397805224022915e-06, "loss": 0.7339675903320313, "memory(GiB)": 91.26, "step": 4270, "token_acc": 0.7816783109991219, "train_speed(iter/s)": 0.089941 }, { "epoch": 0.9215348135374003, "grad_norm": 0.2214508056640625, "learning_rate": 3.3477132373081254e-06, "loss": 0.7276855945587158, "memory(GiB)": 91.26, "step": 4275, "token_acc": 0.7695954607148243, "train_speed(iter/s)": 0.089942 }, { "epoch": 0.9226126320327657, "grad_norm": 0.22391968965530396, "learning_rate": 3.2568738587950174e-06, "loss": 0.7292671203613281, "memory(GiB)": 91.26, "step": 4280, "token_acc": 0.7733431115373405, "train_speed(iter/s)": 0.089945 }, { "epoch": 0.923690450528131, "grad_norm": 0.2069467008113861, "learning_rate": 3.167263540920218e-06, "loss": 0.7256306648254395, "memory(GiB)": 91.26, "step": 4285, "token_acc": 0.7774584634718809, "train_speed(iter/s)": 0.089949 }, { "epoch": 0.9247682690234964, "grad_norm": 0.22039546072483063, "learning_rate": 3.078883422126533e-06, "loss": 0.7203712463378906, "memory(GiB)": 91.26, "step": 4290, "token_acc": 0.787046656371379, "train_speed(iter/s)": 0.08995 }, { "epoch": 0.9258460875188618, "grad_norm": 0.20236016809940338, "learning_rate": 2.9917346252278157e-06, "loss": 0.7166401386260987, "memory(GiB)": 91.26, "step": 4295, "token_acc": 0.7776031175285679, "train_speed(iter/s)": 0.089953 }, { "epoch": 0.9269239060142273, "grad_norm": 0.22908715903759003, "learning_rate": 2.905818257394799e-06, "loss": 0.7386090755462646, "memory(GiB)": 91.26, "step": 4300, "token_acc": 0.7683732044521502, "train_speed(iter/s)": 0.08995 }, { "epoch": 0.9280017245095926, "grad_norm": 0.21900223195552826, "learning_rate": 2.8211354101409517e-06, "loss": 0.7179301261901856, "memory(GiB)": 91.26, "step": 4305, "token_acc": 0.7815859613974205, "train_speed(iter/s)": 0.08995 }, { "epoch": 0.929079543004958, "grad_norm": 0.20916599035263062, "learning_rate": 2.7376871593086327e-06, "loss": 0.7104029655456543, "memory(GiB)": 91.26, "step": 4310, "token_acc": 0.7933340970979881, "train_speed(iter/s)": 0.089953 }, { "epoch": 0.9301573615003234, "grad_norm": 0.21610896289348602, "learning_rate": 2.655474565055471e-06, "loss": 0.7114706993103027, "memory(GiB)": 91.26, "step": 4315, "token_acc": 0.783667340043263, "train_speed(iter/s)": 0.08995 }, { "epoch": 0.9312351799956887, "grad_norm": 0.22419430315494537, "learning_rate": 2.574498671840808e-06, "loss": 0.7487603187561035, "memory(GiB)": 91.26, "step": 4320, "token_acc": 0.7782896108960001, "train_speed(iter/s)": 0.089948 }, { "epoch": 0.9323129984910541, "grad_norm": 0.20395125448703766, "learning_rate": 2.4947605084124857e-06, "loss": 0.7242486953735352, "memory(GiB)": 91.26, "step": 4325, "token_acc": 0.7804864642022812, "train_speed(iter/s)": 0.08995 }, { "epoch": 0.9333908169864195, "grad_norm": 0.21256829798221588, "learning_rate": 2.416261087793792e-06, "loss": 0.7356223106384278, "memory(GiB)": 91.26, "step": 4330, "token_acc": 0.7763174198573275, "train_speed(iter/s)": 0.089952 }, { "epoch": 0.9344686354817848, "grad_norm": 0.20915992558002472, "learning_rate": 2.3390014072705467e-06, "loss": 0.7274567604064941, "memory(GiB)": 91.26, "step": 4335, "token_acc": 0.7820506779996419, "train_speed(iter/s)": 0.089955 }, { "epoch": 0.9355464539771502, "grad_norm": 0.22017133235931396, "learning_rate": 2.2629824483784366e-06, "loss": 0.6947534561157227, "memory(GiB)": 91.26, "step": 4340, "token_acc": 0.788730584217677, "train_speed(iter/s)": 0.089955 }, { "epoch": 0.9366242724725157, "grad_norm": 0.22004073858261108, "learning_rate": 2.1882051768905897e-06, "loss": 0.7143162250518799, "memory(GiB)": 91.26, "step": 4345, "token_acc": 0.7773071076341256, "train_speed(iter/s)": 0.089955 }, { "epoch": 0.937702090967881, "grad_norm": 0.20348204672336578, "learning_rate": 2.1146705428052525e-06, "loss": 0.7214525222778321, "memory(GiB)": 91.26, "step": 4350, "token_acc": 0.7843021010177226, "train_speed(iter/s)": 0.089958 }, { "epoch": 0.9387799094632464, "grad_norm": 0.2116391360759735, "learning_rate": 2.0423794803337672e-06, "loss": 0.730920934677124, "memory(GiB)": 91.26, "step": 4355, "token_acc": 0.7764006775456557, "train_speed(iter/s)": 0.08996 }, { "epoch": 0.9398577279586118, "grad_norm": 0.21648815274238586, "learning_rate": 1.9713329078886457e-06, "loss": 0.7400577545166016, "memory(GiB)": 91.26, "step": 4360, "token_acc": 0.7654719179678396, "train_speed(iter/s)": 0.089963 }, { "epoch": 0.9409355464539771, "grad_norm": 0.214090034365654, "learning_rate": 1.9015317280719703e-06, "loss": 0.7376798629760742, "memory(GiB)": 91.26, "step": 4365, "token_acc": 0.7735489276462147, "train_speed(iter/s)": 0.089962 }, { "epoch": 0.9420133649493425, "grad_norm": 0.2167166918516159, "learning_rate": 1.8329768276638682e-06, "loss": 0.7175388336181641, "memory(GiB)": 91.26, "step": 4370, "token_acc": 0.7816790206022776, "train_speed(iter/s)": 0.089963 }, { "epoch": 0.9430911834447079, "grad_norm": 0.21185563504695892, "learning_rate": 1.7656690776112872e-06, "loss": 0.7109268188476563, "memory(GiB)": 91.26, "step": 4375, "token_acc": 0.7812273463579655, "train_speed(iter/s)": 0.089963 }, { "epoch": 0.9441690019400732, "grad_norm": 0.21129101514816284, "learning_rate": 1.6996093330168827e-06, "loss": 0.7218971252441406, "memory(GiB)": 91.26, "step": 4380, "token_acc": 0.7798909155937053, "train_speed(iter/s)": 0.089963 }, { "epoch": 0.9452468204354386, "grad_norm": 0.20574553310871124, "learning_rate": 1.6347984331282483e-06, "loss": 0.7014687538146973, "memory(GiB)": 91.26, "step": 4385, "token_acc": 0.7908480966292336, "train_speed(iter/s)": 0.089965 }, { "epoch": 0.9463246389308041, "grad_norm": 0.21078984439373016, "learning_rate": 1.5712372013271138e-06, "loss": 0.7132591247558594, "memory(GiB)": 91.26, "step": 4390, "token_acc": 0.7840226742126253, "train_speed(iter/s)": 0.089967 }, { "epoch": 0.9474024574261695, "grad_norm": 0.21788227558135986, "learning_rate": 1.5089264451190077e-06, "loss": 0.7124405860900879, "memory(GiB)": 91.26, "step": 4395, "token_acc": 0.7842289931833609, "train_speed(iter/s)": 0.089972 }, { "epoch": 0.9484802759215348, "grad_norm": 0.20152238011360168, "learning_rate": 1.4478669561229563e-06, "loss": 0.713834285736084, "memory(GiB)": 91.26, "step": 4400, "token_acc": 0.7803595251120516, "train_speed(iter/s)": 0.089973 }, { "epoch": 0.9495580944169002, "grad_norm": 0.1996513307094574, "learning_rate": 1.3880595100613792e-06, "loss": 0.715969467163086, "memory(GiB)": 91.26, "step": 4405, "token_acc": 0.786045047047492, "train_speed(iter/s)": 0.089972 }, { "epoch": 0.9506359129122656, "grad_norm": 0.21363607048988342, "learning_rate": 1.3295048667503308e-06, "loss": 0.7164482116699219, "memory(GiB)": 91.26, "step": 4410, "token_acc": 0.7709017716423295, "train_speed(iter/s)": 0.089973 }, { "epoch": 0.9517137314076309, "grad_norm": 0.21149952709674835, "learning_rate": 1.2722037700897416e-06, "loss": 0.7024490356445312, "memory(GiB)": 91.26, "step": 4415, "token_acc": 0.7987126812031051, "train_speed(iter/s)": 0.089975 }, { "epoch": 0.9527915499029963, "grad_norm": 0.2276209145784378, "learning_rate": 1.2161569480540591e-06, "loss": 0.7310225486755371, "memory(GiB)": 91.26, "step": 4420, "token_acc": 0.7831090938563786, "train_speed(iter/s)": 0.089977 }, { "epoch": 0.9538693683983617, "grad_norm": 0.21087877452373505, "learning_rate": 1.161365112682944e-06, "loss": 0.7108451843261718, "memory(GiB)": 91.26, "step": 4425, "token_acc": 0.7875768205673044, "train_speed(iter/s)": 0.089976 }, { "epoch": 0.9549471868937272, "grad_norm": 0.20917417109012604, "learning_rate": 1.1078289600722547e-06, "loss": 0.6977101325988769, "memory(GiB)": 91.26, "step": 4430, "token_acc": 0.7962116204739096, "train_speed(iter/s)": 0.089978 }, { "epoch": 0.9560250053890925, "grad_norm": 0.20612475275993347, "learning_rate": 1.0555491703651444e-06, "loss": 0.705262279510498, "memory(GiB)": 91.26, "step": 4435, "token_acc": 0.7789915667366474, "train_speed(iter/s)": 0.089976 }, { "epoch": 0.9571028238844579, "grad_norm": 0.20784829556941986, "learning_rate": 1.0045264077435224e-06, "loss": 0.7187210083007812, "memory(GiB)": 91.26, "step": 4440, "token_acc": 0.7898830827964686, "train_speed(iter/s)": 0.089977 }, { "epoch": 0.9581806423798233, "grad_norm": 0.22065778076648712, "learning_rate": 9.547613204195283e-07, "loss": 0.7114683628082276, "memory(GiB)": 91.26, "step": 4445, "token_acc": 0.775649013993398, "train_speed(iter/s)": 0.089982 }, { "epoch": 0.9592584608751886, "grad_norm": 0.22133053839206696, "learning_rate": 9.062545406273159e-07, "loss": 0.7239481925964355, "memory(GiB)": 91.26, "step": 4450, "token_acc": 0.7870013646617589, "train_speed(iter/s)": 0.089984 }, { "epoch": 0.960336279370554, "grad_norm": 0.21478867530822754, "learning_rate": 8.590066846150602e-07, "loss": 0.7305329322814942, "memory(GiB)": 91.26, "step": 4455, "token_acc": 0.7791517666220803, "train_speed(iter/s)": 0.089984 }, { "epoch": 0.9614140978659194, "grad_norm": 0.20866502821445465, "learning_rate": 8.13018352637096e-07, "loss": 0.711725902557373, "memory(GiB)": 91.26, "step": 4460, "token_acc": 0.786479175930043, "train_speed(iter/s)": 0.089983 }, { "epoch": 0.9624919163612847, "grad_norm": 0.21731503307819366, "learning_rate": 7.682901289462807e-07, "loss": 0.7062753200531006, "memory(GiB)": 91.26, "step": 4465, "token_acc": 0.7887196445561108, "train_speed(iter/s)": 0.089981 }, { "epoch": 0.9635697348566501, "grad_norm": 0.21773622930049896, "learning_rate": 7.248225817865884e-07, "loss": 0.7175806522369385, "memory(GiB)": 91.26, "step": 4470, "token_acc": 0.7742971450344284, "train_speed(iter/s)": 0.089983 }, { "epoch": 0.9646475533520156, "grad_norm": 0.20518571138381958, "learning_rate": 6.826162633859046e-07, "loss": 0.7105363845825196, "memory(GiB)": 91.26, "step": 4475, "token_acc": 0.7780797012912117, "train_speed(iter/s)": 0.089988 }, { "epoch": 0.9657253718473809, "grad_norm": 0.23465275764465332, "learning_rate": 6.416717099489878e-07, "loss": 0.7313633918762207, "memory(GiB)": 91.26, "step": 4480, "token_acc": 0.7817431157178217, "train_speed(iter/s)": 0.089988 }, { "epoch": 0.9668031903427463, "grad_norm": 0.20805859565734863, "learning_rate": 6.019894416506743e-07, "loss": 0.7153730869293213, "memory(GiB)": 91.26, "step": 4485, "token_acc": 0.7826031421637502, "train_speed(iter/s)": 0.089989 }, { "epoch": 0.9678810088381117, "grad_norm": 0.21996386349201202, "learning_rate": 5.63569962629229e-07, "loss": 0.7459645748138428, "memory(GiB)": 91.26, "step": 4490, "token_acc": 0.7784280936454849, "train_speed(iter/s)": 0.089991 }, { "epoch": 0.968958827333477, "grad_norm": 0.2162637710571289, "learning_rate": 5.264137609800157e-07, "loss": 0.7132817268371582, "memory(GiB)": 91.26, "step": 4495, "token_acc": 0.78169208375748, "train_speed(iter/s)": 0.089992 }, { "epoch": 0.9700366458288424, "grad_norm": 0.2123820036649704, "learning_rate": 4.905213087492255e-07, "loss": 0.720570421218872, "memory(GiB)": 91.26, "step": 4500, "token_acc": 0.7838685586258402, "train_speed(iter/s)": 0.089991 }, { "epoch": 0.9711144643242078, "grad_norm": 0.21034042537212372, "learning_rate": 4.5589306192791446e-07, "loss": 0.7150312423706054, "memory(GiB)": 91.26, "step": 4505, "token_acc": 0.7837169753459283, "train_speed(iter/s)": 0.089992 }, { "epoch": 0.9721922828195732, "grad_norm": 0.20039622485637665, "learning_rate": 4.2252946044621935e-07, "loss": 0.7294380187988281, "memory(GiB)": 91.26, "step": 4510, "token_acc": 0.7867996901855199, "train_speed(iter/s)": 0.089993 }, { "epoch": 0.9732701013149385, "grad_norm": 0.21560324728488922, "learning_rate": 3.9043092816770656e-07, "loss": 0.7345722675323486, "memory(GiB)": 91.26, "step": 4515, "token_acc": 0.784517429429882, "train_speed(iter/s)": 0.089997 }, { "epoch": 0.974347919810304, "grad_norm": 0.2155689299106598, "learning_rate": 3.5959787288407653e-07, "loss": 0.7037801742553711, "memory(GiB)": 91.26, "step": 4520, "token_acc": 0.777932242768182, "train_speed(iter/s)": 0.089999 }, { "epoch": 0.9754257383056694, "grad_norm": 0.2128688097000122, "learning_rate": 3.300306863099345e-07, "loss": 0.7405351161956787, "memory(GiB)": 91.26, "step": 4525, "token_acc": 0.7796503561187679, "train_speed(iter/s)": 0.090003 }, { "epoch": 0.9765035568010347, "grad_norm": 0.22825267910957336, "learning_rate": 3.0172974407779443e-07, "loss": 0.7280765533447265, "memory(GiB)": 91.26, "step": 4530, "token_acc": 0.7763043577667017, "train_speed(iter/s)": 0.090005 }, { "epoch": 0.9775813752964001, "grad_norm": 0.22008006274700165, "learning_rate": 2.746954057333606e-07, "loss": 0.717118787765503, "memory(GiB)": 91.26, "step": 4535, "token_acc": 0.7913071001014699, "train_speed(iter/s)": 0.090009 }, { "epoch": 0.9786591937917655, "grad_norm": 0.2116696685552597, "learning_rate": 2.489280147309092e-07, "loss": 0.7228737354278565, "memory(GiB)": 91.26, "step": 4540, "token_acc": 0.7749241489371573, "train_speed(iter/s)": 0.090011 }, { "epoch": 0.9797370122871308, "grad_norm": 0.2270619124174118, "learning_rate": 2.244278984289583e-07, "loss": 0.7349771022796631, "memory(GiB)": 91.26, "step": 4545, "token_acc": 0.7684010850042153, "train_speed(iter/s)": 0.09001 }, { "epoch": 0.9808148307824962, "grad_norm": 0.21378257870674133, "learning_rate": 2.0119536808609342e-07, "loss": 0.7220155715942382, "memory(GiB)": 91.26, "step": 4550, "token_acc": 0.779769502389759, "train_speed(iter/s)": 0.090012 }, { "epoch": 0.9818926492778616, "grad_norm": 0.2115267664194107, "learning_rate": 1.7923071885700415e-07, "loss": 0.7039060115814209, "memory(GiB)": 91.26, "step": 4555, "token_acc": 0.7964028341306849, "train_speed(iter/s)": 0.090013 }, { "epoch": 0.9829704677732269, "grad_norm": 0.21116676926612854, "learning_rate": 1.5853422978878706e-07, "loss": 0.7122782230377197, "memory(GiB)": 91.26, "step": 4560, "token_acc": 0.7842300522679923, "train_speed(iter/s)": 0.090012 }, { "epoch": 0.9840482862685924, "grad_norm": 0.23747503757476807, "learning_rate": 1.391061638173041e-07, "loss": 0.7437212944030762, "memory(GiB)": 91.26, "step": 4565, "token_acc": 0.7691274421706948, "train_speed(iter/s)": 0.090015 }, { "epoch": 0.9851261047639578, "grad_norm": 0.21059466898441315, "learning_rate": 1.2094676776394087e-07, "loss": 0.7231916427612305, "memory(GiB)": 91.26, "step": 4570, "token_acc": 0.7779406477307299, "train_speed(iter/s)": 0.090017 }, { "epoch": 0.9862039232593232, "grad_norm": 0.2090761512517929, "learning_rate": 1.040562723324201e-07, "loss": 0.7266976356506347, "memory(GiB)": 91.26, "step": 4575, "token_acc": 0.7734583043017629, "train_speed(iter/s)": 0.090019 }, { "epoch": 0.9872817417546885, "grad_norm": 0.21311582624912262, "learning_rate": 8.843489210587086e-08, "loss": 0.7090636253356933, "memory(GiB)": 91.26, "step": 4580, "token_acc": 0.7874593726034401, "train_speed(iter/s)": 0.09002 }, { "epoch": 0.9883595602500539, "grad_norm": 0.21094554662704468, "learning_rate": 7.40828255441084e-08, "loss": 0.7117923259735107, "memory(GiB)": 91.26, "step": 4585, "token_acc": 0.7879684045427918, "train_speed(iter/s)": 0.09002 }, { "epoch": 0.9894373787454193, "grad_norm": 0.21162006258964539, "learning_rate": 6.100025498112505e-08, "loss": 0.7360248565673828, "memory(GiB)": 91.26, "step": 4590, "token_acc": 0.7769003926027251, "train_speed(iter/s)": 0.090021 }, { "epoch": 0.9905151972407846, "grad_norm": 0.2063881754875183, "learning_rate": 4.918734662275881e-08, "loss": 0.7055489540100097, "memory(GiB)": 91.26, "step": 4595, "token_acc": 0.7857246587278536, "train_speed(iter/s)": 0.090023 }, { "epoch": 0.99159301573615, "grad_norm": 0.2180391252040863, "learning_rate": 3.8644250544594975e-08, "loss": 0.7306919574737549, "memory(GiB)": 91.26, "step": 4600, "token_acc": 0.7871569183296001, "train_speed(iter/s)": 0.090024 }, { "epoch": 0.9926708342315154, "grad_norm": 0.22651870548725128, "learning_rate": 2.937110069004545e-08, "loss": 0.703729248046875, "memory(GiB)": 91.26, "step": 4605, "token_acc": 0.7874643411692486, "train_speed(iter/s)": 0.090027 }, { "epoch": 0.9937486527268808, "grad_norm": 0.21322208642959595, "learning_rate": 2.1368014868650142e-08, "loss": 0.7249719619750976, "memory(GiB)": 91.26, "step": 4610, "token_acc": 0.7745806811736425, "train_speed(iter/s)": 0.090029 }, { "epoch": 0.9948264712222462, "grad_norm": 0.21622636914253235, "learning_rate": 1.463509475458924e-08, "loss": 0.7308333396911622, "memory(GiB)": 91.26, "step": 4615, "token_acc": 0.7808048499274742, "train_speed(iter/s)": 0.090028 }, { "epoch": 0.9959042897176116, "grad_norm": 0.21811260282993317, "learning_rate": 9.172425885373148e-09, "loss": 0.7121863842010498, "memory(GiB)": 91.26, "step": 4620, "token_acc": 0.7896811517483244, "train_speed(iter/s)": 0.090029 }, { "epoch": 0.996982108212977, "grad_norm": 0.20972463488578796, "learning_rate": 4.980077660787786e-09, "loss": 0.7050772666931152, "memory(GiB)": 91.26, "step": 4625, "token_acc": 0.7863228336712644, "train_speed(iter/s)": 0.090033 }, { "epoch": 0.9980599267083423, "grad_norm": 0.21319890022277832, "learning_rate": 2.0581033419841968e-09, "loss": 0.7299002647399903, "memory(GiB)": 91.26, "step": 4630, "token_acc": 0.7778151352137249, "train_speed(iter/s)": 0.090031 }, { "epoch": 0.9991377452037077, "grad_norm": 0.20837628841400146, "learning_rate": 4.065400508013184e-10, "loss": 0.728370189666748, "memory(GiB)": 91.26, "step": 4635, "token_acc": 0.774958046202735, "train_speed(iter/s)": 0.09003 }, { "epoch": 1.0, "eval_loss": 0.7199121713638306, "eval_runtime": 141.8629, "eval_samples_per_second": 42.273, "eval_steps_per_second": 5.287, "eval_token_acc": 0.7824753007167098, "step": 4639 } ], "logging_steps": 5, "max_steps": 4639, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 2000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 3.008502348455189e+19, "train_batch_size": 4, "trial_name": null, "trial_params": null }