{ "best_metric": 0.00065021, "best_model_checkpoint": "/mnt/si0001694oxp/default/vlm_sft/outputs/output/deepseek-vl-7b-chat/v32-20250613-154734/checkpoint-3000", "epoch": 1.0993219717793659, "eval_steps": 500, "global_step": 3000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0003665017408832692, "grad_norm": 16.33839225769043, "learning_rate": 9.999999631609428e-06, "loss": 3.0264194011688232, "memory(GiB)": 149.2, "step": 1, "token_acc": 0.4675925925925926, "train_speed(iter/s)": 0.033836 }, { "epoch": 0.001832508704416346, "grad_norm": 6.188778400421143, "learning_rate": 9.999990790238409e-06, "loss": 0.6836232542991638, "memory(GiB)": 158.4, "step": 5, "token_acc": 0.8406651231319722, "train_speed(iter/s)": 0.042635 }, { "epoch": 0.003665017408832692, "grad_norm": 0.4842391312122345, "learning_rate": 9.999963160987561e-06, "loss": 0.05034670829772949, "memory(GiB)": 158.4, "step": 10, "token_acc": 0.9856121161127471, "train_speed(iter/s)": 0.04407 }, { "epoch": 0.005497526113249038, "grad_norm": 0.1827951818704605, "learning_rate": 9.99991711234924e-06, "loss": 0.01651783734560013, "memory(GiB)": 158.4, "step": 15, "token_acc": 0.9920074036681811, "train_speed(iter/s)": 0.044607 }, { "epoch": 0.007330034817665384, "grad_norm": 0.09308009594678879, "learning_rate": 9.999852644493086e-06, "loss": 0.014441253244876861, "memory(GiB)": 158.4, "step": 20, "token_acc": 0.9914947368421053, "train_speed(iter/s)": 0.044685 }, { "epoch": 0.00916254352208173, "grad_norm": 0.13165982067584991, "learning_rate": 9.999769757656593e-06, "loss": 0.013714964687824249, "memory(GiB)": 158.4, "step": 25, "token_acc": 0.9919225915018931, "train_speed(iter/s)": 0.044897 }, { "epoch": 0.010995052226498075, "grad_norm": 0.136412113904953, "learning_rate": 9.999668452145104e-06, "loss": 0.010563116520643234, "memory(GiB)": 158.4, "step": 30, "token_acc": 0.9947824623411596, "train_speed(iter/s)": 0.04502 }, { "epoch": 0.012827560930914422, "grad_norm": 0.2637465298175812, "learning_rate": 9.999548728331825e-06, "loss": 0.008089790493249894, "memory(GiB)": 158.4, "step": 35, "token_acc": 0.9959606160060591, "train_speed(iter/s)": 0.045028 }, { "epoch": 0.014660069635330768, "grad_norm": 0.2768152952194214, "learning_rate": 9.999410586657801e-06, "loss": 0.005358598381280899, "memory(GiB)": 158.4, "step": 40, "token_acc": 0.9978118161925602, "train_speed(iter/s)": 0.045061 }, { "epoch": 0.016492578339747113, "grad_norm": 0.09677782654762268, "learning_rate": 9.999254027631938e-06, "loss": 0.003943501785397529, "memory(GiB)": 158.4, "step": 45, "token_acc": 0.9986528584659425, "train_speed(iter/s)": 0.044994 }, { "epoch": 0.01832508704416346, "grad_norm": 0.3623986840248108, "learning_rate": 9.99907905183098e-06, "loss": 0.0031241703778505324, "memory(GiB)": 158.4, "step": 50, "token_acc": 0.9987373737373737, "train_speed(iter/s)": 0.04505 }, { "epoch": 0.020157595748579806, "grad_norm": 0.496895432472229, "learning_rate": 9.998885659899524e-06, "loss": 0.002511710487306118, "memory(GiB)": 158.4, "step": 55, "token_acc": 0.9988217471806093, "train_speed(iter/s)": 0.045107 }, { "epoch": 0.02199010445299615, "grad_norm": 0.1918005645275116, "learning_rate": 9.998673852550007e-06, "loss": 0.002556230500340462, "memory(GiB)": 158.4, "step": 60, "token_acc": 0.9994104270192875, "train_speed(iter/s)": 0.045164 }, { "epoch": 0.0238226131574125, "grad_norm": 0.16670851409435272, "learning_rate": 9.998443630562707e-06, "loss": 0.0034642994403839113, "memory(GiB)": 158.4, "step": 65, "token_acc": 0.9989904938167746, "train_speed(iter/s)": 0.045187 }, { "epoch": 0.025655121861828844, "grad_norm": 0.04445331171154976, "learning_rate": 9.99819499478574e-06, "loss": 0.00226197075098753, "memory(GiB)": 158.4, "step": 70, "token_acc": 0.9994109231675503, "train_speed(iter/s)": 0.045194 }, { "epoch": 0.02748763056624519, "grad_norm": 0.13421526551246643, "learning_rate": 9.997927946135055e-06, "loss": 0.0026616916060447694, "memory(GiB)": 158.4, "step": 75, "token_acc": 0.998989558773998, "train_speed(iter/s)": 0.04522 }, { "epoch": 0.029320139270661537, "grad_norm": 0.09873384982347488, "learning_rate": 9.997642485594436e-06, "loss": 0.0017027700319886207, "memory(GiB)": 158.4, "step": 80, "token_acc": 0.9993260887878022, "train_speed(iter/s)": 0.04525 }, { "epoch": 0.03115264797507788, "grad_norm": 0.03224126249551773, "learning_rate": 9.997338614215492e-06, "loss": 0.0017118226736783982, "memory(GiB)": 158.4, "step": 85, "token_acc": 0.9993263725159987, "train_speed(iter/s)": 0.04528 }, { "epoch": 0.032985156679494226, "grad_norm": 0.3803243637084961, "learning_rate": 9.997016333117655e-06, "loss": 0.0019580798223614694, "memory(GiB)": 158.4, "step": 90, "token_acc": 0.9993265993265993, "train_speed(iter/s)": 0.045299 }, { "epoch": 0.034817665383910575, "grad_norm": 0.3237900733947754, "learning_rate": 9.996675643488177e-06, "loss": 0.002880098670721054, "memory(GiB)": 158.4, "step": 95, "token_acc": 0.9990737622094982, "train_speed(iter/s)": 0.045329 }, { "epoch": 0.03665017408832692, "grad_norm": 0.1465182900428772, "learning_rate": 9.99631654658213e-06, "loss": 0.0028293343260884286, "memory(GiB)": 158.4, "step": 100, "token_acc": 0.9990743857287109, "train_speed(iter/s)": 0.045355 }, { "epoch": 0.038482682792743264, "grad_norm": 0.24748782813549042, "learning_rate": 9.995939043722388e-06, "loss": 0.0018339043483138085, "memory(GiB)": 158.4, "step": 105, "token_acc": 0.9994106255788499, "train_speed(iter/s)": 0.045379 }, { "epoch": 0.04031519149715961, "grad_norm": 0.04621001332998276, "learning_rate": 9.995543136299636e-06, "loss": 0.0019403379410505295, "memory(GiB)": 158.4, "step": 110, "token_acc": 0.9994108735903047, "train_speed(iter/s)": 0.045398 }, { "epoch": 0.04214770020157596, "grad_norm": 0.06725554913282394, "learning_rate": 9.995128825772365e-06, "loss": 0.0010762955993413926, "memory(GiB)": 158.4, "step": 115, "token_acc": 0.9995792308339645, "train_speed(iter/s)": 0.045421 }, { "epoch": 0.0439802089059923, "grad_norm": 0.16836291551589966, "learning_rate": 9.99469611366685e-06, "loss": 0.0029191805049777033, "memory(GiB)": 158.4, "step": 120, "token_acc": 0.9990743857287109, "train_speed(iter/s)": 0.045438 }, { "epoch": 0.04581271761040865, "grad_norm": 0.19015128910541534, "learning_rate": 9.994245001577163e-06, "loss": 0.0029153132811188696, "memory(GiB)": 158.4, "step": 125, "token_acc": 0.9988206553786538, "train_speed(iter/s)": 0.045454 }, { "epoch": 0.047645226314825, "grad_norm": 0.2672649919986725, "learning_rate": 9.993775491165157e-06, "loss": 0.0028599994257092476, "memory(GiB)": 158.4, "step": 130, "token_acc": 0.9989905787348586, "train_speed(iter/s)": 0.045477 }, { "epoch": 0.04947773501924134, "grad_norm": 0.09613120555877686, "learning_rate": 9.993287584160462e-06, "loss": 0.001117743458598852, "memory(GiB)": 158.4, "step": 135, "token_acc": 0.9996634129922585, "train_speed(iter/s)": 0.045484 }, { "epoch": 0.05131024372365769, "grad_norm": 0.08400937169790268, "learning_rate": 9.992781282360486e-06, "loss": 0.0014099805615842343, "memory(GiB)": 158.4, "step": 140, "token_acc": 0.9995794785534062, "train_speed(iter/s)": 0.045497 }, { "epoch": 0.053142752428074036, "grad_norm": 0.2961122989654541, "learning_rate": 9.992256587630392e-06, "loss": 0.0026107219979166984, "memory(GiB)": 158.4, "step": 145, "token_acc": 0.9993264292329713, "train_speed(iter/s)": 0.045509 }, { "epoch": 0.05497526113249038, "grad_norm": 0.11588957160711288, "learning_rate": 9.991713501903107e-06, "loss": 0.0020393442362546923, "memory(GiB)": 158.4, "step": 150, "token_acc": 0.9991583908432924, "train_speed(iter/s)": 0.045518 }, { "epoch": 0.056807769836906725, "grad_norm": 0.04025767371058464, "learning_rate": 9.991152027179307e-06, "loss": 0.001108243688941002, "memory(GiB)": 158.4, "step": 155, "token_acc": 0.9997475172529877, "train_speed(iter/s)": 0.045528 }, { "epoch": 0.058640278541323074, "grad_norm": 0.26148226857185364, "learning_rate": 9.990572165527413e-06, "loss": 0.003043392114341259, "memory(GiB)": 158.4, "step": 160, "token_acc": 0.9991581074254925, "train_speed(iter/s)": 0.045537 }, { "epoch": 0.060472787245739415, "grad_norm": 0.02609323337674141, "learning_rate": 9.989973919083576e-06, "loss": 0.003145371749997139, "memory(GiB)": 158.4, "step": 165, "token_acc": 0.9989058160087535, "train_speed(iter/s)": 0.045548 }, { "epoch": 0.06230529595015576, "grad_norm": 0.08112650364637375, "learning_rate": 9.989357290051681e-06, "loss": 0.0019015805795788766, "memory(GiB)": 158.4, "step": 170, "token_acc": 0.9991585324806462, "train_speed(iter/s)": 0.045556 }, { "epoch": 0.06413780465457211, "grad_norm": 0.012307146564126015, "learning_rate": 9.98872228070333e-06, "loss": 0.0017634263262152673, "memory(GiB)": 158.4, "step": 175, "token_acc": 0.9994951619688683, "train_speed(iter/s)": 0.045559 }, { "epoch": 0.06597031335898845, "grad_norm": 0.22926685214042664, "learning_rate": 9.988068893377841e-06, "loss": 0.0008580862544476986, "memory(GiB)": 158.4, "step": 180, "token_acc": 0.9996634413125789, "train_speed(iter/s)": 0.045562 }, { "epoch": 0.06780282206340481, "grad_norm": 0.07493411749601364, "learning_rate": 9.987397130482224e-06, "loss": 0.001726461760699749, "memory(GiB)": 158.4, "step": 185, "token_acc": 0.9994107744107744, "train_speed(iter/s)": 0.045574 }, { "epoch": 0.06963533076782115, "grad_norm": 0.11616482585668564, "learning_rate": 9.986706994491194e-06, "loss": 0.0020760688930749893, "memory(GiB)": 158.4, "step": 190, "token_acc": 0.999494779386999, "train_speed(iter/s)": 0.04558 }, { "epoch": 0.07146783947223749, "grad_norm": 0.1130843311548233, "learning_rate": 9.985998487947143e-06, "loss": 0.003568219020962715, "memory(GiB)": 158.4, "step": 195, "token_acc": 0.9988221436984688, "train_speed(iter/s)": 0.045588 }, { "epoch": 0.07330034817665385, "grad_norm": 0.03086119331419468, "learning_rate": 9.985271613460144e-06, "loss": 0.0014082181267440319, "memory(GiB)": 158.4, "step": 200, "token_acc": 0.9996632996632997, "train_speed(iter/s)": 0.045593 }, { "epoch": 0.07513285688107019, "grad_norm": 0.10936316847801208, "learning_rate": 9.984526373707933e-06, "loss": 0.0023099591955542563, "memory(GiB)": 158.4, "step": 205, "token_acc": 0.999242615501136, "train_speed(iter/s)": 0.045599 }, { "epoch": 0.07696536558548653, "grad_norm": 0.17849738895893097, "learning_rate": 9.983762771435902e-06, "loss": 0.0017316842451691628, "memory(GiB)": 158.4, "step": 210, "token_acc": 0.9995793016407236, "train_speed(iter/s)": 0.0456 }, { "epoch": 0.07879787428990288, "grad_norm": 0.07379074394702911, "learning_rate": 9.982980809457088e-06, "loss": 0.001504539605230093, "memory(GiB)": 158.4, "step": 215, "token_acc": 0.99949499200404, "train_speed(iter/s)": 0.045601 }, { "epoch": 0.08063038299431922, "grad_norm": 0.20956623554229736, "learning_rate": 9.982180490652165e-06, "loss": 0.001286138966679573, "memory(GiB)": 158.4, "step": 220, "token_acc": 0.9997476022211005, "train_speed(iter/s)": 0.045606 }, { "epoch": 0.08246289169873557, "grad_norm": 0.36039137840270996, "learning_rate": 9.981361817969433e-06, "loss": 0.0015822691842913628, "memory(GiB)": 158.4, "step": 225, "token_acc": 0.999494779386999, "train_speed(iter/s)": 0.045612 }, { "epoch": 0.08429540040315192, "grad_norm": 0.05167197808623314, "learning_rate": 9.9805247944248e-06, "loss": 0.0016318798065185548, "memory(GiB)": 158.4, "step": 230, "token_acc": 0.9994951194883878, "train_speed(iter/s)": 0.045618 }, { "epoch": 0.08612790910756826, "grad_norm": 0.0602310486137867, "learning_rate": 9.979669423101784e-06, "loss": 0.0017338620498776435, "memory(GiB)": 158.4, "step": 235, "token_acc": 0.9992421690804985, "train_speed(iter/s)": 0.045622 }, { "epoch": 0.0879604178119846, "grad_norm": 0.03006557747721672, "learning_rate": 9.978795707151492e-06, "loss": 0.0005913118831813336, "memory(GiB)": 158.4, "step": 240, "token_acc": 0.9997476659096644, "train_speed(iter/s)": 0.045626 }, { "epoch": 0.08979292651640096, "grad_norm": 0.1851363480091095, "learning_rate": 9.977903649792606e-06, "loss": 0.0013333003968000411, "memory(GiB)": 158.4, "step": 245, "token_acc": 0.9995793016407236, "train_speed(iter/s)": 0.04562 }, { "epoch": 0.0916254352208173, "grad_norm": 0.16427940130233765, "learning_rate": 9.976993254311385e-06, "loss": 0.0022492580115795135, "memory(GiB)": 158.4, "step": 250, "token_acc": 0.999326259053394, "train_speed(iter/s)": 0.045566 }, { "epoch": 0.09345794392523364, "grad_norm": 0.07113044708967209, "learning_rate": 9.976064524061637e-06, "loss": 0.0023244613781571387, "memory(GiB)": 158.4, "step": 255, "token_acc": 0.9994107744107744, "train_speed(iter/s)": 0.04552 }, { "epoch": 0.09529045262965, "grad_norm": 0.0672680214047432, "learning_rate": 9.975117462464716e-06, "loss": 0.0020451253280043603, "memory(GiB)": 158.4, "step": 260, "token_acc": 0.9994105263157895, "train_speed(iter/s)": 0.045509 }, { "epoch": 0.09712296133406634, "grad_norm": 0.09312908351421356, "learning_rate": 9.974152073009506e-06, "loss": 0.0018878720700740814, "memory(GiB)": 158.4, "step": 265, "token_acc": 0.9994954167017072, "train_speed(iter/s)": 0.045482 }, { "epoch": 0.09895547003848268, "grad_norm": 0.06397019326686859, "learning_rate": 9.973168359252411e-06, "loss": 0.0020165286958217623, "memory(GiB)": 158.4, "step": 270, "token_acc": 0.9994108735903047, "train_speed(iter/s)": 0.045476 }, { "epoch": 0.10078797874289903, "grad_norm": 0.15306073427200317, "learning_rate": 9.972166324817338e-06, "loss": 0.0017529357224702834, "memory(GiB)": 158.4, "step": 275, "token_acc": 0.9997474109623642, "train_speed(iter/s)": 0.045455 }, { "epoch": 0.10262048744731538, "grad_norm": 0.13208770751953125, "learning_rate": 9.971145973395685e-06, "loss": 0.001645715907216072, "memory(GiB)": 158.4, "step": 280, "token_acc": 0.9996632713191346, "train_speed(iter/s)": 0.045452 }, { "epoch": 0.10445299615173172, "grad_norm": 0.0297766774892807, "learning_rate": 9.97010730874633e-06, "loss": 0.0012823720462620258, "memory(GiB)": 158.4, "step": 285, "token_acc": 0.9996632713191346, "train_speed(iter/s)": 0.045422 }, { "epoch": 0.10628550485614807, "grad_norm": 0.16176588833332062, "learning_rate": 9.969050334695619e-06, "loss": 0.001742975413799286, "memory(GiB)": 158.4, "step": 290, "token_acc": 0.9995788764423482, "train_speed(iter/s)": 0.045417 }, { "epoch": 0.10811801356056441, "grad_norm": 0.10822831094264984, "learning_rate": 9.967975055137335e-06, "loss": 0.002227822504937649, "memory(GiB)": 158.4, "step": 295, "token_acc": 0.9994103773584906, "train_speed(iter/s)": 0.045373 }, { "epoch": 0.10995052226498075, "grad_norm": 0.1328648328781128, "learning_rate": 9.966881474032711e-06, "loss": 0.0017272233963012695, "memory(GiB)": 158.4, "step": 300, "token_acc": 0.9994105759514988, "train_speed(iter/s)": 0.045362 }, { "epoch": 0.11178303096939711, "grad_norm": 0.11945555359125137, "learning_rate": 9.965769595410395e-06, "loss": 0.0011399961076676846, "memory(GiB)": 158.4, "step": 305, "token_acc": 0.9995791954216462, "train_speed(iter/s)": 0.045363 }, { "epoch": 0.11361553967381345, "grad_norm": 0.2175164371728897, "learning_rate": 9.964639423366442e-06, "loss": 0.0025836611166596413, "memory(GiB)": 158.4, "step": 310, "token_acc": 0.9990738401953355, "train_speed(iter/s)": 0.045357 }, { "epoch": 0.11544804837822979, "grad_norm": 0.035975273698568344, "learning_rate": 9.963490962064297e-06, "loss": 0.0006968880537897348, "memory(GiB)": 158.4, "step": 315, "token_acc": 0.9997475385003787, "train_speed(iter/s)": 0.04536 }, { "epoch": 0.11728055708264615, "grad_norm": 0.14850489795207977, "learning_rate": 9.962324215734782e-06, "loss": 0.0017726331949234008, "memory(GiB)": 158.4, "step": 320, "token_acc": 0.999242615501136, "train_speed(iter/s)": 0.045365 }, { "epoch": 0.11911306578706249, "grad_norm": 0.03455163165926933, "learning_rate": 9.96113918867608e-06, "loss": 0.0013269748538732528, "memory(GiB)": 158.4, "step": 325, "token_acc": 0.9997475172529877, "train_speed(iter/s)": 0.045365 }, { "epoch": 0.12094557449147883, "grad_norm": 0.23186658322811127, "learning_rate": 9.959935885253715e-06, "loss": 0.0010508694685995579, "memory(GiB)": 158.4, "step": 330, "token_acc": 0.9998317064961293, "train_speed(iter/s)": 0.045369 }, { "epoch": 0.12277808319589519, "grad_norm": 0.06666416674852371, "learning_rate": 9.958714309900546e-06, "loss": 0.0009142296388745308, "memory(GiB)": 158.4, "step": 335, "token_acc": 0.9995789119083712, "train_speed(iter/s)": 0.045376 }, { "epoch": 0.12461059190031153, "grad_norm": 0.014640443958342075, "learning_rate": 9.957474467116739e-06, "loss": 0.0024377334862947463, "memory(GiB)": 158.4, "step": 340, "token_acc": 0.9992424880060601, "train_speed(iter/s)": 0.045382 }, { "epoch": 0.12644310060472788, "grad_norm": 0.15044739842414856, "learning_rate": 9.956216361469755e-06, "loss": 0.002022208273410797, "memory(GiB)": 158.4, "step": 345, "token_acc": 0.9994952893674294, "train_speed(iter/s)": 0.045388 }, { "epoch": 0.12827560930914422, "grad_norm": 0.012829025276005268, "learning_rate": 9.954939997594335e-06, "loss": 0.003057861886918545, "memory(GiB)": 158.4, "step": 350, "token_acc": 0.9992422966829433, "train_speed(iter/s)": 0.045394 }, { "epoch": 0.13010811801356056, "grad_norm": 0.02966240420937538, "learning_rate": 9.953645380192485e-06, "loss": 0.0017476610839366913, "memory(GiB)": 158.4, "step": 355, "token_acc": 0.999663356337317, "train_speed(iter/s)": 0.045399 }, { "epoch": 0.1319406267179769, "grad_norm": 0.0715402215719223, "learning_rate": 9.952332514033449e-06, "loss": 0.0023743031546473504, "memory(GiB)": 158.4, "step": 360, "token_acc": 0.9991585324806462, "train_speed(iter/s)": 0.045407 }, { "epoch": 0.13377313542239325, "grad_norm": 0.07701452821493149, "learning_rate": 9.9510014039537e-06, "loss": 0.0022863084450364113, "memory(GiB)": 158.4, "step": 365, "token_acc": 0.9994110222970131, "train_speed(iter/s)": 0.04541 }, { "epoch": 0.13560564412680962, "grad_norm": 0.09453430771827698, "learning_rate": 9.949652054856924e-06, "loss": 0.0019000820815563203, "memory(GiB)": 158.4, "step": 370, "token_acc": 0.9993265426382693, "train_speed(iter/s)": 0.045415 }, { "epoch": 0.13743815283122596, "grad_norm": 0.0394257977604866, "learning_rate": 9.948284471713994e-06, "loss": 0.0016634922474622726, "memory(GiB)": 158.4, "step": 375, "token_acc": 0.9994104766717197, "train_speed(iter/s)": 0.045419 }, { "epoch": 0.1392706615356423, "grad_norm": 0.04517311230301857, "learning_rate": 9.94689865956295e-06, "loss": 0.0017285166308283807, "memory(GiB)": 158.4, "step": 380, "token_acc": 0.9994948644552955, "train_speed(iter/s)": 0.045425 }, { "epoch": 0.14110317024005864, "grad_norm": 0.07294133305549622, "learning_rate": 9.945494623509003e-06, "loss": 0.000422241585329175, "memory(GiB)": 158.4, "step": 385, "token_acc": 0.9999158390843292, "train_speed(iter/s)": 0.045427 }, { "epoch": 0.14293567894447498, "grad_norm": 0.06523015350103378, "learning_rate": 9.944072368724476e-06, "loss": 0.0024235062301158905, "memory(GiB)": 158.4, "step": 390, "token_acc": 0.9994953318193288, "train_speed(iter/s)": 0.045433 }, { "epoch": 0.14476818764889132, "grad_norm": 0.0444883331656456, "learning_rate": 9.942631900448827e-06, "loss": 0.0009868125431239604, "memory(GiB)": 158.4, "step": 395, "token_acc": 0.999663356337317, "train_speed(iter/s)": 0.045437 }, { "epoch": 0.1466006963533077, "grad_norm": 0.01692277006804943, "learning_rate": 9.941173223988603e-06, "loss": 0.0023114632815122603, "memory(GiB)": 158.4, "step": 400, "token_acc": 0.9993263725159987, "train_speed(iter/s)": 0.045442 }, { "epoch": 0.14843320505772403, "grad_norm": 0.02756733074784279, "learning_rate": 9.939696344717427e-06, "loss": 0.0015292948111891747, "memory(GiB)": 158.4, "step": 405, "token_acc": 0.9994107744107744, "train_speed(iter/s)": 0.045444 }, { "epoch": 0.15026571376214037, "grad_norm": 0.09074392169713974, "learning_rate": 9.938201268075982e-06, "loss": 0.0020554307848215103, "memory(GiB)": 158.4, "step": 410, "token_acc": 0.9992423604680528, "train_speed(iter/s)": 0.045448 }, { "epoch": 0.15209822246655672, "grad_norm": 0.07123276591300964, "learning_rate": 9.936687999571987e-06, "loss": 0.0014599796384572982, "memory(GiB)": 158.4, "step": 415, "token_acc": 0.9994952044422009, "train_speed(iter/s)": 0.045449 }, { "epoch": 0.15393073117097306, "grad_norm": 0.07088897377252579, "learning_rate": 9.935156544780183e-06, "loss": 0.0010397397913038731, "memory(GiB)": 158.4, "step": 420, "token_acc": 0.9996633846671716, "train_speed(iter/s)": 0.045448 }, { "epoch": 0.1557632398753894, "grad_norm": 0.1305522322654724, "learning_rate": 9.9336069093423e-06, "loss": 0.0015219044871628284, "memory(GiB)": 158.4, "step": 425, "token_acc": 0.9994950770007573, "train_speed(iter/s)": 0.045451 }, { "epoch": 0.15759574857980577, "grad_norm": 0.03542817756533623, "learning_rate": 9.932039098967046e-06, "loss": 0.002127250283956528, "memory(GiB)": 158.4, "step": 430, "token_acc": 0.9994949494949495, "train_speed(iter/s)": 0.045456 }, { "epoch": 0.1594282572842221, "grad_norm": 0.14930537343025208, "learning_rate": 9.930453119430086e-06, "loss": 0.000645923474803567, "memory(GiB)": 158.4, "step": 435, "token_acc": 0.9997474960020201, "train_speed(iter/s)": 0.045458 }, { "epoch": 0.16126076598863845, "grad_norm": 0.10225468873977661, "learning_rate": 9.92884897657402e-06, "loss": 0.000911066122353077, "memory(GiB)": 158.4, "step": 440, "token_acc": 0.9997473471450228, "train_speed(iter/s)": 0.045415 }, { "epoch": 0.1630932746930548, "grad_norm": 0.05018873140215874, "learning_rate": 9.927226676308354e-06, "loss": 0.00166127011179924, "memory(GiB)": 158.4, "step": 445, "token_acc": 0.9997476871320438, "train_speed(iter/s)": 0.045381 }, { "epoch": 0.16492578339747113, "grad_norm": 0.17071396112442017, "learning_rate": 9.925586224609489e-06, "loss": 0.0025668978691101075, "memory(GiB)": 158.4, "step": 450, "token_acc": 0.9994110718492344, "train_speed(iter/s)": 0.045381 }, { "epoch": 0.16675829210188747, "grad_norm": 0.008416908793151379, "learning_rate": 9.923927627520694e-06, "loss": 0.000798144843429327, "memory(GiB)": 158.4, "step": 455, "token_acc": 0.9996632713191346, "train_speed(iter/s)": 0.045382 }, { "epoch": 0.16859080080630384, "grad_norm": 0.1326538473367691, "learning_rate": 9.922250891152078e-06, "loss": 0.0013757062144577504, "memory(GiB)": 158.4, "step": 460, "token_acc": 0.9994102780117945, "train_speed(iter/s)": 0.045388 }, { "epoch": 0.17042330951072018, "grad_norm": 0.10151144862174988, "learning_rate": 9.92055602168058e-06, "loss": 0.0008957336656749248, "memory(GiB)": 158.4, "step": 465, "token_acc": 0.9996634696281339, "train_speed(iter/s)": 0.045392 }, { "epoch": 0.17225581821513652, "grad_norm": 0.09111111611127853, "learning_rate": 9.918843025349941e-06, "loss": 0.0013033418916165828, "memory(GiB)": 158.4, "step": 470, "token_acc": 0.9995792308339645, "train_speed(iter/s)": 0.045396 }, { "epoch": 0.17408832691955287, "grad_norm": 0.029473107308149338, "learning_rate": 9.917111908470673e-06, "loss": 0.0013312675058841706, "memory(GiB)": 158.4, "step": 475, "token_acc": 0.9996632713191346, "train_speed(iter/s)": 0.0454 }, { "epoch": 0.1759208356239692, "grad_norm": 0.1001836434006691, "learning_rate": 9.915362677420045e-06, "loss": 0.0019773678854107858, "memory(GiB)": 158.4, "step": 480, "token_acc": 0.9991580365412142, "train_speed(iter/s)": 0.045406 }, { "epoch": 0.17775334432838555, "grad_norm": 0.047665633261203766, "learning_rate": 9.913595338642059e-06, "loss": 0.0014092091470956803, "memory(GiB)": 158.4, "step": 485, "token_acc": 0.9997473896934995, "train_speed(iter/s)": 0.045408 }, { "epoch": 0.17958585303280192, "grad_norm": 0.02579871006309986, "learning_rate": 9.91180989864742e-06, "loss": 0.0007158961612731219, "memory(GiB)": 158.4, "step": 490, "token_acc": 0.9996631862579993, "train_speed(iter/s)": 0.045412 }, { "epoch": 0.18141836173721826, "grad_norm": 0.028310472145676613, "learning_rate": 9.910006364013522e-06, "loss": 0.0007194250822067261, "memory(GiB)": 158.4, "step": 495, "token_acc": 0.9997475172529877, "train_speed(iter/s)": 0.045414 }, { "epoch": 0.1832508704416346, "grad_norm": 0.12511947751045227, "learning_rate": 9.908184741384412e-06, "loss": 0.0015858769416809081, "memory(GiB)": 158.4, "step": 500, "token_acc": 0.999663242970197, "train_speed(iter/s)": 0.045418 }, { "epoch": 0.1832508704416346, "eval_loss": 0.0010450058616697788, "eval_runtime": 172.5985, "eval_samples_per_second": 2.549, "eval_steps_per_second": 2.549, "eval_token_acc": 0.9996786189798448, "step": 500 }, { "epoch": 0.18508337914605094, "grad_norm": 0.007651148363947868, "learning_rate": 9.906345037470776e-06, "loss": 0.0017563182860612868, "memory(GiB)": 160.86, "step": 505, "token_acc": 0.9996503360658923, "train_speed(iter/s)": 0.0421 }, { "epoch": 0.18691588785046728, "grad_norm": 0.06049024686217308, "learning_rate": 9.904487259049907e-06, "loss": 0.0015754606574773788, "memory(GiB)": 160.86, "step": 510, "token_acc": 0.9991582491582491, "train_speed(iter/s)": 0.0421 }, { "epoch": 0.18874839655488362, "grad_norm": 0.06416209042072296, "learning_rate": 9.902611412965681e-06, "loss": 0.0016123156994581223, "memory(GiB)": 160.86, "step": 515, "token_acc": 0.9994950770007573, "train_speed(iter/s)": 0.042128 }, { "epoch": 0.1905809052593, "grad_norm": 0.028774991631507874, "learning_rate": 9.90071750612854e-06, "loss": 0.001327525917440653, "memory(GiB)": 160.86, "step": 520, "token_acc": 0.999326825984517, "train_speed(iter/s)": 0.042161 }, { "epoch": 0.19241341396371633, "grad_norm": 0.01806553080677986, "learning_rate": 9.898805545515455e-06, "loss": 0.0018014278262853622, "memory(GiB)": 160.86, "step": 525, "token_acc": 0.999494779386999, "train_speed(iter/s)": 0.042191 }, { "epoch": 0.19424592266813268, "grad_norm": 0.022810854017734528, "learning_rate": 9.896875538169906e-06, "loss": 0.0012151801958680153, "memory(GiB)": 160.86, "step": 530, "token_acc": 0.9996629876147949, "train_speed(iter/s)": 0.042224 }, { "epoch": 0.19607843137254902, "grad_norm": 0.11561686545610428, "learning_rate": 9.894927491201856e-06, "loss": 0.0021266091614961626, "memory(GiB)": 160.86, "step": 535, "token_acc": 0.9994109727364524, "train_speed(iter/s)": 0.042231 }, { "epoch": 0.19791094007696536, "grad_norm": 0.06175706535577774, "learning_rate": 9.892961411787725e-06, "loss": 0.0011159414425492287, "memory(GiB)": 160.86, "step": 540, "token_acc": 0.9996632146164857, "train_speed(iter/s)": 0.042227 }, { "epoch": 0.1997434487813817, "grad_norm": 0.05753181502223015, "learning_rate": 9.890977307170362e-06, "loss": 0.001347663253545761, "memory(GiB)": 160.86, "step": 545, "token_acc": 0.9994108240047134, "train_speed(iter/s)": 0.042253 }, { "epoch": 0.20157595748579807, "grad_norm": 0.02328096143901348, "learning_rate": 9.888975184659018e-06, "loss": 0.0003634607419371605, "memory(GiB)": 160.86, "step": 550, "token_acc": 1.0, "train_speed(iter/s)": 0.042275 }, { "epoch": 0.2034084661902144, "grad_norm": 0.06188211217522621, "learning_rate": 9.886955051629322e-06, "loss": 0.001550444681197405, "memory(GiB)": 160.86, "step": 555, "token_acc": 0.9994948219247285, "train_speed(iter/s)": 0.042291 }, { "epoch": 0.20524097489463075, "grad_norm": 0.1453787237405777, "learning_rate": 9.88491691552325e-06, "loss": 0.001519276574254036, "memory(GiB)": 160.86, "step": 560, "token_acc": 0.9992421052631579, "train_speed(iter/s)": 0.042293 }, { "epoch": 0.2070734835990471, "grad_norm": 0.023789288476109505, "learning_rate": 9.882860783849106e-06, "loss": 0.00029240711592137814, "memory(GiB)": 160.86, "step": 565, "token_acc": 0.9999158107425492, "train_speed(iter/s)": 0.042306 }, { "epoch": 0.20890599230346343, "grad_norm": 0.01045987755060196, "learning_rate": 9.880786664181477e-06, "loss": 0.0012256539426743983, "memory(GiB)": 160.86, "step": 570, "token_acc": 0.999579018270607, "train_speed(iter/s)": 0.042302 }, { "epoch": 0.21073850100787977, "grad_norm": 0.011777155101299286, "learning_rate": 9.878694564161227e-06, "loss": 0.00046466137282550333, "memory(GiB)": 160.86, "step": 575, "token_acc": 0.9998316214850985, "train_speed(iter/s)": 0.042318 }, { "epoch": 0.21257100971229614, "grad_norm": 0.23171444237232208, "learning_rate": 9.876584491495448e-06, "loss": 0.0011185991577804088, "memory(GiB)": 160.86, "step": 580, "token_acc": 0.9996632713191346, "train_speed(iter/s)": 0.042309 }, { "epoch": 0.21440351841671249, "grad_norm": 0.049548666924238205, "learning_rate": 9.87445645395745e-06, "loss": 0.0009535157121717929, "memory(GiB)": 160.86, "step": 585, "token_acc": 0.9995791954216462, "train_speed(iter/s)": 0.042334 }, { "epoch": 0.21623602712112883, "grad_norm": 0.022135065868496895, "learning_rate": 9.87231045938672e-06, "loss": 0.0012145033106207848, "memory(GiB)": 160.86, "step": 590, "token_acc": 0.999663129526697, "train_speed(iter/s)": 0.042348 }, { "epoch": 0.21806853582554517, "grad_norm": 0.20922328531742096, "learning_rate": 9.870146515688896e-06, "loss": 0.0015425698831677437, "memory(GiB)": 160.86, "step": 595, "token_acc": 0.999578947368421, "train_speed(iter/s)": 0.04231 }, { "epoch": 0.2199010445299615, "grad_norm": 0.027032975107431412, "learning_rate": 9.867964630835742e-06, "loss": 0.00022940777707844973, "memory(GiB)": 160.86, "step": 600, "token_acc": 0.9999158320006734, "train_speed(iter/s)": 0.042103 }, { "epoch": 0.22173355323437785, "grad_norm": 0.016112059354782104, "learning_rate": 9.865764812865113e-06, "loss": 0.0013837903738021851, "memory(GiB)": 160.86, "step": 605, "token_acc": 0.9996631862579993, "train_speed(iter/s)": 0.042119 }, { "epoch": 0.22356606193879422, "grad_norm": 0.03569135442376137, "learning_rate": 9.863547069880928e-06, "loss": 0.002841825969517231, "memory(GiB)": 160.86, "step": 610, "token_acc": 0.9993265993265993, "train_speed(iter/s)": 0.042143 }, { "epoch": 0.22539857064321056, "grad_norm": 0.04555279016494751, "learning_rate": 9.86131141005314e-06, "loss": 0.012712681293487548, "memory(GiB)": 160.86, "step": 615, "token_acc": 0.9986531986531987, "train_speed(iter/s)": 0.042163 }, { "epoch": 0.2272310793476269, "grad_norm": 0.09330299496650696, "learning_rate": 9.859057841617709e-06, "loss": 0.007313913106918335, "memory(GiB)": 160.86, "step": 620, "token_acc": 0.9966310115387855, "train_speed(iter/s)": 0.042185 }, { "epoch": 0.22906358805204324, "grad_norm": 0.04176206886768341, "learning_rate": 9.856786372876565e-06, "loss": 0.0030346425250172616, "memory(GiB)": 160.86, "step": 625, "token_acc": 0.9989054475035783, "train_speed(iter/s)": 0.042207 }, { "epoch": 0.23089609675645958, "grad_norm": 0.0391584113240242, "learning_rate": 9.854497012197581e-06, "loss": 0.0021283647045493128, "memory(GiB)": 160.86, "step": 630, "token_acc": 0.999494779386999, "train_speed(iter/s)": 0.042231 }, { "epoch": 0.23272860546087593, "grad_norm": 0.06570518761873245, "learning_rate": 9.852189768014547e-06, "loss": 0.0012692485004663467, "memory(GiB)": 160.86, "step": 635, "token_acc": 0.9994950770007573, "train_speed(iter/s)": 0.042249 }, { "epoch": 0.2345611141652923, "grad_norm": 0.04750160127878189, "learning_rate": 9.849864648827126e-06, "loss": 0.001050265971571207, "memory(GiB)": 160.86, "step": 640, "token_acc": 0.9996630727762803, "train_speed(iter/s)": 0.042274 }, { "epoch": 0.23639362286970864, "grad_norm": 0.012142885476350784, "learning_rate": 9.847521663200837e-06, "loss": 0.00046721328981220723, "memory(GiB)": 160.86, "step": 645, "token_acc": 0.9998315363881402, "train_speed(iter/s)": 0.042296 }, { "epoch": 0.23822613157412498, "grad_norm": 0.0755368173122406, "learning_rate": 9.845160819767017e-06, "loss": 0.0013550316914916038, "memory(GiB)": 160.86, "step": 650, "token_acc": 0.9995790891489182, "train_speed(iter/s)": 0.042321 }, { "epoch": 0.24005864027854132, "grad_norm": 0.07237580418586731, "learning_rate": 9.842782127222786e-06, "loss": 0.002187203988432884, "memory(GiB)": 160.86, "step": 655, "token_acc": 0.9994101786316144, "train_speed(iter/s)": 0.042344 }, { "epoch": 0.24189114898295766, "grad_norm": 0.043931830674409866, "learning_rate": 9.840385594331022e-06, "loss": 0.0009523511864244938, "memory(GiB)": 160.86, "step": 660, "token_acc": 0.9997474534893509, "train_speed(iter/s)": 0.042366 }, { "epoch": 0.243723657687374, "grad_norm": 0.008748149499297142, "learning_rate": 9.837971229920324e-06, "loss": 0.0016139259561896325, "memory(GiB)": 160.86, "step": 665, "token_acc": 0.9994108240047134, "train_speed(iter/s)": 0.042378 }, { "epoch": 0.24555616639179037, "grad_norm": 0.12863993644714355, "learning_rate": 9.83553904288498e-06, "loss": 0.001357206143438816, "memory(GiB)": 160.86, "step": 670, "token_acc": 0.9993265993265993, "train_speed(iter/s)": 0.042358 }, { "epoch": 0.2473886750962067, "grad_norm": 0.08388248831033707, "learning_rate": 9.833089042184933e-06, "loss": 0.0016548488289117812, "memory(GiB)": 160.86, "step": 675, "token_acc": 0.9994950345059754, "train_speed(iter/s)": 0.042379 }, { "epoch": 0.24922118380062305, "grad_norm": 0.09960606694221497, "learning_rate": 9.830621236845755e-06, "loss": 0.0014729213900864125, "memory(GiB)": 160.86, "step": 680, "token_acc": 0.9994103773584906, "train_speed(iter/s)": 0.042403 }, { "epoch": 0.2510536925050394, "grad_norm": 0.07054334878921509, "learning_rate": 9.828135635958602e-06, "loss": 0.0012276002205908298, "memory(GiB)": 160.86, "step": 685, "token_acc": 0.99949499200404, "train_speed(iter/s)": 0.042425 }, { "epoch": 0.25288620120945576, "grad_norm": 0.011227499693632126, "learning_rate": 9.825632248680195e-06, "loss": 0.0014451307244598866, "memory(GiB)": 160.86, "step": 690, "token_acc": 0.9996632713191346, "train_speed(iter/s)": 0.042448 }, { "epoch": 0.2547187099138721, "grad_norm": 0.09235574305057526, "learning_rate": 9.82311108423277e-06, "loss": 0.001263285707682371, "memory(GiB)": 160.86, "step": 695, "token_acc": 0.9995789119083712, "train_speed(iter/s)": 0.04247 }, { "epoch": 0.25655121861828845, "grad_norm": 0.045791253447532654, "learning_rate": 9.82057215190406e-06, "loss": 0.0009290166199207306, "memory(GiB)": 160.86, "step": 700, "token_acc": 0.9998316498316498, "train_speed(iter/s)": 0.042485 }, { "epoch": 0.25838372732270476, "grad_norm": 0.07074666768312454, "learning_rate": 9.818015461047246e-06, "loss": 0.0015341023914515971, "memory(GiB)": 160.86, "step": 705, "token_acc": 0.99949499200404, "train_speed(iter/s)": 0.042504 }, { "epoch": 0.26021623602712113, "grad_norm": 0.1540241241455078, "learning_rate": 9.815441021080935e-06, "loss": 0.0007845636457204819, "memory(GiB)": 160.86, "step": 710, "token_acc": 0.9997473896934995, "train_speed(iter/s)": 0.042523 }, { "epoch": 0.2620487447315375, "grad_norm": 0.033406198024749756, "learning_rate": 9.812848841489118e-06, "loss": 0.0012617891654372216, "memory(GiB)": 160.86, "step": 715, "token_acc": 0.9994950345059754, "train_speed(iter/s)": 0.042542 }, { "epoch": 0.2638812534359538, "grad_norm": 0.09797952324151993, "learning_rate": 9.810238931821139e-06, "loss": 0.0005904140882194043, "memory(GiB)": 160.86, "step": 720, "token_acc": 0.9998315931289997, "train_speed(iter/s)": 0.042558 }, { "epoch": 0.2657137621403702, "grad_norm": 0.004131863825023174, "learning_rate": 9.807611301691656e-06, "loss": 0.0003168722614645958, "memory(GiB)": 160.86, "step": 725, "token_acc": 0.9998316498316498, "train_speed(iter/s)": 0.042576 }, { "epoch": 0.2675462708447865, "grad_norm": 0.0872625857591629, "learning_rate": 9.804965960780603e-06, "loss": 0.0018875803798437119, "memory(GiB)": 160.86, "step": 730, "token_acc": 0.9993262023077571, "train_speed(iter/s)": 0.042595 }, { "epoch": 0.26937877954920286, "grad_norm": 0.03825852647423744, "learning_rate": 9.80230291883317e-06, "loss": 0.0008161487989127636, "memory(GiB)": 160.86, "step": 735, "token_acc": 0.9996634129922585, "train_speed(iter/s)": 0.042614 }, { "epoch": 0.27121128825361923, "grad_norm": 0.09421674907207489, "learning_rate": 9.799622185659748e-06, "loss": 0.0013505241833627224, "memory(GiB)": 160.86, "step": 740, "token_acc": 0.9995793016407236, "train_speed(iter/s)": 0.0426 }, { "epoch": 0.27304379695803555, "grad_norm": 0.008914557285606861, "learning_rate": 9.7969237711359e-06, "loss": 0.0008496672846376896, "memory(GiB)": 160.86, "step": 745, "token_acc": 0.9995792662403231, "train_speed(iter/s)": 0.042614 }, { "epoch": 0.2748763056624519, "grad_norm": 0.05403187870979309, "learning_rate": 9.79420768520233e-06, "loss": 0.00033216315787285564, "memory(GiB)": 160.86, "step": 750, "token_acc": 0.9998315221969506, "train_speed(iter/s)": 0.042627 }, { "epoch": 0.2767088143668682, "grad_norm": 0.07824942469596863, "learning_rate": 9.791473937864838e-06, "loss": 0.0009146830998361111, "memory(GiB)": 160.86, "step": 755, "token_acc": 0.9997474747474747, "train_speed(iter/s)": 0.042645 }, { "epoch": 0.2785413230712846, "grad_norm": 0.059788450598716736, "learning_rate": 9.788722539194291e-06, "loss": 0.0014368345960974692, "memory(GiB)": 160.86, "step": 760, "token_acc": 0.9998316214850985, "train_speed(iter/s)": 0.042663 }, { "epoch": 0.2803738317757009, "grad_norm": 0.03711073473095894, "learning_rate": 9.785953499326575e-06, "loss": 0.0013325980864465237, "memory(GiB)": 160.86, "step": 765, "token_acc": 0.9994953742640875, "train_speed(iter/s)": 0.042681 }, { "epoch": 0.2822063404801173, "grad_norm": 0.024719931185245514, "learning_rate": 9.783166828462573e-06, "loss": 0.002364422380924225, "memory(GiB)": 160.86, "step": 770, "token_acc": 0.9992422328870927, "train_speed(iter/s)": 0.0427 }, { "epoch": 0.28403884918453365, "grad_norm": 0.03786981478333473, "learning_rate": 9.780362536868113e-06, "loss": 0.0009791357442736626, "memory(GiB)": 160.86, "step": 775, "token_acc": 0.9999158178297837, "train_speed(iter/s)": 0.042719 }, { "epoch": 0.28587135788894996, "grad_norm": 0.1868947595357895, "learning_rate": 9.777540634873939e-06, "loss": 0.0009650942869484424, "memory(GiB)": 160.86, "step": 780, "token_acc": 0.9996632996632997, "train_speed(iter/s)": 0.042737 }, { "epoch": 0.28770386659336633, "grad_norm": 0.015713131055235863, "learning_rate": 9.774701132875665e-06, "loss": 0.0007482931017875671, "memory(GiB)": 160.86, "step": 785, "token_acc": 0.9997473045822103, "train_speed(iter/s)": 0.042755 }, { "epoch": 0.28953637529778264, "grad_norm": 0.0045456611551344395, "learning_rate": 9.771844041333751e-06, "loss": 0.0009433764033019542, "memory(GiB)": 160.86, "step": 790, "token_acc": 0.9998316214850985, "train_speed(iter/s)": 0.042771 }, { "epoch": 0.291368884002199, "grad_norm": 0.01577194780111313, "learning_rate": 9.768969370773446e-06, "loss": 0.0004402685910463333, "memory(GiB)": 160.86, "step": 795, "token_acc": 0.9999158390843292, "train_speed(iter/s)": 0.042788 }, { "epoch": 0.2932013927066154, "grad_norm": 0.022222327068448067, "learning_rate": 9.766077131784764e-06, "loss": 0.0012076054699718952, "memory(GiB)": 160.86, "step": 800, "token_acc": 0.999663129526697, "train_speed(iter/s)": 0.042807 }, { "epoch": 0.2950339014110317, "grad_norm": 0.1063130721449852, "learning_rate": 9.763167335022437e-06, "loss": 0.0008463741280138493, "memory(GiB)": 160.86, "step": 805, "token_acc": 0.9997475597441938, "train_speed(iter/s)": 0.042824 }, { "epoch": 0.29686641011544807, "grad_norm": 0.018112968653440475, "learning_rate": 9.760239991205878e-06, "loss": 0.0014921230264008044, "memory(GiB)": 160.86, "step": 810, "token_acc": 0.9998317206562894, "train_speed(iter/s)": 0.042841 }, { "epoch": 0.2986989188198644, "grad_norm": 0.17134827375411987, "learning_rate": 9.757295111119142e-06, "loss": 0.0017302492633461952, "memory(GiB)": 160.86, "step": 815, "token_acc": 0.9994105263157895, "train_speed(iter/s)": 0.042859 }, { "epoch": 0.30053142752428075, "grad_norm": 0.1881178468465805, "learning_rate": 9.75433270561089e-06, "loss": 0.0018818458542227746, "memory(GiB)": 160.86, "step": 820, "token_acc": 0.9994947368421052, "train_speed(iter/s)": 0.042876 }, { "epoch": 0.30236393622869706, "grad_norm": 0.0701608955860138, "learning_rate": 9.751352785594337e-06, "loss": 0.0015649979934096337, "memory(GiB)": 160.86, "step": 825, "token_acc": 0.9994106255788499, "train_speed(iter/s)": 0.042892 }, { "epoch": 0.30419644493311343, "grad_norm": 0.11719143390655518, "learning_rate": 9.748355362047228e-06, "loss": 0.0022079024463891985, "memory(GiB)": 160.86, "step": 830, "token_acc": 0.9993266560053867, "train_speed(iter/s)": 0.042901 }, { "epoch": 0.3060289536375298, "grad_norm": 0.052010610699653625, "learning_rate": 9.745340446011782e-06, "loss": 0.0014782694168388843, "memory(GiB)": 160.86, "step": 835, "token_acc": 0.9994952044422009, "train_speed(iter/s)": 0.042913 }, { "epoch": 0.3078614623419461, "grad_norm": 0.04955873638391495, "learning_rate": 9.742308048594665e-06, "loss": 0.0016095375642180443, "memory(GiB)": 160.86, "step": 840, "token_acc": 0.9994949069787019, "train_speed(iter/s)": 0.04293 }, { "epoch": 0.3096939710463625, "grad_norm": 0.03515881672501564, "learning_rate": 9.73925818096694e-06, "loss": 0.0010076938197016716, "memory(GiB)": 160.86, "step": 845, "token_acc": 0.9996632713191346, "train_speed(iter/s)": 0.042945 }, { "epoch": 0.3115264797507788, "grad_norm": 0.05620809271931648, "learning_rate": 9.736190854364025e-06, "loss": 0.0021063588559627534, "memory(GiB)": 160.86, "step": 850, "token_acc": 0.999326485940394, "train_speed(iter/s)": 0.042961 }, { "epoch": 0.31335898845519516, "grad_norm": 0.03683305159211159, "learning_rate": 9.733106080085662e-06, "loss": 0.0005148151423782111, "memory(GiB)": 160.86, "step": 855, "token_acc": 0.9997475385003787, "train_speed(iter/s)": 0.042977 }, { "epoch": 0.31519149715961153, "grad_norm": 0.029852213338017464, "learning_rate": 9.730003869495863e-06, "loss": 0.0004310948308557272, "memory(GiB)": 160.86, "step": 860, "token_acc": 0.9998316073082428, "train_speed(iter/s)": 0.042993 }, { "epoch": 0.31702400586402785, "grad_norm": 0.0037861524615436792, "learning_rate": 9.726884234022877e-06, "loss": 0.0005989938508719206, "memory(GiB)": 160.86, "step": 865, "token_acc": 0.9998316640013467, "train_speed(iter/s)": 0.042978 }, { "epoch": 0.3188565145684442, "grad_norm": 0.04197857156395912, "learning_rate": 9.723747185159146e-06, "loss": 0.0018272759392857552, "memory(GiB)": 160.86, "step": 870, "token_acc": 0.9996634413125789, "train_speed(iter/s)": 0.042973 }, { "epoch": 0.32068902327286053, "grad_norm": 0.04336322471499443, "learning_rate": 9.720592734461257e-06, "loss": 0.0018274670466780663, "memory(GiB)": 160.86, "step": 875, "token_acc": 0.999578947368421, "train_speed(iter/s)": 0.042986 }, { "epoch": 0.3225215319772769, "grad_norm": 0.007882770150899887, "learning_rate": 9.717420893549902e-06, "loss": 0.0010360433720052243, "memory(GiB)": 160.86, "step": 880, "token_acc": 0.9994951619688683, "train_speed(iter/s)": 0.042996 }, { "epoch": 0.3243540406816932, "grad_norm": 0.03858296945691109, "learning_rate": 9.714231674109845e-06, "loss": 0.0016417885199189186, "memory(GiB)": 160.86, "step": 885, "token_acc": 0.9991580365412142, "train_speed(iter/s)": 0.042993 }, { "epoch": 0.3261865493861096, "grad_norm": 0.016526591032743454, "learning_rate": 9.711025087889866e-06, "loss": 0.0008385243825614452, "memory(GiB)": 160.86, "step": 890, "token_acc": 0.9999158461667929, "train_speed(iter/s)": 0.042974 }, { "epoch": 0.32801905809052595, "grad_norm": 0.011745758354663849, "learning_rate": 9.70780114670272e-06, "loss": 0.0007513574324548245, "memory(GiB)": 160.86, "step": 895, "token_acc": 0.9997475172529877, "train_speed(iter/s)": 0.042982 }, { "epoch": 0.32985156679494226, "grad_norm": 0.032515864819288254, "learning_rate": 9.704559862425101e-06, "loss": 0.000879857875406742, "memory(GiB)": 160.86, "step": 900, "token_acc": 0.9996632996632997, "train_speed(iter/s)": 0.042993 }, { "epoch": 0.33168407549935863, "grad_norm": 0.11071360856294632, "learning_rate": 9.701301246997592e-06, "loss": 0.0013037783093750477, "memory(GiB)": 160.86, "step": 905, "token_acc": 0.9994106751978448, "train_speed(iter/s)": 0.043 }, { "epoch": 0.33351658420377495, "grad_norm": 0.03765702247619629, "learning_rate": 9.698025312424619e-06, "loss": 0.0015159587375819684, "memory(GiB)": 160.86, "step": 910, "token_acc": 0.999579443182774, "train_speed(iter/s)": 0.043008 }, { "epoch": 0.3353490929081913, "grad_norm": 0.008713570423424244, "learning_rate": 9.694732070774415e-06, "loss": 0.00026825035456568, "memory(GiB)": 160.86, "step": 915, "token_acc": 1.0, "train_speed(iter/s)": 0.04298 }, { "epoch": 0.3371816016126077, "grad_norm": 0.07823354005813599, "learning_rate": 9.691421534178966e-06, "loss": 0.001245938241481781, "memory(GiB)": 160.86, "step": 920, "token_acc": 0.9994108240047134, "train_speed(iter/s)": 0.042955 }, { "epoch": 0.339014110317024, "grad_norm": 0.04400285705924034, "learning_rate": 9.688093714833975e-06, "loss": 0.000505279190838337, "memory(GiB)": 160.86, "step": 925, "token_acc": 0.9998317064961293, "train_speed(iter/s)": 0.042963 }, { "epoch": 0.34084661902144037, "grad_norm": 0.05997716262936592, "learning_rate": 9.68474862499881e-06, "loss": 0.001019585132598877, "memory(GiB)": 160.86, "step": 930, "token_acc": 0.9996631578947368, "train_speed(iter/s)": 0.042968 }, { "epoch": 0.3426791277258567, "grad_norm": 0.17811425030231476, "learning_rate": 9.681386276996462e-06, "loss": 0.0005352488718926906, "memory(GiB)": 160.86, "step": 935, "token_acc": 0.999831734814067, "train_speed(iter/s)": 0.042975 }, { "epoch": 0.34451163643027305, "grad_norm": 0.2344316691160202, "learning_rate": 9.678006683213503e-06, "loss": 0.0009379078634083271, "memory(GiB)": 160.86, "step": 940, "token_acc": 0.9997475385003787, "train_speed(iter/s)": 0.04297 }, { "epoch": 0.34634414513468936, "grad_norm": 0.06496769934892654, "learning_rate": 9.674609856100032e-06, "loss": 0.0008637402206659317, "memory(GiB)": 160.86, "step": 945, "token_acc": 0.9997474747474747, "train_speed(iter/s)": 0.04298 }, { "epoch": 0.34817665383910573, "grad_norm": 0.0862952470779419, "learning_rate": 9.671195808169639e-06, "loss": 0.0011458213441073895, "memory(GiB)": 160.86, "step": 950, "token_acc": 0.9997475172529877, "train_speed(iter/s)": 0.042994 }, { "epoch": 0.3500091625435221, "grad_norm": 0.016611328348517418, "learning_rate": 9.667764551999346e-06, "loss": 0.0010181719437241555, "memory(GiB)": 160.86, "step": 955, "token_acc": 0.999663242970197, "train_speed(iter/s)": 0.04297 }, { "epoch": 0.3518416712479384, "grad_norm": 0.08347468078136444, "learning_rate": 9.664316100229578e-06, "loss": 0.0007937697693705559, "memory(GiB)": 160.86, "step": 960, "token_acc": 0.9995793724236561, "train_speed(iter/s)": 0.042953 }, { "epoch": 0.3536741799523548, "grad_norm": 0.07462402433156967, "learning_rate": 9.660850465564101e-06, "loss": 0.0014566186815500259, "memory(GiB)": 160.86, "step": 965, "token_acc": 0.9995790537127462, "train_speed(iter/s)": 0.042967 }, { "epoch": 0.3555066886567711, "grad_norm": 0.031168634071946144, "learning_rate": 9.657367660769984e-06, "loss": 0.0008765817619860172, "memory(GiB)": 160.86, "step": 970, "token_acc": 0.9996635828427249, "train_speed(iter/s)": 0.04298 }, { "epoch": 0.35733919736118747, "grad_norm": 0.10647280514240265, "learning_rate": 9.653867698677543e-06, "loss": 0.0011190660297870636, "memory(GiB)": 160.86, "step": 975, "token_acc": 0.9996634979389248, "train_speed(iter/s)": 0.042993 }, { "epoch": 0.35917170606560384, "grad_norm": 0.041436877101659775, "learning_rate": 9.650350592180312e-06, "loss": 0.0012339851818978786, "memory(GiB)": 160.86, "step": 980, "token_acc": 0.9994108735903047, "train_speed(iter/s)": 0.043006 }, { "epoch": 0.36100421477002015, "grad_norm": 0.056029047816991806, "learning_rate": 9.646816354234968e-06, "loss": 0.0012508154846727847, "memory(GiB)": 160.86, "step": 985, "token_acc": 0.9996634129922585, "train_speed(iter/s)": 0.043019 }, { "epoch": 0.3628367234744365, "grad_norm": 0.016829386353492737, "learning_rate": 9.643264997861312e-06, "loss": 0.0006543456576764584, "memory(GiB)": 160.86, "step": 990, "token_acc": 0.9995789119083712, "train_speed(iter/s)": 0.043032 }, { "epoch": 0.36466923217885283, "grad_norm": 0.035343799740076065, "learning_rate": 9.6396965361422e-06, "loss": 0.0010605846531689168, "memory(GiB)": 160.86, "step": 995, "token_acc": 0.9996632146164857, "train_speed(iter/s)": 0.043044 }, { "epoch": 0.3665017408832692, "grad_norm": 0.1007576435804367, "learning_rate": 9.636110982223505e-06, "loss": 0.0017275510355830193, "memory(GiB)": 160.86, "step": 1000, "token_acc": 0.9993263157894737, "train_speed(iter/s)": 0.043052 }, { "epoch": 0.3665017408832692, "eval_loss": 0.0009223763481713831, "eval_runtime": 173.3991, "eval_samples_per_second": 2.537, "eval_steps_per_second": 2.537, "eval_token_acc": 0.9996633151217422, "step": 1000 }, { "epoch": 0.3683342495876855, "grad_norm": 0.005491136573255062, "learning_rate": 9.632508349314066e-06, "loss": 0.0003129460848867893, "memory(GiB)": 160.86, "step": 1005, "token_acc": 0.9997021844125912, "train_speed(iter/s)": 0.041292 }, { "epoch": 0.3701667582921019, "grad_norm": 0.052943065762519836, "learning_rate": 9.628888650685642e-06, "loss": 0.0011203167960047722, "memory(GiB)": 160.86, "step": 1010, "token_acc": 0.9996632713191346, "train_speed(iter/s)": 0.041243 }, { "epoch": 0.37199926699651825, "grad_norm": 0.03638750687241554, "learning_rate": 9.625251899672852e-06, "loss": 0.0004535942804068327, "memory(GiB)": 160.86, "step": 1015, "token_acc": 0.9997474747474747, "train_speed(iter/s)": 0.041263 }, { "epoch": 0.37383177570093457, "grad_norm": 0.010707657784223557, "learning_rate": 9.621598109673142e-06, "loss": 0.00024845553562045095, "memory(GiB)": 160.86, "step": 1020, "token_acc": 1.0, "train_speed(iter/s)": 0.041283 }, { "epoch": 0.37566428440535093, "grad_norm": 0.003029848216101527, "learning_rate": 9.617927294146726e-06, "loss": 0.000255924928933382, "memory(GiB)": 160.86, "step": 1025, "token_acc": 0.9999158178297837, "train_speed(iter/s)": 0.041303 }, { "epoch": 0.37749679310976725, "grad_norm": 0.002139889169484377, "learning_rate": 9.614239466616541e-06, "loss": 0.001936671696603298, "memory(GiB)": 160.86, "step": 1030, "token_acc": 0.9996633846671716, "train_speed(iter/s)": 0.041322 }, { "epoch": 0.3793293018141836, "grad_norm": 0.033104073256254196, "learning_rate": 9.61053464066819e-06, "loss": 0.0009706121869385243, "memory(GiB)": 160.86, "step": 1035, "token_acc": 0.9996632146164857, "train_speed(iter/s)": 0.041341 }, { "epoch": 0.3811618105186, "grad_norm": 0.02874094434082508, "learning_rate": 9.606812829949896e-06, "loss": 0.0007171142846345901, "memory(GiB)": 160.86, "step": 1040, "token_acc": 0.9999158178297837, "train_speed(iter/s)": 0.041359 }, { "epoch": 0.3829943192230163, "grad_norm": 0.13675667345523834, "learning_rate": 9.603074048172458e-06, "loss": 0.0008686968125402927, "memory(GiB)": 160.86, "step": 1045, "token_acc": 0.9998317489694625, "train_speed(iter/s)": 0.041378 }, { "epoch": 0.38482682792743267, "grad_norm": 0.325898677110672, "learning_rate": 9.599318309109191e-06, "loss": 0.001396147720515728, "memory(GiB)": 160.86, "step": 1050, "token_acc": 0.9995791245791246, "train_speed(iter/s)": 0.041398 }, { "epoch": 0.386659336631849, "grad_norm": 0.06272176653146744, "learning_rate": 9.595545626595878e-06, "loss": 0.002794544957578182, "memory(GiB)": 160.86, "step": 1055, "token_acc": 0.9992422966829433, "train_speed(iter/s)": 0.041416 }, { "epoch": 0.38849184533626535, "grad_norm": 0.019762301817536354, "learning_rate": 9.591756014530723e-06, "loss": 0.0009371510706841946, "memory(GiB)": 160.86, "step": 1060, "token_acc": 0.9996630727762803, "train_speed(iter/s)": 0.041434 }, { "epoch": 0.3903243540406817, "grad_norm": 0.09259835630655289, "learning_rate": 9.587949486874295e-06, "loss": 0.0013479561544954776, "memory(GiB)": 160.86, "step": 1065, "token_acc": 0.9995791245791246, "train_speed(iter/s)": 0.041453 }, { "epoch": 0.39215686274509803, "grad_norm": 0.05826210230588913, "learning_rate": 9.58412605764948e-06, "loss": 0.00075059924274683, "memory(GiB)": 160.86, "step": 1070, "token_acc": 0.9996632713191346, "train_speed(iter/s)": 0.041452 }, { "epoch": 0.3939893714495144, "grad_norm": 0.02435746043920517, "learning_rate": 9.580285740941425e-06, "loss": 0.0010668656788766385, "memory(GiB)": 160.86, "step": 1075, "token_acc": 0.9994948644552955, "train_speed(iter/s)": 0.04147 }, { "epoch": 0.3958218801539307, "grad_norm": 0.06046979874372482, "learning_rate": 9.57642855089749e-06, "loss": 0.0006216964218765497, "memory(GiB)": 160.86, "step": 1080, "token_acc": 0.9995790891489182, "train_speed(iter/s)": 0.041489 }, { "epoch": 0.3976543888583471, "grad_norm": 0.02380959317088127, "learning_rate": 9.572554501727198e-06, "loss": 0.000693302508443594, "memory(GiB)": 160.86, "step": 1085, "token_acc": 0.999663356337317, "train_speed(iter/s)": 0.041506 }, { "epoch": 0.3994868975627634, "grad_norm": 0.015010896138846874, "learning_rate": 9.568663607702174e-06, "loss": 0.0005827041808515787, "memory(GiB)": 160.86, "step": 1090, "token_acc": 0.9997476446837147, "train_speed(iter/s)": 0.041523 }, { "epoch": 0.40131940626717977, "grad_norm": 0.17055855691432953, "learning_rate": 9.564755883156103e-06, "loss": 0.0010279595851898193, "memory(GiB)": 160.86, "step": 1095, "token_acc": 0.9995791600033668, "train_speed(iter/s)": 0.041535 }, { "epoch": 0.40315191497159614, "grad_norm": 0.0005144431488588452, "learning_rate": 9.560831342484668e-06, "loss": 0.00026263915933668616, "memory(GiB)": 160.86, "step": 1100, "token_acc": 0.9999158249158249, "train_speed(iter/s)": 0.041545 }, { "epoch": 0.40498442367601245, "grad_norm": 0.019269630312919617, "learning_rate": 9.556890000145503e-06, "loss": 0.0010970150120556354, "memory(GiB)": 160.86, "step": 1105, "token_acc": 0.999663356337317, "train_speed(iter/s)": 0.041546 }, { "epoch": 0.4068169323804288, "grad_norm": 0.037301257252693176, "learning_rate": 9.552931870658136e-06, "loss": 0.001028469391167164, "memory(GiB)": 160.86, "step": 1110, "token_acc": 0.9996632996632997, "train_speed(iter/s)": 0.041562 }, { "epoch": 0.40864944108484513, "grad_norm": 0.006164327263832092, "learning_rate": 9.54895696860394e-06, "loss": 0.0005135733168572188, "memory(GiB)": 160.86, "step": 1115, "token_acc": 0.9998317064961293, "train_speed(iter/s)": 0.041578 }, { "epoch": 0.4104819497892615, "grad_norm": 0.1576082557439804, "learning_rate": 9.544965308626075e-06, "loss": 0.001076418813318014, "memory(GiB)": 160.86, "step": 1120, "token_acc": 0.9996634413125789, "train_speed(iter/s)": 0.041593 }, { "epoch": 0.41231445849367787, "grad_norm": 0.014838850125670433, "learning_rate": 9.540956905429435e-06, "loss": 0.000989390444010496, "memory(GiB)": 160.86, "step": 1125, "token_acc": 0.9994946091644205, "train_speed(iter/s)": 0.041608 }, { "epoch": 0.4141469671980942, "grad_norm": 0.014855766668915749, "learning_rate": 9.536931773780598e-06, "loss": 0.0015475031919777392, "memory(GiB)": 160.86, "step": 1130, "token_acc": 0.9994103276893269, "train_speed(iter/s)": 0.041623 }, { "epoch": 0.41597947590251055, "grad_norm": 0.019349105656147003, "learning_rate": 9.53288992850776e-06, "loss": 0.0005111652426421642, "memory(GiB)": 160.86, "step": 1135, "token_acc": 1.0, "train_speed(iter/s)": 0.041628 }, { "epoch": 0.41781198460692687, "grad_norm": 0.03461524471640587, "learning_rate": 9.528831384500699e-06, "loss": 0.0004519184119999409, "memory(GiB)": 160.86, "step": 1140, "token_acc": 0.9999158036541214, "train_speed(iter/s)": 0.041643 }, { "epoch": 0.41964449331134324, "grad_norm": 0.15801462531089783, "learning_rate": 9.5247561567107e-06, "loss": 0.00042958445847034453, "memory(GiB)": 160.86, "step": 1145, "token_acc": 0.9997474322276477, "train_speed(iter/s)": 0.041646 }, { "epoch": 0.42147700201575955, "grad_norm": 0.04607151448726654, "learning_rate": 9.520664260150513e-06, "loss": 0.0018787598237395287, "memory(GiB)": 160.86, "step": 1150, "token_acc": 0.9995792662403231, "train_speed(iter/s)": 0.04166 }, { "epoch": 0.4233095107201759, "grad_norm": 0.0973573699593544, "learning_rate": 9.5165557098943e-06, "loss": 0.0009789202362298966, "memory(GiB)": 160.86, "step": 1155, "token_acc": 0.9997473684210526, "train_speed(iter/s)": 0.041675 }, { "epoch": 0.4251420194245923, "grad_norm": 0.038962222635746, "learning_rate": 9.512430521077565e-06, "loss": 0.0009090069681406022, "memory(GiB)": 160.86, "step": 1160, "token_acc": 0.9997473896934995, "train_speed(iter/s)": 0.041686 }, { "epoch": 0.4269745281290086, "grad_norm": 0.010646538808941841, "learning_rate": 9.508288708897109e-06, "loss": 0.00033488136250525713, "memory(GiB)": 160.86, "step": 1165, "token_acc": 1.0, "train_speed(iter/s)": 0.041701 }, { "epoch": 0.42880703683342497, "grad_norm": 0.0063909804448485374, "learning_rate": 9.504130288610972e-06, "loss": 0.0002777322195470333, "memory(GiB)": 160.86, "step": 1170, "token_acc": 0.9999158178297837, "train_speed(iter/s)": 0.041717 }, { "epoch": 0.4306395455378413, "grad_norm": 0.0029652463272213936, "learning_rate": 9.499955275538384e-06, "loss": 0.0006769481580704451, "memory(GiB)": 160.86, "step": 1175, "token_acc": 0.9998316214850985, "train_speed(iter/s)": 0.041732 }, { "epoch": 0.43247205424225765, "grad_norm": 0.03148781880736351, "learning_rate": 9.495763685059689e-06, "loss": 0.0021369663998484613, "memory(GiB)": 160.86, "step": 1180, "token_acc": 0.9996631862579993, "train_speed(iter/s)": 0.041747 }, { "epoch": 0.434304562946674, "grad_norm": 0.0476820208132267, "learning_rate": 9.49155553261631e-06, "loss": 0.0006943107582628727, "memory(GiB)": 160.86, "step": 1185, "token_acc": 0.9999157610984752, "train_speed(iter/s)": 0.041763 }, { "epoch": 0.43613707165109034, "grad_norm": 0.006549006327986717, "learning_rate": 9.487330833710678e-06, "loss": 0.00024927293416112664, "memory(GiB)": 160.86, "step": 1190, "token_acc": 0.9999158532480646, "train_speed(iter/s)": 0.041649 }, { "epoch": 0.4379695803555067, "grad_norm": 0.030179157853126526, "learning_rate": 9.48308960390618e-06, "loss": 0.0010321117006242275, "memory(GiB)": 160.86, "step": 1195, "token_acc": 0.9997475597441938, "train_speed(iter/s)": 0.041664 }, { "epoch": 0.439802089059923, "grad_norm": 0.0033925846219062805, "learning_rate": 9.478831858827105e-06, "loss": 0.00027046091854572297, "memory(GiB)": 160.86, "step": 1200, "token_acc": 1.0, "train_speed(iter/s)": 0.041679 }, { "epoch": 0.4416345977643394, "grad_norm": 0.07267381250858307, "learning_rate": 9.474557614158575e-06, "loss": 0.0008655142039060593, "memory(GiB)": 160.86, "step": 1205, "token_acc": 0.9996632713191346, "train_speed(iter/s)": 0.041695 }, { "epoch": 0.4434671064687557, "grad_norm": 0.006699859630316496, "learning_rate": 9.470266885646504e-06, "loss": 0.0006839127279818058, "memory(GiB)": 160.86, "step": 1210, "token_acc": 0.9998316498316498, "train_speed(iter/s)": 0.04171 }, { "epoch": 0.44529961517317207, "grad_norm": 0.01745425909757614, "learning_rate": 9.465959689097525e-06, "loss": 0.0009552924893796444, "memory(GiB)": 160.86, "step": 1215, "token_acc": 0.9997473896934995, "train_speed(iter/s)": 0.041723 }, { "epoch": 0.44713212387758844, "grad_norm": 0.018873147666454315, "learning_rate": 9.461636040378941e-06, "loss": 0.0004271782469004393, "memory(GiB)": 160.86, "step": 1220, "token_acc": 0.9998315789473684, "train_speed(iter/s)": 0.04171 }, { "epoch": 0.44896463258200475, "grad_norm": 0.030013209208846092, "learning_rate": 9.45729595541866e-06, "loss": 0.0011812681332230568, "memory(GiB)": 160.86, "step": 1225, "token_acc": 0.9996633280026934, "train_speed(iter/s)": 0.041724 }, { "epoch": 0.4507971412864211, "grad_norm": 0.0008936990634538233, "learning_rate": 9.452939450205139e-06, "loss": 0.0004920902196317911, "memory(GiB)": 160.86, "step": 1230, "token_acc": 0.9996634129922585, "train_speed(iter/s)": 0.041738 }, { "epoch": 0.45262964999083743, "grad_norm": 0.06023690477013588, "learning_rate": 9.448566540787331e-06, "loss": 0.0010696605779230595, "memory(GiB)": 160.86, "step": 1235, "token_acc": 0.9996632713191346, "train_speed(iter/s)": 0.041753 }, { "epoch": 0.4544621586952538, "grad_norm": 0.05453835055232048, "learning_rate": 9.444177243274619e-06, "loss": 0.0011446685530245304, "memory(GiB)": 160.86, "step": 1240, "token_acc": 0.9994107248084856, "train_speed(iter/s)": 0.041767 }, { "epoch": 0.4562946673996702, "grad_norm": 0.06793410331010818, "learning_rate": 9.43977157383675e-06, "loss": 0.0017616702243685722, "memory(GiB)": 160.86, "step": 1245, "token_acc": 0.9994109231675503, "train_speed(iter/s)": 0.04178 }, { "epoch": 0.4581271761040865, "grad_norm": 0.03625203296542168, "learning_rate": 9.435349548703796e-06, "loss": 0.000555843859910965, "memory(GiB)": 160.86, "step": 1250, "token_acc": 0.9998317489694625, "train_speed(iter/s)": 0.041794 }, { "epoch": 0.45995968480850286, "grad_norm": 0.08264432102441788, "learning_rate": 9.430911184166074e-06, "loss": 0.0007446614094078541, "memory(GiB)": 160.86, "step": 1255, "token_acc": 0.9996634696281339, "train_speed(iter/s)": 0.041808 }, { "epoch": 0.46179219351291917, "grad_norm": 0.03210179880261421, "learning_rate": 9.426456496574095e-06, "loss": 0.0009373857639729977, "memory(GiB)": 160.86, "step": 1260, "token_acc": 0.9998315931289997, "train_speed(iter/s)": 0.041821 }, { "epoch": 0.46362470221733554, "grad_norm": 0.047844789922237396, "learning_rate": 9.421985502338505e-06, "loss": 0.0005674117710441351, "memory(GiB)": 160.86, "step": 1265, "token_acc": 0.9997473258654089, "train_speed(iter/s)": 0.041818 }, { "epoch": 0.46545721092175185, "grad_norm": 0.10654474049806595, "learning_rate": 9.417498217930017e-06, "loss": 0.0010964240878820418, "memory(GiB)": 160.86, "step": 1270, "token_acc": 0.9994948644552955, "train_speed(iter/s)": 0.04183 }, { "epoch": 0.4672897196261682, "grad_norm": 0.09114305675029755, "learning_rate": 9.412994659879362e-06, "loss": 0.0010675345547497272, "memory(GiB)": 160.86, "step": 1275, "token_acc": 0.9997476022211005, "train_speed(iter/s)": 0.041843 }, { "epoch": 0.4691222283305846, "grad_norm": 0.01834912970662117, "learning_rate": 9.408474844777218e-06, "loss": 0.0008592868223786354, "memory(GiB)": 160.86, "step": 1280, "token_acc": 0.9996632146164857, "train_speed(iter/s)": 0.041856 }, { "epoch": 0.4709547370350009, "grad_norm": 0.057866550981998444, "learning_rate": 9.403938789274152e-06, "loss": 0.0005749462172389031, "memory(GiB)": 160.86, "step": 1285, "token_acc": 0.9999158320006734, "train_speed(iter/s)": 0.041858 }, { "epoch": 0.4727872457394173, "grad_norm": 0.06462471187114716, "learning_rate": 9.39938651008056e-06, "loss": 0.00032207604963332417, "memory(GiB)": 160.86, "step": 1290, "token_acc": 0.9998316640013467, "train_speed(iter/s)": 0.041871 }, { "epoch": 0.4746197544438336, "grad_norm": 0.13423164188861847, "learning_rate": 9.394818023966604e-06, "loss": 0.0010271795094013215, "memory(GiB)": 160.86, "step": 1295, "token_acc": 0.9997475172529877, "train_speed(iter/s)": 0.041884 }, { "epoch": 0.47645226314824995, "grad_norm": 0.08763778209686279, "learning_rate": 9.39023334776215e-06, "loss": 0.0028607085347175597, "memory(GiB)": 160.86, "step": 1300, "token_acc": 0.9993261455525606, "train_speed(iter/s)": 0.041897 }, { "epoch": 0.4782847718526663, "grad_norm": 0.002933151787146926, "learning_rate": 9.385632498356713e-06, "loss": 0.00027030634228140114, "memory(GiB)": 160.86, "step": 1305, "token_acc": 1.0, "train_speed(iter/s)": 0.041909 }, { "epoch": 0.48011728055708264, "grad_norm": 0.04423481225967407, "learning_rate": 9.381015492699379e-06, "loss": 0.00081101693212986, "memory(GiB)": 160.86, "step": 1310, "token_acc": 0.9997474747474747, "train_speed(iter/s)": 0.041916 }, { "epoch": 0.481949789261499, "grad_norm": 0.02344198152422905, "learning_rate": 9.376382347798756e-06, "loss": 0.0003832927206531167, "memory(GiB)": 160.86, "step": 1315, "token_acc": 0.9998316073082428, "train_speed(iter/s)": 0.041926 }, { "epoch": 0.4837822979659153, "grad_norm": 0.016795309260487556, "learning_rate": 9.371733080722911e-06, "loss": 0.00048357550986111164, "memory(GiB)": 160.86, "step": 1320, "token_acc": 0.9998315789473684, "train_speed(iter/s)": 0.041939 }, { "epoch": 0.4856148066703317, "grad_norm": 0.09421277046203613, "learning_rate": 9.3670677085993e-06, "loss": 0.0011711867526173591, "memory(GiB)": 160.86, "step": 1325, "token_acc": 0.9997474322276477, "train_speed(iter/s)": 0.04195 }, { "epoch": 0.487447315374748, "grad_norm": 0.18248307704925537, "learning_rate": 9.362386248614706e-06, "loss": 0.0005028956104069949, "memory(GiB)": 160.86, "step": 1330, "token_acc": 0.9998316923335858, "train_speed(iter/s)": 0.041963 }, { "epoch": 0.48927982407916437, "grad_norm": 0.04889710247516632, "learning_rate": 9.357688718015185e-06, "loss": 0.0029960500076413156, "memory(GiB)": 160.86, "step": 1335, "token_acc": 0.9992425517589631, "train_speed(iter/s)": 0.041975 }, { "epoch": 0.49111233278358074, "grad_norm": 0.01644892431795597, "learning_rate": 9.35297513410599e-06, "loss": 0.001054964866489172, "memory(GiB)": 160.86, "step": 1340, "token_acc": 0.999663242970197, "train_speed(iter/s)": 0.041987 }, { "epoch": 0.49294484148799705, "grad_norm": 0.06923960894346237, "learning_rate": 9.348245514251515e-06, "loss": 0.0015572577714920044, "memory(GiB)": 160.86, "step": 1345, "token_acc": 0.99949499200404, "train_speed(iter/s)": 0.041999 }, { "epoch": 0.4947773501924134, "grad_norm": 0.4345010817050934, "learning_rate": 9.343499875875226e-06, "loss": 0.0008648891933262348, "memory(GiB)": 160.86, "step": 1350, "token_acc": 0.9998317914213625, "train_speed(iter/s)": 0.042012 }, { "epoch": 0.49660985889682974, "grad_norm": 0.12544922530651093, "learning_rate": 9.338738236459606e-06, "loss": 0.0008970722556114197, "memory(GiB)": 160.86, "step": 1355, "token_acc": 0.9996632713191346, "train_speed(iter/s)": 0.042024 }, { "epoch": 0.4984423676012461, "grad_norm": 0.04251859337091446, "learning_rate": 9.333960613546079e-06, "loss": 0.0008619870990514755, "memory(GiB)": 160.86, "step": 1360, "token_acc": 0.9996632146164857, "train_speed(iter/s)": 0.042036 }, { "epoch": 0.5002748763056625, "grad_norm": 0.05376381427049637, "learning_rate": 9.329167024734951e-06, "loss": 0.0009831368923187255, "memory(GiB)": 160.86, "step": 1365, "token_acc": 0.9996631862579993, "train_speed(iter/s)": 0.042046 }, { "epoch": 0.5021073850100788, "grad_norm": 0.03389672935009003, "learning_rate": 9.32435748768535e-06, "loss": 0.001122223772108555, "memory(GiB)": 160.86, "step": 1370, "token_acc": 0.9995790891489182, "train_speed(iter/s)": 0.042057 }, { "epoch": 0.5039398937144951, "grad_norm": 0.07879503071308136, "learning_rate": 9.319532020115147e-06, "loss": 0.0011348828673362731, "memory(GiB)": 160.86, "step": 1375, "token_acc": 0.9997475172529877, "train_speed(iter/s)": 0.042069 }, { "epoch": 0.5057724024189115, "grad_norm": 0.004050049465149641, "learning_rate": 9.314690639800906e-06, "loss": 0.0002213560277596116, "memory(GiB)": 160.86, "step": 1380, "token_acc": 1.0, "train_speed(iter/s)": 0.04208 }, { "epoch": 0.5076049111233278, "grad_norm": 0.028278427198529243, "learning_rate": 9.30983336457781e-06, "loss": 0.0009013951756060123, "memory(GiB)": 160.86, "step": 1385, "token_acc": 0.9996632996632997, "train_speed(iter/s)": 0.042093 }, { "epoch": 0.5094374198277442, "grad_norm": 0.020806804299354553, "learning_rate": 9.304960212339602e-06, "loss": 0.001097150705754757, "memory(GiB)": 160.86, "step": 1390, "token_acc": 0.9995791954216462, "train_speed(iter/s)": 0.042093 }, { "epoch": 0.5112699285321606, "grad_norm": 0.05375039204955101, "learning_rate": 9.300071201038503e-06, "loss": 0.0004816567990928888, "memory(GiB)": 160.86, "step": 1395, "token_acc": 0.9998316073082428, "train_speed(iter/s)": 0.042105 }, { "epoch": 0.5131024372365769, "grad_norm": 0.005027708597481251, "learning_rate": 9.295166348685169e-06, "loss": 0.0004785487428307533, "memory(GiB)": 160.86, "step": 1400, "token_acc": 0.9996632996632997, "train_speed(iter/s)": 0.042115 }, { "epoch": 0.5149349459409932, "grad_norm": 0.007288212422281504, "learning_rate": 9.290245673348609e-06, "loss": 0.00039666993543505666, "memory(GiB)": 160.86, "step": 1405, "token_acc": 0.9998316781686585, "train_speed(iter/s)": 0.042118 }, { "epoch": 0.5167674546454095, "grad_norm": 0.0003485670604277402, "learning_rate": 9.285309193156118e-06, "loss": 0.0002419668948277831, "memory(GiB)": 160.86, "step": 1410, "token_acc": 0.9999158461667929, "train_speed(iter/s)": 0.042128 }, { "epoch": 0.5185999633498259, "grad_norm": 0.05836885794997215, "learning_rate": 9.280356926293222e-06, "loss": 0.0011019782163202763, "memory(GiB)": 160.86, "step": 1415, "token_acc": 0.999663356337317, "train_speed(iter/s)": 0.04214 }, { "epoch": 0.5204324720542423, "grad_norm": 0.030392736196517944, "learning_rate": 9.275388891003596e-06, "loss": 0.0003588124178349972, "memory(GiB)": 160.86, "step": 1420, "token_acc": 0.9998315931289997, "train_speed(iter/s)": 0.042152 }, { "epoch": 0.5222649807586586, "grad_norm": 0.10738146305084229, "learning_rate": 9.270405105589012e-06, "loss": 0.0022922657430171967, "memory(GiB)": 160.86, "step": 1425, "token_acc": 0.9995792308339645, "train_speed(iter/s)": 0.042164 }, { "epoch": 0.524097489463075, "grad_norm": 0.024856839329004288, "learning_rate": 9.265405588409258e-06, "loss": 0.000432960782200098, "memory(GiB)": 160.86, "step": 1430, "token_acc": 0.9999158390843292, "train_speed(iter/s)": 0.042176 }, { "epoch": 0.5259299981674913, "grad_norm": 0.023576080799102783, "learning_rate": 9.26039035788208e-06, "loss": 0.0014881092123687268, "memory(GiB)": 160.86, "step": 1435, "token_acc": 0.9995794078061911, "train_speed(iter/s)": 0.042185 }, { "epoch": 0.5277625068719076, "grad_norm": 0.025212427601218224, "learning_rate": 9.255359432483106e-06, "loss": 0.0006445163395255804, "memory(GiB)": 160.86, "step": 1440, "token_acc": 0.9998316640013467, "train_speed(iter/s)": 0.042195 }, { "epoch": 0.5295950155763239, "grad_norm": 0.05869888886809349, "learning_rate": 9.25031283074579e-06, "loss": 0.0012847738340497016, "memory(GiB)": 160.86, "step": 1445, "token_acc": 0.9995791245791246, "train_speed(iter/s)": 0.042206 }, { "epoch": 0.5314275242807404, "grad_norm": 0.02733391709625721, "learning_rate": 9.245250571261328e-06, "loss": 0.0012956521473824977, "memory(GiB)": 160.86, "step": 1450, "token_acc": 0.9998317064961293, "train_speed(iter/s)": 0.042217 }, { "epoch": 0.5332600329851567, "grad_norm": 0.01605917513370514, "learning_rate": 9.240172672678603e-06, "loss": 0.0010051255114376545, "memory(GiB)": 160.86, "step": 1455, "token_acc": 0.9997476234541937, "train_speed(iter/s)": 0.042217 }, { "epoch": 0.535092541689573, "grad_norm": 0.07777733355760574, "learning_rate": 9.235079153704108e-06, "loss": 0.001209939643740654, "memory(GiB)": 160.86, "step": 1460, "token_acc": 0.9994948644552955, "train_speed(iter/s)": 0.042228 }, { "epoch": 0.5369250503939894, "grad_norm": 0.024418100714683533, "learning_rate": 9.229970033101881e-06, "loss": 0.0006480346899479627, "memory(GiB)": 160.86, "step": 1465, "token_acc": 0.9998315789473684, "train_speed(iter/s)": 0.042239 }, { "epoch": 0.5387575590984057, "grad_norm": 0.051130812615156174, "learning_rate": 9.224845329693434e-06, "loss": 0.0005965878255665303, "memory(GiB)": 160.86, "step": 1470, "token_acc": 0.9998316214850985, "train_speed(iter/s)": 0.04225 }, { "epoch": 0.540590067802822, "grad_norm": 0.03825452923774719, "learning_rate": 9.21970506235769e-06, "loss": 0.0003675919026136398, "memory(GiB)": 160.86, "step": 1475, "token_acc": 0.9998315221969506, "train_speed(iter/s)": 0.042259 }, { "epoch": 0.5424225765072385, "grad_norm": 0.05280032381415367, "learning_rate": 9.214549250030899e-06, "loss": 0.00044973762705922125, "memory(GiB)": 160.86, "step": 1480, "token_acc": 0.9998315931289997, "train_speed(iter/s)": 0.042271 }, { "epoch": 0.5442550852116548, "grad_norm": 0.13924196362495422, "learning_rate": 9.209377911706585e-06, "loss": 0.0010926604270935058, "memory(GiB)": 160.86, "step": 1485, "token_acc": 0.9996634979389248, "train_speed(iter/s)": 0.042282 }, { "epoch": 0.5460875939160711, "grad_norm": 0.0010057148756459355, "learning_rate": 9.204191066435463e-06, "loss": 7.150891469791532e-05, "memory(GiB)": 160.86, "step": 1490, "token_acc": 1.0, "train_speed(iter/s)": 0.042286 }, { "epoch": 0.5479201026204874, "grad_norm": 0.0028190938755869865, "learning_rate": 9.198988733325381e-06, "loss": 0.00018844833830371498, "memory(GiB)": 160.86, "step": 1495, "token_acc": 1.0, "train_speed(iter/s)": 0.042297 }, { "epoch": 0.5497526113249038, "grad_norm": 0.2529807388782501, "learning_rate": 9.19377093154123e-06, "loss": 0.0006476116366684436, "memory(GiB)": 160.86, "step": 1500, "token_acc": 0.9998315931289997, "train_speed(iter/s)": 0.034291 }, { "epoch": 0.5497526113249038, "eval_loss": 0.0008861870155669749, "eval_runtime": 172.4847, "eval_samples_per_second": 2.551, "eval_steps_per_second": 2.551, "eval_token_acc": 0.999755138270358, "step": 1500 }, { "epoch": 0.5515851200293201, "grad_norm": 0.07361137121915817, "learning_rate": 9.188537680304901e-06, "loss": 0.001575019396841526, "memory(GiB)": 160.86, "step": 1505, "token_acc": 0.9997150923359839, "train_speed(iter/s)": 0.033682 }, { "epoch": 0.5534176287337365, "grad_norm": 0.1123221218585968, "learning_rate": 9.18328899889519e-06, "loss": 0.0008759641088545323, "memory(GiB)": 160.86, "step": 1510, "token_acc": 0.9997474960020201, "train_speed(iter/s)": 0.033712 }, { "epoch": 0.5552501374381529, "grad_norm": 0.031373172998428345, "learning_rate": 9.17802490664774e-06, "loss": 0.0005370716098695993, "memory(GiB)": 160.86, "step": 1515, "token_acc": 0.9997475385003787, "train_speed(iter/s)": 0.033741 }, { "epoch": 0.5570826461425692, "grad_norm": 0.00548228295519948, "learning_rate": 9.172745422954961e-06, "loss": 0.0006150617729872466, "memory(GiB)": 160.86, "step": 1520, "token_acc": 0.9997476234541937, "train_speed(iter/s)": 0.033771 }, { "epoch": 0.5589151548469855, "grad_norm": 0.09783894568681717, "learning_rate": 9.167450567265972e-06, "loss": 0.0003677058033645153, "memory(GiB)": 160.86, "step": 1525, "token_acc": 0.9999158036541214, "train_speed(iter/s)": 0.033793 }, { "epoch": 0.5607476635514018, "grad_norm": 0.02310693822801113, "learning_rate": 9.162140359086515e-06, "loss": 0.0013180834241211415, "memory(GiB)": 160.86, "step": 1530, "token_acc": 0.9994106751978448, "train_speed(iter/s)": 0.033822 }, { "epoch": 0.5625801722558182, "grad_norm": 0.07956714183092117, "learning_rate": 9.156814817978889e-06, "loss": 0.0014457314275205136, "memory(GiB)": 160.86, "step": 1535, "token_acc": 0.9994950345059754, "train_speed(iter/s)": 0.033851 }, { "epoch": 0.5644126809602346, "grad_norm": 0.007547269109636545, "learning_rate": 9.151473963561884e-06, "loss": 0.0004539607558399439, "memory(GiB)": 160.86, "step": 1540, "token_acc": 0.9998316356595673, "train_speed(iter/s)": 0.033879 }, { "epoch": 0.5662451896646509, "grad_norm": 0.016255052760243416, "learning_rate": 9.146117815510691e-06, "loss": 0.0003765122266486287, "memory(GiB)": 160.86, "step": 1545, "token_acc": 0.9998315931289997, "train_speed(iter/s)": 0.033907 }, { "epoch": 0.5680776983690673, "grad_norm": 0.06404280662536621, "learning_rate": 9.140746393556853e-06, "loss": 0.0009273691102862358, "memory(GiB)": 160.86, "step": 1550, "token_acc": 0.9994106751978448, "train_speed(iter/s)": 0.033936 }, { "epoch": 0.5699102070734836, "grad_norm": 0.030146759003400803, "learning_rate": 9.135359717488179e-06, "loss": 0.0006903111469000577, "memory(GiB)": 160.86, "step": 1555, "token_acc": 0.9997473471450228, "train_speed(iter/s)": 0.033965 }, { "epoch": 0.5717427157778999, "grad_norm": 0.017701471224427223, "learning_rate": 9.129957807148666e-06, "loss": 0.0014588728547096253, "memory(GiB)": 160.86, "step": 1560, "token_acc": 0.999663356337317, "train_speed(iter/s)": 0.033993 }, { "epoch": 0.5735752244823162, "grad_norm": 0.02424156479537487, "learning_rate": 9.124540682438438e-06, "loss": 0.00092041976749897, "memory(GiB)": 160.86, "step": 1565, "token_acc": 0.9997475809844342, "train_speed(iter/s)": 0.034021 }, { "epoch": 0.5754077331867327, "grad_norm": 0.06382456421852112, "learning_rate": 9.119108363313665e-06, "loss": 0.0009634297341108323, "memory(GiB)": 160.86, "step": 1570, "token_acc": 0.9996634413125789, "train_speed(iter/s)": 0.034047 }, { "epoch": 0.577240241891149, "grad_norm": 0.011778367683291435, "learning_rate": 9.113660869786491e-06, "loss": 0.0007347457576543093, "memory(GiB)": 160.86, "step": 1575, "token_acc": 0.999663242970197, "train_speed(iter/s)": 0.034075 }, { "epoch": 0.5790727505955653, "grad_norm": 0.01488505955785513, "learning_rate": 9.108198221924966e-06, "loss": 0.0007065658923238516, "memory(GiB)": 160.86, "step": 1580, "token_acc": 0.9996636677036912, "train_speed(iter/s)": 0.034103 }, { "epoch": 0.5809052592999817, "grad_norm": 0.016339842230081558, "learning_rate": 9.102720439852964e-06, "loss": 0.0004196997731924057, "memory(GiB)": 160.86, "step": 1585, "token_acc": 0.9999158674070335, "train_speed(iter/s)": 0.034131 }, { "epoch": 0.582737768004398, "grad_norm": 0.03133771941065788, "learning_rate": 9.097227543750109e-06, "loss": 0.0003929842729121447, "memory(GiB)": 160.86, "step": 1590, "token_acc": 0.9999158320006734, "train_speed(iter/s)": 0.034104 }, { "epoch": 0.5845702767088143, "grad_norm": 0.10911545157432556, "learning_rate": 9.091719553851707e-06, "loss": 0.00033823368139564993, "memory(GiB)": 160.86, "step": 1595, "token_acc": 0.9998316781686585, "train_speed(iter/s)": 0.034131 }, { "epoch": 0.5864027854132308, "grad_norm": 0.06253647804260254, "learning_rate": 9.086196490448668e-06, "loss": 0.0004926771856844425, "memory(GiB)": 160.86, "step": 1600, "token_acc": 0.9998316923335858, "train_speed(iter/s)": 0.034154 }, { "epoch": 0.5882352941176471, "grad_norm": 0.01017008163034916, "learning_rate": 9.080658373887432e-06, "loss": 0.0021519148722290993, "memory(GiB)": 160.86, "step": 1605, "token_acc": 0.9997474747474747, "train_speed(iter/s)": 0.034177 }, { "epoch": 0.5900678028220634, "grad_norm": 0.027529926970601082, "learning_rate": 9.07510522456989e-06, "loss": 0.000728294812142849, "memory(GiB)": 160.86, "step": 1610, "token_acc": 0.9996633280026934, "train_speed(iter/s)": 0.034203 }, { "epoch": 0.5919003115264797, "grad_norm": 0.14524707198143005, "learning_rate": 9.069537062953318e-06, "loss": 0.0007321128156036139, "memory(GiB)": 160.86, "step": 1615, "token_acc": 0.9996633846671716, "train_speed(iter/s)": 0.03423 }, { "epoch": 0.5937328202308961, "grad_norm": 0.010788935236632824, "learning_rate": 9.063953909550289e-06, "loss": 0.0007929414510726929, "memory(GiB)": 160.86, "step": 1620, "token_acc": 0.9997475172529877, "train_speed(iter/s)": 0.034256 }, { "epoch": 0.5955653289353124, "grad_norm": 0.04031025990843773, "learning_rate": 9.05835578492861e-06, "loss": 0.00044157886877655984, "memory(GiB)": 160.86, "step": 1625, "token_acc": 0.9998316214850985, "train_speed(iter/s)": 0.034282 }, { "epoch": 0.5973978376397288, "grad_norm": 0.005226753186434507, "learning_rate": 9.052742709711234e-06, "loss": 0.0007471313234418631, "memory(GiB)": 160.86, "step": 1630, "token_acc": 0.9998316640013467, "train_speed(iter/s)": 0.034307 }, { "epoch": 0.5992303463441452, "grad_norm": 0.006849437486380339, "learning_rate": 9.0471147045762e-06, "loss": 0.00016981502994894981, "memory(GiB)": 160.86, "step": 1635, "token_acc": 1.0, "train_speed(iter/s)": 0.034314 }, { "epoch": 0.6010628550485615, "grad_norm": 0.0021249176934361458, "learning_rate": 9.041471790256543e-06, "loss": 0.0004975998308509588, "memory(GiB)": 160.86, "step": 1640, "token_acc": 0.9999157965644998, "train_speed(iter/s)": 0.034341 }, { "epoch": 0.6028953637529778, "grad_norm": 0.03091166540980339, "learning_rate": 9.035813987540216e-06, "loss": 0.001137539092451334, "memory(GiB)": 160.86, "step": 1645, "token_acc": 0.999579018270607, "train_speed(iter/s)": 0.034367 }, { "epoch": 0.6047278724573941, "grad_norm": 0.020048417150974274, "learning_rate": 9.030141317270026e-06, "loss": 0.0009108279831707477, "memory(GiB)": 160.86, "step": 1650, "token_acc": 0.9997473471450228, "train_speed(iter/s)": 0.034393 }, { "epoch": 0.6065603811618105, "grad_norm": 0.0024872045032680035, "learning_rate": 9.02445380034355e-06, "loss": 0.00014628460630774497, "memory(GiB)": 160.86, "step": 1655, "token_acc": 1.0, "train_speed(iter/s)": 0.034418 }, { "epoch": 0.6083928898662269, "grad_norm": 0.1102481409907341, "learning_rate": 9.018751457713062e-06, "loss": 0.002010086178779602, "memory(GiB)": 160.86, "step": 1660, "token_acc": 0.9996634413125789, "train_speed(iter/s)": 0.034443 }, { "epoch": 0.6102253985706432, "grad_norm": 0.0067368666641414165, "learning_rate": 9.013034310385442e-06, "loss": 0.0004647184628993273, "memory(GiB)": 160.86, "step": 1665, "token_acc": 0.9997474322276477, "train_speed(iter/s)": 0.034469 }, { "epoch": 0.6120579072750596, "grad_norm": 0.0039915889501571655, "learning_rate": 9.007302379422118e-06, "loss": 0.0008955980651080608, "memory(GiB)": 160.86, "step": 1670, "token_acc": 0.999663129526697, "train_speed(iter/s)": 0.03449 }, { "epoch": 0.6138904159794759, "grad_norm": 0.04223395511507988, "learning_rate": 9.00155568593898e-06, "loss": 0.0006724436767399311, "memory(GiB)": 160.86, "step": 1675, "token_acc": 0.9998316640013467, "train_speed(iter/s)": 0.034516 }, { "epoch": 0.6157229246838922, "grad_norm": 0.013977458700537682, "learning_rate": 8.995794251106295e-06, "loss": 0.0012857289984822273, "memory(GiB)": 160.86, "step": 1680, "token_acc": 0.9995791245791246, "train_speed(iter/s)": 0.034534 }, { "epoch": 0.6175554333883086, "grad_norm": 0.02960984595119953, "learning_rate": 8.99001809614864e-06, "loss": 0.0006384906824678183, "memory(GiB)": 160.86, "step": 1685, "token_acc": 0.9997474109623642, "train_speed(iter/s)": 0.034559 }, { "epoch": 0.619387942092725, "grad_norm": 0.14135026931762695, "learning_rate": 8.98422724234482e-06, "loss": 0.0018129302188754082, "memory(GiB)": 160.86, "step": 1690, "token_acc": 0.9994108735903047, "train_speed(iter/s)": 0.034584 }, { "epoch": 0.6212204507971413, "grad_norm": 0.011938896030187607, "learning_rate": 8.978421711027789e-06, "loss": 0.0010257656686007977, "memory(GiB)": 160.86, "step": 1695, "token_acc": 0.999579018270607, "train_speed(iter/s)": 0.034609 }, { "epoch": 0.6230529595015576, "grad_norm": 0.02054041065275669, "learning_rate": 8.97260152358457e-06, "loss": 0.0010426132939755917, "memory(GiB)": 160.86, "step": 1700, "token_acc": 0.9996632996632997, "train_speed(iter/s)": 0.034627 }, { "epoch": 0.624885468205974, "grad_norm": 0.057805608958005905, "learning_rate": 8.966766701456177e-06, "loss": 0.0011805295012891292, "memory(GiB)": 160.86, "step": 1705, "token_acc": 0.9994950345059754, "train_speed(iter/s)": 0.03465 }, { "epoch": 0.6267179769103903, "grad_norm": 0.01560523733496666, "learning_rate": 8.96091726613754e-06, "loss": 0.0006526369601488113, "memory(GiB)": 160.86, "step": 1710, "token_acc": 0.9998316640013467, "train_speed(iter/s)": 0.034675 }, { "epoch": 0.6285504856148066, "grad_norm": 0.02277560532093048, "learning_rate": 8.95505323917742e-06, "loss": 0.0003244250314310193, "memory(GiB)": 160.86, "step": 1715, "token_acc": 0.9999158249158249, "train_speed(iter/s)": 0.034697 }, { "epoch": 0.6303829943192231, "grad_norm": 0.03905067220330238, "learning_rate": 8.949174642178333e-06, "loss": 0.0006646113935858012, "memory(GiB)": 160.86, "step": 1720, "token_acc": 0.9998317064961293, "train_speed(iter/s)": 0.034715 }, { "epoch": 0.6322155030236394, "grad_norm": 0.004376774653792381, "learning_rate": 8.94328149679647e-06, "loss": 0.0006781556177884341, "memory(GiB)": 160.86, "step": 1725, "token_acc": 0.9996631011538786, "train_speed(iter/s)": 0.034739 }, { "epoch": 0.6340480117280557, "grad_norm": 0.08241453766822815, "learning_rate": 8.937373824741618e-06, "loss": 0.0007374928332865238, "memory(GiB)": 160.86, "step": 1730, "token_acc": 0.9998317206562894, "train_speed(iter/s)": 0.034764 }, { "epoch": 0.635880520432472, "grad_norm": 0.02215947024524212, "learning_rate": 8.931451647777076e-06, "loss": 0.001058538444340229, "memory(GiB)": 160.86, "step": 1735, "token_acc": 0.9994950770007573, "train_speed(iter/s)": 0.034781 }, { "epoch": 0.6377130291368884, "grad_norm": 0.05471364036202431, "learning_rate": 8.92551498771958e-06, "loss": 0.0005416409578174353, "memory(GiB)": 160.86, "step": 1740, "token_acc": 0.9998315931289997, "train_speed(iter/s)": 0.034805 }, { "epoch": 0.6395455378413047, "grad_norm": 0.0009198402985930443, "learning_rate": 8.919563866439218e-06, "loss": 0.0011710536666214467, "memory(GiB)": 160.86, "step": 1745, "token_acc": 0.9995790537127462, "train_speed(iter/s)": 0.034822 }, { "epoch": 0.6413780465457211, "grad_norm": 0.02374288998544216, "learning_rate": 8.913598305859354e-06, "loss": 0.0002880813553929329, "memory(GiB)": 160.86, "step": 1750, "token_acc": 0.9999158249158249, "train_speed(iter/s)": 0.034839 }, { "epoch": 0.6432105552501375, "grad_norm": 0.03671794757246971, "learning_rate": 8.907618327956546e-06, "loss": 0.0009451866149902344, "memory(GiB)": 160.86, "step": 1755, "token_acc": 0.9997473896934995, "train_speed(iter/s)": 0.034863 }, { "epoch": 0.6450430639545538, "grad_norm": 0.02204386703670025, "learning_rate": 8.90162395476046e-06, "loss": 0.00012790242908522487, "memory(GiB)": 160.86, "step": 1760, "token_acc": 1.0, "train_speed(iter/s)": 0.034887 }, { "epoch": 0.6468755726589701, "grad_norm": 0.006437621079385281, "learning_rate": 8.895615208353796e-06, "loss": 0.0011966807767748832, "memory(GiB)": 160.86, "step": 1765, "token_acc": 0.9996632146164857, "train_speed(iter/s)": 0.034911 }, { "epoch": 0.6487080813633864, "grad_norm": 0.06638949364423752, "learning_rate": 8.889592110872203e-06, "loss": 0.0013600192032754421, "memory(GiB)": 160.86, "step": 1770, "token_acc": 0.9997474747474747, "train_speed(iter/s)": 0.034934 }, { "epoch": 0.6505405900678028, "grad_norm": 0.029982449486851692, "learning_rate": 8.883554684504198e-06, "loss": 0.00047690006904304026, "memory(GiB)": 160.86, "step": 1775, "token_acc": 0.9999158249158249, "train_speed(iter/s)": 0.034958 }, { "epoch": 0.6523730987722192, "grad_norm": 0.0004446969833225012, "learning_rate": 8.877502951491083e-06, "loss": 0.0002472808351740241, "memory(GiB)": 160.86, "step": 1780, "token_acc": 0.9999158674070335, "train_speed(iter/s)": 0.034982 }, { "epoch": 0.6542056074766355, "grad_norm": 0.045220986008644104, "learning_rate": 8.871436934126865e-06, "loss": 0.00016599131049588323, "memory(GiB)": 160.86, "step": 1785, "token_acc": 0.9999158107425492, "train_speed(iter/s)": 0.035005 }, { "epoch": 0.6560381161810519, "grad_norm": 0.08464392274618149, "learning_rate": 8.865356654758175e-06, "loss": 0.0011138648726046086, "memory(GiB)": 160.86, "step": 1790, "token_acc": 0.9997474534893509, "train_speed(iter/s)": 0.035029 }, { "epoch": 0.6578706248854682, "grad_norm": 0.018666911870241165, "learning_rate": 8.859262135784184e-06, "loss": 0.0008051570504903794, "memory(GiB)": 160.86, "step": 1795, "token_acc": 0.9998317206562894, "train_speed(iter/s)": 0.035052 }, { "epoch": 0.6597031335898845, "grad_norm": 0.03633316978812218, "learning_rate": 8.853153399656513e-06, "loss": 0.0012314721010625363, "memory(GiB)": 160.86, "step": 1800, "token_acc": 0.9997476022211005, "train_speed(iter/s)": 0.035075 }, { "epoch": 0.661535642294301, "grad_norm": 0.07466746866703033, "learning_rate": 8.84703046887917e-06, "loss": 0.0005056848283857107, "memory(GiB)": 160.86, "step": 1805, "token_acc": 0.9998315789473684, "train_speed(iter/s)": 0.035098 }, { "epoch": 0.6633681509987173, "grad_norm": 0.058270856738090515, "learning_rate": 8.840893366008443e-06, "loss": 0.0027731884270906447, "memory(GiB)": 160.86, "step": 1810, "token_acc": 0.9989051709617652, "train_speed(iter/s)": 0.03512 }, { "epoch": 0.6652006597031336, "grad_norm": 0.053415171802043915, "learning_rate": 8.834742113652835e-06, "loss": 0.0012996003031730651, "memory(GiB)": 160.86, "step": 1815, "token_acc": 0.9996633846671716, "train_speed(iter/s)": 0.035143 }, { "epoch": 0.6670331684075499, "grad_norm": 0.17921970784664154, "learning_rate": 8.828576734472975e-06, "loss": 0.002054636925458908, "memory(GiB)": 160.86, "step": 1820, "token_acc": 0.9996632996632997, "train_speed(iter/s)": 0.035166 }, { "epoch": 0.6688656771119663, "grad_norm": 0.2059200257062912, "learning_rate": 8.82239725118153e-06, "loss": 0.000544156739488244, "memory(GiB)": 160.86, "step": 1825, "token_acc": 0.9998317914213625, "train_speed(iter/s)": 0.035188 }, { "epoch": 0.6706981858163826, "grad_norm": 0.0659668818116188, "learning_rate": 8.816203686543128e-06, "loss": 0.0011439280584454536, "memory(GiB)": 160.86, "step": 1830, "token_acc": 0.9996632996632997, "train_speed(iter/s)": 0.03521 }, { "epoch": 0.6725306945207989, "grad_norm": 0.027126120403409004, "learning_rate": 8.80999606337427e-06, "loss": 0.0006697001401335001, "memory(GiB)": 160.86, "step": 1835, "token_acc": 0.9997474747474747, "train_speed(iter/s)": 0.035233 }, { "epoch": 0.6743632032252154, "grad_norm": 0.04717881977558136, "learning_rate": 8.803774404543246e-06, "loss": 0.0008460984565317631, "memory(GiB)": 160.86, "step": 1840, "token_acc": 0.9996632996632997, "train_speed(iter/s)": 0.035255 }, { "epoch": 0.6761957119296317, "grad_norm": 0.03212764859199524, "learning_rate": 8.79753873297006e-06, "loss": 0.0013919253833591938, "memory(GiB)": 160.86, "step": 1845, "token_acc": 0.9995793370351674, "train_speed(iter/s)": 0.035277 }, { "epoch": 0.678028220634048, "grad_norm": 0.004734094720333815, "learning_rate": 8.791289071626324e-06, "loss": 0.0017154796048998832, "memory(GiB)": 160.86, "step": 1850, "token_acc": 0.9994106751978448, "train_speed(iter/s)": 0.035298 }, { "epoch": 0.6798607293384643, "grad_norm": 0.002792911371216178, "learning_rate": 8.7850254435352e-06, "loss": 0.00024983214680105446, "memory(GiB)": 160.86, "step": 1855, "token_acc": 0.9999158532480646, "train_speed(iter/s)": 0.035304 }, { "epoch": 0.6816932380428807, "grad_norm": 0.069346122443676, "learning_rate": 8.778747871771293e-06, "loss": 0.0004865613766014576, "memory(GiB)": 160.86, "step": 1860, "token_acc": 0.9999158603281447, "train_speed(iter/s)": 0.035326 }, { "epoch": 0.683525746747297, "grad_norm": 0.0010090708965435624, "learning_rate": 8.772456379460578e-06, "loss": 0.0005619535222649574, "memory(GiB)": 160.86, "step": 1865, "token_acc": 0.9998316923335858, "train_speed(iter/s)": 0.035348 }, { "epoch": 0.6853582554517134, "grad_norm": 0.00402231328189373, "learning_rate": 8.766150989780317e-06, "loss": 0.00032461092341691257, "memory(GiB)": 160.86, "step": 1870, "token_acc": 0.9999158178297837, "train_speed(iter/s)": 0.03537 }, { "epoch": 0.6871907641561298, "grad_norm": 0.016630422323942184, "learning_rate": 8.759831725958963e-06, "loss": 0.0007076055742800235, "memory(GiB)": 160.86, "step": 1875, "token_acc": 0.9999158461667929, "train_speed(iter/s)": 0.035386 }, { "epoch": 0.6890232728605461, "grad_norm": 0.13864953815937042, "learning_rate": 8.75349861127608e-06, "loss": 0.0009592998772859574, "memory(GiB)": 160.86, "step": 1880, "token_acc": 0.999663356337317, "train_speed(iter/s)": 0.035408 }, { "epoch": 0.6908557815649624, "grad_norm": 0.12857644259929657, "learning_rate": 8.747151669062256e-06, "loss": 0.0003430765587836504, "memory(GiB)": 160.86, "step": 1885, "token_acc": 0.9998316781686585, "train_speed(iter/s)": 0.035429 }, { "epoch": 0.6926882902693787, "grad_norm": 0.007042865734547377, "learning_rate": 8.740790922699024e-06, "loss": 0.0002988249296322465, "memory(GiB)": 160.86, "step": 1890, "token_acc": 0.9999157823816742, "train_speed(iter/s)": 0.035451 }, { "epoch": 0.6945207989737952, "grad_norm": 0.004211138002574444, "learning_rate": 8.73441639561877e-06, "loss": 0.000298920925706625, "memory(GiB)": 160.86, "step": 1895, "token_acc": 0.9998316781686585, "train_speed(iter/s)": 0.035464 }, { "epoch": 0.6963533076782115, "grad_norm": 0.10895411670207977, "learning_rate": 8.728028111304639e-06, "loss": 0.0018308842554688454, "memory(GiB)": 160.86, "step": 1900, "token_acc": 0.9995788054923764, "train_speed(iter/s)": 0.035485 }, { "epoch": 0.6981858163826278, "grad_norm": 0.05376400053501129, "learning_rate": 8.721626093290461e-06, "loss": 0.0004374215379357338, "memory(GiB)": 160.86, "step": 1905, "token_acc": 0.9998316923335858, "train_speed(iter/s)": 0.035506 }, { "epoch": 0.7000183250870442, "grad_norm": 0.007238362450152636, "learning_rate": 8.715210365160662e-06, "loss": 6.630108109675347e-05, "memory(GiB)": 160.86, "step": 1910, "token_acc": 1.0, "train_speed(iter/s)": 0.035527 }, { "epoch": 0.7018508337914605, "grad_norm": 0.00040705734863877296, "learning_rate": 8.708780950550173e-06, "loss": 0.0006973243784159422, "memory(GiB)": 160.86, "step": 1915, "token_acc": 0.9997475172529877, "train_speed(iter/s)": 0.035548 }, { "epoch": 0.7036833424958768, "grad_norm": 0.0899810642004013, "learning_rate": 8.702337873144343e-06, "loss": 0.0013748856261372566, "memory(GiB)": 160.86, "step": 1920, "token_acc": 0.9994948219247285, "train_speed(iter/s)": 0.035569 }, { "epoch": 0.7055158512002933, "grad_norm": 0.08953223377466202, "learning_rate": 8.695881156678856e-06, "loss": 0.0006622021552175284, "memory(GiB)": 160.86, "step": 1925, "token_acc": 0.9997475597441938, "train_speed(iter/s)": 0.035589 }, { "epoch": 0.7073483599047096, "grad_norm": 0.015041066333651543, "learning_rate": 8.689410824939639e-06, "loss": 0.0003675042651593685, "memory(GiB)": 160.86, "step": 1930, "token_acc": 0.9999158249158249, "train_speed(iter/s)": 0.03561 }, { "epoch": 0.7091808686091259, "grad_norm": 0.015323134139180183, "learning_rate": 8.682926901762776e-06, "loss": 0.0009645667858421802, "memory(GiB)": 160.86, "step": 1935, "token_acc": 0.999663242970197, "train_speed(iter/s)": 0.03563 }, { "epoch": 0.7110133773135422, "grad_norm": 0.05264544486999512, "learning_rate": 8.676429411034423e-06, "loss": 0.0006276907399296761, "memory(GiB)": 160.86, "step": 1940, "token_acc": 0.9996633280026934, "train_speed(iter/s)": 0.035648 }, { "epoch": 0.7128458860179586, "grad_norm": 0.0028159820940345526, "learning_rate": 8.669918376690716e-06, "loss": 0.00036051685456186535, "memory(GiB)": 160.86, "step": 1945, "token_acc": 0.9998316640013467, "train_speed(iter/s)": 0.035668 }, { "epoch": 0.7146783947223749, "grad_norm": 0.0341511145234108, "learning_rate": 8.663393822717686e-06, "loss": 0.0003709573531523347, "memory(GiB)": 160.86, "step": 1950, "token_acc": 0.9998317064961293, "train_speed(iter/s)": 0.035688 }, { "epoch": 0.7165109034267912, "grad_norm": 0.0006480200099758804, "learning_rate": 8.656855773151163e-06, "loss": 0.0003987106028944254, "memory(GiB)": 160.86, "step": 1955, "token_acc": 0.9998315789473684, "train_speed(iter/s)": 0.035709 }, { "epoch": 0.7183434121312077, "grad_norm": 0.0002706103550735861, "learning_rate": 8.650304252076704e-06, "loss": 0.0003762753214687109, "memory(GiB)": 160.86, "step": 1960, "token_acc": 0.9998316356595673, "train_speed(iter/s)": 0.035729 }, { "epoch": 0.720175920835624, "grad_norm": 0.00926526915282011, "learning_rate": 8.643739283629484e-06, "loss": 0.00021247351542115213, "memory(GiB)": 160.86, "step": 1965, "token_acc": 0.9999158603281447, "train_speed(iter/s)": 0.035749 }, { "epoch": 0.7220084295400403, "grad_norm": 0.11203871667385101, "learning_rate": 8.63716089199422e-06, "loss": 0.0012671677395701408, "memory(GiB)": 160.86, "step": 1970, "token_acc": 0.9995792662403231, "train_speed(iter/s)": 0.03577 }, { "epoch": 0.7238409382444566, "grad_norm": 0.027508899569511414, "learning_rate": 8.630569101405084e-06, "loss": 0.0016218043863773346, "memory(GiB)": 160.86, "step": 1975, "token_acc": 0.99949499200404, "train_speed(iter/s)": 0.03579 }, { "epoch": 0.725673446948873, "grad_norm": 0.03338692709803581, "learning_rate": 8.6239639361456e-06, "loss": 0.0007595627568662167, "memory(GiB)": 160.86, "step": 1980, "token_acc": 0.9997473045822103, "train_speed(iter/s)": 0.03581 }, { "epoch": 0.7275059556532893, "grad_norm": 0.01979021355509758, "learning_rate": 8.617345420548568e-06, "loss": 0.00039132642559707164, "memory(GiB)": 160.86, "step": 1985, "token_acc": 0.9998317631224765, "train_speed(iter/s)": 0.035829 }, { "epoch": 0.7293384643577057, "grad_norm": 0.0021872930228710175, "learning_rate": 8.610713578995969e-06, "loss": 0.0002923472551628947, "memory(GiB)": 160.86, "step": 1990, "token_acc": 0.9999158603281447, "train_speed(iter/s)": 0.035848 }, { "epoch": 0.7311709730621221, "grad_norm": 0.007450213190168142, "learning_rate": 8.604068435918876e-06, "loss": 0.0004648041445761919, "memory(GiB)": 160.86, "step": 1995, "token_acc": 0.9998316356595673, "train_speed(iter/s)": 0.035868 }, { "epoch": 0.7330034817665384, "grad_norm": 0.018950950354337692, "learning_rate": 8.597410015797358e-06, "loss": 0.0011166405864059925, "memory(GiB)": 160.86, "step": 2000, "token_acc": 0.9996636394214599, "train_speed(iter/s)": 0.035879 }, { "epoch": 0.7330034817665384, "eval_loss": 0.0007337583811022341, "eval_runtime": 199.2224, "eval_samples_per_second": 2.209, "eval_steps_per_second": 2.209, "eval_token_acc": 0.9997704421284606, "step": 2000 }, { "epoch": 0.7348359904709547, "grad_norm": 0.0039305477403104305, "learning_rate": 8.590738343160402e-06, "loss": 0.00037078014574944975, "memory(GiB)": 160.86, "step": 2005, "token_acc": 0.9997927917427509, "train_speed(iter/s)": 0.035487 }, { "epoch": 0.736668499175371, "grad_norm": 0.013306787237524986, "learning_rate": 8.584053442585816e-06, "loss": 0.0020991813391447066, "memory(GiB)": 160.86, "step": 2010, "token_acc": 0.9996633846671716, "train_speed(iter/s)": 0.035507 }, { "epoch": 0.7385010078797875, "grad_norm": 0.006368038710206747, "learning_rate": 8.577355338700133e-06, "loss": 0.000787766557186842, "memory(GiB)": 160.86, "step": 2015, "token_acc": 0.9997473896934995, "train_speed(iter/s)": 0.035525 }, { "epoch": 0.7403335165842038, "grad_norm": 0.010385467670857906, "learning_rate": 8.570644056178533e-06, "loss": 0.0008328554220497608, "memory(GiB)": 160.86, "step": 2020, "token_acc": 0.9997476871320438, "train_speed(iter/s)": 0.035538 }, { "epoch": 0.7421660252886201, "grad_norm": 0.01632188819348812, "learning_rate": 8.563919619744735e-06, "loss": 0.0005637739785015583, "memory(GiB)": 160.86, "step": 2025, "token_acc": 0.9998315931289997, "train_speed(iter/s)": 0.035559 }, { "epoch": 0.7439985339930365, "grad_norm": 0.011626377701759338, "learning_rate": 8.557182054170926e-06, "loss": 0.0005918642971664667, "memory(GiB)": 160.86, "step": 2030, "token_acc": 0.9999158178297837, "train_speed(iter/s)": 0.035578 }, { "epoch": 0.7458310426974528, "grad_norm": 0.0031517872121185064, "learning_rate": 8.550431384277654e-06, "loss": 0.00141130480915308, "memory(GiB)": 160.86, "step": 2035, "token_acc": 0.9995790891489182, "train_speed(iter/s)": 0.035597 }, { "epoch": 0.7476635514018691, "grad_norm": 0.05396876111626625, "learning_rate": 8.543667634933743e-06, "loss": 0.0004124412313103676, "memory(GiB)": 160.86, "step": 2040, "token_acc": 0.9998316214850985, "train_speed(iter/s)": 0.035616 }, { "epoch": 0.7494960601062856, "grad_norm": 0.0036719287745654583, "learning_rate": 8.536890831056199e-06, "loss": 0.0014296333305537702, "memory(GiB)": 160.86, "step": 2045, "token_acc": 0.9996632996632997, "train_speed(iter/s)": 0.035636 }, { "epoch": 0.7513285688107019, "grad_norm": 0.01854000613093376, "learning_rate": 8.530100997610125e-06, "loss": 0.00037872311659157274, "memory(GiB)": 160.86, "step": 2050, "token_acc": 0.9999158886365548, "train_speed(iter/s)": 0.035656 }, { "epoch": 0.7531610775151182, "grad_norm": 0.022685358300805092, "learning_rate": 8.523298159608615e-06, "loss": 0.0005078110843896866, "memory(GiB)": 160.86, "step": 2055, "token_acc": 0.9998316640013467, "train_speed(iter/s)": 0.035675 }, { "epoch": 0.7549935862195345, "grad_norm": 0.0069847991690039635, "learning_rate": 8.51648234211268e-06, "loss": 0.0006114406045526266, "memory(GiB)": 160.86, "step": 2060, "token_acc": 0.9999158036541214, "train_speed(iter/s)": 0.035694 }, { "epoch": 0.7568260949239509, "grad_norm": 0.005377015098929405, "learning_rate": 8.509653570231139e-06, "loss": 0.000488346815109253, "memory(GiB)": 160.86, "step": 2065, "token_acc": 0.9998316356595673, "train_speed(iter/s)": 0.035714 }, { "epoch": 0.7586586036283672, "grad_norm": 0.13766171038150787, "learning_rate": 8.502811869120537e-06, "loss": 0.0007873100228607654, "memory(GiB)": 160.86, "step": 2070, "token_acc": 0.9997473471450228, "train_speed(iter/s)": 0.035733 }, { "epoch": 0.7604911123327835, "grad_norm": 0.08824609220027924, "learning_rate": 8.495957263985049e-06, "loss": 0.0008373255841434002, "memory(GiB)": 160.86, "step": 2075, "token_acc": 0.9995790537127462, "train_speed(iter/s)": 0.035751 }, { "epoch": 0.7623236210372, "grad_norm": 0.006550587713718414, "learning_rate": 8.489089780076387e-06, "loss": 0.00012923479080200194, "memory(GiB)": 160.86, "step": 2080, "token_acc": 1.0, "train_speed(iter/s)": 0.03577 }, { "epoch": 0.7641561297416163, "grad_norm": 0.06086429953575134, "learning_rate": 8.482209442693706e-06, "loss": 0.002163195610046387, "memory(GiB)": 160.86, "step": 2085, "token_acc": 0.9990743078347218, "train_speed(iter/s)": 0.03579 }, { "epoch": 0.7659886384460326, "grad_norm": 0.045746754854917526, "learning_rate": 8.47531627718351e-06, "loss": 0.00045907222665846347, "memory(GiB)": 160.86, "step": 2090, "token_acc": 0.9998315080033698, "train_speed(iter/s)": 0.035808 }, { "epoch": 0.7678211471504489, "grad_norm": 0.01716403290629387, "learning_rate": 8.46841030893957e-06, "loss": 0.0005397152155637742, "memory(GiB)": 160.86, "step": 2095, "token_acc": 0.9997475597441938, "train_speed(iter/s)": 0.035827 }, { "epoch": 0.7696536558548653, "grad_norm": 0.0022040277253836393, "learning_rate": 8.461491563402807e-06, "loss": 0.0012433138675987721, "memory(GiB)": 160.86, "step": 2100, "token_acc": 0.9997475385003787, "train_speed(iter/s)": 0.035846 }, { "epoch": 0.7714861645592817, "grad_norm": 0.028352022171020508, "learning_rate": 8.454560066061225e-06, "loss": 0.0011054543778300286, "memory(GiB)": 160.86, "step": 2105, "token_acc": 0.9995790891489182, "train_speed(iter/s)": 0.035865 }, { "epoch": 0.773318673263698, "grad_norm": 0.017512010410428047, "learning_rate": 8.447615842449799e-06, "loss": 0.00045901937410235404, "memory(GiB)": 160.86, "step": 2110, "token_acc": 0.9999158249158249, "train_speed(iter/s)": 0.035883 }, { "epoch": 0.7751511819681144, "grad_norm": 0.014501676894724369, "learning_rate": 8.440658918150383e-06, "loss": 0.0004790318664163351, "memory(GiB)": 160.86, "step": 2115, "token_acc": 0.9997476446837147, "train_speed(iter/s)": 0.035901 }, { "epoch": 0.7769836906725307, "grad_norm": 0.06630018353462219, "learning_rate": 8.433689318791628e-06, "loss": 0.0008208448067307472, "memory(GiB)": 160.86, "step": 2120, "token_acc": 0.999663356337317, "train_speed(iter/s)": 0.03592 }, { "epoch": 0.778816199376947, "grad_norm": 0.029544832184910774, "learning_rate": 8.426707070048867e-06, "loss": 0.00034202171955257656, "memory(GiB)": 160.86, "step": 2125, "token_acc": 0.9999158036541214, "train_speed(iter/s)": 0.035938 }, { "epoch": 0.7806487080813634, "grad_norm": 0.020295366644859314, "learning_rate": 8.419712197644042e-06, "loss": 0.00047438177280128, "memory(GiB)": 160.86, "step": 2130, "token_acc": 0.9998316356595673, "train_speed(iter/s)": 0.035956 }, { "epoch": 0.7824812167857798, "grad_norm": 0.021269747987389565, "learning_rate": 8.412704727345597e-06, "loss": 0.0006256222724914551, "memory(GiB)": 160.86, "step": 2135, "token_acc": 0.9999158390843292, "train_speed(iter/s)": 0.035974 }, { "epoch": 0.7843137254901961, "grad_norm": 0.035125475376844406, "learning_rate": 8.405684684968383e-06, "loss": 0.0005730021744966507, "memory(GiB)": 160.86, "step": 2140, "token_acc": 0.9998315647633484, "train_speed(iter/s)": 0.035992 }, { "epoch": 0.7861462341946124, "grad_norm": 0.06994622200727463, "learning_rate": 8.398652096373566e-06, "loss": 0.0003744778921827674, "memory(GiB)": 160.86, "step": 2145, "token_acc": 0.9999157894736842, "train_speed(iter/s)": 0.03601 }, { "epoch": 0.7879787428990288, "grad_norm": 0.006813399959355593, "learning_rate": 8.39160698746853e-06, "loss": 0.0007882724516093731, "memory(GiB)": 160.86, "step": 2150, "token_acc": 0.9997474747474747, "train_speed(iter/s)": 0.036027 }, { "epoch": 0.7898112516034451, "grad_norm": 0.20248223841190338, "learning_rate": 8.38454938420679e-06, "loss": 0.00029504401609301565, "memory(GiB)": 160.86, "step": 2155, "token_acc": 0.9999157823816742, "train_speed(iter/s)": 0.036045 }, { "epoch": 0.7916437603078614, "grad_norm": 0.10259495675563812, "learning_rate": 8.37747931258788e-06, "loss": 0.0013766267336905002, "memory(GiB)": 160.86, "step": 2160, "token_acc": 0.9995792662403231, "train_speed(iter/s)": 0.036063 }, { "epoch": 0.7934762690122779, "grad_norm": 0.022682547569274902, "learning_rate": 8.370396798657269e-06, "loss": 0.0003458364633843303, "memory(GiB)": 160.86, "step": 2165, "token_acc": 0.9999158532480646, "train_speed(iter/s)": 0.036081 }, { "epoch": 0.7953087777166942, "grad_norm": 0.05654159560799599, "learning_rate": 8.363301868506264e-06, "loss": 0.0008417519740760327, "memory(GiB)": 160.86, "step": 2170, "token_acc": 0.9996632996632997, "train_speed(iter/s)": 0.036099 }, { "epoch": 0.7971412864211105, "grad_norm": 0.010379817336797714, "learning_rate": 8.35619454827191e-06, "loss": 0.00014047393342480062, "memory(GiB)": 160.86, "step": 2175, "token_acc": 1.0, "train_speed(iter/s)": 0.036117 }, { "epoch": 0.7989737951255268, "grad_norm": 0.002908756723627448, "learning_rate": 8.349074864136897e-06, "loss": 0.0010122337378561496, "memory(GiB)": 160.86, "step": 2180, "token_acc": 0.9995790891489182, "train_speed(iter/s)": 0.036134 }, { "epoch": 0.8008063038299432, "grad_norm": 0.015968699008226395, "learning_rate": 8.341942842329465e-06, "loss": 0.0010151905938982964, "memory(GiB)": 160.86, "step": 2185, "token_acc": 0.9997474747474747, "train_speed(iter/s)": 0.036152 }, { "epoch": 0.8026388125343595, "grad_norm": 0.02950908988714218, "learning_rate": 8.3347985091233e-06, "loss": 0.0006167484447360039, "memory(GiB)": 160.86, "step": 2190, "token_acc": 0.9998316640013467, "train_speed(iter/s)": 0.036169 }, { "epoch": 0.8044713212387758, "grad_norm": 0.004527771379798651, "learning_rate": 8.327641890837443e-06, "loss": 0.0001240343088284135, "memory(GiB)": 160.86, "step": 2195, "token_acc": 1.0, "train_speed(iter/s)": 0.036187 }, { "epoch": 0.8063038299431923, "grad_norm": 0.09493066370487213, "learning_rate": 8.320473013836197e-06, "loss": 0.0003447512863203883, "memory(GiB)": 160.86, "step": 2200, "token_acc": 0.9999158603281447, "train_speed(iter/s)": 0.036205 }, { "epoch": 0.8081363386476086, "grad_norm": 0.016084903851151466, "learning_rate": 8.313291904529018e-06, "loss": 0.0009649941697716713, "memory(GiB)": 160.86, "step": 2205, "token_acc": 0.9998316640013467, "train_speed(iter/s)": 0.036222 }, { "epoch": 0.8099688473520249, "grad_norm": 0.05419844388961792, "learning_rate": 8.306098589370427e-06, "loss": 0.0005068023223429918, "memory(GiB)": 160.86, "step": 2210, "token_acc": 0.9998317206562894, "train_speed(iter/s)": 0.036239 }, { "epoch": 0.8118013560564412, "grad_norm": 0.12476948648691177, "learning_rate": 8.298893094859916e-06, "loss": 0.0009864597581326962, "memory(GiB)": 160.86, "step": 2215, "token_acc": 0.9998315931289997, "train_speed(iter/s)": 0.036257 }, { "epoch": 0.8136338647608576, "grad_norm": 0.06563253700733185, "learning_rate": 8.291675447541834e-06, "loss": 0.000346578611060977, "memory(GiB)": 160.86, "step": 2220, "token_acc": 0.999831734814067, "train_speed(iter/s)": 0.036274 }, { "epoch": 0.815466373465274, "grad_norm": 0.0007064275559969246, "learning_rate": 8.28444567400531e-06, "loss": 0.0002860090462490916, "memory(GiB)": 160.86, "step": 2225, "token_acc": 0.9998316073082428, "train_speed(iter/s)": 0.03629 }, { "epoch": 0.8172988821696903, "grad_norm": 0.06441126018762589, "learning_rate": 8.277203800884137e-06, "loss": 0.0004928476177155971, "memory(GiB)": 160.86, "step": 2230, "token_acc": 0.9999158036541214, "train_speed(iter/s)": 0.036307 }, { "epoch": 0.8191313908741067, "grad_norm": 0.07549826800823212, "learning_rate": 8.269949854856687e-06, "loss": 0.0014977409504354, "memory(GiB)": 160.86, "step": 2235, "token_acc": 0.9997476022211005, "train_speed(iter/s)": 0.036324 }, { "epoch": 0.820963899578523, "grad_norm": 0.02339329943060875, "learning_rate": 8.262683862645804e-06, "loss": 0.00037619960494339466, "memory(GiB)": 160.86, "step": 2240, "token_acc": 0.9998315221969506, "train_speed(iter/s)": 0.036341 }, { "epoch": 0.8227964082829393, "grad_norm": 0.013340925797820091, "learning_rate": 8.255405851018713e-06, "loss": 0.0004039745777845383, "memory(GiB)": 160.86, "step": 2245, "token_acc": 0.9999158249158249, "train_speed(iter/s)": 0.036358 }, { "epoch": 0.8246289169873557, "grad_norm": 0.1738908737897873, "learning_rate": 8.24811584678691e-06, "loss": 0.0009243869222700596, "memory(GiB)": 160.86, "step": 2250, "token_acc": 0.999663356337317, "train_speed(iter/s)": 0.036375 }, { "epoch": 0.8264614256917721, "grad_norm": 0.1292845755815506, "learning_rate": 8.24081387680608e-06, "loss": 0.0004229114390909672, "memory(GiB)": 160.86, "step": 2255, "token_acc": 0.9998316640013467, "train_speed(iter/s)": 0.036392 }, { "epoch": 0.8282939343961884, "grad_norm": 0.03298277407884598, "learning_rate": 8.233499967975981e-06, "loss": 0.0003614515298977494, "memory(GiB)": 160.86, "step": 2260, "token_acc": 0.9999158674070335, "train_speed(iter/s)": 0.036406 }, { "epoch": 0.8301264431006047, "grad_norm": 0.0037736741360276937, "learning_rate": 8.226174147240359e-06, "loss": 0.0006478279829025269, "memory(GiB)": 160.86, "step": 2265, "token_acc": 0.9998315363881402, "train_speed(iter/s)": 0.036422 }, { "epoch": 0.8319589518050211, "grad_norm": 0.010557832196354866, "learning_rate": 8.218836441586834e-06, "loss": 0.0005696366541087627, "memory(GiB)": 160.86, "step": 2270, "token_acc": 0.9998317064961293, "train_speed(iter/s)": 0.036439 }, { "epoch": 0.8337914605094374, "grad_norm": 0.003406501142308116, "learning_rate": 8.211486878046819e-06, "loss": 0.0006424786522984504, "memory(GiB)": 160.86, "step": 2275, "token_acc": 0.9998316498316498, "train_speed(iter/s)": 0.036454 }, { "epoch": 0.8356239692138537, "grad_norm": 0.0992351546883583, "learning_rate": 8.204125483695403e-06, "loss": 0.0005788296461105346, "memory(GiB)": 160.86, "step": 2280, "token_acc": 0.9998317206562894, "train_speed(iter/s)": 0.036471 }, { "epoch": 0.8374564779182702, "grad_norm": 0.010372207500040531, "learning_rate": 8.196752285651261e-06, "loss": 0.00029938730876892804, "memory(GiB)": 160.86, "step": 2285, "token_acc": 0.9999157469036987, "train_speed(iter/s)": 0.036487 }, { "epoch": 0.8392889866226865, "grad_norm": 0.0683954581618309, "learning_rate": 8.189367311076551e-06, "loss": 0.0007511110045015812, "memory(GiB)": 160.86, "step": 2290, "token_acc": 0.9998317489694625, "train_speed(iter/s)": 0.036504 }, { "epoch": 0.8411214953271028, "grad_norm": 0.006293443962931633, "learning_rate": 8.181970587176814e-06, "loss": 0.0003692630911245942, "memory(GiB)": 160.86, "step": 2295, "token_acc": 0.9997475809844342, "train_speed(iter/s)": 0.03652 }, { "epoch": 0.8429540040315191, "grad_norm": 0.006763943005353212, "learning_rate": 8.174562141200878e-06, "loss": 0.0002094252035021782, "memory(GiB)": 160.86, "step": 2300, "token_acc": 0.9999158036541214, "train_speed(iter/s)": 0.036535 }, { "epoch": 0.8447865127359355, "grad_norm": 0.04695817828178406, "learning_rate": 8.167142000440749e-06, "loss": 0.0005172740202397108, "memory(GiB)": 160.86, "step": 2305, "token_acc": 0.9998316640013467, "train_speed(iter/s)": 0.036551 }, { "epoch": 0.8466190214403518, "grad_norm": 0.026909319683909416, "learning_rate": 8.15971019223152e-06, "loss": 0.00024677792098373177, "memory(GiB)": 160.86, "step": 2310, "token_acc": 1.0, "train_speed(iter/s)": 0.036567 }, { "epoch": 0.8484515301447682, "grad_norm": 0.0009972673142328858, "learning_rate": 8.152266743951264e-06, "loss": 0.00048431595787405967, "memory(GiB)": 160.86, "step": 2315, "token_acc": 0.9999157752884696, "train_speed(iter/s)": 0.036583 }, { "epoch": 0.8502840388491846, "grad_norm": 0.1550913155078888, "learning_rate": 8.144811683020932e-06, "loss": 0.00014740382321178913, "memory(GiB)": 160.86, "step": 2320, "token_acc": 0.9999158178297837, "train_speed(iter/s)": 0.036599 }, { "epoch": 0.8521165475536009, "grad_norm": 0.04358501732349396, "learning_rate": 8.13734503690426e-06, "loss": 0.0010699840262532235, "memory(GiB)": 160.86, "step": 2325, "token_acc": 0.9997474747474747, "train_speed(iter/s)": 0.036612 }, { "epoch": 0.8539490562580172, "grad_norm": 0.002750721760094166, "learning_rate": 8.12986683310766e-06, "loss": 0.0002569463336840272, "memory(GiB)": 160.86, "step": 2330, "token_acc": 0.9999158390843292, "train_speed(iter/s)": 0.036627 }, { "epoch": 0.8557815649624335, "grad_norm": 0.010151500813663006, "learning_rate": 8.12237709918012e-06, "loss": 0.00014050663448870183, "memory(GiB)": 160.86, "step": 2335, "token_acc": 1.0, "train_speed(iter/s)": 0.036644 }, { "epoch": 0.8576140736668499, "grad_norm": 0.004389213863760233, "learning_rate": 8.114875862713107e-06, "loss": 5.258661694824695e-05, "memory(GiB)": 160.86, "step": 2340, "token_acc": 1.0, "train_speed(iter/s)": 0.036659 }, { "epoch": 0.8594465823712663, "grad_norm": 0.004478363320231438, "learning_rate": 8.10736315134046e-06, "loss": 0.0017528504133224488, "memory(GiB)": 160.86, "step": 2345, "token_acc": 0.9996633846671716, "train_speed(iter/s)": 0.036675 }, { "epoch": 0.8612790910756826, "grad_norm": 0.004733589943498373, "learning_rate": 8.099838992738292e-06, "loss": 0.0013998121954500674, "memory(GiB)": 160.86, "step": 2350, "token_acc": 0.9994953318193288, "train_speed(iter/s)": 0.03669 }, { "epoch": 0.863111599780099, "grad_norm": 0.00977323018014431, "learning_rate": 8.092303414624884e-06, "loss": 0.00046326019801199434, "memory(GiB)": 160.86, "step": 2355, "token_acc": 0.9998316781686585, "train_speed(iter/s)": 0.036705 }, { "epoch": 0.8649441084845153, "grad_norm": 0.04947784170508385, "learning_rate": 8.08475644476059e-06, "loss": 0.0001862859120592475, "memory(GiB)": 160.86, "step": 2360, "token_acc": 1.0, "train_speed(iter/s)": 0.036721 }, { "epoch": 0.8667766171889316, "grad_norm": 0.21693383157253265, "learning_rate": 8.077198110947725e-06, "loss": 0.0009612908586859703, "memory(GiB)": 160.86, "step": 2365, "token_acc": 0.9998316923335858, "train_speed(iter/s)": 0.036737 }, { "epoch": 0.868609125893348, "grad_norm": 0.023295719176530838, "learning_rate": 8.069628441030472e-06, "loss": 0.0004069589078426361, "memory(GiB)": 160.86, "step": 2370, "token_acc": 0.9998315931289997, "train_speed(iter/s)": 0.036697 }, { "epoch": 0.8704416345977644, "grad_norm": 0.06745916604995728, "learning_rate": 8.062047462894771e-06, "loss": 0.0006006782408803701, "memory(GiB)": 160.86, "step": 2375, "token_acc": 0.9998315931289997, "train_speed(iter/s)": 0.036712 }, { "epoch": 0.8722741433021807, "grad_norm": 0.05341252312064171, "learning_rate": 8.054455204468225e-06, "loss": 0.000835646316409111, "memory(GiB)": 160.86, "step": 2380, "token_acc": 0.9996632996632997, "train_speed(iter/s)": 0.036728 }, { "epoch": 0.874106652006597, "grad_norm": 0.01815791241824627, "learning_rate": 8.046851693719986e-06, "loss": 0.00021557288710027933, "memory(GiB)": 160.86, "step": 2385, "token_acc": 1.0, "train_speed(iter/s)": 0.036743 }, { "epoch": 0.8759391607110134, "grad_norm": 0.0018982563633471727, "learning_rate": 8.039236958660666e-06, "loss": 0.00010541609954088927, "memory(GiB)": 160.86, "step": 2390, "token_acc": 1.0, "train_speed(iter/s)": 0.036759 }, { "epoch": 0.8777716694154297, "grad_norm": 0.0008025880670174956, "learning_rate": 8.031611027342221e-06, "loss": 0.00029539645183831455, "memory(GiB)": 160.86, "step": 2395, "token_acc": 0.9998317631224765, "train_speed(iter/s)": 0.036774 }, { "epoch": 0.879604178119846, "grad_norm": 0.02493736520409584, "learning_rate": 8.023973927857857e-06, "loss": 0.0010729983448982238, "memory(GiB)": 160.86, "step": 2400, "token_acc": 0.9997475172529877, "train_speed(iter/s)": 0.036789 }, { "epoch": 0.8814366868242625, "grad_norm": 0.23594622313976288, "learning_rate": 8.016325688341919e-06, "loss": 0.0005186852067708969, "memory(GiB)": 160.86, "step": 2405, "token_acc": 0.9998316923335858, "train_speed(iter/s)": 0.036805 }, { "epoch": 0.8832691955286788, "grad_norm": 0.014162681996822357, "learning_rate": 8.00866633696979e-06, "loss": 0.00019059464102610946, "memory(GiB)": 160.86, "step": 2410, "token_acc": 0.9999158390843292, "train_speed(iter/s)": 0.03682 }, { "epoch": 0.8851017042330951, "grad_norm": 0.04650455340743065, "learning_rate": 8.000995901957792e-06, "loss": 0.0004015204031020403, "memory(GiB)": 160.86, "step": 2415, "token_acc": 0.9998316640013467, "train_speed(iter/s)": 0.036835 }, { "epoch": 0.8869342129375114, "grad_norm": 0.04503090679645538, "learning_rate": 7.993314411563075e-06, "loss": 0.0006881221663206816, "memory(GiB)": 160.86, "step": 2420, "token_acc": 0.9997475172529877, "train_speed(iter/s)": 0.03685 }, { "epoch": 0.8887667216419278, "grad_norm": 0.008592194877564907, "learning_rate": 7.98562189408352e-06, "loss": 0.0002544657327234745, "memory(GiB)": 160.86, "step": 2425, "token_acc": 0.9999158107425492, "train_speed(iter/s)": 0.036865 }, { "epoch": 0.8905992303463441, "grad_norm": 0.04029720276594162, "learning_rate": 7.977918377857625e-06, "loss": 0.0004797634668648243, "memory(GiB)": 160.86, "step": 2430, "token_acc": 0.9998316498316498, "train_speed(iter/s)": 0.03688 }, { "epoch": 0.8924317390507605, "grad_norm": 0.012428953312337399, "learning_rate": 7.970203891264408e-06, "loss": 0.00046463338658213614, "memory(GiB)": 160.86, "step": 2435, "token_acc": 0.9998317631224765, "train_speed(iter/s)": 0.036895 }, { "epoch": 0.8942642477551769, "grad_norm": 0.1128624677658081, "learning_rate": 7.962478462723306e-06, "loss": 0.000577373243868351, "memory(GiB)": 160.86, "step": 2440, "token_acc": 0.9998316214850985, "train_speed(iter/s)": 0.03691 }, { "epoch": 0.8960967564595932, "grad_norm": 0.005943561438471079, "learning_rate": 7.954742120694059e-06, "loss": 0.0005296251736581325, "memory(GiB)": 160.86, "step": 2445, "token_acc": 0.9998317489694625, "train_speed(iter/s)": 0.036925 }, { "epoch": 0.8979292651640095, "grad_norm": 0.014219972304999828, "learning_rate": 7.946994893676611e-06, "loss": 5.174783291295171e-05, "memory(GiB)": 160.86, "step": 2450, "token_acc": 1.0, "train_speed(iter/s)": 0.03694 }, { "epoch": 0.8997617738684259, "grad_norm": 0.01472583319991827, "learning_rate": 7.93923681021101e-06, "loss": 0.0009220579639077186, "memory(GiB)": 160.86, "step": 2455, "token_acc": 0.9996634413125789, "train_speed(iter/s)": 0.036954 }, { "epoch": 0.9015942825728422, "grad_norm": 0.0020888156723231077, "learning_rate": 7.931467898877298e-06, "loss": 0.0004309060052037239, "memory(GiB)": 160.86, "step": 2460, "token_acc": 0.9998316640013467, "train_speed(iter/s)": 0.036969 }, { "epoch": 0.9034267912772586, "grad_norm": 0.054128147661685944, "learning_rate": 7.9236881882954e-06, "loss": 0.00036832981277257204, "memory(GiB)": 160.86, "step": 2465, "token_acc": 0.9999157752884696, "train_speed(iter/s)": 0.036983 }, { "epoch": 0.9052592999816749, "grad_norm": 0.009187346324324608, "learning_rate": 7.915897707125027e-06, "loss": 0.0009874864481389523, "memory(GiB)": 160.86, "step": 2470, "token_acc": 0.9996633280026934, "train_speed(iter/s)": 0.036998 }, { "epoch": 0.9070918086860913, "grad_norm": 0.015212767757475376, "learning_rate": 7.908096484065569e-06, "loss": 0.00035822123754769564, "memory(GiB)": 160.86, "step": 2475, "token_acc": 0.9998317206562894, "train_speed(iter/s)": 0.037012 }, { "epoch": 0.9089243173905076, "grad_norm": 0.028434082865715027, "learning_rate": 7.900284547855992e-06, "loss": 0.00033626847434788945, "memory(GiB)": 160.86, "step": 2480, "token_acc": 0.9999158178297837, "train_speed(iter/s)": 0.037027 }, { "epoch": 0.9107568260949239, "grad_norm": 0.003858706448227167, "learning_rate": 7.892461927274719e-06, "loss": 0.00038427968975156545, "memory(GiB)": 160.86, "step": 2485, "token_acc": 0.9998316923335858, "train_speed(iter/s)": 0.037041 }, { "epoch": 0.9125893347993403, "grad_norm": 0.028237823396921158, "learning_rate": 7.884628651139543e-06, "loss": 0.0008647294715046882, "memory(GiB)": 160.86, "step": 2490, "token_acc": 0.9995789119083712, "train_speed(iter/s)": 0.037056 }, { "epoch": 0.9144218435037567, "grad_norm": 0.014561748132109642, "learning_rate": 7.876784748307502e-06, "loss": 8.994525414891541e-05, "memory(GiB)": 160.86, "step": 2495, "token_acc": 1.0, "train_speed(iter/s)": 0.03707 }, { "epoch": 0.916254352208173, "grad_norm": 0.011074830777943134, "learning_rate": 7.868930247674787e-06, "loss": 0.0002087874570861459, "memory(GiB)": 160.86, "step": 2500, "token_acc": 0.9999158107425492, "train_speed(iter/s)": 0.037084 }, { "epoch": 0.916254352208173, "eval_loss": 0.0007594987982884049, "eval_runtime": 172.1874, "eval_samples_per_second": 2.555, "eval_steps_per_second": 2.555, "eval_token_acc": 0.9997704421284606, "step": 2500 }, { "epoch": 0.9180868609125893, "grad_norm": 0.04182349890470505, "learning_rate": 7.86106517817663e-06, "loss": 0.00022406417410820724, "memory(GiB)": 160.86, "step": 2505, "token_acc": 0.9997928078422231, "train_speed(iter/s)": 0.036773 }, { "epoch": 0.9199193696170057, "grad_norm": 0.010813858360052109, "learning_rate": 7.8531895687872e-06, "loss": 0.0001518705626949668, "memory(GiB)": 160.86, "step": 2510, "token_acc": 0.9999158603281447, "train_speed(iter/s)": 0.036788 }, { "epoch": 0.921751878321422, "grad_norm": 4.607898881658912e-05, "learning_rate": 7.845303448519486e-06, "loss": 0.0005594564136117697, "memory(GiB)": 160.86, "step": 2515, "token_acc": 0.9997474747474747, "train_speed(iter/s)": 0.036802 }, { "epoch": 0.9235843870258383, "grad_norm": 0.059696584939956665, "learning_rate": 7.837406846425205e-06, "loss": 0.0005560083314776421, "memory(GiB)": 160.86, "step": 2520, "token_acc": 0.9994947368421052, "train_speed(iter/s)": 0.036817 }, { "epoch": 0.9254168957302548, "grad_norm": 0.1952117681503296, "learning_rate": 7.829499791594684e-06, "loss": 0.0007309889886528253, "memory(GiB)": 160.86, "step": 2525, "token_acc": 0.9998315931289997, "train_speed(iter/s)": 0.036831 }, { "epoch": 0.9272494044346711, "grad_norm": 0.005678711924701929, "learning_rate": 7.821582313156763e-06, "loss": 0.00012894930550828577, "memory(GiB)": 160.86, "step": 2530, "token_acc": 1.0, "train_speed(iter/s)": 0.036845 }, { "epoch": 0.9290819131390874, "grad_norm": 0.0016558946808800101, "learning_rate": 7.813654440278677e-06, "loss": 0.0004136775154620409, "memory(GiB)": 160.86, "step": 2535, "token_acc": 0.9998317206562894, "train_speed(iter/s)": 0.036859 }, { "epoch": 0.9309144218435037, "grad_norm": 0.0007809648523107171, "learning_rate": 7.805716202165949e-06, "loss": 4.669466288760304e-05, "memory(GiB)": 160.86, "step": 2540, "token_acc": 1.0, "train_speed(iter/s)": 0.036873 }, { "epoch": 0.9327469305479201, "grad_norm": 0.0005511490162461996, "learning_rate": 7.797767628062296e-06, "loss": 2.539183187764138e-05, "memory(GiB)": 160.86, "step": 2545, "token_acc": 1.0, "train_speed(iter/s)": 0.036887 }, { "epoch": 0.9345794392523364, "grad_norm": 0.008907792158424854, "learning_rate": 7.789808747249505e-06, "loss": 8.047035662457347e-05, "memory(GiB)": 160.86, "step": 2550, "token_acc": 1.0, "train_speed(iter/s)": 0.036901 }, { "epoch": 0.9364119479567528, "grad_norm": 0.16766001284122467, "learning_rate": 7.781839589047336e-06, "loss": 0.001341984234750271, "memory(GiB)": 160.86, "step": 2555, "token_acc": 0.9997474960020201, "train_speed(iter/s)": 0.036915 }, { "epoch": 0.9382444566611692, "grad_norm": 0.0007593165501020849, "learning_rate": 7.773860182813404e-06, "loss": 6.514263805001974e-05, "memory(GiB)": 160.86, "step": 2560, "token_acc": 1.0, "train_speed(iter/s)": 0.036929 }, { "epoch": 0.9400769653655855, "grad_norm": 0.02255651168525219, "learning_rate": 7.765870557943083e-06, "loss": 0.0009576915763318539, "memory(GiB)": 160.86, "step": 2565, "token_acc": 0.9996632996632997, "train_speed(iter/s)": 0.036943 }, { "epoch": 0.9419094740700018, "grad_norm": 0.04713983088731766, "learning_rate": 7.75787074386939e-06, "loss": 0.0006936299148947, "memory(GiB)": 160.86, "step": 2570, "token_acc": 0.9997474322276477, "train_speed(iter/s)": 0.036957 }, { "epoch": 0.9437419827744182, "grad_norm": 0.038788143545389175, "learning_rate": 7.749860770062874e-06, "loss": 0.0007801173254847526, "memory(GiB)": 160.86, "step": 2575, "token_acc": 0.9998316073082428, "train_speed(iter/s)": 0.036971 }, { "epoch": 0.9455744914788345, "grad_norm": 0.026828216388821602, "learning_rate": 7.741840666031517e-06, "loss": 0.0009264941327273846, "memory(GiB)": 160.86, "step": 2580, "token_acc": 0.9996632713191346, "train_speed(iter/s)": 0.036984 }, { "epoch": 0.9474070001832509, "grad_norm": 0.03660447522997856, "learning_rate": 7.733810461320619e-06, "loss": 0.0004160061478614807, "memory(GiB)": 160.86, "step": 2585, "token_acc": 0.9998317064961293, "train_speed(iter/s)": 0.036998 }, { "epoch": 0.9492395088876672, "grad_norm": 0.004005759488791227, "learning_rate": 7.725770185512685e-06, "loss": 0.00036098186392337085, "memory(GiB)": 160.86, "step": 2590, "token_acc": 0.9999157752884696, "train_speed(iter/s)": 0.037012 }, { "epoch": 0.9510720175920836, "grad_norm": 0.0006123992498032749, "learning_rate": 7.717719868227327e-06, "loss": 0.0003307197941467166, "memory(GiB)": 160.86, "step": 2595, "token_acc": 0.9999158320006734, "train_speed(iter/s)": 0.037025 }, { "epoch": 0.9529045262964999, "grad_norm": 0.029207419604063034, "learning_rate": 7.709659539121144e-06, "loss": 7.62599753215909e-05, "memory(GiB)": 160.86, "step": 2600, "token_acc": 1.0, "train_speed(iter/s)": 0.037039 }, { "epoch": 0.9547370350009162, "grad_norm": 0.03443612530827522, "learning_rate": 7.70158922788762e-06, "loss": 0.00035016366746276617, "memory(GiB)": 160.86, "step": 2605, "token_acc": 0.9999158461667929, "train_speed(iter/s)": 0.037052 }, { "epoch": 0.9565695437053326, "grad_norm": 0.020582979544997215, "learning_rate": 7.693508964257015e-06, "loss": 0.0006867663934826851, "memory(GiB)": 160.86, "step": 2610, "token_acc": 0.9999158320006734, "train_speed(iter/s)": 0.037066 }, { "epoch": 0.958402052409749, "grad_norm": 0.010320069268345833, "learning_rate": 7.685418777996245e-06, "loss": 0.0002992436056956649, "memory(GiB)": 160.86, "step": 2615, "token_acc": 0.9998317064961293, "train_speed(iter/s)": 0.037079 }, { "epoch": 0.9602345611141653, "grad_norm": 0.06350167840719223, "learning_rate": 7.677318698908788e-06, "loss": 0.0014985553920269013, "memory(GiB)": 160.86, "step": 2620, "token_acc": 0.9995792662403231, "train_speed(iter/s)": 0.037092 }, { "epoch": 0.9620670698185816, "grad_norm": 0.0018099630251526833, "learning_rate": 7.669208756834563e-06, "loss": 0.0006455457769334316, "memory(GiB)": 160.86, "step": 2625, "token_acc": 0.9997474109623642, "train_speed(iter/s)": 0.037106 }, { "epoch": 0.963899578522998, "grad_norm": 0.02232094667851925, "learning_rate": 7.66108898164982e-06, "loss": 0.0005441450979560613, "memory(GiB)": 160.86, "step": 2630, "token_acc": 0.9998316923335858, "train_speed(iter/s)": 0.037119 }, { "epoch": 0.9657320872274143, "grad_norm": 0.08803337812423706, "learning_rate": 7.65295940326704e-06, "loss": 0.00035574983339756725, "memory(GiB)": 160.86, "step": 2635, "token_acc": 0.9998316498316498, "train_speed(iter/s)": 0.037132 }, { "epoch": 0.9675645959318306, "grad_norm": 0.003819872625172138, "learning_rate": 7.644820051634813e-06, "loss": 0.0005564328283071518, "memory(GiB)": 160.86, "step": 2640, "token_acc": 0.9998315363881402, "train_speed(iter/s)": 0.037146 }, { "epoch": 0.9693971046362471, "grad_norm": 0.012264705263078213, "learning_rate": 7.636670956737735e-06, "loss": 0.0008389626629650593, "memory(GiB)": 160.86, "step": 2645, "token_acc": 0.9995793016407236, "train_speed(iter/s)": 0.037159 }, { "epoch": 0.9712296133406634, "grad_norm": 0.012444542720913887, "learning_rate": 7.628512148596292e-06, "loss": 0.0002988637425005436, "memory(GiB)": 160.86, "step": 2650, "token_acc": 0.9999158178297837, "train_speed(iter/s)": 0.037172 }, { "epoch": 0.9730621220450797, "grad_norm": 0.04613952711224556, "learning_rate": 7.620343657266758e-06, "loss": 0.0006712310016155243, "memory(GiB)": 160.86, "step": 2655, "token_acc": 0.9997473896934995, "train_speed(iter/s)": 0.037185 }, { "epoch": 0.974894630749496, "grad_norm": 0.009678124450147152, "learning_rate": 7.612165512841076e-06, "loss": 0.0002654188079759479, "memory(GiB)": 160.86, "step": 2660, "token_acc": 0.9999158320006734, "train_speed(iter/s)": 0.037198 }, { "epoch": 0.9767271394539124, "grad_norm": 0.10645924508571625, "learning_rate": 7.603977745446749e-06, "loss": 0.0006820098031312227, "memory(GiB)": 160.86, "step": 2665, "token_acc": 0.999578947368421, "train_speed(iter/s)": 0.037212 }, { "epoch": 0.9785596481583287, "grad_norm": 0.052510544657707214, "learning_rate": 7.595780385246729e-06, "loss": 0.000298806675709784, "memory(GiB)": 160.86, "step": 2670, "token_acc": 0.9997474747474747, "train_speed(iter/s)": 0.037225 }, { "epoch": 0.9803921568627451, "grad_norm": 0.010894379578530788, "learning_rate": 7.587573462439315e-06, "loss": 0.0006402578670531512, "memory(GiB)": 160.86, "step": 2675, "token_acc": 0.9996632996632997, "train_speed(iter/s)": 0.037237 }, { "epoch": 0.9822246655671615, "grad_norm": 0.04109283536672592, "learning_rate": 7.579357007258022e-06, "loss": 0.0008437959477305412, "memory(GiB)": 160.86, "step": 2680, "token_acc": 0.9997474534893509, "train_speed(iter/s)": 0.03725 }, { "epoch": 0.9840571742715778, "grad_norm": 0.005569992121309042, "learning_rate": 7.571131049971492e-06, "loss": 0.00014509292086586356, "memory(GiB)": 160.86, "step": 2685, "token_acc": 1.0, "train_speed(iter/s)": 0.037263 }, { "epoch": 0.9858896829759941, "grad_norm": 0.03271030634641647, "learning_rate": 7.562895620883364e-06, "loss": 0.0003884633770212531, "memory(GiB)": 160.86, "step": 2690, "token_acc": 0.9999158320006734, "train_speed(iter/s)": 0.037276 }, { "epoch": 0.9877221916804105, "grad_norm": 0.01711997203528881, "learning_rate": 7.554650750332175e-06, "loss": 0.0009255507960915565, "memory(GiB)": 160.86, "step": 2695, "token_acc": 0.9998315647633484, "train_speed(iter/s)": 0.037289 }, { "epoch": 0.9895547003848268, "grad_norm": 0.02630673162639141, "learning_rate": 7.546396468691241e-06, "loss": 0.0005463588051497937, "memory(GiB)": 160.86, "step": 2700, "token_acc": 0.9998316214850985, "train_speed(iter/s)": 0.037302 }, { "epoch": 0.9913872090892432, "grad_norm": 0.005354244727641344, "learning_rate": 7.53813280636855e-06, "loss": 0.000519955437630415, "memory(GiB)": 160.86, "step": 2705, "token_acc": 0.9999157823816742, "train_speed(iter/s)": 0.037314 }, { "epoch": 0.9932197177936595, "grad_norm": 0.028666380792856216, "learning_rate": 7.5298597938066446e-06, "loss": 0.0007598635274916887, "memory(GiB)": 160.86, "step": 2710, "token_acc": 0.9997474747474747, "train_speed(iter/s)": 0.037327 }, { "epoch": 0.9950522264980759, "grad_norm": 0.027820078656077385, "learning_rate": 7.5215774614825144e-06, "loss": 0.00038032070733606815, "memory(GiB)": 160.86, "step": 2715, "token_acc": 0.9998315931289997, "train_speed(iter/s)": 0.037339 }, { "epoch": 0.9968847352024922, "grad_norm": 0.03211966156959534, "learning_rate": 7.51328583990748e-06, "loss": 0.0006773354019969702, "memory(GiB)": 160.86, "step": 2720, "token_acc": 0.9996630443939011, "train_speed(iter/s)": 0.037349 }, { "epoch": 0.9987172439069085, "grad_norm": 0.008736282587051392, "learning_rate": 7.504984959627089e-06, "loss": 0.0001820398378185928, "memory(GiB)": 160.86, "step": 2725, "token_acc": 0.9999157894736842, "train_speed(iter/s)": 0.037362 }, { "epoch": 1.0003665017408834, "grad_norm": 0.04173569008708, "learning_rate": 7.4966748512209884e-06, "loss": 0.00037901154719293116, "memory(GiB)": 160.86, "step": 2730, "token_acc": 0.9998129267608269, "train_speed(iter/s)": 0.037379 }, { "epoch": 1.0021990104452996, "grad_norm": 0.002946143504232168, "learning_rate": 7.488355545302829e-06, "loss": 0.00021834177896380426, "memory(GiB)": 160.86, "step": 2735, "token_acc": 0.9999157965644998, "train_speed(iter/s)": 0.037391 }, { "epoch": 1.004031519149716, "grad_norm": 0.020436054095625877, "learning_rate": 7.480027072520137e-06, "loss": 0.0004638895858079195, "memory(GiB)": 160.86, "step": 2740, "token_acc": 0.9998317206562894, "train_speed(iter/s)": 0.037403 }, { "epoch": 1.0058640278541322, "grad_norm": 0.00012372307537589222, "learning_rate": 7.471689463554212e-06, "loss": 0.00014013800537213684, "memory(GiB)": 160.86, "step": 2745, "token_acc": 0.9999158320006734, "train_speed(iter/s)": 0.037415 }, { "epoch": 1.0076965365585486, "grad_norm": 0.10363256931304932, "learning_rate": 7.463342749120014e-06, "loss": 0.0012814832851290702, "memory(GiB)": 160.86, "step": 2750, "token_acc": 0.9996632713191346, "train_speed(iter/s)": 0.037427 }, { "epoch": 1.009529045262965, "grad_norm": 0.0360257662832737, "learning_rate": 7.454986959966038e-06, "loss": 0.0002859779866412282, "memory(GiB)": 160.86, "step": 2755, "token_acc": 0.9998315363881402, "train_speed(iter/s)": 0.037439 }, { "epoch": 1.0113615539673813, "grad_norm": 0.0018664754461497068, "learning_rate": 7.446622126874219e-06, "loss": 0.0011785308830440044, "memory(GiB)": 160.86, "step": 2760, "token_acc": 0.999663129526697, "train_speed(iter/s)": 0.037451 }, { "epoch": 1.0131940626717977, "grad_norm": 0.03385569900274277, "learning_rate": 7.438248280659801e-06, "loss": 0.00015975049464032054, "memory(GiB)": 160.86, "step": 2765, "token_acc": 1.0, "train_speed(iter/s)": 0.037463 }, { "epoch": 1.015026571376214, "grad_norm": 0.017654770985245705, "learning_rate": 7.4298654521712364e-06, "loss": 0.0003454319899901748, "memory(GiB)": 160.86, "step": 2770, "token_acc": 0.9999158178297837, "train_speed(iter/s)": 0.037475 }, { "epoch": 1.0168590800806303, "grad_norm": 0.05392535775899887, "learning_rate": 7.4214736722900675e-06, "loss": 0.0005449390038847924, "memory(GiB)": 160.86, "step": 2775, "token_acc": 0.9997476022211005, "train_speed(iter/s)": 0.037487 }, { "epoch": 1.0186915887850467, "grad_norm": 0.004342063330113888, "learning_rate": 7.413072971930807e-06, "loss": 0.0007950126193463803, "memory(GiB)": 160.86, "step": 2780, "token_acc": 0.9998315647633484, "train_speed(iter/s)": 0.037499 }, { "epoch": 1.0205240974894632, "grad_norm": 0.00310046155937016, "learning_rate": 7.404663382040838e-06, "loss": 0.0002729130210354924, "memory(GiB)": 160.86, "step": 2785, "token_acc": 0.9999158532480646, "train_speed(iter/s)": 0.03751 }, { "epoch": 1.0223566061938794, "grad_norm": 0.0021550292149186134, "learning_rate": 7.396244933600285e-06, "loss": 0.00016694137593731284, "memory(GiB)": 160.86, "step": 2790, "token_acc": 0.9999158603281447, "train_speed(iter/s)": 0.037522 }, { "epoch": 1.0241891148982958, "grad_norm": 0.000986380036920309, "learning_rate": 7.387817657621911e-06, "loss": 0.00015597309684380888, "memory(GiB)": 160.86, "step": 2795, "token_acc": 0.9999158744847312, "train_speed(iter/s)": 0.037533 }, { "epoch": 1.0260216236027122, "grad_norm": 0.001334765343926847, "learning_rate": 7.379381585150997e-06, "loss": 2.5839175214059652e-05, "memory(GiB)": 160.86, "step": 2800, "token_acc": 1.0, "train_speed(iter/s)": 0.037545 }, { "epoch": 1.0278541323071284, "grad_norm": 0.0036596362479031086, "learning_rate": 7.370936747265226e-06, "loss": 0.00017838862258940936, "memory(GiB)": 160.86, "step": 2805, "token_acc": 0.9999157752884696, "train_speed(iter/s)": 0.037557 }, { "epoch": 1.0296866410115448, "grad_norm": 0.04679948464035988, "learning_rate": 7.36248317507458e-06, "loss": 9.25394706428051e-05, "memory(GiB)": 160.86, "step": 2810, "token_acc": 0.9999157256025619, "train_speed(iter/s)": 0.037568 }, { "epoch": 1.0315191497159613, "grad_norm": 0.014712713658809662, "learning_rate": 7.35402089972121e-06, "loss": 0.00011562753934413195, "memory(GiB)": 160.86, "step": 2815, "token_acc": 1.0, "train_speed(iter/s)": 0.03758 }, { "epoch": 1.0333516584203775, "grad_norm": 2.521344504202716e-05, "learning_rate": 7.345549952379334e-06, "loss": 3.463000466581434e-05, "memory(GiB)": 160.86, "step": 2820, "token_acc": 1.0, "train_speed(iter/s)": 0.037592 }, { "epoch": 1.0351841671247939, "grad_norm": 0.24957123398780823, "learning_rate": 7.337070364255112e-06, "loss": 0.0008360546082258225, "memory(GiB)": 160.86, "step": 2825, "token_acc": 0.9996632713191346, "train_speed(iter/s)": 0.037604 }, { "epoch": 1.03701667582921, "grad_norm": 0.21494735777378082, "learning_rate": 7.32858216658654e-06, "loss": 0.0008594411425292492, "memory(GiB)": 160.86, "step": 2830, "token_acc": 0.9999158532480646, "train_speed(iter/s)": 0.037615 }, { "epoch": 1.0388491845336265, "grad_norm": 0.008956658653914928, "learning_rate": 7.320085390643326e-06, "loss": 0.00030957753770053385, "memory(GiB)": 160.86, "step": 2835, "token_acc": 0.9999158532480646, "train_speed(iter/s)": 0.037627 }, { "epoch": 1.040681693238043, "grad_norm": 0.002504108939319849, "learning_rate": 7.311580067726783e-06, "loss": 0.000167914351914078, "memory(GiB)": 160.86, "step": 2840, "token_acc": 0.9999158886365548, "train_speed(iter/s)": 0.037638 }, { "epoch": 1.0425142019424591, "grad_norm": 0.0135150495916605, "learning_rate": 7.3030662291697105e-06, "loss": 4.5498591498471795e-05, "memory(GiB)": 160.86, "step": 2845, "token_acc": 1.0, "train_speed(iter/s)": 0.03765 }, { "epoch": 1.0443467106468756, "grad_norm": 0.002792476676404476, "learning_rate": 7.294543906336279e-06, "loss": 0.000167688459623605, "memory(GiB)": 160.86, "step": 2850, "token_acc": 0.9999157823816742, "train_speed(iter/s)": 0.037661 }, { "epoch": 1.046179219351292, "grad_norm": 0.04909972473978996, "learning_rate": 7.28601313062191e-06, "loss": 0.000728160934522748, "memory(GiB)": 160.86, "step": 2855, "token_acc": 0.9996635545462192, "train_speed(iter/s)": 0.037672 }, { "epoch": 1.0480117280557082, "grad_norm": 0.002446983242407441, "learning_rate": 7.27747393345317e-06, "loss": 0.0003103788709267974, "memory(GiB)": 160.86, "step": 2860, "token_acc": 0.9998317206562894, "train_speed(iter/s)": 0.037684 }, { "epoch": 1.0498442367601246, "grad_norm": 0.005002601537853479, "learning_rate": 7.268926346287647e-06, "loss": 0.000590520678088069, "memory(GiB)": 160.86, "step": 2865, "token_acc": 0.9998316781686585, "train_speed(iter/s)": 0.037695 }, { "epoch": 1.051676745464541, "grad_norm": 0.0063280281610786915, "learning_rate": 7.2603704006138365e-06, "loss": 0.0006456949282437563, "memory(GiB)": 160.86, "step": 2870, "token_acc": 0.9997474109623642, "train_speed(iter/s)": 0.037707 }, { "epoch": 1.0535092541689572, "grad_norm": 0.005347462370991707, "learning_rate": 7.251806127951025e-06, "loss": 0.00015139146707952023, "memory(GiB)": 160.86, "step": 2875, "token_acc": 0.9999158320006734, "train_speed(iter/s)": 0.037718 }, { "epoch": 1.0553417628733737, "grad_norm": 0.005681968294084072, "learning_rate": 7.243233559849179e-06, "loss": 0.00019556223414838315, "memory(GiB)": 160.86, "step": 2880, "token_acc": 0.9999158320006734, "train_speed(iter/s)": 0.037729 }, { "epoch": 1.05717427157779, "grad_norm": 0.0017381316283717752, "learning_rate": 7.234652727888819e-06, "loss": 0.0006761848460882902, "memory(GiB)": 160.86, "step": 2885, "token_acc": 0.9998316498316498, "train_speed(iter/s)": 0.03774 }, { "epoch": 1.0590067802822063, "grad_norm": 0.012453123927116394, "learning_rate": 7.226063663680915e-06, "loss": 0.0005378074944019318, "memory(GiB)": 160.86, "step": 2890, "token_acc": 0.999663356337317, "train_speed(iter/s)": 0.037751 }, { "epoch": 1.0608392889866227, "grad_norm": 0.026770737022161484, "learning_rate": 7.217466398866757e-06, "loss": 0.0007396583911031485, "memory(GiB)": 160.86, "step": 2895, "token_acc": 0.9997474534893509, "train_speed(iter/s)": 0.037762 }, { "epoch": 1.062671797691039, "grad_norm": 0.13343772292137146, "learning_rate": 7.2088609651178505e-06, "loss": 0.0006303425878286361, "memory(GiB)": 160.86, "step": 2900, "token_acc": 0.9997473045822103, "train_speed(iter/s)": 0.037773 }, { "epoch": 1.0645043063954553, "grad_norm": 0.04957849159836769, "learning_rate": 7.200247394135793e-06, "loss": 0.0002914240350946784, "memory(GiB)": 160.86, "step": 2905, "token_acc": 0.9999157894736842, "train_speed(iter/s)": 0.037784 }, { "epoch": 1.0663368150998718, "grad_norm": 0.0030663548968732357, "learning_rate": 7.191625717652158e-06, "loss": 0.0006854488048702479, "memory(GiB)": 160.86, "step": 2910, "token_acc": 0.9997475597441938, "train_speed(iter/s)": 0.037795 }, { "epoch": 1.068169323804288, "grad_norm": 0.044960979372262955, "learning_rate": 7.18299596742838e-06, "loss": 0.0005464905872941018, "memory(GiB)": 160.86, "step": 2915, "token_acc": 0.9998315789473684, "train_speed(iter/s)": 0.037806 }, { "epoch": 1.0700018325087044, "grad_norm": 0.05764192342758179, "learning_rate": 7.174358175255636e-06, "loss": 0.0005072502885013819, "memory(GiB)": 160.86, "step": 2920, "token_acc": 0.9998316640013467, "train_speed(iter/s)": 0.037816 }, { "epoch": 1.0718343412131208, "grad_norm": 0.010302331298589706, "learning_rate": 7.1657123729547275e-06, "loss": 0.0011625357903540135, "memory(GiB)": 160.86, "step": 2925, "token_acc": 0.9999158178297837, "train_speed(iter/s)": 0.037827 }, { "epoch": 1.073666849917537, "grad_norm": 0.04408176988363266, "learning_rate": 7.157058592375966e-06, "loss": 0.0004973907489329576, "memory(GiB)": 160.86, "step": 2930, "token_acc": 0.9998316498316498, "train_speed(iter/s)": 0.037838 }, { "epoch": 1.0754993586219534, "grad_norm": 0.0012950595701113343, "learning_rate": 7.148396865399054e-06, "loss": 0.00015295968623831868, "memory(GiB)": 160.86, "step": 2935, "token_acc": 0.9999158886365548, "train_speed(iter/s)": 0.037849 }, { "epoch": 1.0773318673263699, "grad_norm": 0.032750971615314484, "learning_rate": 7.1397272239329684e-06, "loss": 0.0010722282342612744, "memory(GiB)": 160.86, "step": 2940, "token_acc": 0.999663242970197, "train_speed(iter/s)": 0.03786 }, { "epoch": 1.079164376030786, "grad_norm": 0.0168730691075325, "learning_rate": 7.131049699915842e-06, "loss": 7.366950740106404e-05, "memory(GiB)": 160.86, "step": 2945, "token_acc": 1.0, "train_speed(iter/s)": 0.037871 }, { "epoch": 1.0809968847352025, "grad_norm": 0.007587254513055086, "learning_rate": 7.122364325314844e-06, "loss": 0.0006255113985389471, "memory(GiB)": 160.86, "step": 2950, "token_acc": 0.9999158107425492, "train_speed(iter/s)": 0.037881 }, { "epoch": 1.082829393439619, "grad_norm": 0.0203808955848217, "learning_rate": 7.113671132126067e-06, "loss": 0.00010994931217283011, "memory(GiB)": 160.86, "step": 2955, "token_acc": 1.0, "train_speed(iter/s)": 0.037892 }, { "epoch": 1.0846619021440351, "grad_norm": 0.00795274693518877, "learning_rate": 7.104970152374405e-06, "loss": 0.00014865098055452108, "memory(GiB)": 160.86, "step": 2960, "token_acc": 1.0, "train_speed(iter/s)": 0.037902 }, { "epoch": 1.0864944108484516, "grad_norm": 0.005757440812885761, "learning_rate": 7.09626141811344e-06, "loss": 0.000553938839584589, "memory(GiB)": 160.86, "step": 2965, "token_acc": 0.9998316923335858, "train_speed(iter/s)": 0.037913 }, { "epoch": 1.088326919552868, "grad_norm": 0.010678775608539581, "learning_rate": 7.087544961425317e-06, "loss": 0.0004192313179373741, "memory(GiB)": 160.86, "step": 2970, "token_acc": 0.9999157752884696, "train_speed(iter/s)": 0.037924 }, { "epoch": 1.0901594282572842, "grad_norm": 0.0032097063958644867, "learning_rate": 7.078820814420629e-06, "loss": 0.0006281842943280935, "memory(GiB)": 160.86, "step": 2975, "token_acc": 0.9997473045822103, "train_speed(iter/s)": 0.037935 }, { "epoch": 1.0919919369617006, "grad_norm": 0.012336465530097485, "learning_rate": 7.070089009238306e-06, "loss": 0.000180811935570091, "memory(GiB)": 160.86, "step": 2980, "token_acc": 1.0, "train_speed(iter/s)": 0.037945 }, { "epoch": 1.093824445666117, "grad_norm": 0.0761614739894867, "learning_rate": 7.061349578045481e-06, "loss": 0.0011349070817232132, "memory(GiB)": 160.86, "step": 2985, "token_acc": 0.999578947368421, "train_speed(iter/s)": 0.037956 }, { "epoch": 1.0956569543705332, "grad_norm": 0.0008425001287832856, "learning_rate": 7.05260255303739e-06, "loss": 0.000435651745647192, "memory(GiB)": 160.86, "step": 2990, "token_acc": 0.9999158461667929, "train_speed(iter/s)": 0.037967 }, { "epoch": 1.0974894630749497, "grad_norm": 0.0662672221660614, "learning_rate": 7.043847966437235e-06, "loss": 0.0007866304367780685, "memory(GiB)": 160.86, "step": 2995, "token_acc": 0.9996635262449529, "train_speed(iter/s)": 0.037978 }, { "epoch": 1.0993219717793659, "grad_norm": 0.02012745290994644, "learning_rate": 7.035085850496079e-06, "loss": 6.958455196581781e-05, "memory(GiB)": 160.86, "step": 3000, "token_acc": 1.0, "train_speed(iter/s)": 0.037988 }, { "epoch": 1.0993219717793659, "eval_loss": 0.0006502080941572785, "eval_runtime": 172.5767, "eval_samples_per_second": 2.55, "eval_steps_per_second": 2.55, "eval_token_acc": 0.9997857459865632, "step": 3000 } ], "logging_steps": 5, "max_steps": 8184, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 2.74067294651731e+18, "train_batch_size": 1, "trial_name": null, "trial_params": null }